# Exercise 20
# The dataset `sau` contains weights of lambs at slaughter from a Norwegian
# county, collected in the years 1989-1998. We have 7 variables:
#   hvkt      Body weight of lamb at slaughter (response variable)
#   aar       Year
#   ageewe    Age of the mother
#   alderlam  Age of lamb at slaughter (in days)
#   kjoenn    Gender
#   burdH     Number of lambs in the litter
#   NAO       A climate index for the current year
sau <- read.table(
  file = "http://www.uio.no/studier/emner/matnat/math/STK3100/data/sau.txt",
  header = TRUE
)
# NOTE(review): attach() is discouraged (masking/namespace risks); kept here
# because the rest of the script refers to the columns by bare name.
attach(sau)
head(sau)

#### a) Exploratory analysis to become familiar with the data set
par(mfrow = c(1, 2))
boxplot(split(hvkt, kjoenn))
boxplot(split(hvkt, burdH))
par(mfrow = c(2, 2))
plot(aar, hvkt)
plot(ageewe, hvkt)
plot(alderlam, hvkt)
plot(NAO, hvkt)
# Not obvious which covariates explain the data best.

#### Try different GLM models using the log-link with all covariates:

#### b) y ~ Gamma
fit.gam <- glm(hvkt ~ ., family = Gamma(link = log), data = sau)
summary(fit.gam)
# The (residual) deviance for the model = 54.198 on 1998 - 7 = 1991 df.
# The chi-square approximation can be problematic since we estimate phi!
pchisq(54.198, 1991, lower.tail = FALSE)

#### c) y ~ inverse Gaussian
fit.ig <- glm(hvkt ~ ., family = inverse.gaussian(link = log), data = sau)
summary(fit.ig)
# AIC = 2 * (-logLik + k); lowest AIC is the best model; k = # of parameters.
#   fit.gam = 13562.88
#   fit.ig  = 13609.02
AIC(fit.gam, fit.ig)

#### d) y ~ log-normal: then log(y) ~ normal, so we can do a standard
#    linear regression on log(y).
fit.log <- lm(log(hvkt) ~ ., data = sau)
summary(fit.log)
cbind(fit.log$coef, fit.gam$coef, fit.ig$coef)  # very similar

#### e) Compute the AIC for the log-linear model and compare with the
#    previous models (see notes for calculations).
# AIC = -1468.854 on the log scale; the response is on another scale here,
# so the (log-)likelihood values are not directly comparable.
# Part e) continued: convert the log-normal log-likelihood back to the
# original scale via the Jacobian of the log transform (subtract sum(log(y))),
# then compute AIC with k = 8 parameters (7 betas + sigma^2).
logLik.originalScale <- as.numeric(logLik(fit.log)) - sum(log(hvkt))
AIC.log <- 2 * (-logLik.originalScale + 8)  # = 13606.22

####################################################################
##################### Further on we will use the Gamma regression:
# y ~ Gamma(mu, nu), E(y) = mu, Var(y) = mu^2 / nu

#### f) Likelihood-ratio test for dropping NAO
fit.gam_full <- glm(hvkt ~ aar + ageewe + alderlam + kjoenn + burdH + NAO,
                    family = Gamma(link = log), data = sau)
fit.gam_noNAO <- glm(hvkt ~ aar + ageewe + alderlam + kjoenn + burdH,
                     family = Gamma(link = log), data = sau)
LRT <- 2 * (logLik(fit.gam_full) - logLik(fit.gam_noNAO))
LRT <- as.numeric(LRT)
# p = 0.3868399 -> accept H0: beta_NAO is not significantly different from 0.
pchisq(LRT, lower.tail = FALSE, df = 1)
summary(fit.gam_noNAO)
AIC(fit.gam_noNAO)
mu_hat <- fit.gam_noNAO$fitted
nu_hat <- 1 / summary(fit.gam_noNAO)$disp  # phi = 1/nu

#### g) Examine the final model!
# Plot the deviance residuals (hand-computed; equivalently
# residuals(fit.gam_noNAO, type = "deviance")):
devres <- sign(hvkt - mu_hat) *
  sqrt(-2 * (log(hvkt / mu_hat) + 1 - hvkt / mu_hat))
plot(mu_hat, devres, xlab = "fitted", ylab = "deviance residuals")
abline(h = 0, col = "red")

# Plot mu_hat against the empirical variance: split y and mu_hat into 100
# groups based on the quantiles of mu_hat.
quant_hat <- quantile(mu_hat, c(0:100) / 100)
Mu <- NULL
V <- NULL
for (i in 1:100) {
  # Observations whose fitted value falls in the i-th quantile bin.
  # NOTE(review): the original source was truncated/garbled here ("<" signs
  # were stripped); reconstructed as the evident quantile-bin selection —
  # verify against the original exercise solution.
  in_bin <- mu_hat > quant_hat[i] & mu_hat <= quant_hat[i + 1]
  yi <- hvkt[in_bin]       # responses in group i
  mui <- mu_hat[in_bin]    # fitted values in group i
  # NOTE(review): the loop body past this point is missing from the source;
  # the comments indicate the empirical variance of each group is compared
  # with the Gamma mean-variance relation Var(y) = mu^2 / nu.
  Mu <- c(Mu, mean(mui))
  V <- c(V, var(yi))
}