### MASM22/FMSN30/FMSN40 spring 2019. Anna Lindgren ### ### lecture 2, 27/3-19 ### ### intervals, t-test and residuals in simple linear regression # put ice-cream data from lecture 1 in a data frame: icecream <- data.frame(weeks = c(25, 26, 27, 30, 31, 32, 35), loss = c(28, 28.3, 29.7, 35.3, 36.4, 37.0, 40.2)) # and plot it: with(icecream, plot(loss ~ weeks, xlab = "time (weeks)", ylab = "weight loss (g)", xlim = c(23, 38), ylim = c(25, 45), main = "Ice cream weight loss")) model <- lm(loss ~ weeks, data = icecream) model summary(model) # add the estimated model to the plot abline(model, col = "blue") # new in lecture 2: # predictions, residuals and residual standard deviation yhat <- model\$fitted.values e <- model\$residuals s <- summary(model)\$sigma # 95% confidence intervals for parameters by hand: # we need 's', the square root of the estimate of the error variance, see above # 2.5%-quantile for a Student's distribution with n-2 degrees of freedom: t_quant <- qt(1 - 0.05 / 2, n - 2) se_beta0 <- s * sqrt(1 / n + mx^2 / sum((icecream\$week - mx)^2)) se_beta1 <- s / sqrt(sum((icecream\$week - mx)^2)) ci_beta0 <- beta0 + c(-1, 1) * t_quant * se_beta0 ci_beta1 <- beta1 + c(-1, 1) * t_quant * se_beta1 ci_beta0 ci_beta1 # or using confint() # default uses 95% confidence level, otherwise use confint(model,level=0.9) etc. confint(model) # Let's plot confidence and prediction intervals for E(Y0) and Y0 respectively xx0 <- seq(23, 38, 0.1) # just a grid of values for the predictor, from 23 to # 38 with step 0.1 predx <- data.frame(weeks = xx0) # add the predictions and confidence interval as new columns in the predx data # frame, with name prefix "conf": predx <- cbind(predx, conf = predict(model, predx, interval = "confidence")) # add the predictions (again) and prediction interval as new columns in the # predx data frame, adding the prefix "pred" to the prediction columns: predx <- cbind(predx, pred = predict(model, predx, interval = "prediction")) head(predx) # add confidence interval lines with(predx, { lines(weeks, conf.lwr, lty = 2, col = "red", lwd = 2) lines(weeks, conf.upr, lty = 2, col = "red", lwd = 2) }) # add prediction interval lines with(predx, { lines(weeks, pred.lwr, lty = 3, col = "blue", lwd = 2) lines(weeks, pred.upr, lty = 3, col = "blue", lwd = 2) }) grid() ################################## ## Some R-commands for exercise 1.9 in Rawlings, J.O., ## ... Pantula, S.G., Dickey, D.A.: Applied Regression ... ## ... Analysis - A Research Tool, 2ed, Springer soybeans <- data.frame(radiation = c(29.7, 68.4, 120.7, 217.2, 313.5, 419.1, 535.9, 641.5), biomass = c(16.6, 49.1, 121.7, 219.6, 375.5, 570.8, 648.2, 755.6)) with(soybeans, plot(biomass ~ radiation, xlab = "time (weeks)", ylab = "plant biomass", main = "Observations and fitted line")) model <- lm(biomass ~ radiation, data = soybeans) abline(model) summary(model) confint(model) ################################### ## t-test of H0: beta1 = 0 ## ################################### # Estimate the model y = beta0 + beta1*x: summod <- summary(model) summod ## Reject H0 since Pr(>|t|) = 4.36*10^(-7) < 0.05. ################################### ## t-test of H0: beta1 = 0 "by hand" ## ################################### # The slope estimate: beta1 <- model\$coefficients[2] ## same as # beta1 <- summod\$coefficients[2, 1] # beta1 <- summod\$coefficients["radiation", "Estimate"] # The sigma-estimate: s <- summod\$sigma # also given as "Residual standard error" in summod # Standard error for beta_1: se_beta1 <- s / sqrt(sum((soybeans\$radiation - mean(soybeans\$radiation))^2)) ## same as # se_beta1 <- summod\$coefficients[2, 2] # se_beta1 <- summod\$coefficients["radiation", "Std. Error"] # Test-value: t <- (beta1 - 0) / se_beta1 t ## same as # t <- summod\$coefficients[2, 3] # t <- summod\$coefficients["radiation", "t value"] # t-quantile for comparison: (here alpha = 0.05) t_alfa <- qt(1 - 0.05 / 2, model\$df.residual) t_alfa ## Reject H0 at alpha=0.05 since abs(t) > t_alfa. ## Test H0 using p-value "by hand" as 2*upper tail pvalue <- 2 * pt(abs(t), model\$df.residual, lower.tail = FALSE) pvalue # Reject H0 since Pr(>|t|) = 4.355*10^-7 < alpha=0.05 ## Looking at the summary summod ## or extracting the p-value from the summary: summod\$coefficients["radiation", "Pr(>|t|)"] # check assumptions using residuals e <- model\$residuals # No forgotten trends in x? plot(e ~ soybeans\$radiation, xlab = "time (weeks)", ylab = "residuals", main = "nonlinear in x?", ylim = c(-70, 70)) abline(h = 0) # Equally "bad" fit for all levels of Y? plot(e ~ model\$fit, xlab = "predicted plant biomass", ylab = "residuals", main = "same for all prediction levels?", ylim = c(-70, 70)) abline(h = 0) ## Since we have measurements from eight consecutive weeks there might be ## correlation between the successive e-values: plot(e) # plots e vs index abline(h = 0) # about ok # errors have Gaussian distribution? qqnorm(e, ylim = c(-70, 70)) # qqplots: difficult to test with so few data points qqline(e) # add line corresponding to quantiles from N(0,1) # Mostly ok but with one suspiciously large residual hist(e) # the large residual is seen here as well.