## Sec 0.1: Introduction ## ss 0.1.1 Examples NA The Age of the Universe library(gamair) data(hubble) names(hubble) <- c("Velocity", "Distance") plot(Velocity ~ Distance, data=hubble) hubble.lm <- lm(Velocity ~ -1 + Distance, data=hubble) hubble.rlm <- rlm(Velocity ~ -1 + Distance, data=hubble) hubble.lm2 <- rlm(Velocity ~ -1 + Distance + I(Distance^2), data=hubble) NA Tomato plant growth -- three different nutrients tomato <- data.frame(weight= c(1.5, 1.9, 1.3, 1.5, 2.4, 1.5, # water 1.5, 1.2, 1.2, 2.1, 2.9, 1.6, # Nutrient 1.9, 1.6, 0.8, 1.15, 0.9, 1.6), # Nutrient+24D trt = factor(rep(c("water", "Nutrient", "Nutrient+24D"), c(6, 6, 6)))) ## Now make water the first level of trt. It will then appear as ## the initial level in the graphs. In aov or lm calculations, it ## will appear as the baseline or reference level. tomato$trt <- relevel(tomato$trt, ref="water") tomato.lm <- aov(weight ~ 0 + trt, data=tomato) summary(tomato.lm) termplot(tomato.lm, partial=T, col.res="gray30") NA Electrical resistance of fruit, vs apparent juice content library(DAAG) plot(ohms ~ juice, data=fruitohms) library(splines) juice.ns3 <- lm(ohms~ns(juice, 3), data=fruitohms) plot(ohms ~ juice, data=fruitohms) ord <- with(fruitohms, order(juice)) lines(fitted(juice.ns3)[ord] ~ juice[ord], data=fruitohms, col=2) coef(juice.ns3) library(lattice) ns3 <- as.data.frame(with(fruitohms, ns(juice, 3))[, 1:3]) names(ns3) <- c("Curve_1", "Curve_2", "Curve_3") ns3$juice <- fruitohms$juice xyplot(Curve_1 + Curve_2 + Curve_3 ~ juice, type="l", data= ns3, auto.key = list(columns=3, points=FALSE, lines=TRUE)) NA Record times for Scottish hillraces hills.loglm <- lm(log(time) ~ log(dist) + log(climb), data=hills2000) par(mfrow=c(1,2), pty="s") termplot(hills.loglm, partial=TRUE, smooth=panel.smooth) ## ss 0.1.2 Terminology ## ss 0.1.3 Least squares and maximum likelihood NA Correct and incorrect models ## ss 0.1.4 Justification of causative interpretations ## ss 0.1.5 Ordinal and categorical outcomes ## Sec 0.2: Linear Models -- Basis Functions ## ss 0.2.1 Terms -- groups of basis functions ## ss 0.2.2 Factor basis functions > contrasts(sugar$trt) A B C Control 0 0 0 A 1 0 0 B 0 1 0 C 0 0 1 > sugar.lm <- lm(weight ~ trt, data=sugar) > model.matrix(sugar.lm) (Intercept) trtA trtB trtC 1 1 0 0 0 2 1 0 0 0 3 1 0 0 0 4 1 1 0 0 5 1 1 0 0 6 1 1 0 0 7 1 0 1 0 8 1 0 1 0 9 1 0 1 0 10 1 0 0 1 11 1 0 0 1 12 1 0 0 1 attr(,"assign") [1] 0 1 1 1 attr(,"contrasts") attr(,"contrasts")$trt [1] "contr.treatment" > sugar.lm0 <- lm(weight ~ -1 + trt, data=sugar) > model.matrix(sugar.lm0) trtControl trtA trtB trtC 1 1 0 0 0 2 1 0 0 0 3 1 0 0 0 4 0 1 0 0 5 0 1 0 0 6 0 1 0 0 7 0 0 1 0 8 0 0 1 0 9 0 0 1 0 10 0 0 0 1 11 0 0 0 1 12 0 0 0 1 attr(,"assign") [1] 1 1 1 1 attr(,"contrasts") attr(,"contrasts")$trt [1] "contr.treatment" ## ss 0.2.3 Spline basis functions ## Sec 0.3: Solving Least Squares Systems ## ss 0.3.1 Householder Reflections, and QR NA Properties of Householder reflections ## ss 0.3.2 Least Squares NA Normal equations NA Computational issues ## ss 0.3.3 Demonstrations of the computational steps NA Example 1 NA Use of R X1df <- data.frame(X1=c(-6,-3,6,21), X2=c(0,6,15,9), y=c(-3,1,2,6)) lm(y ~ X1+X2, data=X1df) summary(lm(y ~ X1+X2, data=X1df)) # Gives more information anova(lm(y ~ X1+X2, data=X1df)) # Gives different information lm(y ~ X1+X2, data=X1df)$qr # qr decomposition NA Example 2 NA Use of R X2df <- data.frame(X1=c(1, -1, 1, -2, 0, 0, 4, 7, 8), X2=c(-1, 6, 9, 8, 5, 3, 2, 0, -5), X3=c(-14, -2, 8, -3, 1, -1, 9, 8, 3), y=c(-7.7, 8.5, 19.6, 11.4, 11.4, 6.8, 2.5, -1.6, -13)) lm(y ~ X1+X2+X3, data=X2df) summary(lm(y ~ X1+X2+X3, data=X2df)) # Gives more information anova(lm(y ~ X1+X2+X3, data=X2df)) # Gives different information lm(y ~ X1+X2+X3, data=X2df)$qr # qr decomposition NA Alternatives to \texttt{lm()} X <- with(X2df, model.matrix(~ X1+X2+X3)) ## X <- cbind(rep(1,9), X2df[, 1:3]) # More efficient alternative QR <- qr(X) qr.coef(QR, X2df$y) qr.resid(QR, X2df$y) ## etc, etc ## ss 0.3.4 The Analysis of Variance Table X2df.lm <- lm(y ~ X1+X2+X3, data=X2df) anova(X2df.lm) ## Now extract the table from the successive components of Qy QR <- qr(cbind(rep(1,9), as.matrix(X2df[,1:3]))) Qty <- qr.qty(QR, X2df$y) duetoSS <- Qty[2:4]^2 # Why is Qty[1] omitted? rss <- sum(Qty[-(1:4)]^2) aovtab <- data.frame(row.names=c("X1", "X2", "X3", "Residual"), Df=c(rep(1,3), length(Qty)-4), SS=c(duetoSS, rss)) aovtab ## ss 0.3.5 Weighted Least Squares ## Sec 0.4: Model Assumptions and Model Choice ## ss 0.4.1 Distributional Theory NA Strict Requirements NA Distributional Results ## ss 0.4.2 Model choice ## ss 0.4.3 The interpretation of regression parameters ## Sec 0.5: Factor and Spline Terms ## ss 0.5.1 Factor terms ## The following is the default with(sugar, C(trt, contr.treatment)) sugar.lm <- lm(weight ~ C(trt, contr.treatment), data=sugar) sugar.lm model.matrix(sugar.lm) NA Ordered factors tinting$tint # tinting is in DAAG Levels: no < lo < hi with(tinting, C(tinting$tint, contr.poly)) attr(,"contrasts") .L .Q no -7.071068e-01 0.4082483 lo -7.850462e-17 -0.8164966 hi 7.071068e-01 0.4082483 Levels: no < lo < hi .L .Q no -1 1 lo 0 -2 hi 1 1 Xply by 0.707 0.408 ## ss 0.5.2 Regression splines ## Generate the matrix of values of the basis functions x <- 1:50 Xns <- ns(x, df=5) > attributes(Xns)$knots 20% 40% 60% 80% 10.8 20.6 30.4 40.2 > attributes(Xns)$Boundary.knots [1] 1 50 ## Create the plots plot(1:50, Xns[,1], ylim=range(Xns), pch="1") points(1:50, Xns[,2], pch="2", col="red") points(1:50, Xns[,3], pch="3", col="green") points(1:50, Xns[,4], pch="4", col="blue") points(1:50, Xns[,5], pch="5", col="magenta") NA How many degrees of freedom? x <- seq(from=0, to = 6*pi, length=50) y <- sin(x) plot(x,y) lines(x, fitted(lm(y~ns(x,7))), col="red") NA Regression splines -- an example covdf <- cbind(covsample[, 1:10], dist=(1:11318-0.5)/11318) ## First, investigate use of lowess() with(covdf, plot(lowess(dist, V1))) ## too smooth with(covdf, plot(lowess(dist, V1, f=0.1))) ## less smooth with(covdf, plot(lowess(dist, V1, f=0.01))) ## reasonable? hat <- fitted(lm(V1 ~ ns(dist,15), data=covdf)) ## Overplot on the earlier lowess smooth lines(hat ~ dist, data=covdf, col="red") with(covdf, plot(lowess(dist, V1, f=0.01))) hat <- fitted(rlm(V1 ~ bs(dist,40), data=covdf)) lines(hat ~ dist, data=covdf, col="red") ## Compare with use of lm() with 40 d.f. hat <- fitted(lm(V1 ~ bs(dist,40), data=covdf)) lines(hat ~ dist, data=covdf, col="blue") ## Sec 0.6: Generalized Linear Models (GLMs) ## ss 0.6.1 Brief summary of theory ## ss 0.6.2 GLMs -- commentary on the theory NA Solution by maximum likelihood NA Asymptotic theory ## Sec 0.7: Discriminant Methods ## Sec 0.8: Validity Issues ## ss 0.8.1 Errors in \textit{x} NA Regression with a single covariate NA One covariate measured with error; others without error NA Implications for variable selection ## ss 0.8.2 Missing variables and/or mis-specification of the model NA Do airbags reduce risk of death in an accident ## ss 0.8.3 Model and variable selection ## Sec 0.9: Resampling Methods ## Sec 0.10: Examples and Issues ## ss 0.10.1 A severely constrained sample of books ## ss 0.10.2 Hill race data > library(DAAG) > names(hills2000) [1] "h" "m" "s" "h0" "m0" "s0" "dist" "climb" "time" [10] "timef" > match("Caerketton", rownames(hill2k)) [1] 42 > hill2k[42, "dist"] [1] 2.5 loghill2k <- log(hill2k[-42, ]) names(loghill2k) <- c("ldist", "lclimb", "ltime", "ltimef") loghill2k.lm <- lm(ltime ~ ldist + lclimb, data = loghill2k) par(mfrow = c(2, 2)) plot(loghill2k.lm) par(mfrow = c(1, 1)) par(mfrow = c(1, 2)) termplot(loghill2k.lm, col.term = "gray", partial = TRUE, col.res = "black", smooth = panel.smooth) par(mfrow = c(1, 1)) NA Spline Terms loghill2k.lm <- lm(ltime ~ ldist + lclimb, data = loghill2k) par(mfrow = c(1, 2)) termplot(loghill2k.lm, col.term = "gray", partial = TRUE, col.res = "black", smooth = panel.smooth) par(mfrow = c(1, 1)) library(splines) loghill2ks.lm <- lm(ltime ~ ns(ldist, 3) + ns(lclimb, 4), data = loghill2k) if (dev.cur() == 2) invisible(dev.set(3)) par(mfrow = c(2, 2)) plot(loghill2ks.lm) par(mfrow = c(1, 1)) if (dev.cur() == 3) invisible(dev.set(2)) par(mfrow = c(1, 2)) termplot(loghill2ks.lm, col.term = "gray", partial = TRUE, col.res = "black", smooth = panel.smooth) par(mfrow = c(1, 1)) NA *The basis functions bases <- model.matrix(loghill2ks.lm) colnames(bases) options(digits = 3) bases[1:5, ] par(mfrow = c(2, 2)) for (i in 0:3) plot(loghill2k$lclimb, bases[, 5 + i]) par(mfrow = c(1, 1)) ## ss 0.10.3 Diet-disease studies ## ss 0.10.4 Lalonde's data -- effectiveness of a labour training program ## Sec 0.11: References and Further Reading