## R-code for lecture 6. vt19 ## # 10/4-19. mydata <- read.table("data/f6data.txt", header = T) View(mydata) with(mydata, plot(y ~ x, xlab = "x", ylab = "y", xlim = c(-5, 30), ylim = c(-10, 50))) # Identify the points to single out: Iout <- which(mydata\$x > 15 & mydata\$y < 10) # get the outlier Iinf <- which(mydata\$x > 22) # get the potentially influential point with(mydata[Iout, ], points(x, y, col = "red", pch = 19)) with(mydata[Iinf, ], points(x, y, col = "green", pch = 19)) # Estimate models with and without the two points: mod <- lm(y ~ x, data = mydata) mod2 <- lm(y ~ x, data = mydata[-c(Iout, Iinf), ]) abline(mod, col = "red") abline(mod2, col = "black", lty = 2) sum <- summary(mod) mydata\$res <- sum\$residuals with(mydata, plot(res ~ x, xlim = c(-10, 30), xlab = "x_i", ylab = "residuals e_i")) with(mydata[Iout, ], points(x, res, col = "red", pch = 19)) with(mydata[Iinf, ], points(x, res, col = "green", pch = 19)) abline(h = 0) ## Plots for page 4: v <- hatvalues(mod) # leverage plot(mydata\$x, v, xlim = c(-10, 30), ylim = c(0, 0.3), xlab = "x_i", ylab = "v_ii", main = "Leverage v_ii. Black=1/n; Red=2(p+1)/n") points(mydata\$x[Iout], v[Iout], col = "red", pch = 19) points(mydata\$x[Iinf], v[Iinf], col = "green", pch = 19) abline(h = 1 / nrow(mydata)) abline(h = 2 * (1 + 1) / nrow(mydata), col = "red") infl <- influence(mod) # will calculate basic influence measures s_i <- infl\$sigma #a vector whose i-th element contains the estimate of the # residual standard deviation obtained when the i-th case is dropped from # the regression # v <- infl\$hat plot(s_i, ylim = c(0, 6), xlab = "i", ylab = "s_(i)", main = "s_(i)") points(Iout, s_i[Iout], col = "red", pch = 19) points(Iinf, s_i[Iinf], col = "green", pch = 19) abline(h = sum\$sigma) # studentised residuals r_stud <- rstudent(mod) plot(r_stud, ylim = c(-7, 7), xlab = "i", ylab = "r*_i", main = "stud. residuals, +/- 2") points(Iout, r_stud[Iout], col = "red", pch = 19) points(Iinf, r_stud[Iinf], col = "green", pch = 19) abline(h = 0) abline(h = c(-2, 2), col = "red") D <- cooks.distance(mod) plot(D, ylim = c(0, 1.2), xlab = "i", ylab = "D_i", main = "Cook's distance with D=4/n and D=1") points(Iout, D[Iout], col = "red", pch = 19) points(Iinf, D[Iinf], col = "green", pch = 19) abline(h = 4 / nrow(mydata)) abline(h = 1) ## Standardized change in beta-estimates (DFBETAS): # you may use dfbetas(mod) dfb <- dfbetas(mod) plot(dfb[, 1], ylim = c(-.5, .5), xlab = "i", ylab = "dfbeta_0(i)", main = "dfbeta_0 +/- 2/sqrt(n)") points(Iout, dfb[Iout, 1], col = "red", pch = 19) points(Iinf, dfb[Iinf, 1], col = "green", pch = 19) abline(h = 0) abline(h = 2 / sqrt(nrow(mydata)), col = "red") abline(h = -2 / sqrt(nrow(mydata)), col = "red") plot(dfb[, 2], ylim = c(-1, 1), xlab = "i", ylab = "dfbeta_1(i)", main = "dfbeta_1 +/- 2/sqrt(n)") points(Iout, dfb[Iout, 2], col = "red", pch = 19) points(Iinf, dfb[Iinf, 2], col = "green", pch = 19) abline(h = 0) abline(h = 2 / sqrt(nrow(mydata)), col = "red") abline(h = -2 / sqrt(nrow(mydata)), col = "red")