--- title: "Computer exercise 3: solutions" author: "Anna Lindgren" date: "15 April 2019" output: pdf_document: fig_caption : true classoption: a4paper --- {r setup, include = FALSE} knitr::opts_chunk$set( echo = TRUE, collapse = TRUE, comment = "#>" )  ## Exercise 3: U.S. county demographic information {r data} cdi <- read.delim("../data/CDI.txt") cdi$region <- factor(cdi$region, levels = c(1, 2, 3, 4), labels = c("Northeast", "Midwest", "South", "West")) cdi$phys1000 <- 1000 * cdi$phys / cdi$popul cdi$crm1000 <- 1000 * cdi$crimes / cdi$popul  ### (a) As seen in Figure 1 the number of physicians per 1000 inhabitants is very skewed, with large variability up, but not down. The logarithm is much more symmetric. ### (b) Fit all eight models, including the empty one with only the intercept. {r all subsets} ### See the R-code for Lecture 5 where I did this: mod.1 <- lm(log(phys1000) ~ 1, data = cdi) mod.2 <- lm(log(phys1000) ~ percapitaincome, data = cdi) mod.3 <- lm(log(phys1000) ~ crm1000, data = cdi) mod.4 <- lm(log(phys1000) ~ pop65plus, data = cdi) mod.5 <- lm(log(phys1000) ~ percapitaincome + crm1000, data = cdi) mod.6 <- lm(log(phys1000) ~ percapitaincome + pop65plus, data = cdi) mod.7 <- lm(log(phys1000) ~ crm1000 + pop65plus, data = cdi) mod.8 <- lm(log(phys1000) ~ percapitaincome + crm1000 + pop65plus, data = cdi) sum.1 <- summary(mod.1) sum.2 <- summary(mod.2) sum.3 <- summary(mod.3) sum.4 <- summary(mod.4) sum.5 <- summary(mod.5) sum.6 <- summary(mod.6) sum.7 <- summary(mod.7) sum.8 <- summary(mod.8) sum.8  Note that all three covariates are significant, which is good. Collect all their$R^2_{\text{adj}}$and BIC values {r r2s} crits <- data.frame(nr = seq(1, 8), name = c("intercept", "percapinc", "crm1000", "pop65", "inc_crm", "inc_65", "crm_65", "inc_crm_65"), p = c(0, 1, 1, 1, 2, 2, 2, 3), r2adj = c(sum.1$adj.r.squared, sum.2$adj.r.squared, sum.3$adj.r.squared, sum.4$adj.r.squared, sum.5$adj.r.squared, sum.6$adj.r.squared, sum.7$adj.r.squared, sum.8$adj.r.squared), bic = AIC(mod.1, mod.2, mod.3, mod.4, mod.5, mod.6, mod.7, mod.8, k = log(nrow(cdi)))[, 2]) crits  The model with all three explanatory variables is the best since it has the highest$R^2_{\text{adj}}$and the lowest BIC. ### (c) {r leverage} v <- influence(mod.8)$hat limit.v <- 2 * (3 + 1) / nrow(cdi) cdi[v > 0.16, ] I.Kings <- 6  As seen in Figure 2, several counties have a high leverage, in particular Kings county, New York. This is due to the unusually high crime rate, as seen in Figure 2(c). ### (d) {r} r <- rstudent(mod.8) pred.8 <- predict(mod.8) cdi[r == max(r), ] I.Olmsted <- 418  As seen in Figure 3(a)-(d) most of the residuals lie within $\pm 2$. They seem to have constant variance and no non-linear trends. However, there are two counties with large residuals. Kings county, NY has a large, negative residual (fewer physicians than expected) while Olmsted, Montana has a large positive residual (more physicians than expected). ### (e) {r} s.i <- influence(mod.8)$sigma  As seen in Figure 4, Olmsted, MN, which had a large residual, produced the largest decrease in$s_{(i)}$when left out, with Kings county, NY as a close second. ### (f) {r} limit.cook <- c(1, 4 / nrow(cdi)) D <- cooks.distance(mod.8)  As seen in Figure~\ref{cook}, Kings county has had a large influence on the$\beta$-estimates. Olmsted has not had an alarming influence. ### (g) {r} dfb <- dfbetas(mod.8) summary(dfb) limit.dfb <- c(-1, -2 / sqrt(nrow(cdi)), 0, 2 / sqrt(nrow(cdi)), 1)  As seen in Figure 6, Kings county as had a huge influence on the$\beta$-estimate for crm1000 (c), as might be expected, and a quite large influence on the intercept (a). Olmsted has not had a huge influence on any of the parameters. ### (h) When Kings county is taken out, the largest model is still the best, no county has a huge leverage or a large influence on the parameter estimates. However, Olmsted still has a large residual. ## Plots {r xxplots, fig.height = 7.5, fig.cap = figcaption} # Figure 1. figcaption = "Physicians or log physicians. Kings County, NY in red, Olmsted, MN in blue." par(mfrow = c(3, 2)) with(cdi, plot(phys1000 ~ percapitaincome, main = "(a) Physicians vs per capita income")) with(cdi[I.Kings, ], points(percapitaincome, phys1000, col = "red", pch = 19)) with(cdi[I.Olmsted, ], points(percapitaincome, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ percapitaincome, log = "y", main = "(b) log Physicians vs per capita income")) with(cdi[I.Kings, ], points(percapitaincome, phys1000, col = "red", pch = 19)) with(cdi[I.Olmsted, ], points(percapitaincome, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ crm1000, main = "(c) Physicians vs crime")) with(cdi[I.Kings, ], points(crm1000, phys1000, col = "red", pch = 19)) with(cdi[I.Olmsted, ], points(crm1000, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ crm1000, log = "y", main = "(d) log Physicians vs crime")) with(cdi[I.Kings, ], points(crm1000, phys1000, col = "red", pch = 19)) with(cdi[I.Olmsted, ], points(crm1000, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ pop65plus, main = "(e) Physicians vs 65+ population")) with(cdi[I.Kings, ], points(pop65plus, phys1000, col = "red", pch = 19)) with(cdi[I.Olmsted, ], points(pop65plus, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ pop65plus, log = "y", main = "(f) log Physicians vs 65+ population")) with(cdi[I.Kings, ], points(pop65plus, phys1000, col = "red", pch = 19)) with(cdi[I.Olmsted, ], points(pop65plus, phys1000, col = "blue", pch = 8))  {r leverageplot, fig.height = 7, fig.cap = figcaption} # Figure 2. figcaption <- "Leverage of the full model. Kings County, NY in red, Olmsted, MN in blue." par(mfrow = c(2, 2)) with(cdi, plot(id, v, main = "(a) Leverage against id")) points(I.Kings, v[I.Kings], col = "red", pch = 19) points(I.Olmsted, v[I.Olmsted], col = "blue", pch = 8) abline(h = limit.v) with(cdi, plot(v ~ percapitaincome, main = "(b) Leverage against per capita income")) with(cdi, points(percapitaincome[I.Kings], v[I.Kings], col = "red", pch = 19)) with(cdi, points(percapitaincome[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8)) abline(sh = limit.v) with(cdi, plot(v ~ crm1000, main = "(c) Leverage against crime per 1000")) with(cdi, points(crm1000[I.Kings], v[I.Kings], col = "red", pch = 19)) with(cdi, points(crm1000[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.v) with(cdi, plot(v ~ pop65plus, main = "(d) Leverage against population 65+")) with(cdi, points(pop65plus[I.Kings], v[I.Kings], col = "red", pch = 19)) with(cdi, points(pop65plus[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.v)  {r studresids, fig.height = 7, fig.cap = figcaption} # Figure 3. figcaption = "Studentized residuals. Kings County, NY in red, Olmsted, MN in blue" par(mfrow = c(2, 2)) plot(r ~ pred.8, main = "(a) Studentized residuals against predicted values") points(pred.8[I.Kings], r[I.Kings], col = "red", pch = 19) points(pred.8[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8) abline(h = c(-2, 0, 2), lty = 2) with(cdi, plot(r ~ percapitaincome, main = "(b) Studentized residuals against per capita income")) with(cdi, points(percapitaincome[I.Kings], r[I.Kings], col = "red", pch = 19)) with(cdi, points(percapitaincome[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2) with(cdi, plot(r ~ crm1000, main = "(c) Studentized residuals against crime per 1000")) with(cdi, points(crm1000[I.Kings], r[I.Kings], col = "red", pch = 19)) with(cdi, points(crm1000[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2) with(cdi, plot(r ~ pop65plus, main = "(d) Studentized residuals against poulation 65+")) with(cdi, points(pop65plus[I.Kings], r[I.Kings], col = "red", pch = 19)) with(cdi, points(pop65plus[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2)  {r sigmai, fig.height = 3.5, fig.cap = figcaption} # Figure 4. figcaption <- "Observations' effect on the sigma estimate. Kings county, NY in red, Olmsted, MN in blue" with(cdi, plot(s.i ~ id, main = "Leave-one-out sigma-estimates")) with(cdi, points(id[I.Kings], s.i[I.Kings], col = "red", pch = 19)) with(cdi, points(id[I.Olmsted], s.i[I.Olmsted], col = "blue", pch = 8))  {r cookplot, fig.cap = figcaption} # Figure 5 figcaption = "Cood's distance. Kings county, NY in red, Olmsted, MN in blue" par(mfrow = c(2, 2)) with(cdi, plot(D ~ percapitaincome, ylim = c(0, 1), main = "(a) Cook's distans against per capita income")) with(cdi, points(percapitaincome[I.Kings], D[I.Kings], col = "red", pch = 19)) with(cdi, points(percapitaincome[I.Olmsted], D[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.cook) with(cdi, plot(D ~ crm1000, ylim = c(0, 1), main = "(b) Cook's distancs against crime per 1000")) with(cdi, points(crm1000[I.Kings], D[I.Kings], col = "red", pch = 19)) with(cdi, points(crm1000[I.Olmsted], D[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.cook) with(cdi, plot(D ~ pop65plus, ylim = c(0, 1), main = "(c) Cook's distance against population 65+")) with(cdi, points(pop65plus[I.Kings], D[I.Kings], col = "red", pch = 19)) with(cdi, points(pop65plus[I.Olmsted], D[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.cook)  {r dfbplot, fig.height = 7, fig.cap = figcaption} # Figure 6 figcaption = "DFbetas. Kings county, NY in red, Olmsted, MN in blue" par(mfrow = c(2, 2)) with(cdi, plot(dfb[, 1] ~ id, main = "(a) bfbeta for the intercept", ylim = c(-1, 1), ylab = "dfbeta_0")) with(cdi, points(id[I.Kings], dfb[I.Kings, 1], pch = 19, col = "red")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 1], pch = 8, col = "blue")) abline(h = limit.dfb) with(cdi, plot(dfb[, 2] ~ id, main = "(b) bfbeta for per capita income", ylim = c(-1, 1), ylab = "dfbeta_percapitaincome")) with(cdi, points(id[I.Kings], dfb[I.Kings, 2], pch = 19, col = "red")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 2], pch = 8, col = "blue")) abline(h = limit.dfb) with(cdi, plot(dfb[, 3] ~ id, main = "(c) bfbeta for crime per 1000", ylim = c(-2, 1), ylab = "dfbeta_crm1000")) with(cdi, points(id[I.Kings], dfb[I.Kings, 3], pch = 19, col = "red")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 3], pch = 8, col = "blue")) abline(h = limit.dfb) with(cdi, plot(dfb[, 4] ~ id, main = "(d) bfbeta for population 65+", ylim = c(-1, 1), ylab = "dfbeta_pop65plus")) with(cdi, points(id[I.Kings], dfb[I.Kings, 4], pch = 19, col = "red")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 4], pch = 8, col = "blue")) abline(h = limit.dfb)  ### (h) plots {r} cdi <- cdi[-I.Kings,] I.Olmsted <- 417  {r, fig.height = 7, fig.cap = "Data without Kings, NY"} ### (a) ### par(mfrow = c(3, 2)) with(cdi, plot(phys1000 ~ percapitaincome, main = "(a) Physicians vs per capita income")) with(cdi[I.Olmsted, ], points(percapitaincome, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ percapitaincome, log = "y", main = "(b) log Physicians vs per capita income")) with(cdi[I.Olmsted, ], points(percapitaincome, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ crm1000, main = "(c) Physicians vs crime")) with(cdi[I.Olmsted, ], points(crm1000, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ crm1000, log = "y", main = "(d) log Physicians vs crime")) with(cdi[I.Olmsted, ], points(crm1000, phys1000, col = "blue", pch = 8)) with(cdi, plot(phys1000 ~ pop65plus, main = "(e) Physicians vs 65+ population")) with(cdi[I.Olmsted, ], points(pop65plus, phys1000, col = "blue",pch = 8)) with(cdi, plot(phys1000 ~ pop65plus, log = "y", main = "(f) log Physicians vs 65+ population")) with(cdi[I.Olmsted, ], points(pop65plus, phys1000, col = "blue", pch = 8))  {r} ### (b) ### mod.1 <- lm(log(phys1000) ~ 1, data=cdi) mod.2 <- lm(log(phys1000) ~ percapitaincome, data=cdi) mod.3 <- lm(log(phys1000) ~ crm1000, data=cdi) mod.4 <- lm(log(phys1000) ~ pop65plus, data=cdi) mod.5 <- lm(log(phys1000) ~ percapitaincome+crm1000, data=cdi) mod.6 <- lm(log(phys1000) ~ percapitaincome+pop65plus, data=cdi) mod.7 <- lm(log(phys1000) ~ crm1000+pop65plus, data=cdi) mod.8 <- lm(log(phys1000) ~ percapitaincome+crm1000+pop65plus, data=cdi) sum.1 <- summary(mod.1) sum.2 <- summary(mod.2) sum.3 <- summary(mod.3) sum.4 <- summary(mod.4) sum.5 <- summary(mod.5) sum.6 <- summary(mod.6) sum.7 <- summary(mod.7) sum.8 <- summary(mod.8) sum.8 crits <- data.frame(nr=c(1:8), name=c("intercept", "percapinc", "crm1000", "pop65", "inc_crm", "inc_65", "crm_65", "inc_crm_65"), p=c(0,1,1,1,2,2,2,3), r2adj=c(sum.1$adj.r.squared, sum.2$adj.r.squared, sum.3$adj.r.squared, sum.4$adj.r.squared, sum.5$adj.r.squared, sum.6$adj.r.squared, sum.7$adj.r.squared, sum.8$adj.r.squared), bic=AIC(mod.1, mod.2, mod.3, mod.4, mod.5, mod.6, mod.7, mod.8, k=log(nrow(cdi)))[,2]) crits  {r, fig.height = 7, fig.cap = "Leverage without Kings, NY"} ### (c) ### par(mfrow = c(2, 2)) v <- influence(mod.8)$hat limit.v <- 2 * (3 + 1) / nrow(cdi) with(cdi, plot(v ~ id, main = "(a) Leverage against id")) points(cdi$id[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8) abline(h = limit.v) with(cdi, plot(v ~ percapitaincome, main = "(b) Leverage against per capita income")) with(cdi, points(percapitaincome[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.v) with(cdi, plot(v ~ crm1000, main = "(c) Leverage against crime per 1000")) with(cdi, points(crm1000[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.v) with(cdi, plot(v ~ pop65plus, main = "(d) Leverage against population 65+")) with(cdi, points(pop65plus[I.Olmsted], v[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.v)  {r, fig.height = 7, fig.cap = "residuals without Kings, NY"} ### (d) ### par(mfrow = c(2, 2)) r <- rstudent(mod.8) pred.8 <- predict(mod.8) plot(r ~ pred.8, main = "(a) Studentized residuals against predicted values") with(cdi, points(pred.8[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2) with(cdi, plot(r ~ percapitaincome, main = "(b) Studentized residuals against per capita income")) with(cdi, points(percapitaincome[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2) with(cdi, plot(r ~ crm1000, main = "(c) Studentized residuals against crime per 1000")) with(cdi, points(crm1000[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2) with(cdi, plot(r ~ pop65plus, main = "(d) Studentized residuals against poulation 65+")) with(cdi, points(pop65plus[I.Olmsted], r[I.Olmsted], col = "blue", pch = 8)) abline(h = c(-2, 0, 2), lty = 2)  {r, fig.cap = "sigma without Kings, NY"} ### (e) ### s.i <- influence(mod.8)$sigma with(cdi, plot(s.i ~ id, main = "Leave-one-out sigma-estimates")) with(cdi, points(id[I.Olmsted], s.i[I.Olmsted], col = "blue", pch = 8))  {r, fig.cap = "Cook's distance without Kings, NY"} ### (f) ### limit.cook <- c(1, 4 / nrow(cdi)) D <- cooks.distance(mod.8) with(cdi, plot(D ~ id, ylim = c(0, 1), main = "(a) Cook's distans against per capita income")) with(cdi, points(id[I.Olmsted], D[I.Olmsted], col = "blue", pch = 8)) abline(h = limit.cook)  {r, fig.height = 7, fig.cap = "dfbetas without Kings, NY"} ### (g) ### par(mfrow = c(2, 2)) dfb <- dfbetas(mod.8) limit.dfb <- c(-1, -2 / sqrt(nrow(cdi)), 0, 2 / sqrt(nrow(cdi)), 1) with(cdi, plot(dfb[, 1] ~ id, main = "(a) bfbeta for the intercept", ylim = c(-1, 1), ylab = "dfbeta_0")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 1], pch = 8, col = "blue")) abline(h = limit.dfb) with(cdi, plot(dfb[, 2] ~ id, main = "(b) bfbeta for per capita income", ylim = c(-1, 1), ylab = "dfbeta_percapitaincome")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 2], pch = 8, col = "blue")) abline(h = limit.dfb) with(cdi, plot(dfb[, 3] ~ id, main = "(c) bfbeta for crime per 1000", ylim = c(-1, 1), ylab = "dfbeta_crm1000")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 3], pch = 8, col = "blue")) abline(h = limit.dfb) with(cdi, plot(dfb[, 4] ~ id, main = "(d) bfbeta for population 65+", ylim = c(-1, 1), ylab = "dfbeta_pop65plus")) with(cdi, points(id[I.Olmsted], dfb[I.Olmsted, 4], pch = 8, col = "blue")) abline(h = limit.dfb)