## R code for Lecture 7 (2019-04-16, AL)
## Logistic regression: first one continuous covariate, then one categorical.
## NOTE(review): the source file had every `$` escaped as `\$` (an extraction
## artifact that is invalid R); restored to plain `$` throughout.

## One continuous covariate ----

data <- data.frame(x = seq(0, 100, 1))

# Five example models logit(p) = b0 + b1 * x.  For each model i we store the
# linear predictor (logodds_i), the odds (odds_i) and the probability (p_i),
# in that column order, matching the original hand-unrolled version.
curve_pars <- data.frame(b0 = c(-5, 5, -10, -6, 1),
                         b1 = c(0.1, -0.1, 0.5, 0.1, -0.04))
for (i in seq_len(nrow(curve_pars))) {
  logodds <- curve_pars$b0[i] + curve_pars$b1[i] * data$x
  data[[paste0("logodds", i)]] <- logodds
  data[[paste0("odds", i)]] <- exp(logodds)
  data[[paste0("p", i)]] <- exp(logodds) / (1 + exp(logodds))
}

# Plot the five probability curves against x:
with(data, {
  plot(p1 ~ x, type = "l", ylab = "Pr(success)", lwd = 2,
       main = "Pr(success; x)")
  lines(x, p2, col = "blue", lty = 2, lwd = 2)
  lines(x, p3, col = "black", lty = 3, lwd = 2)
  lines(x, p4, col = "darkorange", lty = 4, lwd = 2)
  lines(x, p5, col = "magenta", lty = 5, lwd = 2)
})
legend(69, 0.6,
       c("logit(p)= -5+0.1x", "logit(p)= 5-0.1x", "logit(p)= -10+0.5x",
         "logit(p)= -6+0.1x", "logit(p)= 1-0.04x"),
       col = c("black", "blue", "black", "darkorange", "magenta"),
       lty = c(1, 2, 3, 4, 5), lwd = 2)

## Generate random y-observations from p5 ----

set.seed(3)  # for reproducibility
# NOTICE: unless you use set.seed, each time you execute the rbinom command
# below you will obtain different draws, hence a different dataset and
# different results.
data$y <- rbinom(n = nrow(data), size = 1, prob = data$p5)

# Plot the data:
with(data, plot(y ~ x, main = "Observed Y=0/1 against the covariate X"))
# Calculate a kernel smoother and add it to the plot:
lines(ksmooth(data$x, data$y, bandwidth = 50))

# Estimate the logistic model:
model <- glm(y ~ x, data = data, family = "binomial")
summary(model)
confint(model)       # CI for beta (log-odds scale)
exp(confint(model))  # CI for exp(beta) (odds-ratio scale)

# Calculate predicted probabilities and add them to the plot:
x0 <- data.frame(x = seq(0, 100, 1))
predx <- cbind(x0, prob = predict(model, x0, type = "response"))
with(predx, lines(x, prob, col = "blue"))

# Confidence interval for the linear part x*beta ...
xb <- predict(model, x0, se.fit = TRUE)
ci.xb <- data.frame(lwr = xb$fit - 1.96 * xb$se.fit,
                    upr = xb$fit + 1.96 * xb$se.fit)
# ... transform to a CI for the odds ...
ci.odds <- exp(ci.xb)
# ... and finally to a CI for the probabilities; add to the plot:
predx <- cbind(predx, ci.odds / (1 + ci.odds))
with(predx, {
  lines(x, lwr, lty = 2, col = "red")
  lines(x, upr, lty = 2, col = "red")
})

## One categorical covariate ----

# Create the example data set:
# 300 bananas (0), 70 apples (1), and 40 melons (2);
# 100 green (1) and 200 yellow (0) bananas, etc.
fruit <- data.frame(
  type = c(rep(0, 300), rep(1, 70), rep(2, 40)),
  green = c(rep(1, 100), rep(0, 200),   # bananas
            rep(1, 40), rep(0, 30),     # apples
            rep(1, 20), rep(0, 20))     # melons
)
fruit$type <- factor(fruit$type, levels = c(0, 1, 2),
                     labels = c("banana", "apple", "melon"))
fruit$green <- factor(fruit$green, levels = c(0, 1),
                      labels = c("yellow", "green"))
summary(fruit)

# Tabulate them (rows = type, columns = yellow/green):
fruit.table <- table(fruit$type, fruit$green)
fruit.table

## Estimates and intervals for p the "old fashioned" way ----
# See page 7 in Lecture 7.  n0 = failures (yellow), n1 = successes (green).
p.ests <- data.frame(name = c("banana", "apple", "melon"),
                     n0 = as.vector(fruit.table[, 1]),
                     n1 = as.vector(fruit.table[, 2]))
p.ests$n <- p.ests$n0 + p.ests$n1
p.ests$phat <- p.ests$n1 / p.ests$n
p.ests$npq <- p.ests$n * p.ests$phat * (1 - p.ests$phat)
p.ests$se <- sqrt(p.ests$phat * (1 - p.ests$phat) / p.ests$n)
p.ests$lo <- p.ests$phat - 1.96 * p.ests$se
p.ests$hi <- p.ests$phat + 1.96 * p.ests$se
p.ests

## Now with logistic regression ----
# glm = Generalized Linear Model
model <- glm(green ~ type, data = fruit, family = "binomial")
model
summary(model)
model$coefficients        # beta-estimates (log odds(ratios))
exp(model$coefficients)   # exp(beta)-estimates (odds(ratios))

# Confidence intervals for the log-odds(ratios) beta,
# using the sqrt(1/n + ...) version on page 13 in Lecture 7:
se.b0 <- sqrt(1 / p.ests$n0[1] + 1 / p.ests$n1[1])
se.b1 <- sqrt(1 / p.ests$n0[1] + 1 / p.ests$n1[1] +
              1 / p.ests$n0[2] + 1 / p.ests$n1[2])
se.b2 <- sqrt(1 / p.ests$n0[1] + 1 / p.ests$n1[1] +
              1 / p.ests$n0[3] + 1 / p.ests$n1[3])
ci.beta <- rbind(model$coefficients[1] + 1.96 * se.b0 * c(-1, 1),
                 model$coefficients[2] + 1.96 * se.b1 * c(-1, 1),
                 model$coefficients[3] + 1.96 * se.b2 * c(-1, 1))
ci.beta
exp(ci.beta)  # ... and for exp(beta)

# Confidence intervals for beta_j using the fitted logistic model:
confint(model)
# Confidence intervals for the odds(ratios) exp(beta_j):
exp(confint(model))

## Predictions ----
# Predict the log-odds beta0 + beta1*x = X*beta:
x0 <- data.frame(type = c("banana", "apple", "melon"))
predict(model, x0)
# Log-odds with standard errors:
pred <- predict(model, x0, se.fit = TRUE)
pred$fit
pred$se.fit
# Confidence interval for the log-odds:
ci.logodds <- cbind(lo = pred$fit - 1.96 * pred$se.fit,
                    hi = pred$fit + 1.96 * pred$se.fit)
ci.logodds
# Predicted probability of "success" (green) = exp(X*beta)/(1+exp(X*beta)):
predict(model, x0, type = "response")
# Confidence interval for the probabilities:
exp(ci.logodds) / (1 + exp(ci.logodds))