## R Code from "The R System - An Introduction and Overview" ## Chapter 1: Introduction ## Sec 1.1: Commentary on R ## ss 1.1.1 General ## ss 1.1.2 Getting help ## ss 1.1.3 Use of an editor as a run-time environment ## ss 1.1.4 The development model, and development strategies ## ss 1.1.5 Unifying ideas ## ss 1.1.6 Retrospect, prospect and alternatives to R ## ss 1.1.7 Data set size, and databases ## ss 1.1.8 The statistics of data collection ## Sec 1.2: Installation of R and of R Packages ## ss 1.2.1 Help for installation under windows ## Sec 1.3: Documentation ## Chapter 2: An Overview of R ## Sec 2.1: Use of the console (i.e., command line) window 2+2 q() ## ss 2.1.1 Practice with R commands 1:5 # The numbers 1, 2, 3, 4, 5 mean(1:5) sum(1:5) # Apply the sum function to the # the vector of numbers 1, 2, 3, 4, 5 (2:5)^10 # 2 to the power of 10, 3 to the power of 10, ... log2(c(0.5, 1, 2, 4, 8)) # Values that differ by a factor of 2 # are, on this scale, one unit apart. ## Sec 2.2: Demonstrations demo() demo(image) demo(graphics) demo(persp) demo(plotmath) # Mathematical symbols can be visually interesting library(lattice) demo(lattice) # Demonstrates lattice graphics demo(package = .packages(all.available = TRUE)) library(vcd) # The vcd package must of course be installed. demo(mosaic) ## ss 2.2.1 Examples that are included on help pages example(plot) ## Sec 2.3: A Short R Session ## ss 2.3.1 Entry of vector elements from the command line volume <- c(351, 955, 662, 1203, 557, 460) weight <- c(250, 840, 550, 1360, 640, 420) description <- c("Aird's Guide to Sydney", "Moon's Australia handbook", "Explore Australia Road Atlas", "Australian Motoring Guide", "Penguin Touring Atlas", "Canberra - The Guide") ## ss 2.3.2 Operations with \txtt{vectors} volume # Final element of volume volume[6] ## Ratio of weight to volume, i.e., density round(weight/volume,2) ## ss 2.3.3 A simple plot ## Code plot(weight ~ volume, pch=16, cex=1.5) # pch=16: use solid blob as plot symbol # cex=1.5: point size is 1.5 times default ## Alternative plot(volume, weight, pch=16, cex=1.5) plot(weight ~ volume, pch=16, cex=1.5, xlab="Volume (cubic mm)", ylab="Weight (g)") identify(weight ~ volume, labels=description) ## Sec 2.4: Data frames -- Grouping together columns of data ## NB, the row names will now be shortened travelbooks <- data.frame( thickness = c(1.3, 3.9, 1.2, 2, 0.6, 1.5), width = c(11.3, 13.1, 20, 21.1, 25.8, 13.1), height = c(23.9, 18.7, 27.6, 28.5, 36, 23.4), weight = weight, # Include values of weight, entered earlier volume = volume, # Include values of volume, entered earlier type = c("Guide", "Guide", "Roadmaps", "Roadmaps", "Roadmaps", "Guide"), row.names = description ) ## Remove objects that are not now needed. rm(volume, weight, description) ## ss 2.4.1 Accessing the columns of data frames travelbooks[, 4] travelbooks[, "weight"] travelbooks$weight travelbooks[["weight"]] # This treats the data frame as a list. ## 1: Use the data parameter in the function call plot( weight ~ volume, data=travelbooks) # ## 2: Use attach() to include the column names in the search list attach(travelbooks) plot( weight ~ volume) detach(travelbooks) # Detach when no longer required # ## 3: Use with(); attaches for the duration of the statement with(travelbooks, plot(weight ~ volume)) ## ss 2.4.2 Input of Data ## Place the file in the working directory library(DAAGxtras) # DAAGxtras has the needed function dataFile("travelbooks") # Place file in directory file.show("travelbooks.txt") # Display travelbooks.txt ## Now read the file in travelbooks <- read.table("travelbooks.txt") # Row 1 of the file gives column names. Column 1 gives row names # Explicitly specify details of header and row name information travelbooks <- read.table("travelbooks.txt", header=TRUE, row.names=1) ## Sec 2.5: Help, and examples help(help) # Get help on help() help(mean) example(mean) example(lowess) # Fit a smooth curve to scatterplot data example(image) example(contour) example(filled.contour) par(ask=FALSE) # From here on, plot without asking help.search("bar") example(barplot) par(ask=FALSE) apropos("str") ## ss 2.5.1 Vignettes vignette() vignette(package="graph") vignette(package="e1071") # e701 includes functions for # Support Vector Machines (SVMs) vignette(package="mcmc") # Markov Chain Monte Carlo vignette("graph") # Equivalent to vignette(topic="graph") ## Sec 2.6: Summary ## Chapter 3: The Working Environment of an R Session ## Sec 3.1: The Working Directory and the Workspace ## ss 3.1.1 Listing Workspace Contents ls() ls(pattern="^w") ## ss 3.1.2 Setting the Working Directory ## Sec 3.2: Saving and retrieving R objects save(volume, weight, file="books.RData") # Can save many objects in the same file load("books.RData") # Recover the saved objects dump(c("volume", "weight"), file="books.R") source("books.R") ## ss 3.2.1 Writing data frames to text files ## Sec 3.3: Installations, packages and sessions ## ss 3.3.1 The architecture of an R installation -- Packages sessionInfo() NA Installation of R packages install.packages(pkgs="D:/DAAG_0.91.zip", repos=NULL) ## ss 3.3.2 The search path: library() and attach() search() NA Attachment of R packages NA Attachment of image files attach("books.RData") detach("file:books.RData") ## Sec 3.4: Summary ## Chapter 4: Objects and Functions ## Sec 4.1: Columns of Data, \& Data Frames ## ss 4.1.1 Vectors c(2,3,5,2,7,1) 3:10 # The numbers 3, 4,.., 10 c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE) c("cherry","mango","apple","prune") ## ss 4.1.2 Subsets of Vectors x <- c(3,11,8,15,12) # Assign to x the values x[c(2,4)] # Extract elements (rows) 2 and 4 x <- c(3,11,8,15,12) x[-c(2,3)] x>10 # This generates a vector of logical x[x > 10] library(DAAG) cuckooEgglengths <- cuckoohosts[,1] cuckooEgglengths ## Assign names to the vector elements names(cuckooEgglengths) <- rownames(travelbooks) cuckooEgglengths ## Names can be used to extract elements cuckooEgglengths[c("hedge.sparrow", "robin", "wren")] subset(cuckooEggLengths, cuckooEggLengths > 23) ## ss 4.1.3 Factors fruit <- c("cherry","mango","apple","prune","cherry") fruitfac <- factor(fruit) as(fruitfac, "numeric") levels(fruitfac) ## Take fruit in order of stated glycemic index (15, 22, 38, 55) fruitfac <- factor(fruit, levels=c("prune","cherry","apple","mango")) levels(fruitfac) fruitfac <- factor(fruitfac, levels=c("prune","cherry","Apple","mango")) table(fruitfac) # The number of Apples is given as 0 fruitfac <- factor(c("cherry","mango","apple","prune","cherry")) fruitfac == "cherry" ## ss 4.1.4 Ordered factors windowTint <- ordered(rep(c("lo","med","hi"), 2), windowTint ## ss 4.1.5 Subsets of data frames travelbooks$weight > 500 ## Result is a vector of logicals subset(travelbooks, weight > 500) ## ss 4.1.6 Data frames -- Lists of Columns ## ss 4.1.7 Inclusion of character vectors in data frames ## Sec 4.2: Missing Values, Infinite Values and NaNs is.na(c(1, NA, 3, 0, NA)) c(1, NA, 3, 0, NA) == NA y <- c(1, NA, 3, 0, NA) y[y > 0] y[y > 0] <- c(11, 12) ## ss 4.2.1 Identifying and processing rows that include missing values test.df <- data.frame(x=c(1:2,NA), y=1:3) test.df complete.cases(test.df) na.omit(test.df) ## Sec 4.3: The Use of Data Frames in Correlation and Regression travelbooks$area <- with(travelbooks, width*height) travelbooks$density <- with(travelbooks, weight/volume) names(travelbooks) # Check full set of column names pairs(travelbooks[, -6]) # Omit column 6 ## Alternative plot(travelbooks[,-6]) ## ss 4.3.1 Correlation calculations round(cor(travelbooks[, -6]),2) ## ss 4.3.2 Dependence of \txtt{weight} on \txtt{volume} wtvol.lm <- lm(weight ~ volume, data=travelbooks) wtvol.lm wtvol0.lm <- lm(weight ~ -1 + volume, data=travelbooks) wtvol0.lm ## ss 4.3.3 Summary information -- print(), summary() and plot() print(wtvol.lm) # Equivalent to typing wtvol.lm at the command line summary(wtvol.lm) par(mfrow=c(2,2)) # Subsequent plots appear in a 2 x 2 layout plot(wtvol.lm) par(mfrow=c(1,1)) # Reset to 1 plot per page, for any later plots ## Sec 4.4: Matrices -- Vectors with a Dimension Attribute ## ss 4.4.1 Data frames versus matrices travelmat <- as.matrix(travelbooks[, 1:4]) dim(travelmat) # Equivalent to attr(travelmat, "dim") attr(travelmat, "dim") ## ss 4.4.2 Removal of the dimension attribute travelvec <- as.matrix(travelbooks[, 1:4]) dim(travelvec) <- NULL # Columns of travelmat become one long travelvec # Use of as(travelmat, "vector") is however preferable # For these data, why would one do this? ## ss 4.4.3 Submatrices ## ss 4.4.4 Matrix Manipulations X + Y # Elementwise addition (The dimensions must agree) X * Y # Elementwise multiplication (The dimensions must agree) X %*% B # Matrix multiplication (X is n by k; B k by m) solve(X, Y) # Solve X B = Y (X is n by k, Y n by m, B k by m) svd(X) # Singular value decomposition of X qr(X) # QR decomposition of X. ## ss 4.4.5 Data frames versus matrices length(travelbooks) # travelbooks is a list of 7 vectors length(as(travelbooks[,1:4], "matrix")) # matrix has 28 elements travelmat <- as.matrix(travelbooks[, 1:4]) # From data frame to matrix newtravelbooks <- as.data.frame(travelmat) # From matrix to data frame ## Sec 4.5: Lists ## ss 4.5.1 Lists of matrices library(sma) data(MouseArray) # MouseArray holds several data objects ls() # Check contents of workspace ## ss 4.5.2 Model objects are lists names(wtvol.lm) wtvol.lm$call coef(wtvol.lm) resid(wtvol.lm) ## Sec 4.6: Functions ## ss 4.6.1 Built-In Functions ## ss 4.6.2 Common useful functions, for use with vectors mean(c(1, NA, 3, 0, NA), na.rm=T) ## ss 4.6.3 User-defined functions mean.and.sd <- function(x){ av <- mean(x) sdev <- sd(x) c(mean=av, sd = sdev) # The function returns this vector } hetero <- c(.43,.25,.53,.47,.81,.42,.61) mean.and.sd(hetero) mean.and.sd <- function(x = rnorm(20)) c(mean=mean(x), sd=sd(x)) mean.and.sd() mean.and.sd() ## ss 4.6.4 Information on R Objects ## ss 4.6.5 Information on data objects str(travelbooks) ## ss 4.6.6 Information on function arguments ## ss 4.6.7 Tables and Cross-Tabulation ## ss 4.6.8 table() table(possum$Pop, possum$sex) library(MASS) table(is.na(Pima.tr2$bp)) sapply(Pima.tr2, function(x)sum(is.na(x))) x <- factor(x, exclude=NULL) ## ss 4.6.9 Creating groups catBP <- cut(Pima.tr2$bp, breaks=4) table(catBP) sum(table(catBP)) table(is.na(catBP)) catBP <- factor(catBP, exclude=NULL) table(catBP) sum(table(catBP)) ## Sec 4.7: Option Settings ## ss 4.7.1 Setting the number of significant digits in output sqrt(10) options(digits=2) # Change until further notice, # or until end of session. sqrt(10) round(sqrt(372/12), 2) round(sqrt(2) * sqrt(372/12), 2) round(sqrt(2) * 5.57, 2) ## ss 4.7.2 Other option settings ## Sec 4.8: Common Sources of Difficulty ## Sec 4.9: Summary ## Attach an archive that was created in an earlier session attach("archive.RData") ## Attach the image file from another directory attach("c:/drosophila/.RData") # NB, forward slashes ## Sec 4.10: Exercises gi14786865 <- "LNLFFAGTETVSTTLRYGFLLLMKHPEVEAKVHEEI" o4cka3 <- "LNIMVAGRDTTAGLLSFAMFELARNPKIWNKLREEV" ## Chapter 5: Base Graphics demo(graphics) library(lattice) demo(lattice) ## Sec 5.1: \txtt{plot()} and allied functions plot(y ~ x) # Use a formula to specify the graph plot(x, y) # Horizontal ordinate, then verhills20-0tical plot((0:20)*pi/10, sin((0:20)*pi/10)) plot((1:30)*0.92, sin((1:30)*0.92)) library(DAAG) attach(elasticband) # R can now find distance & stretch plot(distance ~ stretch) plot(ACT ~ year, data=austpop, type="l") plot(ACT ~ year, data=austpop, type="b") detach(elasticband) with(austpop, plot(spline(Year, ACT), type="l") # Fit smooth curve through points ) ## ss 5.1.1 Newer plot methods plot(travelbooks[,1:5]) # Has the same effect as pairs(travelbooks[, 1:5]) library(lattice) splom(~ travelbooks[, 1:5]) # Lattice alternative to pairs plot ## Sec 5.2: Fine control -- Parameter settings oldpar <- par(cex=1.25, col="red") ## ss 5.2.1 Adding Text in the Margin ## Sec 5.3: Adding points, lines and text -- examples ## Data used in plot primates # DAAG package attach(primates) plot(Bodywt, Brainwt, xlim=c(0, 250), xlab="Body weight (kg)", ylab="Brain weight (g)") # Specify xlim so that there is room for the labels text(x=Bodywt, y=Brainwt, labels=rownames(primates), pos=4) # Alternatives are pos=1 (below), 2 (left), 3 (above) detach(primates) ## ss 5.3.1 Example -- Labels that locate possum study sites possumsites # DAAG package attach(possumsites) ## Ensure that the oz package is installed library(oz) oz() text(longitude ~ latitude, labels=rownames(possumsites), adj=1, col="red") ## Code used for plot: oz(sections=c(3:5, 11:16), col="gray", xlim=c(130, 165)) chh <- par()$cxy[2] points(latitude, longitude+c(0,0,0,.2,-.2, 0,0)*chh, col="blue") text(longitude ~ latitude, labels=rownames(possumsites), pos=c(2,4,2,1,3,2,2), col="red", xpd=TRUE) # pos = 1: below, 2: left, 3:above, 4: right # xpd=TRUE may plot outside figure region detach(possumsites) ## ss 5.3.2 Multiple plots on the one page oldpar <- par(mfrow=c(2,2), pch=16) library(MASS) with(Animals, { # bracket several R statements plot(body, brain) plot(sqrt(body), sqrt(brain)) plot((body)^0.1, (brain)^0.1) plot(log(body),log(brain)) }) # close both sets of brackets par(oldpar) # Restore to 1 figure per page, and pch=1 ## ss 5.3.3 Color palettes view.colours <- function(){ plot(1, 1, xlim=c(0,14), ylim=c(0,3), type="n", axes=F, xlab="",ylab="") text(1:6, rep(2.5,6), paste(1:6), col=palette()[1:6], cex=2.5) text(10, 2.5, "Default palette", adj=0) rainchars <- c("R","O","Y","G","B","I","V") text(1:7, rep(1.5,7), rainchars, col=rainbow(7), cex=2.5) text(10, 1.5, "rainbow(7)", adj=0) cmtxt <- substring("cm.colors", 1:9,1:9) # Split "cm.colors" into its 9 characters text(1:9, rep(0.5,9), cmtxt, col=cm.colors(9), cex=3) text(10, 0.5, "cm.colors(9)", adj=0) } view.colours() ## ss 5.3.4 The shape of the graph sheet ## Sec 5.4: Identification and Location on the Figure Region with(travelbooks, plot(weight ~ volume) locator() ## Sec 5.5: Plots that show the distribution of data values ## ss 5.5.1 Histograms and density plots par(mfrow = c(1, 2), pty="s") # pty="s"; square plots attach(possum) here <- sex == "f" hist(totlngth[here], breaks = 72.5 + (0:5) * 5, ylim = c(0, 22), xlab="Total length", main ="A: Breaks at 72.5, 77.5,...") hist(totlngth[here], breaks = 75 + (0:5) * 5, ylim = c(0, 22), xlab="Total length", main="B: Breaks at 75, 80,...") par(mfrow=c(1,1)) detach(possum) with(subset(possum, sex=="f"), hist(totlngth)) with(subset(possum, sex=="f"), rug(totlngth)) with(subset(possum, sex=="f"), dotchart(totlngth)) with(subset(possum, sex=="f"), stripchart(totlngth)) ## ss 5.5.2 Density Plots with(possum, plot(density(totlngth[here]),type="l")) attach(possum) here <- sex == "f" dens <- density(totlngth[here]) xlim <- range(dens$x) ylim <- range(dens$y) hist(totlngth[here], breaks = 72.5 + (0:5) * 5, probability = T, xlim = xlim, ylim = ylim, xlab="Total length", main="") lines(dens) detach(possum) ## ss 5.5.3 Boxplots ## Code with(subset(possum, sex=="f"), boxplot(totlngth, horizontal=TRUE)) with(subset(possum, sex=="f"), rug(totlngth)) with(subset(possum, sex=="f"), rug(totlngth)) ## ss 5.5.4 Side by side boxplots ## Code used for plot: boxplot(length ~ species, data=cuckoos, xlab="Length of egg", horizontal=TRUE, las=2) ## las=2: Plot labels perpendicular to axis. ## ss 5.5.5 Normal probability plots x11(width=8, height=6) # This is a better shape for this plot attach(possum) here <- sex == "f" par(mfrow=c(3,4)) # 3 by 4 layout of plots y <- totlngth[here] qqnorm(y,xlab="", ylab="Length", main="Possums") for(i in 1:11)qqnorm(rnorm(43), xlab="", ylab="Simulated lengths", main="Simulated") par(mfrow=c(1,1)) # By default, rnorm() generates random samples from a normal # distribution with mean 0 and standard deviation equal to 1. detach(possum) ## Sec 5.6: Scatterplot Smoothing ## Code attach(mtcars) # From the datasets package plot(mpg ~ disp, xlab=expression("Displacement (" * " in"^3 * ")"), ylab="Miles per gallon") lines(lowess(mpg ~ disp, f=0.75)) detach(mtcars) ## ss 5.6.1 The female athletes (AIS) data attach(ais) here<- sex=="f" plot(pcBfat[here]~ht[here], xlab = "Height", ylab = "% Body fat") panel.smooth(ht[here],pcBfat[here]) abline(lm(pcBfat[here] ~ ht[here])) detach(ais) ## Sec 5.7: Plotting Mathematical Symbols ## Code used for plot: r <- seq(0.1, 8.0, by=0.1) plot(r, pi * r^2, xlab=expression(Radius == r), ylab=expression(Area == pi*r^2), type="l") # NB: Use ==, within an expression, to print = demo(graphics) ## Sec 5.8: Multi-way Tables -- Mosaic Plots class(UCBAdmissions) dim(UCBAdmissions) dimnames(UCBAdmissions) UCBAdmissions[, , "A"] # Equivalent to UCBAdmissions[, , 1] mosaicplot(UCBAdmissions, color=TRUE) mosaicplot(aperm(UCBAdmissions, c(2,1,3)), color=TRUE) ## Sec 5.9: Regular Graphics Functions -- Additional Points ## ss 5.9.1 The variety of R graphics functions ## ss 5.9.2 Plots with Large Numbers of Points ## ss 5.9.3 Inclusion of Graphs in Other Documents ## Sec 5.10: Exercises huron <- data.frame(year=as(time(LakeHuron), "vector"), mean.height=LakeHuron) identify(huron$year, huron$mean.height, labels=huron$year) lag.plot(huron$mean.height) plot(LakeHuron) identify(LakeHuron, labels=time(LakeHuron)) ## Chapter 6: Lattice Graphics ## Sec 6.1: Panels of Scatterplots -- Examples of the Use of xyplot() library(DAAG) xyplot(csoa~it|sex*agegp, data=tinting) ## The code is: xyplot(csoa~it|sex*agegp, data=tinting, auto.key=list(columns=2), groups=target) xyplot(csoa~it|sex*agegp, data=tinting, groups=target, auto.key=list(columns=3), type=c("p","smooth")) xyplot(csoa ~ it|sex*agegp, data=tinting, groups=tint, auto.key=list(columns=3)) ## Sec 6.2: Annotation -- the \txtt{auto.key}, \txtt{key} and\txtt{legend} arguments ## Sec 6.3: Trellis settings names(trellis.par.get()) trellis.par.get("superpose.symbol") ## ss 6.3.1 \txtt{par.settings} -- an example trellis.device() xyplot(csoa ~ it|sex*agegp, data=tinting, groups=tint, type=c("p","smooth"), span=0.9, par.settings=list(superpose.symbol=list(col=c("gray","gray","black"), pch=c(1,16,16)), superpose.line=list(col=c("grey", "gray", "black"), lty=c(1,2,4), lwd=c(2,2,1))), auto.key=list(columns=3, lines=TRUE)) # The parameter "span" controls the extent of smoothing. ## Sec 6.4: Overlaid plots xyplot(BC+Alberta+Prairies+Ontario+Quebec+Atlantic ~ Date, data=jobs, ylab="Number of jobs", type="l") xyplot(BC+Alberta+Prairies+Ontario+Quebec+Atlantic ~ Date, data=jobs, type="l", layout=c(3,2), scales=list(y=list(relation="sliced", log=TRUE)), outer=TRUE) ylabpos <- exp(pretty(log(unlist(jobs[,-7])), 100)) ylabels <- paste(round(ylabpos),"\n(", log(ylabpos), ")", sep="") ## Create a date object 'startofmonth'; use this instead of 'Date' startofmonth <- seq(from=as.Date("1Jan1995", format="%d%b%Y"), by="1 month", length=24) atdates <- seq(from=as.Date("1Jan1995", format="%d%b%Y"), by="6 month", length=4) datelabs <- format(atdates, "%b%y") xyplot(BC+Alberta+Prairies+Ontario+Quebec+Atlantic ~ startofmonth, data=jobs, type="l", layout=c(3,2), scales=list(x=list(at=atdates, labels=datelabs), y=list(relation="sliced", log=TRUE, at=ylabpos, labels=ylabels)), xlab="Month", ylab="Number of workers", outer=TRUE) ## Sec 6.5: Lattice Style Stripplots and Boxplots ## The first two lines ensure display of the species name levels(cuckoos$species) <- sub(".", " ", levels(cuckoos$species), fixed=T) stripplot(species ~ length, xlab="Length of egg", aspect=0.5, data=cuckoos) ## NB. The lattice function is bwplot() bwplot(species ~ length, xlab="Length of egg", data=cuckoos) ## Sec 6.6: Inclusion of lattice graphics functions in user functions xyplot(csoa ~ it | sex * agegp, data=tinting) print(xyplot(csoa ~ it | sex * agegp, data=tinting)) print(xyplot(csoa ~ it | sex * agegp, data=tinting)) tinting.trellis <- xyplot(csoa ~ it | sex * agegp, data=tinting) print(tinting.trellis) ## Sec 6.7: Dotplots dotplot(variety ~ yield | site, data = barley, groups = year, key = simpleKey(levels(barley$year), space = "right"), xlab = "Barley Yield (bushels/acre) ", aspect = 0.5, layout = c(1, 6), ylab = NULL) deathrate <- c(40.7, 36,27,30.5,27.6,83.5) ord <- order(deathrate) hosp <- c("Cliniques of Vienna (1834-63)\n(> 2000 cases pa)", "Enfans Trouves at Petersburg\n(1845-59, 1000-2000 cases pa)", "Pesth (500-1000 cases pa)", "Edinburgh (200-500 cases pa)", "Frankfort (100-200 cases pa)", "Lund (< 100 cases pa)") hosp <- factor(hosp, levels=hosp[order(deathrate)]) dotplot(hosp ~ deathrate, xlim=c(0,110), cex=1.5, scale=list(cex=1.25), type=c("h","p"), xlab=list("Death rate per 1000 ", cex=1.5), sub="From Nightingale (1871) - data from Dr Le Fort") ## Sec 6.8: Lattice Style Density Plots densityplot(~earconch | Pop, groups=sex, data=possum) possumsites$site <- 1:7 possumsites$Site <- factor(rownames(possumsites)) possums <- merge(possum, possumsites[, 4:5], by="site") densityplot(~earconch | sex, group=Site, data=possums, aspect=1.5) densityplot(~earconch | sex, group=Site, data=possums, aspect=1.5, key=simpleKey(text=levels(possums$Site), points=TRUE)) ## Sec 6.9: An incomplete list of lattice Functions example(bwplot) ## Sec 6.10: Summary ## Chapter 7: Fitting Statistical Models ## Sec 7.1: A Regression Model library(DAAG) names(races2000) # Require dist, climb, time and timef hills2k <- races2000[races2000$type=="hill", 7:10] plot(hills2k) # Equivalent to pairs(hills2k) plot(log(hills2k)) # Check what difference this makes match("Caerketton", rownames(hills2k)) hills2k[42, "dist"] loghills2k <- log(hills2k[, ]) names(loghills2k) <- c("ldist", "lclimb", "ltime", "ltimef") loghills2k.lm <- lm(ltime ~ ldist + lclimb, data=loghills2k) par(mfrow=c(2,2)) plot(loghills2k.lm) # Diagnostic plot ## Plot the terms in the model termplot(loghills2k.lm, col.term="gray", partial=TRUE, col.res="black", smooth=panel.smooth) library(splines) # Attach the splines package loghills2ks.lm <- lm(ltime ~ ns(ldist,3) + ns(lclimb,4), data=loghills2k) library(MASS) loghills2ks.rlm <- rlm(ltime ~ ns(ldist,3) + ns(lclimb,4), data=loghills2k) par(mfrow=c(2,2)) plot(loghills2ks.lm) # Diagnostic plot, from lm fit plot(loghills2ks.rlm) # Diagnostic plot, from rlm fit loghills2k.rlm <- lm(ltime ~ ns(ldist,4) + ns(lclimb,3), data=loghills2k, subset=row.names(loghills2k)!="Caerketton") plot(loghills2ks.lm) termplot(loghills2ks.lm, col.term="gray", partial=TRUE, col.res="black", smooth=panel.smooth) summary(loghills2ks.lm) ## ss 7.1.1 The Basis Functions bases <- model.matrix(loghills2ks.lm) colnames(bases) options(digits=3) bases[1:5,] par(mfrow=c(2,2)) for (i in 0:3)plot(loghills2k$lclimb, bases[,5+i]) par(mfrow=c(1,1)) ## Sec 7.2: Models that Include Factor Terms -- Contrasts ## ss 7.2.1 Example -- sugar weight ## Display model matrix: uses data frame sugar (DAAG) sugar.aov <- aov(weight ~ trt, data=sugar) model.matrix(sugar.aov) summary(sugar.aov) ## ss 7.2.2 Different choices for the model matrix when there are factors ## Omit constant term from fit; ## force parameters to estimate treatment means sugar.aov <- aov(weight ~ -1 + trt, data=sugar) model.matrix(sugar.aov) options(oldoptions) # Restore default treatment contrasts ## ss 7.2.3 Interaction terms ## Sec 7.3: Hierarchical Multi-level Models # ant111b is in DAAG stripplot(site ~ harvwt, data=ant111b) stripplot(harvwt ~ site, data=ant111b, scales=list(x=list(rot=90))) ## ss 7.3.1 Analysis of the Antiguan corn yield data ## ss 7.3.2 Variance components -- a multi-level model ## ss 7.3.3 Analysis using \textit{lme} library(nlme) ant111b.lme <- lme(fixed=harvwt ~ 1, random = ~1 | site, data=ant111b) ant111b.lme ## Sec 7.4: Additional Calculations ## ss 7.4.1 Fitted values and residuals in \textit{lme} hat.lm <- fitted(lm(harvwt ~ site, data=ant111b)) hat.lme <- fitted(ant111b.lme) # By default, level=1) plot(hat.lme ~ hat.lm, xlab="Location means", ylab="Fitted values (BLUPS) from lme") abline(0,1,col="red") # Show the line y=x hat0.lme <- fitted(ant111b.lme, level=0) res0.lme <- resid(ant111b.lme, level=0) plot(res0.lme, ant111b$harvwt - hat0.lme) # Points lie on y=x abline(0,1,col="red") ## ss 7.4.2 Analysis using \txtt{lmer()}, from \textit{lme4} ## ss 7.4.3 Classification models ## Sec 7.5: Models \& methods -- a more complete list ## Sec 7.6: Exercises volume <- apply(oddbooks[, 1:3], 1, prod) area <- apply(oddbooks[, 2:3], 1, prod) lob1.lm <- lm(log(weight) ~ log(volume), data=oddbooks) lob2.lm <- lm(log(weight) ~ log(thick)+log(area), data=oddbooks) lob3.lm <- lm(log(weight) ~ log(thick)+log(breadth)+log(height), data=oddbooks) ## Chapter 8: Multivariate methods ## Sec 8.1: Ordination audists # From DAAGxtras aupoints <- cmdscale(audists) library(MASS) eqscplot(aupoints) text(aupoints, labels = paste(rownames(aupoints))) library(ape) webpage <- "http://evolution.genetics.washington.edu/book/primates.dna" primates.dna <- read.dna(url(webpage)) primates.dist <- dist.dna(primates.dna, model = "F84") ## Use Kimura's F84 model primates.cmd <- cmdscale(primates.dist) eqscplot(primates.cmd) rtleft <- c(4, 2, 4, 2)[unclass(cut(primates.cmd[, 1], breaks = 4))] text(primates.cmd[, 1], primates.cmd[, 2], row.names(primates.cmd), pos = rtleft) > d <- dist(primates.cmd) > sum((d - primates.dist)^2)/sum(primates.dist^2) [1] 0.1977 > primates.cmd <- cmdscale(primates.dist, k = 3) > library(lattice) > cloud(primates.cmd[, 3] ~ primates.cmd[, 1] * primates.cmd[, 2]) > d <- dist(primates.cmd) > sum((d - primates.dist)^2)/sum(primates.dist^2) [1] 0.1045 ## Sec 8.2: Classification ## Chapter 9: Common Uses for Key Language Ideas ## Sec 9.1: Generic Functions (Classes and Methods) print methods(plot) ## ss 9.1.1 Namespaces ## ss 9.1.2 S4 Classes and Methods -- the \textit{methods} package ## ant111b is in the DAAG package library(lme4) # lme4 must be installed ant111b.lmer <- lmer(harvwt ~ 1 + (1 | site), data = ant111b) slotNames(ant111b.lmer) slot(ant111b.lmer, "assign") slot(ant111b.lmer, "call") ant111b.lmer@call VarCorr(ant111b.lmer) ## ss 9.1.3 Model, Graphics and Table Formulae ## ss 9.1.4 Expressions ## ss 9.1.5 Manipulation of Language Constructs ## Sec 9.2: Manipulation of Formulae ## ss 9.2.1 Model and Graphics Formulae plot.mtcars <- function(xvar="disp", yvar="mpg"){ mt.txt <- paste(yvar, "~", xvar) plot(formula(mt.txt), xlab=xvar, ylab=yvar) } attach(mtcars) names(mtcars) plot.mtcars(xvar="hp", yvar="mpg") ## ss 9.2.2 Extraction of Variable Names from Formula Objects all.vars(mpg ~ disp) plot.mtcars <- function(form = mpg~disp){ yvar <- all.vars(form)[1] xvar <- all.vars(form)[2] ## Include information that allows a meaningful label mtcars.info <- c(mpg= "Miles/(US) gallon", cyl= "Number of cylinders", disp= "Displacement (cu.in.)", hp= "Gross horsepower", drat= "Rear axle ratio", wt= "Weight (lb/1000)", qsec= "1/4 mile time", vs= "V/S", am= "Transmission (0 = automatic, 1 = manual)", gear= "Number of forward gears", carb= "Number of carburettors") xlab <- mtcars.info[xvar] ylab <- mtcars.info[yvar] plot(form, xlab=xlab, ylab=ylab) } ## Sec 9.3: Expressions plot.mtcars <- function(x = disp, y = mpg){ xvar <- deparse(expression(x)) yvar <- deparse(expression(y)) plot(y ~ x, xlab=xvar, ylab=yvar) } ## ss 9.3.1 Formatting and plotting of text and equations expr <- expression(x^2) expr <- expression(x^2) x <- 5 eval(expr) eval(expr, list(x=7)) x <- 1:10 plot(x, x^2, ylab=expression(x^2)) ## ss 9.3.2 Plotting expressions -- detailed examples italic("Acmena smithii") * ":" * phantom(0) * "wood vs dbd" ## Plot Acmena smithii data (subset of rainforest, from DAAG) Acmena <- subset(rainforest, species=="Acmena smithii") ## Plot the graph plot(wood~dbh, data=Acmena, xlim=c(0, max(Acmena$dbh))) ## In fitting the curve, omit the point where dbh is largest b <- coef(lm(log(wood) ~ log(dbh), data=Acmena, subset=dbh<40)) largest2 <- sort(Acmena$dbh, decreasing=TRUE)[2:1] curve(exp(b[1])*x^b[2], from=min(Acmena$dbh), to=largest2[1], add=TRUE) curve(exp(b[1])*x^b[2], from=largest2[1], to=largest2[2], lty=2, add=TRUE) ## Finally, use the code given below to add the legend and title mtext(side=3, line=1.5, cex=1.1, italic("Acmena smithii") * ":" * phantom(0) * "wood vs dbd) ## ss 9.3.3 Symbolic substitution in expressions mtext(side=3, line=1.5, substitute(italic(tx) * ": " * "wood vs dbh", list(tx="Acmena smithii")), cex=1.1) ## Code used to add legend to graph b <- round(b,3) arg1 <- bquote(italic(y) == .(A) * italic(x)^.(B), list(A=b[1], B=b[2])) arg2 <- quote("(" * italic(y) * "=wood; " * italic(x) * "=dbh)") legend("topleft", legend=do.call("expression", c(arg1, arg2)), bty="n") mtext(side=3, line=1.5, substitute(italic(tx) * ": " * "wood vs dbh", list(tx="Acmena smithii")), cex=1.1) ## ss 9.3.4 Another example of the use of \txtt{substitute()} random.weibull <- function(n=100, shape=1.75){ x <- rweibull(n=100, shape=shape) simden <- density(x, from=0) xval <- pretty(x,50) theoryden <- list(x=xval, y=dweibull(pretty(x,50), shape=shape)) plot(simden, ylim=range(c(simden$y, theoryden$y))) lines(theoryden, col="red") topright <- par()$usr[c(2,4)] - c(0, 1.5*par()$cxy[2]) text(topright[1], topright[2], substitute(atop("Density is" * phantom(0) * f(x) == (a/b) (x/b)^(a-1) * exp(- (x/b)^a), "with"*phantom(0)*list(a==z, b==1)), list(z=shape)), adj=1) } ## Run function random.weibull() ## Sec 9.4: The use of a list to pass parameter values mean(rnorm(20)) do.call("mean", args=list(x=rnorm(20))) ## No legal code: `average` <- function(x=possum$chest, FUN=function(x)mean(x)){ fun <- deparse(substitute(FUN)) do.call(fun, list(x=x)) } average() average(FUN=median) mean.call <- call("mean", x=rnorm(5)) eval(mean.call) eval(mean.call) mean.call ## Sec 9.5: Environments test <- function(){ fname <- as(sys.call(sys.parent())[[1]], "character") fname } test() newtest <- test newtest() gf <- function(width=2.25, height=2.25, color=F, pointsize=8){ funtxt <- sys.call(1) # Sea fnam <- paste(funtxt, ".pdf", sep="") print(paste("Output will be to the file '", fnam, "'", sep="")) pdf(file=fnam, width=width, height=height, pointsize= pointsize) } graph1 <- function(){ gf() # Call with default parameters curve(sin, -pi, 2*pi) dev.off() } ## ss 9.5.1 Bound and Unbound Variables ## Sec 9.6: Summary ## Sec 9.7: Exercises ## Chapter 10: Additional Notes on R ## Sec 10.1: Entry of data using read.table() and scan() ## ss 10.1.1 read.table() -- Errors when reading in data ## ss 10.1.2 read.table() -- Additional points ## ss 10.1.3 *The use of scan() for flexible data input col.names <- scan("molclock.txt", nlines=1, what="") molclock <- scan("molclock.txt", skip=1, what=c(list(""), rep(list(1),5))) molclock <- data.frame(molclock, row.names=1) # Column 1 supplies row names names(molclock) <- col.names ## ss 10.1.4 Large files -- Reading in part of a file ## Sec 10.2: The apply family of functions ## ss 10.2.1 The \txtt{apply()} function apply(molclock, 2, range) apply(UCBAdmissions, c(2,3), function(x)x[1]/sum(x)) apply(UCBAdmissions, c(1,2), sum) apply(apply(UCBAdmissions, c(1,2), sum), 2, function(x)x[1]/sum(x)) ## ss 10.2.2 The \txtt{sapply()} function sapply(molclock, range) sapply(molclock, range, na.rm=T) ## ss 10.2.3 More efficient alternatives to apply() and sapply() ## Sec 10.3: Logarithms, with some zero or negative numbers attach(mouse.data) min(R[R>0]) min(Rb[Rb>0]) min(G[G>0]) min(Gb[Gb>0]) detach(mouse.data) ## Sec 10.4: Computations with Large Datasets xy <- matrix(rnorm(500000),ncol=50) dim(xy) system.time(xy+1) ## Times are: user cpu, system cpu, elapsed, subproc1, subproc2 xy.df <- data.frame(xy) system.time(xy.df+1) xy <- matrix(rnorm(200000), nrow=2000) system.time(apply(xy,1,sum)) system.time(xy %*% rep(1,100)) system.time(rowSums(xy)) xy2 <- matrix(rnorm(200000), nrow=100) system.time(apply(xy2,1,sum)) system.time(xy2%*%rep(1,2000)) system.time(rowSums(xy2)) xy <- matrix(rnorm(200000), nrow=2000) dd <- sample(2000) system.time(diag(dd)%*%xy) system.time(sweep(xy, 1, dd, "*")) system.time(xy*dd) dd100 <- sample(100) system.time(xy%*%diag(dd100)) system.time(sweep(xy, 2, dd100, "*")) system.time(t(xy)*dd100) system.time(sv---d(xy)) # xy is 2000 x 100 xyt <- t(xy) system.time(svd(xyt)) # xy2 is 100 x 2000 ## ss 10.4.1 Data Base Connections ## Sec 10.5: Workspace management ## ss 10.5.1 The removal of clutter save.image(file="archive.Rdata") save.image(file="a31-1-03.Rdata") ## ss 10.5.2 Movement of files between computers ## ss 10.5.3 *Further possibilities -- saving objects in text form dump("molclock", file="molclock.R") source("molclock.R") # Use to retrieve molclock ## Sec 10.6: Debugging ## Sec 10.7: Summary ## Sec 10.8: References ## Chapter 11: Packages ## Sec 11.1: Base Packages ## Sec 11.2: Recommended packages ## Sec 11.3: Genetics and molecular biology ## ss 11.3.1 Genetics ## ss 11.3.2 Phylogenetics ## ss 11.3.3 Analysis of expression array data ## ss 11.3.4 The BioConductor bundle ## Sec 11.4: Graphics packages, including \textit{graphics}, \textit{lattice} and \textit{grid} ## Sec 11.5: Other contributed packages -- a small selection ## ss 11.5.1 Bayesian methods ## ss 11.5.2 Categorical data ## ss 11.5.3 GUI interface for statistics ## ss 11.5.4 Time series ## ss 11.5.5 Design of Experiments NA Design -- population genetics ## ss 11.5.6 The brewing and use of colors ## ss 11.5.7 Fun and recreational packages ## ss 11.5.8 Miscellaneous ## Sec 11.6: Multivariate analysis ## ss 11.6.1 cluster ## ss 11.6.2 Multivariate abilities in package \textit{stats} ## ss 11.6.3 MASS ## Chapter 12: References and Bibliography ## Sec 12.1: Books and Papers on R ## Sec 12.2: Graphics ## ss 12.2.1 Bioinformatics ## ss 12.2.2 Literature on trellis (lattice) graphics ## Sec 12.3: Influences on the Development of R -- Lisp-Stat