R version 2.10.1 (2009-12-14) Copyright (C) 2009 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. [R.app GUI 1.31 (5537) x86_64-apple-darwin9.8.0] > x=c(10,20) > y=c(30,40) > m=cbind(x,y) > m x y [1,] 10 30 [2,] 20 40 > class(m) [1] "matrix" > ?class starting httpd help server ... done > length(m) [1] 4 > dim(m) [1] 2 2 > m[1,] x y 10 30 > m[,2] [1] 30 40 > m[2,2] y 40 > m[,1]*m[,2] [1] 300 800 > m$x Error in m$x : $ operator is invalid for atomic vectors > m x y [1,] 10 30 [2,] 20 40 > rownames(m)=c('a','b') > m x y a 10 30 b 20 40 > m[,'y'] a b 30 40 > m$y Error in m$y : $ operator is invalid for atomic vectors > m x y a 10 30 b 20 40 > z=c('a','b') > m=cbind(x,y,z) > m x y z [1,] "10" "30" "a" [2,] "20" "40" "b" > m[,1]+m[,2] Error in m[, 1] + m[, 2] : non-numeric argument to binary operator > class(m) [1] "matrix" > class(m[,1]) [1] "character" > class(c(10,20)) [1] "numeric" > length(m) [1] 6 > dim(m) [1] 2 3 > m=cbind(x,y) > m x y [1,] 10 30 [2,] 20 40 > df=data.frame(x,y) > df x y 1 10 30 2 20 40 > class(df) [1] "data.frame" > length(df) [1] 2 > dim(df) [1] 2 2 > df$x [1] 10 20 > df[,'x'] [1] 10 20 > rownames(df)=c('a','b') > df[1,] x y a 10 30 > df['a',] x y a 10 30 > df[,1] [1] 10 20 > df[1,1] [1] 10 > df['a',1] [1] 10 > df[,1] + df[,2] [1] 40 60 > df x y a 10 30 b 20 40 > df=data.frame(x,y,z) > df x y z 1 10 30 a 2 20 40 b > m x y [1,] 10 30 [2,] 20 40 > m=cbind(x,y,z) > m x y z [1,] "10" "30" "a" [2,] "20" "40" "b" > df 
x y z 1 10 30 a 2 20 40 b > df[,1] + df[,2] [1] 40 60 > df[,1]+df[,3] [1] NA NA Warning message: In Ops.factor(df[, 1], df[, 3]) : + not meaningful for factors > df[c(1,3)] x z 1 10 a 2 20 b > df[,c(1,3)] x z 1 10 a 2 20 b > df[c(1,3),] x y z 1 10 30 a NA NA NA > height=1:10 > height>7 [1] FALSE FALSE FALSE FALSE FALSE FALSE [7] FALSE TRUE TRUE TRUE > which(height>7) [1] 8 9 10 > height[which(height>7)] [1] 8 9 10 > height[height>7] [1] 8 9 10 > df[c(1,3),] x y z 1 10 30 a NA NA NA > df[,c(1,3)] x z 1 10 a 2 20 b > df[,c(T,F,T)] x z 1 10 a 2 20 b > y=seq(30,50,by=10) > x [1] 10 20 > y [1] 30 40 50 > m=cbind(x,y) Warning message: In cbind(x, y) : number of rows of result is not a multiple of vector length (arg 1) > m x y [1,] 10 30 [2,] 20 40 [3,] 10 50 > m=rbind(x,y) Warning message: In rbind(x, y) : number of columns of result is not a multiple of vector length (arg 1) > m [,1] [,2] [,3] x 10 20 10 y 30 40 50 > x=c(x,NA) > rbind(x,y) [,1] [,2] [,3] x 10 20 NA y 30 40 50 > x=c(10,20) > x [1] 10 20 > y [1] 30 40 50 > rm(df) > df=data.frame(x,y) Error in data.frame(x, y) : arguments imply differing number of rows: 2, 3 > df function (x, df1, df2, ncp, log = FALSE) { if (missing(ncp)) .Internal(df(x, df1, df2, log)) else .Internal(dnf(x, df1, df2, ncp, log)) } > y=c(30,40) > x [1] 10 20 > y [1] 30 40 > l=list(x,y) > l [[1]] [1] 10 20 [[2]] [1] 30 40 > l=list(xx=x,yy=y) > l $xx [1] 10 20 $yy [1] 30 40 > class(l) [1] "list" > length(l) [1] 2 > dim(l) NULL > l[1] $xx [1] 10 20 > l['xx'] $xx [1] 10 20 > l$xx [1] 10 20 > sum(l[1]) Error in sum(l[1]) : invalid 'type' (list) of argument > l[[1]] [1] 10 20 > l[1] $xx [1] 10 20 > l$xx [1] 10 20 > sum(l$xx) [1] 30 > sum(l[[1]]) [1] 30 > sum(l[['xx']]) [1] 30 > l=list(xx=x,yy=y) > l $xx [1] 10 20 $yy [1] 30 40 > names(l) [1] "xx" "yy" > l=list(x,y) > l [[1]] [1] 10 20 [[2]] [1] 30 40 > names(l)=c('a','b') > l $a [1] 10 20 $b [1] 30 40 > m [,1] [,2] [,3] x 10 20 10 y 30 40 50 > dimnames(m) [[1]] [1] "x" "y" [[2]] NULL > 
dimnames(l) NULL > l $a [1] 10 20 $b [1] 30 40 > names(l)=c('a','hello word') > l $a [1] 10 20 $`hello word` [1] 30 40 > l$hello word Error: unexpected symbol in "l$hello word" Error: unexpected symbol in "l$''hello" > l$'hello word' [1] 30 40 > l=list(xx=x,yy=y,zz=z) > l $xx [1] 10 20 $yy [1] 30 40 $zz [1] "a" "b" > l$xx+l$yy [1] 40 60 > class(l$xx) [1] "numeric" > class(l$zz) [1] "character" > y=seq(30,50,by=10) > y [1] 30 40 50 > l=list(xx=x,yy=y,zz=z) > l $xx [1] 10 20 $yy [1] 30 40 50 $zz [1] "a" "b" > df=data.frame(x,y) Error in data.frame(x, y) : arguments imply differing number of rows: 2, 3 > y=c(30,40) > df=data.frame(x,y) > df x y 1 10 30 2 20 40 > x [1] 10 20 > rm(x) > df=data.frame(x=c(10,20),y) > df x y 1 10 30 2 20 40 > attach(df) The following object(s) are masked _by_ .GlobalEnv : y > detach(df) > y [1] 30 40 > df=data.frame(x=c(10,20),y=c(50,60)) > attach(df) The following object(s) are masked _by_ .GlobalEnv : y > y [1] 30 40 > rm(y) > attach(df) The following object(s) are masked from df ( position 3 ) : x y > detach(df) > detach(df) > df x y 1 10 50 2 20 60 > x Error: object 'x' not found > y Error: object 'y' not found > attach(df) > x [1] 10 20 > y [1] 50 60 > x=7 > x [1] 7 > df x y 1 10 50 2 20 60 > df$x=7 > df x y 1 7 50 2 7 60 > detach(df) > df x y z 1 10 50 a 2 20 60 b > df[1,2:3] y z 1 50 a > df=data.frame(x=c(10,20,30),y=c(50,60,70),z=c('a','b','c')) > df x y z 1 10 50 a 2 20 60 b 3 30 70 c > df[2:3,2:3] y z 2 60 b 3 70 c > subset(df,subset=(z=='a' | z=='c')) x y z 1 10 50 a 3 30 70 c > subset(df,subset=(z=='a' | z=='c'))[,2:3] y z 1 50 a 3 70 c > subset(df,subset=(z=='a' | z=='c'),select=2:3) y z 1 50 a 3 70 c > df x y z 1 10 50 a 2 20 60 b 3 30 70 c > df$y=rev(df$y) > df x y z 1 10 70 a 2 20 60 b 3 30 50 c > sort(df) Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) : undefined columns selected > order(df$y) [1] 3 2 1 > df$y [1] 70 60 50 > order(df$y,decreasing=T) [1] 1 2 3 > df[order(df$y),] x y z 3 30 
50 c 2 20 60 b 1 10 70 a > order(df$y) [1] 3 2 1 > df[c(3,2,1),] x y z 3 30 50 c 2 20 60 b 1 10 70 a > df[c(2,3,1),] x y z 2 20 60 b 3 30 50 c 1 10 70 a > df[c(2,3),] x y z 2 20 60 b 3 30 50 c > df[c(3,2),] x y z 3 30 50 c 2 20 60 b > library(UsingR) > ewr Year Month AA CO DL HP NW 1 2000 Nov 8.6 8.3 8.6 10.4 8.1 2 2000 Oct 8.5 8.0 8.4 11.2 8.2 3 2000 Sep 8.1 8.5 8.4 10.2 8.3 4 2000 Aug 8.9 9.1 9.2 14.5 9.0 5 2000 Jul 8.3 8.9 8.2 11.5 8.8 6 2000 Jun 8.8 9.0 8.8 14.9 8.4 7 2000 May 8.1 8.8 7.9 9.8 7.8 8 2000 Apr 8.7 8.6 8.1 9.8 8.6 9 2000 Mar 8.8 8.4 7.8 8.0 8.7 10 2000 Feb 8.7 8.9 7.6 9.3 8.5 11 2000 Jan 9.4 9.4 8.5 9.9 9.4 12 1999 Dec 8.6 8.7 7.8 9.3 8.1 13 1999 Nov 8.3 9.3 8.5 10.4 8.9 14 1999 Oct 7.9 9.4 8.1 10.8 9.3 15 1999 Sep 8.4 9.3 8.1 10.3 9.0 ... > ?ewr > df=ewr[,3:10] > mean(df) AA CO DL HP 17.83478 20.01957 16.63043 19.60435 NW TW UA US 15.79783 16.28043 17.69130 15.49348 > median(df) [1] NA NA Warning messages: 1: In mean.default(X[[1L]], ...) : argument is not numeric or logical: returning NA 2: In mean.default(X[[2L]], ...) 
: argument is not numeric or logical: returning NA > mean(df) AA CO DL HP 17.83478 20.01957 16.63043 19.60435 NW TW UA US 15.79783 16.28043 17.69130 15.49348 > apply(df,2,median) AA CO DL HP NW TW 16.05 18.15 15.50 18.95 14.55 15.65 UA US 16.45 14.45 > apply(df,2,mean) AA CO DL HP 17.83478 20.01957 16.63043 19.60435 NW TW UA US 15.79783 16.28043 17.69130 15.49348 > ?apply > ?sapply > l $xx [1] 10 20 $yy [1] 30 40 50 $zz [1] "a" "b" > x [1] 7 > l=list(xx=c(10,20),yy=c(30,40)) > l $xx [1] 10 20 $yy [1] 30 40 > lapply(l,mean) $xx [1] 15 $yy [1] 35 > lapply(l,median) $xx [1] 15 $yy [1] 35 > sapply(l,median) xx yy 15 35 > class(sapply(l,median)) [1] "numeric" > sapply(l,range) xx yy [1,] 10 30 [2,] 20 40 > class(sapply(l,range)) [1] "matrix" > sapply(l,sum) xx yy 30 70 > l=list(xx=c(10,20),yy=c(30,40,50)) > l $xx [1] 10 20 $yy [1] 30 40 50 > class(sapply(l,median)) [1] "numeric" > sapply(l,median) xx yy 15 40 > sapply(l,range) xx yy [1,] 10 30 [2,] 20 50 > sapply(l,sum) xx yy 30 120 > boxplot(ewr[3:10]) > plot(ewr[3:10]) > attach(student.expenses) > table(cell.phone,car) car cell.phone N Y N 1 2 Y 3 4 > table(cell.phone,car,cable.modem) , , cable.modem = N car cell.phone N Y N 1 2 Y 2 3 , , cable.modem = Y car cell.phone N Y N 0 0 Y 1 1 > ftable(cell.phone,car,cable.modem) cable.modem N Y cell.phone car N N 1 0 Y 2 0 Y N 2 1 Y 3 1 > ftable(table(cell.phone,car,cable.modem),col.vars=c('cable.modem','car')) cable.modem N Y car N Y N Y cell.phone N 1 2 0 0 Y 2 3 1 1 #============================================================================= # Chapter 4, Example 4.3, pp. 
109-110 > colnames(babies) [1] "id" "pluralty" "outcome" "date" "gestation" "sex" [7] "wt" "parity" "race" "age" "ed" "ht" [13] "wt1" "drace" "dage" "ded" "dht" "dwt" [19] "marital" "inc" "smoke" "time" "number" > ?babies > attach(babies) # scatter-plot of all values > plot(gestation,wt) # replace 999 (code for NA) with actual NA (to be ignored in plot), & replot > gestation[gestation==999]=NA > plot(gestation,wt) # looking at "smoke", we see each entry in the df has one of 5 possible # smoke values (i.e., falls into one of 5 categories of smoking behavior). > table(smoke) smoke 0 1 2 3 9 544 484 95 103 10 # we can specify that the dataset should be partitioned into a different subset # for each category (by setting the plotting character pch to the smoke value) # > plot(gestation,wt,pch=smoke) # doing ?babies, we see "smoke" categories defined as: # # smoke does mother smoke? 0=never, 1= smokes now, 2=until current # pregnancy, 3=once did, not now, 9=unknown # # let's make a legend, mapping each description to its corresponding value, # manually placing legend location with "locator(1)" # legend(locator(1), legend=c('never','yes','until pregnant','long ago','unknown'), pch=unique(smoke)) # let's plot with different colors instead of different characters # rainbow(length(unique(smoke))) will give us one color per category # to index into it, we'll map 0:3,9 --> 0:4 --> 1:5 > smoke[smoke==9]=4 > plot(gestation,wt,col=rainbow(length(unique(smoke)))[smoke+1]) # how would we change the legend command to use colors instead of characters? #============================================================================= # Chapter 4, section 4.3.1, p. 126 # boxplot of income vs. 
gestation, including all values > boxplot(gestation~inc,data=babies) # exclude the coded NA values > boxplot(gestation~inc,data=babies,subset=(gestation!=999 & inc!=98)) # vary width of boxplot in proportion to sample size (for each income level) > boxplot(gestation~inc,data=babies,subset=(gestation!=999 & inc!=98),varwidth=T) # another way to select the same subset > boxplot(gestation~inc,data=babies[gestation!=999 & inc!=98,],varwidth=T) # add axis labels > boxplot(gestation~inc,data=babies[gestation!=999 & inc!=98,],varwidth=T,xlab="income level",ylab="gestation (days)") #============================================================================= > mandms blue brown green orange red yellow milk chocolate 10.0000 30.0000 10.0000 10.0000 20.0000 20.0000 Peanut 20.0000 20.0000 10.0000 10.0000 20.0000 20.0000 Peanut Butter 20.0000 20.0000 20.0000 0.0000 20.0000 20.0000 Almond 16.6667 16.6667 16.6667 16.6667 16.6667 16.6667 kid minis 16.6667 16.6667 16.6667 16.6667 16.6667 16.6667 # by default, which returns 1-dimensional indexing > which(mandms==0) [1] 18 # with optional argument, returns matrix indexing when appropriate > which(mandms==0,arr.ind=TRUE) row col Peanut Butter 3 4 # now an ugly way to directly convert the 1-dimensional index # to the matrix (row,col) index > index=which(mandms==0) > row=((index-1)%%nrow(mandms))+1 > row [1] 3 > col=ceiling(index/nrow(mandms)) > col [1] 4 > mandms[row,col] [1] 0 # which package is missing which color > rownames(mandms)[row] [1] "Peanut Butter" > colnames(mandms)[col] [1] "orange" # the other direction: convert (row,col) to a 1-dimensional index > gen_index=(col-1)*nrow(mandms)+row > gen_index [1] 18