## Hugo Quené, R course at UiL OTS, June 2012
## h.quene@uu.nl

### 1st hour: INTRODUCTION

## PRESENTATION PREFERENCES
# increase font size (18+) and adjust color (red4) for Console window
# use Rgui > preferences > gui

# my first objects
a <- seq(1, 25)   # create sequence from 1 to 25 and store in object a
# the above command produces no output
print(a)          # show contents of object a, starting with first element
# print is default action for objects
a                 # typing object name is equivalent to print(a)

17 * 23           # use R as calculator: implicit new object is printed
# standard operators: + - * / ^
# other operators: integer division and modulo
# 49 == (16*3)+1
# there are 49 items and 3 groups, how many items per group?
49 %/% 3          # integer division, returns 16
# how many items remaining after integer division?
49 %% 3           # modulo, returns 1

### OBJECTS
# do *not* use c or t as names for your objects:
# c and t are predefined functions; reusing their names masks them
# and leads to confusion
c(1:12, 13:25)            # concatenate
t(matrix(a, ncol = 5))    # transpose = reverse rows and columns

### NUMERICAL VARIABLE
x <- 1:5                  # shorthand for seq(1,5,by=1)
x <- rep(x, each = 2)     # there is no Undo command, previous x is lost!
print(x)
mean(x)
sd(x)                     # sqrt(20/9)
summary(x)                # print numerical summary

### CATEGORICAL VARIABLE or FACTOR
xf <- as.factor(x)
summary(xf)               # print factor summary

# read data interactively from terminal
if (interactive()) {
  aux <- scan(what = "character")   # code characters for region
  # 1: W W M M N N S S
  # 9:
} else {
  # no terminal to type into (e.g. Rscript): use the example input shown above
  aux <- c("W", "W", "M", "M", "N", "N", "S", "S")
}
print(aux)
# [1] "W" "W" "M" "M" "N" "N" "S" "S"
region <- as.factor(aux)  # convert character to factor
summary(region)           # note that factor levels are in alphabetical order!!
# M N S W
# 2 2 2 2
contrasts(region)   # contrasts: first level is "baseline" in comparisons
#   N S W
# M 0 0 0
# N 1 0 0
# S 0 1 0
# W 0 0 1
rm(aux)             # remove garbage

### BOOLEAN or LOGICAL VARIABLE
b <- (x > 4)
class(b)
print(b)

### DATA FRAMES
objects()                   # list all objects in workspace
xxf <- data.frame(x, xf)    # bind variables into data frame = worksheet
head(xxf)                   # show first n lines of data frame
tail(xxf)                   # show last n lines of data frame
sapply(xxf, class)          # show class, for each element=column in xxf
str(xxf)                    # show structure of data frame

# read.table: read data from file: can also pass URL file spec
nlspk <- read.table(
  file = url("http://www.hugoquene.nl/emlar/intra.bysubj.txt"),
  header = TRUE,
  na.strings = c("NA", "MISSING")
)                           # named arguments
nlspk[1, ]                  # first row
nlspk[, 1]                  # first col
nlspk[, 2:4]                # columns 2 to 4
dim(nlspk)                  # check dimensions of rows and columns

# read.spss
library(foreign)            # use package with import/export routines
if (interactive()) {
  # file.choose() needs a user to pick the file, so only run it interactively
  read.spss(file = file.choose())   # choose file interactively
}

### SUBSELECTION [ ]
x <- rnorm(30)                      # create 30 random normal observations
x[14:16] <- NA                      # replace 3 obs by NA
which(is.na(x))                     # which NA obs
table(is.na(x))                     # how many NA obs
mean(x, na.rm = TRUE)               # excluding NA obs
mean(x[!is.na(x)])                  # excluding NA obs
x[is.na(x)] <- mean(x[!is.na(x)])   # replace NA by mean
var(x)                              # including replacements
var(x[-(14:16)])                    # replacing NA by mean has reduced variance !!
sapply(nlspk[, c(3, 6, 7)], mean)   # apply function mean to 3 columns of nlspk
#        age     syldur    nsyl.ln
# 42.5250000  0.2560595  2.0741520

# use subselection to unselect outliers
boxplot.stats(nlspk$syldur)   # $stats: lower whisker, Q1, median, Q3, upper whisker
# upper whisker end: largest observation within 1.5x IQR above Q3
ifv <- boxplot.stats(nlspk$syldur)$stats[5]
which(nlspk$syldur > ifv)     # inspect outlier cases
# [1]  7 33
ok <- which(nlspk$syldur <= ifv)   # indices of the non-outlier cases
shapiro.test(nlspk$syldur)         # all cases
# W = 0.9359, p-value = 0.0005824  # reject H0:normality
shapiro.test(nlspk$syldur[ok])     # exclude outliers
# W = 0.9853, p-value = 0.509      # do not reject H0:normality
var(nlspk$syldur[ok]) / var(nlspk$syldur)
# [1] 0.7014816   # removal of two outliers reduces variance by 30%

### SPLITTING
fivenum(nlspk$age)   # five-number summary: min, Q1, median, Q3, max
# [1] 21 34 43 51 59
syldur.decade <- split(nlspk$syldur, nlspk$age %/% 10)   # split syldur by age%/%10
boxplot(syldur.decade, col = "lavender", at = (1:4) - 0.1,
        varwidth = TRUE, xaxt = "n")
# NOTE(review): cut(x, 4) makes 4 equal-width intervals, not quartiles, while
# the legend below says "quartile" -- confirm which one is intended
nsyl4 <- cut(nlspk$nsyl.ln, 4)
boxplot(nlspk$syldur ~ nsyl4, col = "blue", at = (1:4) + 0.1,
        varwidth = TRUE, xaxt = "n", add = TRUE)
legend("top", fill = c("lavender", "blue"),
       legend = c("split by decade of age",
                  "split by quartile of phrase length"))

### TABLES
# use table for numbers of cases
table(nlspk$region, nlspk$isold)        # table of two variables,
# cells contain numbers of observations
with(nlspk, table(region, isold))       # same, less typing
xtabs(~ region + isold, data = nlspk)   # similar

# use tapply for applying function to data in each cell
# apply function sd on age, broken down by cells defined by list()
# variables are defined within scope of nlspk
with(nlspk, tapply(age, list(region, isold), sd))

### HELP
help(sd)
?sd                                # shortcut to help()
help.search("standard deviation")
??"standard deviation"             # shortcut to help.search()

### SAVING YOUR WORK
# Rgui > File > Save...
# save Console (input and output)
# Rgui > Workspace > Save Workspace As...
# save entire Workspace (all objects)
# Rgui > History > Save History...
# save History (input)

# saving data objects: as R objects, extension .Rda
save(xxf, file = "xxf.Rda")
load(file = "xxf.Rda")   # recreates object that was saved in xxf.Rda

# saving data objects: as exported text files
write.table(xxf, file = "xxf.txt", row.names = FALSE, col.names = TRUE)
# write.csv() is part of utils and does not need foreign; the package is still
# attached here for its other export routines (e.g. write.foreign)
library(foreign)
write.csv(xxf, file = "xxf.csv")   # can be read into excel