## Hugo Quené, R course at UiL OTS, June 2012
## h.quene@uu.nl

### 1st hour: INTRODUCTION

## PRESENTATION PREFERENCES
# increase font size (18+) and adjust color (red4) for Console window
# use Rgui > preferences > gui

# my first objects
a <- seq(1, 25) # create sequence from 1 to 25 and store in object a
# the above command produces no output

print(a) # show contents of object a, starting with first element
# print is the default action for objects
a # typing an object name is equivalent to print(a)

17 * 23 # use R as calculator: implicit new object is printed

# standard operators: + - * / ^
# other operators: integer division and modulo
# 49 == (16 * 3) + 1
# there are 49 items and 3 groups, how many items per group?
49 %/% 3 # integer division, returns 16
# how many items remaining after integer division?
49 %% 3 # modulo, returns 1


### OBJECTS

# never pick c or t as the name of one of your own objects:
# both are built-in functions, and reusing their names is confusing
c(1:12, 13:25) # c() concatenates its arguments into one vector
t(matrix(data = a, ncol = 5)) # t() transposes: rows and columns swap places

### NUMERICAL VARIABLE

x <- 1:5 # the colon operator builds the integer sequence 1, 2, 3, 4, 5
x <- rep(x, each = 2) # overwrites x: there is no Undo, the previous x is gone!
print(x)
mean(x)
sd(x) # equals sqrt(20/9)
summary(x) # numerical summary: min, quartiles, median, mean, max

### CATEGORICAL VARIABLE or FACTOR

xf <- as.factor(x) # treat the distinct values of x as category labels
summary(xf) # for a factor, summary() shows the count per level

# read data interactively from terminal
aux <- scan(what = "character") # type region codes, finish with an empty line
# 1: W W M M N N S S
# 9:
print(aux)
# [1] "W" "W" "M" "M" "N" "N" "S" "S"
region <- as.factor(aux) # convert character to factor
summary(region) # note that factor levels are in alphabetical order!!
# M N S W
# 2 2 2 2
contrasts(region) # contrasts: first level is "baseline" in comparisons
#   N S W
# M 0 0 0
# N 1 0 0
# S 0 1 0
# W 0 0 1
rm(aux) # remove garbage

### BOOLEAN or LOGICAL VARIABLE

b <- x > 4 # element-wise comparison: TRUE wherever x exceeds 4
class(b) # a comparison yields a logical vector
print(b)

### DATA FRAMES

objects() # names of all objects currently in the workspace

xxf <- data.frame(x, xf) # bind variables into data frame = worksheet
head(xxf) # first rows of the data frame (6 by default)
tail(xxf) # last rows of the data frame (6 by default)
sapply(xxf, class) # class of each element (= column) of xxf
str(xxf) # compact overview of the structure of the data frame

# read.table() reads tabular data from a file; the file may also be a URL
# all arguments below are passed by name, so their order does not matter
nlspk <- read.table(
  file = url("http://www.hugoquene.nl/emlar/intra.bysubj.txt"),
  na.strings = c("NA", "MISSING"), # both strings are read as missing values
  header = TRUE # first line of the file holds the column names
)
nlspk[1, ] # first row
nlspk[, 1] # first column
nlspk[, 2:4] # columns 2 to 4
dim(nlspk) # number of rows, number of columns

# read.spss
# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later
library(foreign) # attach package with import/export routines
read.spss(file = file.choose()) # choose file interactively

### SUBSELECTION [ ]

x <- rnorm(30) # create 30 random draws from a standard normal distribution
x[14:16] <- NA # replace 3 obs by NA
which(is.na(x)) # which obs are NA
table(is.na(x)) # how many NA obs
# spell out TRUE: the shorthand T is an ordinary variable that can be reassigned
mean(x, na.rm = TRUE) # excluding NA obs
mean(x[!is.na(x)]) # excluding NA obs, now by subselection
x[is.na(x)] <- mean(x[!is.na(x)]) # replace NA by mean
var(x) # including replacements
var(x[-(14:16)]) # replacing NA by mean has reduced variance !!

sapply(nlspk[, c(3, 6, 7)], mean) # apply function mean to 3 columns of nlspk
#        age     syldur    nsyl.ln
# 42.5250000  0.2560595  2.0741520

# use subselection to unselect outliers
# $stats holds: lower whisker end, lower hinge, median, upper hinge,
# upper whisker end
boxplot.stats(nlspk$syldur)
# upper whisker end: observations above this value count as outliers
ifv <- boxplot.stats(nlspk$syldur)$stats[5]
which(nlspk$syldur > ifv) # inspect outlier cases
# [1]  7 33
ok <- nlspk$syldur <= ifv # TRUE for the cases that are not upper outliers
shapiro.test(nlspk$syldur) # all cases
# W = 0.9359, p-value = 0.0005824 # reject H0:normality
shapiro.test(nlspk$syldur[ok]) # exclude outliers
# W = 0.9853, p-value = 0.509 # do not reject H0:normality
var(nlspk$syldur[ok]) / var(nlspk$syldur)
# [1] 0.7014816
# removal of two outliers reduces variance by 30%

### SPLITTING

fivenum(nlspk$age) # five-number summary: min, Q1, median, Q3, max
# [1] 21 34 43 51 59
# split syldur into a list of vectors, one per decade of age (age %/% 10)
syldur.decade <- split(nlspk$syldur, nlspk$age %/% 10)
# xaxt = "n" suppresses the x axis; boxes are shifted slightly to the left
boxplot(syldur.decade,
  col = "lavender", at = (1:4) - 0.1, varwidth = TRUE, xaxt = "n"
)

# cut(x, 4) divides the range of x into 4 intervals of equal width;
# these are not quartiles, so the groups may differ in size
nsyl4 <- cut(nlspk$nsyl.ln, 4)
# add = TRUE draws into the existing plot; boxes are shifted to the right
boxplot(nlspk$syldur ~ nsyl4,
  col = "blue", at = (1:4) + 0.1, varwidth = TRUE, xaxt = "n", add = TRUE
)
legend("top",
  fill = c("lavender", "blue"),
  legend = c(
    "split by decade of age",
    "split by phrase length (4 equal-width bins)"
  )
)

### TABLES

# table() counts cases
table(nlspk$region, nlspk$isold) # cross-tabulation of two variables:
# each cell holds the number of observations
with(nlspk, table(region, isold)) # same counts, less typing
xtabs(~ region + isold, data = nlspk) # similar, using a formula

# tapply() applies a function to the data in each cell:
# here sd of age, broken down by the cells that list() defines;
# with() makes the columns of nlspk visible by name
with(nlspk, tapply(X = age, INDEX = list(region, isold), FUN = sd))

### HELP
help("sd") # open the help page for sd
?sd # shortcut to help()
help.search("standard deviation") # search the help pages for a phrase
??"standard deviation" # shortcut to help.search()

### SAVING YOUR WORK 

# Rgui > File > Save... # save Console (input and output)
# Rgui > Workspace > Save Workspace As... # save entire Workspace (all objects)
# Rgui > History > Save History... # save History (input)

# saving data objects: as R objects, extension .Rda
save(xxf, file = "xxf.Rda")
load(file = "xxf.Rda") # recreates object that was saved in xxf.Rda

# saving data objects: as exported text files
write.table(xxf, file = "xxf.txt", row.names = FALSE, col.names = TRUE)
# write.csv() belongs to utils, which is attached by default,
# so no extra package has to be loaded for it
write.csv(xxf, file = "xxf.csv") # can be read into excel