## Hugo Quené, R course at UiL OTS, June 2012
## h.quene@uu.nl

### BOOTSTRAP ESTIMATION

nlspk <- read.table(
	file=url("http://www.hugoquene.nl/emlar/intra.bysubj.txt"),
	header=TRUE, na.strings=c("NA","MISSING") ) # named arguments

# always start with eda
eda.fnc( nlspk$syldur )		# one suspicious outlier, with syldur>0.400

# new function
CV <- function(x) sd(x)/mean(x) # coefficient of variation
CV(nlspk$syldur) # sample-based estimate of CV
# [1] 0.1395056 
# BUT this estCV may be too high, because of the outlier;
# how robust is this estimate? Answer: use bootstrap method
# 1. create new object to hold bootstrap estimates
nbr <- 1000 # number of bootstrap replications
CV.boot <- numeric(nbr)
# 2a. take `nr` replications, 
# 2b. for each replication: resample observations, compute CV, store
for (i in 1:nbr) { CV.boot[i] <- CV( sample(nlspk$syldur,replace=T) ) }
# 3. the center of these bootstrap estimates is in fact a BETTER estimate 
# of the POPULATION value than the single-sample value
mean(CV.boot) # the better estimate
# lower dispersion among bootstrap estimates indicates more robust estimate
se(CV.boot)
CV.bias <- mean(CV.boot)-CV(nlspk$syldur) # bias of single-sample estimate of CV
quantile(CV.boot, c(.025,.975) ) # confidence interval of bootstrap estimate of CV.boot

# this is a lot of work --- but somebody has already programmed this for you.
# see http://www.statmethods.net/advstats/bootstrapping.html
require(boot)
# revised function to obtain Coefficient of Variation from data
CV <- function(data, indices) { # indices indicate which rows to use
  d <- data[indices] # allows boot to select sample
  return( sd(d)/mean(d) ) # return CV
}
nbr <- 1000		# bootstrapping with 1000 replications
results <- boot(data=nlspk$syldur, statistic=CV, R=1000 )
results			# view results
plot(results)	# histogram and QQ plot
# "bca" type of confidence intervals of CV of Dutch speakers
boot.ci(results, type="bca")
# ...  95%   ( 0.1145,  0.1894 )  
# if I would re-do the entire study with 80 different speakers, 
# I am 95% confident that the CV (a dimensionless ratio) 
# for the new study will be between 0.1145 and 0.1894 .


### REPEATED MEASURES ANOVA
# 
md593wide <- read.table(
	file=url("http://www.let.uu.nl/~Hugo.Quene/personal/onderwijs/meo/md593wide.txt") 
	)
# reshape from wide to long layout	
md593long <- reshape( md593wide, direction="long",
				varying=c("Tr1","Tr2","Tr3"),
				timevar="treatment", times=c("1","2","3"),
				v.names="resp", idvar="subject")
dim(md593wide) # check dimensions
dim(md593long)
# make very sure that subject and treatm are factors,
# otherwise aov(...+Error()) will produce garbage
md593long$subject <- as.factor(md593long$subject)
md593long$treatm <- as.factor(md593long$treatm)
# cf Johnson (2008, section 4.3.2, p.133)
# Exploratory Data Analysis
with( md593long, 
	boxplot( resp~group*treatm, varwidth=T, col=rainbow(2),
		xlab="group.treatment",ylab="resp (ms)") )
# Verify normality assumption
qqnorm(md593long$resp)
qqline(md593long$resp,lwd=2,col="purple")
shapiro.test(md593long$resp)
# Verify homoskedasticity and sphericity assumptions
# 1. create multivar matrix in *wide* layout (!)
with( md593long,	# extract 3 subsets and combine as 3 columns
	cbind( resp[treatm==1], resp[treatm==2], resp[treatm==3]) 
	) -> md593mat  
# 2. create var-cov matrix, using auxiliary functions and objects
md593mlmfit <- lm(md593mat~1) # 3 col matrix, intercept only
# 3. extract estimated variances and covariances from mlmfit
estVar(md593mlmfit)
cor(md593mat)	
# variances of differences (sphericity)
with(md593long, var(resp[treatm==1]-resp[treatm==2]) )
with(md593long, var(resp[treatm==1]-resp[treatm==3]) )
with(md593long, var(resp[treatm==2]-resp[treatm==3]) )
####
# RM-ANOVA proper
md593.aov <- aov( resp ~ treatm*group +Error(subject/treatm), data=md593long)
summary(md593.aov)
## post-hoc testing of treatm differences
p.bonferroni() # from hqfunctions.R
# in order to achieve family-wise alpha=.05, for k=3 treatments,
# use lower alpha for each pairwise comparison
with(md593long, t.test(resp[treatm==1],resp[treatm==2],paired=T))
# t = -9.2987, df = 19, p-value = 1.676e-08
with(md593long, t.test(resp[treatm==1],resp[treatm==3],paired=T))
# t = -12.8621, df = 19, p-value = 7.957e-11
with(md593long, t.test(resp[treatm==2],resp[treatm==3],paired=T))
# t = -7.0953, df = 19, p-value = 9.495e-07
### Done


### CROSSTABS

# CONFIDENTIAL data # do not redistribute
enq2011 <- read.table( file=url("http://www.hugoquene.nl/R/enq2011.txt"), header=T)

with( enq2011, table(woonutr,woonsit) ) -> table1 # table: factors as arguments
print(table1)
margin.table(table1,1) # by row = woonutr
margin.table(table1,2) # by col = woonsit
prop.table(table1,2) # by col = woonset, split woonsit by woonutr
chisq.test(table1)
summary(table1)	# also reports chisq test
# conclusion?

with( enq2011, xtabs(~woonutr+woonsit) ) -> table2 # xtabs: formula as argument
print(table2)

### SCATTERGRAMS

# color of points depends on geslacht
with( enq2011, 
	plot(lengte,jitter(schoen),		# add jitter to schoen, to avoid overlap
		col=ifelse(geslacht=="V","red","blue"), pch=16, 
		main="studenten Statistiek 2011") )
legend( "topleft", 
	col=c("red","blue"), pch=16,
	legend=c("V","M"), inset=0.02 )
# plot parameters: xlab, ylab, xaxt, yaxt, type, and many more
# see help(par) for a list of graphical parameters
# color of points depends on jaar
with( enq2011, 
	plot(lengte,jitter(schoen),		# add jitter to schoen, to avoid overlap
 		col=1+jaar, pch=16, 
 		main="studenten Statistiek 2011") )
legend( "topleft", 
	col=1+unique(enq2011$jaar), pch=16,
	legend=as.character(unique(enq2011$jaar)), inset=0.02 )
	
VOWELS <- c("A","E","I","O","U") # set of uppercase vowels
vowelname <- enq2011$letter

### SPLOM: scatterplot matrix
require(languageR)
pairscor.fnc( enq2011[,c(4,7,13,14,15)]) # matrix of scattergrams, from languageR package
require(psych)
pairs.panels( enq2011[,c(4,7,13,14,15)]) # matrix of scattergrams, from psych package

### TRELLIS PLOTS
require(lattice)
# xyplot( reistijd~afstand|woonutr, data=enq2011, 
# 	xlab="Afstand (km)", ylab="Reistijd (minuten)", pch=16) 
# case nr.50 is incorrect
xyplot( werktijd[-50]~I(8*reistijd/60)[-50]|woonutr[-50], 
	data=enq2011, 
	xlab="Reistijd (uren/wk)", ylab="Werktijd (uren/wk)", pch=16, grid=T, 
	type=c("p","spline","r") )
 
### SIMPLE LINEAR REGRESSION
nlspk <- read.table(
	file=url("http://www.hugoquene.nl/emlar/intra.bysubj.txt"),
	header=TRUE, na.strings=c("NA","MISSING") ) # named arguments
ok <- nlspk$syldur<0.400 # exclude outlier
# always start with a scattergram ! 
with( nlspk, plot(nsyl.ln[ok],syldur[ok]) )
# the scattergram suggests a negative relation:
# the lower the (log of) average nr of syllables per phrase (=phraselength)
# the lower the average syllable duration (speech tempo). 
# at the phrase level (!) this is known as anticipatory shortening:
# longer phrases are spoken at a faster tempo. 
# here we see that more talkative speakers (having longer phrases) 
# tend to speak faster.
# we are going to investigate this by means of regression analysis. 
model1.lm <- lm( syldur~nsyl.ln, data=nlspk, subset=ok)
summary(model1.lm)
# syldur = 0.360 + (-0.051)*nsyl.ln
# model F is significant, slope b1 is significant
# but notice that intercept differs from mean(syldur) ! 
mean(nlspk$syldur[ok])

## CENTERING PREDICTORS
with( nlspk, plot(nsyl.ln[ok],syldur[ok], xlim=c(0,3),ylim=c(0,0.4) ) )
abline(model1.lm, col="red")
# standard error of intercept estimate
ise <- summary(model1.lm)$coefficients["(Intercept)","Std. Error"]
lines( x=c(0,0), 
	y=coef(model1.lm)[1]+c(-2,2)*ise, col="red" )
points(0,coef(model1.lm)[1],pch=1,col="red",cex=3)

model2.lm <- lm( syldur~I(nsyl.ln-median(nsyl.ln)), data=nlspk, subset=ok)
summary(model2.lm)
# model2 has lower error on estimate of intercept
# and a more sensible intercept. 

with( nlspk, plot(I(nsyl.ln[ok]-median(nsyl.ln[ok])),syldur[ok], xlim=c(0,3)-median(nsyl.ln[ok]),ylim=c(0,0.4) ) )
abline(model2.lm, col="red")
# standard error of intercept estimate
ise <- summary(model2.lm)$coefficients["(Intercept)","Std. Error"]
lines( x=c(0,0), 
	y=coef(model2.lm)[1]+c(-2,2)*ise, col="red" )
points(0,coef(model2.lm)[1],pch=1,col="red",cex=3)

# The same applies with multiple predictors: 
# each predictor is estimated at the point where all other predictors are zero.
# Centering predictors is particularly important if we have several predictors. 

# model evaluation: check influential observations
influence.measures(model1.lm)

## CROSS-VALIDATION

# the good R^2 (R=.42) of model1.lm and model2.lm suggest a good fit. 
# but how robust is it, i.e., how good can it predict unseen data? 
# for this we need crossvalidation. 
# cf Baayen 2008, p.193 ff
# re-compute the regression model, now using rms::ols instead of stats::lm
require(rms) # formerly package was named `Design`
# supply x=T and y=T arguments to store more information in `model1.ols`
model1.ols <- ols( syldur~nsyl.ln, data=nlspk, subset=ok, x=T, y=T )
val <- validate( model1.ols, method="cross", B=5, bw=F, pr=T )
# train coefficients on 4/5 sample of dataset
# test same coefficients on remaining 1/5 of dataset
print(val)
#          index.orig training   test optimism index.corrected n
#R-square      0.1852   0.1834 0.0260   0.1574          0.0278 5
#MSE           0.0008   0.0008 0.0008   0.0000          0.0008 5
#g             0.0149   0.0148 0.0121   0.0027          0.0122 5
#Intercept     0.0000   0.0000 0.0349  -0.0349          0.0349 5
#Slope         1.0000   1.0000 0.8605   0.1395          0.8605 5
# R-square and Slope are too optimistic

## MORE DIAGNOSTICS
# see Baayen 2008, section 6.2.3, p.188 ff
dffits <- abs(resid(model1.ols,"dffits"))
plot(dffits,type="h")
# which data points have large influence on each regression coefficient?
which.influence(model1.ols)

# AVOID OVERFITTING
# "As a rule of thumb, there should be at least fifteen times
# more observations than coefficients (...)" (Baayen 2008, p.195)
# Thus, for a data set of 80 observations, no more than 5 coefficients.


## MULTIPLE REGRESSION
concept <- read.table( file=url("http://www.let.uu.nl/~Hugo.Quene/personal/onderwijs/mer/concept.txt"), header=TRUE )
concept1.ols <- ols(GPA~1+IQ, data=concept, x=T, y=T)
concept1.ols
#          Coef    S.E.   t     Pr(>|t|)
#Intercept -3.5571 1.5518 -2.29 0.0247  
#IQ         0.1010 0.0141  7.14 <0.0001 
mean(concept$IQ)
# [1] 108.9231
# recover previously defined `eda.fnc`
source(file=url("http://www.hugoquene.nl/R/eda.fnc.R"))
eda.fnc(concept$IQ)
shapiro.test(concept$IQ)
# W = 0.9678, p-value = 0.04556
# IQ is not perfectly normally distributed
# hence use median, not mean, for centering IQ
concept2.ols <- ols(GPA~1+I(IQ-median(IQ)), data=concept, x=T, y=T)
concept2.ols
# notice lower error on Intercept coefficient
#          Coef   S.E.   t     Pr(>|t|)
#Intercept 7.5553 0.1857 40.68 <0.0001 
#IQ        0.1010 0.0141  7.14 <0.0001 

eda.fnc(concept$SC)
shapiro.test(concept$SC)
# W = 0.934, p-value = 0.000554
# SC is not normally distributed

concept3.ols <- ols(GPA~IQ+SC, data=concept, x=T, y=T)
concept3.ols
#          Coef    S.E.   t     Pr(>|t|)
#Intercept -3.8820 1.4722 -2.64 0.0102  
#IQ         0.0772 0.0154  5.02 <0.0001 
#SC         0.0513 0.0163  3.14 0.0024  

concept4.ols <- ols(GPA~I(IQ-median(IQ))+I(SC-median(SC)), data=concept, x=T, y=T )
concept4.ols
#          Coef   S.E.   t     Pr(>|t|)
#Intercept 7.6598 0.1789 42.82 <0.0001 
#IQ        0.0772 0.0154  5.02 <0.0001 
#SC        0.0513 0.0163  3.14 0.0024  
# notice large effect of centering on intercept Coef and S.E.

# However.... in Multiple Regression there is the risk of collinearity!
require(languageR)
cor(concept[,c("GPA","IQ","SC")])
#          GPA        IQ        SC
#GPA 1.0000000 0.6337307 0.5418329
#IQ  0.6337307 1.0000000 0.4931479
#SC  0.5418329 0.4931479 1.0000000
# IQ and SC are not independent predictors
collin.fnc(concept,c("GPA","IQ","SC") ) # compute collinearity index
#$cnumber
#[1] 27.40661
# "Medium collinearity is indicated by condition numbers around 15,
# and condition numbers of 30 or more indicate potentially harmful collinearity."
# -- Baayen 2008, p.182
## solution against collinearity: Principal Components Analysis
## see help(princomp) or help(prcomp)


### LOGISTIC REGRESSION
# Use if DV is binomial. 
# data after...
#   Macaulay & Brice (1997) Language 73(4)
#   see http://www.jstor.org/stable/10.2307/417327
# create columns
sex <- as.factor( c( rep(0,60),rep(1,132) ) ) # apologies for gender bias
juvenile <- as.logical( c( rep(1,48),rep(0,60-48),rep(1,52),rep(0,132-52)) )
genderbias <- data.frame(sex,juvenile) # combine into data frame
rm(sex,juvenile)
table(genderbias) -> table3
#   juvenile
#sex FALSE TRUE
#  0    12   48
#  1    80   52
prop.table(table3,1) # proportions by rows
#   juvenile
#sex     FALSE      TRUE
#  0 0.2000000 0.8000000
#  1 0.6060606 0.3939394

model3.glm <- glm(juvenile~1,data=genderbias,family="binomial")
summary(model3.glm)
#Coefficients:
#            Estimate Std. Error z value Pr(>|z|)
#(Intercept)  0.08338    0.14446   0.577    0.564
model4.glm <- glm(juvenile~1+sex,data=genderbias,family="binomial")
# DV is binomial
# predictor is also categorical here --- but not necessarily so. 
summary(model4.glm)
#Coefficients:
#            Estimate Std. Error z value Pr(>|z|)    
#(Intercept)   1.3863     0.3227   4.295 1.74e-05 ***
#sex1         -1.8171     0.3686  -4.929 8.26e-07 ***
# female log-odds: 1.386
# male log-odds: 1.386-1.817 = -0.431
# the log-odds for male references is signif lower than for female references.
# the odds of a male ref being juvenile (boy) is signif lower than the odds
# of a female ref being juvenile (girl).

### MORE LOGISTIC REGRESSION
require(rms)
polder <- read.table( file=url("http://www.let.uu.nl/~Hugo.Quene/personal/onderwijs/mer/polder.txt") 
polder$sex <- as.factor(polder$sex)
polder$age.c <- polder$age - median(polder$age) # centered to 52.5
polder1.lrm <- lrm(hit~1, data=polder)
coef(polder1.lrm)
# Intercept 
# 0.3639654 
logit(59/100) # 59 hits, 41 miss, log of odds ratio = logit
# [1] 0.3639654

polder2.lrm <- lrm(hit~1+sex, data=polder, x=T, y=T)
polder2.lrm
#           Coef    S.E.   Wald Z Pr(>|Z|)
# Intercept  0.8473 0.3086  2.75  0.0060 		# for females 
# sex=1     -0.9273 0.4188 -2.21  0.0268  		# for males
which.influence(polder2.lrm) -> temp2
# $Intercept
#  [1] "1"  "3"  "10" "11" "12" "13" "21" "24" "26" "30" "34" "38" "41" "43" "44"
polder[ temp2$Intercept, ]

# interpretation of coefficient for sex
table(polder$sex,polder$hit)
#     0  1
#  0 15 35
#  1 26 24
35/15	# odds ratio for females
# [1] 2.333333
24/26	# odds ratio for males
# [1] 0.9230769
0.923/2.333	# relative odds
[1] 0.3956279
# the odds (not P) for males to speak polder dutch are .39 of the odds for females. 
exp(-0.9273) # undo log operation
[1] 0.3956204
# the regression coefficient captures the difference in odds due to sex. 

# Evaluate performance of model2.lrm
# if predicted value >0 then hit predicted, `sign` returns pos~neg sign of argument
table( polder$hit, sign(predict(polder2.lrm)))	# rows observed, cols predicted
#    -1  1
#  0 26 15
#  1 24 35
# accuracy is (26+35)/100 = .61
# false alarm rate is (15/50) = .30
# true detection rate (hit rate) is 35/(24+35) = 0.59
qnorm(.59)-qnorm(.30) # d prime of model2.lrm
# [1] 0.7519455

# expand model with age.c # don't bother about age uncentered
polder3.lrm <- lrm(hit~1+sex+age.c, data=polder, x=T, y=T)	# with sex and age.c
polder3.lrm
#          Coef    S.E.   Wald Z Pr(>|Z|)
# Intercept  3.6254 1.0213  3.55  0.0004  	# females
# sex=1     -4.2516 1.2407 -3.43  0.0006  	# males
# age.c     -0.1787 0.0413 -4.33  <0.0001 	# same age trend for both sexes ! 

# Evaluate significance of predictors
anova(polder3.lrm)
# Factor     Chi-Square d.f. P     
# sex        11.74      1    0.0006
# age.c      18.73      1    <.0001
# TOTAL      18.83      2    0.0001
# Both predictors are indeed significant. 

# Evaluate performance of model3.lrm
# table( polder$hit, sign(predict(polder3.lrm)))	# rows obs, cols predicted
#    -1  1
#  0 35  6
#  1  8 51
# accuracy is (35+51)/100 = 0.86
# false alarm rate is (6/57) = 0.1052632
# true detection rate (hit rate) is 51/(8+51) = 0.86
qnorm(.86)-qnorm(.11) # d prime of model3.lrm
# [1] 2.306847
# So model3 has better sensitivity and specificity, hence higher d', than model2.

# Plot fitted trends
# females
curve( antilogit( (3.6254)-0.1787*(x-52.5)), 
	from=11,to=89, type="b", pch="F", xlab="Age",ylab="P(hit)", ylim=c(0,1) )
# males
curve( antilogit( (3.6254-4.2516)-0.1787*(x-52.5)), 
	from=11,to=89, type="b", pch="M", add=T )
# add data points, with jitter, males blue females red
points( polder$age[polder$sex=="1"], jitter(polder$hit[polder$sex=="1"], factor=0.2), pch=16, col="blue")
points( polder$age[polder$sex=="0"],jitter(polder$hit[polder$sex=="0"], factor=0.2), pch=16, col="red")
# notice that both curves follow the same age trend

# One way to compare logistic regression models is by means of their ROC curve;
# the curve connects the false-alarm rate (x axis) and accuracy (y axis). 
# A good model has a low FA rate and high hit rate, i.e. a large AREA UNDER ROC CURVE (AUC). 
require(ModelGood)
Roc(polder2.lrm) -> roc2
plot(roc2, lwd=3, lty=2)	# lty=2 gives dashed line
roc2
#AUC:no plan 
#      61.37 	# in percentages
# R^_ROC = 2*(.6137-0.5) = 0.227 # Menard 2009, p.77
Roc(polder3.lrm) -> roc3
roc3
# AUC:no plan 
#       95.12 	# in percentages
# R^_ROC = 2*(.9512-0.5) = 0.902  # Menard 2009, p.77
lines(roc3, lwd=3, lty=1)	# lty=1 gives solid line, captures larger area
# so model3.lrm indeed makes fewer prediction errors than model2.lrm

# Avoid overfitting: N > 10 * k / p
# where k = number of coefficients, p = proportion of hits or miss, whichever smallest.
# For polder, p=.41 and N=100, hence 100*.41 > 10*k hence 4.1>k or k<4.1,
# hence no more than k=4 coefficients for this data set. 
# See e.g. http://www.medcalc.org/manual/logistic_regression.php