## Hugo Quené, R course at UiL OTS, June 2012 ## h.quene@uu.nl ### BOOTSTRAP ESTIMATION nlspk <- read.table( file=url("http://www.hugoquene.nl/emlar/intra.bysubj.txt"), header=TRUE, na.strings=c("NA","MISSING") ) # named arguments # always start with eda eda.fnc( nlspk$syldur ) # one suspicious outlier, with syldur>0.400 # new function CV <- function(x) sd(x)/mean(x) # coefficient of variation CV(nlspk$syldur) # sample-based estimate of CV # [1] 0.1395056 # BUT this estCV may be too high, because of the outlier; # how robust is this estimate? Answer: use bootstrap method # 1. create new object to hold bootstrap estimates nbr <- 1000 # number of bootstrap replications CV.boot <- numeric(nbr) # 2a. take `nr` replications, # 2b. for each replication: resample observations, compute CV, store for (i in 1:nbr) { CV.boot[i] <- CV( sample(nlspk$syldur,replace=T) ) } # 3. the center of these bootstrap estimates is in fact a BETTER estimate # of the POPULATION value than the single-sample value mean(CV.boot) # the better estimate # lower dispersion among bootstrap estimates indicates more robust estimate se(CV.boot) CV.bias <- mean(CV.boot)-CV(nlspk$syldur) # bias of single-sample estimate of CV quantile(CV.boot, c(.025,.975) ) # confidence interval of bootstrap estimate of CV.boot # this is a lot of work --- but somebody has already programmed this for you. # see http://www.statmethods.net/advstats/bootstrapping.html require(boot) # revised function to obtain Coefficient of Variation from data CV <- function(data, indices) { # indices indicate which rows to use d <- data[indices] # allows boot to select sample return( sd(d)/mean(d) ) # return CV } nbr <- 1000 # bootstrapping with 1000 replications results <- boot(data=nlspk$syldur, statistic=CV, R=1000 ) results # view results plot(results) # histogram and QQ plot # "bca" type of confidence intervals of CV of Dutch speakers boot.ci(results, type="bca") # ... 95% ( 0.1145, 0.1894 ) # if I would re-do the entire study with 80 different speakers, # I am 95% confident that the CV (a dimensionless ratio) # for the new study will be between 0.1145 and 0.1894 . ### REPEATED MEASURES ANOVA # md593wide <- read.table( file=url("http://www.let.uu.nl/~Hugo.Quene/personal/onderwijs/meo/md593wide.txt") ) # reshape from wide to long layout md593long <- reshape( md593wide, direction="long", varying=c("Tr1","Tr2","Tr3"), timevar="treatment", times=c("1","2","3"), v.names="resp", idvar="subject") dim(md593wide) # check dimensions dim(md593long) # make very sure that subject and treatm are factors, # otherwise aov(...+Error()) will produce garbage md593long$subject <- as.factor(md593long$subject) md593long$treatm <- as.factor(md593long$treatm) # cf Johnson (2008, section 4.3.2, p.133) # Exploratory Data Analysis with( md593long, boxplot( resp~group*treatm, varwidth=T, col=rainbow(2), xlab="group.treatment",ylab="resp (ms)") ) # Verify normality assumption qqnorm(md593long$resp) qqline(md593long$resp,lwd=2,col="purple") shapiro.test(md593long$resp) # Verify homoskedasticity and sphericity assumptions # 1. create multivar matrix in *wide* layout (!) with( md593long, # extract 3 subsets and combine as 3 columns cbind( resp[treatm==1], resp[treatm==2], resp[treatm==3]) ) -> md593mat # 2. create var-cov matrix, using auxiliary functions and objects md593mlmfit <- lm(md593mat~1) # 3 col matrix, intercept only # 3. extract estimated variances and covariances from mlmfit estVar(md593mlmfit) cor(md593mat) # variances of differences (sphericity) with(md593long, var(resp[treatm==1]-resp[treatm==2]) ) with(md593long, var(resp[treatm==1]-resp[treatm==3]) ) with(md593long, var(resp[treatm==2]-resp[treatm==3]) ) #### # RM-ANOVA proper md593.aov <- aov( resp ~ treatm*group +Error(subject/treatm), data=md593long) summary(md593.aov) ## post-hoc testing of treatm differences p.bonferroni() # from hqfunctions.R # in order to achieve family-wise alpha=.05, for k=3 treatments, # use lower alpha for each pairwise comparison with(md593long, t.test(resp[treatm==1],resp[treatm==2],paired=T)) # t = -9.2987, df = 19, p-value = 1.676e-08 with(md593long, t.test(resp[treatm==1],resp[treatm==3],paired=T)) # t = -12.8621, df = 19, p-value = 7.957e-11 with(md593long, t.test(resp[treatm==2],resp[treatm==3],paired=T)) # t = -7.0953, df = 19, p-value = 9.495e-07 ### Done ### CROSSTABS # CONFIDENTIAL data # do not redistribute enq2011 <- read.table( file=url("http://www.hugoquene.nl/R/enq2011.txt"), header=T) with( enq2011, table(woonutr,woonsit) ) -> table1 # table: factors as arguments print(table1) margin.table(table1,1) # by row = woonutr margin.table(table1,2) # by col = woonsit prop.table(table1,2) # by col = woonset, split woonsit by woonutr chisq.test(table1) summary(table1) # also reports chisq test # conclusion? with( enq2011, xtabs(~woonutr+woonsit) ) -> table2 # xtabs: formula as argument print(table2) ### SCATTERGRAMS # color of points depends on geslacht with( enq2011, plot(lengte,jitter(schoen), # add jitter to schoen, to avoid overlap col=ifelse(geslacht=="V","red","blue"), pch=16, main="studenten Statistiek 2011") ) legend( "topleft", col=c("red","blue"), pch=16, legend=c("V","M"), inset=0.02 ) # plot parameters: xlab, ylab, xaxt, yaxt, type, and many more # see help(par) for a list of graphical parameters # color of points depends on jaar with( enq2011, plot(lengte,jitter(schoen), # add jitter to schoen, to avoid overlap col=1+jaar, pch=16, main="studenten Statistiek 2011") ) legend( "topleft", col=1+unique(enq2011$jaar), pch=16, legend=as.character(unique(enq2011$jaar)), inset=0.02 ) VOWELS <- c("A","E","I","O","U") # set of uppercase vowels vowelname <- enq2011$letter ### SPLOM: scatterplot matrix require(languageR) pairscor.fnc( enq2011[,c(4,7,13,14,15)]) # matrix of scattergrams, from languageR package require(psych) pairs.panels( enq2011[,c(4,7,13,14,15)]) # matrix of scattergrams, from psych package ### TRELLIS PLOTS require(lattice) # xyplot( reistijd~afstand|woonutr, data=enq2011, # xlab="Afstand (km)", ylab="Reistijd (minuten)", pch=16) # case nr.50 is incorrect xyplot( werktijd[-50]~I(8*reistijd/60)[-50]|woonutr[-50], data=enq2011, xlab="Reistijd (uren/wk)", ylab="Werktijd (uren/wk)", pch=16, grid=T, type=c("p","spline","r") ) ### SIMPLE LINEAR REGRESSION nlspk <- read.table( file=url("http://www.hugoquene.nl/emlar/intra.bysubj.txt"), header=TRUE, na.strings=c("NA","MISSING") ) # named arguments ok <- nlspk$syldur<0.400 # exclude outlier # always start with a scattergram ! with( nlspk, plot(nsyl.ln[ok],syldur[ok]) ) # the scattergram suggests a negative relation: # the lower the (log of) average nr of syllables per phrase (=phraselength) # the lower the average syllable duration (speech tempo). # at the phrase level (!) this is known as anticipatory shortening: # longer phrases are spoken at a faster tempo. # here we see that more talkative speakers (having longer phrases) # tend to speak faster. # we are going to investigate this by means of regression analysis. model1.lm <- lm( syldur~nsyl.ln, data=nlspk, subset=ok) summary(model1.lm) # syldur = 0.360 + (-0.051)*nsyl.ln # model F is significant, slope b1 is significant # but notice that intercept differs from mean(syldur) ! mean(nlspk$syldur[ok]) ## CENTERING PREDICTORS with( nlspk, plot(nsyl.ln[ok],syldur[ok], xlim=c(0,3),ylim=c(0,0.4) ) ) abline(model1.lm, col="red") # standard error of intercept estimate ise <- summary(model1.lm)$coefficients["(Intercept)","Std. Error"] lines( x=c(0,0), y=coef(model1.lm)[1]+c(-2,2)*ise, col="red" ) points(0,coef(model1.lm)[1],pch=1,col="red",cex=3) model2.lm <- lm( syldur~I(nsyl.ln-median(nsyl.ln)), data=nlspk, subset=ok) summary(model2.lm) # model2 has lower error on estimate of intercept # and a more sensible intercept. with( nlspk, plot(I(nsyl.ln[ok]-median(nsyl.ln[ok])),syldur[ok], xlim=c(0,3)-median(nsyl.ln[ok]),ylim=c(0,0.4) ) ) abline(model2.lm, col="red") # standard error of intercept estimate ise <- summary(model2.lm)$coefficients["(Intercept)","Std. Error"] lines( x=c(0,0), y=coef(model2.lm)[1]+c(-2,2)*ise, col="red" ) points(0,coef(model2.lm)[1],pch=1,col="red",cex=3) # The same applies with multiple predictors: # each predictor is estimated at the point where all other predictors are zero. # Centering predictors is particularly important if we have several predictors. # model evaluation: check influential observations influence.measures(model1.lm) ## CROSS-VALIDATION # the good R^2 (R=.42) of model1.lm and model2.lm suggest a good fit. # but how robust is it, i.e., how good can it predict unseen data? # for this we need crossvalidation. # cf Baayen 2008, p.193 ff # re-compute the regression model, now using rms::ols instead of stats::lm require(rms) # formerly package was named `Design` # supply x=T and y=T arguments to store more information in `model1.ols` model1.ols <- ols( syldur~nsyl.ln, data=nlspk, subset=ok, x=T, y=T ) val <- validate( model1.ols, method="cross", B=5, bw=F, pr=T ) # train coefficients on 4/5 sample of dataset # test same coefficients on remaining 1/5 of dataset print(val) # index.orig training test optimism index.corrected n #R-square 0.1852 0.1834 0.0260 0.1574 0.0278 5 #MSE 0.0008 0.0008 0.0008 0.0000 0.0008 5 #g 0.0149 0.0148 0.0121 0.0027 0.0122 5 #Intercept 0.0000 0.0000 0.0349 -0.0349 0.0349 5 #Slope 1.0000 1.0000 0.8605 0.1395 0.8605 5 # R-square and Slope are too optimistic ## MORE DIAGNOSTICS # see Baayen 2008, section 6.2.3, p.188 ff dffits <- abs(resid(model1.ols,"dffits")) plot(dffits,type="h") # which data points have large influence on each regression coefficient? which.influence(model1.ols) # AVOID OVERFITTING # "As a rule of thumb, there should be at least fifteen times # more observations than coefficients (...)" (Baayen 2008, p.195) # Thus, for a data set of 80 observations, no more than 5 coefficients. ## MULTIPLE REGRESSION concept <- read.table( file=url("http://www.let.uu.nl/~Hugo.Quene/personal/onderwijs/mer/concept.txt"), header=TRUE ) concept1.ols <- ols(GPA~1+IQ, data=concept, x=T, y=T) concept1.ols # Coef S.E. t Pr(>|t|) #Intercept -3.5571 1.5518 -2.29 0.0247 #IQ 0.1010 0.0141 7.14 <0.0001 mean(concept$IQ) # [1] 108.9231 # recover previously defined `eda.fnc` source(file=url("http://www.hugoquene.nl/R/eda.fnc.R")) eda.fnc(concept$IQ) shapiro.test(concept$IQ) # W = 0.9678, p-value = 0.04556 # IQ is not perfectly normally distributed # hence use median, not mean, for centering IQ concept2.ols <- ols(GPA~1+I(IQ-median(IQ)), data=concept, x=T, y=T) concept2.ols # notice lower error on Intercept coefficient # Coef S.E. t Pr(>|t|) #Intercept 7.5553 0.1857 40.68 <0.0001 #IQ 0.1010 0.0141 7.14 <0.0001 eda.fnc(concept$SC) shapiro.test(concept$SC) # W = 0.934, p-value = 0.000554 # SC is not normally distributed concept3.ols <- ols(GPA~IQ+SC, data=concept, x=T, y=T) concept3.ols # Coef S.E. t Pr(>|t|) #Intercept -3.8820 1.4722 -2.64 0.0102 #IQ 0.0772 0.0154 5.02 <0.0001 #SC 0.0513 0.0163 3.14 0.0024 concept4.ols <- ols(GPA~I(IQ-median(IQ))+I(SC-median(SC)), data=concept, x=T, y=T ) concept4.ols # Coef S.E. t Pr(>|t|) #Intercept 7.6598 0.1789 42.82 <0.0001 #IQ 0.0772 0.0154 5.02 <0.0001 #SC 0.0513 0.0163 3.14 0.0024 # notice large effect of centering on intercept Coef and S.E. # However.... in Multiple Regression there is the risk of collinearity! require(languageR) cor(concept[,c("GPA","IQ","SC")]) # GPA IQ SC #GPA 1.0000000 0.6337307 0.5418329 #IQ 0.6337307 1.0000000 0.4931479 #SC 0.5418329 0.4931479 1.0000000 # IQ and SC are not independent predictors collin.fnc(concept,c("GPA","IQ","SC") ) # compute collinearity index #$cnumber #[1] 27.40661 # "Medium collinearity is indicated by condition numbers around 15, # and condition numbers of 30 or more indicate potentially harmful collinearity." # -- Baayen 2008, p.182 ## solution against collinearity: Principal Components Analysis ## see help(princomp) or help(prcomp) ### LOGISTIC REGRESSION # Use if DV is binomial. # data after... # Macaulay & Brice (1997) Language 73(4) # see http://www.jstor.org/stable/10.2307/417327 # create columns sex <- as.factor( c( rep(0,60),rep(1,132) ) ) # apologies for gender bias juvenile <- as.logical( c( rep(1,48),rep(0,60-48),rep(1,52),rep(0,132-52)) ) genderbias <- data.frame(sex,juvenile) # combine into data frame rm(sex,juvenile) table(genderbias) -> table3 # juvenile #sex FALSE TRUE # 0 12 48 # 1 80 52 prop.table(table3,1) # proportions by rows # juvenile #sex FALSE TRUE # 0 0.2000000 0.8000000 # 1 0.6060606 0.3939394 model3.glm <- glm(juvenile~1,data=genderbias,family="binomial") summary(model3.glm) #Coefficients: # Estimate Std. Error z value Pr(>|z|) #(Intercept) 0.08338 0.14446 0.577 0.564 model4.glm <- glm(juvenile~1+sex,data=genderbias,family="binomial") # DV is binomial # predictor is also categorical here --- but not necessarily so. summary(model4.glm) #Coefficients: # Estimate Std. Error z value Pr(>|z|) #(Intercept) 1.3863 0.3227 4.295 1.74e-05 *** #sex1 -1.8171 0.3686 -4.929 8.26e-07 *** # female log-odds: 1.386 # male log-odds: 1.386-1.817 = -0.431 # the log-odds for male references is signif lower than for female references. # the odds of a male ref being juvenile (boy) is signif lower than the odds # of a female ref being juvenile (girl). ### MORE LOGISTIC REGRESSION require(rms) polder <- read.table( file=url("http://www.let.uu.nl/~Hugo.Quene/personal/onderwijs/mer/polder.txt") polder$sex <- as.factor(polder$sex) polder$age.c <- polder$age - median(polder$age) # centered to 52.5 polder1.lrm <- lrm(hit~1, data=polder) coef(polder1.lrm) # Intercept # 0.3639654 logit(59/100) # 59 hits, 41 miss, log of odds ratio = logit # [1] 0.3639654 polder2.lrm <- lrm(hit~1+sex, data=polder, x=T, y=T) polder2.lrm # Coef S.E. Wald Z Pr(>|Z|) # Intercept 0.8473 0.3086 2.75 0.0060 # for females # sex=1 -0.9273 0.4188 -2.21 0.0268 # for males which.influence(polder2.lrm) -> temp2 # $Intercept # [1] "1" "3" "10" "11" "12" "13" "21" "24" "26" "30" "34" "38" "41" "43" "44" polder[ temp2$Intercept, ] # interpretation of coefficient for sex table(polder$sex,polder$hit) # 0 1 # 0 15 35 # 1 26 24 35/15 # odds ratio for females # [1] 2.333333 24/26 # odds ratio for males # [1] 0.9230769 0.923/2.333 # relative odds [1] 0.3956279 # the odds (not P) for males to speak polder dutch are .39 of the odds for females. exp(-0.9273) # undo log operation [1] 0.3956204 # the regression coefficient captures the difference in odds due to sex. # Evaluate performance of model2.lrm # if predicted value >0 then hit predicted, `sign` returns pos~neg sign of argument table( polder$hit, sign(predict(polder2.lrm))) # rows observed, cols predicted # -1 1 # 0 26 15 # 1 24 35 # accuracy is (26+35)/100 = .61 # false alarm rate is (15/50) = .30 # true detection rate (hit rate) is 35/(24+35) = 0.59 qnorm(.59)-qnorm(.30) # d prime of model2.lrm # [1] 0.7519455 # expand model with age.c # don't bother about age uncentered polder3.lrm <- lrm(hit~1+sex+age.c, data=polder, x=T, y=T) # with sex and age.c polder3.lrm # Coef S.E. Wald Z Pr(>|Z|) # Intercept 3.6254 1.0213 3.55 0.0004 # females # sex=1 -4.2516 1.2407 -3.43 0.0006 # males # age.c -0.1787 0.0413 -4.33 <0.0001 # same age trend for both sexes ! # Evaluate significance of predictors anova(polder3.lrm) # Factor Chi-Square d.f. P # sex 11.74 1 0.0006 # age.c 18.73 1 <.0001 # TOTAL 18.83 2 0.0001 # Both predictors are indeed significant. # Evaluate performance of model3.lrm # table( polder$hit, sign(predict(polder3.lrm))) # rows obs, cols predicted # -1 1 # 0 35 6 # 1 8 51 # accuracy is (35+51)/100 = 0.86 # false alarm rate is (6/57) = 0.1052632 # true detection rate (hit rate) is 51/(8+51) = 0.86 qnorm(.86)-qnorm(.11) # d prime of model3.lrm # [1] 2.306847 # So model3 has better sensitivity and specificity, hence higher d', than model2. # Plot fitted trends # females curve( antilogit( (3.6254)-0.1787*(x-52.5)), from=11,to=89, type="b", pch="F", xlab="Age",ylab="P(hit)", ylim=c(0,1) ) # males curve( antilogit( (3.6254-4.2516)-0.1787*(x-52.5)), from=11,to=89, type="b", pch="M", add=T ) # add data points, with jitter, males blue females red points( polder$age[polder$sex=="1"], jitter(polder$hit[polder$sex=="1"], factor=0.2), pch=16, col="blue") points( polder$age[polder$sex=="0"],jitter(polder$hit[polder$sex=="0"], factor=0.2), pch=16, col="red") # notice that both curves follow the same age trend # One way to compare logistic regression models is by means of their ROC curve; # the curve connects the false-alarm rate (x axis) and accuracy (y axis). # A good model has a low FA rate and high hit rate, i.e. a large AREA UNDER ROC CURVE (AUC). require(ModelGood) Roc(polder2.lrm) -> roc2 plot(roc2, lwd=3, lty=2) # lty=2 gives dashed line roc2 #AUC:no plan # 61.37 # in percentages # R^_ROC = 2*(.6137-0.5) = 0.227 # Menard 2009, p.77 Roc(polder3.lrm) -> roc3 roc3 # AUC:no plan # 95.12 # in percentages # R^_ROC = 2*(.9512-0.5) = 0.902 # Menard 2009, p.77 lines(roc3, lwd=3, lty=1) # lty=1 gives solid line, captures larger area # so model3.lrm indeed makes fewer prediction errors than model2.lrm # Avoid overfitting: N > 10 * k / p # where k = number of coefficients, p = proportion of hits or miss, whichever smallest. # For polder, p=.41 and N=100, hence 100*.41 > 10*k hence 4.1>k or k<4.1, # hence no more than k=4 coefficients for this data set. # See e.g. http://www.medcalc.org/manual/logistic_regression.php