# LME OF HARDMAN DATA # HQ 20120217 # INITIALIZATION # hardman <- read.table("bimm_datasets/JHardman_phonetic_categoric/jhardman_alldata.txt", header=T) hardman <- read.table( file=url("http://www.hugoquene.nl/bimm2012/hardman2.txt"), header=T) # The following steps are already performed on the downloadable dataset hardman2.txt #with(hardman, table(Subject,lisL1.1)) # different listeners have same Subject ID ! # construct new variable for unique listener ID #Listener <- as.integer(hardman$lisL1)*100+as.integer(hardman$Subject) # add new listener ID to data frame #hardman <- cbind(hardman,Listener) #save(hardman, file="hardman.rda") # MODELING: EMPTY MODEL AND Trial EFFECT # GLMM with binary response variable # three crossed random effects: talker, KeyWord, Listener # one fixed effect: Trial, centered to median of 30.5 # Trial is also added as random slope at ALL random levels hardman.m00 <- lmer( KeyWordResponse~1+(1|Listener)+(1|talker)+(1|KeyWord), data=hardman, family="binomial" ) hardman.m01 <- lmer( KeyWordResponse~1+I(Trial-30.5)+(1|Listener)+(1|talker)+(1|KeyWord), data=hardman, family="binomial" ) hardman.m02 <- lmer( KeyWordResponse~1+I(Trial-30.5)+(1|Listener)+(0+I(Trial-30.5)|Listener)+(1|talker)+(0+I(Trial-30.5)|talker)+(1|KeyWord)+(0+I(Trial-30.5)|KeyWord), data=hardman, family="binomial" ) hardman.m02b <- lmer( KeyWordResponse~1+I(Trial-30.5)+(1+I(Trial-30.5)|Listener)+(1+I(Trial-30.5)|talker)+(1+I(Trial-30.5)|KeyWord), data=hardman, family="binomial" ) # with covariances anova(hardman.m01,hardman.m02,hardman.m02b) # do we need Trial in the fixed part of the model hardman.m03 <- update(hardman.m02b, .~.-I(Trial-30.5) ) # remove Trial from fixed part anova(hardman.m03,hardman.m02b) # Df AIC BIC logLik Chisq Chi Df Pr(>Chisq) #hardman.m03 10 11769 11844 -5874.3 #hardman.m02b 11 11770 11853 -5874.3 0.0482 1 0.8262 # Trial can indeed be removed from fixed part without loss # hardman.m02b seems to be the optimal model, for the time being # coef(hardman.m02b) # prints random intercept and random slopes summary(hardman.m02b) # discuss random intercepts, random slopes, and covariances # plot random intercept and random slope of Trial, for talker, KeyWord, Listener # by source-ing files source(file="hardman.trialbytalkers.r") source(file="hardman.trialbywords.r") source(file="hardman.trialbylisteners.r") # discuss these graphs # MODELING: ADD EXPERIMENTAL FACTORS hardman.m04 <- update(hardman.m03, .~.+lisL1*talkL1) # add experimental factors in fixed part # THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE tapply( fitted(hardman.m04), list(hardman$lisL1,hardman$talkL1), mean) # A M #American 0.9523419 0.8219412 #Hindi 0.8930716 0.7587132 #Korean 0.7378163 0.5868556 #Mand. 0.7867535 0.7432022 # THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE hardman.m05 <- update(hardman.m04, .~.-lisL1:talkL1) anova(hardman.m04,hardman.m05) # model without interaction is signif worse # so interaction is highly significant # THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE # expand random part with talkL1 and lisL1 ## re-use discarded m05 hardman.m05 <- update(hardman.m04, .~.-(1|talker)+(0+talkL1|talker)-(1|Listener)+(0+lisL1|Listener) ) # between-listener intercepts and slopes tend to differ anova(hardman.m04,hardman.m05) # Df AIC BIC logLik Chisq Chi Df Pr(>Chisq) #hardman.m04 14 11612 11717 -5792.1 #hardman.m05 25 11630 11818 -5790.0 4.174 11 0.9645 # including lisL1 and talkL1 in random part does NOT yield better model ## (should be checked separately for lisL1 and for talkL1) # so right now m04 is the optimal model # THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE hardman.m06 <- update(hardman.m04, .~.+as.factor(TalkerAccuracy) ) summary(hardman.m06) anova(hardman.m06,hardman.m04) # Df AIC BIC logLik Chisq Chi Df Pr(>Chisq) #hardman.m04 14 11612 11717 -5792.1 #hardman.m06 15 11611 11724 -5790.6 3.0883 1 0.07886 . # model with TalkerAccuracy is slightly better # but note that TalkerAccuracy varies only among M talkers not among A talkers... with(hardman, tapply(TalkerAccuracy,list(talker),mean) ) # AE1 AE2 AE3 ME1 ME2 ME3 #1.0000000 1.0000000 1.0000000 0.5734767 0.5125448 0.5322581 # ... therefore interaction cannot be estimated > tapply( fitted(hardman.m04), list(hardman$talkL1,hardman$lisL1), mean ) American Hindi Korean Mand. A 0.9523376 0.8930600 0.7377751 0.7867204 M 0.8219629 0.7587282 0.5868874 0.7432098 # SUMMARY BOXPLOTS OF MAIN EXPERIMENTAL EFFECTS subset(hardman,talkL1=="A") -> hardman.A subset(hardman,talkL1=="M") -> hardman.M attach(hardman.A) # attach allows easy reference to lisL1, masked by hardman.A boxplot( fitted(hardman.m04)[hardman$talkL1=="A"]~lisL1, col="blue", notch=T, at=(1:4)-0.1 ) attach(hardman.M) boxplot( fitted(hardman.m04)[hardman$talkL1=="M"]~lisL1, col="red", notch=T, at=(1:4)+0.1, add=T, xaxt="n" ) legend("bottomleft", fill=c("blue","red"), legend=c("Am. talker","Mand. talker") ) # BOOTSTRAP ESTIMATE OF RANDOM TERMS # BEWARE BECAUSE THIS TAKES A LONG TIME source(file="hardman.boot.r") # creates results.coef # 3 two-stage bootstrap iterations, with GLMM, took approx 565.151 sec CPU ( 188.3837 per iter) results.coef quantile(results.coef[,4], c(.025,.500,.975)) # random intercept listeners 2.5% 50% 97.5% 0.4723911 0.4982656 0.5496596