# LME OF HARDMAN DATA
# HQ 20120217

# INITIALIZATION

# hardman <- read.table("bimm_datasets/JHardman_phonetic_categoric/jhardman_alldata.txt", header=T)
hardman <- read.table( file=url("http://www.hugoquene.nl/bimm2012/hardman2.txt"), header=T)

# The following steps are already performed on the downloadable dataset hardman2.txt
#with(hardman, table(Subject,lisL1.1))
# different listeners have same Subject ID ! 
# construct new variable for unique listener ID
#Listener <- as.integer(hardman$lisL1)*100+as.integer(hardman$Subject)
# add new listener ID to data frame
#hardman <- cbind(hardman,Listener)
#save(hardman, file="hardman.rda")

# MODELING: EMPTY MODEL AND Trial EFFECT
# GLMM with binary response variable

# three crossed random effects: talker, KeyWord, Listener
# one fixed effect: Trial, centered to median of 30.5
# Trial is also added as random slope at ALL random levels
hardman.m00 <- lmer( KeyWordResponse~1+(1|Listener)+(1|talker)+(1|KeyWord), data=hardman, family="binomial" )
hardman.m01 <- lmer( KeyWordResponse~1+I(Trial-30.5)+(1|Listener)+(1|talker)+(1|KeyWord), data=hardman, family="binomial" )
hardman.m02 <- lmer( KeyWordResponse~1+I(Trial-30.5)+(1|Listener)+(0+I(Trial-30.5)|Listener)+(1|talker)+(0+I(Trial-30.5)|talker)+(1|KeyWord)+(0+I(Trial-30.5)|KeyWord), data=hardman, family="binomial" )
hardman.m02b <- lmer( KeyWordResponse~1+I(Trial-30.5)+(1+I(Trial-30.5)|Listener)+(1+I(Trial-30.5)|talker)+(1+I(Trial-30.5)|KeyWord), data=hardman, family="binomial" ) # with covariances
anova(hardman.m01,hardman.m02,hardman.m02b)
# do we need Trial in the fixed part of the model
hardman.m03 <- update(hardman.m02b, .~.-I(Trial-30.5) ) # remove Trial from fixed part
anova(hardman.m03,hardman.m02b)
#             Df   AIC   BIC  logLik  Chisq Chi Df Pr(>Chisq)
#hardman.m03  10 11769 11844 -5874.3                         
#hardman.m02b 11 11770 11853 -5874.3 0.0482      1     0.8262
# Trial can indeed be removed from fixed part without loss

# hardman.m02b seems to be the optimal model, for the time being
# coef(hardman.m02b) # prints random intercept and random slopes
summary(hardman.m02b) # discuss random intercepts, random slopes, and covariances

# plot random intercept and random slope of Trial, for talker, KeyWord, Listener
# by source-ing files
source(file="hardman.trialbytalkers.r")
source(file="hardman.trialbywords.r")
source(file="hardman.trialbylisteners.r")
# discuss these graphs

# MODELING: ADD EXPERIMENTAL FACTORS

hardman.m04 <- update(hardman.m03, .~.+lisL1*talkL1) # add experimental factors in fixed part

# THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE
tapply( fitted(hardman.m04), list(hardman$lisL1,hardman$talkL1), mean)
#                 A         M
#American 0.9523419 0.8219412
#Hindi    0.8930716 0.7587132
#Korean   0.7378163 0.5868556
#Mand.    0.7867535 0.7432022
# THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE
hardman.m05 <- update(hardman.m04, .~.-lisL1:talkL1)
anova(hardman.m04,hardman.m05) # model without interaction is signif worse
# so interaction is highly significant

# THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE
# expand random part with talkL1 and lisL1 ## re-use discarded m05
hardman.m05 <- update(hardman.m04, .~.-(1|talker)+(0+talkL1|talker)-(1|Listener)+(0+lisL1|Listener) )
# between-listener intercepts and slopes tend to differ
anova(hardman.m04,hardman.m05)
#            Df   AIC   BIC  logLik Chisq Chi Df Pr(>Chisq)
#hardman.m04 14 11612 11717 -5792.1                        
#hardman.m05 25 11630 11818 -5790.0 4.174     11     0.9645
# including lisL1 and talkL1 in random part does NOT yield better model
## (should be checked separately for lisL1 and for talkL1)
# so right now m04 is the optimal model

# THE FOLLOWING STEPS WILL TAKE LONG TO COMPUTE
hardman.m06 <- update(hardman.m04, .~.+as.factor(TalkerAccuracy) )
summary(hardman.m06)
anova(hardman.m06,hardman.m04)
#            Df   AIC   BIC  logLik  Chisq Chi Df Pr(>Chisq)  
#hardman.m04 14 11612 11717 -5792.1                           
#hardman.m06 15 11611 11724 -5790.6 3.0883      1    0.07886 .
# model with TalkerAccuracy is slightly better
# but note that TalkerAccuracy varies only among M talkers not among A talkers...
with(hardman, tapply(TalkerAccuracy,list(talker),mean) )
#      AE1       AE2       AE3       ME1       ME2       ME3 
#1.0000000 1.0000000 1.0000000 0.5734767 0.5125448 0.5322581 
# ... therefore interaction cannot be estimated

> tapply( fitted(hardman.m04), list(hardman$talkL1,hardman$lisL1), mean )
   American     Hindi    Korean     Mand.
A 0.9523376 0.8930600 0.7377751 0.7867204
M 0.8219629 0.7587282 0.5868874 0.7432098

# SUMMARY BOXPLOTS OF MAIN EXPERIMENTAL EFFECTS

subset(hardman,talkL1=="A") -> hardman.A
subset(hardman,talkL1=="M") -> hardman.M
attach(hardman.A)	# attach allows easy reference to lisL1, masked by hardman.A
boxplot( fitted(hardman.m04)[hardman$talkL1=="A"]~lisL1, col="blue", notch=T, at=(1:4)-0.1 )
attach(hardman.M)
boxplot( fitted(hardman.m04)[hardman$talkL1=="M"]~lisL1, col="red", notch=T, at=(1:4)+0.1, add=T, xaxt="n" )
legend("bottomleft", fill=c("blue","red"), legend=c("Am. talker","Mand. talker") )

# BOOTSTRAP ESTIMATE OF RANDOM TERMS

# BEWARE BECAUSE THIS TAKES A LONG TIME
source(file="hardman.boot.r") # creates results.coef
# 3 two-stage bootstrap iterations, with GLMM, took approx 565.151 sec CPU ( 188.3837 per iter) 
results.coef
quantile(results.coef[,4], c(.025,.500,.975)) # random intercept listeners
     2.5%       50%     97.5% 
0.4723911 0.4982656 0.5496596