Edps/Psych/Stat 587
Spring 2019
C.J.Anderson
R: EDA of Fixed Effects Structure with a Little Regression Diagnostics
Date: tba
1. The first thing you need to do is install and load the following packages --- each should be in
require( ) or library( ) :
library(lme4)
library(lmerTest)
library(lattice)
library(ggplot2)
library(stringi) # installed before HLMdiag or HLMdiag won't load properly
library(texreg)
library(optimx)
2.
Do basic setup, which will be on lab4 template:
#
# read in data into R
# the first row of the data matrix is the variable names
#
# Change path to where your data live
Lab4<-read.table("D:/Dropbox/edps587/Lab_and_homework/
Computer lab4/lab3.txt",header=TRUE)
#look at first few rows of data set
head(lab3)
#
# Set-up -- from before
#
# Use if statement to create "boy"
lab3$boy = ifelse(lab4$gender=="boy", 1, 0) # for some
reason this creates "NA" for girls
lab4$boy[is.na(lab4$boy)] <- 1
# Create Third
lab3$third <- ifelse(lab4$grade==3, 1, 0)
# School mean Center math and school mean math
grpMmath <- as.data.frame(aggregate(math~idschool,
data=lab4, "mean"))
names(grpMmath) <- c('idschool', 'grpMmath')
lab3 <- merge(lab3,grpMmath, by=c('idschool'))
lab4$grpCmath <- lab4$math – lab4$grpMmath
#
# make school id a factor variable
#
Lab4$idschool<-as.factor(lab4$idschool)
#
# Re-scale for later use
#
lab3$gcmath <- scale(lab3$grpCmath, center=FALSE,
scale=TRUE)
lab3$gmmath <- scale(lab3$grpMmath, center=FALSE,
scale=TRUE)
3. Since this is a very large data set, it is sometimes nice to have a smaller random sample to us.
The following code will do this:
####################################################
# Get random sample of 25 schools
####################################################
#
# Look at individuals within schools
#
####################################################
groups <- unique(lab4$idschool)[sample(1:145,25)]
subset <- lab4[lab4$idschool%in%groups,]
4. The first graph is a lattice plot to look at individual students within schools. Use the random
sample and plot science by math scores.
a. With points for students and linear regression for each school
#
# Plot 2: Lattice plot with regressions
#
xyplot(science ~ math | idschool, data=subset,
col.line=c('black','blue'),
type=c('p','r'),
main='Varability in Science ~ math relationship')
b. Do the same plot but use smooth regression (i.e., loess) rather than linear. To do
this change
type=c('p','smooth')
c. Note: if you want to do include both points, loess and linear regression, add ‘r’to
the type statement…you do not have to do this but you can if you want to see what
is looks like. This code will do it:
xyplot(science ~ math | idschool, data=subset,
type=c('p','smooth','r'),
main='Varability in Science ~ math relationship')
5. Using the sub-sample of the data, draw lattice plot of science by hours of watching TV (as
numeric variable) including
a. Linear regression lines
b. Smooth (loess) regressions
6. Using the sub-sample of the data, draw lattice plot of science by hours of playing computer
games (as numeric variable) including
a. Linear regression lines
b. Smooth (loess) regressions
7. Draw plot of regressions of science on math for all schools in the same plot:
a. Note this code is a bit picky in that it works best if all the code below is on one line
of input.
############################################################
# All regressions in one plot: science ~ math
############################################################
ggplot(lab4, aes(x=math, y=science, col=idschool,type='l'),
main='Regressions all in one plot') +
geom_smooth(method="lm", se=FALSE ) +
theme(legend.position="none")
b. Repeat but now only use our sub-sample of data.
8. Plot science by hour of watching TV.
a. Need to compute mean science for each value of watching TV:
(y <- aggregate(science~hoursTV, data=lab4, "mean"))
b. Now plot the means
plot(y[,1], y[,2], type='b',
ylab='Mean Science',
xlab='Hours Watching TV',
main='Marginal of Science x HoursTV')
9. Plot mean science by hours of playing computer games. (Revise code from above).
10. Plot regressions of science on hours watching TV for schools all in one figure
a. Use linear regression:
# Linear regressions
ggplot(lab4, aes(x=hoursTV, y=science,
col=idschool,type='l')) + geom_smooth(method="lm",
se=FALSE ) + theme(legend.position="none")
b. Use loess (smooth) curve:
geom_smooth(method="loess", se=FALSE)
11. Repeat 10 but plot science by hours playing computer games.
12. Plot science by type of community
a. Use linear regression
lab4$community <- as.factor(lab4$typecommunity)
ggplot(lab4, aes(x=math, y=science, col=community,
type='l')) + geom_smooth(method='lm', se=FALSE)
b. Use smooth lines
ggplot(lab4, aes(x=math, y=science, col=community,
type='l')) + geom_smooth(method='loess', se=FALSE)
c. Check frequencies per type of community
table(lab4$typecommunity)
Or
lab4$group <- cut(lab4$science,10)
table(lab4$group,lab4$typecommunity)
13. Fit simple regression to each school and examine (plot) R-squares by sample size.
Set up for plot and loop
nj <- table(lab4$idschool) # number per school
nj <- as.data.frame(nj)
names(nj) <- c("idschool","nj")
ssmodel <- (0) # initialize SS model
sstotal <- (0) # initialize SS total
R2 <- matrix(99,nrow=nclusters,ncol=2) # for saving R2s
# For a simple model: science ~ math
# Loop through schools
index <-1 # initialize
for (i in (unique(lab4$idschool))) {
sub <- lab4[ which(lab4$idschool==i),] # data for one school
model0 <- lm(science ~ math, data=sub) # fit the model
a <- anova(model0) # save anova table
ssmodel <- ssmodel + a[1,2] # add to SS model
sstotal <- sstotal + sum(a[,2]) # sum of all SS
# save the R2
R2[index,1:2] <- as.numeric(cbind(i,summary(model0)$r.squared))
index <- index+1 # up-date index
}
a. Some clean up and prepare for plotting
R2meta <- ssmodel/sstotal
R2 <- as.data.frame(R2)
names(R2) <- c("idschool","R2")
R2.mod1 <- merge(R2,nj,by=c("idschool"))
b. Now plot the results
plot(R2.mod1$nj,R2.mod1$R2,type='p',
ylim=c(0,1),
ylab="R squared",
xlab="n_j (number of observations per school)" ,
main="Science ~ math")
abline(h=R2meta,col='blue') # add line for R2meta
text(70,0.95,'R2meta=.36',col="blue") # text of R2meta's
14. Compute and plot R-squares from regression of science on math, hours watching TV
and hours playing computer games where hours watching TV & playing computer
games are numeric variables. Call them
R2.mod2
15. Compute and plot R-square from regressions of science on math and hours watching
TV where hours watching TV are categorical variables. Call them
R2.mod3
16. To compare the R-squares from the models from item 13 (numeric TV) versus those
from item 14 (categorical TV), plot the R-squares against each other and draw in
reference line:
plot(R2.mod2$R2,R2.mod3$R2, pch=20,cex=1.4,
ylim=c(0,1),
xlim=c(0,1),
ylab='Hours as Discrete',
xlab='Hours as Numeric',
main='Hours Discrete vs Numeric')
abline(a=0,b=1, col='blue') # adds a reference line
17. Easiest regression diagnostic to obtain
a. Fit model:
modelxx <- lmer(science ~ 1 + gcmath + gmmath + hrsTV +
hrscg + ( 1 + gcmath|idschool), lab4, REML=FALSE,
control = lmerControl(optimizer ="Nelder_Mead"))
b. Plot the residuals
plot(modelxx, xlab='Fitted Conditional',
ylab='Pearson Residuals')
c. Use HLMdiag to get conditional residuals (i.e., the fixed plus EB of Us) and do a
dotplot Get stuff
# Get y-(X*gamma+Z*U) where U estimated by Empricial Bayes
res1 <- HLMresid(modelxx, level=1, type="EB",
standardize=TRUE)
head(res1) # look at what is here
d. Does dotpots
# Plot of random effects with confidence bars
dotplot(ranef(modelxx,condVar=TRUE),
lattice.options=list(layout=c(1,2)))
18. We will now make a panel of plots (similar to what SAS does):
a. Ask for 4 graphs on a page in a 2 x 2 layout:
par(mfrow=c(2,2))
b. Do a scatter plot of standardized residuals x fitted values
fit <- fitted(modelxx) # get conditional fitted values
plot(fit,res1, xlab='Conditional Fitted Values',
ylab='Pearson Std Residuals',
main='Conditional Residuals')
c. Does the plot and adds in reference line
qqnorm(res1) # draws plot
abline(a=0,b=9, col='blue') # reference line
d. Draw histogram of standardized residuals are overlay normal distribution
h<- hist(res1,breaks=15,density=20) # draw historgram
xfit <- seq(-40, 40, length=50) # sets range & number quantiles
yfit <- dnorm(xfit, mean=0, sd=7.177) # should be normal
yfit <- yfit*diff(h$mids[1:2])*length(res1)# use mid-points
lines(xfit, yfit, col='darkblue', lwd=2) # draws normal
e. In the 4th plot put information about the model.
plot.new( ) # a plot with nothing in it.
text(.5,1.0,'Modelxx') # potentially useful text
text(.5,0.9,'Devience=48,356.5')
text(.5,0.8,'AIC=48,386.5')
text(.5,0.7,'BIC=48,489.5')
18. Influence diagnostics using package HLMdiag functions:
a. Cook’s distances:
par(mfrow=c(2,2))
cook <- cooks.distance(modelxx, group="idschool")
dotplot_diag(x=cook, cutoff="internal",
name="cooks.distance", ylab=("Cook's distance"),
xlab=("School"))
b. Mdfit:
mdfit <- mdffits(modelxx,group="idschool")
dotplot_diag(x=mdfit, cutoff="internal", name="mdffits",
ylab=("MDFIts"), xlab=("School"))
19. Not REUQIRED FOR HOMEWORK, but these can be
informative. See lme4 mannual for explanation of zeta
plot as well as some other graphics.
# zeta plot: straight lines are good
p1 <- profile(lmer(science ~ 1 + gcmath + gmmath +hrsTV + hrscg
+ ( 1 + gcmath|idschool), lab4, REML=FALSE))
xyplot(p1 , aspect =0.7)
xyplot(p1 , aspect =0.7, absVal=TRUE)
confint(p1, level=.99)