diff --git a/intro-linear-mixed-effects-models/LMM.Rmd b/intro-linear-mixed-effects-models/LMM.Rmd new file mode 100644 index 0000000..b543435 --- /dev/null +++ b/intro-linear-mixed-effects-models/LMM.Rmd @@ -0,0 +1,196 @@ +--- +title: "Linear_Mixed_Effects_Models: example datasets & models" +author: "Reuben Thomas" +date: "2023-04-24" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` +## Load the necessary libraries for this markdown +If you don't already have these packages installed then please do so before implementing the code in the rest of this file. +```{r} +require(lattice) +require(lme4) +require(ggplot2) +require(dplyr) +require(magrittr) + +``` +## Sleepstudy +Example of a data set modeled by random intercept and random slope mixed effects model + +```{r} +require(lattice) +require(lme4) +require(ggplot2) +require(dplyr) +require(magrittr) + +##what is the sleepstudy +?sleepstudy +##load the data +data("sleepstudy") + +##best fit line plot the data ignoring individual subjects +ggplot(sleepstudy, aes(x=Days, y=Reaction)) + + geom_point() + + geom_smooth(method = "lm", se=FALSE) + + ylab("reaction time in ms") + + theme_classic() + + theme(text = element_text(size = 16)) + + +##best fit line plots per individual subject +ggplot(sleepstudy, aes(x=Days, y=Reaction, color=Subject)) + + geom_point() + + geom_smooth(method = "lm", se=FALSE) + + ylab("reaction time in ms") + + theme_classic() + + theme(text = element_text(size = 16)) + + +##as boxplot per subject +ggplot(sleepstudy, aes(x=Subject, y=Reaction)) + + geom_boxplot() + + ylab("reaction time in ms") + + theme_classic() + + theme(text = element_text(size = 16)) + +##as boxplot per subject coloring the days +ggplot(sleepstudy, aes(x=Subject, y=Reaction)) + + geom_boxplot() + + geom_point(aes(color=Days)) + + ylab("reaction time in ms") + + theme_classic() + + theme(text = element_text(size = 16)) + +##xy plot +xyplot(Reaction ~ Days | Subject, sleepstudy, type = c("g","p","r"), + index = function(x,y) coef(lm(y ~ x))[1], + xlab = "Days of sleep deprivation", + ylab = "Average reaction time (ms)", aspect = "xy") + +##correlated random intercept and random slope +fm1 <- lme4::lmer(Reaction ~ Days + (Days | Subject), sleepstudy) +summary(fm1) + +##uncorrelated random intercept and random slope +fm2 <- lme4::lmer(Reaction ~ Days + (Days || Subject), sleepstudy) +summary(fm2) + +##ignoring the repeated measures within a subject - linear model +lm1 <- lm(Reaction ~ Days, data = sleepstudy) +summary(lm1) +``` + +## Dye stuff + +Example of a situation where pseudo-bulking the repeated measures per subject works + +```{r} +?Dyestuff +data("Dyestuff") + +#boxplot of yield by batch +ggplot(Dyestuff, aes(x=Batch, y=Yield)) + + geom_boxplot() + + geom_point() + + theme_classic() + + theme(text = element_text(size = 16)) + +#simple random intercept model +fm1 <- lmer(Yield ~ 1|Batch, Dyestuff) +summary(fm1) + +#ignore the repeated measures within a batch: linear model +lm1 <- lm(Yield ~ 1, Dyestuff) +summary(lm1) + +#pseudo-bulk yield by batch +summ_Dyestuff <- Dyestuff %>% + group_by(Batch) %>% + summarise(mYield = mean(Yield)) + +#linear model on the pseudo-bulked data +lm2 <- lm(mYield ~ 1, summ_Dyestuff) +summary(lm2) + +``` + +## Penicillin +Example illustrated crossed random designs +```{r} +data("Penicillin") +?Penicillin + +#visualize the data as a boxplot +ggplot(Penicillin, aes(x=plate, y=diameter)) + + geom_boxplot() + + geom_point(aes(color=sample)) + + theme(text = element_text(size = 16)) + + +#mixed effects model with two random effects +fm1 <- lme4::lmer(diameter ~ (1|plate) + (1|sample), Penicillin) +summary(fm1) + +#mixed effects model with one of them +fm2 <- lme4::lmer(diameter ~ (1|sample), Penicillin) +summary(fm2) + +#significance of the plate effect +anova(fm1, fm2) + +#ignoring the repeated measures associated with plate and sample +lm1 <- lm(diameter ~ 1, Penicillin) +summary(lm1) + +#pseudo-bulk by plate +p_summ_Penicillin <- Penicillin %>% + group_by(plate) %>% + summarise(m_diameter = mean(diameter)) + +#linear model with plate pseudo-bulked data +lm2 <- lm(m_diameter ~ 1, p_summ_Penicillin) +summary(lm2) + +#pseudo-bulk by sample +s_summ_Penicillin <- Penicillin %>% + group_by(sample) %>% + summarise(m_diameter = mean(diameter)) + +#linear model with sample pseudo-bulked data +lm3 <- lm(m_diameter ~ 1, s_summ_Penicillin) +summary(lm3) + + +``` + + +## Pastes data +Hierarchical or nested random effects +```{r} +?Pastes +data(Pastes) + +##dotplot visualization of the data +dotplot(cask ~ strength | reorder(batch, strength), Pastes, + strip = FALSE, strip.left = TRUE, layout = c(1, 10), + ylab = "Cask within batch", + xlab = "Paste strength", jitter.y = TRUE) + +##linear mixed effects model implemented nested/hierarchical design +fm1 <- lme4::lmer(strength ~ (1|batch/cask), Pastes) +summary(fm1) + +##ignoring the repeated measures with a linear model +lm1 <- lm(strength ~ 1, Pastes) +summary(lm1) + +##incorrectly modeling the nested random effects as crossed ones +fm2 <- lme4::lmer(strength ~ (1|batch) + (1|cask), Pastes) +summary(fm2) + +``` \ No newline at end of file diff --git a/intro-linear-mixed-effects-models/LMM.html b/intro-linear-mixed-effects-models/LMM.html new file mode 100644 index 0000000..77ca9b2 --- /dev/null +++ b/intro-linear-mixed-effects-models/LMM.html @@ -0,0 +1,853 @@ + + + + +
+ + + + + + + + + + +If you don’t already have these packages installed then please do so +before implementing the code in the rest of this file.
+require(lattice)
+## Loading required package: lattice
+require(lme4)
+## Loading required package: lme4
+## Loading required package: Matrix
+require(ggplot2)
+## Loading required package: ggplot2
+require(dplyr)
+## Loading required package: dplyr
+##
+## Attaching package: 'dplyr'
+## The following objects are masked from 'package:stats':
+##
+## filter, lag
+## The following objects are masked from 'package:base':
+##
+## intersect, setdiff, setequal, union
+require(magrittr)
+## Loading required package: magrittr
+Example of a data set modeled by random intercept and random slope +mixed effects model
+require(lattice)
+require(lme4)
+require(ggplot2)
+require(dplyr)
+require(magrittr)
+
+##what is the sleepstudy
+?sleepstudy
+##load the data
+data("sleepstudy")
+
+##best fit line plot the data ignoring individual subjects
+ggplot(sleepstudy, aes(x=Days, y=Reaction)) +
+ geom_point() +
+ geom_smooth(method = "lm", se=FALSE) +
+ ylab("reaction time in ms") +
+ theme_classic() +
+ theme(text = element_text(size = 16))
+## `geom_smooth()` using formula = 'y ~ x'
+##best fit line plots per individual subject
+ggplot(sleepstudy, aes(x=Days, y=Reaction, color=Subject)) +
+ geom_point() +
+ geom_smooth(method = "lm", se=FALSE) +
+ ylab("reaction time in ms") +
+ theme_classic() +
+ theme(text = element_text(size = 16))
+## `geom_smooth()` using formula = 'y ~ x'
+##as boxplot per subject
+ggplot(sleepstudy, aes(x=Subject, y=Reaction)) +
+ geom_boxplot() +
+ ylab("reaction time in ms") +
+ theme_classic() +
+ theme(text = element_text(size = 16))
+##as boxplot per subject coloring the days
+ggplot(sleepstudy, aes(x=Subject, y=Reaction)) +
+ geom_boxplot() +
+ geom_point(aes(color=Days)) +
+ ylab("reaction time in ms") +
+ theme_classic() +
+ theme(text = element_text(size = 16))
+##xy plot
+xyplot(Reaction ~ Days | Subject, sleepstudy, type = c("g","p","r"),
+ index = function(x,y) coef(lm(y ~ x))[1],
+ xlab = "Days of sleep deprivation",
+ ylab = "Average reaction time (ms)", aspect = "xy")
+##correlated random intercept and random slope
+fm1 <- lme4::lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
+summary(fm1)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: Reaction ~ Days + (Days | Subject)
+## Data: sleepstudy
+##
+## REML criterion at convergence: 1743.6
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -3.9536 -0.4634 0.0231 0.4634 5.1793
+##
+## Random effects:
+## Groups Name Variance Std.Dev. Corr
+## Subject (Intercept) 612.10 24.741
+## Days 35.07 5.922 0.07
+## Residual 654.94 25.592
+## Number of obs: 180, groups: Subject, 18
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 251.405 6.825 36.838
+## Days 10.467 1.546 6.771
+##
+## Correlation of Fixed Effects:
+## (Intr)
+## Days -0.138
+##uncorrelated random intercept and random slope
+fm2 <- lme4::lmer(Reaction ~ Days + (Days || Subject), sleepstudy)
+summary(fm2)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: Reaction ~ Days + ((1 | Subject) + (0 + Days | Subject))
+## Data: sleepstudy
+##
+## REML criterion at convergence: 1743.7
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -3.9626 -0.4625 0.0204 0.4653 5.1860
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## Subject (Intercept) 627.57 25.051
+## Subject.1 Days 35.86 5.988
+## Residual 653.58 25.565
+## Number of obs: 180, groups: Subject, 18
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 251.405 6.885 36.513
+## Days 10.467 1.560 6.712
+##
+## Correlation of Fixed Effects:
+## (Intr)
+## Days -0.184
+##ignoring the repeated measures within a subject - linear model
+lm1 <- lm(Reaction ~ Days, data = sleepstudy)
+summary(lm1)
+##
+## Call:
+## lm(formula = Reaction ~ Days, data = sleepstudy)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -110.848 -27.483 1.546 26.142 139.953
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 251.405 6.610 38.033 < 2e-16 ***
+## Days 10.467 1.238 8.454 9.89e-15 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 47.71 on 178 degrees of freedom
+## Multiple R-squared: 0.2865, Adjusted R-squared: 0.2825
+## F-statistic: 71.46 on 1 and 178 DF, p-value: 9.894e-15
+Example of a situation where pseudo-bulking the repeated measures per +subject works
+?Dyestuff
+data("Dyestuff")
+
+#boxplot of yield by batch
+ggplot(Dyestuff, aes(x=Batch, y=Yield)) +
+ geom_boxplot() +
+ geom_point() +
+ theme_classic() +
+ theme(text = element_text(size = 16))
+#simple random intercept model
+fm1 <- lmer(Yield ~ 1|Batch, Dyestuff)
+summary(fm1)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: Yield ~ 1 | Batch
+## Data: Dyestuff
+##
+## REML criterion at convergence: 319.7
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -1.4117 -0.7634 0.1418 0.7792 1.8296
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## Batch (Intercept) 1764 42.00
+## Residual 2451 49.51
+## Number of obs: 30, groups: Batch, 6
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 1527.50 19.38 78.8
+#ignore the repeated measures within a batch: linear model
+lm1 <- lm(Yield ~ 1, Dyestuff)
+summary(lm1)
+##
+## Call:
+## lm(formula = Yield ~ 1, data = Dyestuff)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -87.50 -58.75 2.50 47.50 107.50
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 1527.50 11.51 132.8 <2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 63.02 on 29 degrees of freedom
+#pseudo-bulk yield by batch
+summ_Dyestuff <- Dyestuff %>%
+ group_by(Batch) %>%
+ summarise(mYield = mean(Yield))
+
+#linear model on the pseudo-bulked data
+lm2 <- lm(mYield ~ 1, summ_Dyestuff)
+summary(lm2)
+##
+## Call:
+## lm(formula = mYield ~ 1, data = summ_Dyestuff)
+##
+## Residuals:
+## 1 2 3 4 5 6
+## -22.5 0.5 36.5 -29.5 72.5 -57.5
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 1527.50 19.38 78.8 6.23e-09 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 47.48 on 5 degrees of freedom
+Example illustrated crossed random designs
+data("Penicillin")
+?Penicillin
+
+#visualize the data as a boxplot
+ggplot(Penicillin, aes(x=plate, y=diameter)) +
+ geom_boxplot() +
+ geom_point(aes(color=sample)) +
+ theme(text = element_text(size = 16))
+#mixed effects model with two random effects
+fm1 <- lme4::lmer(diameter ~ (1|plate) + (1|sample), Penicillin)
+summary(fm1)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: diameter ~ (1 | plate) + (1 | sample)
+## Data: Penicillin
+##
+## REML criterion at convergence: 330.9
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -2.07923 -0.67140 0.06292 0.58377 2.97959
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## plate (Intercept) 0.7169 0.8467
+## sample (Intercept) 3.7311 1.9316
+## Residual 0.3024 0.5499
+## Number of obs: 144, groups: plate, 24; sample, 6
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 22.9722 0.8086 28.41
+#mixed effects model with one of them
+fm2 <- lme4::lmer(diameter ~ (1|sample), Penicillin)
+summary(fm2)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: diameter ~ (1 | sample)
+## Data: Penicillin
+##
+## REML criterion at convergence: 435.9
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -1.97355 -0.90190 0.02988 1.00350 2.02207
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## sample (Intercept) 3.701 1.924
+## Residual 1.019 1.010
+## Number of obs: 144, groups: sample, 6
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 22.9722 0.7899 29.08
+#significance of the plate effect
+anova(fm1, fm2)
+## refitting model(s) with ML (instead of REML)
+## Data: Penicillin
+## Models:
+## fm2: diameter ~ (1 | sample)
+## fm1: diameter ~ (1 | plate) + (1 | sample)
+## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
+## fm2 3 443.19 452.10 -218.59 437.19
+## fm1 4 340.19 352.07 -166.09 332.19 105 1 < 2.2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#ignoring the repeated measures associated with plate and sample
+lm1 <- lm(diameter ~ 1, Penicillin)
+summary(lm1)
+##
+## Call:
+## lm(formula = diameter ~ 1, data = Penicillin)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -4.9722 -0.9722 0.0278 1.0278 4.0278
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 22.9722 0.1693 135.7 <2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 2.031 on 143 degrees of freedom
+#pseudo-bulk by plate
+p_summ_Penicillin <- Penicillin %>%
+ group_by(plate) %>%
+ summarise(m_diameter = mean(diameter))
+
+#linear model with plate pseudo-bulked data
+lm2 <- lm(m_diameter ~ 1, p_summ_Penicillin)
+summary(lm2)
+##
+## Call:
+## lm(formula = m_diameter ~ 1, data = p_summ_Penicillin)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -1.47222 -0.68056 0.02778 0.86111 1.52778
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 22.9722 0.1788 128.5 <2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 0.876 on 23 degrees of freedom
+#pseudo-bulk by sample
+s_summ_Penicillin <- Penicillin %>%
+ group_by(sample) %>%
+ summarise(m_diameter = mean(diameter))
+
+#linear model with sample pseudo-bulked data
+lm3 <- lm(m_diameter ~ 1, s_summ_Penicillin)
+summary(lm3)
+##
+## Call:
+## lm(formula = m_diameter ~ 1, data = s_summ_Penicillin)
+##
+## Residuals:
+## 1 2 3 4 5 6
+## 2.19444 -1.01389 1.94444 -0.09722 -0.01389 -3.01389
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 22.9722 0.7899 29.08 9.01e-07 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 1.935 on 5 degrees of freedom
+Hierarchical or nested random effects
+?Pastes
+data(Pastes)
+
+##dotplot visualization of the data
+dotplot(cask ~ strength | reorder(batch, strength), Pastes,
+ strip = FALSE, strip.left = TRUE, layout = c(1, 10),
+ ylab = "Cask within batch",
+ xlab = "Paste strength", jitter.y = TRUE)
+##linear mixed effects model implemented nested/hierarchical design
+fm1 <- lme4::lmer(strength ~ (1|batch/cask), Pastes)
+summary(fm1)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: strength ~ (1 | batch/cask)
+## Data: Pastes
+##
+## REML criterion at convergence: 247
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -1.4798 -0.5156 0.0095 0.4720 1.3897
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## cask:batch (Intercept) 8.434 2.9041
+## batch (Intercept) 1.657 1.2874
+## Residual 0.678 0.8234
+## Number of obs: 60, groups: cask:batch, 30; batch, 10
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 60.0533 0.6769 88.72
+##ignoring the repeated measures with a linear model
+lm1 <- lm(strength ~ 1, Pastes)
+summary(lm1)
+##
+## Call:
+## lm(formula = strength ~ 1, data = Pastes)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -5.8533 -2.5533 -0.7533 2.8217 5.9467
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 60.053 0.418 143.7 <2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 3.238 on 59 degrees of freedom
+##incorrectly modeling the nested random effects as crossed ones
+fm2 <- lme4::lmer(strength ~ (1|batch) + (1|cask), Pastes)
+summary(fm2)
+## Linear mixed model fit by REML ['lmerMod']
+## Formula: strength ~ (1 | batch) + (1 | cask)
+## Data: Pastes
+##
+## REML criterion at convergence: 301.5
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -1.49025 -0.90096 -0.01247 0.62911 1.82246
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## batch (Intercept) 3.3639 1.8341
+## cask (Intercept) 0.1487 0.3856
+## Residual 7.3060 2.7030
+## Number of obs: 60, groups: batch, 10; cask, 3
+##
+## Fixed effects:
+## Estimate Std. Error t value
+## (Intercept) 60.0533 0.7125 84.28
+