1 Read a dataset of animal coagulation time vs diet
2 Visualize the data using scatterplot and boxplot
3 Fit a Linear Model
4 Analysis of Variance
5 Model Checking
6 Comparing Effects
7 A complete R Procedure for Example 3.1 from Textbook

1 Read a dataset of animal coagulation time vs diet

coagulation <- read.table("data/coagulation.txt", header=T)
coagulation

coag	diet
62	A
60	A
63	A
59	A
63	B
67	B
71	B
64	B
65	B
66	B
68	C
66	C
71	C
67	C
68	C
68	C
56	D
62	D
60	D
61	D
63	D
64	D
63	D
59	D

coag <- coagulation$coag
diet <- coagulation$diet

2 Visualize the data using scatterplot and boxplot

plot (as.numeric(diet), coag, 
      col = diet, 
      pch = as.numeric(diet), 
      xaxt="n", xlab = "Diet")
axis (1, 1:4, LETTERS[1:4])

boxplot(coag~diet)

3 Fit a Linear Model

3.1 Fit a linear model with cell means

# 
g0 <- lm(coag~0+diet)
# What model is fitted?
data.frame(coag,model.matrix(g0))

coag	dietA	dietB	dietC	dietD
62	1	0	0	0
60	1	0	0	0
63	1	0	0	0
59	1	0	0	0
63	0	1	0	0
67	0	1	0	0
71	0	1	0	0
64	0	1	0	0
65	0	1	0	0
66	0	1	0	0
68	0	0	1	0
66	0	0	1	0
71	0	0	1	0
67	0	0	1	0
68	0	0	1	0
68	0	0	1	0
56	0	0	0	1
62	0	0	0	1
60	0	0	0	1
61	0	0	0	1
63	0	0	0	1
64	0	0	0	1
63	0	0	0	1
59	0	0	0	1

This model is written as: \[ y_k = \mu_A x_k^A+\mu_B x_k^B+\mu_C x_k^C+\mu_D x_k^D+\epsilon_k \] where \(x_k^i=I(x_k=i)\) for \(i=A,B,C,D\).

# The model fitting results:
summary(g0)

# 
# Call:
# lm(formula = coag ~ 0 + diet)
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -5.00  -1.25   0.00   1.25   5.00 
# 
# Coefficients:
#       Estimate Std. Error t value Pr(>|t|)    
# dietA   61.000      1.183    51.5   <2e-16 ***
# dietB   66.000      0.966    68.3   <2e-16 ***
# dietC   68.000      0.966    70.4   <2e-16 ***
# dietD   61.000      0.837    72.9   <2e-16 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 2.37 on 20 degrees of freedom
# Multiple R-squared:  0.999,   Adjusted R-squared:  0.999 
# F-statistic: 4.4e+03 on 4 and 20 DF,  p-value: <2e-16

3.2 Fit a linear model with centralized effects

# 
g3 <- lm(coag~diet, contrasts = list(diet=contr.sum))

# What model is fitted?
data.frame(coag, model.matrix(g3))

coag	X.Intercept.	diet1	diet2	diet3
62	1	1	0	0
60	1	1	0	0
63	1	1	0	0
59	1	1	0	0
63	1	0	1	0
67	1	0	1	0
71	1	0	1	0
64	1	0	1	0
65	1	0	1	0
66	1	0	1	0
68	1	0	0	1
66	1	0	0	1
71	1	0	0	1
67	1	0	0	1
68	1	0	0	1
68	1	0	0	1
56	1	-1	-1	-1
62	1	-1	-1	-1
60	1	-1	-1	-1
61	1	-1	-1	-1
63	1	-1	-1	-1
64	1	-1	-1	-1
63	1	-1	-1	-1
59	1	-1	-1	-1

This model is written as: \[ y_k = \mu+\tau_A (x_k^A-x_k^D)+\tau_B (x_k^B-x_k^D)+\tau_C (x_k^C-x_k^D)+\epsilon_k \] where \[\mu=(\mu_A+\ldots+\mu_D)/4, \tau_A=\mu_A-\mu,\ldots,\tau_D=\mu_D-\mu\]

## fitting results
summary(g3)

# 
# Call:
# lm(formula = coag ~ diet, contrasts = list(diet = contr.sum))
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -5.00  -1.25   0.00   1.25   5.00 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   64.000      0.498  128.54  < 2e-16 ***
# diet1         -3.000      0.974   -3.08  0.00589 ** 
# diet2          2.000      0.845    2.37  0.02819 *  
# diet3          4.000      0.845    4.73  0.00013 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 2.37 on 20 degrees of freedom
# Multiple R-squared:  0.671,   Adjusted R-squared:  0.621 
# F-statistic: 13.6 on 3 and 20 DF,  p-value: 4.66e-05

3.3 Fit a linear model with treatment differences

# 
g <- lm(coag~diet, contrasts = list(diet=contr.treatment(n=4,base = 1)))

# What model is fitted?
data.frame(model.matrix(g))

X.Intercept.	diet2	diet3	diet4
1	0	0	0
1	0	0	0
1	0	0	0
1	0	0	0
1	1	0	0
1	1	0	0
1	1	0	0
1	1	0	0
1	1	0	0
1	1	0	0
1	0	1	0
1	0	1	0
1	0	1	0
1	0	1	0
1	0	1	0
1	0	1	0
1	0	0	1
1	0	0	1
1	0	0	1
1	0	0	1
1	0	0	1
1	0	0	1
1	0	0	1
1	0	0	1

This model is written as: \[ y_k = \beta_0+\beta_B x_k^B+\beta_Cx_k^C+\beta_D x_k^D+\epsilon_k \] where, \[\beta_0=\mu_A, \beta_B=\mu_B-\mu_A, \beta_C=\mu_C-\mu_A, \beta_D=\mu_D-\mu_A.\] Note that \[\beta_B=\tau_B-\tau_A,\beta_C=\tau_C-\tau_A,\beta_D=\tau_D-\tau_A.\]

## fitting results
summary(g)

# 
# Call:
# lm(formula = coag ~ diet, contrasts = list(diet = contr.treatment(n = 4, 
#     base = 1)))
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -5.00  -1.25   0.00   1.25   5.00 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 6.10e+01   1.18e+00   51.55  < 2e-16 ***
# diet2       5.00e+00   1.53e+00    3.27  0.00380 ** 
# diet3       7.00e+00   1.53e+00    4.58  0.00018 ***
# diet4       2.99e-15   1.45e+00    0.00  1.00000    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 2.37 on 20 degrees of freedom
# Multiple R-squared:  0.671,   Adjusted R-squared:  0.621 
# F-statistic: 13.6 on 3 and 20 DF,  p-value: 4.66e-05

## by default, R uses this mean difference parametrization
gd <- lm(coag~diet)

summary(gd)

# 
# Call:
# lm(formula = coag ~ diet)
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -5.00  -1.25   0.00   1.25   5.00 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 6.10e+01   1.18e+00   51.55  < 2e-16 ***
# dietB       5.00e+00   1.53e+00    3.27  0.00380 ** 
# dietC       7.00e+00   1.53e+00    4.58  0.00018 ***
# dietD       2.99e-15   1.45e+00    0.00  1.00000    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 2.37 on 20 degrees of freedom
# Multiple R-squared:  0.671,   Adjusted R-squared:  0.621 
# F-statistic: 13.6 on 3 and 20 DF,  p-value: 4.66e-05

4 Analysis of Variance

4.1 Fitted values, Residuals, and Sum Squares

This section is for illustrating prediction and residuals

y <- coag
y_hat_h0 <- rep (mean (y), length (y)); y_hat_h0

#  [1] 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64

y_hat_h1 <- predict (lm (y~diet)); y_hat_h1;

#  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
# 61 61 61 61 66 66 66 66 66 66 68 68 68 68 68 68 61 61 61 61 61 61 61 61

# residual (error) from h0 model (equal means)
resid_h0 <- y - y_hat_h0
# residual (error) from h1 model (unequal means)
resid_h1 <- y - y_hat_h1 
data.frame (diet, y=y, y_hat_h0, resid_h0, y_hat_h1, resid_h1, diff.residual=y_hat_h1-y_hat_h0 )

diet	y	y_hat_h0	resid_h0	y_hat_h1	resid_h1	diff.residual
A	62	64	-2	61	1	-3
A	60	64	-4	61	-1	-3
A	63	64	-1	61	2	-3
A	59	64	-5	61	-2	-3
B	63	64	-1	66	-3	2
B	67	64	3	66	1	2
B	71	64	7	66	5	2
B	64	64	0	66	-2	2
B	65	64	1	66	-1	2
B	66	64	2	66	0	2
C	68	64	4	68	0	4
C	66	64	2	68	-2	4
C	71	64	7	68	3	4
C	67	64	3	68	-1	4
C	68	64	4	68	0	4
C	68	64	4	68	0	4
D	56	64	-8	61	-5	-3
D	62	64	-2	61	1	-3
D	60	64	-4	61	-1	-3
D	61	64	-3	61	0	-3
D	63	64	-1	61	2	-3
D	64	64	0	61	3	-3
D	63	64	-1	61	2	-3
D	59	64	-5	61	-2	-3

Visulize the predictive values and residuals

plot (y, col = diet,pch = as.numeric(diet),
      main="Coaglution time versus Run Order (Sample ID)")
lines (1:4,g$fitted.values[1:4], col = 1)
lines (5:10,g$fitted.values[5:10], col = 2)
lines (11:16,g$fitted.values[11:16], col = 3)
lines (17:24,g$fitted.values[17:24], col = 4)
abline(h=mean(y))

# compute SS
SSt <-sum((y - y_hat_h0)^2); SSt

# [1] 340

SSe <- sum((y - y_hat_h1)^2); SSe

# [1] 112

SStr <- SSt - SSe; SStr # alternatively we can compute SStr by

# [1] 228

SStr2 <- sum((y_hat_h0-y_hat_h1)^2); SStr2

# [1] 228

Plot of Sum Square Against Degree Freedom

rss <- c(SSt, SSe,0)
num.par <- c(1, 4, n=nrow(coagulation))
plot(rss~num.par, xlab = "Number of Parameters", ylab = "Residual Sum Squares", type="b")
grid(nx=5*5)

4.2 ANOVA by Hand (Short-cut Formula)

Let \(n=n_1+\ldots,n_a\) with \(n_i\) is the number of observation in group \(i\). Formulae for computing SS: \[RSS_0 = SST = \sum_{i,j} y_{ij}^2-n\bar{y_{..}}^2 \] \[RSS_0-RSS_1 = SS_{tr} = \sum_i^a n_i\bar{y}_{i.}^2-n\bar{y_{..}}^2\] \[RSS_1 = SSE=SST-SS_{tr}=\sum_{i,j} y_{ij}^2 - \sum_i^a n_i\bar{y}_{i.}^2 = \sum_i^a SS_i\]

ybar_i. <- tapply (y, mean, INDEX = diet); ybar_i.

#  A  B  C  D 
# 61 66 68 61

ybar_.. <- mean (y); ybar_..

# [1] 64

n_i <- table (diet); n_i

# diet
# A B C D 
# 4 6 6 8

n <- length (y)
SSy <- sum (y^2)
# the above values will be provided in test question

# compute SSt,SSe, SStr
SSt <- SSy-n*ybar_..^2; SSt

# [1] 340

n_i*ybar_i.^2 # vectoized calculation

# diet
#     A     B     C     D 
# 14884 26136 27744 29768

SStr <- sum(n_i*ybar_i.^2) - n*ybar_..^2; SStr

# [1] 228

SSe <- SSt - SStr; SSe

# [1] 112

# compute F and output anova table
k <- length (unique (diet));k

# [1] 4

n <- length (y);n

# [1] 24

df1 <- k - 1
df2 <- n - k
MStr <- SStr/df1
MSe <- SSe/df2
f <- MStr /MSe
pvalue <- 1-pf (f, df1=df1, df2=df2); pvalue

# [1] 4.658e-05

anova_table <- data.frame (Df = c(df1,df2), 
                           SS = c(SStr,SSe), 
                           MS = c(MStr, MSe), 
                           F = c(f,NA), 
                           pvalue = c(pvalue, NA)); 
row.names(anova_table) <- c("diet", "Residuals")
anova_table

	Df	SS	MS	F	pvalue
diet	3	228	76.0	13.57	0
Residuals	20	112	5.6	NA	NA

4.3 ANOVA with R Function

Compute ANOVA with the fitting result with intercept

g <- lm(coag~diet)
anova (g)

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
diet	3	228	76.0	13.57	0
Residuals	20	112	5.6	NA	NA

4.4 Understanding Sampling Distribution of SS and F

We will use simulation to understand the sampling distributions of sum squares and F statistics in ANOVA.

When treat difference is 0

g <- lm (coag~diet)
n <- nrow (coagulation)
g$coefficients[2:4] <- 0
ylim0 <- range (coag) + c(-2,2)

# Simulate datasets with modified coeffients
for(i in 1:100){
  sim.coag <- predict(g, newdata = coagulation)+rnorm(n,0,2.37)
  par(mfrow=c(1,2))
  plot(sim.coag~as.numeric(diet),
       col=as.numeric(diet),
       xlab = "Diet",
       ylab = "Simulated Response",
       ylim = ylim0)    
  fit.sim.coag <- lm(sim.coag~diet)
  anova.fit.sim.coag <- anova(fit.sim.coag)
  ss.sim.coag <- anova.fit.sim.coag$`Sum Sq`
  rss.sim.coag <- rev(cumsum(c(0, rev(ss.sim.coag))))
  num.par <- cumsum(c(1,anova.fit.sim.coag$Df))
  plot(rss.sim.coag~num.par,
       xlab="Number of Parameters", 
       ylab = "Residual Sum Squares", 
       type ="b",
       main=sprintf("F=%.2f",anova.fit.sim.coag$`F value`[1])
       )

  grid(nx=25)
}

When treat difference is non-zero

g <- lm (coag~diet)
n <- nrow (coagulation)
g$coefficients[2:4] <- c(1,2,3)*3
ylim0 <- range (coag) + c(-2,5)

# Simulate datasets with modified coeffients
for(i in 1:100){
  sim.coag <- predict(g, newdata = coagulation)+rnorm(n,0,2.37)
  par(mfrow=c(1,2))
  plot(sim.coag~as.numeric(diet),
       col=as.numeric(diet),
       xlab = "Diet",
       ylab = "Simulated Response",
       ylim = ylim0)    
  fit.sim.coag <- lm(sim.coag~diet)
  anova.fit.sim.coag <- anova(fit.sim.coag)
  ss.sim.coag <- anova.fit.sim.coag$`Sum Sq`
  rss.sim.coag <- rev(cumsum(c(0, rev(ss.sim.coag))))
  num.par <- cumsum(c(1,anova.fit.sim.coag$Df))
  plot(rss.sim.coag~num.par,
       xlab="Number of Parameters", 
       ylab = "Residual Sum Squares", 
       type ="b",
       main=sprintf("F=%.2f",anova.fit.sim.coag$`F value`[1])
  )
  grid(nx=25)
}

5 Model Checking

## fit lm model
g <- lm(coag~diet)
summary (g)

# 
# Call:
# lm(formula = coag ~ diet)
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -5.00  -1.25   0.00   1.25   5.00 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 6.10e+01   1.18e+00   51.55  < 2e-16 ***
# dietB       5.00e+00   1.53e+00    3.27  0.00380 ** 
# dietC       7.00e+00   1.53e+00    4.58  0.00018 ***
# dietD       2.99e-15   1.45e+00    0.00  1.00000    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 2.37 on 20 degrees of freedom
# Multiple R-squared:  0.671,   Adjusted R-squared:  0.621 
# F-statistic: 13.6 on 3 and 20 DF,  p-value: 4.66e-05

## plot residuals vs run order
plot(g$residuals,
     xlab="Run order", ylab="Residuals", 
     main="Plot of the residuals versus Run Order",  
     col = diet, pch = as.numeric(diet))
abline (h=0)

# check the constant variance assumption
plot(g$fit,g$residuals,
     xlab="Fitted",ylab="Residuals",
     main="Plot of the residuals versus fitted values",
     col = diet,
     pch = as.numeric(diet))

## add some jitter since the fitted value for diet A and D are the same
plot(jitter(g$fit), g$residuals, xlab="Fitted",ylab="Residuals",
     main="Jittered plot of the residuals versus fitted values",
     col = diet,
     pch = as.numeric(diet))

## draw qq plot to check normality of residuals
qqnorm(g$residuals)
qqline(g$residuals)

## testing normality of residuals
shapiro.test(g$residuals)

# 
#   Shapiro-Wilk normality test
# 
# data:  g$residuals
# W = 0.98, p-value = 0.9

# checking equality of variances 
## an illustration
abs.res <- abs(g$residuals)
plot (abs.res, pch = as.numeric(diet), col = diet)

boxplot (abs.res~diet)

anova (lm (abs.res~diet))

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
diet	3	4.333	1.444	0.7046	0.5604
Residuals	20	41.000	2.050	NA	NA

## traditional levene test is as above
levene.test(coagulation$coag, coagulation$diet, location = "mean")

# 
#   Classical Levene's test based on the absolute deviations from the mean
#   ( none not applied because the location is not set to median )
# 
# data:  coagulation$coag
# Test Statistic = 0.7, p-value = 0.6

## modified levene test is to substitute the means by medians
levene.test(coagulation$coag, coagulation$diet, location = "median")

# 
#   Modified robust Brown-Forsythe Levene-type test based on the absolute
#   deviations from the median
# 
# data:  coagulation$coag
# Test Statistic = 0.65, p-value = 0.6

#Bartlette's test
bartlett.test(coag~diet, coagulation)

# 
#   Bartlett test of homogeneity of variances
# 
# data:  coag by diet
# Bartlett's K-squared = 1.7, df = 3, p-value = 0.6

6 Comparing Effects

6.1 Estimate Treatment Means

g <- lm (coag~0+diet)
cbind(summary(g)$coefficients,confint (g))

#       Estimate Std. Error t value  Pr(>|t|) 2.5 % 97.5 %
# dietA       61     1.1832   51.55 9.548e-23 58.53  63.47
# dietB       66     0.9661   68.32 3.532e-25 63.98  68.02
# dietC       68     0.9661   70.39 1.949e-25 65.98  70.02
# dietD       61     0.8367   72.91 9.663e-26 59.25  62.75

Computing formular for standard error: \[\hat\sigma=\sqrt{MSE}\] \[se(\hat \mu_i)=\hat\sigma\times \sqrt{1/n_i}\]

## get MSE from ANOVA output
anova(g)

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
diet	4	98532	24633.0	4399	0
Residuals	20	112	5.6	NA	NA

table(diet)

# diet
# A B C D 
# 4 6 6 8

sigma <- sqrt(5.6); sigma

# [1] 2.366

se_mu1 <- sigma * sqrt(1/4)
me_mu1 <- se_mu1 * qt(0.975, df = 20)
CI <- 61 + c(-me_mu1,me_mu1); CI

# [1] 58.53 63.47

se_mu2 <- sigma * sqrt(1/6)
me_mu2 <- se_mu2 * qt(0.975, df = 20)
CI <- 66 + c(-me_mu2,me_mu2); CI

# [1] 63.98 68.02

6.2 Effect Comparison with Least Significant Difference

Simply using confidence intervals from lm model, which uses t quantile without using multiple comparison adjustment, and use “A” as the baseline level (intercept)

g <- lm (coag~diet)
cbind(summary(g)$coefficients,confint (g))

#              Estimate Std. Error   t value  Pr(>|t|)  2.5 % 97.5 %
# (Intercept) 6.100e+01      1.183 5.155e+01 9.548e-23 58.532 63.468
# dietB       5.000e+00      1.528 3.273e+00 3.803e-03  1.814  8.186
# dietC       7.000e+00      1.528 4.583e+00 1.805e-04  3.814 10.186
# dietD       2.991e-15      1.449 2.064e-15 1.000e+00 -3.023  3.023

Computing formula for standard error: \[se(\hat \mu_i-\hat \mu_j)=\hat\sigma\times \sqrt{1/n_i+1/n_j}\]

anova(g)

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
diet	3	228	76.0	13.57	0
Residuals	20	112	5.6	NA	NA

table(diet)

# diet
# A B C D 
# 4 6 6 8

sigma <- sqrt(5.6); sigma

# [1] 2.366

se_mu1_to_mu2 <- sigma * sqrt(1/4+1/6)
me_mu1_to_mu2 <- se_mu1_to_mu2 * qt(0.975, df = 20)
CI <- 66-61 + c(-me_mu1_to_mu2,me_mu1_to_mu2); CI

# [1] 1.814 8.186

se_mu1_to_mu4 <- sigma * sqrt(1/4+1/8)
me_mu1_to_mu4 <- se_mu1_to_mu4 * qt(0.975, df = 20)
CI <- 61-61 + c(-me_mu1_to_mu4,me_mu1_to_mu4); CI

# [1] -3.023  3.023

6.3 A Simulation Illustration for Multiple Comparison Bias

k <- 6 ## number of means
rep_means <- replicate (k, rnorm(10000))/sqrt(rchisq(10000, df =20)/20)
rep_means[1:10,]

#           [,1]    [,2]      [,3]     [,4]    [,5]     [,6]
#  [1,]  0.07936 -1.1155 -0.345271  0.52322  1.0142 -1.40514
#  [2,]  0.73488  1.7361  0.007894  0.24220  0.5333  1.17966
#  [3,] -0.89140 -0.7483 -0.201237 -0.85505 -0.3195 -0.02718
#  [4,] -0.31502 -1.5270 -0.097061  0.67787 -0.3831 -0.70941
#  [5,]  0.52082  1.3273  0.075378  1.46701  0.7047 -0.13058
#  [6,] -0.21127  1.2140 -1.113065  0.69077 -0.7042  1.80166
#  [7,]  0.32672 -2.3090  0.892029  1.25368 -0.1084  1.57682
#  [8,] -0.24332 -0.8896  0.350715 -3.32672 -0.2094 -0.03835
#  [9,]  0.30136  1.1280  0.429686 -0.42425  0.5074 -0.20134
# [10,]  0.48439  1.4437 -1.053625  0.03308  0.7985 -0.97384

ranget <- function (x) {max (x)-min(x)}
rep_means_max <- apply(rep_means, 1, ranget) 
sim.diff.means <- data.frame (dmeans=abs(rep_means[,-1]-rep_means[,1]), 
                              max_difference=rep_means_max)
#sim.data[1:10,]
ylim <- c(0, max (sim.diff.means)+4)
boxplot (sim.diff.means, ylim = ylim)
abline (h = qt(0.975,df = 20)* sqrt(2))
abline (h = qtukey(0.95, nmeans = k, df = 20), col = "red")
legend(0.5, ylim[2],
       c("t", "Tukey"), lty = c(1,1),col = 1:2)

6.4 Quantiles in Tukey, Bonferroni Correction and t

In hand calculation,considering multiple comparison, the t quantile should be replaced by tukey quantile/sqrt(2)

tukey quantile is always larger than t quantile

qt0 <- qt (0.975, df = 20)
qk <- qb <- rep (0,4)
qk [1]  <- qtukey(0.95, nmeans = 2, df =20 )/sqrt(2) ## nmeans is number of means
qk [2]  <- qtukey(0.95, nmeans = 4, df =20 )/sqrt(2) ## nmeans is number of means
qk [3]  <- qtukey(0.95, nmeans = 40, df =20 )/sqrt(2) ## nmeans is number of means
qk [4]  <- qtukey(0.95, nmeans = 100, df =20 )/sqrt(2) ## nmeans is number of means
## comparing to bonferroni correction
qb[1] <- qt (1-0.025/choose (2,2), df = 20) 
qb[2] <- qt (1-0.025/choose (4,2), df = 20) 
qb[3] <- qt (1-0.025/choose (40,2), df = 20) 
qb[4] <- qt (1-0.025/choose (100,2), df = 20) 

comp.qtb <- data.frame(t = qt0,  Tukey = qk, Bonferroni = qb )
row.names(comp.qtb) <- paste0("number of means = ", c(2,4,40,100))
comp.qtb

	t	Tukey	Bonferroni
number of means = 2	2.086	2.086	2.086
number of means = 4	2.086	2.799	2.927
number of means = 40	2.086	4.506	5.030
number of means = 100	2.086	5.082	5.849

6.5 Procedure for Computing Tukey Confidence Intervals

## explanation for how to compute the tukey CI for dietA - diet B
g <- lm (coag~diet)
anova.g <- anova (g)
dietMeans <- g$coefficients[1] + c(0, g$coefficients[-1]) ## or using
dietMeans <- tapply (coag, INDEX = diet, mean)
sigma <- sigma (g); sigma

# [1] 2.366

diet_n <- table (diet); diet_n

# diet
# A B C D 
# 4 6 6 8

Formula for Tukey Margin Error with confidence level \(1-\alpha\):

\[\hat\sigma \times \sqrt {1/n_i+1/n_j} \times q_{\alpha, k, df}/\sqrt{2}\] where \(q_{\alpha, k, df}\) is the upper tukey quantile with \(k\) means and \(df\) degrees freedom.

ME_tukey <- sigma * sqrt (1/diet_n[1]+1/diet_n[2]) * qtukey(0.95, 4, 20)/sqrt(2)
CI <- (dietMeans[2]-dietMeans[1]) + c(-ME_tukey,ME_tukey); names (CI) = c("lower", "upper"); CI

#  lower  upper 
# 0.7246 9.2754

## In R, you can use this function
dmeans.tukey <- TukeyHSD(aov (coag~diet)); dmeans.tukey

#   Tukey multiple comparisons of means
#     95% family-wise confidence level
# 
# Fit: aov(formula = coag ~ diet)
# 
# $diet
#     diff      lwr    upr  p adj
# B-A    5   0.7246  9.275 0.0183
# C-A    7   2.7246 11.275 0.0010
# D-A    0  -4.0560  4.056 1.0000
# C-B    2  -1.8241  5.824 0.4766
# D-B   -5  -8.5771 -1.423 0.0044
# D-C   -7 -10.5771 -3.423 0.0001

## Comparing with LSD CIs
g <- lm (coag~diet)
cbind(summary(g)$coefficients,confint (g))

#              Estimate Std. Error   t value  Pr(>|t|)  2.5 % 97.5 %
# (Intercept) 6.100e+01      1.183 5.155e+01 9.548e-23 58.532 63.468
# dietB       5.000e+00      1.528 3.273e+00 3.803e-03  1.814  8.186
# dietC       7.000e+00      1.528 4.583e+00 1.805e-04  3.814 10.186
# dietD       2.991e-15      1.449 2.064e-15 1.000e+00 -3.023  3.023

plot (dmeans.tukey)

7 A complete R Procedure for Example 3.1 from Textbook

## create data frame
poweretch <- data.frame (etch = scan ("data/etchrate.txt"), 
                         power = as.factor(rep(c(160, 180, 200, 220),each = 5)))
poweretch

etch	power
575	160
542	160
530	160
539	160
570	160
565	180
593	180
590	180
579	180
610	180
600	200
651	200
610	200
637	200
629	200
725	220
700	220
715	220
685	220
710	220

## scatterplot
with (poweretch, plot(as.numeric(power), etch, pch = as.numeric (power)))

## boxplot
boxplot (etch~power, data = poweretch)

## anova analysis
fit.etch <- lm (etch~power, data = poweretch)
summary (fit.etch)

# 
# Call:
# lm(formula = etch ~ power, data = poweretch)
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -25.4  -13.0    2.8   13.2   25.6 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   551.20       8.17   67.47  < 2e-16 ***
# power180       36.20      11.55    3.13   0.0064 ** 
# power200       74.20      11.55    6.42  8.4e-06 ***
# power220      155.80      11.55   13.49  3.7e-10 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 18.3 on 16 degrees of freedom
# Multiple R-squared:  0.926,   Adjusted R-squared:  0.912 
# F-statistic: 66.8 on 3 and 16 DF,  p-value: 2.88e-09

anova(fit.etch)

	Df	Sum Sq	Mean Sq	F value	Pr(>F)
power	3	66871	22290.2	66.8	0
Residuals	16	5339	333.7	NA	NA

## visualize residuals
plot (fit.etch$residuals, pch = as.numeric(poweretch$power))

plot (fit.etch$fitted.values, fit.etch$residuals,
      pch = as.numeric(poweretch$power))

## checking normality of residuals
qqnorm (fit.etch$residuals)
qqline(fit.etch$residuals)

shapiro.test(fit.etch$residuals)

# 
#   Shapiro-Wilk normality test
# 
# data:  fit.etch$residuals
# W = 0.94, p-value = 0.2

## testing equality of variances
with (poweretch, levene.test (etch, power))

# 
#   Modified robust Brown-Forsythe Levene-type test based on the absolute
#   deviations from the median
# 
# data:  etch
# Test Statistic = 0.2, p-value = 0.9

bartlett.test (etch~power, data = poweretch)

# 
#   Bartlett test of homogeneity of variances
# 
# data:  etch by power
# Bartlett's K-squared = 0.43, df = 3, p-value = 0.9

## multiple comparison for assessing pair difference
plot(TukeyHSD(aov(etch~power, data = poweretch)))

## look at the difference of a fixed pair of treatments
confint (fit.etch)

#              2.5 % 97.5 %
# (Intercept) 533.88 568.52
# power180     11.71  60.69
# power200     49.71  98.69
# power220    131.31 180.29

## change baseline level
poweretch$n.power <- factor(poweretch$power, levels = paste0(c(200, 180, 160, 220)))
fit.etch2 <- with(poweretch,lm (etch~n.power))
summary (fit.etch2)## looking at test result

# 
# Call:
# lm(formula = etch ~ n.power)
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
#  -25.4  -13.0    2.8   13.2   25.6 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   625.40       8.17   76.55  < 2e-16 ***
# n.power180    -38.00      11.55   -3.29   0.0046 ** 
# n.power160    -74.20      11.55   -6.42  8.4e-06 ***
# n.power220     81.60      11.55    7.06  2.7e-06 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 18.3 on 16 degrees of freedom
# Multiple R-squared:  0.926,   Adjusted R-squared:  0.912 
# F-statistic: 66.8 on 3 and 16 DF,  p-value: 2.88e-09

confint(fit.etch2)

#              2.5 % 97.5 %
# (Intercept) 608.08 642.72
# n.power180  -62.49 -13.51
# n.power160  -98.69 -49.71
# n.power220   57.11 106.09

One-way ANOVA with Animation Illustration

Unit2: Analyzing Completely Randomized Design with a Single Categorial Factor

Longhai Li, University of Saskatchewan

January 2020