1 Read a dataset of animal coagulation time vs diet
2 Visualize the data using scatterplot and boxplot
3 Fit a Linear Model
- 3.1 Fit a linear model with no intercept:
- 3.2 Fit a linear model with an intercept:
4 Predictive (fitted) values and Residuals
5 Short-cut Formulae for Computing Sum Squares
6 R function for ANOVA with Linear Model

1 Read a dataset of animal coagulation time vs diet

coagulation <- read.table("data/coagulation.txt", header=T)

coagulation

##    coag diet
## 1    62    A
## 2    60    A
## 3    63    A
## 4    59    A
## 5    63    B
## 6    67    B
## 7    71    B
## 8    64    B
## 9    65    B
## 10   66    B
## 11   68    C
## 12   66    C
## 13   71    C
## 14   67    C
## 15   68    C
## 16   68    C
## 17   56    D
## 18   62    D
## 19   60    D
## 20   61    D
## 21   63    D
## 22   64    D
## 23   63    D
## 24   59    D

coag <- coagulation$coag
diet <- coagulation$diet

2 Visualize the data using scatterplot and boxplot

plot (as.numeric(diet), coag, 
      col = diet, 
      pch = as.numeric(diet), 
      xaxt="n", xlab = "Diet")
axis (1, 1:4, LETTERS[1:4])

boxplot(coag~diet)

3 Fit a Linear Model

3.1 Fit a linear model with no intercept:

# 
g1 <- lm(coag~0+diet)
summary(g1)

## 
## Call:
## lm(formula = coag ~ 0 + diet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5.00  -1.25   0.00   1.25   5.00 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## dietA   61.000      1.183    51.5   <2e-16 ***
## dietB   66.000      0.966    68.3   <2e-16 ***
## dietC   68.000      0.966    70.4   <2e-16 ***
## dietD   61.000      0.837    72.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.37 on 20 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 4.4e+03 on 4 and 20 DF,  p-value: <2e-16

# What model is fitted?
data.frame(model.matrix(g1))

##    dietA dietB dietC dietD
## 1      1     0     0     0
## 2      1     0     0     0
## 3      1     0     0     0
## 4      1     0     0     0
## 5      0     1     0     0
## 6      0     1     0     0
## 7      0     1     0     0
## 8      0     1     0     0
## 9      0     1     0     0
## 10     0     1     0     0
## 11     0     0     1     0
## 12     0     0     1     0
## 13     0     0     1     0
## 14     0     0     1     0
## 15     0     0     1     0
## 16     0     0     1     0
## 17     0     0     0     1
## 18     0     0     0     1
## 19     0     0     0     1
## 20     0     0     0     1
## 21     0     0     0     1
## 22     0     0     0     1
## 23     0     0     0     1
## 24     0     0     0     1

This model is written as: \[ y_i = \mu_A I(x_i=A)+\mu_B I(x_i=B)+\mu_C I(x_i=C)+\mu_D I(x_i=D)+\epsilon_i \]

3.2 Fit a linear model with an intercept:

# 
g <- lm(coag~diet)
summary(g)

## 
## Call:
## lm(formula = coag ~ diet)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5.00  -1.25   0.00   1.25   5.00 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.10e+01   1.18e+00   51.55  < 2e-16 ***
## dietB       5.00e+00   1.53e+00    3.27  0.00380 ** 
## dietC       7.00e+00   1.53e+00    4.58  0.00018 ***
## dietD       2.99e-15   1.45e+00    0.00  1.00000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.37 on 20 degrees of freedom
## Multiple R-squared:  0.671,  Adjusted R-squared:  0.621 
## F-statistic: 13.6 on 3 and 20 DF,  p-value: 4.66e-05

# What model is fitted?
data.frame(model.matrix(g))

##    X.Intercept. dietB dietC dietD
## 1             1     0     0     0
## 2             1     0     0     0
## 3             1     0     0     0
## 4             1     0     0     0
## 5             1     1     0     0
## 6             1     1     0     0
## 7             1     1     0     0
## 8             1     1     0     0
## 9             1     1     0     0
## 10            1     1     0     0
## 11            1     0     1     0
## 12            1     0     1     0
## 13            1     0     1     0
## 14            1     0     1     0
## 15            1     0     1     0
## 16            1     0     1     0
## 17            1     0     0     1
## 18            1     0     0     1
## 19            1     0     0     1
## 20            1     0     0     1
## 21            1     0     0     1
## 22            1     0     0     1
## 23            1     0     0     1
## 24            1     0     0     1

This model is written as: \[ y_i = \beta_0+\beta_B I(x_i=B)+\beta_CI(x_i=C)+\beta_D I(x_i=D)+\epsilon_i \] where, \(\beta_0=\mu_A, \beta_B=\mu_B-\mu_A, \beta_C=\mu_C-\mu_A, \beta_D=\mu_D-\mu_A\).

4 Predictive (fitted) values and Residuals

This section is for illustrating prediction and residuals

y <- coag
y_hat_h0 <- rep (mean (y), length (y)); y_hat_h0

##  [1] 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
## [24] 64

y_hat_h1 <- predict (lm (y~diet)); y_hat_h1;

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
## 61 61 61 61 66 66 66 66 66 66 68 68 68 68 68 68 61 61 61 61 61 61 61 61

resid_h0 <- y - y_hat_h0 # residual (error) from h0 model
resid_h1 <- y - y_hat_h1 # residual (error) from h1 model
data.frame (diet, y=y, y_hat_h0, resid_h0, y_hat_h1, resid_h1, y_hat_h1-y_hat_h0 )

##    diet  y y_hat_h0 resid_h0 y_hat_h1 resid_h1 y_hat_h1...y_hat_h0
## 1     A 62       64       -2       61        1                  -3
## 2     A 60       64       -4       61       -1                  -3
## 3     A 63       64       -1       61        2                  -3
## 4     A 59       64       -5       61       -2                  -3
## 5     B 63       64       -1       66       -3                   2
## 6     B 67       64        3       66        1                   2
## 7     B 71       64        7       66        5                   2
## 8     B 64       64        0       66       -2                   2
## 9     B 65       64        1       66       -1                   2
## 10    B 66       64        2       66        0                   2
## 11    C 68       64        4       68        0                   4
## 12    C 66       64        2       68       -2                   4
## 13    C 71       64        7       68        3                   4
## 14    C 67       64        3       68       -1                   4
## 15    C 68       64        4       68        0                   4
## 16    C 68       64        4       68        0                   4
## 17    D 56       64       -8       61       -5                  -3
## 18    D 62       64       -2       61        1                  -3
## 19    D 60       64       -4       61       -1                  -3
## 20    D 61       64       -3       61        0                  -3
## 21    D 63       64       -1       61        2                  -3
## 22    D 64       64        0       61        3                  -3
## 23    D 63       64       -1       61        2                  -3
## 24    D 59       64       -5       61       -2                  -3

# Visulize the predictive values and residuals
plot (y, col = diet,pch = as.numeric(diet),
      main="Coaglution time versus Run Order (Sample ID)")
lines (1:4,g$fitted.values[1:4], col = 1)
lines (5:10,g$fitted.values[5:10], col = 2)
lines (11:16,g$fitted.values[11:16], col = 3)
lines (17:24,g$fitted.values[17:24], col = 4)
abline(h=mean(y))

# compute SS
SSt <-sum((y - y_hat_h0)^2); SSt

## [1] 340

SSe <- sum((y - y_hat_h1)^2); SSe

## [1] 112

SStr <- SSt - SSe; SStr # alternatively we can compute SStr by

## [1] 228

SStr2 <- sum((y_hat_h0-y_hat_h1)^2); SStr2

## [1] 228

5 Short-cut Formulae for Computing Sum Squares

ybar_i. <- tapply (y, mean, INDEX = diet); ybar_i.

##  A  B  C  D 
## 61 66 68 61

ybar_.. <- mean (y); ybar_..

## [1] 64

n_i <- table (diet); n_i

## diet
## A B C D 
## 4 6 6 8

n <- length (y)
SSy <- sum (y^2)
# the above values will be provided in test question

# compute SSt,SSe, SStr
SSt <- SSy-n*ybar_..^2; SSt

## [1] 340

SStr <- sum(n_i*ybar_i.^2) - n*ybar_..^2; SStr

## [1] 228

SSe <- SSt - SStr; SSe

## [1] 112

# compute F and output anova table
k <- length (unique (diet));k

## [1] 4

n <- length (y);n

## [1] 24

df1 <- k - 1
df2 <- n - k
MStr <- SStr/df1
MSe <- SSe/df2
f <- MStr /MSe
pvalue <- 1-pf (f, df1=df1, df2=df2); pvalue

## [1] 4.658e-05

anova_table <- data.frame (Df = c(df1,df2), 
                           SS = c(SStr,SSe), 
                           MS = c(MStr, MSe), 
                           F = c(f,NA), 
                           pvalue = c(pvalue, NA)); 
row.names(anova_table) <- c("diet", "Residuals")
anova_table

##           Df  SS   MS     F    pvalue
## diet       3 228 76.0 13.57 4.658e-05
## Residuals 20 112  5.6    NA        NA

6 R function for ANOVA with Linear Model

Compute ANOVA with the fitting result with intercept

g <- lm(coag~diet)
anova (g)

## Analysis of Variance Table
## 
## Response: coag
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## diet       3    228    76.0    13.6 4.7e-05 ***
## Residuals 20    112     5.6                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Do not compute ANOVA with the fitting result w/o intercept

g1 <- lm(coag~0+diet)
anova (g1)

## Analysis of Variance Table
## 
## Response: coag
##           Df Sum Sq Mean Sq F value Pr(>F)    
## diet       4  98532   24633    4399 <2e-16 ***
## Residuals 20    112       6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear Regression with a Categorical Variable

Revisiting ANOVA with Linear Modelling

Longhai Li

September 2019