xbar <- mean(x)
ybar <- mean (y)
n <- length(y)
data.frame (xi=x,yi=y, xiyi=x*y, x.squared=x^2, y.squared=y^2,
x_c=x-xbar, y_c=y-ybar, x_cy_c=(x-xbar)*(y-ybar))
# xi yi xiyi x.squared y.squared x_c y_c x_cy_c
# 1 5 64 320 25 4096 -6.25 4.75 -29.6875
# 2 2 87 174 4 7569 -9.25 27.75 -256.6875
# 3 12 50 600 144 2500 0.75 -9.25 -6.9375
# 4 9 71 639 81 5041 -2.25 11.75 -26.4375
# 5 15 44 660 225 1936 3.75 -15.25 -57.1875
# 6 6 56 336 36 3136 -5.25 -3.25 17.0625
# 7 25 42 1050 625 1764 13.75 -17.25 -237.1875
# 8 16 60 960 256 3600 4.75 0.75 3.5625
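The sums of squares \(SS_{xy}\), \(SS_{xx}\), and \(SS_{yy}\) can each be computed in two equivalent ways; a minimal sketch using the columns of the table above:
sum((x - xbar) * (y - ybar))    # SS_xy from the centered columns
sum(x*y) - n*xbar*ybar          # SS_xy from the shortcut formula
sum((x - xbar)^2)               # SS_xx
sum(x^2) - n*xbar^2
sum((y - ybar)^2)               # SS_yy
sum(y^2) - n*ybar^2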
# [1] -593.5
# [1] -593.5
# [1] 383.5
# [1] 383.5
# [1] 1557.5
# [1] 1557.5
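# sample correlation r = SS_xy / sqrt(SS_xx * SS_yy), from the sums above (a sketch)
sum((x - xbar) * (y - ybar)) / sqrt(sum((x - xbar)^2) * sum((y - ybar)^2))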
# [1] -0.7679342
The correlation coefficient \(r\) can also be computed with a built-in function in R:
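For example, with cor() (assuming x and y hold the driving and premium values):
cor(x, y)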
# [1] -0.7679342
The correlation \(\rho\) and the regression coefficients \(\beta_0\) and \(\beta_1\) are related as follows:
\[\beta_1=\rho \frac{\sigma_y}{\sigma_x}\] \[\beta_0=\mu_y - \beta_1\mu_x.\] Using the above relationships, we can estimate \(\beta_0\) and \(\beta_1\) with \[\hat{\beta_1}=r\frac{s_y}{s_x}=\frac{SS_{xy}}{SS_{xx}}\] \[\hat{\beta_0}=\bar{y}-\hat{\beta_1}\bar{x}\]
xbar <- mean(x)
ybar <- mean (y)
n <- length(y)
SSxy <- sum(x*y) - n*xbar*ybar
SSxx <- sum (x^2) - n*xbar^2
SSyy <- sum (y^2) - n*ybar^2
beta1 <- SSxy/SSxx; beta1
# [1] -1.547588
beta0 <- ybar - beta1*xbar; beta0
# [1] 76.66037
Let us verify the relationship between \(\rho\) and \(\beta_1\):
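A sketch of one numerical check, assuming x, y, SSxy, SSxx, and beta1 are defined as above: the slope \(SS_{xy}/SS_{xx}\) should agree with \(r\frac{s_y}{s_x}\).
beta1
cor(x, y) * sd(y) / sd(x)
SSxy / SSxx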
# [1] -1.547588
# [1] -1.547588
# [1] -1.547588
fitted1 <- beta0 + beta1*x   # fitted values from the regression line (Model 1)
fitted0 <- rep(ybar, n)      # fitted values from the intercept-only model (Model 0)
residual1 <- y - fitted1
residual0 <- y - fitted0
data.frame (y, fitted0, residual0, fitted1, residual1, diff.res=fitted1-fitted0)
# y fitted0 residual0 fitted1 residual1 diff.res
# 1 64 59.25 4.75 68.92243 -4.922425 9.672425
# 2 87 59.25 27.75 73.56519 13.434811 14.315189
# 3 50 59.25 -9.25 58.08931 -8.089309 -1.160691
# 4 71 59.25 11.75 62.73207 8.267927 3.482073
# 5 44 59.25 -15.25 53.44654 -9.446545 -5.803455
# 6 56 59.25 -3.25 67.37484 -11.374837 8.124837
# 7 42 59.25 -17.25 37.97066 4.029335 -21.279335
# 8 60 59.25 0.75 51.89896 8.101043 -7.351043
Visualize the fitted line, fitted values, and residuals:
library(latex2exp)   # provides TeX() for the plot titles
par(mfrow = c(1,3))
plot (premium ~ driving, data = issu, ylim = range(y, fitted1, fitted0))
abline (h=mean(y))
points(x, fitted0, pch = 4,col=4)
for (i in 1: length(y)){
arrows( x[i],fitted0[i],x[i], y[i], length=0.1, col = 4)
}
title(main= TeX("Residuals of Fitting Model 0: $y_i-\\bar{y}$"))
plot (premium ~ driving, data = issu, ylim = range(y, fitted1, fitted0))
abline (h=mean(y))
abline(a=beta0, b=beta1)
points(x, fitted1, pch = 3,col=3)
for (i in 1: length(y)){
arrows( x[i],fitted1[i],x[i], y[i], length=0.1, col = 3)
}
title(main= TeX("Residuals of Fitting Model 1: $y_i-\\hat{y}_i$"))
plot (premium ~ driving, data = issu, ylim = range(y, fitted1, fitted0))
abline (h=mean(y))
abline(a=beta0, b=beta1)
points(x, fitted0, pch = 4,col=4)
points(x, fitted1, pch = 3,col=3)
for (i in 1: length(y)){
arrows( x[i],fitted0[i],x[i], fitted1[i], length=0.1, col = 2)
}
title(main= TeX("Differences in Residuals: $\\hat{y}_i-\\bar{y}$"))
Definitions of SST, SSE, and SSR: \[ SST=\sum_{i=1}^{n}(y_i-\bar{y})^2, \qquad SSE=\sum_{i=1}^{n}(y_i-\hat{y}_i)^2, \qquad SSR=\sum_{i=1}^{n}(\hat{y}_i-\bar{y})^2. \]
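A minimal sketch of computing these from the fitted values above (SST, SSE, and SSR are reused in the ANOVA table below):
SST <- sum((y - ybar)^2); SST
SSE <- sum((y - fitted1)^2); SSE
SSR <- sum((fitted1 - ybar)^2); SSR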
# [1] 1557.5
# [1] 639.0065
# [1] 918.4935
SSR can be computed directly with:
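# one possibility: the identity SSR = beta1 * SSxy, stated again below (a sketch)
beta1 * SSxy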
# [1] 918.4935
Coefficient of Determination: \(R^2\)
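Using the definition \(R^2 = SSR/SST\):
SSR/SST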
# [1] 0.5897229
F statistic:
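A sketch of the F statistic \(MSR/MSE\) on 1 and \(n-2\) degrees of freedom (f is reused in the ANOVA table below):
f <- (SSR/1) / (SSE/(n-2)); f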
# [1] 8.624264
The p-value assesses whether a linear relationship exists (a measure of statistical significance):
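One way to obtain it, using the F distribution with 1 and \(n-2\) degrees of freedom (pvf is reused in the ANOVA table below):
pvf <- pf(f, df1 = 1, df2 = n - 2, lower.tail = FALSE); pvf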
# [1] 0.0260588
ANOVA table
Ftable <- data.frame(df = c(1,n-2), SS=c(SSR,SSE),
MSS = c(SSR/1, SSE/(n-2)), Fstat=c(f,NA),
p.value = c(pvf, NA),
R2 = c(SSR, SSE)/SST)
Ftable
# df SS MSS Fstat p.value R2
# 1 1 918.4935 918.4935 8.624264 0.0260588 0.5897229
# 2 6 639.0065 106.5011 NA NA 0.4102771
An important formula for computing SSR in simple linear regression is:
\[ SSR = \hat\beta_1^2 SS_{xx} = \hat\beta_1 SS_{xy} \]
With SSR and SST (\(=SS_{yy}\)), we can compute SSE with this equation:
\[ SST = SSR + SSE \]
# The following 6 summary numbers are the starting point for reproducing lm():
xbar <- mean(x)
ybar <- mean (y)
n <- length(y)
SSxy <- sum(x*y) - n*xbar*ybar
SSxx <- sum (x^2) - n*xbar^2
SSyy <- sum (y^2) - n*ybar^2
# estimate linear line:
beta1 <- SSxy/SSxx
beta0 <- ybar - beta1*xbar
# compute SSR, SSE, and SST
SST <- SSyy; SST
# [1] 1557.5
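# SSR via the identity SSR = beta1^2 * SSxx (a sketch)
SSR <- beta1^2 * SSxx; SSR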
# [1] 918.4935
Alternatively, SSR can be computed with:
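# the second form of the identity: SSR = beta1 * SSxy (a sketch)
beta1 * SSxy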
# [1] 918.4935
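# SSE from the decomposition SST = SSR + SSE (a sketch)
SSE <- SST - SSR; SSE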
# [1] 639.0065
To make inferences about \(\beta_1\), we need the standard error of \(\hat{\beta_1}\). The formula is \[ \hat{SE}(\hat{\beta_1})=\frac{s_e}{\sqrt{SS_{xx}}} \] where \(s_e=\sqrt{\frac{SSE}{n-2}}\) is the standard deviation of the residuals.
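A minimal sketch following these formulas (the names se and tstat are chosen here for illustration; se_beta1 is reused for the confidence interval below):
se <- sqrt(SSE/(n-2)); se          # residual standard deviation s_e
se_beta1 <- se / sqrt(SSxx); se_beta1
tstat <- beta1 / se_beta1; tstat   # t statistic for H0: beta_1 = 0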
# [1] 10.31994
# [1] 0.5269802
# [1] -2.93671
To test \(H_1: \beta_1\not=0\), the p-value is:
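For example, with the t distribution on \(n-2\) degrees of freedom (using tstat from the sketch above):
2 * pt(abs(tstat), df = n - 2, lower.tail = FALSE)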
# [1] 0.0260588
To test \(H_1: \beta_1<0\), the p-value is:
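Using the lower tail of the same t distribution:
pt(tstat, df = n - 2)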
# [1] 0.0130294
To test \(H_1: \beta_1>0\), the p-value is:
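And the upper tail:
pt(tstat, df = n - 2, lower.tail = FALSE)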
# [1] 0.9869706
To find a C.I. for \(\beta_1\):
alpha <- 0.05
me_beta1 <- qt(alpha/2, df = n-2, lower.tail = FALSE) * se_beta1
# 95% CI:
beta1 + c(-me_beta1, me_beta1)
# [1] -2.8370622 -0.2581138
All of the above calculations can be done with a single function, lm():
issu <- data.frame (
driving = c(5, 2, 12, 9, 15, 6, 25, 16),
premium = c(64, 87, 50, 71, 44, 56, 42, 60)
)
lmfit_issu <- lm (premium ~ driving, data = issu)
# summary of fitting results
summary (lmfit_issu)
#
# Call:
# lm(formula = premium ~ driving, data = issu)
#
# Residuals:
# Min 1Q Median 3Q Max
# -11.3748 -8.4286 -0.4465 8.1428 13.4348
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 76.660 6.961 11.012 3.33e-05 ***
# driving -1.548 0.527 -2.937 0.0261 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 10.32 on 6 degrees of freedom
# Multiple R-squared: 0.5897, Adjusted R-squared: 0.5213
# F-statistic: 8.624 on 1 and 6 DF, p-value: 0.02606
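The components stored in the fitted object can be listed with names():
names(lmfit_issu)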
# [1] "coefficients" "residuals" "effects" "rank"
# [5] "fitted.values" "assign" "qr" "df.residual"
# [9] "xlevels" "call" "terms" "model"
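Confidence intervals for the coefficients come from confint():
confint(lmfit_issu)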
# 2.5 % 97.5 %
# (Intercept) 59.626611 93.6941192
# driving -2.837062 -0.2581138
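The ANOVA table for the fit is produced by anova():
anova(lmfit_issu)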
# Analysis of Variance Table
#
# Response: premium
# Df Sum Sq Mean Sq F value Pr(>F)
# driving 1 918.49 918.49 8.6243 0.02606 *
# Residuals 6 639.01 106.50
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
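To predict premiums at new driving values, first set up a data frame of new x values (newdata is reused in the predictions and plot below); a sketch:
newdata <- data.frame(driving = 0:30)
newdata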
# driving
# 1 0
# 2 1
# 3 2
# 4 3
# 5 4
# 6 5
# 7 6
# 8 7
# 9 8
# 10 9
# 11 10
# 12 11
# 13 12
# 14 13
# 15 14
# 16 15
# 17 16
# 18 17
# 19 18
# 20 19
# 21 20
# 22 21
# 23 22
# 24 23
# 25 24
# 26 25
# 27 26
# 28 27
# 29 28
# 30 29
# 31 30
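Point predictions at these values can be obtained with predict(); a sketch:
data.frame(newdata, predicted.premium = predict(lmfit_issu, newdata))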
# driving predicted.premium
# 1 0 76.66037
# 2 1 75.11278
# 3 2 73.56519
# 4 3 72.01760
# 5 4 70.47001
# 6 5 68.92243
# 7 6 67.37484
# 8 7 65.82725
# 9 8 64.27966
# 10 9 62.73207
# 11 10 61.18449
# 12 11 59.63690
# 13 12 58.08931
# 14 13 56.54172
# 15 14 54.99413
# 16 15 53.44654
# 17 16 51.89896
# 18 17 50.35137
# 19 18 48.80378
# 20 19 47.25619
# 21 20 45.70860
# 22 21 44.16102
# 23 22 42.61343
# 24 23 41.06584
# 25 24 39.51825
# 26 25 37.97066
# 27 26 36.42308
# 28 27 34.87549
# 29 28 33.32790
# 30 29 31.78031
# 31 30 30.23272
# fit lwr upr
# 1 76.66037 59.626611 93.69412
# 2 75.11278 59.162862 91.06269
# 3 73.56519 58.666320 88.46406
# 4 72.01760 58.129539 85.90566
# 5 70.47001 57.543074 83.39695
# 6 68.92243 56.895010 80.94984
# 7 67.37484 56.170501 78.57917
# 8 65.82725 55.351511 76.30299
# 9 64.27966 54.417080 74.14224
# 10 62.73207 53.344559 72.11959
# 11 61.18449 52.112230 70.25674
# 12 59.63690 50.703158 68.57064
# 13 58.08931 49.109160 67.06946
# 14 56.54172 47.333033 65.75041
# 15 54.99413 45.387767 64.60050
# 16 53.44654 43.293215 63.59988
# 17 51.89896 41.071980 62.72593
# 18 50.35137 38.746101 61.95664
# 19 48.80378 36.335159 61.27240
# 20 47.25619 33.855584 60.65680
# 21 45.70860 31.320707 60.09650
# 22 44.16102 28.741148 59.58089
# 23 42.61343 26.125294 59.10156
# 24 41.06584 23.479757 58.65192
# 25 39.51825 20.809764 58.22674
# 26 37.97066 18.119462 57.82187
# 27 36.42308 15.412164 57.43399
# 28 34.87549 12.690536 57.06044
# 29 33.32790 9.956737 56.69907
# 30 31.78031 7.212530 56.34810
# 31 30.23272 4.459364 56.00609
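Prediction intervals for an individual premium use interval = "prediction" (int_y is reused in the plot below); a sketch:
int_y <- predict(lmfit_issu, newdata, interval = "prediction")
int_y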
# fit lwr upr
# 1 76.66037 46.200375 107.12036
# 2 75.11278 45.245370 104.98018
# 3 73.56519 44.245596 102.88478
# 4 72.01760 43.198502 100.83670
# 5 70.47001 42.101580 98.83845
# 6 68.92243 40.952425 96.89243
# 7 67.37484 39.748774 95.00090
# 8 65.82725 38.488571 93.16593
# 9 64.27966 37.170018 91.38930
# 10 62.73207 35.791627 89.67252
# 11 61.18449 34.352265 88.01671
# 12 59.63690 32.851193 86.42260
# 13 58.08931 31.288091 84.89053
# 14 56.54172 29.663065 83.42038
# 15 54.99413 27.976648 82.01162
# 16 53.44654 26.229779 80.66331
# 17 51.89896 24.423774 79.37414
# 18 50.35137 22.560283 78.14245
# 19 48.80378 20.641239 76.96632
# 20 47.25619 18.668809 75.84358
# 21 45.70860 16.645332 74.77188
# 22 44.16102 14.573273 73.74876
# 23 42.61343 12.455166 72.77169
# 24 41.06584 10.293572 71.83811
# 25 39.51825 8.091039 70.94547
# 26 37.97066 5.850072 70.09126
# 27 36.42308 3.573105 69.27305
# 28 34.87549 1.262480 68.48850
# 29 33.32790 -1.079562 67.73536
# 30 31.78031 -3.450898 67.01152
# 31 30.23272 -5.849519 66.31497
We can visualize these predictions with the following plot:
plot(x, y, ylim = range(int_y), xlim = c(0,30))
lines(newdata$driving, conf.int.means[,1], lty = 1)            # fitted regression line
lines(newdata$driving, conf.int.means[,2], lty = 2, col = 2)   # lower confidence limit for the mean
lines(newdata$driving, conf.int.means[,3], lty = 2, col = 2)   # upper confidence limit for the mean
lines(newdata$driving, int_y[,2], lty = 3, col = 3)            # lower prediction limit
lines(newdata$driving, int_y[,3], lty = 3, col = 3)            # upper prediction limit