xbar <- mean(x)
ybar <- mean (y)
n <- length(y)
data.frame (xi=x,yi=y, xiyi=x*y, x.squared=x^2, y.squared=y^2,
x_c=x-xbar, y_c=y-ybar, x_cy_c=(x-xbar)*(y-ybar))
# xi yi xiyi x.squared y.squared x_c y_c x_cy_c
# 1 5 64 320 25 4096 -6.25 4.75 -29.6875
# 2 2 87 174 4 7569 -9.25 27.75 -256.6875
# 3 12 50 600 144 2500 0.75 -9.25 -6.9375
# 4 9 71 639 81 5041 -2.25 11.75 -26.4375
# 5 15 44 660 225 1936 3.75 -15.25 -57.1875
# 6 6 56 336 36 3136 -5.25 -3.25 17.0625
# 7 25 42 1050 625 1764 13.75 -17.25 -237.1875
# 8 16 60 960 256 3600 4.75 0.75 3.5625
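The sums of squares \(SS_{xy}\), \(SS_{xx}\), and \(SS_{yy}\) can each be computed in two equivalent ways; a minimal sketch using the columns of the table above:
sum((x - xbar) * (y - ybar))    # SS_xy from the centered columns
sum(x*y) - n*xbar*ybar          # SS_xy from the shortcut formula
sum((x - xbar)^2)               # SS_xx
sum(x^2) - n*xbar^2
sum((y - ybar)^2)               # SS_yy
sum(y^2) - n*ybar^2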
# [1] -593.5
# [1] -593.5
# [1] 383.5
# [1] 383.5
# [1] 1557.5
# [1] 1557.5
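# sample correlation r = SS_xy / sqrt(SS_xx * SS_yy), from the sums above (a sketch)
sum((x - xbar) * (y - ybar)) / sqrt(sum((x - xbar)^2) * sum((y - ybar)^2))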
# [1] -0.7679342
The correlation coefficient \(r\) can also be computed with a built-in function in R:
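For example, with cor() (assuming x and y hold the driving and premium values):
cor(x, y)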
# [1] -0.7679342
The correlation \(\rho\) and the regression coefficients \(\beta_0\) and \(\beta_1\) are related as follows:
\[\beta_1=\rho \frac{\sigma_y}{\sigma_x}\] \[\beta_0=\mu_y - \beta_1\mu_x.\] Using the above relationships, we can estimate \(\beta_0\) and \(\beta_1\) with \[\hat{\beta_1}=r\frac{s_y}{s_x}=\frac{SS_{xy}}{SS_{xx}}\] \[\hat{\beta_0}=\bar{y}-\hat{\beta_1}\bar{x}\]
xbar <- mean(x)
ybar <- mean (y)
n <- length(y)
SSxy <- sum(x*y) - n*xbar*ybar
SSxx <- sum (x^2) - n*xbar^2
SSyy <- sum (y^2) - n*ybar^2
beta1 <- SSxy/SSxx; beta1
# [1] -1.547588
beta0 <- ybar - beta1*xbar; beta0
# [1] 76.66037
Let us verify the relationship between \(\rho\) and \(\beta_1\):
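A sketch of one numerical check, assuming x, y, SSxy, SSxx, and beta1 are defined as above: the slope \(SS_{xy}/SS_{xx}\) should agree with \(r\frac{s_y}{s_x}\).
beta1
cor(x, y) * sd(y) / sd(x)
SSxy / SSxx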
# [1] -1.547588
# [1] -1.547588
# [1] -1.547588
fitted1 <- beta0 + beta1*x   # fitted values from the regression line (Model 1)
fitted0 <- rep(ybar, n)      # fitted values from the intercept-only model (Model 0)
residual1 <- y - fitted1
residual0 <- y - fitted0
data.frame (y, fitted0, residual0, fitted1, residual1, diff.res=fitted1-fitted0)
# y fitted0 residual0 fitted1 residual1 diff.res
# 1 64 59.25 4.75 68.92243 -4.922425 9.672425
# 2 87 59.25 27.75 73.56519 13.434811 14.315189
# 3 50 59.25 -9.25 58.08931 -8.089309 -1.160691
# 4 71 59.25 11.75 62.73207 8.267927 3.482073
# 5 44 59.25 -15.25 53.44654 -9.446545 -5.803455
# 6 56 59.25 -3.25 67.37484 -11.374837 8.124837
# 7 42 59.25 -17.25 37.97066 4.029335 -21.279335
# 8 60 59.25 0.75 51.89896 8.101043 -7.351043
Visualize the fitted line, fitted values, and residuals:
library(latex2exp)   # provides TeX() for the plot titles
par(mfrow = c(1,3))
plot (premium ~ driving, data = issu, ylim = range(y, fitted1, fitted0))
abline (h=mean(y))
points(x, fitted0, pch = 4,col=4)
for (i in 1: length(y)){
arrows( x[i],fitted0[i],x[i], y[i], length=0.1, col = 4)
}
title(main= TeX("Residuals of Fitting Model 0: $y_i-\\bar{y}$"))
plot (premium ~ driving, data = issu, ylim = range(y, fitted1, fitted0))
abline (h=mean(y))
abline(a=beta0, b=beta1)
points(x, fitted1, pch = 3,col=3)
for (i in 1: length(y)){
arrows( x[i],fitted1[i],x[i], y[i], length=0.1, col = 3)
}
title(main= TeX("Residuals of Fitting Model 1: $y_i-\\hat{y}_i$"))
plot (premium ~ driving, data = issu, ylim = range(y, fitted1, fitted0))
abline (h=mean(y))
abline(a=beta0, b=beta1)
points(x, fitted0, pch = 4,col=4)
points(x, fitted1, pch = 3,col=3)
for (i in 1: length(y)){
arrows( x[i],fitted0[i],x[i], fitted1[i], length=0.1, col = 2)
}
title(main= TeX("Differences in Residuals: $\\hat{y}_i-\\bar{y}$"))
Definitions of SST, SSE, and SSR: \[ SST=\sum_{i=1}^{n}(y_i-\bar{y})^2, \qquad SSE=\sum_{i=1}^{n}(y_i-\hat{y}_i)^2, \qquad SSR=\sum_{i=1}^{n}(\hat{y}_i-\bar{y})^2. \]
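A minimal sketch of computing these from the fitted values above (SST, SSE, and SSR are reused in the ANOVA table below):
SST <- sum((y - ybar)^2); SST
SSE <- sum((y - fitted1)^2); SSE
SSR <- sum((fitted1 - ybar)^2); SSR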
# [1] 1557.5
# [1] 639.0065
# [1] 918.4935
SSR can be computed directly with:
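# one possibility: the identity SSR = beta1 * SSxy, stated again below (a sketch)
beta1 * SSxy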
# [1] 918.4935
Coefficient of Determination: \(R^2\)
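Using the definition \(R^2 = SSR/SST\):
SSR/SST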
# [1] 0.5897229
F statistic:
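A sketch of the F statistic \(MSR/MSE\) on 1 and \(n-2\) degrees of freedom (f is reused in the ANOVA table below):
f <- (SSR/1) / (SSE/(n-2)); f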
# [1] 8.624264
The p-value assesses whether a linear relationship exists (a measure of statistical significance):
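One way to obtain it, using the F distribution with 1 and \(n-2\) degrees of freedom (pvf is reused in the ANOVA table below):
pvf <- pf(f, df1 = 1, df2 = n - 2, lower.tail = FALSE); pvf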
# [1] 0.0260588
ANOVA table
Ftable <- data.frame(df = c(1,n-2), SS=c(SSR,SSE),
MSS = c(SSR/1, SSE/(n-2)), Fstat=c(f,NA),
p.value = c(pvf, NA),
R2 = c(SSR, SSE)/SST)
Ftable
# df SS MSS Fstat p.value R2
# 1 1 918.4935 918.4935 8.624264 0.0260588 0.5897229
# 2 6 639.0065 106.5011 NA NA 0.4102771
An important formula for computing SSR in simple linear regression is:
\[ SSR = \hat\beta_1^2 SS_{xx} = \hat\beta_1 SS_{xy} \]
With SSR and SST (\(=SS_{yy}\)), we can compute SSE with this equation:
\[ SST = SSR + SSE \]
# The following 6 summary numbers are the starting point for reproducing lm():
xbar <- mean(x)
ybar <- mean (y)
n <- length(y)
SSxy <- sum(x*y) - n*xbar*ybar
SSxx <- sum (x^2) - n*xbar^2
SSyy <- sum (y^2) - n*ybar^2
# estimate linear line:
beta1 <- SSxy/SSxx
beta0 <- ybar - beta1*xbar
# compute SSR, SSE, and SST
SST <- SSyy; SST
# [1] 1557.5
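# SSR via the identity SSR = beta1^2 * SSxx (a sketch)
SSR <- beta1^2 * SSxx; SSR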
# [1] 918.4935
Alternatively, SSR can be computed with:
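# the second form of the identity: SSR = beta1 * SSxy (a sketch)
beta1 * SSxy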
# [1] 918.4935
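# SSE from the decomposition SST = SSR + SSE (a sketch)
SSE <- SST - SSR; SSE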
# [1] 639.0065
To make inferences about \(\beta_1\), we need the standard error of \(\hat{\beta_1}\). The formula is \[ \hat{SE}(\hat{\beta_1})=\frac{s_e}{\sqrt{SS_{xx}}} \] where \(s_e=\sqrt{\frac{SSE}{n-2}}\) is the standard deviation of the residuals.
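A minimal sketch following these formulas (the names se and tstat are chosen here for illustration; se_beta1 is reused for the confidence interval below):
se <- sqrt(SSE/(n-2)); se          # residual standard deviation s_e
se_beta1 <- se / sqrt(SSxx); se_beta1
tstat <- beta1 / se_beta1; tstat   # t statistic for H0: beta_1 = 0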
# [1] 10.31994
# [1] 0.5269802
# [1] -2.93671
To test \(H_1: \beta_1\not=0\), the p-value is:
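For example, with the t distribution on \(n-2\) degrees of freedom (using tstat from the sketch above):
2 * pt(abs(tstat), df = n - 2, lower.tail = FALSE)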
# [1] 0.0260588
To test \(H_1: \beta_1<0\), the p-value is:
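Using the lower tail of the same t distribution:
pt(tstat, df = n - 2)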
# [1] 0.0130294
To test \(H_1: \beta_1>0\), the p-value is:
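And the upper tail:
pt(tstat, df = n - 2, lower.tail = FALSE)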
# [1] 0.9869706
To find a C.I. for \(\beta_1\):
alpha <- 0.05
me_beta1 <- qt(alpha/2, df = n-2, lower.tail = FALSE) * se_beta1
# 95% CI:
beta1 + c(-me_beta1, me_beta1)
# [1] -2.8370622 -0.2581138
All of the above calculations can be done with a single function, lm():
issu <- data.frame (
driving = c(5, 2, 12, 9, 15, 6, 25, 16),
premium = c(64, 87, 50, 71, 44, 56, 42, 60)
)
lmfit_issu <- lm (premium ~ driving, data = issu)
# summary of fitting results
summary (lmfit_issu)
#
# Call:
# lm(formula = premium ~ driving, data = issu)
#
# Residuals:
# Min 1Q Median 3Q Max
# -11.3748 -8.4286 -0.4465 8.1428 13.4348
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 76.660 6.961 11.012 3.33e-05 ***
# driving -1.548 0.527 -2.937 0.0261 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 10.32 on 6 degrees of freedom
# Multiple R-squared: 0.5897, Adjusted R-squared: 0.5213
# F-statistic: 8.624 on 1 and 6 DF, p-value: 0.02606
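The components stored in the fitted object can be listed with names():
names(lmfit_issu)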
# [1] "coefficients" "residuals" "effects" "rank"
# [5] "fitted.values" "assign" "qr" "df.residual"
# [9] "xlevels" "call" "terms" "model"
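Confidence intervals for the coefficients come from confint():
confint(lmfit_issu)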
# 2.5 % 97.5 %
# (Intercept) 59.626611 93.6941192
# driving -2.837062 -0.2581138
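The ANOVA table for the fit is produced by anova():
anova(lmfit_issu)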
# Analysis of Variance Table
#
# Response: premium
# Df Sum Sq Mean Sq F value Pr(>F)
# driving 1 918.49 918.49 8.6243 0.02606 *
# Residuals 6 639.01 106.50
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
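To predict premiums at new driving values, first set up a data frame of new x values (newdata is reused in the predictions and plot below); a sketch:
newdata <- data.frame(driving = 0:30)
newdata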
# driving
# 1 0
# 2 1
# 3 2
# 4 3
# 5 4
# 6 5
# 7 6
# 8 7
# 9 8
# 10 9
# 11 10
# 12 11
# 13 12
# 14 13
# 15 14
# 16 15
# 17 16
# 18 17
# 19 18
# 20 19
# 21 20
# 22 21
# 23 22
# 24 23
# 25 24
# 26 25
# 27 26
# 28 27
# 29 28
# 30 29
# 31 30
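Point predictions at these values can be obtained with predict(); a sketch:
data.frame(newdata, predicted.premium = predict(lmfit_issu, newdata))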
# driving predicted.premium
# 1 0 76.66037
# 2 1 75.11278
# 3 2 73.56519
# 4 3 72.01760
# 5 4 70.47001
# 6 5 68.92243
# 7 6 67.37484
# 8 7 65.82725
# 9 8 64.27966
# 10 9 62.73207
# 11 10 61.18449
# 12 11 59.63690
# 13 12 58.08931
# 14 13 56.54172
# 15 14 54.99413
# 16 15 53.44654
# 17 16 51.89896
# 18 17 50.35137
# 19 18 48.80378
# 20 19 47.25619
# 21 20 45.70860
# 22 21 44.16102
# 23 22 42.61343
# 24 23 41.06584
# 25 24 39.51825
# 26 25 37.97066
# 27 26 36.42308
# 28 27 34.87549
# 29 28 33.32790
# 30 29 31.78031
# 31 30 30.23272
# fit lwr upr
# 1 76.66037 59.626611 93.69412
# 2 75.11278 59.162862 91.06269
# 3 73.56519 58.666320 88.46406
# 4 72.01760 58.129539 85.90566
# 5 70.47001 57.543074 83.39695
# 6 68.92243 56.895010 80.94984
# 7 67.37484 56.170501 78.57917
# 8 65.82725 55.351511 76.30299
# 9 64.27966 54.417080 74.14224
# 10 62.73207 53.344559 72.11959
# 11 61.18449 52.112230 70.25674
# 12 59.63690 50.703158 68.57064
# 13 58.08931 49.109160 67.06946
# 14 56.54172 47.333033 65.75041
# 15 54.99413 45.387767 64.60050
# 16 53.44654 43.293215 63.59988
# 17 51.89896 41.071980 62.72593
# 18 50.35137 38.746101 61.95664
# 19 48.80378 36.335159 61.27240
# 20 47.25619 33.855584 60.65680
# 21 45.70860 31.320707 60.09650
# 22 44.16102 28.741148 59.58089
# 23 42.61343 26.125294 59.10156
# 24 41.06584 23.479757 58.65192
# 25 39.51825 20.809764 58.22674
# 26 37.97066 18.119462 57.82187
# 27 36.42308 15.412164 57.43399
# 28 34.87549 12.690536 57.06044
# 29 33.32790 9.956737 56.69907
# 30 31.78031 7.212530 56.34810
# 31 30.23272 4.459364 56.00609
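Prediction intervals for an individual premium use interval = "prediction" (int_y is reused in the plot below); a sketch:
int_y <- predict(lmfit_issu, newdata, interval = "prediction")
int_y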
# fit lwr upr
# 1 76.66037 46.200375 107.12036
# 2 75.11278 45.245370 104.98018
# 3 73.56519 44.245596 102.88478
# 4 72.01760 43.198502 100.83670
# 5 70.47001 42.101580 98.83845
# 6 68.92243 40.952425 96.89243
# 7 67.37484 39.748774 95.00090
# 8 65.82725 38.488571 93.16593
# 9 64.27966 37.170018 91.38930
# 10 62.73207 35.791627 89.67252
# 11 61.18449 34.352265 88.01671
# 12 59.63690 32.851193 86.42260
# 13 58.08931 31.288091 84.89053
# 14 56.54172 29.663065 83.42038
# 15 54.99413 27.976648 82.01162
# 16 53.44654 26.229779 80.66331
# 17 51.89896 24.423774 79.37414
# 18 50.35137 22.560283 78.14245
# 19 48.80378 20.641239 76.96632
# 20 47.25619 18.668809 75.84358
# 21 45.70860 16.645332 74.77188
# 22 44.16102 14.573273 73.74876
# 23 42.61343 12.455166 72.77169
# 24 41.06584 10.293572 71.83811
# 25 39.51825 8.091039 70.94547
# 26 37.97066 5.850072 70.09126
# 27 36.42308 3.573105 69.27305
# 28 34.87549 1.262480 68.48850
# 29 33.32790 -1.079562 67.73536
# 30 31.78031 -3.450898 67.01152
# 31 30.23272 -5.849519 66.31497
We can visualize these predictions with the following plot:
plot(x, y, ylim = range(int_y), xlim = c(0,30))
lines(newdata$driving, conf.int.means[,1], lty = 1)            # fitted regression line
lines(newdata$driving, conf.int.means[,2], lty = 2, col = 2)   # lower confidence limit for the mean
lines(newdata$driving, conf.int.means[,3], lty = 2, col = 2)   # upper confidence limit for the mean
lines(newdata$driving, int_y[,2], lty = 3, col = 3)            # lower prediction limit
lines(newdata$driving, int_y[,3], lty = 3, col = 3)            # upper prediction limit