library(knitr)
library(rmarkdown)
# This chunk sets up the global options for the document.
knitr::opts_chunk$set(
  echo = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  warning = FALSE,
  message = FALSE
)

1 Fit of the Multiple Linear Regression Model

This section analyzes the wire bond strength dataset.

1.0.1 Data Loading and Visualization

Note: You must change the file paths in the read.csv() functions below to match the location of the files on your computer.

# Read data. Change the path as necessary.
# Example: bond.data <- read.csv("data/wire-bond.csv")
bond.data <- read.csv("wire-bond.csv")

# This will now be automatically rendered as a paged table
bond.data

Note: to enable paged tables in HTML output, set df_print: paged under the HTML output format in the YAML header, for example:

output:
  html_document:
    df_print: paged

# Create initial scatter plots
par(mfrow = c(1, 2))
plot(bond.data$length, bond.data$strength,
     xlab = "Wire Length", ylab = "Pull strength")
plot(bond.data$height, bond.data$strength,
     xlab = "Die height", ylab = "Pull strength")

1.0.2 Model Fitting and Summary

We fit a multiple linear regression model with strength as the response variable and length and height as predictors.

# Fit the linear model
fit <- lm(strength ~ length + height, data = bond.data)

# Display the classic summary output (this is not a data frame, so it's unaffected)
summary(fit)
#> 
#> Call:
#> lm(formula = strength ~ length + height, data = bond.data)
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#> -3.865 -1.542 -0.362  1.196  5.841 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 2.263791   1.060066   2.136 0.044099 *  
#> length      2.744270   0.093524  29.343  < 2e-16 ***
#> height      0.012528   0.002798   4.477 0.000188 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.288 on 22 degrees of freedom
#> Multiple R-squared:  0.9811, Adjusted R-squared:  0.9794 
#> F-statistic: 572.2 on 2 and 22 DF,  p-value: < 2.2e-16

The summary provides the ANOVA F-test for overall significance, \(R^2\), adjusted \(R^2\), and t-tests for individual coefficients.
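
These quantities can also be extracted programmatically from the summary object; a minimal sketch using the component names returned by summary.lm():

s <- summary(fit)
s$r.squared       # multiple R-squared
s$adj.r.squared   # adjusted R-squared
s$fstatistic      # F statistic with its numerator and denominator df
s$coefficients    # estimates, standard errors, t values, and p-values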

1.0.3 Confidence Intervals and Model Components

We can extract confidence intervals for the regression coefficients, as well as fitted values and residuals.

# Confidence intervals (matrix output, will be auto-paged)
confint(fit)
#>                   2.5 %     97.5 %
#> (Intercept) 0.065348613 4.46223426
#> length      2.550313061 2.93822623
#> height      0.006724246 0.01833138
# Fitted values and ordinary residuals, combined into a data frame (will be auto-paged)
pred <- fitted.values(fit)
e <- resid(fit)
data.frame(y = bond.data$strength, y.hat = pred, e = e)
# Extract covariance matrix (matrix output, will be auto-paged)
cov.mat <- vcov(fit)
cov.mat
#>              (Intercept)        length        height
#> (Intercept)  1.123740429 -3.921612e-02 -1.781991e-03
#> length      -0.039216122  8.746709e-03 -9.903775e-05
#> height      -0.001781991 -9.903775e-05  7.831149e-06
# Standard errors (vector output, needs to be a data.frame to be paged)
data.frame(std.error = sqrt(diag(cov.mat)))
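
As a cross-check, confint() can be reproduced from the coefficients, the standard errors computed above, and the appropriate t quantile; a minimal sketch:

tcrit <- qt(0.975, df.residual(fit))
se <- sqrt(diag(cov.mat))
cbind(lwr = coef(fit) - tcrit * se,
      upr = coef(fit) + tcrit * se)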

2 Partial F-test and t-test

This section uses data on the weight, height, and age of children to demonstrate partial F-tests.

# Data: Weight, height and age of children
wgt <- c(64, 71, 53, 67, 55, 58, 77, 57, 56, 51, 76, 68)
hgt <- c(57, 59, 49, 62, 51, 50, 55, 48, 42, 42, 61, 57)
age <- c(8, 10, 6, 11, 8, 7, 10, 9, 10, 6, 12, 9)
child.data <- data.frame(wgt, hgt, age)

2.0.1 Problem 1: Simple Linear Regression

fit11 <- lm(wgt ~ hgt, data = child.data)
summary(fit11)
#> 
#> Call:
#> lm(formula = wgt ~ hgt, data = child.data)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.8736 -3.8973 -0.4402  2.2624 11.8375 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)   
#> (Intercept)   6.1898    12.8487   0.482  0.64035   
#> hgt           1.0722     0.2417   4.436  0.00126 **
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 5.471 on 10 degrees of freedom
#> Multiple R-squared:  0.663,  Adjusted R-squared:  0.6293 
#> F-statistic: 19.67 on 1 and 10 DF,  p-value: 0.001263

2.0.2 Problem 2: Adding a Predictor

Here we test whether adding age significantly improves a model that already contains hgt.

fit21 <- lm(wgt ~ hgt + age, data = child.data)
summary(fit21)
#> 
#> Call:
#> lm(formula = wgt ~ hgt + age, data = child.data)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -6.8708 -1.7004  0.3454  1.4642 10.2336 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)   6.5530    10.9448   0.599   0.5641  
#> hgt           0.7220     0.2608   2.768   0.0218 *
#> age           2.0501     0.9372   2.187   0.0565 .
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 4.66 on 9 degrees of freedom
#> Multiple R-squared:   0.78,  Adjusted R-squared:  0.7311 
#> F-statistic: 15.95 on 2 and 9 DF,  p-value: 0.001099
# Compare the nested models (anova output is a data frame, will be auto-paged)
fit22 <- lm(wgt ~ hgt, data = child.data)
anova(fit22, fit21)
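
The partial F statistic reported by anova() can also be assembled by hand from the two residual sums of squares, where q is the number of coefficients being tested; a minimal sketch:

sse.red  <- sum(resid(fit22)^2)   # SSE of the reduced model
sse.full <- sum(resid(fit21)^2)   # SSE of the full model
q <- 1                            # only the age coefficient is tested
F.stat <- ((sse.red - sse.full) / q) / (sse.full / df.residual(fit21))
c(F = F.stat, p.value = pf(F.stat, q, df.residual(fit21), lower.tail = FALSE))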

2.0.3 Problem 3: Adding a Polynomial Term

We test whether adding a quadratic term in age improves the model that already contains hgt and age.

fit31 <- lm(wgt ~ hgt + age + I(age^2), data = child.data)
summary(fit31)
#> 
#> Call:
#> lm(formula = wgt ~ hgt + age + I(age^2), data = child.data)
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#> -6.806 -1.771  0.374  1.374 10.161 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  3.43843   33.61082   0.102    0.921  
#> hgt          0.72369    0.27696   2.613    0.031 *
#> age          2.77687    7.42728   0.374    0.718  
#> I(age^2)    -0.04171    0.42241  -0.099    0.924  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 4.94 on 8 degrees of freedom
#> Multiple R-squared:  0.7803, Adjusted R-squared:  0.6978 
#> F-statistic: 9.469 on 3 and 8 DF,  p-value: 0.005208
fit32 <- lm(wgt ~ hgt + age, data = child.data)
anova(fit32, fit31)

2.0.4 Problem 4: Testing Multiple Coefficients

We test whether the age and age^2 terms are jointly significant; the null hypothesis sets both coefficients to zero, so the partial F-test has q = 2.

fit41 <- lm(wgt ~ hgt + age + I(age^2), data = child.data)
fit42 <- lm(wgt ~ hgt, data = child.data)
anova(fit42, fit41)

3 Mean Response and Prediction Intervals

This section demonstrates how to calculate confidence and prediction intervals for the wire bond strength data.

3.0.1 Confidence Interval for Mean Response

Construct a 95% CI on the mean pull strength for a wire bond with wire length = 8 and die height = 275.

# We use the model 'fit' from the first section (predict output is a matrix)
predict(fit, newdata = data.frame(length = 8, height = 275),
        interval = "confidence", level = 0.95)
#>       fit      lwr      upr
#> 1 27.6631 26.66324 28.66296
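
The same interval can be built from first principles: the variance of the estimated mean response is \(\mathbf{x}_0' \operatorname{Cov}(\hat{\boldsymbol\beta})\, \mathbf{x}_0\). A minimal sketch:

x0 <- c(1, 8, 275)   # intercept, length, height
y0 <- sum(x0 * coef(fit))
se.mean <- sqrt(drop(t(x0) %*% vcov(fit) %*% x0))
tcrit <- qt(0.975, df.residual(fit))
c(fit = y0, lwr = y0 - tcrit * se.mean, upr = y0 + tcrit * se.mean)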

3.0.2 Prediction Interval for a New Observation

Construct a 95% PI on the pull strength for a new wire bond with wire length = 8 and die height = 275.

predict(fit, newdata = data.frame(length = 8, height = 275),
        interval = "prediction", level = 0.95)
#>       fit      lwr      upr
#> 1 27.6631 22.81378 32.51241
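
The center is the same, but the standard error now includes the error variance of the new observation, \(\sqrt{\hat\sigma^2 + \mathbf{x}_0' \operatorname{Cov}(\hat{\boldsymbol\beta})\, \mathbf{x}_0}\), which is why the prediction interval is wider. A sketch reusing x0, y0, and tcrit from above:

se.pred <- sqrt(sigma(fit)^2 + drop(t(x0) %*% vcov(fit) %*% x0))
c(fit = y0, lwr = y0 - tcrit * se.pred, upr = y0 + tcrit * se.pred)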

4 Model Diagnostics

This section covers various methods for diagnosing the model fit using residuals and other measures.

4.0.1 Residual Calculations

We can calculate hat values and several types of residuals. They are combined here into a single data frame, which will be auto-paged.

residuals_df <- data.frame(
  hat_values = hatvalues(fit),                     # leverages h_ii
  ordinary_resid = resid(fit),                     # e_i
  standardized_resid = resid(fit) / sigma(fit),    # e_i scaled by sigma_hat
  studentized_internal = rstandard(fit),           # e_i / (sigma_hat * sqrt(1 - h_ii))
  studentized_external = rstudent(fit)             # sigma_hat re-estimated without obs i
)
residuals_df
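
As a check on the definitions, the internally studentized residuals should satisfy \(r_i = e_i / (\hat\sigma \sqrt{1 - h_{ii}})\); a minimal sketch:

h <- hatvalues(fit)
all.equal(resid(fit) / (sigma(fit) * sqrt(1 - h)), rstandard(fit))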

4.0.2 Residual Plots

A common practice is to analyze plots of the studentized residuals to check for non-linearity, non-constant variance, and non-normality.

# Residual analysis using internally studentized residuals
n <- nrow(bond.data)
r <- rstandard(fit) 
y.hat <- fitted.values(fit)

par(mfrow = c(2, 3))
qqnorm(r, main = "Normal Q-Q Plot")
qqline(r)
plot(y.hat, r, xlab = "Fitted values", ylab = "Studentized Residuals")
abline(h = 0)
plot(1:n, r, xlab = "Observation Number", ylab = "Studentized Residuals")
abline(h = 0)
plot(bond.data$length, r, xlab = "Wire Length", ylab = "Studentized Residuals")
abline(h = 0)
plot(bond.data$height, r, xlab = "Die Height", ylab = "Studentized Residuals")
abline(h = 0)

5 Influential Observations

We can identify influential data points using metrics like DFFITS, DFBETAS, and Cook’s D.

# Find DFFITS, DFBETAS and Cook's D for the wire bond strength data.
influence_df <- data.frame(dffits = dffits(fit),
                           cook.D = cooks.distance(fit),
                           dfbetas(fit))
influence_df
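
One way to screen this table is against common rules of thumb, here |DFFITS| > 2*sqrt(p/n) and Cook's D > 4/n (cutoffs vary across textbooks, so treat them as guides rather than firm rules); a minimal sketch:

p <- length(coef(fit))
n <- nrow(bond.data)
subset(influence_df, abs(dffits) > 2 * sqrt(p / n) | cook.D > 4 / n)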

5.0.1 Plotting with the olsrr Package

The olsrr package provides convenient functions for plotting these influence diagnostics. Note: You only need to run install.packages("olsrr") once.

# install.packages("olsrr") # Run this once if you don't have the package
library(olsrr)

# Plot for Cook's D
ols_plot_cooksd_chart(fit)

# Plot for DFFITS
ols_plot_dffits(fit)

# Plot for DFBETAS
ols_plot_dfbetas(fit)

6 Polynomial Regression

This example uses data on the cost and production lot size of airplane sidewall panels.

y <- c(1.81, 1.70, 1.65, 1.55, 1.48, 1.40, 1.30, 1.26, 1.24, 1.21, 1.20, 1.18)
x <- c(20, 25, 30, 35, 40, 50, 60, 65, 70, 75, 80, 90)
fit_poly <- lm(y ~ x + I(x^2))
summary(fit_poly)
#> 
#> Call:
#> lm(formula = y ~ x + I(x^2))
#> 
#> Residuals:
#>        Min         1Q     Median         3Q        Max 
#> -0.0174763 -0.0065087  0.0001297  0.0071482  0.0151887 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  2.198e+00  2.255e-02   97.48 6.38e-15 ***
#> x           -2.252e-02  9.424e-04  -23.90 1.88e-09 ***
#> I(x^2)       1.251e-04  8.658e-06   14.45 1.56e-07 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.01219 on 9 degrees of freedom
#> Multiple R-squared:  0.9975, Adjusted R-squared:  0.9969 
#> F-statistic:  1767 on 2 and 9 DF,  p-value: 2.096e-12

The plot shows the fitted quadratic curve.

plot(x, y, xlab = "Lot size, x", ylab = "Average cost per unit, y")
lines(x, predict(fit_poly, newdata = data.frame(x = x)))

We can use a partial F-test to see if the quadratic term significantly improves the model.

fit1 <- lm(y ~ x)
anova(fit1, fit_poly)
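
An equivalent fit uses orthogonal polynomials via poly(), which avoids the strong correlation between \(x\) and \(x^2\); the fitted values are identical, only the parameterization changes. A minimal sketch:

fit_orth <- lm(y ~ poly(x, 2))
all.equal(fitted(fit_orth), fitted(fit_poly))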

7 Indicator Variables

This section uses the SBP (Systolic Blood Pressure) dataset to model the effect of a categorical variable (sex).

# Note: Update this path to your local file location
sbpdata <- read.csv("sbpdata.csv")

# Fit the full model with an interaction term
fit.full <- lm(sbp ~ age + sex + age:sex, data = sbpdata)
summary(fit.full)
#> 
#> Call:
#> lm(formula = sbp ~ age + sex + age:sex, data = sbpdata)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -20.647  -3.410   1.254   4.314  21.153 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 97.07708    5.17046  18.775  < 2e-16 ***
#> age          0.94932    0.10864   8.738 1.43e-12 ***
#> sex         12.96144    7.01172   1.849   0.0691 .  
#> age:sex      0.01203    0.14519   0.083   0.9342    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 8.946 on 65 degrees of freedom
#> Multiple R-squared:  0.7759, Adjusted R-squared:  0.7656 
#> F-statistic: 75.02 on 3 and 65 DF,  p-value: < 2.2e-16
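
With this coding, the female (sex = 0) line is \(\beta_0 + \beta_1\,\mathrm{age}\) and the male (sex = 1) line is \((\beta_0 + \beta_2) + (\beta_1 + \beta_3)\,\mathrm{age}\). A minimal sketch assembling both from the fitted coefficients:

b <- unname(coef(fit.full))  # (b0, b1, b2, b3)
rbind(female = c(intercept = b[1], slope = b[2]),
      male   = c(intercept = b[1] + b[3], slope = b[2] + b[4]))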

7.0.1 Hypothesis Tests for Indicator Models

We can perform hypothesis tests to decide whether the two regression lines are coincident, parallel, or have equal intercepts.

# Test of Coincidence. H0: beta2=beta3=0
fit.coin <- lm(sbp ~ age, data = sbpdata)
anova(fit.coin, fit.full)
# Test of Parallelism. H0: beta3=0
fit.para <- lm(sbp ~ age + sex, data = sbpdata)
anova(fit.para, fit.full)
# Test for Equal Intercepts. H0: beta2=0 (assuming parallel lines)
fit1.inter <- lm(sbp ~ age + sex, data = sbpdata)
fit2.inter <- lm(sbp ~ age, data = sbpdata)
anova(fit2.inter, fit1.inter)

7.0.2 Plotting the Fitted Model

The plot below shows the separate regression lines for males and females.

# Define colors for males (sex = 1) and females (sex = 0).
colors <- ifelse(sbpdata$sex == 1, "black", "gray")

# Scatter plot
plot(sbpdata$age, sbpdata$sbp, xlab = "Age", ylab = "SBP", col = colors)

# Add the fitted lines for males and females.
# The model row is (1, age, 1, age) for sex = 1 and (1, age, 0, 0) for sex = 0.
# Sort by age so that lines() draws each line left to right.
age.s <- sort(sbpdata$age)
lines(age.s, cbind(1, age.s, 1, age.s) %*% coef(fit.full), col = "black")
lines(age.s, cbind(1, age.s, 0, 0) %*% coef(fit.full), col = "gray")

# Add legend
legend("topleft", legend = c("sex = 1 (male)", "sex = 0 (female)"), 
       lty = c(1, 1), col = c("black", "gray"))

8 Model Building

This section uses the olsrr package and wine quality data to demonstrate automated model selection techniques.

library(olsrr)
# Note: Update this path to your local file location
wine <- read.csv("wine.csv")

# A dot on the right-hand side of the formula means: use all other variables in the data as predictors.
model <- lm(quality ~ ., data = wine)

8.0.1 All Possible Regression

This method fits every possible combination of the predictors and reports the best model of each size. The output is a data frame and will be auto-paged.

ols_step_best_subset(model)
#>              Best Subsets Regression             
#> -------------------------------------------------
#> Model Index    Predictors
#> -------------------------------------------------
#>      1         flavor                             
#>      2         flavor oakiness                    
#>      3         aroma flavor oakiness              
#>      4         clarity aroma flavor oakiness      
#>      5         clarity aroma body flavor oakiness 
#> -------------------------------------------------
#> 
#>                                                   Subsets Regression Summary                                                   
#> -------------------------------------------------------------------------------------------------------------------------------
#>                        Adj.        Pred                                                                                         
#> Model    R-Square    R-Square    R-Square     C(p)       AIC        SBIC        SBC        MSEP       FPE       HSP       APC  
#> -------------------------------------------------------------------------------------------------------------------------------
#>   1        0.6242      0.6137      0.5868    9.0436    130.0214    21.6859    134.9341    61.4102    1.7010    0.0462    0.4176 
#>   2        0.6611      0.6417      0.6058    6.8132    128.0901    20.1242    134.6404    57.0033    1.6171    0.0441    0.3970 
#>   3        0.7038      0.6776      0.6379    3.9278    124.9781    18.0702    133.1661    51.3383    1.4906    0.0409    0.3659 
#>   4        0.7147      0.6801      0.6102    4.6747    125.5480    19.2854    135.3736    50.9872    1.5143    0.0418    0.3717 
#>   5        0.7206      0.6769       0.587    6.0000    126.7552    21.0956    138.2183    51.5452    1.5649    0.0436    0.3842 
#> -------------------------------------------------------------------------------------------------------------------------------
#> AIC: Akaike Information Criteria 
#>  SBIC: Sawa's Bayesian Information Criteria 
#>  SBC: Schwarz Bayesian Criteria 
#>  MSEP: Estimated error of prediction, assuming multivariate normality 
#>  FPE: Final Prediction Error 
#>  HSP: Hocking's Sp 
#>  APC: Amemiya Prediction Criteria
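
As a sanity check, the AIC column should agree with base R's AIC() applied to the corresponding fit; a sketch for the three-predictor model (assuming wine has the column names shown in the listing above):

fit3 <- lm(quality ~ aroma + flavor + oakiness, data = wine)
AIC(fit3)  # approximately 124.98, matching model 3 above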

8.0.2 Automated Stepwise Procedures

We can also use backward elimination, forward selection, or stepwise selection to find a good model. These functions also return data frames. In this example, all three procedures arrive at the same final model, with aroma, flavor, and oakiness as predictors.

# Backward Elimination (alpha_out = 0.1)
ols_step_backward_p(model, p_val = 0.1)
#> 
#> 
#>                              Stepwise Summary                             
#> ------------------------------------------------------------------------
#> Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
#> ------------------------------------------------------------------------
#>  0      Full Model    126.755    138.218    21.096    0.72060    0.67694 
#>  1      body          125.548    135.374    19.285    0.71471    0.68013 
#>  2      clarity       124.978    133.166    18.070    0.70377    0.67763 
#> ------------------------------------------------------------------------
#> 
#> Final Model Output 
#> ------------------
#> 
#>                          Model Summary                          
#> ---------------------------------------------------------------
#> R                       0.839       RMSE                 1.098 
#> R-Squared               0.704       MSE                  1.207 
#> Adj. R-Squared          0.678       Coef. Var            9.338 
#> Pred R-Squared          0.638       AIC                124.978 
#> MAE                     0.868       SBC                133.166 
#> ---------------------------------------------------------------
#>  RMSE: Root Mean Square Error 
#>  MSE: Mean Square Error 
#>  MAE: Mean Absolute Error 
#>  AIC: Akaike Information Criteria 
#>  SBC: Schwarz Bayesian Criteria 
#> 
#>                                ANOVA                                
#> -------------------------------------------------------------------
#>                Sum of                                              
#>               Squares        DF    Mean Square      F         Sig. 
#> -------------------------------------------------------------------
#> Regression    108.935         3         36.312    26.925    0.0000 
#> Residual       45.853        34          1.349                     
#> Total         154.788        37                                    
#> -------------------------------------------------------------------
#> 
#>                                   Parameter Estimates                                    
#> ----------------------------------------------------------------------------------------
#>       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
#> ----------------------------------------------------------------------------------------
#> (Intercept)     6.467         1.333                  4.852    0.000     3.759     9.176 
#>       aroma     0.580         0.262        0.307     2.213    0.034     0.047     1.113 
#>      flavor     1.200         0.275        0.603     4.364    0.000     0.641     1.758 
#>    oakiness    -0.602         0.264       -0.217    -2.278    0.029    -1.140    -0.065 
#> ----------------------------------------------------------------------------------------
# Forward Selection (alpha_in = 0.1)
ols_step_forward_p(model, p_val = 0.1)
#> 
#> 
#>                              Stepwise Summary                             
#> ------------------------------------------------------------------------
#> Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
#> ------------------------------------------------------------------------
#>  0      Base Model    165.209    168.484    55.141    0.00000    0.00000 
#>  1      flavor        130.021    134.934    21.686    0.62417    0.61373 
#>  2      oakiness      128.090    134.640    20.124    0.66111    0.64175 
#>  3      aroma         124.978    133.166    18.070    0.70377    0.67763 
#> ------------------------------------------------------------------------
#> 
#> Final Model Output 
#> ------------------
#> 
#>                          Model Summary                          
#> ---------------------------------------------------------------
#> R                       0.839       RMSE                 1.098 
#> R-Squared               0.704       MSE                  1.207 
#> Adj. R-Squared          0.678       Coef. Var            9.338 
#> Pred R-Squared          0.638       AIC                124.978 
#> MAE                     0.868       SBC                133.166 
#> ---------------------------------------------------------------
#>  RMSE: Root Mean Square Error 
#>  MSE: Mean Square Error 
#>  MAE: Mean Absolute Error 
#>  AIC: Akaike Information Criteria 
#>  SBC: Schwarz Bayesian Criteria 
#> 
#>                                ANOVA                                
#> -------------------------------------------------------------------
#>                Sum of                                              
#>               Squares        DF    Mean Square      F         Sig. 
#> -------------------------------------------------------------------
#> Regression    108.935         3         36.312    26.925    0.0000 
#> Residual       45.853        34          1.349                     
#> Total         154.788        37                                    
#> -------------------------------------------------------------------
#> 
#>                                   Parameter Estimates                                    
#> ----------------------------------------------------------------------------------------
#>       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
#> ----------------------------------------------------------------------------------------
#> (Intercept)     6.467         1.333                  4.852    0.000     3.759     9.176 
#>      flavor     1.200         0.275        0.603     4.364    0.000     0.641     1.758 
#>    oakiness    -0.602         0.264       -0.217    -2.278    0.029    -1.140    -0.065 
#>       aroma     0.580         0.262        0.307     2.213    0.034     0.047     1.113 
#> ----------------------------------------------------------------------------------------
# Stepwise Regression (alpha_in = 0.1, alpha_out = 0.1)
ols_step_both_p(model, p_enter = 0.1, p_remove = 0.1)
#> 
#> 
#>                               Stepwise Summary                              
#> --------------------------------------------------------------------------
#> Step    Variable          AIC        SBC       SBIC       R2       Adj. R2 
#> --------------------------------------------------------------------------
#>  0      Base Model      165.209    168.484    55.141    0.00000    0.00000 
#>  1      flavor (+)      130.021    134.934    21.686    0.62417    0.61373 
#>  2      oakiness (+)    128.090    134.640    20.124    0.66111    0.64175 
#>  3      aroma (+)       124.978    133.166    18.070    0.70377    0.67763 
#> --------------------------------------------------------------------------
#> 
#> Final Model Output 
#> ------------------
#> 
#>                          Model Summary                          
#> ---------------------------------------------------------------
#> R                       0.839       RMSE                 1.098 
#> R-Squared               0.704       MSE                  1.207 
#> Adj. R-Squared          0.678       Coef. Var            9.338 
#> Pred R-Squared          0.638       AIC                124.978 
#> MAE                     0.868       SBC                133.166 
#> ---------------------------------------------------------------
#>  RMSE: Root Mean Square Error 
#>  MSE: Mean Square Error 
#>  MAE: Mean Absolute Error 
#>  AIC: Akaike Information Criteria 
#>  SBC: Schwarz Bayesian Criteria 
#> 
#>                                ANOVA                                
#> -------------------------------------------------------------------
#>                Sum of                                              
#>               Squares        DF    Mean Square      F         Sig. 
#> -------------------------------------------------------------------
#> Regression    108.935         3         36.312    26.925    0.0000 
#> Residual       45.853        34          1.349                     
#> Total         154.788        37                                    
#> -------------------------------------------------------------------
#> 
#>                                   Parameter Estimates                                    
#> ----------------------------------------------------------------------------------------
#>       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
#> ----------------------------------------------------------------------------------------
#> (Intercept)     6.467         1.333                  4.852    0.000     3.759     9.176 
#>      flavor     1.200         0.275        0.603     4.364    0.000     0.641     1.758 
#>    oakiness    -0.602         0.264       -0.217    -2.278    0.029    -1.140    -0.065 
#>       aroma     0.580         0.262        0.307     2.213    0.034     0.047     1.113 
#> ----------------------------------------------------------------------------------------

9 Multicollinearity

This section explores the issue of multicollinearity, where predictors are highly correlated.

9.0.1 A Simple Example

y <- c(19, 20, 37, 39, 36, 38)
x1 <- c(4, 4, 7, 7, 7.1, 7.1)
x2 <- c(16, 16, 49, 49, 50.4, 50.4)
cor(data.frame(x1, x2))
#>           x1        x2
#> x1 1.0000000 0.9999713
#> x2 0.9999713 1.0000000
# Full model
fit_multi <- lm(y ~ x1 + x2)
summary(fit_multi)
#> 
#> Call:
#> lm(formula = y ~ x1 + x2)
#> 
#> Residuals:
#>    1    2    3    4    5    6 
#> -0.5  0.5 -1.0  1.0 -1.0  1.0 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -156.056    117.158  -1.332    0.275
#> x1            65.444     45.890   1.426    0.249
#> x2            -5.389      4.152  -1.298    0.285
#> 
#> Residual standard error: 1.225 on 3 degrees of freedom
#> Multiple R-squared:  0.9897, Adjusted R-squared:  0.9829 
#> F-statistic: 144.3 on 2 and 3 DF,  p-value: 0.001043
# Simple model
fit1_multi <- lm(y ~ x1)
summary(fit1_multi)
#> 
#> Call:
#> lm(formula = y ~ x1)
#> 
#> Residuals:
#>       1       2       3       4       5       6 
#> -0.5260  0.4740 -0.1925  1.8075 -1.7814  0.2186 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -4.0293     2.3332  -1.727    0.159    
#> x1            5.8888     0.3762  15.654 9.73e-05 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.325 on 4 degrees of freedom
#> Multiple R-squared:  0.9839, Adjusted R-squared:  0.9799 
#> F-statistic: 245.1 on 1 and 4 DF,  p-value: 9.725e-05
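
Note the classic symptom: the overall F test is highly significant, yet neither individual coefficient in the full model is, because the near-perfect correlation between x1 and x2 inflates their standard errors. A minimal sketch computing the (shared) variance inflation factor from its definition \(\mathrm{VIF} = 1 / (1 - R^2)\):

r2 <- summary(lm(x1 ~ x2))$r.squared
1 / (1 - r2)  # enormous, reflecting the 0.9999713 correlation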

9.0.2 VIFs in the Wine Quality Data

We can calculate the Variance Inflation Factor (VIF) for each predictor to diagnose multicollinearity. A VIF > 10 is often considered problematic.

wine.x <- wine[, -ncol(wine)] # Assuming quality is the last column
cor(wine.x)
#>              clarity     aroma       body      flavor  oakiness
#> clarity   1.00000000 0.0619021 -0.3083783 -0.08515993 0.1832147
#> aroma     0.06190210 1.0000000  0.5489102  0.73656121 0.2016444
#> body     -0.30837826 0.5489102  1.0000000  0.64665917 0.1521059
#> flavor   -0.08515993 0.7365612  0.6466592  1.00000000 0.1797605
#> oakiness  0.18321471 0.2016444  0.1521059  0.17976051 1.0000000
# Find VIF's using the olsrr package (returns a data frame).
ols_vif_tol(model)
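
ols_vif_tol() reports tolerance alongside the VIF; the two are reciprocals (tolerance \(= 1 - R_j^2 = 1 / \mathrm{VIF}_j\)), so the VIF > 10 rule is equivalent to tolerance < 0.1. A minimal sketch reproducing one VIF by regressing that predictor on the others:

r2.flavor <- summary(lm(flavor ~ clarity + aroma + body + oakiness, data = wine))$r.squared
1 / (1 - r2.flavor)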