Chapter VII - Moving Beyond Linearity

All the questions are as per the ISL seventh printing.

Common

libsUsed<-c("dplyr","ggplot2","tidyverse",
            "ISLR","caret","MASS", "gridExtra",
            "pls","latex2exp","data.table")
invisible(lapply(libsUsed, library, character.only = TRUE))

Question 7.6 - Page 299

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained. In this exercise, we will generate simulated data, and will then use this data to perform best subset selection.

Answer

Lets get the data.

set.seed(1984)
wageDat<-ISLR::Wage
wageDat %>% str %>% print
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...
## NULL
wageDat %>% summary %>% print
##       year           age                     maritl           race
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55
##  Max.   :2009   Max.   :80.00
##
##               education                     region               jobclass
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456
##  3. Some College   :650   3. East North Central:   0
##  4. College Grad   :685   4. West North Central:   0
##  5. Advanced Degree:426   5. South Atlantic    :   0
##                           6. East South Central:   0
##                           (Other)              :   0
##             health      health_ins      logwage           wage
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38
##                                      Median :4.653   Median :104.92
##                                      Mean   :4.654   Mean   :111.70
##                                      3rd Qu.:4.857   3rd Qu.:128.68
##                                      Max.   :5.763   Max.   :318.34
##
wageDat %>% sapply(unique) %>% sapply(length) %>% print
##       year        age     maritl       race  education     region   jobclass
##          7         61          5          4          5          1          2
##     health health_ins    logwage       wage
##          2          2        508        508
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
##     melanoma

a) Polynomial regression

all.deltas = rep(NA, 10)
for (i in 1:10) {
  glm.fit = glm(wage~poly(age, i), data=Wage)
  all.deltas[i] = cv.glm(Wage, glm.fit, K=10)$delta[2]
}
plot(1:10, all.deltas, xlab="Degree", ylab="CV error", type="l", pch=20, lwd=2, ylim=c(1590, 1700))
min.point = min(all.deltas)
sd.points = sd(all.deltas)
abline(h=min.point + 0.2 * sd.points, col="red", lty="dashed")
abline(h=min.point - 0.2 * sd.points, col="red", lty="dashed")
legend("topright", "0.2-standard deviation lines", lty="dashed", col="red")
# ANOVA
fits=list()
for (i in 1:10) {
  fits[[i]]=glm(wage~poly(age,i),data=wageDat)
}
anova(fits[[1]],fits[[2]],fits[[3]],fits[[4]],fits[[5]],
  fits[[6]],fits[[7]],fits[[8]],fits[[9]],fits[[10]])
## Analysis of Deviance Table
##
## Model  1: wage ~ poly(age, i)
## Model  2: wage ~ poly(age, i)
## Model  3: wage ~ poly(age, i)
## Model  4: wage ~ poly(age, i)
## Model  5: wage ~ poly(age, i)
## Model  6: wage ~ poly(age, i)
## Model  7: wage ~ poly(age, i)
## Model  8: wage ~ poly(age, i)
## Model  9: wage ~ poly(age, i)
## Model 10: wage ~ poly(age, i)
##    Resid. Df Resid. Dev Df Deviance
## 1       2998    5022216
## 2       2997    4793430  1   228786
## 3       2996    4777674  1    15756
## 4       2995    4771604  1     6070
## 5       2994    4770322  1     1283
## 6       2993    4766389  1     3932
## 7       2992    4763834  1     2555
## 8       2991    4763707  1      127
## 9       2990    4756703  1     7004
## 10      2989    4756701  1        3
  • The 4th degree looks the best at the moment
# 3rd or 4th degrees look best based on ANOVA test
# let's go with 4th degree fit
plot(wage~age, data=wageDat, col="darkgrey")
agelims = range(wageDat$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage~poly(age, 4), data=wageDat)
lm.pred = predict(lm.fit, data.frame(age=age.grid))
lines(age.grid, lm.pred, col="blue", lwd=2)

b) Step function and cross-validation

# cross-validation
cv.error <- rep(0,9)
for (i in 2:10) {
  wageDat$age.cut <- cut(wageDat$age,i)
  glm.fit <- glm(wage~age.cut, data=wageDat)
  cv.error[i-1] <- cv.glm(wageDat, glm.fit, K=10)$delta[1]  # [1]:std, [2]:bias-corrected
}
cv.error
## [1] 1732.337 1682.978 1636.736 1635.600 1624.174 1610.688 1604.081 1612.005
## [9] 1607.022
cv.error
## [1] 1732.337 1682.978 1636.736 1635.600 1624.174 1610.688 1604.081 1612.005
## [9] 1607.022
plot(2:10, cv.error, type="b")
cut.fit <- glm(wage~cut(age,8), data=wageDat)
preds <- predict(cut.fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- preds$fit + cbind(2*preds$se.fit, -2*preds$se.fit)
plot(wageDat$age, wageDat$wage, xlim=agelims, cex=0.5, col="darkgrey")
title("Fit with 8 Age Bands")
lines(age.grid, preds$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

Question 7.8 - Page 299

Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.

Answer

autoDat<-ISLR::Auto
autoDat %>% pivot_longer(-c(mpg,name),names_to="Params",values_to="Value") %>% ggplot(aes(x=mpg,y=Value)) +
  geom_point() +
  facet_wrap(~ Params, scales = "free_y")

Very clearly there is a lot of non-linearity in the mpg data, especially for acceleration, weight, displacement, horsepower.

rss = rep(NA, 10)
fits = list()
for (d in 1:10) {
    fits[[d]] = lm(mpg ~ poly(displacement, d), data = autoDat)
    rss[d] = deviance(fits[[d]])
}
rss %>% print
##  [1] 8378.822 7412.263 7392.322 7391.722 7380.838 7270.746 7089.716 6917.401
##  [9] 6737.801 6610.190
anova(fits[[1]],fits[[2]],fits[[3]],fits[[4]],fits[[5]],
  fits[[6]],fits[[7]],fits[[8]],fits[[9]],fits[[10]])
## Analysis of Variance Table
##
## Model  1: mpg ~ poly(displacement, d)
## Model  2: mpg ~ poly(displacement, d)
## Model  3: mpg ~ poly(displacement, d)
## Model  4: mpg ~ poly(displacement, d)
## Model  5: mpg ~ poly(displacement, d)
## Model  6: mpg ~ poly(displacement, d)
## Model  7: mpg ~ poly(displacement, d)
## Model  8: mpg ~ poly(displacement, d)
## Model  9: mpg ~ poly(displacement, d)
## Model 10: mpg ~ poly(displacement, d)
##    Res.Df    RSS Df Sum of Sq       F    Pr(>F)
## 1     390 8378.8
## 2     389 7412.3  1    966.56 55.7108 5.756e-13 ***
## 3     388 7392.3  1     19.94  1.1494  0.284364
## 4     387 7391.7  1      0.60  0.0346  0.852549
## 5     386 7380.8  1     10.88  0.6273  0.428823
## 6     385 7270.7  1    110.09  6.3455  0.012177 *
## 7     384 7089.7  1    181.03 10.4343  0.001344 **
## 8     383 6917.4  1    172.31  9.9319  0.001753 **
## 9     382 6737.8  1    179.60 10.3518  0.001404 **
## 10    381 6610.2  1    127.61  7.3553  0.006990 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Confirming our visual indications, we see that the second degree models work well.

library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
##     expand, pack, unpack
## Loaded glmnet 3.0-2
library(boot)
cv.errs = rep(NA, 15)
for (d in 1:15) {
    fit = glm(mpg ~ poly(displacement, d), data = Auto)
    cv.errs[d] = cv.glm(Auto, fit, K = 15)$delta[2]
}
which.min(cv.errs)
## [1] 10

Strangely, we seem to have ended up with a ten variable model here.

# Step functions
cv.errs = rep(NA, 10)
for (c in 2:10) {
    Auto$dis.cut = cut(Auto$displacement, c)
    fit = glm(mpg ~ dis.cut, data = Auto)
    cv.errs[c] = cv.glm(Auto, fit, K = 10)$delta[2]
}
which.min(cv.errs) %>% print
## [1] 9
library(splines)
cv.errs = rep(NA, 10)
for (df in 3:10) {
    fit = glm(mpg ~ ns(displacement, df = df), data = Auto)
    cv.errs[df] = cv.glm(Auto, fit, K = 10)$delta[2]
}
which.min(cv.errs) %>% print
## [1] 10
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
##     accumulate, when
## Loaded gam 1.16.1
# GAMs
fit = gam(mpg ~ s(displacement, 4) + s(horsepower, 4), data = Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
summary(fit)
##
## Call: gam(formula = mpg ~ s(displacement, 4) + s(horsepower, 4), data = Auto)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max
## -11.2982  -2.1592  -0.4394   2.1247  17.0946
##
## (Dispersion Parameter for gaussian family taken to be 15.3543)
##
##     Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 5880.697 on 382.9999 degrees of freedom
## AIC: 2194.05
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
##                     Df  Sum Sq Mean Sq F value  Pr(>F)
## s(displacement, 4)   1 15254.9 15254.9 993.524 < 2e-16 ***
## s(horsepower, 4)     1  1038.4  1038.4  67.632 3.1e-15 ***
## Residuals          383  5880.7    15.4
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
##                    Npar Df Npar F     Pr(F)
## (Intercept)
## s(displacement, 4)       3 13.613 1.863e-08 ***
## s(horsepower, 4)         3 15.606 1.349e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Question 7.9 - Pages 299-300

This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

(a) Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.

(b) Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.

(c) Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.

(d) Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.

(e) Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.

(f) Perform cross-validation or another approach in order to select the best degrees of freedom for a regression spline on this data. Describe your results.

Answer

boston<-MASS::Boston
boston %>% str %>% print
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## NULL
boston %>% summary %>% print
##       crim                zn             indus            chas
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000
##       nox               rm             age              dis
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127
##       rad              tax           ptratio          black
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90
##      lstat            medv
##  Min.   : 1.73   Min.   : 5.00
##  1st Qu.: 6.95   1st Qu.:17.02
##  Median :11.36   Median :21.20
##  Mean   :12.65   Mean   :22.53
##  3rd Qu.:16.95   3rd Qu.:25.00
##  Max.   :37.97   Max.   :50.00
boston %>% sapply(unique) %>% sapply(length) %>% print
##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax
##     504      26      76       2      81     446     356     412       9      66
## ptratio   black   lstat    medv
##      46     357     455     229

a) Polynomial

fit.03 <- lm(nox~poly(dis,3), data=boston)
dislims <- range(boston$dis)
dis.grid <- seq(dislims[1], dislims[2], 0.1)
preds <- predict(fit.03, newdata=list(dis=dis.grid), se=TRUE)
se.bands <- preds$fit + cbind(2*preds$se.fit, -2*preds$se.fit)
par(mfrow=c(1,1), mar=c(4.5,4.5,1,1), oma=c(0,0,4,0))
plot(boston$dis, boston$nox, xlim=dislims, cex=0.5, col="darkgrey")
title("Degree 3 Polynomial Fit")
lines(dis.grid, preds$fit, lwd=2, col="blue")
matlines(dis.grid, se.bands, lwd=1, col="blue", lty=3)
summary(fit.03)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = boston)
##
## Residuals:
##       Min        1Q    Median        3Q       Max
## -0.121130 -0.040619 -0.009738  0.023385  0.194904
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16

b) Multiple Polynomials

rss.error <- rep(0,10)
for (i in 1:10) {
  lm.fit <- lm(nox~poly(dis,i), data=boston)
  rss.error[i] <- sum(lm.fit$residuals^2)
}
rss.error
##  [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
##  [9] 1.833331 1.832171
plot(rss.error, type="b")

c) Cross validation and polynomial selection

require(boot)
set.seed(1)
cv.error <- rep(0,10)
for (i in 1:10) {
  glm.fit <- glm(nox~poly(dis,i), data=boston)
  cv.error[i] <- cv.glm(boston, glm.fit, K=10)$delta[1]  # [1]:std, [2]:bias-corrected
}
cv.error
##  [1] 0.005558263 0.004085706 0.003876521 0.003863342 0.004237452 0.005686862
##  [7] 0.010278897 0.006810868 0.033308607 0.004075599
plot(cv.error, type="b")
  • I feel like the second degree fit would be the most reasonable, though the fourth degree seems to be doing well.

d) Regression spline

fit.sp <- lm(nox~bs(dis, df=4), data=boston)
pred <- predict(fit.sp, newdata=list(dis=dis.grid), se=T)
plot(boston$dis, boston$nox, col="gray")
lines(dis.grid, pred$fit, lwd=2)
lines(dis.grid, pred$fit+2*pred$se, lty="dashed")
lines(dis.grid, pred$fit-2*pred$se, lty="dashed")
# set df to select knots at uniform quantiles of `dis`
attr(bs(boston$dis,df=4),"knots")  # only 1 knot at 50th percentile
##     50%
## 3.20745

e) Range of regression splines

rss.error <- rep(0,7)
for (i in 4:10) {
  fit.sp <- lm(nox~bs(dis, df=i), data=boston)
  rss.error[i-3] <- sum(fit.sp$residuals^2)
}
rss.error
## [1] 1.922775 1.840173 1.833966 1.829884 1.816995 1.825653 1.792535
plot(4:10, rss.error, type="b")
  • As the model gains more degrees of freedom, it tends to over fit to the training data better

f) Cross validation for best spline

cv.error <- rep(0,7)
for (i in 4:10) {
  glm.fit <- glm(nox~bs(dis, df=i), data=boston)
  cv.error[i-3] <- cv.glm(boston, glm.fit, K=10)$delta[1]
}
## Warning in bs(dis, degree = 3L, knots = c(`50%` = 3.1523), Boundary.knots =
## c(1.1296, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases

## Warning in bs(dis, degree = 3L, knots = c(`50%` = 3.1523), Boundary.knots =
## c(1.1296, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(dis, degree = 3L, knots = c(`50%` = 3.2157), Boundary.knots =
## c(1.137, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`50%` = 3.2157), Boundary.knots =
## c(1.137, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`33.33333%` = 2.35953333333333, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`33.33333%` = 2.35953333333333, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`33.33333%` = 2.38403333333333, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`33.33333%` = 2.38403333333333, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`25%` = 2.07945, `50%` = 3.1323, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`25%` = 2.07945, `50%` = 3.1323, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`25%` = 2.1103, `50%` = 3.2797, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`25%` = 2.1103, `50%` = 3.2797, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`20%` = 1.9682, `40%` = 2.7147, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`20%` = 1.9682, `40%` = 2.7147, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`20%` = 1.95434, `40%` = 2.59666, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`20%` = 1.95434, `40%` = 2.59666, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`16.66667%` = 1.82203333333333, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`16.66667%` = 1.82203333333333, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`16.66667%` = 1.8226, `33.33333%` =
## 2.3817, : some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`16.66667%` = 1.8226, `33.33333%` =
## 2.3817, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`14.28571%` = 1.7936, `28.57143%`
## = 2.16972857142857, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`14.28571%` = 1.7936, `28.57143%`
## = 2.16972857142857, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`12.5%` = 1.754625, `25%` = 2.10215, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`12.5%` = 1.754625, `25%` = 2.10215, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(`12.5%` = 1.751575, `25%` = 2.08755, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

## Warning in bs(dis, degree = 3L, knots = c(`12.5%` = 1.751575, `25%` = 2.08755, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
cv.error
## [1] 0.003898810 0.003694675 0.003732665 0.003766202 0.003716389 0.003723126
## [7] 0.003727358
plot(4:10, cv.error, type="b")
  • A fifth degree polynomial is clearly indicated

Question 10 - Page 300

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

(c) Evaluate the model obtained on the test set, and explain the results obtained.

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

Answer

colDat<-ISLR::College
colDat %>% str %>% print
## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...
## NULL
colDat %>% summary %>% print
##  Private        Apps           Accept          Enroll       Top10perc
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00
##            Median : 1558   Median : 1110   Median : 434   Median :23.00
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00
##    Top25perc      F.Undergrad     P.Undergrad         Outstate
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700
##    Room.Board       Books           Personal         PhD
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00
##     Terminal       S.F.Ratio      perc.alumni        Expend
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233
##    Grad.Rate
##  Min.   : 10.00
##  1st Qu.: 53.00
##  Median : 65.00
##  Mean   : 65.46
##  3rd Qu.: 78.00
##  Max.   :118.00
colDat %>% sapply(unique) %>% sapply(length) %>% print
##     Private        Apps      Accept      Enroll   Top10perc   Top25perc
##           2         711         693         581          82          89
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal
##         714         566         640         553         122         294
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend   Grad.Rate
##          78          65         173          61         744          81
plotLEAP=function(leapObj){
  par(mfrow = c(2,2))
  bar2=which.max(leapObj$adjr2)
  bbic=which.min(leapObj$bic)
  bcp=which.min(leapObj$cp)
  plot(leapObj$rss,xlab="Number of variables",ylab="RSS",type="b")
  plot(leapObj$adjr2,xlab="Number of variables",ylab=TeX("Adjusted R^2"),type="b")
  points(bar2,leapObj$adjr2[bar2],col="green",cex=2,pch=20)
  plot(leapObj$bic,xlab="Number of variables",ylab=TeX("BIC"),type="b")
  points(bbic,leapObj$bic[bbic],col="blue",cex=2,pch=20)
  plot(leapObj$cp,xlab="Number of variables",ylab=TeX("C_p"),type="b")
  points(bcp,leapObj$cp[bcp],col="red",cex=2,pch=20)
}

a) Train test

train_ind = sample(colDat %>% nrow,100)
test_ind = setdiff(seq_len(colDat %>% nrow), train_ind)

Best subset selection

train_set<-colDat[train_ind,]
test_set<-colDat[-train_ind,]
library(leaps)
modelFit<-regsubsets(Outstate~.,data=colDat,nvmax=20)
modelFit %>% summary %>% print
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = colDat, nvmax = 20)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: exhaustive
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "
## 8  ( 1 )  "*"        " "  "*"    " "    " "       " "       "*"
## 9  ( 1 )  "*"        "*"  "*"    " "    " "       " "       "*"
## 10  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"
## 11  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"
## 12  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"
## 13  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "
## 8  ( 1 )  " "         "*"        " "   " "      " " "*"      " "
## 9  ( 1 )  " "         "*"        " "   " "      " " "*"      " "
## 10  ( 1 ) " "         "*"        " "   " "      " " "*"      " "
## 11  ( 1 ) " "         "*"        " "   "*"      " " "*"      " "
## 12  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"
## 13  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"
## 15  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"
## 16  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "
## 2  ( 1 )  " "         "*"    " "
## 3  ( 1 )  " "         "*"    " "
## 4  ( 1 )  "*"         "*"    " "
## 5  ( 1 )  "*"         "*"    " "
## 6  ( 1 )  "*"         "*"    "*"
## 7  ( 1 )  "*"         "*"    "*"
## 8  ( 1 )  "*"         "*"    "*"
## 9  ( 1 )  "*"         "*"    "*"
## 10  ( 1 ) "*"         "*"    "*"
## 11  ( 1 ) "*"         "*"    "*"
## 12  ( 1 ) "*"         "*"    "*"
## 13  ( 1 ) "*"         "*"    "*"
## 14  ( 1 ) "*"         "*"    "*"
## 15  ( 1 ) "*"         "*"    "*"
## 16  ( 1 ) "*"         "*"    "*"
## 17  ( 1 ) "*"         "*"    "*"

We might want to take a look at these.

par(mfrow=c(2,2))
plot(modelFit)
plot(modelFit,scale='Cp')
plot(modelFit,scale='r2')
plot(modelFit,scale='adjr2')
plotLEAP(modelFit %>% summary)
  • So we like 14 variables, namely
coefficients(modelFit,id=14) %>% print
##   (Intercept)    PrivateYes          Apps        Accept        Enroll
## -1.817040e+03  2.256946e+03 -2.999022e-01  8.023519e-01 -5.372545e-01
##     Top10perc   F.Undergrad    Room.Board      Personal           PhD
##  2.365529e+01 -9.569936e-02  8.741819e-01 -2.478418e-01  1.269506e+01
##      Terminal     S.F.Ratio   perc.alumni        Expend     Grad.Rate
##  2.297296e+01 -4.700560e+01  4.195006e+01  2.003912e-01  2.383197e+01
  • But five seems like a better bet.
coefficients(modelFit,id=5)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni
## -2864.6325619  2936.7416766     1.0677573    40.5334088    61.3147684
##        Expend
##     0.2253945

b) GAM

library(gam)
fit = gam(Outstate ~ Private+s(Apps,3)+Accept+Enroll+
            Top10perc+F.Undergrad+Room.Board+
            Personal+PhD+Terminal+S.F.Ratio+
            perc.alumni+Expend+Grad.Rate
        , data = colDat)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
fit2 = gam(Outstate ~ Private+s(Room.Board,2)+s(PhD,3)+s(perc.alumni)+Expend
        , data = colDat)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
par(mfrow=c(2,2))
plot(fit,se=TRUE)

par(mfrow=c(2,2))
plot(fit2,se=TRUE)

c) Evaluate

pred <- predict(fit, test_set)
mse.error <- mean((test_set$Outstate - pred)^2)
mse.error %>% print
## [1] 3691891
gam.tss = mean((test_set$Outstate - mean(test_set$Outstate))^2)
test.rss = 1 - mse.error/gam.tss
test.rss %>% print
## [1] 0.7731239
pred2 <- predict(fit2, test_set)
mse.error2 <- mean((test_set$Outstate - pred2)^2)
mse.error2 %>% print
## [1] 4121902
gam.tss2 = mean((test_set$Outstate - mean(test_set$Outstate))^2)
test.rss2 = 1 - mse.error2/gam.tss2
test.rss2 %>% print
## [1] 0.7466987

This is pretty good model, all told.

d) Summary

summary(fit) %>% print
##
## Call: gam(formula = Outstate ~ Private + s(Apps, 3) + Accept + Enroll +
##     Top10perc + F.Undergrad + Room.Board + Personal + PhD + Terminal +
##     S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = colDat)
## Deviance Residuals:
##       Min        1Q    Median        3Q       Max
## -6641.083 -1262.806    -5.698  1270.911  9965.901
##
## (Dispersion Parameter for gaussian family taken to be 3749048)
##
##     Null Deviance: 12559297426 on 776 degrees of freedom
## Residual Deviance: 2849276343 on 760 degrees of freedom
## AIC: 13985.3
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
##              Df     Sum Sq    Mean Sq  F value    Pr(>F)
## Private       1 4034912907 4034912907 1076.250 < 2.2e-16 ***
## s(Apps, 3)    1 1344548030 1344548030  358.637 < 2.2e-16 ***
## Accept        1   90544274   90544274   24.151 1.091e-06 ***
## Enroll        1  144471570  144471570   38.535 8.838e-10 ***
## Top10perc     1 1802244831 1802244831  480.721 < 2.2e-16 ***
## F.Undergrad   1   45230645   45230645   12.065 0.0005430 ***
## Room.Board    1 1110285773 1110285773  296.151 < 2.2e-16 ***
## Personal      1   47886988   47886988   12.773 0.0003738 ***
## PhD           1  220249039  220249039   58.748 5.476e-14 ***
## Terminal      1   66366007   66366007   17.702 2.892e-05 ***
## S.F.Ratio     1  190811028  190811028   50.896 2.274e-12 ***
## perc.alumni   1  225293653  225293653   60.094 2.904e-14 ***
## Expend        1  258162295  258162295   68.861 4.805e-16 ***
## Grad.Rate     1   57947219   57947219   15.457 9.214e-05 ***
## Residuals   760 2849276343    3749048
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
##             Npar Df Npar F     Pr(F)
## (Intercept)
## Private
## s(Apps, 3)        2  8.571 0.0002085 ***
## Accept
## Enroll
## Top10perc
## F.Undergrad
## Room.Board
## Personal
## PhD
## Terminal
## S.F.Ratio
## perc.alumni
## Expend
## Grad.Rate
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit2) %>% print
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD,
##     3) + s(perc.alumni) + Expend, data = colDat)
## Deviance Residuals:
##       Min        1Q    Median        3Q       Max
## -8676.030 -1345.678    -8.409  1265.524  9590.459
##
## (Dispersion Parameter for gaussian family taken to be 4175193)
##
##     Null Deviance: 12559297426 on 776 degrees of freedom
## Residual Deviance: 3194023899 on 765.0002 degrees of freedom
## AIC: 14064.05
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
##                   Df     Sum Sq    Mean Sq F value    Pr(>F)
## Private            1 3751107814 3751107814  898.43 < 2.2e-16 ***
## s(Room.Board, 2)   1 2913770756 2913770756  697.88 < 2.2e-16 ***
## s(PhD, 3)          1 1149711330 1149711330  275.37 < 2.2e-16 ***
## s(perc.alumni)     1  556759894  556759894  133.35 < 2.2e-16 ***
## Expend             1  554812125  554812125  132.88 < 2.2e-16 ***
## Residuals        765 3194023899    4175193
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
##                  Npar Df Npar F     Pr(F)
## (Intercept)
## Private
## s(Room.Board, 2)       1 4.9853 0.0258517 *
## s(PhD, 3)              2 9.1614 0.0001171 ***
## s(perc.alumni)         3 0.8726 0.4548496
## Expend
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1