# fit polynomial model of order 3 for: nox ~ dis
poly.fit = lm(nox ~ poly(dis, 3, raw = TRUE), data = b);
# summary of model
summary(poly.fit)## 
## Call:
## lm(formula = nox ~ poly(dis, 3, raw = TRUE), data = b)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.9341281  0.0207076  45.110  < 2e-16 ***
## poly(dis, 3, raw = TRUE)1 -0.1820817  0.0146973 -12.389  < 2e-16 ***
## poly(dis, 3, raw = TRUE)2  0.0219277  0.0029329   7.476 3.43e-13 ***
## poly(dis, 3, raw = TRUE)3 -0.0008850  0.0001727  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16# plot of resulting data
p1 = ggplot(b, aes(x = dis, y = nox)) + geom_point() + theme_bw()
p1 + stat_smooth(method = "lm", se = TRUE, fill = NA,
                formula = y ~ poly(x, 3, raw = TRUE), colour = "blue")# regression spline to predix nox using dis
spline.fit = lm(nox ~ bs(dis, df = 4, knots = c(6)), data = b)
summary(spline.fit)## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(6)), data = b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12387 -0.04012 -0.01033  0.02308  0.19446 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.76037    0.01018  74.667  < 2e-16 ***
## bs(dis, df = 4, knots = c(6))1 -0.23672    0.02321 -10.200  < 2e-16 ***
## bs(dis, df = 4, knots = c(6))2 -0.36177    0.02548 -14.200  < 2e-16 ***
## bs(dis, df = 4, knots = c(6))3 -0.33337    0.04044  -8.244 1.47e-15 ***
## bs(dis, df = 4, knots = c(6))4 -0.36220    0.05105  -7.095 4.45e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06208 on 501 degrees of freedom
## Multiple R-squared:  0.7152, Adjusted R-squared:  0.7129 
## F-statistic: 314.6 on 4 and 501 DF,  p-value: < 2.2e-16p1 + stat_smooth(method = "lm", formula = y ~ bs(x, 4, knots = c(6)),
                 se = TRUE, fill = NA)rss = rep(0, 10)
colors = c("blue", "red", "green", "yellow", "seagreen2", 
           "deeppink1", "purple", "sienna2", "orange", "slateblue1")
for (i in 3:10) {
    fit.i    = lm(nox ~ bs(dis, df = i), data = b)
    rss[i]   = sum(fit.i$residuals^2)
}
rss[3:10]## [1] 1.934107 1.922775 1.840173 1.833966 1.829884 1.816995 1.825653 1.792535df_rss = data.frame(df = 3:10, rss = rss[3:10])
ggplot(df_rss, aes(x = df, y = rss)) + geom_line() + theme_bw()p1 + stat_smooth(method = "lm", formula = y ~ bs(x, 3), 
                 se = FALSE, fill = NA, colour = colors[1]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 4), 
                 se = FALSE, fill = NA, colour = colors[2]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 5), 
                 se = FALSE, fill = NA, colour = colors[3]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 6), 
                 se = FALSE, fill = NA, colour = colors[4]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 7), 
                 se = FALSE, fill = NA, colour = colors[5]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 8), 
                 se = FALSE, fill = NA, colour = colors[6]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 9), 
                 se = FALSE, fill = NA, colour = colors[7]) +
    stat_smooth(method = "lm", formula = y ~ bs(x, 10),
                se = FALSE, fill = NA, colour = colors[7])