# fit polynomial model of order 3 for: nox ~ dis
poly.fit = lm(nox ~ poly(dis, 3, raw = TRUE), data = b);
# summary of model
summary(poly.fit)
##
## Call:
## lm(formula = nox ~ poly(dis, 3, raw = TRUE), data = b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9341281 0.0207076 45.110 < 2e-16 ***
## poly(dis, 3, raw = TRUE)1 -0.1820817 0.0146973 -12.389 < 2e-16 ***
## poly(dis, 3, raw = TRUE)2 0.0219277 0.0029329 7.476 3.43e-13 ***
## poly(dis, 3, raw = TRUE)3 -0.0008850 0.0001727 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
# plot of resulting data
p1 = ggplot(b, aes(x = dis, y = nox)) + geom_point() + theme_bw()
p1 + stat_smooth(method = "lm", se = TRUE, fill = NA,
formula = y ~ poly(x, 3, raw = TRUE), colour = "blue")
# regression spline to predix nox using dis
spline.fit = lm(nox ~ bs(dis, df = 4, knots = c(6)), data = b)
summary(spline.fit)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(6)), data = b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12387 -0.04012 -0.01033 0.02308 0.19446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.76037 0.01018 74.667 < 2e-16 ***
## bs(dis, df = 4, knots = c(6))1 -0.23672 0.02321 -10.200 < 2e-16 ***
## bs(dis, df = 4, knots = c(6))2 -0.36177 0.02548 -14.200 < 2e-16 ***
## bs(dis, df = 4, knots = c(6))3 -0.33337 0.04044 -8.244 1.47e-15 ***
## bs(dis, df = 4, knots = c(6))4 -0.36220 0.05105 -7.095 4.45e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06208 on 501 degrees of freedom
## Multiple R-squared: 0.7152, Adjusted R-squared: 0.7129
## F-statistic: 314.6 on 4 and 501 DF, p-value: < 2.2e-16
p1 + stat_smooth(method = "lm", formula = y ~ bs(x, 4, knots = c(6)),
se = TRUE, fill = NA)
rss = rep(0, 10)
colors = c("blue", "red", "green", "yellow", "seagreen2",
"deeppink1", "purple", "sienna2", "orange", "slateblue1")
for (i in 3:10) {
fit.i = lm(nox ~ bs(dis, df = i), data = b)
rss[i] = sum(fit.i$residuals^2)
}
rss[3:10]
## [1] 1.934107 1.922775 1.840173 1.833966 1.829884 1.816995 1.825653 1.792535
df_rss = data.frame(df = 3:10, rss = rss[3:10])
ggplot(df_rss, aes(x = df, y = rss)) + geom_line() + theme_bw()
p1 + stat_smooth(method = "lm", formula = y ~ bs(x, 3),
se = FALSE, fill = NA, colour = colors[1]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 4),
se = FALSE, fill = NA, colour = colors[2]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 5),
se = FALSE, fill = NA, colour = colors[3]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 6),
se = FALSE, fill = NA, colour = colors[4]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 7),
se = FALSE, fill = NA, colour = colors[5]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 8),
se = FALSE, fill = NA, colour = colors[6]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 9),
se = FALSE, fill = NA, colour = colors[7]) +
stat_smooth(method = "lm", formula = y ~ bs(x, 10),
se = FALSE, fill = NA, colour = colors[7])