# Estimate the population mean of medv (Boston housing data, MASS package)
# and its standard error, first analytically, then via the bootstrap.
data(Boston)
b <- Boston
mu_hat <- mean(b$medv) # estimate of the population mean of medv
n <- nrow(b)
se <- sd(b$medv) / sqrt(n) # analytic standard error of the sample mean
set.seed(1)
iters <- 1e4 # number of bootstrap replicates
# function to use to get the approximation
# Statistic for boot(): mean of the observations selected by the
# bootstrap index vector.
#   d   - numeric vector of data
#   ind - integer indices of the resampled observations
boot.fn <- function(d, ind) {
  mean(d[ind])
}
# Nonparametric bootstrap of the sample mean of medv.
boot_result <- boot(data = b$medv, statistic = boot.fn, R = iters)
boot_result
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = b$medv, statistic = boot.fn, R = iters)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.0005436561 0.4054831
# Bootstrap standard error computed from the replicates themselves
# (previously hand-transcribed from the printout above as 0.40548).
se_hat <- sd(boot_result$t)
# Approximate 95% CI: mu_hat +/- 2 standard errors
low <- mu_hat - 2 * se_hat
high <- mu_hat + 2 * se_hat
boot_ci <- c(low, high)
# Compare with the t-based confidence interval
medv_test <- t.test(b$medv)
ci <- c(medv_test$conf.int[1], medv_test$conf.int[2])
Bootstrap approximate 95% CI: \[ \left( 21.722, 23.344 \right) \]
t-test 95% CI: \[ \left( 21.730, 23.336 \right) \]
\[ Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \epsilon \]
# True cubic model coefficients: Y = b0 + b1 X + b2 X^2 + b3 X^3 + eps.
beta <- c(-2, -1, 1, 2)
# Evaluate the cubic model. `coefs` and `noise` default to the globals
# `beta` and `eps`, preserving the original call f(x) while removing the
# hidden dependency on global state.
#   x     - numeric vector of predictor values
#   coefs - length-4 coefficient vector (intercept, linear, quadratic, cubic)
#   noise - error term added to the response (vector or scalar)
f <- function(x, coefs = beta, noise = eps) {
  coefs[1] + coefs[2] * x + coefs[3] * x^2 + coefs[4] * x^3 + noise
}
# Simulate n = 100 observations from the cubic model. NOTE: x and eps must
# be drawn in exactly this order so the values are reproducible under
# set.seed(123) — do not reorder these calls.
set.seed(123)
x = rnorm(100); # predictor
eps = rnorm(100) # noise term read by f() from the global environment
y = f(x) # response from the true cubic model
d = data.frame(x, y)
# Best subset selection over raw polynomial terms of degree 1..10
# (leaps::regsubsets). Use TRUE rather than T (T is reassignable).
regfit.full <- regsubsets(y ~ poly(x, 10, raw = TRUE),
                          data = d, nvmax = 10, method = "exhaustive")
full.sum <- summary(regfit.full)
which.min(full.sum$bic) # 3
which.max(full.sum$adjr2) # 7
which.min(full.sum$cp) # 3
multiplot(plotCriteria(regfit.full), cols = 2) # generate plots for all 3 criteria
coefficients(regfit.full, id = 3)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## -2.0296061 -1.0795538 0.9084569
## poly(x, 10, raw = T)3
## 2.0204363
coefficients(regfit.full, id = 7)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## -2.2068250 -1.1092516 3.2758848
## poly(x, 10, raw = T)3 poly(x, 10, raw = T)4 poly(x, 10, raw = T)6
## 2.0530514 -4.0243410 2.2138377
## poly(x, 10, raw = T)8 poly(x, 10, raw = T)10
## -0.4870719 0.0373538
# Forward stepwise selection
fwd.fit <- regsubsets(y ~ poly(x, 10, raw = TRUE), method = "forward",
                      data = d, nvmax = 10)
fwd.sum <- summary(fwd.fit)
which.min(fwd.sum$bic) # 3
which.max(fwd.sum$adjr2) # 4
which.min(fwd.sum$cp) # 3
# Backward stepwise selection
bkwd.fit <- regsubsets(y ~ poly(x, 10, raw = TRUE), method = "backward",
                       data = d, nvmax = 10)
bkwd.sum <- summary(bkwd.fit)
which.min(bkwd.sum$bic) # 3
which.max(bkwd.sum$adjr2) # 7
which.min(bkwd.sum$cp) # 3
# Plot the three criteria for both search directions side by side
fwd_plots <- plotCriteria(fwd.fit, col = "red")
bkwd_plots <- plotCriteria(bkwd.fit, col = "blue")
fwd_bkwd <- append(fwd_plots, bkwd_plots)
multiplot(fwd_bkwd, cols = 2)
# Inspect the selected models. Use coef() for consistency with the
# coef() calls later in this file (coefficients() is an identical alias).
coef(fwd.fit, id = 3)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## -2.0296061 -1.0795538 0.9084569
## poly(x, 10, raw = T)3
## 2.0204363
coef(bkwd.fit, id = 3)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## -2.0296061 -1.0795538 0.9084569
## poly(x, 10, raw = T)3
## 2.0204363
coef(bkwd.fit, id = 7) # same predictors chosen as in best subset
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## -2.2068250 -1.1092516 3.2758848
## poly(x, 10, raw = T)3 poly(x, 10, raw = T)4 poly(x, 10, raw = T)6
## 2.0530514 -4.0243410 2.2138377
## poly(x, 10, raw = T)8 poly(x, 10, raw = T)10
## -0.4870719 0.0373538
#### f. We generate a response vector $Y$ according to the model
# Lasso on the degree-10 polynomial design matrix (glmnet). Drop the
# intercept column from model.matrix() since glmnet fits its own.
x_mat <- model.matrix(y ~ poly(x, 10, raw = TRUE), data = d)[, -1]
y <- d$y
cv.lasso <- cv.glmnet(x = x_mat, y = y, alpha = 1)
plot(cv.lasso) # lambda vs. MSE
lambda_star <- cv.lasso$lambda.min # find optimal lambda via CV
predict(cv.lasso, type = "coefficients", s = lambda_star) # coefficient estimates
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.955366728
## poly(x, 10, raw = T)1 -0.963705978
## poly(x, 10, raw = T)2 0.744397361
## poly(x, 10, raw = T)3 1.936282462
## poly(x, 10, raw = T)4 .
## poly(x, 10, raw = T)5 0.013011465
## poly(x, 10, raw = T)6 0.009279842
## poly(x, 10, raw = T)7 .
## poly(x, 10, raw = T)8 .
## poly(x, 10, raw = T)9 .
## poly(x, 10, raw = T)10 .
\[ Y = \beta_0 + \beta_7 X^7 + \epsilon, \quad \beta_0 = -2, \quad \beta_7 = -1 \]
## best subset selection
# beta0 = -2, beta7 = -1
# Simulate from Y = beta0 + beta7 X^7 + eps with beta0 = -2, beta7 = -1
# (reusing beta[1], beta[2], x, and eps from above).
y0 <- beta[1] + beta[2] * x^7 + eps
d0 <- data.frame(y = y0, x = x)
# Refer to the `y` column of d0 in the formula rather than relying on
# regsubsets finding the global `y0` (same values, cleaner scoping).
bss <- regsubsets(y ~ poly(x, 10, raw = TRUE),
                  data = d0, method = "exhaustive", nvmax = 10)
bss_sum <- summary(bss)
which.min(bss_sum$cp) # 3
which.min(bss_sum$bic) # 1
which.max(bss_sum$adjr2) # 4
# Inspect the model each criterion picked; the printed outputs below show
# which polynomial degrees enter each model. (The seed reset here does not
# affect coef(), which is deterministic.)
set.seed(123)
coef(bss, id = 3) # Cp model: degrees 2, 6, 7 (see output below)
## (Intercept) poly(x, 10, raw = TRUE)2 poly(x, 10, raw = TRUE)6
## -1.958221056 -0.257807002 0.009422817
## poly(x, 10, raw = TRUE)7
## -0.999773816
coef(bss, id = 1) # bic model: X^7, close to approximation
## (Intercept) poly(x, 10, raw = TRUE)7
## -2.106749 -1.000296
coef(bss, id = 4) # adj-R^2 model: degrees 4, 6, 7, 8 (see output below)
## (Intercept) poly(x, 10, raw = TRUE)4 poly(x, 10, raw = TRUE)6
## -1.96762983 -0.41711570 0.18075304
## poly(x, 10, raw = TRUE)7 poly(x, 10, raw = TRUE)8
## -1.00062525 -0.01966157
## lasso
mat0 <- model.matrix(y ~ poly(x, 10, raw = TRUE), data = d0)[, -1]
lasso0 <- cv.glmnet(mat0, y0, alpha = 1)
lambda0_star <- lasso0$lambda.min
# BUG FIX: previously passed `s = lambda_star` — the CV-selected lambda
# from the FIRST lasso fit — instead of this model's own lambda0_star.
predict(lasso0, type = "coefficients", s = lambda0_star)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.1721173
## poly(x, 10, raw = TRUE)1 .
## poly(x, 10, raw = TRUE)2 .
## poly(x, 10, raw = TRUE)3 .
## poly(x, 10, raw = TRUE)4 .
## poly(x, 10, raw = TRUE)5 .
## poly(x, 10, raw = TRUE)6 .
## poly(x, 10, raw = TRUE)7 -0.9760871
## poly(x, 10, raw = TRUE)8 .
## poly(x, 10, raw = TRUE)9 .
## poly(x, 10, raw = TRUE)10 .