library(ISLR)    # Default and Weekly data sets
library(boot)    # cv.glm()
library(ggplot2)

glm.default = glm(default ~ income + balance, data = Default, family = binomial)
summary(glm.default) # income and balance are both significant
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
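As a side note (not part of the original solution), the logistic coefficients can be read on the odds scale; a minimal sketch:

# exp() of a coefficient gives an odds ratio; per additional $100 of
# balance, the odds of default are multiplied by:
exp(coef(glm.default)["balance"] * 100)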
default.probs = predict(glm.default, type = "response") # fitted probabilities on the training data
glm.pred = rep("No", dim(Default)[1])
glm.pred[default.probs > 0.5] = "Yes" # classify as Yes when P(default) > 0.5
conf_mat = table(glm.pred, Default$default)
train_acc = round(mean(glm.pred == Default$default), 4)
Fitting the logistic regression model on the entire dataset gives a training accuracy of 0.9737.
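Equivalently, the accuracy can be read off the confusion matrix computed above, since its diagonal holds the correctly classified observations:

conf_mat # rows: predicted class, columns: actual class
sum(diag(conf_mat)) / sum(conf_mat) # 0.9737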
n_obs = dim(Default)[1]
train_logistic = function() {
  # randomly split the observations in half: training and validation
  train = sample(n_obs, n_obs / 2)
  dft_train_fit = glm(default ~ income + balance, data = Default,
                      subset = train, family = binomial)
  test_default = Default[-train, ]
  test.probs = predict(dft_train_fit, test_default, type = "response")
  # the validation set also has n_obs / 2 = length(train) observations
  def_pred = rep("No", length(train))
  def_pred[test.probs > 0.5] = "Yes"
  # validation-set error rate
  round(mean(def_pred != Default$default[-train]), 4)
}
set.seed(123)
e1 = train_logistic() # 0.0286
# (c) use different train/validation splits
set.seed(234)
e2 = train_logistic() # 0.0256
set.seed(345)
e3 = train_logistic() # 0.027
Using the seeds 123, 234, and 345, we randomly split the data into training and validation sets and fit a logistic regression model on each training half. The resulting validation errors were 0.0286, 0.0256, and 0.027.
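Since any single split gives a noisy estimate, one way (not part of the original analysis) to gauge the variability of the validation-set approach is to repeat the split many times and summarize the errors; the exact summary depends on the seed:

set.seed(1)
errs = replicate(100, train_logistic()) # 100 random splits
summary(errs)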
train_logistic_1 = function() {
  # same as train_logistic(), with the student dummy variable added
  train = sample(n_obs, n_obs / 2)
  dft_train_fit = glm(default ~ income + balance + student, data = Default,
                      subset = train, family = binomial)
  test_default = Default[-train, ]
  test.probs = predict(dft_train_fit, test_default, type = "response")
  def_pred = rep("No", length(train))
  def_pred[test.probs > 0.5] = "Yes"
  round(mean(def_pred != Default$default[-train]), 4)
}
set.seed(123)
err1 = train_logistic_1() # 0.0284
set.seed(234)
err2 = train_logistic_1() # 0.0266
set.seed(345)
err3 = train_logistic_1() # 0.0274
Using the same seeds as before, the validation errors were 0.0284, 0.0266, and 0.0274. Adding the student dummy variable did not reduce the test error.
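Because the same seed is set immediately before each call, the two functions draw identical splits, so the comparison is paired rather than confounded by the split. A small sketch tabulating this:

# refit both models on identical splits (same seed before each call)
seeds = c(123, 234, 345)
sapply(seeds, function(s) {
  set.seed(s); without_student = train_logistic()
  set.seed(s); with_student = train_logistic_1()
  c(without = without_student, with = with_student)
})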
# (a) logistic regression model that predicts Direction using Lag1, Lag2
glm_dir0 = glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)
summary(glm_dir0)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.623 -1.261 1.001 1.083 1.506
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
# (b) predict direction using all but first obs
glm_dir1 = glm(Direction ~ Lag1 + Lag2, data = Weekly[-1,], family = binomial)
summary(glm_dir1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly[-1,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6258 -1.2617 0.9999 1.0819 1.5071
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
# (c) use the model from part (b) to predict the direction of the first obs
ob1.up = predict(glm_dir1, Weekly[1,], type = "response") > 0.5 # TRUE -> predict "Up"
Weekly[1,]$Direction # actually "Down" -> incorrectly classified
## [1] Down
## Levels: Down Up
Since the posterior probability for the first observation was greater than 0.5, we predict that the Direction is Up; the true Direction, however, was Down, so our prediction was incorrect.
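For reference, the fitted probability itself can be inspected (the code above keeps only its comparison against the 0.5 cutoff):

predict(glm_dir1, Weekly[1,], type = "response") # posterior probability of Up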
n = dim(Weekly)[1]
err = rep(0, n)
for (i in 1:n) {
  train = Weekly[-i, ] # leave out the i-th observation
  # fit the model on the remaining n - 1 observations
  train.fit = glm(Direction ~ Lag1 + Lag2, data = train, family = binomial)
  # predict the held-out i-th observation
  pred_up = predict(train.fit, Weekly[i, ], type = "response") > 0.5
  dir = if (pred_up) "Up" else "Down"
  # store an indicator for a misclassification
  err[i] = as.numeric(dir != Weekly[i, ]$Direction)
}
total_errors = sum(err) # 490
avg_error = round(mean(err), 4) # 0.45, the LOOCV estimate of the test error
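The hand-rolled loop above can be cross-checked with cv.glm (from boot, loaded at the top), using the 0/1 misclassification cost function given in the cv.glm documentation; this should reproduce the same LOOCV estimate, though it is slow since the model is refit n times:

# cost(r, pi): r is the observed 0/1 response, pi the fitted probability
cost = function(r, pi = 0) mean(abs(r - pi) > 0.5)
full_fit = glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)
cv.glm(Weekly, full_fit, cost)$delta[1] # should agree with avg_error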
\[ Y = X - 2X^2 + \varepsilon\]
set.seed(1)
y = rnorm(100)
x = rnorm(100)
y = x - 2 * x^2 + rnorm(100) # n = 100 observations, p = 2 (X and X^2)
d = data.frame(x, y)
ggplot(d, aes(x, y)) + geom_point() # quadratic shape, with x roughly in [-2, 2]
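To see how the simulated points relate to the true mean function \(E[Y \mid X] = X - 2X^2\), we can overlay it on the scatterplot (a sketch, not part of the original):

ggplot(d, aes(x, y)) + geom_point() +
  stat_function(fun = function(x) x - 2 * x^2, linetype = "dashed")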
set.seed(11)
cv_err = rep(0, 4)
for (i in 1:4) {
  fit_order_i = glm(y ~ poly(x, i), data = d)
  # cv.glm() with no K argument performs leave-one-out CV
  cv_err[i] = cv.glm(d, fit_order_i)$delta[1]
}
cv_err = round(cv_err, 4)
The LOOCV errors for the four models, from order 1 to order 4, were 5.891, 1.0866, 1.1026, and 1.1148. Note that the seed is irrelevant here: LOOCV holds each observation out exactly once, so the procedure involves no randomness, and rerunning it below with a different seed reproduces the same errors.
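A quick visual of the LOOCV errors makes the drop from the linear to the quadratic model obvious:

ggplot(data.frame(degree = 1:4, err = cv_err), aes(degree, err)) +
  geom_point() + geom_line()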
set.seed(111)
cv_err1 = rep(0, 4)
for (i in 1:4) {
  fit_order_i = glm(y ~ poly(x, i), data = d)
  # uncomment the line below to see the coefficient summaries
  # print(summary(fit_order_i)$coefficients)
  cv_err1[i] = cv.glm(d, fit_order_i)$delta[1]
}
round(cv_err1, 4)
## [1] 5.8910 1.0866 1.1026 1.1148
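As a sanity check (not in the original), the two runs can be compared directly; they agree exactly, confirming that LOOCV is deterministic:

all(round(cv_err1, 4) == cv_err) # TRUE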
summary(fit_order_i) # order-4 model: only the linear and quadratic terms are significant, consistent with LOOCV picking the quadratic
##
## Call:
## glm(formula = y ~ poly(x, i), data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8914 -0.5244 0.0749 0.5932 2.7796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1041 -17.549 <2e-16 ***
## poly(x, i)1 2.3164 1.0415 2.224 0.0285 *
## poly(x, i)2 -21.0586 1.0415 -20.220 <2e-16 ***
## poly(x, i)3 -0.3048 1.0415 -0.293 0.7704
## poly(x, i)4 -0.4926 1.0415 -0.473 0.6373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.084654)
##
## Null deviance: 552.21 on 99 degrees of freedom
## Residual deviance: 103.04 on 95 degrees of freedom
## AIC: 298.78
##
## Number of Fisher Scoring iterations: 2