查看全集:💎Quantopia量化分析56讲
过拟合(Overfitting)指模型在训练数据上表现优异,但在新数据上表现显著下降的现象。其本质是模型过度捕捉了训练数据中的噪声和偶然模式,而非真实的数据生成规律。
核心公式揭示本质:
$$y = f(x) + \varepsilon$$
其中 $f(x)$ 代表真实的数据生成规律,$\varepsilon$ 代表噪声项。过拟合模型实际上是在拟合 $f(x) + \varepsilon$ 的复合结构,而不仅仅是 $f(x)$ 本身。
过拟合的典型特征:
import numpy as np
import matplotlib.pyplot as plt
# Draw 20 noisy observations from a known quadratic; the seed is fixed so the
# figure is reproducible.
np.random.seed(42)
x = np.linspace(0, 2, 20)
y_true = 2 * x**2 - 3*x + 1
y = y_true + np.random.normal(0, 0.5, len(x))

# Fit one polynomial per degree to the same noisy sample.
degrees = [1, 2, 10]
models = []
for d in degrees:
    coefficients = np.polyfit(x, y, d)
    models.append(np.poly1d(coefficients))

# Evaluate each fit on a grid that extends beyond the sampled range [0, 2]
# so that extrapolation behaviour is visible too.
xx = np.linspace(-0.5, 2.5, 200)

# Legend suffix per degree: degree 10 is tagged "过拟合" (overfit), degree 2
# "适拟合" (good fit), anything else "欠拟合" (underfit).
fit_quality = {10: "过拟合", 2: "适拟合"}

plt.figure(figsize=(12,6))
plt.scatter(x, y, c='red', label='Observed Data')
for d, model in zip(degrees, models):
    verdict = fit_quality.get(d, "欠拟合")
    plt.plot(xx, model(xx), label=f'Degree {d} ({verdict})')
plt.ylim(-2, 5)
plt.legend()
plt.xlabel('X')
plt.ylabel('Y')
plt.title('模型复杂度对拟合效果的影响')
关键洞察点
数据获取与处理
import yfinance as yf
import pandas as pd
# Download daily closing prices for the assets used in the regression examples.
symbols = ['PG', 'PEP', 'MCD', 'ATHN', 'DOW']
start = '2013-01-01'
end = '2015-01-01'
data = yf.download(symbols, start=start, end=end)['Close']
# Select the columns by ticker label rather than overwriting `data.columns`.
# A positional rename silently attaches the wrong ticker to every price series
# whenever the download returns the columns in a different order than
# `symbols` (yfinance typically returns them sorted alphabetically — confirm
# for the installed version). Label selection keeps each series under its own
# ticker, and a ticker missing from the download raises KeyError instead of
# being mislabelled.
data = data[symbols]
模型对比分析
# `sm` is needed for sm.add_constant below; importing only OLS left it undefined.
import statsmodels.api as sm
from statsmodels.api import OLS

# Single-factor regression: explain PG's price with PEP only.
X_single = data[['PEP']]
y = data['PG']
model_single = OLS(y, sm.add_constant(X_single)).fit()

# Multi-factor regression: add regressors that the article treats as spurious.
X_multi = data[['PEP', 'MCD', 'ATHN', 'DOW']]
model_multi = OLS(y, sm.add_constant(X_multi)).fit()

# Compare in-sample fit. The value printed is the adjusted R² (rsquared_adj),
# so the label says so explicitly instead of calling it plain R².
print(f"单变量模型调整后R²: {model_single.rsquared_adj:.3f}")
print(f"多变量模型调整后R²: {model_multi.rsquared_adj:.3f}")
样本外验证
# Imported here so this snippet runs on its own; `sm` was otherwise undefined.
import statsmodels.api as sm
from sklearn.metrics import r2_score

# Download a later period that the models were not fitted on.
new_data = yf.download(symbols, start='2015-01-01', end='2017-01-01')['Close']
# Select by ticker label; overwriting `new_data.columns` positionally would
# mislabel the series whenever the download order differs from `symbols`.
new_data = new_data[symbols]

# Build the regressors for the new period and predict with the fitted models.
X_new_single = new_data[['PEP']]
X_new_multi = new_data[['PEP', 'MCD', 'ATHN', 'DOW']]
pred_single = model_single.predict(sm.add_constant(X_new_single))
pred_multi = model_multi.predict(sm.add_constant(X_new_multi))

# Out-of-sample R² of each model against the realised PG prices.
r2_single = r2_score(new_data['PG'], pred_single)
r2_multi = r2_score(new_data['PG'], pred_multi)
print("\n样本外表现:")
print(f"单变量模型R²: {r2_single:.3f}")
print(f"多变量模型R²: {r2_multi:.3f}")
窗口长度选择实验
# Download McDonald's (MCD) daily closing prices.
# Depending on the yfinance version, ['Close'] for a single ticker may be a
# one-column DataFrame rather than a Series (confirm for the installed
# version). squeeze() collapses that case to a Series and leaves a
# multi-row Series unchanged, so the Series-based strategy code works either way.
mcd = yf.download('MCD', start='2010-01-01', end='2020-01-01')['Close'].squeeze()
# 定义滚动策略函数
def rolling_strategy(series, window=30, z_entry=1, z_exit=0.5):
    """Backtest a rolling z-score mean-reversion rule.

    The z-score of each price is computed against the mean and standard
    deviation of the trailing ``window`` observations. The position then
    follows a small state machine, evaluated in this priority order:

    * z-score above ``z_entry``   -> short (-1)
    * z-score below ``-z_entry``  -> long (+1)
    * ``abs(z-score) < z_exit``   -> flat (0)
    * otherwise (including an undefined z-score) -> keep the previous position

    Args:
        series: Price series (``pandas.Series``); any index type is accepted.
        window: Number of observations in the rolling window.
        z_entry: Absolute z-score at which a position is opened.
        z_exit: Absolute z-score below which the position is closed.

    Returns:
        ``pandas.Series`` on the same index as ``series`` holding the running
        sum of per-period strategy returns (simple sum, not compounded). The
        first element is NaN because no prior return or position exists.
    """
    rolling = series.rolling(window)
    zscore = (series - rolling.mean()) / rolling.std()

    # Build the per-bar signal without positional `zscore[i]` lookups, which
    # are label lookups on integer indexes and deprecated on other indexes.
    # NaN means "no new signal". Rules are applied from lowest to highest
    # priority so that a later mask overrides an earlier one.
    signal = pd.Series(float("nan"), index=series.index)
    signal = signal.mask(zscore.abs() < z_exit, 0.0)
    signal = signal.mask(zscore < -z_entry, 1.0)
    signal = signal.mask(zscore > z_entry, -1.0)

    # Carry the last signal forward; the strategy starts flat.
    portfolio = signal.ffill().fillna(0.0)

    # A position decided on one bar earns the next bar's return.
    returns = portfolio.shift() * series.pct_change()
    return returns.cumsum()
# Run the strategy once per look-back length and collect the cumulative-return
# curves side by side, one column per window (named "<window>d").
windows = [20, 50, 100]
curves = {}
for w in windows:
    curves[f'{w}d'] = rolling_strategy(mcd, w)
results = pd.DataFrame(curves)

# Plot all curves on a single set of axes for comparison.
ax = results.plot(figsize=(12,6))
ax.set_title('不同滚动窗口长度的策略表现')
ax.set_ylabel('累计收益')
关键发现
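Lasso (L1):
$$\min_{\beta} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 + \lambda \sum_{j=1}^{p} |\beta_j|$$
Ridge (L2):
$$\min_{\beta} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 + \lambda \sum_{j=1}^{p} \beta_j^2$$
其中 $y_i$ 为实际值,$\hat{y}_i$ 为预测值,$\lambda$ 为正则化参数,$\beta_j$ 为模型参数。第一项为损失函数,第二项为正则化项。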
from sklearn.model_selection import train_test_split

# Hold-out validation: keep the first 70% of the rows for training and the
# last 30% for testing. shuffle=False preserves row order; the default
# shuffled split would mix later observations into the training set, which
# leaks future information when the rows are time-ordered (as the price data
# in this article is), and would also give a different split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# 5-split cross-validation in which every split trains on earlier rows and
# validates on the rows that follow. A plain cv=5 k-fold would also train on
# rows that come after the validation fold, which overstates performance on
# time-ordered data.
scores = cross_val_score(model, X, y, cv=TimeSeriesSplit(n_splits=5))
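AIC(赤池信息准则):$\mathrm{AIC} = 2k - 2\ln L$,其中 $k$ 为参数个数,$L$ 为似然函数值。
BIC(贝叶斯信息准则):$\mathrm{BIC} = k\ln n - 2\ln L$,其中 $n$ 为样本量;当 $n > e^2 \approx 7.4$ 时,其对参数个数的惩罚项比 AIC 更强。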
from itertools import product

# Candidate values for each strategy parameter.
param_grid = {
    'window': range(10, 101, 10),
    'z_entry': np.linspace(0.5, 2, 5),
    'z_exit': np.linspace(0.2, 0.8, 4),
}

# Slice the two evaluation periods once instead of on every grid iteration:
# training runs through the end of 2015, validation covers 2016-2018.
train_prices = mcd.loc[:'2015']
valid_prices = mcd.loc['2016':'2018']

# Exhaustive search over every parameter combination.
best_params = {}
best_score = -np.inf
for window, z_entry, z_exit in product(
    param_grid['window'], param_grid['z_entry'], param_grid['z_exit']
):
    # Final cumulative return in each period.
    train_ret = rolling_strategy(train_prices, window, z_entry, z_exit).iloc[-1]
    valid_ret = rolling_strategy(valid_prices, window, z_entry, z_exit).iloc[-1]

    # The score is the product of the two returns, so a combination that
    # loses money in BOTH periods would otherwise get a positive score and
    # could be selected as "best". Only rank combinations that are profitable
    # in both periods (this also skips NaN returns).
    if not (train_ret > 0 and valid_ret > 0):
        continue

    score = train_ret * valid_ret
    if score > best_score:
        best_score = score
        best_params = {'window': window, 'entry': z_entry, 'exit': z_exit}

# best_params stays empty when no combination is profitable in both periods.
print(f"最优参数:{best_params}")
# Walk-forward analysis: for each test year, choose the look-back window on
# the preceding three calendar years only, then evaluate it on the test year.
params_history = []
candidate_windows = range(10, 101, 10)

for year in range(2010, 2020):
    # Train strictly BEFORE the test year. Slicing up to str(year) would
    # include the test year itself and leak future data into the optimisation.
    train_data = mcd.loc[str(year - 3):str(year - 1)]
    # Evaluate on the single test year only, so consecutive tests do not overlap.
    test_data = mcd.loc[str(year)]

    # The earliest years have no prior history to optimise on; skip them.
    if train_data.empty or test_data.empty:
        continue

    # Simplified optimisation: pick the window with the highest final
    # cumulative return over the training period.
    best_window = max(
        candidate_windows,
        key=lambda w: rolling_strategy(train_data, w).iloc[-1],
    )

    # Out-of-sample performance of the chosen window.
    test_ret = rolling_strategy(test_data, best_window).iloc[-1]
    params_history.append({'year': year, 'window': best_window, 'return': test_ret})

# Inspect how stable the selected window is from year to year.
pd.DataFrame(params_history).plot(x='year', y='window', kind='bar')
from sklearn.model_selection import learning_curve

# Score the estimator with 5-fold cross-validation (R² metric) at a series of
# training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=model, X=X, y=y, cv=5, scoring='r2'
)

# Average the scores across axis 1 for each training size and draw the
# training and test curves on the same axes.
for curve_label, fold_scores in (('Train', train_scores), ('Test', test_scores)):
    plt.plot(train_sizes, fold_scores.mean(axis=1), label=curve_label)
plt.xlabel('Training Size')
plt.ylabel('R² Score')
plt.legend()
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the held-out data, using 30
# repeats.
result = permutation_importance(model, X_test, y_test, n_repeats=30)

# Order the features by ascending mean importance.
sorted_idx = np.argsort(result.importances_mean)
ranked_importances = result.importances[sorted_idx].T
ranked_labels = X.columns[sorted_idx]

# Horizontal box plot of the repeated importance scores, labelled with the
# corresponding column names of X.
plt.boxplot(ranked_importances, vert=False, labels=ranked_labels)
plt.title("Permutation Importance")
陷阱1:P值崇拜
陷阱2:滚动优化幻觉
陷阱3:幸存者偏差