【机器学习】阿里云天池竞赛——工业蒸汽量预测(5)

机器学习经典赛题:工业蒸汽量预测(5):模型验证(赛题实战)

5.3 模型验证与调参实战

5.3.1 模型过拟合与欠拟合

  1. 基础代码
    导入工具包,用于模型验证和数据处理。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression   #从sklearn引入线性模型
from sklearn.neighbors import KNeighborsRegressor   #k近邻回归模型
from sklearn.tree import DecisionTreeRegressor     #决策树回归模型
from sklearn.ensemble import RandomForestRegressor    #随机森林回归模型
from sklearn.svm import SVR    #支持向量机
from lightgbm import LGBMRegressor   #LightGBM回归模型

from sklearn.model_selection import train_test_split #切分数据
from sklearn.metrics import mean_squared_error  #评价指标
from sklearn.linear_model import SGDRegressor

读取数据:

# Load the tab-separated training and test sets from the data directory.
train_data_file = '../data/zhengqi_train.txt'
test_data_file = '../data/zhengqi_test.txt'
train_data, test_data = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in (train_data_file, test_data_file)
)

对数据进行归一化处理:

from sklearn import preprocessing

# Min-max scaling: the scaler is fitted on the training features only and
# then applied to both the training and the test features.
features_columns = [col for col in train_data.columns if col != 'target']
min_max_scaler = preprocessing.MinMaxScaler().fit(train_data[features_columns])

train_data_scaler = pd.DataFrame(
    min_max_scaler.transform(train_data[features_columns]),
    columns=features_columns,
)
test_data_scaler = pd.DataFrame(
    min_max_scaler.transform(test_data[features_columns]),
    columns=features_columns,
)

# Re-attach the target (copied unchanged from train_data) as the last
# column of the scaled training frame.
train_data_scaler['target'] = train_data['target']

使用PCA进行特征降维:

#PCA方法降维
from sklearn.decomposition import PCA

# Keep 16 principal components. The PCA is fitted on the scaled training
# features (every column except the trailing 'target') and the same
# projection is then applied to the scaled test features.
pca = PCA(n_components=16)
new_train_pca_16 = pd.DataFrame(
    pca.fit_transform(train_data_scaler.iloc[:, :-1])
)
new_test_pca_16 = pd.DataFrame(pca.transform(test_data_scaler))

# Carry the target over so features and label live in one frame.
new_train_pca_16['target'] = train_data_scaler['target']

切分数据集,分为训练集和验证集:

# Separate the 16 PCA components (features) from the target.
new_train_pca_16 = new_train_pca_16.fillna(0)
# Select only the component columns, i.e. the columns the test frame has.
# Taking every column of new_train_pca_16 would also pass 'target' to the
# models as a feature and leak the label into training.
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']

# Hold out 20% of the rows for validation; the other 80% are for training.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)
  2. 欠拟合
    模型欠拟合的情况:
# Under-fitting example: a linear SGD regressor on the raw features with a
# small iteration budget and a loose stopping tolerance.
clf = SGDRegressor(max_iter=500, tol=1e-2)
clf.fit(train_data, train_target)

# Score the fitted model on both splits before reporting.
train_pred = clf.predict(train_data)
test_pred = clf.predict(test_data)
score_train = mean_squared_error(train_target, train_pred)
score_test = mean_squared_error(test_target, test_pred)

print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)

输出结果:

SGDRegressor train MSE: 0.0009160671723394329
SGDRegressor test MSE: 0.000981634366517726
  3. 过拟合
    模型过拟合的情况:
from sklearn.preprocessing import PolynomialFeatures

# Over-fitting example: expand the features with degree-5 polynomial terms
# before fitting the linear SGD regressor.
poly = PolynomialFeatures(5)
train_data_poly = poly.fit_transform(train_data)
# Reuse the transformer fitted on the training split: the validation split
# is only transformed, never used to fit a preprocessing step.
test_data_poly = poly.transform(test_data)

clf = SGDRegressor(max_iter=1000, tol=1e-3)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)

输出结果:

SGDRegressor train MSE: 1.0898632608641039e+24
SGDRegressor test MSE: 1.6982104334110243e+24
  4. 正常拟合
    模型正常拟合的情况:
from sklearn.preprocessing import PolynomialFeatures

# Well-fitted example: degree-3 polynomial features with a linear SGD
# regressor.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Reuse the transformer fitted on the training split: the validation split
# is only transformed, never used to fit a preprocessing step.
test_data_poly = poly.transform(test_data)

clf = SGDRegressor(max_iter=1000, tol=1e-3)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)

输出结果:

SGDRegressor train MSE: 0.001055038113307342
SGDRegressor test MSE: 0.001233821154013793

5.3.2 模型正则化

  1. L2范数正则化
    采用L2范数正则化处理模型:
from sklearn.preprocessing import PolynomialFeatures

# L2 (ridge-style) regularisation on degree-3 polynomial features.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Reuse the transformer fitted on the training split: the validation split
# is only transformed, never used to fit a preprocessing step.
test_data_poly = poly.transform(test_data)

# scikit-learn documents the penalty names in lower case ('l2', 'l1',
# 'elasticnet'); releases that validate parameters reject 'L2'.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)

输出结果:

SGDRegressor train MSE: 0.0010417279350570649
SGDRegressor test MSE: 0.0012885335267450496
  2. L1范数正则化
    使用L1范数正则化处理模型:
from sklearn.preprocessing import PolynomialFeatures

# L1 (lasso-style) regularisation on degree-3 polynomial features.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Reuse the transformer fitted on the training split: the validation split
# is only transformed, never used to fit a preprocessing step.
test_data_poly = poly.transform(test_data)

# scikit-learn documents the penalty names in lower case ('l2', 'l1',
# 'elasticnet'); releases that validate parameters reject 'L1'.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.0001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)
  3. ElasticNet联合L1和L2范数加权正则化
    使用ElasticNet正则化处理模型:
from sklearn.preprocessing import PolynomialFeatures

# ElasticNet regularisation (weighted mix of L1 and L2, l1_ratio=0.9) on
# degree-3 polynomial features.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Reuse the transformer fitted on the training split: the validation split
# is only transformed, never used to fit a preprocessing step.
test_data_poly = poly.transform(test_data)

clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='elasticnet', l1_ratio=0.9, alpha=.0001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)

输出结果:

SGDRegressor train MSE: 0.0009336657359501396
SGDRegressor test MSE: 0.0011472413880708912

5.3.3 模型交叉验证

  1. 简单交叉验证
    使用简单交叉验证方法对模型进行交叉验证并切分数据集,其中训练数据为80%,验证数据为20%。
# Simple hold-out validation: 80% of the rows train the model and the
# remaining 20% validate it.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

clf = SGDRegressor(max_iter=1000, tol=1e-3)
clf.fit(train_data, train_target)

# Score the fitted model on both splits before reporting.
train_pred = clf.predict(train_data)
test_pred = clf.predict(test_data)
score_train = mean_squared_error(train_target, train_pred)
score_test = mean_squared_error(test_target, test_pred)

print("SGDRegressor train MSE:", score_train)
print("SGDRegressor test MSE:", score_test)

输出结果:

SGDRegressor train MSE: 0.0008805332436144762
SGDRegressor test MSE: 0.0009742019879530982
  2. K折交叉验证
    使用K折交叉验证方法对模型进行交叉验证:
from sklearn.model_selection import KFold

# 5-fold cross-validation: each fold serves once as the validation part.
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # split() yields positional row indices, so rows are selected by
    # position (.values / .iloc) rather than by index label.
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target.iloc[train_index], target.iloc[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "折", "SGDRegressor train MSE:", score_train)
    print(k, "折", "SGDRegressor test MSE:", score_test, "\n")

运行结果:

0 折 SGDRegressor train MSE: 0.0009220591043969376
0 折 SGDRegressor test MSE: 0.0005310778285509602 

1 折 SGDRegressor train MSE: 0.0008800234249554588
1 折 SGDRegressor test MSE: 0.001259968578517734 

2 折 SGDRegressor train MSE: 0.0008211966326584619
2 折 SGDRegressor test MSE: 0.0009107924815045727 

3 折 SGDRegressor train MSE: 0.0009456226800574005
3 折 SGDRegressor test MSE: 0.0009853654948970164 

4 折 SGDRegressor train MSE: 0.0009055864978733344
4 折 SGDRegressor test MSE: 0.0011048017616418345 
  3. 留一法交叉验证
    使用留一法交叉验证对模型进行交叉验证:
from sklearn.model_selection import LeaveOneOut

from itertools import islice

loo = LeaveOneOut()
# Leave-one-out yields one split per sample; only the first 10 splits are
# run here to keep the demonstration short.
n_preview_splits = 10
for k, (train_index, test_index) in enumerate(islice(loo.split(train), n_preview_splits)):
    # split() yields positional row indices, so rows are selected by
    # position (.values / .iloc) rather than by index label.
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target.iloc[train_index], target.iloc[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "个", "SGDRegressor train MSE:", score_train)
    print(k, "个", "SGDRegressor test MSE:", score_test, "\n")

运行结果:

0 个 SGDRegressor train MSE: 0.0007447292899787238
0 个 SGDRegressor test MSE: 5.2599252754080336e-05 
...
9 个 SGDRegressor train MSE: 0.0007887842650802939
9 个 SGDRegressor test MSE: 0.0005482147049830396 
  4. 留P法交叉验证
    使用留P法交叉验证对模型进行交叉验证:
from sklearn.model_selection import LeavePOut

from itertools import islice

lpo = LeavePOut(p=10)
# Leave-P-out enumerates every combination of 10 held-out rows; only the
# first 10 splits are run here to keep the demonstration short.
n_preview_splits = 10
for k, (train_index, test_index) in enumerate(islice(lpo.split(train), n_preview_splits)):
    # split() yields positional row indices, so rows are selected by
    # position (.values / .iloc) rather than by index label.
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target.iloc[train_index], target.iloc[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "10个", "SGDRegressor train MSE:", score_train)
    print(k, "10个", "SGDRegressor test MSE:", score_test, "\n")

运行结果:

0 10个 SGDRegressor train MSE: 0.0007482296929583594
0 10个 SGDRegressor test MSE: 0.00029421354601069736 
...
9 10个 SGDRegressor train MSE: 0.0008062741989626481
9 10个 SGDRegressor test MSE: 0.00023679186419471267 

5.3.4 模型超参空间及调参

  1. 穷举网格搜索
    使用数据训练随机森林模型,采用网格搜索方法调参:
#使用数据训练随机森林模型,采用穷举网格搜索方法调参
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# The cross-validation loops earlier in this script rebind
# train_data/test_data to the arrays of their last split, so rebuild the
# 80/20 hold-out split before tuning.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

# Keep the estimator instance under its own name so that the
# RandomForestRegressor class is not shadowed.
rf_reg = RandomForestRegressor()
parameters = {'n_estimators': [50, 100, 200], 'max_depth': [1, 2, 3]}

# Exhaustive search over the parameter grid with 5-fold cross-validation.
clf = GridSearchCV(rf_reg, parameters, cv=5)
clf.fit(train_data, train_target)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor GridSearchCV test MSE:", score_test)
# cv_results_ holds timing and validation-score information per candidate.
print(sorted(clf.cv_results_.keys()))

运行结果:

RandomForestRegressor GridSearchCV test MSE: 0.012637546435789914
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_max_depth', 'param_n_estimators', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
  2. 随机参数优化
    使用数据训练随机森林模型,采用随机参数优化方法调参:
#使用数据训练随机森林模型,采用随即参数优化方法调参
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# The cross-validation loops earlier in this script rebind
# train_data/test_data to the arrays of their last split, so rebuild the
# 80/20 hold-out split before tuning.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

# Keep the estimator instance under its own name so that the
# RandomForestRegressor class is not shadowed.
rf_reg = RandomForestRegressor()
parameters = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [1, 2, 3, 4, 5],
}

# Randomised search: samples candidates from the parameter lists instead
# of trying every combination; each candidate is scored with 5-fold CV.
clf = RandomizedSearchCV(rf_reg, parameters, cv=5)
clf.fit(train_data, train_target)
print('Best parameters found are:', clf.best_params_)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor RandomizedSearchCV GridSearchCV test MSE:", score_test)
# cv_results_ holds timing and validation-score information per candidate.
print(sorted(clf.cv_results_.keys()))

运行结果:

Best parameters found are: {'n_estimators': 300, 'max_depth': 5}
RandomForestRegressor RandomizedSearchCV GridSearchCV test MSE: 8.52446793094484e-05
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_max_depth', 'param_n_estimators', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
  3. LGB调参
    使用数据训练LGB模型,采用网格搜索方法调参:
import lightgbm as lgb

# The cross-validation loops earlier in this script rebind
# train_data/test_data to the arrays of their last split, so rebuild the
# 80/20 hold-out split before tuning.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

lgb_base = lgb.LGBMRegressor(num_leaves=31)
parameters = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
# Actually run the grid search over `parameters`; fitting the bare
# regressor would leave the grid unused.
clf = GridSearchCV(lgb_base, parameters, cv=5)
clf.fit(train_data, train_target)
print('Best parameters found by grid search are:', clf.best_params_)

score_test = mean_squared_error(test_target, clf.predict(test_data))
print("LGBMRegressor GridSearchCV test MSE:", score_test)
# LGBMRegressor GridSearchCV test MSE: 1.560809562908417e-05
  4. LGB线下验证
    下面给出对数据建模、5折交叉验证、划分数据、对LGB模型进行训练、计算MSE评价性能等流程的代码:
# Load the raw (unscaled) tab-separated training and test sets.
train_data2, test_data2 = (
    pd.read_csv(path, sep='\t')
    for path in ('../data/zhengqi_train.txt', '../data/zhengqi_test.txt')
)

# Features are the columns shared with the test set; the label is 'target'.
train_data2_target = train_data2['target'].values
train_data2_f = train_data2[test_data2.columns].values

# lgb 模型
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np

# 5-fold cross-validation with shuffling and a fixed seed.
Folds = 5
kf = KFold(n_splits=Folds, random_state=2022, shuffle=True)
# Per-fold training and validation MSE.
MSE_DICT = {
    'train_mse': [],
    'test_mse': []
}

# Offline training and validation, one model per fold.
for i, (train_index, test_index) in enumerate(kf.split(train_data2_f)):
    # Gradient-boosted tree regressor; n_estimators is only an upper bound
    # because training stops early (see the callbacks below).
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2022,
        objective='regression',
    )

    # Split this fold into its training and validation parts.
    X_train_KFold, X_test_KFold = train_data2_f[train_index], train_data2_f[test_index]
    y_train_KFold, y_test_KFold = train_data2_target[train_index], train_data2_target[test_index]

    # Early stopping and periodic logging are passed as callbacks: the
    # early_stopping_rounds / verbose keyword arguments of fit() were
    # removed in LightGBM 4.0.
    lgb_reg.fit(
        X_train_KFold, y_train_KFold,
        eval_set=[(X_train_KFold, y_train_KFold), (X_test_KFold, y_test_KFold)],
        eval_names=['Train', 'Test'],
        eval_metric='MSE',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=50),
        ],
    )

    # Predict both parts with the best iteration found by early stopping.
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold, num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i + 1))
    # mean_squared_error takes (y_true, y_pred).
    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

# Summary: per-fold values followed by their mean.
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

5.3.5 学习曲线和验证曲线

  1. 学习曲线
    绘制数据的学习曲线,使用模型SGDRegressor:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import learning_curve

# Reload the raw tab-separated training and test sets for the curve plots.
train_data_file = "../data/zhengqi_train.txt"
test_data_file = "../data/zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

# No figure is created here: plot_learning_curve opens its own figure, so
# a plt.figure(...) call at this point would only leave an empty one behind.


def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the learning curve of ``estimator`` on ``(x, y)``.

    A new figure is opened, ``learning_curve`` is run, and the mean training
    and cross-validation scores are drawn against the training-set size,
    each with a shaded band of one standard deviation across the splits.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        title: Title of the figure.
        x: Feature matrix.
        y: Target values.
        ylim: Optional ``(low, high)`` pair for the y axis.
        cv: Cross-validation strategy forwarded to ``learning_curve``.
        n_jobs: Number of parallel jobs forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate over the CV splits (axis 1); one value per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label="training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label="cross-validation score")
    plt.legend(loc="best")
    return plt


# Features are the columns shared with the test set; the label is 'target'.
x = train_data[test_data.columns].values
y = train_data['target'].values
# The title names the estimator that is actually plotted.
title = "SGDRegressor"
# 100 random 80/20 shuffles give a smooth mean/std estimate per train size.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = SGDRegressor()
plot_learning_curve(estimator, title, x, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1).show()

学习曲线
2. 验证曲线

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import validation_curve

# Reload the raw tab-separated training and test sets.
train_data_file = "../data/zhengqi_train.txt"
test_data_file = "../data/zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
# Features are the columns shared with the test set; the label is 'target'.
x = train_data[test_data.columns].values
y = train_data['target'].values

# Sweep the regularisation strength alpha of an L1-penalised SGD regressor
# and score every value with 10-fold cross-validated R^2.
param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
# scikit-learn documents the penalty names in lower case ('l1'); releases
# that validate parameters reject 'L1'.
train_scores, test_scores = validation_curve(
    SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1'),
    x, y, param_name='alpha', param_range=param_range,
    cv=10, scoring='r2', n_jobs=1)

# Aggregate over the CV folds (axis 1); one value per alpha.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SGDRegressor")
plt.xlabel("alpha")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range, train_scores_mean, label="Training scores", color='r')
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color='r')
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color='g')
# The green band must come from the cross-validation scores, not from the
# training scores.
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color='g')
plt.legend(loc="best")
plt.show()

验证曲线

参考资料

[1] 《阿里云天池大赛赛题解析——机器学习篇》