from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
import warnings
def sigmoid(x):
    """Logistic function: maps raw boosting margins to probabilities."""
    return 1 / (1 + np.exp(-x))
def log_reg(y_hat, y):
    """Custom binary log-loss objective for xgb.train: returns (gradient, hessian)."""
    p = sigmoid(y_hat)
    g = p - y.get_label()
    h = p * (1.0 - p)
    return g, h
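# Derivation: for the logistic loss L = -[y*log(p) + (1-y)*log(1-p)] with
# p = sigmoid(y_hat), differentiating with respect to the raw margin y_hat gives
# dL/dy_hat = p - y and d^2L/dy_hat^2 = p * (1 - p), which are exactly the
# gradient and hessian that log_reg hands back to xgb.train.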
def error_rate(y_hat, y):
    """Custom eval metric. With a custom objective, y_hat are raw margins, so the
    decision threshold is 0.0 (equivalent to sigmoid(y_hat) > 0.5)."""
    return 'error', float(np.sum(y.get_label() != (y_hat > 0.0))) / len(y_hat)
def load_data(path):
    df = pd.read_csv(path)
    # Encode Sex as integer codes and one-hot encode Embarked (columns Embarked_C/Q/S).
    df['Sex'] = pd.Categorical(df['Sex']).codes
    df_embarked = pd.get_dummies(df['Embarked'])
    df_embarked.columns = df_embarked.columns.map(lambda x: 'Embarked_' + str(x))
    df.drop(labels=['Embarked', 'Ticket', 'Cabin', 'PassengerId', 'Name'], axis=1, inplace=True)
    df = pd.merge(df, df_embarked, right_index=True, left_index=True, how='inner')
    '''Fare has no missing values, so only Age needs imputation; fill it with a GBDT regressor.'''
    model_feature = GradientBoostingRegressor(n_estimators=1000, subsample=0.8)
    selected_feature = ['Fare', 'Parch', 'SibSp', 'Pclass', 'Age']
    age_test = df.loc[df['Age'].isnull(), selected_feature]
    age_train = df.loc[df['Age'].notnull(), selected_feature]
    x_train, y_train = age_train.iloc[:, :-1], age_train.iloc[:, -1]
    x_test = age_test.iloc[:, :-1]
    model_feature.fit(x_train, y_train)
    y_pred_train = model_feature.predict(x_train)
    print('age imputation train MSE: ', mean_squared_error(y_train, y_pred_train))
    df.loc[df['Age'].isnull(), 'Age'] = model_feature.predict(x_test)
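    # A minimal sanity check (a sketch, assuming 5-fold CV is adequate here): the
    # in-sample MSE above is optimistic for a 1000-tree GBDT, so also estimate the
    # imputer's out-of-sample error.
    from sklearn.model_selection import cross_val_score
    cv_mse = -cross_val_score(model_feature, x_train, y_train,
                              scoring='neg_mean_squared_error', cv=5)
    print('age imputation CV MSE: ', cv_mse.mean())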
    '''Predict Survived, comparing AdaBoost, GBDT, and xgboost (GBDT baseline sketched below).'''
    x, y = df.loc[:, df.columns != 'Survived'], df['Survived']
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=0)
    ada_base_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=3)
    ada = AdaBoostClassifier(ada_base_tree)
    para_ada = [{'n_estimators': [100, 500]}]
    scores = ['roc_auc', 'precision']
    for score in scores:
        clf = GridSearchCV(ada, param_grid=para_ada, scoring=score)
        clf.fit(x_train, y_train)
        print("Best parameters set found on development set:")
        print(clf.best_params_)
        print('best score ', clf.best_score_)
        print()
    # Note: unlike the grid-searched model above, this AdaBoost uses the default
    # depth-1 base learner rather than ada_base_tree.
    ada_estimator = AdaBoostClassifier(n_estimators=100)
    ada_estimator.fit(x_train, y_train)
    y_pred = ada_estimator.predict(x_test)
    print('adaboost accuracy ', accuracy_score(y_test, y_pred))
    print('adaboost f1 measure ', f1_score(y_test, y_pred))
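    # A hedged alternative (assuming the tuned model is what should be reported):
    # GridSearchCV refits the best configuration on the full training split, so the
    # last round's clf.best_estimator_ can be evaluated directly.
    y_pred_best = clf.best_estimator_.predict(x_test)
    print('tuned adaboost accuracy ', accuracy_score(y_test, y_pred_best))
    print('tuned adaboost f1 measure ', f1_score(y_test, y_pred_best))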
    '''xgboost: the low-level xgb.train API with a custom objective, then XGBClassifier.'''
    d_train = xgb.DMatrix(x_train, label=y_train)
    d_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(d_test, 'eval'), (d_train, 'train')]
    # n_estimators is not an xgb.train parameter (num_boost_round controls the
    # number of rounds), so it is left out of the shared param dict.
    param_xgboost = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1,
                     'colsample_bytree': 1, 'colsample_bylevel': 1, 'subsample': 1,
                     'reg_lambda': 1, 'reg_alpha': 0, 'seed': 33}
    xgboost_clf = xgb.train(params=param_xgboost, dtrain=d_train, num_boost_round=100,
                            evals=watch_list, obj=log_reg, feval=error_rate)
    y_pred_boost = xgboost_clf.predict(d_test)  # raw margins, since a custom objective is used
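    # Because the booster above was trained with a custom objective, predict()
    # returns raw margins; a minimal sketch (assuming 0.5 is the intended decision
    # threshold) of mapping them to labels before scoring:
    y_label_boost = (sigmoid(y_pred_boost) > 0.5).astype(int)
    print('xgb.train accuracy: ', accuracy_score(y_test, y_label_boost))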
    xgb_classified = xgb.XGBClassifier(n_estimators=100, eval_metric='auc', **param_xgboost)
    xgb_classified.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)])
    # predict() already thresholds the class probabilities, so no argmax is needed.
    y_pred_xgboost = xgb_classified.predict(x_test)
    print('xgboost accuracy: ', accuracy_score(y_test, y_pred_xgboost))
    print('xgboost f1 measure ', f1_score(y_test, y_pred_xgboost))
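    # A sketch completing the three-way comparison promised above (assumption:
    # near-default hyperparameters are an acceptable GBDT baseline):
    gbdt_clf = GradientBoostingClassifier(n_estimators=100, max_depth=3)
    gbdt_clf.fit(x_train, y_train)
    y_pred_gbdt = gbdt_clf.predict(x_test)
    print('gbdt accuracy ', accuracy_score(y_test, y_pred_gbdt))
    print('gbdt f1 measure ', f1_score(y_test, y_pred_gbdt))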
if __name__ == '__main__':
    warnings.filterwarnings('ignore')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    np.set_printoptions(precision=4)
    path = r'Titanic.train.csv'  # assumed to be the Kaggle Titanic training CSV
    load_data(path)