一,只用LR的AUC对比:
1,LR demo:
%config ZMQInteractiveShell.ast_node_interactivity='all' print ('started')import pandas as pd data = pd.read_csv('data_1.csv', names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp']) data.head()# 开始数据处理 data.head() data_new = data.copy(deep=False) data_new.groupby(['Action']).count() data_new.loc[data['Action'] != 'buy', 'Action'] = 0 data_new.loc[data['Action'] == 'buy', 'Action'] = 1data_new.groupby(['Action']).count()
View Code
# 1. Logistic regression only — compare raw ID features vs. one-hot encoding.
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Fixed: import from the public package path. `sklearn.preprocessing.data`
# is a private module and was removed in scikit-learn 0.24+.
from sklearn.preprocessing import OneHotEncoder

# `data_new` comes from the preceding data-prep cell.
x = data_new[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data_new[['Action']]

# --- raw (un-encoded) features straight into LR ---
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
lr = LogisticRegression()
lr.fit(x_train, y_train)
train_pred = lr.predict_proba(x_train)[:, 1]
print("train's auc: %.3f" % roc_auc_score(y_train, train_pred))
test_pred = lr.predict_proba(x_test)[:, 1]
print("test' auc: %.3f" % roc_auc_score(y_test, test_pred))

# --- one-hot encode the categorical IDs, then refit ---
one_hot = OneHotEncoder(categories='auto')
x_one_hot = one_hot.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_one_hot, y, test_size=0.3)
lr = LogisticRegression()
lr.fit(x_train, y_train)
train_pred = lr.predict_proba(x_train)[:, 1]
print("one-hot processing train's auc: %.3f" % roc_auc_score(y_train, train_pred))
test_pred = lr.predict_proba(x_test)[:, 1]
print("one-hot processing test' auc: %.3f" % roc_auc_score(y_test, test_pred))
View Code
train's auc: 0.523 test' auc: 0.523
2,LR one-hot之后:
# 1. Logistic regression only — one-hot encoding makes a large difference.
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Fixed: import from the public package path. `sklearn.preprocessing.data`
# is a private module and was removed in scikit-learn 0.24+.
from sklearn.preprocessing import OneHotEncoder

# `data_new` comes from the preceding data-prep cell.
x = data_new[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data_new[['Action']]

# --- raw (un-encoded) features ---
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
lr = LogisticRegression()
lr.fit(x_train, y_train)
train_pred = lr.predict_proba(x_train)[:, 1]
print("train's auc: %.3f" % roc_auc_score(y_train, train_pred))
test_pred = lr.predict_proba(x_test)[:, 1]
print("test' auc: %.3f" % roc_auc_score(y_test, test_pred))

# --- one-hot encoded features (note: test_size=0.2 here, 0.3 above) ---
one_hot = OneHotEncoder(categories='auto')
x_one_hot = one_hot.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_one_hot, y, test_size=0.2)
lr = LogisticRegression()
lr.fit(x_train, y_train)
train_pred = lr.predict_proba(x_train)[:, 1]
print("one-hot processing train's auc: %.3f" % roc_auc_score(y_train, train_pred))
test_pred = lr.predict_proba(x_test)[:, 1]
print("one-hot processing test' auc: %.3f" % roc_auc_score(y_test, test_pred))
View Code
one-hot processing train's auc: 0.987 one-hot processing test' auc: 0.783
3,LR + 网格搜索:(没时间仔细的搜索)
# Logistic regression + grid search (over max_iter only).
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Reload and relabel (pd / train_test_split / roc_auc_score / OneHotEncoder
# are assumed imported by earlier cells of this notebook).
data = pd.read_csv('data_1.csv',
                   names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp'])
# Binary target: 1 for 'buy', 0 otherwise.
data.loc[data['Action'] != 'buy', 'Action'] = 0
data.loc[data['Action'] == 'buy', 'Action'] = 1
data.groupby(['Action']).count()

x = data[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data[['Action']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Fixed: instantiate the estimator here. The original commented out
# `lr.fit(...)` and passed an `lr` that this cell never defined,
# silently depending on stale notebook state.
lr = LogisticRegression()
cv_params = {'max_iter': [100, 200, 300]}
grid_search = GridSearchCV(estimator=lr, param_grid=cv_params)
grid_search.fit(x_train, y_train)
train_pred = grid_search.predict_proba(x_train)[:, 1]
print("train's auc: %.3f" % roc_auc_score(y_train, train_pred))
test_pred = grid_search.predict_proba(x_test)[:, 1]
print("test' auc: %.3f" % roc_auc_score(y_test, test_pred))

# --- same search on one-hot encoded features (test_size=0.2) ---
one_hot = OneHotEncoder(categories='auto')
x_one_hot = one_hot.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_one_hot, y, test_size=0.2)
lr = LogisticRegression()
cv_params = {'max_iter': [50, 80, 100]}
grid_search = GridSearchCV(estimator=lr, param_grid=cv_params)
grid_search.fit(x_train, y_train)
train_pred = grid_search.predict_proba(x_train)[:, 1]
print("one-hot processing train's auc: %.3f" % roc_auc_score(y_train, train_pred))
test_pred = grid_search.predict_proba(x_test)[:, 1]
print("one-hot processing test' auc: %.3f" % roc_auc_score(y_test, test_pred))
View Code
one-hot processing train's auc: 0.985 one-hot processing test' auc: 0.793
二,只用XGBoost的AUC对比(seed都指定为66):
1,XGBoost demo:
# 1,只用XGBoost print ('只用XGBoost:') %config ZMQInteractiveShell.ast_node_interactivity='all' import pandas as pd from sklearn.model_selection import train_test_split import xgboost import warnings from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_auc_scorewarnings.filterwarnings('ignore') data = pd.read_csv('data_1.csv', names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp']) data.head(2)data.groupby(['Action']).count() # 开始数据处理 data.loc[data['Action'] != 'buy', 'Action'] = 0 data.loc[data['Action'] == 'buy', 'Action'] = 1 data.groupby(['Action']).count()x = data[['UserID', 'ItemID', 'CategoryID', 'Timestamp']] y = data[['Action']] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)x_train.head() y_train.groupby(['Action']).count()xgb = xgboost.XGBClassifier() xgb.fit(x_train, y_train) train_pred = xgb.predict_proba(x_train)[:, 1] train_auc = roc_auc_score(y_train, train_pred) print ('trian auc: %.3f' % train_auc)test_pred = xgb.predict_proba(x_test)[:, 1] test_auc = roc_auc_score(y_test, test_pred) print ('test auc: %.3f' % test_auc)
View Code
train auc: 0.662 test auc: 0.654
2,XGBoost + 网格搜索(只搜迭代速率):
# 2. XGBoost + grid search over learning_rate only.
print("网格搜索的XGBoost:")
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('data_1.csv',
                   names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp'])
# Binary target: 1 for 'buy', 0 otherwise.
data.loc[data['Action'] != 'buy', 'Action'] = 0
data.loc[data['Action'] == 'buy', 'Action'] = 1
data.groupby(['Action']).count()

x = data[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data[['Action']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

xgb = xgboost.XGBClassifier()
cv_params = {'learning_rate': [0.1, 0.2]}
grid_search = GridSearchCV(estimator=xgb, param_grid=cv_params)
# Fixed: the original fit and scored the bare `xgb`, so the grid search
# object was built but never run — fit and evaluate GridSearchCV instead.
grid_search.fit(x_train, y_train)
train_pred = grid_search.predict_proba(x_train)[:, 1]
train_auc = roc_auc_score(y_train, train_pred)
# Fixed label typo: 'trian' -> 'train'.
print('train auc: %.3f' % train_auc)
test_pred = grid_search.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_test, test_pred)
print('test auc: %.3f' % test_auc)
View Code
train auc: 0.660 test auc: 0.656
3,XGBoost+网格搜索:
# 3. XGBoost + grid search with hand-tuned base parameters.
print("网格搜索的XGBoost:")
from sklearn.model_selection import GridSearchCV

# Deterministic split for comparability (shuffle=False).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=False, random_state=0)

# Only learning_rate is searched; everything else is fixed.
cv_params = {'learning_rate': [0.1, 0.2]}
other_params = {'learning_rate': 0.12, 'n_estimators': 300, 'max_depth': 5,
                'min_child_weight': 5, 'seed': 1, 'subsample': 0.9,
                'colsample_bytree': 0.6, 'gamma': 0.005, 'reg_alpha': 2,
                'reg_lambda': 0.05}

model = xgboost.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params,
                             scoring='roc_auc', cv=3, verbose=1, n_jobs=4)
optimized_GBM.fit(x_train, y_train)
print('参数的最佳取值:{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

# Fixed: the original rebound the name `xgboost`, shadowing the imported
# xgboost module for the rest of the notebook. Use a distinct name.
best_model = optimized_GBM
y_pred_train = best_model.predict_proba(x_train)[:, 1]
xgb_train_auc = roc_auc_score(y_train, y_pred_train)
print("XGBoost train auc: %.5f" % xgb_train_auc)
y_pred_test = best_model.predict_proba(x_test)[:, 1]
xgb_test_auc = roc_auc_score(y_test, y_pred_test)
print("XGBoost test auc: %.5f" % xgb_test_auc)
View Code
XGBoost train auc: 0.74102 XGBoost test auc: 0.69231
4,XGBoost+网格搜索,用模型和用grid进行fit的区别:
grid:
# 4a. Grid-vs-xgb comparison — this cell fits THROUGH GridSearchCV.
print("网格搜索的XGBoost:")
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('data_1.csv',
                   names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp'])
# Binary target: 1 for 'buy', 0 otherwise.
data.loc[data['Action'] != 'buy', 'Action'] = 0
data.loc[data['Action'] == 'buy', 'Action'] = 1
data.groupby(['Action']).count()

x = data[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data[['Action']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# seed=61 fixes the booster's randomness so the two cells are comparable.
xgb = xgboost.XGBClassifier(seed=61)
cv_params = {'learning_rate': [0.1, 0.2]}
grid_search = GridSearchCV(estimator=xgb, param_grid=cv_params, cv=3)
grid_search.fit(x_train, y_train)
train_pred = grid_search.predict_proba(x_train)[:, 1]
train_auc = roc_auc_score(y_train, train_pred)
# Fixed label typo: 'trian' -> 'train'.
print('train auc: %.3f' % train_auc)
test_pred = grid_search.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_test, test_pred)
print('test auc: %.3f' % test_auc)
View Code
train auc: 0.658 test auc: 0.655
xgb:
# 4b. Grid-vs-xgb comparison — this cell fits the bare estimator directly.
print("网格搜索的XGBoost:")
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('data_1.csv',
                   names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp'])
# Binary target: 1 for 'buy', 0 otherwise.
data.loc[data['Action'] != 'buy', 'Action'] = 0
data.loc[data['Action'] == 'buy', 'Action'] = 1
data.groupby(['Action']).count()

x = data[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data[['Action']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

xgb = xgboost.XGBClassifier()
cv_params = {'learning_rate': [0.1, 0.2]}
# NOTE(review): grid_search is deliberately never fitted here — the point of
# this cell is to contrast plain `xgb.fit` against the grid-fitted cell above.
grid_search = GridSearchCV(estimator=xgb, param_grid=cv_params)
xgb.fit(x_train, y_train)
train_pred = xgb.predict_proba(x_train)[:, 1]
train_auc = roc_auc_score(y_train, train_pred)
# Fixed label typo: 'trian' -> 'train'.
print('train auc: %.3f' % train_auc)
test_pred = xgb.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_test, test_pred)
print('test auc: %.3f' % test_auc)
View Code
train auc: 0.662 test auc: 0.657
好好调参之后:
grid:
# Tuned comparison — grid side: fit through GridSearchCV.
print("网格搜索的XGBoost:")
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=False, random_state=0)

cv_params = {'learning_rate': [0.1, 0.2]}
# Fixed: the original dict listed 'seed' twice ({'seed': 1, ..., 'seed': 61});
# Python keeps only the last, so the first entry was dead. Keep the effective
# value (61) once.
other_params = {'learning_rate': 0.12, 'n_estimators': 300, 'max_depth': 5,
                'min_child_weight': 5, 'subsample': 0.9,
                'colsample_bytree': 0.6, 'gamma': 0.005, 'reg_alpha': 2,
                'reg_lambda': 0.05, 'seed': 61}

model = xgboost.XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params,
                             scoring='roc_auc', cv=3, verbose=1, n_jobs=4)
optimized_GBM.fit(x_train, y_train)
print('参数的最佳取值:{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

# Fixed: do not rebind the name `xgboost` (it shadows the imported module).
best_model = optimized_GBM
y_pred_train = best_model.predict_proba(x_train)[:, 1]
xgb_train_auc = roc_auc_score(y_train, y_pred_train)
print("XGBoost train auc: %.5f" % xgb_train_auc)
y_pred_test = best_model.predict_proba(x_test)[:, 1]
xgb_test_auc = roc_auc_score(y_test, y_pred_test)
print("XGBoost test auc: %.5f" % xgb_test_auc)
View Code
XGBoost train auc: 0.74096 XGBoost test auc: 0.69175
xgb:
# Tuned comparison — xgb side: fit the estimator directly with the same params.
print("网格搜索的XGBoost:")
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=False, random_state=0)

cv_params = {'learning_rate': [0.1, 0.2]}
# Fixed: duplicate 'seed' key removed (1 was silently overridden by 61).
other_params = {'learning_rate': 0.12, 'n_estimators': 300, 'max_depth': 5,
                'min_child_weight': 5, 'subsample': 0.9,
                'colsample_bytree': 0.6, 'gamma': 0.005, 'reg_alpha': 2,
                'reg_lambda': 0.05, 'seed': 61}

model = xgboost.XGBClassifier(**other_params)
# NOTE(review): optimized_GBM is deliberately not fitted here — this cell
# contrasts a direct `model.fit` against the grid-fitted cell above.
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params,
                             scoring='roc_auc', cv=3, verbose=1, n_jobs=4)
model.fit(x_train, y_train)

# Fixed: do not rebind the name `xgboost` (it shadows the imported module).
best_model = model
y_pred_train = best_model.predict_proba(x_train)[:, 1]
xgb_train_auc = roc_auc_score(y_train, y_pred_train)
print("XGBoost train auc: %.5f" % xgb_train_auc)
y_pred_test = best_model.predict_proba(x_test)[:, 1]
xgb_test_auc = roc_auc_score(y_test, y_pred_test)
print("XGBoost test auc: %.5f" % xgb_test_auc)
View Code
XGBoost train auc: 0.74768 XGBoost test auc: 0.69270
5,6以后再研究。
5,XGBoost+regression:
6,XGBoost的参数进一步研究,比如score:
7,最佳的XGBoost:
# 7. Best XGBoost configuration found so far, end to end.
print('最佳的xgboost')
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('data_1.csv',
                   names=['UserID', 'ItemID', 'CategoryID', 'Action', 'Timestamp'])
data.head(2)
data.groupby(['Action']).count()

# Binary target: 1 for 'buy', 0 otherwise.
data.loc[data['Action'] != 'buy', 'Action'] = 0
data.loc[data['Action'] == 'buy', 'Action'] = 1
data.groupby(['Action']).count()

x = data[['UserID', 'ItemID', 'CategoryID', 'Timestamp']]
y = data[['Action']]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=False, random_state=0)

# NOTE(review): `missing=None` is accepted by old xgboost versions only;
# newer releases expect missing=np.nan — confirm against the installed version.
xgb = xgboost.XGBClassifier(base_score=0.5, booster='gbtree',
                            colsample_bylevel=1, colsample_bytree=0.6,
                            gamma=0.005, learning_rate=0.1,
                            max_delta_step=0, max_depth=5,
                            min_child_weight=5, missing=None,
                            n_estimators=300, n_jobs=1, nthread=None,
                            objective='binary:logistic',
                            random_state=0, reg_alpha=2,
                            reg_lambda=0.05, scale_pos_weight=1,
                            seed=61, silent=True, subsample=0.9)
xgb.fit(x_train, y_train)

# Fixed: dropped the redundant mid-script
# `from sklearn.metrics import auc, roc_auc_score` — `auc` was never used
# and `roc_auc_score` is already imported above.
y_pred_train = xgb.predict_proba(x_train)[:, 1]
xgb_train_auc = roc_auc_score(y_train, y_pred_train)
print("XGBoost train auc: %.5f" % xgb_train_auc)
y_pred_test = xgb.predict_proba(x_test)[:, 1]
xgb_test_auc = roc_auc_score(y_test, y_pred_test)
print("XGBoost test auc: %.5f" % xgb_test_auc)
View Code
三,XGBoost+LR:
xgboost 最好的灌给LR。
1, XGBoost + LR
2,GBDT + LR:
四,wide&deep:
五:FM
五,FM:(调用surprise库):
六:GBDT+LR:
转载于:https://www.cnblogs.com/yueyebigdata/p/11319046.html