def run_grid_search(self):
    """Cross-validate every parameter combination of the grid with xgboost.

    Meant to be invoked by a derived class to start the grid search.  The
    features, labels and CV folds come from ``self.getFeaturesLabel()``;
    each candidate parameter set is passed to ``xgb.cv`` together with the
    keyword arguments from ``self.get_learning_params()``, the outcome is
    handed to ``self.__add_to_resultset`` and, once all candidates are
    done, ``self.__disp_result()`` is called.
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)

    candidates = self.__get_param_iterable(self.__get_param_grid())
    learning_kwargs = self.get_learning_params()

    for candidate in candidates:
        logging.info("used parameters: {}".format(candidate))
        cv_history = xgb.cv(candidate, dtrain_cv, folds=cv_folds, **learning_kwargs)
        self.__add_to_resultset(candidate, cv_history)

    self.__disp_result()
    return
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    """Tune an ``XGBClassifier`` one parameter grid at a time.

    For every grid in ``params_untuned`` a ``GridSearchCV`` is fitted on the
    module-level ``x`` and ``y``, the CV results are printed, and the best
    parameters found are merged into ``params_sklearn`` before the next grid
    is searched.

    :param params_untuned: Iterable of parameter grids (each one a valid
        ``param_grid`` for ``GridSearchCV``), searched in order.
    :param params_sklearn: Keyword arguments for ``xgb.XGBClassifier``.
        Updated IN PLACE with the best parameters of every stage.
    :param scoring: Forwarded to ``GridSearchCV``.
    :param n_jobs: Forwarded to ``GridSearchCV``.
    :param cv: Forwarded to ``GridSearchCV``.
    :param verbose: Forwarded to ``GridSearchCV``.
    :return: ``(estimator, params_sklearn)``.  ``estimator`` is the
        classifier built for the last stage (constructed before that
        stage's best parameters were merged), or ``None`` when
        ``params_untuned`` is empty.
    """
    # Defined up front so an empty ``params_untuned`` does not raise
    # UnboundLocalError at the return statement.
    estimator = None
    report_columns = ['params', 'mean_train_score', 'mean_test_score']

    for param_untuned in params_untuned:
        print('========== ', param_untuned, ' ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)

        results = pd.DataFrame(grid_search.cv_results_)
        # Train scores are only present when GridSearchCV was asked to
        # compute them, so show whichever of the report columns exist.
        print(results[[col for col in report_columns if col in results.columns]])
        print('the best_params : ', grid_search.best_params_)
        print('the best_score : ', grid_search.best_score_)

        params_sklearn.update(grid_search.best_params_)

    return estimator, params_sklearn
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
    """Scatter LassoCV predictions against actual values on a held-out split.

    The supplied training data is split once more with
    ``train_test_split``; a ``LassoCV`` (10-fold, fixed alpha grid) is fitted
    on the fitting part and evaluated on the held-out part.  The figure
    shows predicted vs. actual values plus the identity line, and the title
    carries the RMSE computed by ``self.rmse``.

    :param x_train: Training features.
    :param y_train: Training targets.
    :param title_name: Prefix for the plot title.
    """
    # Carve an extra evaluation set out of the training data.
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(x_train, y_train)
    print(np.shape(x_fit), np.shape(x_holdout), np.shape(y_fit), np.shape(y_holdout))

    alpha_grid = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                  0.3, 0.6, 1]
    lasso = LassoCV(alphas=alpha_grid, max_iter=50000, cv=10)
    lasso.fit(x_fit, y_fit)
    y_predicted = lasso.predict(X=x_holdout)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_holdout, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_holdout)
    title_parts = [title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]
    plt.title(''.join(title_parts))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    # Identity line spanning the range of the actual values.
    lowest, highest = min(y_holdout), max(y_holdout)
    plt.plot([lowest, highest], [lowest, highest])
    plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    """Scatter xgboost predictions against actual values on a held-out split.

    The supplied training data is split once more with
    ``train_test_split``.  The number of boosting rounds is derived from a
    4-fold ``xgb.cv`` run with early stopping on the fitting part; a booster
    trained with that many rounds predicts the held-out part.  The figure
    shows predicted vs. actual values plus the identity line, and the title
    carries the RMSE computed by ``self.rmse``.

    :param xgb_params: Parameter dict passed to ``xgb.cv`` and ``xgb.train``.
    :param x_train: Training features.
    :param y_train: Training targets.
    :param seed: Seed forwarded to ``xgb.cv``.
    :param title_name: Prefix for the plot title.
    """
    # Carve an extra evaluation set out of the training data.
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(x_train, y_train)
    fit_matrix = xgb.DMatrix(x_fit, label=y_fit)
    holdout_matrix = xgb.DMatrix(x_holdout)

    cv_history = xgb.cv(xgb_params, fit_matrix, num_boost_round=1000, nfold=4, seed=seed,
                        stratified=False, early_stopping_rounds=25, verbose_eval=10,
                        show_stdv=True)
    # One round fewer than the number of rows in the CV history.
    best_nrounds = cv_history.shape[0] - 1
    print(np.shape(x_fit), np.shape(x_holdout), np.shape(y_fit), np.shape(y_holdout))

    booster = xgb.train(xgb_params, fit_matrix, best_nrounds)
    y_predicted = booster.predict(holdout_matrix)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_holdout, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_holdout)
    title_parts = [title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]
    plt.title(''.join(title_parts))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    # Identity line spanning the range of the actual values.
    lowest, highest = min(y_holdout), max(y_holdout)
    plt.plot([lowest, highest], [lowest, highest])
    plt.tight_layout()
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an xgboost sklearn-style classifier and print a training report.

    When ``useTrainCV`` is true, ``n_estimators`` of ``alg`` is first set to
    the number of rounds kept by ``xgb.cv`` (AUC metric, early stopping).
    The classifier is then fitted on ``dtrain[predictors]`` against
    ``dtrain['label']``; training accuracy and AUC are printed and the
    feature importances are drawn as a bar chart.

    :param alg: Classifier exposing ``get_xgb_params``, ``get_params``,
        ``set_params``, ``fit``, ``predict`` and ``predict_proba``.
        Modified in place (``n_estimators`` and fitted state).
    :param dtrain: DataFrame holding the predictor columns and a ``'label'``
        column.
    :param predictors: Column names used as features.
    :param useTrainCV: Whether to tune ``n_estimators`` with ``xgb.cv``.
    :param cv_folds: Number of folds for ``xgb.cv``.
    :param early_stopping_rounds: Early-stopping patience for ``xgb.cv``.
    """
    features = dtrain[predictors]
    # The same column is used for CV, fitting AND the report; the original
    # report read a different column ('Disbursed') than the one trained on.
    labels = dtrain['label']

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        # ``verbose_eval`` replaces the removed ``show_progress`` keyword.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(features, labels, eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    # Newer xgboost exposes ``get_booster()``; older releases only have
    # ``booster()``.
    get_booster = getattr(alg, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
def clean_params_for_sk(params: dict) -> dict:
    """
    Build a scikit-learn-friendly copy of a dictionary of XGB parameters.

    The input dictionary is left untouched.  In the copy, ``nthread`` is forced to 1 and
    ``num_class`` is removed when present.

    :param params: A dictionary of XGB parameters.
    :return: A copy of ``params`` without the settings that cause trouble with scikit-learn's
             grid or randomized search estimators.
    """
    sk_params = params.copy()

    # xgb.cv wants nthread equal to the CPU count, but that hangs when XGBoost is driven through
    # GridSearchCV - parallelism has to come from the search's own n_jobs parameter instead.
    # See https://github.com/scikit-learn/scikit-learn/issues/6627 for more details.
    sk_params['nthread'] = 1

    # Multiclass problems need num_class for XGBoost, but it is not a parameter to be tuned.
    sk_params.pop('num_class', None)

    return sk_params
def predict():
    """Return the model outputs, computing them only when nothing is cached.

    The cached result is read with ``state.load('model')``; it is ignored
    when the module-level ``debug_mode`` flag is truthy.  Without a cached
    result the data is loaded through ``data.get()``, frames ``v`` (train
    ids with ``y``) and ``z`` (test ids with ``y`` set to 0) are prepared,
    ``run`` is called and the result is stored with ``state.save``.

    :return: ``(v, z, cv, extra)``.  ``extra`` is the second value returned
        by ``run`` on a fresh computation and the stored ``None`` when the
        result comes from the cache.
    """
    saved = state.load('model')
    if debug_mode:
        saved = None

    # Identity test: ``== None`` would dispatch to the cached object's __eq__.
    if saved is None:
        train, y, test, _ = data.get()

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y

        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved

    return v, z, cv, _
def predict():
    """Return the model outputs, computing them only when nothing is cached.

    The cached result is read with ``state.load('model')``.  Without a
    cached result the data is loaded through ``data.get()``, frames ``v``
    (train ids with ``y``) and ``z`` (test ids with ``y`` set to 0) are
    prepared, ``run`` is called and the result is stored with
    ``state.save``.

    :return: ``(v, z, cv, extra)``.  ``extra`` is the second value returned
        by ``run`` on a fresh computation and the stored ``None`` when the
        result comes from the cache.
    """
    saved = state.load('model')

    # Identity test: ``== None`` would dispatch to the cached object's __eq__.
    if saved is None:
        train, y, test, _ = data.get()

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y

        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved

    return v, z, cv, _
def predict():
    """Return the model outputs, computing them only when nothing is cached.

    The cached result is read with ``state.load('model')``; it is ignored
    when the module-level ``debug_mode`` flag is truthy.  Without a cached
    result the base data from ``data.get()`` is extended column-wise with
    the feature frames from ``fea_1.get()`` and ``fea_2.get()``, frames
    ``v`` (train ids with ``y``) and ``z`` (test ids with ``y`` set to 0)
    are prepared, ``run`` is called and the result is stored with
    ``state.save``.

    :return: ``(v, z, cv, extra)``.  ``extra`` is the second value returned
        by ``run`` on a fresh computation and the stored ``None`` when the
        result comes from the cache.
    """
    saved = state.load('model')
    if debug_mode:
        saved = None

    # Identity test: ``== None`` would dispatch to the cached object's __eq__.
    if saved is None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()

        # Append both extra feature sets as additional columns.
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y

        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved

    return v, z, cv, _
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Train seed-averaged, K-fold xgboost models and store their predictions.

    For every seed and every stratified fold a booster is trained with early
    stopping on the validation fold.  Out-of-fold predictions are
    accumulated in ``v[cname]`` and test predictions in ``z[cname]``; both
    are averaged at the end.  Per-fold log-loss values and the overall
    validation loss are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving the out-of-fold predictions (column
        ``cname``), modified in place.
    :param z: DataFrame receiving the averaged test predictions (column
        ``cname``), modified in place.
    :param xgb_params: xgboost parameter dict; its ``'seed'`` entry is
        overwritten in place for every seed.
    :param N_splits: Number of stratified folds.
    :param N_seeds: Number of seeds to average over.
    :param cname: Name of the output column in ``v`` and ``z``.
    :param base_seed: Offset added to the seed counter.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    # NOTE: no random_state is given, so the shuffled fold assignment is not
    # reproducible between runs.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Test rows were predicted by every fold of every seed; validation rows
    # once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds

    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Train seed-averaged, K-fold xgboost models and store their predictions.

    For every seed a stratified K-fold splitter seeded with
    ``s + base_seed`` is created, and for every fold a booster is trained
    with early stopping on the validation fold.  Out-of-fold predictions are
    accumulated in ``v[cname]`` and test predictions in ``z[cname]``; both
    are averaged at the end.  Per-fold log-loss values and the overall
    validation loss are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving the out-of-fold predictions (column
        ``cname``), modified in place.
    :param z: DataFrame receiving the averaged test predictions (column
        ``cname``), modified in place.
    :param xgb_params: xgboost parameter dict; its ``'seed'`` entry is
        overwritten in place for every seed.
    :param N_splits: Number of stratified folds.
    :param N_seeds: Number of seeds to average over.
    :param cname: Name of the output column in ``v`` and ``z``.
    :param base_seed: Offset added to the seed counter; used both for
        xgboost and for the fold shuffling.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        run_seed = s + base_seed
        xgb_params['seed'] = run_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True,
                                              random_state=run_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Test rows were predicted by every fold of every seed; validation rows
    # once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds

    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an xgboost sklearn-style classifier and print a training report.

    When ``useTrainCV`` is true, ``n_estimators`` of ``alg`` is first set to
    the number of rounds kept by ``xgb.cv`` (AUC metric, early stopping),
    using the column named by the module-level ``target`` as label.  The
    classifier is then fitted on ``dtrain[predictors]`` against
    ``dtrain['Disbursed']``; training accuracy and AUC are printed and the
    feature importances are drawn as a bar chart.

    :param alg: Classifier exposing ``get_xgb_params``, ``get_params``,
        ``set_params``, ``fit``, ``predict`` and ``predict_proba``.
        Modified in place (``n_estimators`` and fitted state).
    :param dtrain: DataFrame holding the predictor columns, the ``target``
        column and a ``'Disbursed'`` column.
    :param predictors: Column names used as features.
    :param useTrainCV: Whether to tune ``n_estimators`` with ``xgb.cv``.
    :param cv_folds: Number of folds for ``xgb.cv``.
    :param early_stopping_rounds: Early-stopping patience for ``xgb.cv``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        # ``verbose_eval`` replaces the removed ``show_progress`` keyword.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))

    # Newer xgboost exposes ``get_booster()``; older releases only have
    # ``booster()``.
    get_booster = getattr(alg, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')


# xgboost (the rest of this section comment was unreadable in the source)
def rmse_cv(model, x_train, y_train):
    """Return the RMSE of ``model`` for each fold of a 5-fold cross-validation.

    ``cross_val_score`` is run with the ``'neg_mean_squared_error'`` scorer;
    the scores are negated back to MSE and square-rooted.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        x_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    mse_per_fold = -neg_mse_per_fold
    return np.sqrt(mse_per_fold)
def run_croos_validation(self):
    """Cross-validate the configured xgboost parameters and return the best score.

    Features, labels and folds come from ``self.getFeaturesLabel()``;
    ``self.set_xgb_parameters()`` is called before ``xgb.cv`` is run with
    ``self.xgb_params`` and ``self.xgb_learning_params``.

    :return: The maximum of the column named by
        ``self.best_score_colname_in_cv`` in the CV result.
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)

    self.set_xgb_parameters()

    cv_history = xgb.cv(
        self.xgb_params,
        dtrain_cv,
        folds=cv_folds,
        **self.xgb_learning_params
    )
    score_column = cv_history[self.best_score_colname_in_cv]
    return score_column.max()
def tune_n_estimators(alg, xgtrain, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Return the number of boosting rounds to use for ``alg``.

    With ``useTrainCV`` the value is the number of rounds kept by ``xgb.cv``
    (AUC metric, early stopping), capped by the ``n_estimators`` currently
    configured on ``alg``.  Without it, the configured ``n_estimators`` is
    returned unchanged (the original had no result to return in that case).
    ``alg`` itself is never modified.

    :param alg: Estimator exposing ``get_xgb_params`` and ``get_params``.
    :param xgtrain: ``xgb.DMatrix`` with the training data.
    :param useTrainCV: Whether to run the cross-validation.
    :param cv_folds: Number of folds for ``xgb.cv``.
    :param early_stopping_rounds: Early-stopping patience for ``xgb.cv``.
    :return: Number of boosting rounds (int).
    """
    configured_rounds = alg.get_params()['n_estimators']
    if not useTrainCV:
        return configured_rounds

    xgb_param = alg.get_xgb_params()
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=configured_rounds,
                      nfold=cv_folds, metrics='auc',
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=True, show_stdv=True)
    return cvresult.shape[0]
def modelfit(alg, predictors, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an xgboost sklearn-style classifier and print a training report.

    When ``useTrainCV`` is true, ``n_estimators`` of ``alg`` is first set to
    the number of rounds kept by ``xgb.cv`` (AUC metric, early stopping).
    The classifier is then fitted on ``predictors`` / ``target``; training
    accuracy and AUC are printed and the feature importances are drawn as a
    bar chart.

    :param alg: Classifier exposing ``get_xgb_params``, ``get_params``,
        ``set_params``, ``fit``, ``predict`` and ``predict_proba``.
        Modified in place (``n_estimators`` and fitted state).
    :param predictors: Feature frame (``.values`` is read for the CV step).
    :param target: Label series (``.values`` is read).
    :param useTrainCV: Whether to tune ``n_estimators`` with ``xgb.cv``.
    :param cv_folds: Number of folds for ``xgb.cv``.
    :param early_stopping_rounds: Early-stopping patience for ``xgb.cv``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(predictors.values, label=target.values)
        # ``verbose_eval`` replaces the removed ``show_progress`` keyword.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['auc'],
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(predictors, target, eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(predictors)
    dtrain_predprob = alg.predict_proba(predictors)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))

    # Newer xgboost exposes ``get_booster()``; older releases only have
    # ``booster()``.
    get_booster = getattr(alg, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')


# examples of usage
# 1
def score(params):
    """Objective for the hyper-parameter optimizer: cross-validated loss.

    Runs a stratified 5-fold ``xgb.cv`` on the module-level ``X_train`` /
    ``y_train`` and turns the mean test score of the final (best) boosting
    round into a loss.

    :param params: xgboost parameters plus ``'n_estimators'``.  The dict is
        not modified; a copy without ``'n_estimators'`` is handed to
        ``xgb.cv``.
    :return: ``{'loss': 1 - mean_test_score, 'status': STATUS_OK}``.
    """
    logging.info("Training with params: ")
    logging.info(params)

    # 'n_estimators' is only a constructor param of XGB's sklearn API; for
    # xgb.cv() it has to be passed as the number of boosting rounds.  Work
    # on a copy so the caller's dict keeps its entry.
    cv_params = dict(params)
    num_boost_round = int(cv_params.pop('n_estimators'))

    dtrain = xgb.DMatrix(X_train, label=y_train)
    # xgb.cv returns one row per boosting round with the columns
    # test-<metric>-mean / test-<metric>-std / train-<metric>-mean /
    # train-<metric>-std.  Their ORDER differs between xgboost releases, so
    # the test columns are looked up by name rather than by position.
    score_history = xgb.cv(cv_params, dtrain, num_boost_round,
                           nfold=5,
                           stratified=True,
                           early_stopping_rounds=250,
                           verbose_eval=500)

    # Only use scores from the final boosting round since that's the one
    # that performed the best.
    final_round = score_history.iloc[-1]
    test_mean_cols = [col for col in score_history.columns
                      if str(col).startswith('test-') and str(col).endswith('-mean')]
    if test_mean_cols:
        mean_col = test_mean_cols[-1]
        std_col = str(mean_col)[:-len('-mean')] + '-std'
        mean_final_round = final_round[mean_col]
        std_final_round = final_round[std_col] if std_col in score_history.columns else final_round.iloc[1]
    else:
        # Unknown layout: fall back to the original positional lookup.
        mean_final_round = final_round.iloc[0]
        std_final_round = final_round.iloc[1]

    logging.info("\tMean Score: {0}\n".format(mean_final_round))
    logging.info("\tStd Dev: {0}\n\n".format(std_final_round))

    # score() needs to return the loss (1 - score)
    # since optimize() should be finding the minimum, and AUC
    # naturally finds the maximum.
    loss = 1 - mean_final_round
    return {'loss': loss, 'status': STATUS_OK}
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklean=False, xgb_params=None):
    """Train an xgboost regressor and predict the test set.

    The number of boosting rounds is 10 without CV, otherwise the number of
    rows returned by a 5-fold ``xgb.cv`` run (RMSE, 100 rounds).

    :param x_train: Training features.
    :param y_train: Training targets.
    :param X_test: Test features.
    :param Y_test: Test targets (used as label of the evaluation matrix).
    :param features: Feature names forwarded to the feature-importance
        helpers (native branch only).
    :param use_cv: Whether to determine the number of rounds with ``xgb.cv``.
    :param use_sklean: Use ``xgboost.XGBRegressor`` instead of ``xgb.train``.
    :param xgb_params: xgboost parameter dict; ``None`` means library
        defaults.  The caller's dict is not modified.
    :return: ``(model, y_pred)``; the model is the fitted ``XGBRegressor``
        in the sklearn branch and ``XGBoostModel(booster)`` otherwise.
    """
    # Private copy: avoids crashing on None and avoids writing
    # 'n_estimators' into the caller's dict.
    params = dict(xgb_params) if xgb_params is not None else {}

    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(test_data, 'eval'), (train_data, 'train')]

    if not use_cv:
        num_rounds = 10
    else:
        # ``verbose_eval`` replaces the removed ``show_progress`` keyword.
        cvresult = xgb.cv(params, train_data, num_boost_round=100, nfold=5,
                          metrics={'rmse'}, verbose_eval=True)
        print(cvresult)
        num_rounds = len(cvresult)

    if use_sklean:
        params['n_estimators'] = num_rounds
        # The parameters must be expanded as keywords; passing the dict
        # positionally would bind it to the first constructor argument.
        gbdt = xgboost.XGBRegressor(**params)
        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)
        return gbdt, y_pred

    gbdt = xgb.train(params, train_data, num_rounds, evallist, verbose_eval=True)
    ceate_feature_map_for_feature_importance(features)
    show_feature_importance(gbdt, feature_names=features)
    y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
    return XGBoostModel(gbdt), y_pred
def GBM(self, argsDict):
    """Objective function: negative cross-validated precision of an xgboost model.

    The raw search-space values in ``argsDict`` are rescaled into xgboost
    parameters, a one-round multi-class booster is trained on ``self.train``
    and, separately, cross-validated (5 folds) with the custom
    ``Xg_iter_precision`` metric.  The micro precision on ``self.test`` for
    the labels 4, 5 and 6 is printed for information.

    :param argsDict: Mapping with the keys ``"max_depth"``, ``"subsample"``
        and ``"learning_rate"``.
    :return: The negated last value of the ``'test-precision_4_5_6-mean'``
        CV column (negated so that a minimizer maximizes precision).
    """
    # Map the raw search-space values onto the actual parameter ranges.
    max_depth = argsDict["max_depth"] + 10
    subsample = argsDict["subsample"] * 0.1 + 0.5
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.12
    print("max_depth:" + str(max_depth),
          "learning_rate:" + str(learning_rate),
          "subsample:" + str(subsample))

    params = {
        "max_depth": max_depth,
        'subsample': subsample,
        'learning_rate': learning_rate,
        'objective': "multi:softmax",
        'num_class': 7,
        "eval_metric": 'merror',
        'silent': False,
    }
    num_round = 1

    model = xgb.train(params, self.train, num_round, self.watchlist,
                      feval=Xg_iter_precision)
    cv_history = xgb.cv(params, self.train, num_round, nfold=5,
                        feval=Xg_iter_precision)
    cv_precision = cv_history['test-precision_4_5_6-mean'].values[-1]

    predicted = model.predict(self.test)
    scoring = precision_score(self.test_y, predicted, average='micro', labels=[4, 5, 6])
    print('precision is ', scoring)
    print('cv_precision_4_5_6', cv_precision)

    return -cv_precision
def tune_num_estimators(metric: str,
                        label: np.ndarray,
                        params: dict,
                        strat_folds: StratifiedKFold,
                        train) -> Tuple[int, float]:
    """
    Uses xgboost's cross-validation method to tune the number of estimators and returns that along
    with the best CV score achieved.

    The score is read from the ``test-<metric>-mean`` column of the evaluation history (its last
    row, i.e. the round early stopping settled on).  Looking the column up by name keeps the result
    a *test* score regardless of the column order a given xgboost release uses; if no such column
    exists, the first column is used as before.

    :param metric: Evaluation metric that is monitored during cross-validation - e.g. 'logloss' or
                   'rmse'.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param params: A dictionary of XGB parameters.
    :param strat_folds: A StratifiedKFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :return: A tuple containing the tuned number of estimators along with the best CV score
             achieved.
    """
    eval_hist = xgb.cv(
        dtrain=xgb.DMatrix(train, label=label),
        early_stopping_rounds=50,
        folds=strat_folds,
        metrics=metric,
        num_boost_round=10000,
        params=params,
        verbose_eval=True
    )
    num_trees = eval_hist.shape[0]

    test_mean_col = 'test-{}-mean'.format(metric)
    if test_mean_col in eval_hist.columns:
        best_score = eval_hist[test_mean_col].values[num_trees - 1]
    else:
        # Unknown column layout: keep the original positional lookup.
        best_score = eval_hist.values[num_trees - 1, 0]
    return num_trees, best_score
def build(self):
    """Replace fractional ``smoke`` values in the test set with model predictions.

    Test rows whose ``smoke`` value lies strictly between 0 and 1 are treated
    as values to restore.  An xgboost classifier is trained on the training
    set plus the remaining test rows (round count chosen by 10-fold
    ``xgb.cv`` with early stopping), the selected rows are predicted, and
    the whole test ``smoke`` column is finally binarized at 0.5.

    :return: ``(train, y, test, None)`` with ``train`` and ``y`` exactly as
        delivered by ``data_src.get()`` and ``test`` with the restored
        ``smoke`` column.
    """
    train, y, test, _ = data_src.get()

    xgb_params = dict(
        max_depth = 5,
        learning_rate = 0.005,
        subsample = 0.7,
        gamma = 5,
        alpha = 0.01,
        #colsample_bytree = 0.8,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )

    # Boolean mask of test rows with 0 < smoke < 1.
    idx = (test.smoke > 0).values * (test.smoke < 1).values
    print('values to restore:', np.sum(idx))

    xtrain = pd.concat([train, test[~idx]])
    ytrain = xtrain['smoke']
    xtrain.drop('smoke', axis=1, inplace=True)
    print(xtrain.shape, ytrain.shape, test[idx].shape)

    dtrain = xgb.DMatrix(xtrain.values, ytrain.values)
    dpred = xgb.DMatrix(test[idx].drop('smoke', axis=1).values)

    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=50,
                nfold=10,
                seed=1,
                metrics='error',
                stratified=True)
    print('smoke num_boost_rounds =', len(cv))

    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))

    # Boolean row mask + column label -> ``.loc`` (``.ix`` no longer exists
    # in pandas >= 1.0).
    test.loc[idx, 'smoke'] = bst.predict(dpred)
    test['smoke'] = (test['smoke'] > 0.5) * 1
    return train, y, test, None
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train K-fold xgboost models per seed and store per-seed predictions.

    For seed number ``s`` the columns ``cname + str(s)`` of ``v`` and ``z``
    receive the out-of-fold predictions and the fold-averaged test
    predictions respectively.  Per-fold log-loss values and the validation
    loss of every seed are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving out-of-fold predictions, modified in place.
    :param z: DataFrame receiving test predictions, modified in place.
    :param N_seeds: Number of seeds.
    :param N_splits: Number of stratified folds.
    :param cname: Prefix of the output column names.
    :param xgb_params: xgboost parameter dict; its ``'seed'`` entry is
        overwritten in place for every seed.
    """
    scores = []
    # NOTE: no random_state is given, so the shuffled fold assignment is not
    # reproducible between runs.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242

        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

        # Every fold contributed one test prediction for this seed.
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Train a Keras model on several shuffle splits and store its predictions.

    For every split a fresh model from ``build_model()`` is fitted with the
    callbacks from ``build_keras_fit_callbacks(model_path)``, the weights
    saved at ``model_path`` are loaded back, and predictions for the
    validation rows and for the test set are accumulated in ``v[cname]`` and
    ``z[cname]``.  ``z[cname]`` is divided by ``num_splits`` at the end;
    per-split log-loss values are printed.  Nothing is returned.

    :param train3: Training features, indexed with the split index arrays.
    :param y: Training labels, indexed with the split index arrays.
    :param test3: Test features.
    :param v: DataFrame receiving validation predictions, modified in place.
    :param z: DataFrame receiving test predictions, modified in place.
    :param num_splits: Number of shuffle splits.
    :param cname: Output column name; also part of the weights file name.
    :param build_model: Zero-argument factory returning a compiled model.
    :param seed: Seed passed to ``np.random.seed``.
    :param batch_size: Batch size for ``model.fit``.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    splitter = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11,
                                            test_size=1/num_splits)
    scores = list()

    for fold_no, (itrain, ival) in enumerate(splitter.split(train3, y)):
        model = build_model()
        model.fit(
            train3[itrain], y[itrain],
            batch_size = batch_size,
            epochs = 10000,
            validation_data = (train3[ival], y[ival]),
            verbose = 0,
            callbacks = build_keras_fit_callbacks(model_path),
            shuffle = True
        )
        # Continue with the weights written to disk during fitting.
        model.load_weights(model_path)

        val_pred = model.predict(train3[ival])
        v.loc[ival, cname] += pconvert(val_pred).ravel()
        score = metrics.log_loss(y[ival], val_pred)
        print(cname, 'fold %d: '%(fold_no+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()

        # Drop the model and collect every GC generation before the next fold.
        del model
        for generation in range(3):
            gc.collect(generation)
        os.remove(model_path)

    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
def restore_missing(df, N_splits = 10):
    """Fill in missing (``-1``) values of ``active``, ``alco`` and ``smoke`` in place.

    ``active`` is set to 1 and ``alco`` to 0 where they are ``-1``.  Missing
    ``smoke`` values are predicted by an xgboost classifier trained on the
    rows where ``smoke`` is known (round count chosen by stratified
    ``xgb.cv`` with early stopping) and binarized at 0.5.  Value counts of
    ``smoke`` are printed before and after.  Nothing is returned.

    :param df: DataFrame with ``active``, ``alco`` and ``smoke`` columns;
        modified in place.
    :param N_splits: Number of CV folds used to choose the round count
        (the default of 10 matches the previously hard-coded value).
    """
    xgb_params = dict(
        max_depth = 5,
        learning_rate = 0.005,
        gamma = 1,
        alpha = 0.01,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    #{'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}

    # Boolean row mask + column label -> ``.loc`` (``.ix`` no longer exists
    # in pandas >= 1.0).
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')

    # Rows with a known label form the training set.
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)
    #print(label, ytrain.value_counts())

    # Rows with a missing label are the ones to predict.
    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))

    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    # ``ypred`` keeps the index of the missing rows, so the assignment
    # below is aligned row by row.
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1

    print('restored', label, '{{{', df[label].value_counts(), '}}}')
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train K-fold xgboost models per seed and store per-seed and averaged predictions.

    For seed number ``s`` the columns ``cname + str(s)`` of ``v`` and ``z``
    receive the out-of-fold predictions and the fold-averaged test
    predictions respectively; the columns ``cname`` hold the average over
    all seeds.  Per-fold log-loss values and the validation loss of every
    seed are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving out-of-fold predictions, modified in place.
    :param z: DataFrame receiving test predictions, modified in place.
    :param N_seeds: Number of seeds.
    :param N_splits: Number of stratified folds.
    :param cname: Name of the averaged column and prefix of the per-seed
        column names.
    :param xgb_params: xgboost parameter dict; its ``'seed'`` entry is
        overwritten in place for every seed.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    # NOTE: no random_state is given, so the shuffled fold assignment is not
    # reproducible between runs.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242

        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

        # Every fold contributed one test prediction for this seed.
        z[cname2] /= N_splits
        # Fold this seed into the seed-averaged columns.
        z[cname] += z[cname2] / N_seeds
        v[cname] += v[cname2] / N_seeds

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb1(train2, y, test2, v, z):
    """Train the ``xgb1`` model: seed-averaged, 9-fold xgboost (depth 5).

    Out-of-fold predictions are written to ``v['xgb1']`` and averaged test
    predictions to ``z['xgb1']`` (the column name is taken from this
    function's own name).  Per-fold log-loss values and the overall
    validation loss are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving out-of-fold predictions, modified in place.
    :param z: DataFrame receiving test predictions, modified in place.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    # NOTE: no random_state is given, so the shuffled fold assignment is not
    # reproducible between runs.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 5,
        learning_rate = 0.02,
        alpha = 0.01,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    # The test matrix does not depend on seed or fold; build it once.
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Test rows were predicted by every fold of every seed; validation rows
    # once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds

    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb2(train2, y, test2, v, z):
    """Train the ``xgb2`` model: seed-averaged, 9-fold xgboost (depth 4, subsample 0.7).

    Out-of-fold predictions are written to ``v['xgb2']`` and averaged test
    predictions to ``z['xgb2']`` (the column name is taken from this
    function's own name).  Per-fold log-loss values and the overall
    validation loss are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving out-of-fold predictions, modified in place.
    :param z: DataFrame receiving test predictions, modified in place.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    # NOTE: no random_state is given, so the shuffled fold assignment is not
    # reproducible between runs.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 4,
        learning_rate = 0.02,
        subsample = 0.7,
        alpha = 0.015,
        #colsample_bytree = 0.8,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Test rows were predicted by every fold of every seed; validation rows
    # once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds

    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb3(train2, y, test2, v, z):
    """Train the ``xgb3`` model: seed-averaged, 9-fold xgboost with row and column subsampling.

    Out-of-fold predictions are written to ``v['xgb3']`` and averaged test
    predictions to ``z['xgb3']`` (the column name is taken from this
    function's own name).  Per-fold log-loss values and the overall
    validation loss are printed.  Nothing is returned.

    :param train2: Training features (DataFrame); rows are selected by the
        positional indices produced by the splitter.
    :param y: Training labels, indexed with the fold index arrays.
    :param test2: Test features.
    :param v: DataFrame receiving out-of-fold predictions, modified in place.
    :param z: DataFrame receiving test predictions, modified in place.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    # NOTE: no random_state is given, so the shuffled fold assignment is not
    # reproducible between runs.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 4,
        learning_rate = 0.02,
        subsample = 0.8,
        colsample_bytree = 0.8,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    dtest = xgb.DMatrix(test2)

    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # The splitter yields positional indices, hence ``.iloc``
            # (``.ix`` no longer exists in pandas >= 1.0).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Test rows were predicted by every fold of every seed; validation rows
    # once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds

    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb2(train2, y, test2, v, z):
    """Tune xgboost with hyperopt, then train the ``xgb2`` model with the best parameters.

    Previously saved trials are loaded with ``load_state`` and extended by
    five more evaluations, with the trials saved after each one.  Every
    evaluation runs a 10-fold ``xgb.cv`` with early stopping and uses the
    mean TEST metric of the last kept round as the loss.  The best
    parameters are then passed to ``xgb_common`` (4 seeds, 9 folds), which
    fills ``v`` and ``z``.  Nothing is returned.

    :param train2: Training features.
    :param y: Training labels.
    :param test2: Test features.
    :param v: DataFrame receiving out-of-fold predictions (via ``xgb_common``).
    :param z: DataFrame receiving test predictions (via ``xgb_common``).
    """
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

    cname = sys._getframe().f_code.co_name
    N_splits = 9
    N_seeds = 4
    dtrain = xgb.DMatrix(train2, y)

    def step_xgb(params):
        # Single hyperopt evaluation: CV loss for one parameter set.
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=100,
                    nfold=10,
                    seed=params['seed'])
        last_round = cv.iloc[-1]
        # Pick the test column by name: its position differs between
        # xgboost releases, and column 0 may be the TRAIN metric.
        test_mean_cols = [col for col in cv.columns
                          if str(col).startswith('test-') and str(col).endswith('-mean')]
        score = last_round[test_mean_cols[-1]] if test_mean_cols else last_round.iloc[0]
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(
        max_depth = hp.choice('max_depth', range(2, 8)),
        subsample = hp.quniform('subsample', 0.6, 1, 0.05),
        colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
        learning_rate = hp.quniform('learning_rate', 0.005, 0.03, 0.005),
        min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
        gamma = hp.quniform('gamma', 0.5, 10, 0.05),
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )

    trs = load_state(cname + '_trials')
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_xgb, tr.argmin))

    # One additional evaluation per iteration, persisting the trials each
    # time so an interrupted run loses at most one evaluation.
    for n in range(5):
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1, trials = tr)
        save_state(cname + '_trials', (tr, space_xgb))

    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)
    xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params)