The following code examples, collected from open-source Python projects, illustrate how to use xgboost.XGBClassifier().
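Before the project-specific snippets, here is a minimal, self-contained sketch of the fit / predict / predict_proba workflow that most of the examples below follow. The synthetic data and hyperparameter values are illustrative assumptions, not taken from any of the projects.

# Minimal XGBClassifier workflow on synthetic data (illustrative values only).
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Synthetic binary-classification data (any numeric matrix works).
rng = np.random.RandomState(0)
X = rng.rand(500, 10)
y = (X[:, 0] + 0.1 * rng.randn(500) > 0.5).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1,
                        objective='binary:logistic')
clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1]   # class-1 probabilities
pred = clf.predict(X_test)                # hard labels
print("AUC: %.3f" % roc_auc_score(y_test, proba))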
def xgb_model(train_data, train_label, test_data, test_label):
    clf = xgb.XGBClassifier(max_depth=7, min_child_weight=1, learning_rate=0.1, n_estimators=500,
                            silent=True, objective='binary:logistic', gamma=0, max_delta_step=0,
                            subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0,
                            reg_lambda=0, scale_pos_weight=1, seed=1, missing=None)
    clf.fit(train_data, train_label, eval_metric='auc', verbose=True,
            eval_set=[(test_data, test_label)], early_stopping_rounds=100)
    y_pre = clf.predict(test_data)
    y_pro = clf.predict_proba(test_data)[:, 1]
    # print "AUC Score : %f" % metrics.roc_auc_score(test_label, y_pro)
    # print "Accuracy : %.4g" % metrics.accuracy_score(test_label, y_pre)
    return clf
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Args:
      a_clf (classifier or None): classifier to use or None for default
      a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf
    self._gs = a_grid_search
    if a_clf is None:
        classifier = XGBClassifier(max_depth=MAX_DEPTH, n_estimators=NTREES,
                                   learning_rate=ALPHA, objective="multi:softprob")
    self._clf = classifier
    # latest version of XGBoost cannot deal with non-sparse feature vectors
    self._model = Pipeline([("vect", DictVectorizer()), ("clf", classifier)])
def threshold_estimate(x, y):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.1, random_state=0)
    weight = float(len(y_train[y_train == 0])) / float(len(y_train[y_train == 1]))
    w1 = np.array([1] * y_train.shape[0])
    w1[y_train == 1] = weight
    print("samples: %d %d %f" % (x_train.shape[0], x_test.shape[0], weight))
    estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
    estimator.fit(x_train, y_train, sample_weight=w1)
    y_scores = estimator.predict_proba(x_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    f1 = 2 * precision[2:] * recall[2:] / (precision[2:] + recall[2:])
    m_idx = np.argmax(f1)
    m_thresh = thresholds[2 + m_idx]
    print("%d %f %f" % (precision.shape[0], f1[m_idx], m_thresh))
    return m_thresh

# Estimate threshold for the classifier using inner-round cross validation
def test_model_detection(self):
    sklearn_model = LogisticRegression()
    pipeline_model = Pipeline([('log', sklearn_model)])
    xgb_model = XGBClassifier()
    nn_model = NNModel(100, 10)

    sklearn_opt = Optimizer(sklearn_model, [], lambda x: x)
    pipeline_opt = Optimizer(pipeline_model, [], lambda x: x)
    xgb_opt = Optimizer(xgb_model, [], lambda x: x)
    nn_opt = Optimizer(nn_model, [], lambda x: x)

    self.assertEqual(sklearn_opt.model_module, 'sklearn')
    self.assertEqual(pipeline_opt.model_module, 'pipeline')
    self.assertEqual(xgb_opt.model_module, 'xgboost')
    self.assertEqual(nn_opt.model_module, 'keras')
def objective(space):
    estimator = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        gamma=space['gamma'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree']
    )
    estimator.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        early_stopping_rounds=30,
        verbose=False,
        eval_metric='error'
    )
    score = accuracy_score(y_val, estimator.predict(x_val))
    return {'loss': 1 - score, 'status': STATUS_OK}
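The objective above follows hyperopt's convention of returning a loss and STATUS_OK. As a rough, self-contained sketch (not taken from the project), the snippet below shows how such an objective might be driven with hyperopt's fmin; the synthetic data, the search-space bounds, and the simplified fit call stand in for the module-level x_train/x_val/n_estimators globals and the early-stopping setup used above.

# Illustrative hyperopt driver for an objective like the one above (assumptions throughout).
import numpy as np
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
X = rng.rand(400, 6)
y = (X[:, 0] > 0.5).astype(int)
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=0)
n_estimators = 200  # assumed fixed, as in the snippet above

def objective(space):
    estimator = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        subsample=space['subsample'],
    )
    estimator.fit(x_train, y_train)
    score = accuracy_score(y_val, estimator.predict(x_val))
    return {'loss': 1 - score, 'status': STATUS_OK}

space = {
    'max_depth': hp.quniform('max_depth', 2, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
}

best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=10, trials=Trials())
print(best)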
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=4, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
        if len(params_untuned) == 1:
            return v
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=1, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # if(param_untuned.keys()[0] == 'n_estimators'):
        #     cv = 1
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
def prec_xgb(n_trees, max_depth, X_train, y_train, X_test, y_test, learning_rate=0.1):
    """ ExtraTrees """
    import xgboost as xgb
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=max_depth, objective='multi:softprob',
                            seed=0, silent=True, nthread=-1, learning_rate=learning_rate)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror")
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
def get_classifier(method='logistic_regression'):
    if 'logistic_regression' == method:
        return LogisticRegression(C=1e3, tol=0.01, multi_class='ovr', solver='liblinear',
                                  n_jobs=-1, random_state=123)
    if 'random_forest' == method:
        return RandomForestClassifier(n_estimators=250, bootstrap=False,
                                      n_jobs=-1, random_state=123)
    if 'gradient_boosting' == method:
        return xgb.XGBClassifier(max_depth=10, subsample=0.7, n_estimators=500,
                                 min_child_weight=0.05, colsample_bytree=0.3, learning_rate=0.1)
def try_params(n_iterations, params, get_predictions=False):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    clf = XGB(n_estimators=n_estimators, nthread=-1, **params)

    return train_and_eval_sklearn_classifier(clf, data)
def test_build_new_model_xgboost(self):
    xgb_model = XGBClassifier(max_depth=3)
    xgb_opt = Optimizer(xgb_model, [], lambda x: x)
    new_model = xgb_opt.build_new_model({'max_depth': 2})
    self.assertEqual(new_model.get_params()['max_depth'], 2)
def XGBoost(X, y):
    print("Starting XGBoost training")
    start_time = time.time()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=1)
    clf = xgb.XGBClassifier(learning_rate=0.15, n_estimators=170, nthread=6, max_depth=8, seed=0,
                            silent=True, subsample=0.85, colsample_bytree=0.85)
    clf.fit(X, y)
    score = clf.score(X_test, y_test)
    print("XGBoost score: ", score, "(", (time.time() - start_time) / 60.0, "minutes )")
    return clf
def __init__(self, name, kwargs):
    import xgboost as xgb
    kwargs = kwargs.copy()
    if "random_state" in kwargs:
        kwargs["seed"] = kwargs["random_state"]
        kwargs.pop("random_state")
    super(GCXGBClassifier, self).__init__(name, xgb.XGBClassifier, kwargs)
def XGBOUT2(bp, all_samples, train_samp, Xcoords, Ycoords, Zcoords, k, threshold, nthread, bootstrap=True):
    '''Function that takes a CI test data-set and returns classification accuracy after Nearest-Neighbor Bootstrap'''
    num_samp = len(all_samples)
    if bootstrap:
        np.random.seed()
        random.seed()
        I = np.random.choice(num_samp, size=num_samp, replace=True)
        samples = all_samples[I, :]
    else:
        samples = all_samples
    Xtrain, Ytrain, Xtest, Ytest, CI_data = CI_sampler_conditional_kNN(all_samples[:, Xcoords],
                                                                       all_samples[:, Ycoords],
                                                                       all_samples[:, Zcoords],
                                                                       train_samp, k)
    model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=bp['n_estimator'],
                              max_depth=bp['max_depth'], min_child_weight=1, gamma=0, subsample=0.8,
                              colsample_bytree=bp['colsample_bytree'], objective='binary:logistic',
                              scale_pos_weight=1, seed=11)
    gbm = model.fit(Xtrain, Ytrain)
    pred = gbm.predict_proba(Xtest)
    pred_exact = gbm.predict(Xtest)
    acc1 = accuracy_score(Ytest, pred_exact)
    AUC1 = roc_auc_score(Ytest, pred[:, 1])
    del gbm
    gbm = model.fit(Xtrain[:, len(Xcoords)::], Ytrain)
    pred = gbm.predict_proba(Xtest[:, len(Xcoords)::])
    pred_exact = gbm.predict(Xtest[:, len(Xcoords)::])
    acc2 = accuracy_score(Ytest, pred_exact)
    AUC2 = roc_auc_score(Ytest, pred[:, 1])
    del gbm
    if AUC1 > AUC2 + threshold:
        return [0.0, AUC1 - AUC2, AUC2 - 0.5, acc1 - acc2, acc2 - 0.5]
    else:
        return [1.0, AUC1 - AUC2, AUC2 - 0.5, acc1 - acc2, acc2 - 0.5]
def XGBOUT_Independence(bp, all_samples, train_samp, Xcoords, Ycoords, k, threshold, nthread, bootstrap=True):
    '''Function that takes a CI test data-set and returns classification accuracy after Nearest-Neighbor Bootstrap'''
    num_samp = len(all_samples)
    if bootstrap:
        np.random.seed()
        random.seed()
        I = np.random.choice(num_samp, size=num_samp, replace=True)
        samples = all_samples[I, :]
    else:
        samples = all_samples
    Xtrain, Ytrain, Xtest, Ytest, CI_data = CI_sampler_conditional_kNN(all_samples[:, Xcoords],
                                                                       all_samples[:, Ycoords],
                                                                       None, train_samp, k)
    s1, s2 = Xtrain.shape
    if s2 >= 4:
        model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=bp['n_estimator'],
                                  max_depth=bp['max_depth'], min_child_weight=1, gamma=0, subsample=0.8,
                                  colsample_bytree=bp['colsample_bytree'], objective='binary:logistic',
                                  scale_pos_weight=1, seed=11)
    else:
        model = xgb.XGBClassifier()
    gbm = model.fit(Xtrain, Ytrain)
    pred = gbm.predict_proba(Xtest)
    pred_exact = gbm.predict(Xtest)
    acc1 = accuracy_score(Ytest, pred_exact)
    AUC1 = roc_auc_score(Ytest, pred[:, 1])
    del gbm
    if AUC1 > 0.5 + threshold:
        return [0.0, AUC1 - 0.5, acc1 - 0.5]
    else:
        return [1.0, AUC1 - 0.5, acc1 - 0.5]
def train_xgboost_classifier():
    return mp.ModelProperties(), xgboost.XGBClassifier()
def buildEstimators(mode):
    if mode == 'train' or mode == 'cv':
        # best parameters got by gridsearchCV, best score: 1
        estimators = [('anova_filter', SelectKBest(f_classif, k='all')),
                      ('xgb', xgb.XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=3))]
        clf = Pipeline(estimators)
    elif mode == 'test':
        clf = pickle.load(open(join(classifier_path, "xgb_classifier.plk"), "r"))
    return clf
def threshold_estimate_cv(x, y, k_fold):
    print "%d %d %d" % (y.shape[0], sum(y == 1), sum(y == 0))
    kf1 = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    threshold = np.zeros((k_fold), dtype="float32")
    cnt = 0
    for train_index, test_index in kf1:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        w1 = np.array([1] * y_train.shape[0])
        weight = float(len(y_train[y_train == 0])) / float(len(y_train[y_train == 1]))
        w1 = np.array([1] * y_train.shape[0])
        w1[y_train == 1] = weight
        estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        estimator.fit(x_train, y_train, sample_weight=w1)
        y_scores = estimator.predict_proba(x_test)[:, 1]
        precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
        f1 = 2 * precision[2:] * recall[2:] / (precision[2:] + recall[2:])
        m_idx = np.argmax(f1)
        threshold[cnt] = thresholds[2 + m_idx]
        cnt += 1
        print("%d %f %f" % (precision.shape[0], f1[m_idx], thresholds[2 + m_idx]))
    return np.mean(threshold), threshold

# Cross validation using gradient tree boosting
def parametered_single(x_train, y_train, x_test, y_test, thresh_opt):
    print("samples: %d %d %d %d" % (x_train.shape[0], x_train.shape[1], x_test.shape[0], x_test.shape[1]))
    metrics = np.zeros((1, 5), dtype="float32")
    thresh = 0.5
    # estimate the threshold
    if thresh_opt == 1:
        thresh = threshold_estimate(x_train, y_train)
    clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=500, nthread=50)
    weight = float(sum(y_train < 1)) / float(sum(y_train == 1))
    w1 = np.array([1] * y_train.shape[0])
    w1[y_train == 1] = weight
    clf.fit(x_train, y_train, sample_weight=w1)
    prob = clf.predict_proba(x_test)
    yfit = (prob[:, 1] > thresh)
    precision, recall, f1, mcc = score_function(y_test, yfit)
    metrics = np.array((thresh, precision, recall, f1, mcc))
    print metrics
    importances = clf.feature_importances_
    indices1 = np.argsort(importances)[::-1]
    features1 = np.transpose(np.array((indices1, importances[indices1])))
    pred = np.transpose(np.array((y_test, yfit)))
    return metrics, pred, prob, features1

# Cross validation for PEP-Word
def xgb0(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb0 model")
    clf = xgb.XGBClassifier()
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def xgb150opt(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb150opt model")
    clf = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=3,
                            subsample=0.667, colsample_bytree=1)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def xgb150opt2(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb150opt2 model")
    clf = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1,
                            subsample=0.85263, colsample_bytree=0.657894, reg_alpha=1.55556,
                            reg_lambda=1.22222, gamma=0.3333333)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def __init__(self, trainX, trainY):
    self.trainX = trainX
    self.trainY = trainY
    self.level0 = xgb.XGBClassifier(learning_rate=0.325, silent=True, objective="binary:logistic",
                                    nthread=-1, gamma=0.85, min_child_weight=5, max_delta_step=1,
                                    subsample=0.85, colsample_bytree=0.55, colsample_bylevel=1,
                                    reg_alpha=0.5, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                                    seed=0, missing=None, n_estimators=1920, max_depth=6)
    self.h_param_grid = {'max_depth': hp.quniform('max_depth', 1, 13, 1),
                         'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                         'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
                         'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
                         'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
                         'n_estimators': hp.quniform('n_estimators', 10, 200, 5),
                         }
    self.to_int_params = ['n_estimators', 'max_depth']
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0)
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    y = np.array([])
    t = 0
    z = np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]), 'Class')
            y = np.append(y, temp)
            temp = df.get_value(str(folder_names_train[ind]), 'Age')
            z = np.append(z, np.array([temp]))
        except Exception as e:
            t += 1
            print(t, str(e), "Label Not found, deleting entry")
            y = np.append(y, 0)
    z = np.array([[v] for v in z])
    t = np.concatenate((p, q), axis=1)
    u = np.concatenate((r, s), axis=1)
    x = np.concatenate((t, u), axis=1)
    # print(x.shape)
    # print(x)
    # print(x.shape, z.shape)
    x = np.concatenate((x, z), axis=1)
    # print(x)
    # clf = linear_model.LogisticRegression(C=1e5)
    # clf = RandomForestClassifier()
    clf = xgb.XGBClassifier()
    clf.fit(x, y)
    return clf
def _get_model(self):
    if self._model == 'xgb':
        return XGBClassifier()
    if self._model == 'svc_rbf':
        return SVC()
    if self._model == 'svc_lin':
        return LinearSVC()
    return RFC()
def models():
    params = {'n_jobs': nthread, 'random_state': seed, 'class_weight': None}

    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features='auto', criterion='entropy', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=60, criterion='gini', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)

    # rf = ensemble.RandomForestClassifier(n_estimators=1000, max_features='auto', criterion='gini', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000, max_features=60, criterion='entropy', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)

    # xgb_binlog = XGBClassifier(objective="binary:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)

    rf_params = {'n_estimators': 850, 'max_features': 60, 'criterion': 'entropy', 'min_samples_split': 4,
                 'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}

    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic", max_depth=6, learning_rate=0.01, subsample=.8,
                           n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
    ]
    for clf in clfs:
        yield clf
def test_classifier(loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = dxgb.XGBClassifier()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            a.fit(X2, y2)
            p1 = a.predict(X2)

    b = xgb.XGBClassifier()
    b.fit(X, y)
    np.testing.assert_array_almost_equal(a.feature_importances_,
                                         b.feature_importances_)
    assert_eq(p1, b.predict(X))
def fit(self, X, y=None):
    """Fit a gradient boosting classifier

    Parameters
    ----------
    X : array-like [n_samples, n_features]
        Feature Matrix. May be a dask.array or dask.dataframe
    y : array-like
        Labels

    Returns
    -------
    self : XGBClassifier

    Notes
    -----
    This differs from the XGBoost version in three ways

    1.  The ``sample_weight``, ``eval_set``, ``eval_metric``,
        ``early_stopping_rounds`` and ``verbose`` fit kwargs are not
        supported.
    2.  The labels are not automatically label-encoded
    3.  The ``classes_`` and ``n_classes_`` attributes are not learned
    """
    client = default_client()
    xgb_options = self.get_xgb_params()
    self._Booster = train(client, xgb_options, X, y,
                          num_boost_round=self.n_estimators)
    return self
def process_fold(X_train, X_val, y_train, y_val, X_test):
    # XGBoost
    clf = OneVsRestClassifier(xgb.XGBClassifier(learning_rate=0.005, n_estimators=500))
    clf.fit(X_train, y_train)
    y_p_x = clf.predict_proba(X_val)
    y_p_x_tst = clf.predict_proba(X_test)

    # Keras
    y_p_k, y_p_k_tst = KerasClassifier(X_train, y_train, X_val, y_val, X_test)

    return (y_p_x + y_p_k) / 2.0, (y_p_x_tst + y_p_k_tst) / 2.0
def train_xgb(X_train, y_train, X_test, y_test):
    n_trees = 1000
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=5, objective='multi:softprob', seed=0,
                            silent=True, nthread=-1, learning_rate=0.1)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror", early_stopping_rounds=10)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              subsample,
              colsample_bytree,
              gamma,
              min_child_weight,
              silent=True,
              nthread=-1,
              seed=1234):
    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=seed,
                        objective="binary:logistic")
    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)], early_stopping_rounds=25)
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
def xgboo():
    # Gradient Boosted Trees to grid search
    model = XGBClassifier(seed=random_state, nthread=8)
    parameters = {'max_depth': [3, 6, 9],
                  'n_estimators': [50, 100, 200, 400]}
    grid = GridSearchCV(model, parameters, n_jobs=4, verbose=2)
    return grid
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              # gamma,
              # min_child_weight,
              # max_delta_step,
              subsample,
              colsample_bytree,
              ratio=131.708,
              silent=True,
              nthread=-1,
              seed=42):
    return cross_val_score(XGBClassifier(max_depth=int(max_depth),
                                         learning_rate=learning_rate,
                                         n_estimators=int(n_estimators),
                                         silent=silent,
                                         nthread=nthread,
                                         # gamma=gamma,
                                         # min_child_weight=min_child_weight,
                                         # max_delta_step=max_delta_step,
                                         subsample=subsample,
                                         colsample_bytree=colsample_bytree,
                                         scale_pos_weight=ratio,
                                         seed=seed),
                           X, y, scoring='f1', cv=5).mean()
def select_mdl(self, mdl_type, param):
    """
    # define classifier and parameters
    :param mdl_type: specify which model to initialize
    :param param: a dict storing model parameters
    """
    if (mdl_type == 'xgb'):
        self.mdl = xgb.XGBClassifier(**param)
    elif (mdl_type == 'lr'):
        self.mdl = LogisticRegression(**param)
    elif (mdl_type == 'rf'):
        self.mdl = RandomForestClassifier(**param)
def pipeline(self):
    # This is a property for serialization support with xgboost,
    # because we change self.clf after __init__.
    pipeline = [self.vec]
    if isinstance(self.clf, XGBClassifier):
        # Work around xgboost issue:
        # https://github.com/dmlc/xgboost/issues/1238#issuecomment-243872543
        pipeline.append(CSCTransformer())
    pipeline.append(self.clf)
    return make_pipeline(*pipeline)
def explain_predictions(self, docs, top=30):
    if not isinstance(self.clf, XGBClassifier):
        raise NotImplementedError
    booster = self.clf.booster()
    xgb_feature_names = {f: i for i, f in enumerate(booster.feature_names)}
    feature_names = get_feature_names(self.clf, self.vec,
                                      num_features=len(xgb_feature_names))
    feature_names.bias_name = '<BIAS>'
    X = self.vec.transform(docs)
    X = X.tocsc()
    dmatrix = DMatrix(X, missing=self.clf.missing)
    leaf_ids = booster.predict(dmatrix, pred_leaf=True)
    tree_dumps = booster.get_dump(with_stats=True)
    docs_weights = []
    for i, _leaf_ids in enumerate(leaf_ids):
        all_weights = _target_feature_weights(
            _leaf_ids, tree_dumps,
            feature_names=feature_names,
            xgb_feature_names=xgb_feature_names)[1]
        weights = np.zeros_like(all_weights)
        idx = X[i].nonzero()[1]
        bias_idx = feature_names.bias_idx
        weights[idx] = all_weights[idx]
        weights[bias_idx] = all_weights[bias_idx]
        docs_weights.append(weights)
    weights = np.mean(docs_weights, axis=0)
    feature_weights = get_top_features(
        feature_names=np.array(
            [_prettify_feature(f) for f in feature_names]),
        coef=weights,
        top=top)
    return Explanation(
        estimator=type(self.clf).__name__,
        targets=[TargetExplanation('y', feature_weights=feature_weights)],
    )
def get_attributes(obj):
    if isinstance(obj, TfidfVectorizer):
        return get_tfidf_attributes(obj)
    elif isinstance(obj, XGBClassifier):
        return pickle.dumps(obj)
    elif isinstance(obj, BaseEstimator):
        return {attr: getattr(obj, attr) for attr in dir(obj)
                if not attr.startswith('_') and attr.endswith('_')
                and attr not in skip_attributes}
    elif obj is not None:
        raise TypeError(type(obj))
def set_attributes(parent, field, attributes):
    obj = getattr(parent, field)
    if isinstance(obj, TfidfVectorizer):
        set_ifidf_attributes(obj, attributes)
    elif isinstance(obj, XGBClassifier):
        setattr(parent, field, pickle.loads(attributes))
    elif isinstance(obj, BaseEstimator):
        for k, v in attributes.items():
            try:
                setattr(obj, k, v)
            except AttributeError:
                raise AttributeError(
                    'can\'t set attribute {} on {}'.format(k, obj))
    elif obj is not None:
        raise TypeError(type(obj))
def fit(self, X, y):
    import xgboost as xgb

    self.learning_rate = float(self.learning_rate)
    self.n_estimators = int(self.n_estimators)
    self.subsample = float(self.subsample)
    self.max_depth = int(self.max_depth)
    # (TODO) Gb used at most half of the features, here we use all
    self.colsample_bylevel = float(self.colsample_bylevel)
    self.colsample_bytree = float(self.colsample_bytree)
    self.gamma = float(self.gamma)
    self.min_child_weight = int(self.min_child_weight)
    self.max_delta_step = int(self.max_delta_step)
    self.reg_alpha = float(self.reg_alpha)
    self.reg_lambda = float(self.reg_lambda)
    self.nthread = int(self.nthread)
    self.base_score = float(self.base_score)
    self.scale_pos_weight = float(self.scale_pos_weight)

    # We don't support multilabel, so we only need 1 objective function
    if len(numpy.unique(y)) == 2:
        # We probably have binary classification
        self.objective = 'binary:logistic'
    else:
        self.objective = 'multi:softprob'

    self.estimator = xgb.XGBClassifier(
        max_depth=self.max_depth,
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        silent=self.silent,
        objective=self.objective,
        nthread=self.nthread,
        gamma=self.gamma,
        scale_pos_weight=self.scale_pos_weight,
        min_child_weight=self.min_child_weight,
        max_delta_step=self.max_delta_step,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        colsample_bylevel=self.colsample_bylevel,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        base_score=self.base_score,
        seed=self.seed
    )
    self.estimator.fit(X, y, eval_metric='auc')

    return self
def parametered_cv(x, y, k_fold, k_fold1):
    print("samples: %d %d %d %d" % (x.shape[0], x.shape[1], k_fold, k_fold1))
    kf = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    index = []
    label = []
    yfit = []
    metrics = np.zeros((k_fold, 5), dtype="float32")
    thresholds = []
    predicted = np.array([[0, 0]])
    features1 = np.array([[0, 0]])
    thresh = 0.5
    cnt = 0
    print "Positive: %d Negative: %d" % (sum(y == 1), sum(y == 0))
    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print y_train.shape
        print("%d %d %d %d" % (x_train.shape[0], x_train.shape[1], x_test.shape[0], x_test.shape[1]))
        if k_fold1 > 1:
            thresh, thresh_vec = threshold_estimate_cv(x_train, y_train, k_fold1)
        elif k_fold1 == 1:
            thresh = threshold_estimate(x_train, y_train)
        else:
            thresh = 0.5
        print("%d %f" % (x_train.shape[0], thresh))
        weight = float(len(y_train[y_train == 0])) / float(len(y_train[y_train == 1]))
        w1 = np.array([1] * y_train.shape[0])
        w1[y_train == 1] = weight
        weight1 = float(len(y_test[y_test == 0])) / float(len(y_test[y_test == 1]))
        clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        clf.fit(x_train, y_train, sample_weight=w1)
        prob = clf.predict_proba(x_test)
        yfit1 = (prob[:, 1] > thresh)
        index = np.concatenate((index, test_index), axis=0)
        label = np.concatenate((label, y_test), axis=0)
        yfit = np.concatenate((yfit, yfit1), axis=0)
        precision, recall, f1, mcc = score_function(y_test, yfit1)
        metrics[cnt, :] = np.array((thresh, precision, recall, f1, mcc))
        print metrics[cnt, :]
        cnt += 1
        predicted = np.concatenate((predicted, prob), axis=0)
        importances = clf.feature_importances_
        indices1 = np.argsort(importances)[::-1]
        feature_1 = np.transpose(np.array((indices1, importances[indices1])))
        features1 = np.concatenate((features1, feature_1), axis=0)
    pred = np.transpose(np.array((index, label, yfit)))
    aver_metrics = np.mean(metrics, axis=0)
    aver_metrics = np.reshape(aver_metrics, (1, metrics.shape[1]))
    metrics_1 = np.concatenate((metrics, aver_metrics), axis=0)
    print aver_metrics
    return metrics_1, pred, predicted[1:, ], features1[1:, ]

# Single run using gradient tree boosting
def online(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
    ]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print "Blending."
    # clf = LogisticRegression(C=2, penalty='l2', class_weight='balanced', n_jobs=-1)
    clf = linear_model.RidgeCV(alphas=np.linspace(0, 200), cv=LM_CV_NUM)
    # clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=100)
    clf.fit(dataset_blend_train, y)
    # y_submission = clf.predict_proba(dataset_blend_test)[:,1]
    print clf.coef_, clf.intercept_
    y_submission = clf.predict(dataset_blend_test)  # for RidgeCV

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    print "blend result"
    save_submission(os.path.join(consts.SUBMISSION_PATH,
                                 MODEL_NAME + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                    test_uid, y_submission)
def online2(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
    ]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            clf.fit(X_train, y_train)
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        save_submission(os.path.join(consts.SUBMISSION_PATH,
                                     clf.__class__.__name__ + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                        test_uid, dataset_blend_test[:, j])
def models():
    extra_params_kaggle_cla = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'entropy',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'min_samples_leaf': 2, 'n_jobs': nthread, 'random_state': seed}

    extra_params_kaggle_reg = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'mse',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'min_samples_leaf': 2, 'n_jobs': nthread, 'random_state': seed}

    xgb_reg = {'objective': 'reg:linear', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}

    xgb_cla = {'objective': 'binary:logistic', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}

    # NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402

    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.),
        'optimizer': RMSprop(),
        'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.),
        'optimizer': RMSprop(),
        'nb_epoch': nb_epoch,
    }
    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),
        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),

        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),

        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),

        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
    ]
    for clf in clfs:
        yield clf
def tune_xgb_params_segment_by_grid(estimator_cls: Type[Union[xgb.XGBClassifier, xgb.XGBRegressor]],
                                    label: np.ndarray,
                                    metric_sklearn: str,
                                    n_jobs: int,
                                    param_grid: dict,
                                    params: dict,
                                    strat_folds: StratifiedKFold,
                                    train: np.ndarray,
                                    verbosity_level: int = 10) -> Tuple[dict, float]:
    """
    Grid search over a segment of XGBoost parameters.

    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take -
        e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param param_grid:
        A dictionary of the grid of parameters to be searched over - e.g. {'colsample_bytree': range(0.5, 0.9, 0.1)}
        to search values [0.5, 0.6, 0.7, 0.8].
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    grid = GridSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_jobs=n_jobs,
        param_grid=param_grid,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    grid.fit(train, label)
    best_score = grid.best_score_
    # Massage the score to be in line with what xgboost reports
    if metric_sklearn == 'neg_mean_squared_error':
        best_score = abs(best_score) ** 0.5
    elif metric_sklearn == 'neg_log_loss':
        best_score = abs(best_score)
    return {k: grid.best_params_[k] for k in param_grid.keys()}, best_score
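As a self-contained illustration of the same grid-search pattern (not part of the original project, and bypassing its clean_params_for_sk helper, which is not shown in this listing), one might drive GridSearchCV over an XGBClassifier directly like this; the data and parameter grid are assumptions:

# Illustrative driver for the segment-wise grid search pattern above (assumed data and grid).
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold

rng = np.random.RandomState(42)
train = rng.rand(300, 8)
label = rng.randint(0, 2, size=300)

strat_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
param_grid = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3]}

grid = GridSearchCV(
    cv=strat_folds.split(train, label),
    estimator=xgb.XGBClassifier(n_estimators=50, learning_rate=0.1),
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=1,
)
grid.fit(train, label)
print(grid.best_params_, grid.best_score_)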
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take -
        e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param n_jobs:
        An optional parameter to control the amount of parallel jobs - defaults to the amount of CPUs available.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between
        0.5 and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
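The loc/scale keyword convention above comes from scipy.stats frozen distributions. Below is a short sketch, with assumed bounds and purely for illustration, of how those same distributions behave when sampled directly:

# How the loc/scale keyword convention maps onto scipy.stats distributions (assumed bounds).
from scipy.stats import uniform, halfnorm
from scipy.stats import randint as sp_randint

# uniform(loc, scale) samples from [loc, loc + scale):
colsample_bytree_dist = uniform(0.5, 0.2)   # values in [0.5, 0.7)
# randint(low, high) samples integers in [low, high):
max_depth_dist = sp_randint(2, 11)          # depths 2..10
# halfnorm(loc, scale) samples non-negative offsets from loc:
reg_alpha_dist = halfnorm(0, 5)

print(colsample_bytree_dist.rvs(3, random_state=0))
print(max_depth_dist.rvs(3, random_state=0))
print(reg_alpha_dist.rvs(3, random_state=0))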