The following 31 code examples, extracted from open-source Python projects, illustrate how to use xgboost.XGBRegressor().
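Before the project-extracted examples, here is a minimal, self-contained sketch of the basic fit/predict workflow. The synthetic data and the hyperparameter values below are illustrative assumptions, not settings taken from any of the projects that follow.

# Minimal XGBRegressor sketch (synthetic data; hyperparameters are placeholders)
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fabricated regression data: 500 samples, 10 features, linear target plus noise
X = np.random.rand(500, 10)
y = X @ np.arange(10) + np.random.normal(scale=0.1, size=500)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = xgb.XGBRegressor(n_estimators=200, max_depth=4, learning_rate=0.1)
reg.fit(X_train, y_train)
pred = reg.predict(X_test)
print(mean_squared_error(y_test, pred))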
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0, encoding='UTF-7')
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    y = np.array([])
    t = 0
    z = np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]), 'Survival')
            y = np.append(y, temp)
            temp = df.get_value(str(folder_names_train[ind]), 'Age')
            z = np.append(z, np.array([temp]))
        except Exception as e:
            t += 1
            print(t, str(e), "Label Not found, deleting entry")
            y = np.append(y, 0)
    z = np.array([[v] for v in z])
    t = np.concatenate((p, q), axis=1)
    u = np.concatenate((r, s), axis=1)
    x = np.concatenate((t, u), axis=1)
    #print(x.shape)
    #print(x)
    #print(x.shape, z.shape)
    x = np.concatenate((x, z), axis=1)
    #print(x)
    #clf = linear_model.LogisticRegression(C=1e5)
    #clf = RandomForestRegressor()
    clf = xgb.XGBRegressor()
    clf.fit(x, y)
    return clf
def _train_convert_evaluate(self, bt_params={}, **params):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    # Train a model
    xgb_model = xgboost.XGBRegressor(**params)
    xgb_model.fit(self.X, self.target)

    # Convert the model (feature_names can't be given because of XGboost)
    spec = xgb_converter.convert(xgb_model, self.feature_names, self.output_name, force_32bit_float=False)

    # Get predictions
    df = pd.DataFrame(self.X, columns=self.feature_names)
    df['prediction'] = xgb_model.predict(self.X)

    # Evaluate it
    metrics = evaluate_regressor(spec, df, target='target', verbose=False)
    return metrics
def xgb_model_select(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def train_model_for_appcounts(df):
    app_df = df[['appCount', 'age', 'gender', 'education', 'marriageStatus', 'haveBaby']]
    known_app = app_df[app_df.appCount.notnull()].as_matrix()
    unknown_app = app_df[app_df.appCount.isnull()].as_matrix()
    y = known_app[:, 0]
    X = known_app[:, 1:]

    print 'Train Xgboost Model(For Missing AppCount)...'
    start_time = datetime.datetime.now()
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=3)
    xgb_reg.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)

    predicted_app = xgb_reg.predict(unknown_app[:, 1:])
    df.loc[(df.appCount.isnull()), 'appCount'] = predicted_app
    return df, xgb_reg
def train_model_for_age(df):
    age_df = df[['age', 'appCount', 'gender', 'education', 'marriageStatus', 'haveBaby']]
    known_age = age_df[age_df.age != 0].as_matrix()
    unknown_age = age_df[age_df.age == 0].as_matrix()
    y = known_age[:, 0]
    X = known_age[:, 1:]

    print 'Train Xgboost Model(For Missing Age)...'
    start_time = datetime.datetime.now()
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=3)
    xgb_reg.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)

    predicted_age = xgb_reg.predict(unknown_age[:, 1:])
    df.loc[(df.age == 0), 'age'] = predicted_age
    return df, xgb_reg
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)

    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')

    '''print 'Train And Fix Missing Age Value...'
    train_df, xgb_age = train_model_for_age(train_df)
    joblib.dump(xgb_age, 'XGB_age.model')'''

    train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'], axis=1, inplace=True)
    print 'Done'
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()

    train_np = train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Train Xgboost Model...'
    start_time = datetime.datetime.now()
    xbg_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False)
    xbg_clf.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)

    model_df = pd.DataFrame({'columns': list(train_df.columns)[1:], 'values': xbg_clf.feature_importances_})
    print model_df
    return xbg_clf
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())

    x = np.array([np.mean(np.load('npy_result/%s.npy' % str(id)), axis=0) for id in df['id'].tolist()])
    y = df['cancer'].as_matrix()

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y, test_size=0.20)

    clf = xgb.XGBRegressor(max_depth=10, n_estimators=1500, min_child_weight=9, learning_rate=0.05, nthread=8,
                           subsample=0.80, colsample_bytree=0.80, seed=4242)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
    return clf
def try_params(n_iterations, params, get_predictions=False):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    model = XGB(n_estimators=n_estimators, nthread=-1, **params)
    return train_and_eval_sklearn_regressor(model, data)
def print_results(self, model_name):
    if self.ml_for_analytics and model_name in ('LogisticRegression', 'RidgeClassifier', 'LinearRegression', 'Ridge'):
        self._print_ml_analytics_results_linear_model()
    elif self.ml_for_analytics and model_name in ['RandomForestClassifier', 'RandomForestRegressor', 'XGBClassifier', 'XGBRegressor', 'GradientBoostingRegressor', 'GradientBoostingClassifier', 'LGBMRegressor', 'LGBMClassifier']:
        self._print_ml_analytics_results_random_forest()
def _get_xgb_feat_importances(self, clf):
    try:
        # Handles case when clf has been created by calling
        # xgb.XGBClassifier.fit() or xgb.XGBRegressor().fit()
        fscore = clf.booster().get_fscore()
    except:
        # Handles case when clf has been created by calling xgb.train.
        # Thus, clf is an instance of xgb.Booster.
        fscore = clf.get_fscore()

    trained_feature_names = self._get_trained_feature_names()

    feat_importances = []

    # Somewhat annoying. XGBoost only returns importances for the features it finds useful.
    # So we have to recover the index of each feature from its "feature name" by stripping the
    # leading "f"; the rest of the string is the index of that feature.
    fscore_list = [[int(k[1:]), v] for k, v in fscore.items()]

    feature_infos = []
    sum_of_all_feature_importances = 0.0

    for idx_and_result in fscore_list:
        idx = idx_and_result[0]
        # Use the index that we grabbed above to find the human-readable feature name
        feature_name = trained_feature_names[idx]
        feat_importance = idx_and_result[1]

        # Summing all importances and dividing each by that sum gives relative importances
        # that add up to 1, just as in scikit-learn.
        sum_of_all_feature_importances += feat_importance
        feature_infos.append([feature_name, feat_importance])

    sorted_feature_infos = sorted(feature_infos, key=lambda x: x[1])

    print('Here are the feature_importances from the tree-based model:')
    print('The printed list will only contain at most the top 50 features.')
    for feature in sorted_feature_infos[-50:]:
        print(str(feature[0]) + ': ' + str(round(feature[1] / sum_of_all_feature_importances, 4)))
def _print_ml_analytics_results_random_forest(self):
    try:
        final_model_obj = self.trained_final_model.named_steps['final_model']
    except:
        final_model_obj = self.trained_final_model

    print('\n\nHere are the results from our ' + final_model_obj.model_name)
    if self.name is not None:
        print(self.name)
    print('predicting ' + self.output_column)

    # XGB's Classifier has a proper .feature_importances_ property, while the XGBRegressor does not.
    if final_model_obj.model_name in ['XGBRegressor', 'XGBClassifier']:
        self._get_xgb_feat_importances(final_model_obj.model)
    else:
        trained_feature_names = self._get_trained_feature_names()
        try:
            trained_feature_importances = final_model_obj.model.feature_importances_
        except AttributeError as e:
            # There was a version of LightGBM that had this misnamed to miss the "s" at the end
            trained_feature_importances = final_model_obj.model.feature_importance_

        feature_infos = zip(trained_feature_names, trained_feature_importances)
        sorted_feature_infos = sorted(feature_infos, key=lambda x: x[1])

        print('Here are the feature_importances from the tree-based model:')
        print('The printed list will only contain at most the top 50 features.')
        for feature in sorted_feature_infos[-50:]:
            print(feature[0] + ': ' + str(round(feature[1], 4)))
def setClf(self):
    self.clf = XGBRegressor(max_depth=7, learning_rate=0.01, n_estimators=100)
    return
def create_features(user_id, is_exp,
                    feature_cloumn_func=lambda day: get_feature_cloumn(None, day, has_user_type=False),
                    load_exp_func=load_user_exp_model,
                    load_func=load_user_model,
                    is_exp_power=False):
    print user_id
    dataset = get_month_by_id(user_id)
    result = []
    for day in range(1, 32):
        feature_column = feature_cloumn_func(day)
        x_ = dataset[feature_column]
        trainer = xgb.XGBRegressor()
        if is_exp:
            if is_exp_power:
                x_ = exp_power(x_)
            load_exp_func(trainer, day, user_id)
        else:
            load_func(trainer, day, user_id)
        y_p = trainer.predict(x_)
        y_p = pd.Series(y_p, name='y_p#%d' % (day - 1))
        if not is_exp:
            y_p = np.exp(y_p)
        result.append(y_p)
    result = pd.DataFrame(result).T
    result.index = dataset.index
    for day in range(31):
        result['real#%d' % day] = dataset['y#%d' % day].apply(np.exp)
    sys.stdout.flush()
    return result
def train_xgboost_regressor():
    return mp.ModelProperties(regression=True), xgboost.XGBRegressor()
def apply_filler(self, x_train, y_train, x_test):
    model = xgboost.XGBRegressor()
    model = model.fit(x_train, y_train)
    return model.predict(x_test)
def test_unsupported_conversion(self):
    feature_names = self.scikit_data.feature_names
    output_name = 'target'

    xgb_model = xgboost.XGBRegressor(objective='reg:gamma')
    xgb_model.fit(self.scikit_data.data, self.scikit_data.target)
    with self.assertRaises(ValueError):
        spec = xgb_converter.convert(xgb_model, feature_names, 'target')

    xgb_model = xgboost.XGBRegressor(objective='reg:tweedie')
    xgb_model.fit(self.scikit_data.data, self.scikit_data.target)
    with self.assertRaises(ValueError):
        spec = xgb_converter.convert(xgb_model, feature_names, 'target')
def test():
    iris = load_iris()
    xgb_model = xgb.XGBRegressor(n_estimators=300000, max_depth=2)
    xgb_model.fit(iris.data[:120], iris.target[:120])

    predict = xgb_model.predict(iris.data[:120])
    print mean_squared_error(iris.target[:120], predict)

    pred = xgb_model.predict(iris.data[120:])
    print mean_squared_error(iris.target[120:], pred)
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklean=False, xgb_params=None):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(test_data, 'eval'), (train_data, 'train')]
    #if xgb_params == None:
    #    xgb_params = get_default_xgboost_params()

    if not use_cv:
        num_rounds = 10
    else:
        cvresult = xgb.cv(xgb_params, train_data, num_boost_round=100, nfold=5, metrics={'rmse'}, show_progress=True)
        print cvresult
        num_rounds = len(cvresult)

    gbdt = None
    if use_sklean:
        #gbdt = xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
        xgb_params['n_estimators'] = num_rounds
        # The params dict has to be unpacked into keyword arguments for the sklearn wrapper.
        gbdt = xgboost.XGBRegressor(**xgb_params)
        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)
        return gbdt, y_pred
    else:
        #gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True, early_stopping_rounds=5)
        gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True)
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt, feature_names=features)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
        return XGBoostModel(gbdt), y_pred
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
    # print df.head()

    x = []
    y = []
    did = df['id'].tolist()
    cancer = df['cancer'].tolist()
    for i in range(len(df)):
        if os.path.isfile('data/stage1/%s.npy' % did[i]):
            f = np.load('data/stage1/%s.npy' % did[i])
            f = f.reshape(f.shape[0], 2048)
            x.append(np.mean(f, axis=0))
            y.append(cancer[i])

    x = np.array(x)
    print x.shape
    y = np.array(y)

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=822, stratify=y, test_size=0.1)

    clfs = []
    for s in range(5):
        # Some parameters were taken from discussion.
        clf = xgb.XGBRegressor(n_estimators=1000, max_depth=10, min_child_weight=10, learning_rate=0.01,
                               subsample=0.80, colsample_bytree=0.70, seed=822 + s, reg_alpha=0.1)
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=100)
        clfs.append(clf)
    return clfs
def xgbr(X, y):
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, random_state=0)
    xgbr_boost = xgb.XGBRegressor(seed=1)
    xgbr_boost.fit(X_train, y_train.ravel())
    print 'training error:', 1.0 - xgbr_boost.score(X_train, y_train)
    print 'validation error:', 1.0 - xgbr_boost.score(X_validation, y_validation)
    time_fit(xgbr_boost, X_train, y_train.ravel())
def __init__(self):
    self.name = "onegbm"
    self.m = Pipeline([
        ("drop", FeatureRemover(["UPBS", "UPBE", "SCMN", "earthmars_km", "OCC_MARS_200KM_START_", "sa_monthly"])),
        ("gbm", xgboost.XGBRegressor(max_depth=7, n_estimators=1000, learning_rate=0.05, silent=1, seed=42))
    ])
def models():
    params = {'n_jobs': nthread, 'random_state': seed, 'class_weight': None}

    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features='auto', criterion='entropy', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=60, criterion='gini', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # rf = ensemble.RandomForestClassifier(n_estimators=1000, max_features='auto', criterion='gini', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000, max_features=60, criterion='entropy', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)

    # xgb_binlog = XGBClassifier(objective="binary:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)

    rf_params = {'n_estimators': 850, 'max_features': 60, 'criterion': 'entropy', 'min_samples_split': 4, 'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}

    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
    ]
    for clf in clfs:
        yield clf
def test_regressor(loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = dxgb.XGBRegressor()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            a.fit(X2, y2)
            p1 = a.predict(X2)

    b = xgb.XGBRegressor()
    b.fit(X, y)
    assert_eq(p1, b.predict(X))
def make_model(params):
    return xgb.XGBRegressor(**params)
def xgb_Fit(knownX, knownY, preX):
    xlf = xgb.XGBRegressor(max_depth=11,
                           learning_rate=0.01,
                           n_estimators=301,
                           silent=True,
                           objective=mape,
                           gamma=0,
                           min_child_weight=5,
                           max_delta_step=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           colsample_bylevel=1,
                           reg_alpha=1e0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=9,
                           missing=None)
    x_train, x_test, y_train, y_test = train_test_split(knownX, knownY, test_size=0.5, random_state=1)
    for i in range(y_train.shape[1]):
        xlf.fit(x_train, y_train[:, i].reshape(-1, 1), eval_metric=mape, verbose=False)
        # eval_set=[(x_test, y_test[:, i].reshape(-1, 1))], early_stopping_rounds=2)
        tempPre = xlf.predict(preX).reshape(-1, 1)
        if i == 0:
            Y_pre = tempPre
        else:
            Y_pre = np.c_[Y_pre, tempPre]
    Y_pre = Y_pre.reshape(-1, 1)
    return Y_pre

# model gridsearch
def xgb_Fit(knownX, knownY, preX):
    xlf = xgb.XGBRegressor(max_depth=7,           # 11
                           learning_rate=0.06,    # 0.01
                           n_estimators=1000,
                           silent=True,
                           objective=mapeobj,
                           gamma=0,
                           min_child_weight=5,
                           max_delta_step=0,
                           subsample=1,           # 0.8
                           colsample_bytree=0.8,
                           colsample_bylevel=1,
                           reg_alpha=1e0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1850,
                           missing=None)
    x_train, x_test, y_train, y_test = train_test_split(knownX, knownY, test_size=0.5, random_state=1)
    for i in range(y_train.shape[1]):
        xlf.fit(x_train, y_train[:, i].reshape(-1, 1))
        # print('Training Error: {:.3f}'.format(1 - xlf.score(x_train, y_train[:, i].reshape(-1, 1))))
        # print('Validation Error: {:.3f}'.format(1 - xlf.score(x_test, y_test[:, i].reshape(-1, 1))))
        # predict value for output
        tempPre = xlf.predict(preX).reshape(-1, 1)
        if i == 0:
            Y_pre = tempPre
        else:
            Y_pre = np.c_[Y_pre, tempPre]
    Y_pre = Y_pre.reshape(-1, 1)
    return Y_pre
def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train, Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red')
    #clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000, max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000, max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000, max_depth=26, n_jobs=7)

    predict_list = []
    for i in xrange(TEST_SIZE):
        X = [[x] for x in xrange(i, TRAIN_SIZE + i)]
        clf.fit(X, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict([TRAIN_SIZE + 1 + i])
        predict_list.append(y_pred)

    print "mean_squared_error:%s" % mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s" % np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s" % origin_data

    plt.plot([x for x in xrange(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)], predict_list, linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
def fit(self, X, y, refit=False):
    import xgboost as xgb

    self.learning_rate = float(self.learning_rate)
    self.n_estimators = int(self.n_estimators)
    self.subsample = float(self.subsample)
    self.max_depth = int(self.max_depth)

    # (TODO) Gb used at most half of the features, here we use all
    self.colsample_bylevel = float(self.colsample_bylevel)
    self.colsample_bytree = float(self.colsample_bytree)

    self.gamma = float(self.gamma)
    self.min_child_weight = int(self.min_child_weight)
    self.max_delta_step = int(self.max_delta_step)
    self.reg_alpha = float(self.reg_alpha)
    self.reg_lambda = float(self.reg_lambda)
    self.nthread = int(self.nthread)
    self.base_score = float(self.base_score)
    self.scale_pos_weight = float(self.scale_pos_weight)
    self.objective = 'reg:linear'

    self.estimator = xgb.XGBRegressor(
        max_depth=self.max_depth,
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        silent=self.silent,
        objective=self.objective,
        nthread=self.nthread,
        gamma=self.gamma,
        scale_pos_weight=self.scale_pos_weight,
        min_child_weight=self.min_child_weight,
        max_delta_step=self.max_delta_step,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        colsample_bylevel=self.colsample_bylevel,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        base_score=self.base_score,
        seed=self.seed
    )
    self.estimator.fit(X, y)
    return self
def models():
    extra_params_kaggle_cla = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'entropy',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}

    extra_params_kaggle_reg = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'mse',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}

    xgb_reg = {'objective': 'reg:linear', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}

    xgb_cla = {'objective': 'binary:logistic', 'max_depth': 11, 'learning_rate': 0.01, 'subsample': .9,
               'n_estimators': 10000, 'colsample_bytree': 0.45, 'nthread': nthread, 'seed': seed}

    # NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402

    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.),
        'optimizer': RMSprop(),
        'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(), advanced_activations.PReLU(), core.activations.sigmoid),
        'dropout': (0., 0.),
        'optimizer': RMSprop(),
        'nb_epoch': nb_epoch,
    }

    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),
        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),
        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
        #
        # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
        # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param2)),
    ]

    for clf in clfs:
        yield clf
def tune_xgb_params_segment_by_grid(estimator_cls: Type[Union[xgb.XGBClassifier, xgb.XGBRegressor]],
                                    label: np.ndarray,
                                    metric_sklearn: str,
                                    n_jobs: int,
                                    param_grid: dict,
                                    params: dict,
                                    strat_folds: StratifiedKFold,
                                    train: np.ndarray,
                                    verbosity_level: int = 10) -> Tuple[dict, float]:
    """
    Grid search over a segment of XGBoost parameters.

    :param estimator_cls: The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn: The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs: The number of jobs to run simultaneously.
    :param param_grid: A dictionary of the grid of parameters to be searched over - e.g. {'colsample_bytree': range(0.5, 0.9, 0.1)}
        to search values [0.5, 0.6, 0.7, 0.8].
    :param params: A dictionary of XGB parameters.
    :param strat_folds: A StratifiedKFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :param verbosity_level: An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :return: A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    grid = GridSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_jobs=n_jobs,
        param_grid=param_grid,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    grid.fit(train, label)
    best_score = grid.best_score_

    # Massage the score to be in line with what xgboost reports
    if metric_sklearn == 'neg_mean_squared_error':
        best_score = abs(best_score) ** 0.5
    elif metric_sklearn == 'neg_log_loss':
        best_score = abs(best_score)

    return {k: grid.best_params_[k] for k in param_grid.keys()}, best_score
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls: The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn: The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs: The number of jobs to run simultaneously.
    :param params: A dictionary of XGB parameters.
    :param strat_folds: A StratifiedKFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :param n_iter: An optional parameter to control the number of parameter settings that are sampled.
    :param n_jobs: An optional parameter to control the amount of parallel jobs - defaults to the amount of CPUs available.
    :param verbosity_level: An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs: Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7
        for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return: A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)

    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]