我们从Python开源项目中,提取了以下49个代码示例,用于说明如何使用xgboost.train()。
def fit(self, X, y, Xg, Xt=None, yt=None, Xgt=None, load_model=None, save_model=None):
    """Train a grouped (ranking) xgboost booster.

    Xg/Xgt are the group-size vectors for the train/validation DMatrix.
    When a validation set (Xt, yt, Xgt) is given, training uses early
    stopping and maximizes the metric. The booster is kept on self.bst and
    optionally saved to ``save_model``.
    """
    print(X.shape, y.shape)
    rounds = self.params['num_round']
    early_stop = self.params['early_stopping_rounds']
    dtrain = xgb.DMatrix(X, y)
    dtrain.set_group(Xg)
    if Xt is None:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, rounds, evals=watchlist,
                        verbose_eval=1, xgb_model=load_model)
    else:
        dvalid = xgb.DMatrix(Xt, yt)
        dvalid.set_group(Xgt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, rounds, evals=watchlist,
                        early_stopping_rounds=early_stop, verbose_eval=1,
                        xgb_model=load_model, maximize=True)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split, y_test_split, title_name):
    """Train a booster on the split data and scatter-plot predicted vs. actual y.

    NOTE: ``xgb`` here is a *parameter* — the xgboost module object is passed
    in by the caller rather than imported at module scope.
    """
    # Split the training data into an extra set of test
    # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split),
          np.shape(y_train_split), np.shape(y_test_split))
    booster = xgb.train(xgb_params, dtrain, best_nrounds)
    preds = booster.predict(dtest)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, preds, s=20)
    rmse_value = self.rmse(preds, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)]))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    lo, hi = min(y_test_split), max(y_test_split)
    plt.plot([lo, hi], [lo, hi])  # identity line for reference
    plt.tight_layout()
def run_gbm(dtrain, dvalid, param):
    """Train a booster on ``dtrain`` with early stopping against ``dvalid``.

    Training knobs (num_boost_round, early_stopping_rounds, verbose_eval)
    are read from ``param`` with sensible defaults.
    """
    return xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=param.get('num_boost_round', 5000),
        evals=[(dvalid, 'eval')],  # validation set to watch performance
        early_stopping_rounds=param.get('early_stopping_rounds', 5),
        verbose_eval=param.get('verbose_eval', 100),
    )
def crate_pre_train_model(x_, y_):
    """Split x_/y_ 90/10, train a linear-objective booster with early stopping.

    (Function name typo "crate" is kept for backward compatibility with callers.)
    """
    x_train, x_test = train_test_split(x_, test_size=0.1, random_state=1)
    y_train, y_test = train_test_split(y_, test_size=0.1, random_state=1)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    param = {'objective': 'reg:linear', 'max_depth': 3, 'nthread': 64}
    plst = param.items()
    return xgb.train(plst, dtrain, 5000, evallist, early_stopping_rounds=100)
# %% main
def get_data(item='train', id=1, is_shuffle=False, is_subtrain=1):
    """Read a split list file and return (feature file paths, 0-based labels).

    Reads ``<metadata_root>/<item>_list0<id>.txt`` where each line looks like
    ``<dir>/<name>.<ext> <label>``; labels in the file are 1-based.

    is_subtrain < 1 keeps a random fraction of the lines; is_shuffle shuffles
    them first.

    BUG FIX: the original concatenated ``id`` (default int 1) to a string,
    raising TypeError for the default; ``str(id)`` accepts both int and str.
    """
    file_path = os.path.join(metadata_root, item + '_list0' + str(id) + '.txt')
    files = []
    labels = []
    with open(file_path, 'r') as fp:
        lines = fp.readlines()
    if is_shuffle:
        np.random.shuffle(lines)
    if is_subtrain != 1:
        lines = random.sample(lines, int(len(lines) * is_subtrain))
    for line in lines:
        # "<dir>/<prefix>.<ext> <label>" -> feature file "<prefix>.npy"
        tmp_prefix = line.strip().split('.')[0].split('/')[1]
        label_tmp = line.strip().split(' ')[1]
        files.append(os.path.join(feature_root, tmp_prefix + '.npy'))
        labels.append(int(label_tmp) - 1)  # shift to 0-based labels
    return files, np.array(labels, dtype=np.float64)
def tune_num_boost_round():
    """Train once and set the global ``num_boost_round`` to the round with the
    best (maximized) eval metric.

    Relies on module globals: params_no_sklearn, dtrain, watchlist,
    eval_metric_xgb_format, num_boost_round, evals_result.
    """
    # global watchlist
    global num_boost_round
    global evals_result
    global eval_metric_xgb_format
    evals_result = {}
    xgb.train(params=params_no_sklearn, dtrain=dtrain,
              num_boost_round=num_boost_round, evals=watchlist,
              evals_result=evals_result)
    evals_result = evals_result['eval'][eval_metric_xgb_format]
    # pprint.pprint(evals_result)
    # Find the earliest round achieving the highest metric value.
    # Renamed from `max`/`max_loc`: `max` shadowed the builtin.
    best_score = 0.0
    best_iter = 0
    for i, v in enumerate(evals_result):
        if v > best_score:
            best_score = v
            best_iter = i
    num_boost_round = best_iter + 1  # rounds are 1-based
    print('**** num_boost_round : ', num_boost_round)
def tune_num_boost_round():
    """Train once and set the global ``num_boost_round`` to the round with the
    best 'map' eval value.

    FIXES: converted Python-2 print statements to print() (the sibling
    implementation in this file already uses py3 prints), and replaced the
    ``%d`` formatting that silently truncated the float metric with ``%s``.
    """
    # global watchlist
    global num_boost_round
    global evals_result
    evals_result = {}
    xgb.train(params=params_no_sklearn, dtrain=dtrain,
              num_boost_round=num_boost_round, evals=watchlist,
              evals_result=evals_result)
    evals_result = evals_result['eval']['map']
    pprint.pprint(evals_result)
    # Earliest round with the highest MAP; renamed from `max`/`max_loc`
    # (shadowed the builtin).
    best_score = 0.0
    best_iter = 0
    for i, v in enumerate(evals_result):
        if v > best_score:
            best_score = v
            best_iter = i
    print("max_loc : %s , max : %s" % (best_iter, best_score))
    num_boost_round = best_iter + 1
    print('**** num_boost_round : ', num_boost_round)
def load_data():
    """Load train/test CSVs from ``data_folder`` and return (x_train, y_train, x_test).

    Drops id/target columns and normalizes the literal string 'None' in the
    test features to NaN.

    BUG FIX: ``DataFrame.replace`` is not in-place — the original discarded
    its result, so 'None' strings were never cleaned; the result is now
    assigned back.
    """
    train_data = pd.read_csv(os.path.join(data_folder, 'train.csv'),
                             delimiter=';', skip_blank_lines=True)
    test_data = pd.read_csv(os.path.join(data_folder, 'test.csv'),
                            delimiter=';', skip_blank_lines=True,
                            na_values='None')
    ntrain = train_data.shape[0]
    ntest = test_data.shape[0]
    print('ntrain={}'.format(ntrain))
    print('ntest={}'.format(ntest))
    y_train = train_data['cardio'].values
    # --------------------------------------------------------------
    x_train = train_data.drop(["id", "cardio"], axis=1)
    x_test = test_data.drop(["id"], axis=1)
    x_test = x_test.replace('None', np.nan)
    return (x_train, y_train, x_test)
# ---------------------------------------------------------------------
def main():
    """CLI entry point: resolve hyperparameters for a tag, then train.

    If no hyperparameters exist for the tag, defaults are created from the
    deprecated CLI flags and persisted before training.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tag')
    parser.add_argument('--train-recordfile', default='train',
                        help='identifier for file with the users to train on (default: train). deprecated: specify in hps...')
    parser.add_argument('-n', '--n-rounds', type=int, default=50,
                        help='Number of rounds of boosting. Deprecated: specify this in hp config file')
    parser.add_argument('--weight', action='store_true',
                        help='Whether to do per-instance weighting. Deprecated: specify in hps')
    args = parser.parse_args()
    try:
        hps = hypers.hps_for_tag(args.tag)
    except hypers.NoHpsDefinedException:
        # First run for this tag: build defaults from the CLI flags and save.
        logging.warn('No hps found for tag {}. Creating and saving some.'.format(args.tag))
        hps = hypers.get_default_hparams()
        hps.train_file = args.train_recordfile
        hps.rounds = args.n_rounds
        hps.weight = args.weight
        hypers.save_hps(args.tag, hps)
    validate_hps(hps)
    dataset = Dataset(hps.train_file, hps)
    with time_me(mode='stderr'):
        train(dataset, args.tag, hps)
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    """Train a multi-class booster per ``train_config`` and log test accuracy.

    ``train_config`` supplies "param" (booster params) and "num_round".
    Returns None; training can be aborted with Ctrl-C.
    """
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(dtrain, 'train'), (dtest, 'test')]
    try:
        booster = xgb.train(train_config["param"], dtrain,
                            int(train_config["num_round"]), watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
        return
    # predict() yields per-class probabilities; argmax picks the class.
    y_pred = np.argmax(booster.predict(dtest), axis=1)
    accuracy = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(accuracy))
def train_relatedness_classifier(trainX, trainY):
    """Train a binary logistic xgboost classifier for relatedness.

    (The original comment said "softmax multi-class", but the objective is
    binary:logistic.)
    """
    dtrain = xgb.DMatrix(trainX, label=trainY)
    params = {
        'objective': 'binary:logistic',  # binary classification
        'eta': 0.1,                      # scale weight of positive examples
        'max_depth': 6,
        'silent': 1,
        'nthread': 20,
    }
    return xgb.train(params, dtrain, 1000)
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    """Pick a round count by CV, train, and plot predicted vs. actual sale price."""
    # Split the training data into an extra set of test
    x_tr, x_te, y_tr, y_te = train_test_split(x_train, y_train)
    dtrain = xgb.DMatrix(x_tr, label=y_tr)
    dtest = xgb.DMatrix(x_te)
    cv_res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4,
                    seed=seed, stratified=False, early_stopping_rounds=25,
                    verbose_eval=10, show_stdv=True)
    best_nrounds = cv_res.shape[0] - 1  # rows of cv output = rounds actually run
    print(np.shape(x_tr), np.shape(x_te), np.shape(y_tr), np.shape(y_te))
    booster = xgb.train(xgb_params, dtrain, best_nrounds)
    preds = booster.predict(dtest)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_te, preds, s=20)
    rmse_value = self.rmse(preds, y_te)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    lo, hi = min(y_te), max(y_te)
    plt.plot([lo, hi], [lo, hi])  # identity reference line
    plt.tight_layout()
def fit(self, X, y):
    """Fit the LightGBM model.

    When ``self.use_mspe`` is truthy, a fresh booster is trained with the
    custom MSPE objective/eval; otherwise ``self.gbm.fit`` is called —
    this assumes ``self.gbm`` was already constructed elsewhere (TODO confirm).
    """
    if self.use_mspe:
        # Train and validate on the same data with unit sample weights.
        lgb_train = lgb.Dataset(X, y, weight=np.ones(X.shape[0]),
                                free_raw_data=False)
        lgb_test = lgb.Dataset(X, y, reference=lgb_train,
                               weight=np.ones(X.shape[0]),
                               free_raw_data=False)
        self.gbm = lgb.train(
            self.kwargs,
            lgb_train,
            num_boost_round=10,
            fobj=mspe,               # custom objective
            feval=evalerror_lgbm,    # custom eval metric
            valid_sets=lgb_test)
    else:
        # NOTE(review): the split result is unused below — the model is fit
        # and evaluated on the full (X, y). Possibly intentional; verify.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3)
        #lgb_test = lgb.Dataset(X, y, reference=lgb_train,
        #                       weight=np.ones(X.shape[0]),
        #                       free_raw_data=False)
        self.gbm.fit(X, y, early_stopping_rounds=10,
                     eval_set=[(X, y)], verbose=False)
        #print "gbm best_iteration=", self.gbm.best_iteration
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    # Skip setup entirely when either optional dependency is missing.
    if not (HAS_XGBOOST and HAS_SKLEARN):
        return
    scikit_data = load_boston()
    dtrain = xgboost.DMatrix(scikit_data.data,
                             label=scikit_data.target,
                             feature_names=scikit_data.feature_names)
    # Save the data and the model
    self.scikit_data = scikit_data
    self.xgb_model = xgboost.train({}, dtrain, 1)  # single boosting round
    self.feature_names = self.scikit_data.feature_names
def _train_convert_evaluate(self, bt_params=None, **params):
    """
    Train a booster, convert it to a CoreML spec, and evaluate the spec.

    FIX: replaced the mutable default argument ``bt_params={}`` with the
    None-sentinel idiom (behaviorally equivalent for all callers).
    """
    if bt_params is None:
        bt_params = {}
    # Train a model
    xgb_model = xgboost.train(bt_params, self.dtrain, **params)

    # Convert the model
    spec = xgb_converter.convert(xgb_model, self.feature_names,
                                 self.output_name, force_32bit_float=False)

    # Get predictions
    df = pd.DataFrame(self.X, columns=self.feature_names)
    df['prediction'] = xgb_model.predict(self.dtrain)

    # Evaluate it
    metrics = evaluate_regressor(spec, df, target='target', verbose=False)
    return metrics
def regression_with_xgboost_no_cv(x_train, y_train, X_test, Y_test, features=None,
                                  xgb_params=None, num_rounds=10):
    """Train an xgboost regressor (no CV) with early stopping on a held-out eval set.

    Returns (XGBoostModel wrapper, test predictions). For gbtree boosters,
    also emits a feature map and feature-importance report.

    FIX: converted Python-2 print statements to print() calls for
    consistency with the rest of the file.
    """
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(train_data, 'train'), (test_data, 'eval')]
    if xgb_params is None:
        xgb_params = get_default_xgboost_params()
        print("xgb_params not found")
    print("XGBoost, using param", xgb_params)
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist,
                     verbose_eval=True, early_stopping_rounds=5)
    isgbtree = xgb_params["booster"] == "gbtree"
    if isgbtree:
        # Tree boosters expose feature importances; linear ones do not.
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')),
                              ntree_limit=gbdt.best_ntree_limit)
    else:
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')))
    return XGBoostModel(gbdt), y_pred
def predict(self, X):
    """Score X with a previously trained libFM model via the external binary.

    Writes X as an svmlight file, invokes the libFM executable with the
    saved model, and reads its output file back as a flat array.

    FIX: converted Python-2 print statements to print() calls.
    """
    train_file = os.path.join(self.tmp_dir, 'train.svm')
    pred_file = os.path.join(self.tmp_dir, 'pred.svm')
    out_file = os.path.join(self.tmp_dir, 'out.txt')
    print("Exporting pred...")
    with open(pred_file, 'w') as f:
        # Labels are irrelevant for prediction; zeros are placeholders.
        dump_svmlight_file(X, np.zeros(X.shape[0]), f=f)
    params = self.params.copy()
    params['iter'] = 0          # no training iterations: predict only
    params['task'] = 'r'
    params['train'] = train_file
    params['test'] = pred_file
    params['out'] = out_file
    params['load_model'] = os.path.join(self.tmp_dir, 'model.libfm')
    cli_args = " ".join("-{} {}".format(k, params[k]) for k in params)
    command = "{} {}".format(self.exec_path, cli_args)
    print(command)
    # NOTE(review): os.system builds a shell string from paths/params —
    # acceptable for trusted local values, but prefer
    # subprocess.run([...], shell=False) if any value can be untrusted.
    os.system(command)
    return pd.read_csv(out_file, header=None).values.flatten()
def ExtGBDT(train_x, train_y, test_x, test_y):
    """ Ext-GBDT: train a binary xgboost classifier, print evaluation, return AUC.

    IDIOM: the index loop building predict_y was replaced with an
    equivalent list comprehension (threshold at 0.5).
    """
    num_round = 100
    param = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eta': 0.03,
             'max_depth': 3, 'eval_metric': 'auc', 'silent': 1,
             'min_child_weight': 0.1, 'subsample': 0.7,
             'colsample_bytree': 0.8, 'nthread': 4, 'max_delta_step': 0}
    train_X = xgb.DMatrix(train_x, train_y)
    test_X = xgb.DMatrix(test_x)
    bst = xgb.train(param, train_X, num_round)
    pred = bst.predict(test_X)
    # Hard labels from predicted probabilities.
    predict_y = [0 if p < 0.5 else 1 for p in pred]
    auc = evaluate_auc(pred, test_y)
    evaluate(predict_y, test_y)
    return auc
def ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed, bag_round, params,
                                  X_test, using_notebook=True, num_boost_round=0):
    '''
    Bagged LightGBM predictions: train ``bag_round`` models on reshuffled
    data with distinct seeds and average their test predictions.

    BUG FIX: the original ran the bagging loop only when
    ``using_notebook=True``; with False it silently returned all zeros.
    The loop now always runs; tqdm_notebook only wraps it for progress
    display in notebooks.
    '''
    # array to accumulate the averaged predictions
    baggedpred = np.zeros(shape=X_test.shape[0]).astype(np.float32)
    rounds = range(0, bag_round)
    if using_notebook:
        rounds = tqdm_notebook(rounds)
    # loop for as many times as we want bags
    for n in rounds:
        # shuffle first, aids in increasing variance and forces different results
        X_train, y_train = shuffle(X_train, y_train, random_state=seed + n)
        params['seed'] = seed + n
        model = lightgbm.train(params, lightgbm.Dataset(X_train, y_train),
                               num_boost_round=num_boost_round)
        baggedpred += model.predict(X_test) / bag_round
    return baggedpred
def do_run(self, train, predict, window):
    """Load (or build and cache) window features, then train and/or predict.

    FIXES: file handles are now managed with context managers instead of
    manual open/close; removed the unused ``train_feature`` local.
    NOTE(review): pickle.load on a local cache file is fine for trusted
    data, but never point it at untrusted input.
    """
    LabelColumnName = 'label'
    data_file = "data_file_xgboost_" + str(window) + ".pkl"
    if os.path.exists(data_file):
        with open(data_file, 'rb') as f:
            data_feature = pickle.load(f)
    else:
        data_feature = get_all_stocks_feature_data(self.paras, window, LabelColumnName)
        with open(data_file, 'wb') as f:
            pickle.dump(data_feature, f)
    model = None
    if train:
        model = self.train_data(data_feature, window, LabelColumnName)
    if predict:
        self.predict_data(model, data_feature, window, LabelColumnName)
def runXGB(train_X, train_y, seed_val=123):
    """Train a 22-class softprob booster with fixed hyperparameters."""
    param = {
        'objective': 'multi:softprob',
        'eta': 0.05,
        'max_depth': 6,
        'silent': 1,
        'num_class': 22,
        'eval_metric': "mlogloss",
        'min_child_weight': 2,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'seed': seed_val,
    }
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    return xgb.train(list(param.items()), xgtrain, 115)
def cross_validate(train):
    """Cross-validate the xgboost model over pre-split folds.

    Returns (per-fold log-loss scores, per-fold prediction DataFrames).

    FIXES: Python-2 ``xrange`` replaced with ``range``; the loop-invariant
    parameter dict is built once outside the fold loop.
    NOTE(review): 'early_stopping_rounds' / 'num_boost_round' inside the
    params dict are not honored by xgb.train without the corresponding
    keyword arguments — verify against the xgboost version in use.
    """
    # separate training and validation set
    X_train, X_valid = split_train_validation(train)
    scores = []
    preds = []
    feature_cols = ['phone_brand', 'device_model', 'timestamp']
    parameters = {'max_depth': 4, 'eta': 0.1, 'silent': 1,
                  'subsample': 0.8, 'colsample_bytree': 0.8,
                  'objective': 'multi:softprob', 'booster': 'gbtree',
                  'early_stopping_rounds': 50,
                  'num_class': 12, 'num_boost_round': 1000,
                  'eval_metric': 'mlogloss'}
    plst = parameters.items()
    for i in range(len(X_train)):
        # convert fold frames to xgboost matrices
        dtrain = xgb.DMatrix(X_train[i][feature_cols],
                             label=X_train[i]['group'], missing=np.nan)
        dvalid = xgb.DMatrix(X_valid[i][feature_cols],
                             label=X_valid[i]['group'], missing=np.nan)
        bst = xgb.train(plst, dtrain)
        pred = bst.predict(dvalid)
        scores.append(log_loss(X_valid[i]['group'].tolist(), pred))
        pred = pd.DataFrame(pred, index=X_valid[i].index,
                            columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
def test_basic(c, s, a, b):
    """Distributed training should match or beat a local booster and expose a
    dd.Series prediction API.

    Uses module-level globals/fixtures: df, labels, param, dxgb, dd, pd, xgb.
    """
    # Local reference booster.
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)
    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb._train(c, param, ddf, dlabels)
    dbst = yield dxgb._train(c, param, ddf, dlabels)  # we can do this twice
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    # Distributed accuracy should be at least as good as local.
    assert dcorrect.sum() >= correct.sum()
    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)
    # Nearly all thresholded predictions should match the labels.
    assert ((predictions > 0.5) != labels).sum() < 2
def test_dmatrix_kwargs(c, s, a, b): xgb.rabit.init() # workaround for "Doing rabit call after Finalize" dX = da.from_array(X, chunks=(2, 2)) dy = da.from_array(y, chunks=(2,)) dbst = yield dxgb._train(c, param, dX, dy, {"missing": 0.0}) # Distributed model matches local model with dmatrix kwargs dtrain = xgb.DMatrix(X, label=y, missing=0.0) bst = xgb.train(param, dtrain) result = bst.predict(dtrain) dresult = dbst.predict(dtrain) assert np.abs(result - dresult).sum() < 0.02 # Distributed model gives bad predictions without dmatrix kwargs dtrain_incompat = xgb.DMatrix(X, label=y) dresult_incompat = dbst.predict(dtrain_incompat) assert np.abs(result - dresult_incompat).sum() > 0.02
def test_numpy(c, s, a, b): xgb.rabit.init() # workaround for "Doing rabit call after Finalize" dX = da.from_array(X, chunks=(2, 2)) dy = da.from_array(y, chunks=(2,)) dbst = yield dxgb._train(c, param, dX, dy) dbst = yield dxgb._train(c, param, dX, dy) # we can do this twice dtrain = xgb.DMatrix(X, label=y) bst = xgb.train(param, dtrain) result = bst.predict(dtrain) dresult = dbst.predict(dtrain) correct = (result > 0.5) == y dcorrect = (dresult > 0.5) == y assert dcorrect.sum() >= correct.sum() predictions = dxgb.predict(c, dbst, dX) assert isinstance(predictions, da.Array) predictions = yield c.compute(predictions)._result() assert isinstance(predictions, np.ndarray) assert ((predictions > 0.5) != labels).sum() < 2
def test_synchronous_api(loop):  # noqa
    """The synchronous dxgb.train API should at least match local accuracy.

    Uses module-level globals/fixtures: df, labels, param, dxgb, dd,
    cluster, Client, xgb.
    """
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)  # local reference
    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            dbst = dxgb.train(c, param, ddf, dlabels)
            result = bst.predict(dtrain)
            dresult = dbst.predict(dtrain)
            correct = (result > 0.5) == labels
            dcorrect = (dresult > 0.5) == labels
            assert dcorrect.sum() >= correct.sum()
def fit(self, X, y=None):
    """Fit the gradient boosting model on the distributed client.

    Parameters
    ----------
    X : array-like [n_samples, n_features]
    y : array-like

    Returns
    -------
    self : the fitted Regressor

    Notes
    -----
    This differs from the XGBoost version not supporting the ``eval_set``,
    ``eval_metric``, ``early_stopping_rounds`` and ``verbose`` fit kwargs.
    """
    client = default_client()
    booster_params = self.get_xgb_params()
    self._Booster = train(client, booster_params, X, y,
                          num_boost_round=self.n_estimators)
    return self
def train(params, dmatrix_train, dmatrix_validate):
    """Train a binary AUC booster with early stopping; return (params, booster).

    BUG FIX: the body referenced an undefined name ``param`` (in the
    xgb.train call and both prints) — NameError at runtime; all uses now
    reference the ``params`` argument. Python-2 prints converted to print().
    """
    params['silent'] = 1
    params['objective'] = 'binary:logistic'  # output probabilities
    params['eval_metric'] = 'auc'
    num_rounds = params["num_rounds"]
    early_stopping_rounds = params["early_stop_rounds"]
    # early stop will check on the last dataset
    watchlist = [(dmatrix_train, 'train'), (dmatrix_validate, 'validate')]
    bst = xgb.train(params, dmatrix_train, num_rounds, watchlist,
                    early_stopping_rounds=early_stopping_rounds)
    print("parameters: {}".format(params))
    print("best {}: {:.2f}".format(params["eval_metric"], bst.best_score))
    print("best_iteration: %d" % (bst.best_iteration))
    return params, bst
def fit(self, X, y, x_val=None, y_val=None):
    """Train the booster on (X, y); if (x_val, y_val) given, watch validation
    and report per-round eval; booster stored on self.clf."""
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is None:
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds)
    else:
        dvalid = xgb.DMatrix(x_val, label=y_val)
        evals = [(dtrain, 'train'), (dvalid, 'validation')]
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=evals,
                             verbose_eval=self.verbose)
    return
def fit(self, X, y, x_val=None, y_val=None):
    """Train the booster on (X, y); if (x_val, y_val) given, watch validation;
    booster stored on self.xgb."""
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is None:
        self.xgb = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             verbose_eval=self.verbose)
    else:
        dvalid = xgb.DMatrix(x_val, label=y_val)
        evals = [(dtrain, 'train'), (dvalid, 'validation')]
        self.xgb = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=evals,
                             verbose_eval=self.verbose)
    return
def fit(self, train_fs, train_labels, valid_fs, valid_labels):
    """Train a ranking booster on fixed-size groups and return validation
    predictions at the best tree limit.

    BUG FIX: ``len(labels) / rank_k`` is float division under Python 3 and
    ``list * float`` raises TypeError; switched to integer division ``//``
    (identical result under Python 2 for exact multiples).
    """
    rank_k = self.config.getint('RANK', 'rank_k')
    train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
    train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))
    valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
    valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))
    watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
    # self.__lock()
    self.model = xgb.train(self.params,
                           train_DMatrix,
                           self.params['num_round'],
                           watchlist,
                           early_stopping_rounds=self.params['early_stop'],
                           verbose_eval=self.params['verbose_eval'])
    LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
    # self.__unlock()
    valid_preds = self.model.predict(valid_DMatrix,
                                     ntree_limit=self.model.best_ntree_limit)
    return valid_preds
def Options():
    """Parse command-line options for the trainer and return the options object.

    All value options are plain strings; -V/-D/-Q/-P are boolean flags.
    """
    op = OptionParser()
    # Value-taking options (dest derived from the long name).
    for short_flag, long_flag in [
            ('-E', '--events'), ('-l', '--train'), ('-F', '--fstr'),
            ('-B', '--bags'), ('-R', '--seed'), ('-T', '--trees'),
            ('-O', '--out'), ('-d', '--depth'), ('-e', '--eta'),
            ('-S', '--subsample'), ('-v', '--toeval')]:
        op.add_option(short_flag, long_flag)
    # Boolean flags with explicit destinations.
    for flag, dest in [("-V", "verbose"), ("-D", "dump"),
                       ("-Q", "names"), ("-P", "production")]:
        op.add_option(flag, action="store_true", dest=dest)
    options, _ = op.parse_args()
    return options
def predict():
    """Return cached model outputs (v, z, cv, extra) or compute and cache them.

    In debug mode the cache is ignored and the model is re-run.
    """
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None  # force a fresh run while debugging
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame({'id': test.id, 'y': 0})   # test predictions holder
        v = pd.DataFrame({'id': train.id, 'y': y})  # validation/OOF holder
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def predict():
    """Return cached model outputs (v, z, cv, extra) or compute and cache them."""
    saved = state.load('model')
    #saved = None
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame({'id': test.id, 'y': 0})   # test predictions holder
        v = pd.DataFrame({'id': train.id, 'y': y})  # validation/OOF holder
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def predict():
    """Return cached model outputs (v, z, cv, extra), or build the feature-
    augmented dataset, run the model, and cache the result.

    In debug mode the cache is ignored and the model is re-run.
    """
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None  # force a fresh run while debugging
    if saved is None:
        train, y, test, _ = data.get()
        # Append both engineered feature sets column-wise.
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)
        z = pd.DataFrame({'id': test.id, 'y': 0})   # test predictions holder
        v = pd.DataFrame({'id': train.id, 'y': y})  # validation/OOF holder
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Seed-bagged, stratified-CV xgboost base model.

    Accumulates out-of-fold predictions into v[cname] and averaged test
    predictions into z[cname] (both via pconvert), printing per-fold and
    overall log-loss.

    FIX: ``DataFrame.ix`` was removed from pandas; StratifiedKFold yields
    positional indices, so ``.iloc`` expresses the original intent.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    # Average test predictions over every fold of every seed; OOF over seeds.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Seed-bagged, stratified-CV xgboost base model (per-seed fold splits).

    Unlike the sibling variant, a new StratifiedKFold seeded with the bag
    seed is created per seed, so folds differ across bags. Accumulates OOF
    predictions into v[cname] and averaged test predictions into z[cname].

    FIX: ``DataFrame.ix`` was removed from pandas; StratifiedKFold yields
    positional indices, so ``.iloc`` expresses the original intent.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True,
                                              random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    # Average test predictions over every fold of every seed; OOF over seeds.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def save_results(v, z):
    """Write the clipped test predictions and the combined train/test frame.

    Mutates z in place: clips y into (1e-5, 1-1e-5), adds a 'train' marker
    column, and maps non-key columns through prestore().
    """
    pred_path = '../submissions/p' + base_data_name() + '.csv'
    all_data_path = '../data/output/model' + base_data_name() + '.csv.gz'
    z.y = np.clip(z.y.values, 1e-5, 1 - 1e-5)  # keep probs off 0/1
    z[['y']].to_csv(pred_path, header=None, index=False)
    v['train'] = 1
    z['train'] = 0
    combined = pd.concat([v, z], axis=0)
    combined.to_csv(all_data_path, index=False, compression='gzip')
    for col in z.columns:
        if col not in {'id', 'train', 'y'}:
            z[col] = prestore(z[col])
    print(z.head(20))
    print('saved', pred_path, all_data_path)
def fit(self, X, y, Xt=None, yt=None, load_model=None, save_model=None,
        obj=None, feval=None, print_fscore=True, evalx=None):
    """Train an xgboost booster with optional validation, custom objective and
    eval, model persistence, and final scoring.

    Returns evalx(yt, predictions) when both Xt and evalx are supplied,
    otherwise 0. The trained booster is kept on self.bst.
    """
    print(X.shape, y.shape)
    rounds = self.params.get('num_round', 100)
    early_stop = self.params.get('early_stopping_rounds', None)
    maximize = self.params.get('maximize', False)
    verbosity = self.params.get('verbose_eval', 1)
    dtrain = xgb.DMatrix(X, y)
    if Xt is None:
        evals = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, rounds, evals=evals,
                        verbose_eval=verbosity, xgb_model=load_model,
                        obj=obj, feval=feval)
    else:
        dvalid = xgb.DMatrix(Xt, yt)
        evals = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, rounds, evals=evals,
                        early_stopping_rounds=early_stop,
                        verbose_eval=verbosity, xgb_model=load_model,
                        obj=obj, feval=feval, maximize=maximize)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
    fscore = self.feature_importance()  # always computed, printed on demand
    if print_fscore:
        print("Feature Importance:")
        for entry in fscore:
            print(entry)
    if Xt is not None and evalx is not None:
        yp = self.predict(Xt)
        score = evalx(yt, yp)
        print(score)
        return score
    return 0
def save_dataframe(self, df):
    """Persist df to an HDF5 debug store named with the instance timestamp."""
    path = ''.join([TwoSigmaFinModTools._save_path, 'train_debug',
                    self.timestamp, '.h5'])
    with pd.HDFStore(path, "w") as store:
        store.put("train_debug", df)
def load_dataframe():
    """Load the hard-coded train_debug HDF5 snapshot and return the DataFrame."""
    dataframe_name = 'train_debug'
    # one-hot encoded
    # not one-hot
    # date_time = '20170613_19h09m40s'
    # date_time = '20170613_19h34m31s'
    # date_time = '20170614_00h07m32s'
    date_time = '20170619_11h47m22s'  # snapshot currently in use
    path = ''.join([TwoSigmaFinModTools._save_path, dataframe_name,
                    date_time, '.h5'])
    with pd.HDFStore(path, 'r') as store:
        return store.get(dataframe_name)