Python xgboost module: train() code examples

We have extracted the following 49 code examples from open-source Python projects to illustrate how to use xgboost.train().

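As a primer before the project examples, here is a minimal sketch of the core xgb.train() call that all of the snippets below build on. The toy arrays and parameter values are placeholders for illustration, not taken from any of the projects:

import numpy as np
import xgboost as xgb

# toy data, purely illustrative
X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)

dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'binary:logistic', 'max_depth': 3, 'eta': 0.1}
bst = xgb.train(params, dtrain, num_boost_round=50)
pred = bst.predict(dtrain)  # probabilities under binary:logistic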
Project: kaggle-review    Author: daxiongshu
def fit(self,X,y,Xg,Xt=None,yt=None,Xgt=None,load_model=None,save_model=None):
        print(X.shape,y.shape)
        num_round = self.params['num_round']
        early_stopping_rounds = self.params['early_stopping_rounds']
        dtrain = xgb.DMatrix(X, y)
        dtrain.set_group(Xg)

        if Xt is not None:
            dvalid = xgb.DMatrix(Xt, yt)
            dvalid.set_group(Xgt)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
            bst = xgb.train(self.params, dtrain, num_round, evals = watchlist,
                early_stopping_rounds=early_stopping_rounds,verbose_eval=1,xgb_model=load_model,
                maximize=True)
        else:
            watchlist = [(dtrain, 'train')]
            bst = xgb.train(self.params, dtrain, num_round, evals = watchlist,
                verbose_eval=1,xgb_model=load_model)
        self.bst = bst
        if save_model is not None:
            bst.save_model(save_model)
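The set_group() calls above mark this as a learning-to-rank setup: XGBoost needs to know how many consecutive DMatrix rows belong to each query group. A minimal sketch of the idea, with made-up group sizes and a rank:pairwise objective (both are assumptions, not this project's actual settings):

import numpy as np
import xgboost as xgb

X = np.random.rand(10, 4)
y = np.random.randint(2, size=10)
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group([4, 6])  # rows 0-3 form query 1, rows 4-9 form query 2
bst = xgb.train({'objective': 'rank:pairwise', 'eta': 0.1}, dtrain, num_boost_round=10)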
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                                  y_test_split, title_name):
        # Split the training data into an extra set of test
        # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual y')
        plt.ylabel('Predicted y')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: DriverPower    Author: smshuai
def run_gbm(dtrain, dvalid, param):
    # check training arguments in param
    n_round = param.get('num_boost_round', 5000)
    early_stop = param.get('early_stopping_rounds', 5)
    verbose_eval = param.get('verbose_eval', 100)
    # specify validations set to watch performance
    watchlist = [(dvalid, 'eval')]
    bst = xgb.train(params=param,
                    dtrain=dtrain,
                    num_boost_round=n_round,
                    evals=watchlist,
                    early_stopping_rounds=early_stop,
                    verbose_eval = verbose_eval
                   )
    return bst
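When early stopping fires, the 0.x/1.x xgboost versions these projects target expose the best round on the returned Booster. A hedged follow-up sketch, assuming the bst and dvalid from run_gbm() above:

print(bst.best_iteration, bst.best_score)
pred = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)  # use only the trees up to the best round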
Project: tianchi_power    Author: lvniqi
def crate_pre_train_model(x_,y_):
    (x_train,x_test) = train_test_split(x_,test_size=0.1,random_state=1)
    (y_train,y_test) = train_test_split(y_,test_size=0.1,random_state=1)
    dtrain = xgb.DMatrix( x_train, label=y_train)
    dtest = xgb.DMatrix( x_test, label=y_test)
    evallist  = [(dtrain,'train'),(dtest,'eval')]
    param = {'objective':'reg:linear','max_depth':3 }
    param['nthread'] = 64
    #param['min_child_weight'] = 15
    #param['subsample'] = 1
    #param['num_class'] = 7
    plst = param.items()
    num_round = 5000
    bst = xgb.train( plst, dtrain, num_round,
                    evallist,early_stopping_rounds=100,
                    #obj=logregobj,
                    feval=evalerror
                    )
    return bst

# %% main
Project: Video-Classification-Action-Recognition    Author: qijiezhao
def get_data(item='train',id=1,is_shuffle=False,is_subtrain=1):
    # str(id) so the path builds correctly when an int id is passed
    file_path=os.path.join(metadata_root,item+'_list0'+str(id)+'.txt')
    files=[]
    labels=[]
    with open(file_path,'r')as fp:
        lines=fp.readlines()
        if is_shuffle==True:
            np.random.shuffle(lines)
        if not is_subtrain==1:
            lines=random.sample(lines,int(len(lines)*is_subtrain))
        for line in lines:
            tmp_prefix=line.strip().split('.')[0].split('/')[1]
            label_tmp=line.strip().split(' ')[1]
            files.append(os.path.join(feature_root,tmp_prefix+'.npy'))
            labels.append(int(label_tmp)-1)
    return files,np.array(labels,dtype=np.float64)
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_num_boost_round():
    # global watchlist
    global num_boost_round
    global evals_result
    global eval_metric_xgb_format
    evals_result = {}
    xgb.train(params=params_no_sklearn,dtrain=dtrain,num_boost_round=num_boost_round,evals=watchlist,evals_result=evals_result)
    evals_result = evals_result['eval'][eval_metric_xgb_format]
    # pprint.pprint(evals_result)
    max = 0.0
    max_loc = 0
    for i,v in enumerate(evals_result):
        # print '%d ...... %d : %d'%(i,max_loc,max)
        if v>max:
            max = v
            max_loc = i
    # print "max_loc : %s ,  max : %s"%(max_loc,max)
    num_boost_round = max_loc+1
    print('****  num_boost_round : ', num_boost_round)
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_num_boost_round():
    # global watchlist
    global num_boost_round
    global  evals_result
    evals_result = {}
    xgb.train(params=params_no_sklearn,dtrain=dtrain,num_boost_round=num_boost_round,evals=watchlist,evals_result=evals_result)
    evals_result = evals_result['eval']['map']
    pprint.pprint(evals_result)
    max = 0.0
    max_loc = 0
    for i,v in enumerate(evals_result):
        # print '%d ...... %d : %d'%(i,max_loc,max)
        if v>max:
            max = v
            max_loc = i
    print "max_loc : %d ,  max : %d"%(max_loc,max)
    num_boost_round = max_loc+1
    print '****  num_boost_round : ', num_boost_round
Project: MLBootCampV    Author: Koziev
def load_data():

    train_data = pd.read_csv(os.path.join(data_folder, 'train.csv'), delimiter=';', skip_blank_lines=True)
    test_data = pd.read_csv(os.path.join(data_folder, 'test.csv'), delimiter=';', skip_blank_lines=True,
                            na_values='None')

    ntrain = train_data.shape[0]
    ntest = test_data.shape[0]

    print('ntrain={}'.format(ntrain))
    print('ntest={}'.format(ntest))

    y_train = train_data['cardio'].values

    # --------------------------------------------------------------

    x_train = train_data.drop(["id", "cardio"], axis=1)
    x_test = test_data.drop(["id"], axis=1)

    x_test = x_test.replace('None', np.nan)  # replace() returns a copy; assign it back

    return (x_train,y_train,x_test)

# ---------------------------------------------------------------------
Project: instacart-basket-prediction    Author: colinmorris
def main():
  logging.basicConfig(level=logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('tag')
  parser.add_argument('--train-recordfile', default='train', 
      help='identifier for file with the users to train on (default: train). deprecated: specify in hps...')
  parser.add_argument('-n', '--n-rounds', type=int, default=50,
      help='Number of rounds of boosting. Deprecated: specify this in hp config file')
  parser.add_argument('--weight', action='store_true',
      help='Whether to do per-instance weighting. Deprecated: specify in hps')
  args = parser.parse_args()

  try:
    hps = hypers.hps_for_tag(args.tag)
  except hypers.NoHpsDefinedException:
    logging.warn('No hps found for tag {}. Creating and saving some.'.format(args.tag))
    hps = hypers.get_default_hparams()
    hps.train_file = args.train_recordfile
    hps.rounds = args.n_rounds
    hps.weight = args.weight
    hypers.save_hps(args.tag, hps)
  validate_hps(hps)
  dataset = Dataset(hps.train_file, hps)
  with time_me(mode='stderr'):
    train(dataset, args.tag, hps)
Project: gcForest    Author: kingfengji
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
        return
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
Project: fnc-1    Author: shangjingbo1226
def train_relatedness_classifier(trainX, trainY):
    xg_train = xgb.DMatrix(trainX, label=trainY)
    # setup parameters for xgboost
    param = {}
    # binary logistic classification; outputs probabilities
    param['objective'] = 'binary:logistic'
    # scale weight of positive examples
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['nthread'] = 20

    num_round = 1000
    relatedness_classifier = xgb.train(param, xg_train, num_round)

    return relatedness_classifier
Project: HousePrices    Author: MizioAnd
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)

        res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                     early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

        best_nrounds = res.shape[0] - 1
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: kdd2017    Author: JinpengLI
def fit(self, X, y):
        if self.use_mspe:
            lgb_train = lgb.Dataset(X, y,
                        weight=np.ones(X.shape[0]), 
                        free_raw_data=False)
            lgb_test = lgb.Dataset(X, y, reference=lgb_train,
                        weight=np.ones(X.shape[0]), 
                        free_raw_data=False)
            self.gbm = lgb.train(
                self.kwargs,
                lgb_train,
                num_boost_round=10,
                fobj=mspe,
                feval=evalerror_lgbm,
                valid_sets=lgb_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3)
            #lgb_test = lgb.Dataset(X, y, reference=lgb_train,
            #            weight=np.ones(X.shape[0]), 
            #            free_raw_data=False) 
            self.gbm.fit(X, y, early_stopping_rounds=10, eval_set=[(X, y)], verbose=False)
            #print "gbm best_iteration=", self.gbm.best_iteration
Project: coremltools    Author: apple
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        if not HAS_XGBOOST:
            return
        if not HAS_SKLEARN:
            return

        scikit_data = load_boston()
        dtrain = xgboost.DMatrix(scikit_data.data, label = scikit_data.target,
                feature_names = scikit_data.feature_names)
        xgb_model = xgboost.train({}, dtrain, 1)

        # Save the data and the model
        self.scikit_data = scikit_data
        self.xgb_model = xgb_model
        self.feature_names = self.scikit_data.feature_names
Project: coremltools    Author: apple
def _train_convert_evaluate(self, bt_params = {}, **params):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        # Train a model
        xgb_model = xgboost.train(bt_params, self.dtrain, **params)

        # Convert the model
        spec = xgb_converter.convert(xgb_model, self.feature_names, self.output_name, force_32bit_float = False)

        # Get predictions
        df = pd.DataFrame(self.X, columns=self.feature_names)
        df['prediction'] = xgb_model.predict(self.dtrain)

        # Evaluate it
        metrics = evaluate_regressor(spec, df, target = 'target', verbose = False)
        return metrics
Project: mlprojects-py    Author: srinathperera
def regression_with_xgboost_no_cv(x_train, y_train, X_test, Y_test, features=None, xgb_params=None, num_rounds = 10):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist  = [(train_data,'train'), (test_data,'eval')]

    if xgb_params is None:
        xgb_params = get_default_xgboost_params()
        print "xgb_params not found"

    print "XGBoost, using param", xgb_params
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval = True, early_stopping_rounds=5)

    isgbtree = xgb_params["booster"] == "gbtree"
    if isgbtree :
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')), ntree_limit=gbdt.best_ntree_limit)
    else:
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')))

    return XGBoostModel(gbdt), y_pred
Project: kaggle-allstate-claims-severity    Author: alno
def predict(self, X):
        train_file = os.path.join(self.tmp_dir, 'train.svm')
        pred_file = os.path.join(self.tmp_dir, 'pred.svm')
        out_file = os.path.join(self.tmp_dir, 'out.txt')

        print "Exporting pred..."
        with open(pred_file, 'w') as f:
            dump_svmlight_file(X, np.zeros(X.shape[0]), f=f)

        params = self.params.copy()
        params['iter'] = 0
        params['task'] = 'r'
        params['train'] = train_file
        params['test'] = pred_file
        params['out'] = out_file
        params['load_model'] = os.path.join(self.tmp_dir, 'model.libfm')
        params = " ".join("-{} {}".format(k, params[k]) for k in params)

        command = "{} {}".format(self.exec_path, params)

        print(command)
        os.system(command)

        return pd.read_csv(out_file, header=None).values.flatten()
Project: CreditScoring    Author: cqw5
def ExtGBDT(train_x, train_y, test_x, test_y):
    """ Ext-GBDT """
    num_round = 100
    param = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eta': 0.03, 'max_depth': 3, 'eval_metric': 'auc',
             'silent': 1, 'min_child_weight': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'nthread': 4,
             'max_delta_step': 0}
    train_X = xgb.DMatrix(train_x, train_y)
    test_X = xgb.DMatrix(test_x)
    bst = xgb.train(param, train_X, num_round)
    pred = bst.predict(test_X)
    predict_y = []
    for i in range(len(pred)):
        if pred[i] < 0.5:
            predict_y.append(0)
        else:
            predict_y.append(1)
    auc = evaluate_auc(pred, test_y)
    evaluate(predict_y, test_y)
    return auc
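The element-wise thresholding loop in ExtGBDT can be collapsed into one vectorized expression; a sketch assuming the same pred array:

predict_y = (pred >= 0.5).astype(int).tolist()  # 1 where pred >= 0.5, else 0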
Project: Kaggle_Buddy    Author: NickYi1990
def ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed, bag_round, params
                                 , X_test, using_notebook=True, num_boost_round=0):
    '''
        early version
    '''
    # create array object to hold predictions
    baggedpred=np.zeros(shape=X_test.shape[0]).astype(np.float32)
    #loop for as many times as we want bags
    if using_notebook:
        for n in tqdm_notebook(range(0, bag_round)):
            #shuffle first, aids in increasing variance and forces different results
            X_train, y_train=shuffle(X_train, y_train, random_state=seed+n)
            params['seed'] = seed + n
            model = lightgbm.train(params, lightgbm.Dataset(X_train, y_train), num_boost_round=num_boost_round)
            pred = model.predict(X_test)
            baggedpred += pred/bag_round

    return baggedpred
Project: StockRecommendSystem    Author: doncat99
def do_run(self, train, predict, window):
        LabelColumnName = 'label'
        data_file = "data_file_xgboost_" + str(window) + ".pkl"

        if os.path.exists(data_file):
            input = open(data_file, 'rb')
            data_feature = pickle.load(input)
            input.close()
        else:
            data_feature = get_all_stocks_feature_data(self.paras, window, LabelColumnName)
            output = open(data_file, 'wb')
            pickle.dump(data_feature, output)
            output.close()

        model = None

        train_feature = {}

        if train: model = self.train_data(data_feature, window, LabelColumnName)

        if predict: self.predict_data(model, data_feature, window, LabelColumnName)
Project: bank-product-recommender    Author: rohansapre
def runXGB(train_X, train_y, seed_val=123):
  param = {}
  param['objective'] = 'multi:softprob'
  param['eta'] = 0.05
  param['max_depth'] = 6
  param['silent'] = 1
  param['num_class'] = 22
  param['eval_metric'] = "mlogloss"
  param['min_child_weight'] = 2
  param['subsample'] = 0.9
  param['colsample_bytree'] = 0.9
  param['seed'] = seed_val
  num_rounds = 115

  plst = list(param.items())
  xgtrain = xgb.DMatrix(train_X, label=train_y)
  model = xgb.train(plst, xgtrain, num_rounds)  
  return model
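Under multi:softprob, Booster.predict() returns per-class probabilities with shape (n_samples, num_class). A hedged follow-up sketch, assuming a feature matrix test_X alongside the model returned by runXGB():

probs = model.predict(xgb.DMatrix(test_X))  # shape (n_samples, 22)
pred_labels = probs.argmax(axis=1)          # most probable class per row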
Project: KaggleExeter    Author: detomo
def cross_validate(train):
    #separate training and validation set
    X_train,X_valid= split_train_validation(train)
    scores = []; preds = []
    for i in range(len(X_train)):
        #convert X_train, Y_train etc... to xgboost matrix
        dtrain = xgb.DMatrix(X_train[i][['phone_brand','device_model','timestamp']], label = X_train[i]['group'],missing=np.nan) 
        dvalid = xgb.DMatrix(X_valid[i][['phone_brand','device_model','timestamp']], label = X_valid[i]['group'],missing=np.nan)

        #predict with xgboost
        parameters = {'max_depth':4,'eta':0.1,'silent':1, 'subsample':0.8,'colsample_bytree':0.8,
                'objective':'multi:softprob','booster':'gbtree','early_stopping_rounds':50,
                'num_class':12,'num_boost_round':1000,'eval_metric':'mlogloss'}
        plst = parameters.items()
        bst = xgb.train(plst, dtrain)
        pred = bst.predict(dvalid)

        scores.append(log_loss(X_valid[i]['group'].tolist(),pred))
        pred = pd.DataFrame(pred, index = X_valid[i].index, columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
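One caveat in cross_validate(): 'num_boost_round' and 'early_stopping_rounds' have no effect inside the parameter dict, since xgb.train() only honors them as keyword arguments, so the call above trains for the default 10 rounds with no early stopping. A corrected sketch of that call:

bst = xgb.train(plst, dtrain, num_boost_round=1000,
                evals=[(dtrain, 'train'), (dvalid, 'valid')],
                early_stopping_rounds=50)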
Project: dask-xgboost    Author: dask
def test_basic(c, s, a, b):
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb._train(c, param, ddf, dlabels)
    dbst = yield dxgb._train(c, param, ddf, dlabels)  # we can do this twice

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)

    assert ((predictions > 0.5) != labels).sum() < 2
Project: dask-xgboost    Author: dask
def test_dmatrix_kwargs(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy, {"missing": 0.0})

    # Distributed model matches local model with dmatrix kwargs
    dtrain = xgb.DMatrix(X, label=y, missing=0.0)
    bst = xgb.train(param, dtrain)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    assert np.abs(result - dresult).sum() < 0.02

    # Distributed model gives bad predictions without dmatrix kwargs
    dtrain_incompat = xgb.DMatrix(X, label=y)
    dresult_incompat = dbst.predict(dtrain_incompat)
    assert np.abs(result - dresult_incompat).sum() > 0.02
Project: dask-xgboost    Author: dask
def test_numpy(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice

    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)

    assert ((predictions > 0.5) != labels).sum() < 2
Project: dask-xgboost    Author: dask
def test_synchronous_api(loop):  # noqa
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:

            dbst = dxgb.train(c, param, ddf, dlabels)

            result = bst.predict(dtrain)
            dresult = dbst.predict(dtrain)

            correct = (result > 0.5) == labels
            dcorrect = (dresult > 0.5) == labels
            assert dcorrect.sum() >= correct.sum()
Project: dask-xgboost    Author: dask
def fit(self, X, y=None):
        """Fit the gradient boosting model

        Parameters
        ----------
        X : array-like [n_samples, n_features]
        y : array-like

        Returns
        -------
        self : the fitted Regressor

        Notes
        -----
        This differs from the XGBoost version in that it does not support the
        ``eval_set``, ``eval_metric``, ``early_stopping_rounds`` and
        ``verbose`` fit kwargs.
        """
        client = default_client()
        xgb_options = self.get_xgb_params()
        self._Booster = train(client, xgb_options, X, y,
                              num_boost_round=self.n_estimators)
        return self
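A hedged usage sketch of this scikit-learn style wrapper, assuming dask-xgboost is installed and dX, dy are dask collections as in the tests above:

from distributed import Client
from dask_xgboost import XGBRegressor

client = Client()  # local cluster; fit() finds it via default_client()
est = XGBRegressor(n_estimators=10)
est.fit(dX, dy)
pred = est.predict(dX)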
Project: OpinionMining728    Author: stasi009
def train(params, dmatrix_train, dmatrix_validate):
    params['silent'] = 1
    params['objective'] = 'binary:logistic'  # output probabilities
    params['eval_metric'] = 'auc'

    num_rounds = params["num_rounds"]
    early_stopping_rounds = params["early_stop_rounds"]

    # early stop will check on the last dataset
    watchlist = [(dmatrix_train, 'train'), (dmatrix_validate, 'validate')]
    bst = xgb.train(params, dmatrix_train, num_rounds, watchlist, early_stopping_rounds=early_stopping_rounds)

    print("parameters: {}".format(params))
    print("best {}: {:.2f}".format(params["eval_metric"], bst.best_score))
    print("best_iteration: %d" % bst.best_iteration)

    return params,bst
Project: gestalt    Author: mpearmain
def fit(self, X, y, x_val=None, y_val=None):

        dtrain = xgb.DMatrix(X, label=y)
        if x_val is not None:
            dtest = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dtest, 'validation')]
            self.clf = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_round,
                                 early_stopping_rounds=self.early_stopping_rounds,
                                 evals=watchlist,
                                 verbose_eval=self.verbose)
        else:
            self.clf = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_round,
                                 early_stopping_rounds=self.early_stopping_rounds)
        return
Project: gestalt    Author: mpearmain
def fit(self, X, y, x_val=None, y_val=None):
        dtrain = xgb.DMatrix(X, label=y)
        if x_val is not None:
            dtest = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dtest, 'validation')]
            self.xgb = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_round,
                                 early_stopping_rounds=self.early_stopping_rounds,
                                 evals=watchlist,
                                 verbose_eval=self.verbose)
        else:
            self.xgb = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_round,
                                 early_stopping_rounds=self.early_stopping_rounds,
                                 verbose_eval=self.verbose)
        return
Project: zhihu-machine-learning-challenge-2017    Author: HouJP
def fit(self,
            train_fs, train_labels,
            valid_fs, valid_labels):
        rank_k = self.config.getint('RANK', 'rank_k')

        train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
        train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))  # integer division: group counts must be ints
        valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
        valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))

        watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
        # self.__lock()
        self.model = xgb.train(self.params,
                               train_DMatrix,
                               self.params['num_round'],
                               watchlist,
                               early_stopping_rounds=self.params['early_stop'],
                               verbose_eval=self.params['verbose_eval'])
        LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
        # self.__unlock()
        valid_preds = self.model.predict(valid_DMatrix, ntree_limit=self.model.best_ntree_limit)
        return valid_preds
Project: gcforest    Author: w821881341
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
        return
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
Project: Sberbank    Author: dimaquick
def Options():                                                                                                                                                              
    op = OptionParser()                                                                                                                                              
    op.add_option('-E', '--events')
    op.add_option('-l', '--train')
    op.add_option('-F', '--fstr')
    op.add_option('-B', '--bags')
    op.add_option('-R', '--seed')
    op.add_option('-T', '--trees')
    op.add_option('-O', '--out')
    op.add_option('-d', '--depth')
    op.add_option('-e', '--eta')
    op.add_option('-S', '--subsample')
    op.add_option('-v', '--toeval')
    op.add_option("-V", action="store_true", dest="verbose")                                                                                                              
    op.add_option("-D", action="store_true", dest="dump")                                                                                                              
    op.add_option("-Q", action="store_true", dest="names")                                                                                                              
    op.add_option("-P", action="store_true", dest="production")                                                                                                              
    return op.parse_args()[0]
Project: mlbootcamp_5    Author: ivan-filonov
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved == None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
Project: mlbootcamp_5    Author: ivan-filonov
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved == None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
Project: mlbootcamp_5    Author: ivan-filonov
def predict():
    saved = state.load('model')
    #saved = None
    if saved == None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
Project: mlbootcamp_5    Author: ivan-filonov
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved == None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
Project: mlbootcamp_5    Author: ivan-filonov
def predict():
    saved = state.load('model')
    #saved = None
    if saved == None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
Project: mlbootcamp_5    Author: ivan-filonov
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved == None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)

        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0

        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
Project: mlbootcamp_5    Author: ivan-filonov
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Project: mlbootcamp_5    Author: ivan-filonov
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Project: mlbootcamp_5    Author: ivan-filonov
def save_results(v, z):
    pred_path = '../submissions/p' + base_data_name() + '.csv'
    all_data_path = '../data/output/model' + base_data_name() + '.csv.gz'

    z.y = np.clip(z.y.values, 1e-5, 1-1e-5)
    z[['y']].to_csv(pred_path, header=None, index=False)

    v['train'] = 1
    z['train'] = 0

    q = pd.concat([v, z], axis=0)
    q.to_csv(all_data_path, index=False, compression='gzip')

    for c in z.columns:
        if c in {'id', 'train', 'y'}: continue
        z[c] = prestore(z[c])
    print(z.head(20))
    print('saved', pred_path, all_data_path)
Project: mlbootcamp_5    Author: ivan-filonov
def save_results(v, z):
    pred_path = '../submissions/p' + base_data_name() + '.csv'
    all_data_path = '../data/output/model' + base_data_name() + '.csv.gz'

    z.y = np.clip(z.y.values, 1e-5, 1-1e-5)
    z[['y']].to_csv(pred_path, header=None, index=False)

    v['train'] = 1
    z['train'] = 0

    q = pd.concat([v, z], axis=0)
    q.to_csv(all_data_path, index=False, compression='gzip')

    for c in z.columns:
        if c in {'id', 'train', 'y'}: continue
        z[c] = prestore(z[c])
    print(z.head(20))
    print('saved', pred_path, all_data_path)
Project: mlbootcamp_5    Author: ivan-filonov
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Project: mlbootcamp_5    Author: ivan-filonov
def save_results(v, z):
    pred_path = '../submissions/p' + base_data_name() + '.csv'
    all_data_path = '../data/output/model' + base_data_name() + '.csv.gz'

    z.y = np.clip(z.y.values, 1e-5, 1-1e-5)
    z[['y']].to_csv(pred_path, header=None, index=False)

    v['train'] = 1
    z['train'] = 0

    q = pd.concat([v, z], axis=0)
    q.to_csv(all_data_path, index=False, compression='gzip')

    for c in z.columns:
        if c in {'id', 'train', 'y'}: continue
        z[c] = prestore(z[c])
    print(z.head(20))
    print('saved', pred_path, all_data_path)
Project: mlbootcamp_5    Author: ivan-filonov
def save_results(v, z):
    pred_path = '../submissions/p' + base_data_name() + '.csv'
    all_data_path = '../data/output/model' + base_data_name() + '.csv.gz'

    z.y = np.clip(z.y.values, 1e-5, 1-1e-5)
    z[['y']].to_csv(pred_path, header=None, index=False)

    v['train'] = 1
    z['train'] = 0

    q = pd.concat([v, z], axis=0)
    q.to_csv(all_data_path, index=False, compression='gzip')

    for c in z.columns:
        if c in {'id', 'train', 'y'}: continue
        z[c] = prestore(z[c])
    print(z.head(20))
    print('saved', pred_path, all_data_path)
Project: kaggle-review    Author: daxiongshu
def fit(self,X,y,Xt=None,yt=None,
        load_model=None,save_model=None,
        obj=None,feval=None,print_fscore=True,evalx=None):
        print(X.shape,y.shape)

        num_round = self.params.get('num_round',100)
        early_stopping_rounds = self.params.get('early_stopping_rounds',None)
        maximize = self.params.get('maximize',False)
        dtrain = xgb.DMatrix(X, y)
        vb = self.params.get('verbose_eval',1)
        if Xt is not None:
            dvalid = xgb.DMatrix(Xt, yt)
            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
            bst = xgb.train(self.params, dtrain, num_round, evals = watchlist,
                early_stopping_rounds=early_stopping_rounds,verbose_eval=vb,
                xgb_model=load_model,obj=obj,feval=feval,maximize=maximize)
        else:
            watchlist = [(dtrain, 'train')]
            bst = xgb.train(self.params, dtrain, num_round, evals = watchlist,
                verbose_eval=vb,xgb_model=load_model,obj=obj,feval=feval)
        self.bst = bst
        if save_model is not None:
            bst.save_model(save_model)            

        fscore = self.feature_importance()
        if print_fscore:
            print("Feature Importance:")
            for i in fscore:
                print(i) 
        if Xt is not None and evalx is not None:
            yp = self.predict(Xt)
            score = evalx(yt,yp)
            print(score)
            return score
        return 0
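The obj and feval hooks above follow the classic xgb.train() callback signatures. A minimal sketch of a custom evaluation function (the RMSE metric is an illustrative choice, not this project's):

import numpy as np

def feval_rmse(preds, dtrain):
    # feval receives raw predictions and the DMatrix; returns (name, value)
    labels = dtrain.get_label()
    return 'rmse', float(np.sqrt(np.mean((preds - labels) ** 2)))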
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd
def save_dataframe(self, df):
        with pd.HDFStore(''.join([TwoSigmaFinModTools._save_path, 'train_debug', self.timestamp, '.h5']), "w") as train:
            train.put("train_debug", df)
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd
def load_dataframe():
        dataframe_name = 'train_debug'

        # one-hot encoded
        # not one-hot
        # date_time = '20170613_19h09m40s'
        # date_time = '20170613_19h34m31s'
        # date_time = '20170614_00h07m32s'
        date_time = '20170619_11h47m22s'
        with pd.HDFStore(''.join([TwoSigmaFinModTools._save_path, dataframe_name, date_time, '.h5']), 'r') as train:
            return train.get(dataframe_name)