Python xgboost 模块,DMatrix() 实例源码

我们从Python开源项目中,提取了以下49个代码示例,用于说明如何使用xgboost.DMatrix()

项目:ensemble_amazon    作者:kaz-Anova    | 项目源码 | 文件源码
def predict_proba(self, X):
    """Return class probabilities for ``X``.

    When ``self.k_models`` is ``None`` or holds fewer than two boosters, the
    single booster ``self.bst`` scores the matrix built by
    ``self.build_matrix``.  Otherwise every booster in ``self.k_models``
    scores ``xgb.DMatrix(X)`` and the results are averaged.

    Returns:
        For the ``'multi:softprob'`` objective, the predictions reshaped to
        ``(rows, self.num_class)``; otherwise a two-column array
        ``[1 - p, p]``.
    """
    try:
        rows = X.shape[0]
    except (AttributeError, IndexError):
        # Inputs without a usable ``shape`` (e.g. plain lists).
        rows = len(X)
    X1 = self.build_matrix(X)
    # A missing model list is treated like "fewer than two models"; the
    # original condition sent ``None`` into the averaging loop, which crashed.
    if self.k_models is None or len(self.k_models) < 2:
        predictions = self.bst.predict(X1)
    else:
        dtest = xgb.DMatrix(X)
        predictions = None
        for gbdt in self.k_models:
            predsnew = gbdt.predict(
                dtest,
                ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
            # Identity test: ``ndarray == None`` is element-wise and cannot be
            # used as a truth value.
            if predictions is None:
                predictions = predsnew
            else:
                predictions += predsnew
        predictions /= float(len(self.k_models))
        predictions = np.array(predictions)
    if self.objective == 'multi:softprob':
        return predictions.reshape(rows, self.num_class)
    return np.vstack([1 - predictions, predictions]).T
项目:kaggle-review    作者:daxiongshu    | 项目源码 | 文件源码
def fit(self,X,y,Xg,Xt=None,yt=None,Xgt=None,load_model=None,save_model=None):
    """Train a booster on grouped data and store it on ``self.bst``.

    ``Xg`` / ``Xgt`` are passed to ``DMatrix.set_group`` for the training and
    validation matrices.  When ``Xt`` is given, training watches both sets
    with early stopping and ``maximize=True``; otherwise only the training
    set is watched.  ``load_model`` is forwarded as ``xgb_model``; when
    ``save_model`` is not ``None`` the booster is saved to that path.
    """
    print(X.shape,y.shape)
    rounds = self.params['num_round']
    patience = self.params['early_stopping_rounds']

    train_set = xgb.DMatrix(X, y)
    train_set.set_group(Xg)

    if Xt is None:
        booster = xgb.train(self.params, train_set, rounds,
                            evals=[(train_set, 'train')],
                            verbose_eval=1, xgb_model=load_model)
    else:
        valid_set = xgb.DMatrix(Xt, yt)
        valid_set.set_group(Xgt)
        booster = xgb.train(self.params, train_set, rounds,
                            evals=[(train_set, 'train'), (valid_set, 'valid')],
                            early_stopping_rounds=patience, verbose_eval=1,
                            xgb_model=load_model, maximize=True)

    self.bst = booster
    if save_model is not None:
        booster.save_model(save_model)
项目:PortfolioTimeSeriesAnalysis    作者:MizioAnd    | 项目源码 | 文件源码
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                              y_test_split, title_name):
    """Train on the given split and scatter-plot predicted against actual y.

    ``xgb`` is the xgboost module (or a compatible object) passed in by the
    caller.  The booster is trained for ``best_nrounds`` rounds, the test
    split is predicted, and the RMSE from ``self.rmse`` is put in the title.
    A diagonal from min to max of the actual values is drawn for reference.
    """
    train_matrix = xgb.DMatrix(x_train_split, label=y_train_split)
    test_matrix = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))

    booster = xgb.train(xgb_params, train_matrix, best_nrounds)
    y_predicted = booster.predict(test_matrix)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_value = self.rmse(y_predicted, y_test_split)
    heading = ''.join((title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)))
    plt.title(heading)
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')

    # Reference diagonal spanning the range of the actual values.
    diagonal = [min(y_test_split), max(y_test_split)]
    plt.plot(diagonal, list(diagonal))
    plt.tight_layout()
项目:DriverPower    作者:smshuai    | 项目源码 | 文件源码
def predict_with_gbm(X, y, model):
    """Average the predictions of the k-fold boosters stored in ``model``.

    Args:
        X: feature data handed to ``xgb.DMatrix``.
        y: table exposing ``nMut``, ``length`` and ``N`` attributes and a
            ``shape`` (presumably a pandas DataFrame; confirm with caller).
        model: dict with keys ``'model_name'`` (must be ``'GBM'``),
            ``'feature_names'``, ``'kfold'``, ``'params'`` and ``'model'``
            (boosters indexed ``1..kfold``).

    Returns:
        Array of length ``y.shape[0]`` holding the mean prediction over folds.

    """
    assert model['model_name'] == 'GBM',\
        'Wrong model name in model info: {}. Need GBM.'.format(model['model_name'])
    test_matrix = xgb.DMatrix(data=X, label=y.nMut.values, feature_names=model['feature_names'])
    base_margin = np.array(np.log(y.length+1/y.N) + np.log(y.N))
    test_matrix.set_base_margin(base_margin)

    n_folds = model['kfold']
    total = np.zeros(y.shape[0])
    for fold in range(1, n_folds + 1):
        booster = model['model'][fold]
        booster.set_param(model['params'])  # Bypass a bug of dumping without max_delta_step
        total += booster.predict(test_matrix)
    return total / n_folds
项目:Supply-demand-forecasting    作者:LevinJ    | 项目源码 | 文件源码
def run_grid_search(self):
    """
    Run cross-validation for every parameter combination in the grid.

    Called by derived classes.  Each combination is logged, evaluated with
    ``xgb.cv`` on the folds returned by ``getFeaturesLabel`` and recorded;
    the collected results are displayed at the end.
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    cv_matrix = xgb.DMatrix(features, label=labels, feature_names=features.columns)

    candidates = self.__get_param_iterable(self.__get_param_grid())
    learning_kwargs = self.get_learning_params()
    for candidate in candidates:
        logging.info("used parameters: {}".format(candidate))
        cv_result = xgb.cv(candidate, cv_matrix, folds=cv_folds, **learning_kwargs)
        self.__add_to_resultset(candidate, cv_result)

    self.__disp_result()
项目:AutoFolio    作者:mlindauer    | 项目源码 | 文件源码
def predict(self, X):
    '''
        predict binary labels by thresholding the model output at 0.5

        Arguments
        ---------
        X: numpy.array
            instance feature matrix

        Returns
        -------
        numpy.array
            model outputs with values below 0.5 set to 0 and values
            at or above 0.5 set to 1

    '''
    scores = np.array(self.model.predict(xgb.DMatrix(X)))
    below = scores < 0.5
    at_or_above = scores >= 0.5
    scores[below] = 0
    scores[at_or_above] = 1
    return scores
项目:tianchi_power    作者:lvniqi    | 项目源码 | 文件源码
def crate_pre_train_model(x_,y_):
    """Train a regression booster on a 90/10 split with early stopping.

    Args:
        x_: feature rows.
        y_: targets aligned with ``x_``.

    Returns:
        The booster returned by ``xgb.train`` (up to 5000 rounds, stopping
        after 100 rounds without improvement on the held-out 10%, scored
        with the module-level ``evalerror``).
    """
    # Split features and targets in ONE call so the rows stay paired; two
    # separate calls only line up by virtue of the shared random_state and
    # silently misalign when the lengths differ.
    x_train, x_test, y_train, y_test = train_test_split(
        x_, y_, test_size=0.1, random_state=1)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    # Passed as a dict: on Python 3 ``dict.items()`` is a view, which
    # xgboost cannot copy or extend.
    param = {'objective': 'reg:linear', 'max_depth': 3, 'nthread': 64}
    num_round = 5000
    bst = xgb.train(param, dtrain, num_round,
                    evallist, early_stopping_rounds=100,
                    feval=evalerror)
    return bst

# %% main
项目:instacart-basket-prediction    作者:colinmorris    | 项目源码 | 文件源码
def as_dmatrix(self):
    """Return this dataset as a DMatrix, using an on-disk binary cache.

    If ``self.dmatrix_cache_path`` exists it is loaded; otherwise the matrix
    is built with ``self._as_dmatrix()`` and saved there.  Weights are
    attached afterwards when ``self.weight_mode`` is not ``'none'``.
    """
    cache_path = self.dmatrix_cache_path
    # xgb is not try/except friendly here
    if not os.path.exists(cache_path):
        logging.info('Cache miss on dmatrix. Building and caching.')
        matrix = self._as_dmatrix()
        matrix.save_binary(cache_path)
    else:
        ftypes = self.feature_types if FTYPES else None
        matrix = xgb.DMatrix(cache_path, feature_names=self.feature_names,
                             feature_types=ftypes)
    # Weights are added after the fact so that the serialized dmatrix files
    # do not multiply per weighting scheme.
    if self.weight_mode != 'none':
        matrix.set_weight(self.get_weights())
    return matrix
项目:Tencent2017_Final_Coda_Allegro    作者:BladeCoda    | 项目源码 | 文件源码
def predict_test_prob(bst):
    """Score the test set with ``bst`` and write ``result/submission2.csv``.

    The test table is left-joined with the stacked LightGBM probabilities on
    ``instanceID``; five columns are dropped before prediction.  The output
    file has the columns ``instanceID`` and ``prob``.
    """
    test_frame = loadCSV('data/first_merge/test_join_v9.csv')

    stacked = loadCSV('data/stacking/prob_lgbm_test.csv')
    print('????')
    test_frame = pd.merge(test_frame, stacked, how='left', on='instanceID')
    del stacked

    instance_ids = test_frame.instanceID.values
    dropped_cols = ['label', 'clickTime', 'instanceID', 'residence', 'appCategory']
    features = test_frame.drop(dropped_cols, axis=1).values

    # Release the frame before building the DMatrix.
    del test_frame

    probabilities = bst.predict(xgb.DMatrix(features))

    submission = pd.DataFrame({'instanceID': instance_ids, 'prob': probabilities})
    submission.to_csv('result/submission2.csv', index=False)

#????
项目:gcForest    作者:kingfengji    | 项目源码 | 文件源码
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    """Train a booster from ``train_config`` and log its test accuracy.

    ``train_config`` supplies ``"param"`` (booster parameters) and
    ``"num_round"``.  Predictions are reduced with ``argmax`` over axis 1, so
    the booster is expected to emit one score per class.  Returns ``None``;
    a Ctrl-C during training is logged and ends the function early.
    """
    import xgboost as xgb
    shapes = (X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(*shapes))

    booster_params = train_config["param"]
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    test_matrix = xgb.DMatrix(X_test, label=y_test)
    rounds = int(train_config["num_round"])
    evals = [(train_matrix, 'train'), (test_matrix, 'test')]

    try:
        booster = xgb.train(booster_params, train_matrix, rounds, evals)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
        return

    predicted_class = np.argmax(booster.predict(test_matrix), axis=1)
    n_correct = np.sum(predicted_class == y_test)
    accuracy = 100. * n_correct / len(y_test)
    LOGGER.info("accuracy={}%".format(accuracy))
项目:Medium-crawler-with-data-analyzer    作者:lifei96    | 项目源码 | 文件源码
def data_pre_process(train_path, test_path, label, drop_list=None):
    """Load train/test CSV files and wrap them as labelled DMatrix objects.

    Args:
        train_path: path of the training CSV.
        test_path: path of the test CSV.
        label: name of the target column; it is cast to ``int``.
        drop_list: optional columns to drop from both files before use.

    Returns:
        ``(dtrain, dtest)``.
    """
    def _load_xy(csv_path):
        # Read one CSV and split it into (features, integer target).
        dataset = pandas.read_csv(csv_path)
        if drop_list:
            dataset = dataset.drop(drop_list, axis=1)
        target = dataset[label].astype(int)
        # Single-argument call form prints identically on Python 2 and 3;
        # the original ``print x`` statement is a syntax error on Python 3.
        print(target.dtypes)
        return dataset.drop(label, axis=1), target

    X_train, y_train = _load_xy(train_path)
    X_test, y_test = _load_xy(test_path)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    return dtrain, dtest
项目:fnc-1    作者:shangjingbo1226    | 项目源码 | 文件源码
def train_relatedness_classifier(trainX, trainY):
    """Train and return a binary-logistic booster on ``trainX`` / ``trainY``.

    Training runs for a fixed 1000 rounds with no evaluation set.
    """
    train_matrix = xgb.DMatrix(trainX, label=trainY)
    # Binary classification with logistic output, learning rate 0.1,
    # trees up to depth 6, quiet logging, 20 threads.
    booster_params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'nthread': 20,
    }
    rounds = 1000
    return xgb.train(booster_params, train_matrix, rounds)
项目:HousePrices    作者:MizioAnd    | 项目源码 | 文件源码
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    """Hold out part of the training data and plot predicted vs. actual price.

    The data is split with ``train_test_split`` (default proportions), the
    number of rounds is taken from a 4-fold ``xgb.cv`` run with early
    stopping (result rows minus one), and a booster trained for that many
    rounds predicts the held-out part.  The RMSE from ``self.rmse`` is put in
    the plot title.
    """
    # Split the training data into an extra set of test
    split = train_test_split(x_train, y_train)
    x_train_split, x_test_split, y_train_split, y_test_split = split
    train_matrix = xgb.DMatrix(x_train_split, label=y_train_split)
    test_matrix = xgb.DMatrix(x_test_split)

    cv_table = xgb.cv(xgb_params, train_matrix, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                      early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
    best_nrounds = cv_table.shape[0] - 1

    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    booster = xgb.train(xgb_params, train_matrix, best_nrounds)
    y_predicted = booster.predict(test_matrix)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_value = self.rmse(y_predicted, y_test_split)
    heading = ''.join((title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)))
    plt.title(heading)
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    # Reference diagonal spanning the range of the actual values.
    diagonal = [min(y_test_split), max(y_test_split)]
    plt.plot(diagonal, list(diagonal))
    plt.tight_layout()
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def setUpClass(self):
    """
    Load the Boston dataset and train a one-round booster for the tests.

    Does nothing when either xgboost or scikit-learn is unavailable.
    Stores ``scikit_data``, ``xgb_model`` and ``feature_names`` on ``self``.
    """
    if not (HAS_XGBOOST and HAS_SKLEARN):
        return

    dataset = load_boston()
    train_matrix = xgboost.DMatrix(dataset.data, label=dataset.target,
                                   feature_names=dataset.feature_names)
    booster = xgboost.train({}, train_matrix, 1)

    # Save the data and the model
    self.scikit_data = dataset
    self.xgb_model = booster
    self.feature_names = self.scikit_data.feature_names
项目:mlprojects-py    作者:srinathperera    | 项目源码 | 文件源码
def regression_with_xgboost_no_cv(x_train, y_train, X_test, Y_test, features=None, xgb_params=None, num_rounds = 10):
    """Train a booster with early stopping and predict the test set.

    Args:
        x_train, y_train: training features and targets.
        X_test, Y_test: evaluation features and targets (also used for early
            stopping with a patience of 5 rounds).
        features: forwarded to ``ceate_feature_map_for_feature_importance``
            when the booster is ``gbtree``.
        xgb_params: booster parameters; defaults to
            ``get_default_xgboost_params()``.  Must contain ``"booster"``.
        num_rounds: maximum number of boosting rounds.

    Returns:
        ``(XGBoostModel(gbdt), y_pred)``.
    """
    nan = float('nan')
    train_data = xgb.DMatrix(x_train, label=y_train, missing=nan)
    test_data = xgb.DMatrix(X_test, Y_test, missing=nan)
    evallist = [(train_data, 'train'), (test_data, 'eval')]

    # print() with one pre-formatted argument emits the same text on Python 2
    # and 3; the original print statements do not parse on Python 3.
    if xgb_params is None:
        xgb_params = get_default_xgboost_params()
        print("xgb_params not found")

    print("XGBoost, using param %s" % (xgb_params,))
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval = True, early_stopping_rounds=5)

    predict_kwargs = {}
    if xgb_params["booster"] == "gbtree":
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt)
        # Only tree boosters are limited to the best iteration found.
        predict_kwargs['ntree_limit'] = gbdt.best_ntree_limit
    y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=nan), **predict_kwargs)

    return XGBoostModel(gbdt), y_pred
项目:KAGGLE_AVITO_2016    作者:ZFTurbo    | 项目源码 | 文件源码
def run_train_with_model(train, features, model_path):
    """Score ``train`` with a saved booster and report its AUC.

    Loads the model at ``model_path``, predicts ``train[features]`` and
    compares against ``train['isDuplicate']``.

    Returns ``(validation_df, score)`` where ``validation_df`` holds the item
    ids, the true label and the predicted probability.
    """
    started = time.time()

    booster = xgb.Booster()
    booster.load_model(model_path)

    print("Validating...")
    probabilities = booster.predict(xgb.DMatrix(train[features]))
    truth = train['isDuplicate'].values
    score = roc_auc_score(truth, probabilities)
    validation_df = pd.DataFrame({'itemID_1': train['itemID_1'].values, 'itemID_2': train['itemID_2'].values,
                                  'isDuplicate': train['isDuplicate'].values, 'probability': probabilities})
    print('AUC score value: {:.6f}'.format(score))

    importance = get_importance(booster, features)
    print('Importance array: ', importance)

    elapsed_minutes = round((time.time() - started)/60, 2)
    print('Prediction time: {} minutes'.format(elapsed_minutes))
    return validation_df, score
项目:KAGGLE_AVITO_2016    作者:ZFTurbo    | 项目源码 | 文件源码
def run_test_with_model(train, test, features, model_path):
    """Validate a saved booster on ``train`` and predict ``test``.

    Loads the model at ``model_path``, reports the AUC of its predictions on
    ``train[features]`` against ``train['isDuplicate']``, prints the feature
    importance, then predicts ``test[features]``.

    Returns ``(test_prediction_list, validation_df, score)``.
    """
    started = time.time()

    booster = xgb.Booster()
    booster.load_model(model_path)

    print("Validating...")
    probabilities = booster.predict(xgb.DMatrix(train[features]))
    truth = train['isDuplicate'].values
    score = roc_auc_score(truth, probabilities)
    validation_df = pd.DataFrame({'isDuplicate': train['isDuplicate'].values, 'probability': probabilities})
    print('AUC score value: {:.6f}'.format(score))

    importance = get_importance(booster, features)
    print('Importance array: ', importance)

    print("Predict test set...")
    test_matrix = xgb.DMatrix(test[features])
    test_prediction = booster.predict(test_matrix)

    elapsed_minutes = round((time.time() - started)/60, 2)
    print('Training time: {} minutes'.format(elapsed_minutes))
    return test_prediction.tolist(), validation_df, score
项目:CreditScoring    作者:cqw5    | 项目源码 | 文件源码
def ExtGBDTEnsemblePredict(sub_clf_num, predict_x):
    """
    Average the predictions of the pickled sub-models ``../model/model<i>``.

    :param sub_clf_num: number of sub-models to load (``model0`` up to
        ``model<sub_clf_num - 1>``)
    :param predict_x: feature rows to score; must support ``len()`` and be
        accepted by ``xgb.DMatrix``
    :return: score: ndarray, mean of the sub-model predictions
    """
    total_score = np.zeros(len(predict_x))  # running sum of sub-model outputs
    # The input does not change between models, so build the matrix once.
    predict_X = xgb.DMatrix(predict_x)
    for i in range(sub_clf_num):
        model_file = '../model/model' + str(i)
        # Pickles are binary: open in 'rb' (text mode fails on Python 3) and
        # close the handle deterministically.
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load model files from a trusted location.
        with open(model_file, 'rb') as model_fh:
            bst = pickle.load(model_fh)
        total_score += bst.predict(predict_X)
    score = total_score / sub_clf_num
    return score
项目:CreditScoring    作者:cqw5    | 项目源码 | 文件源码
def ExtGBDT(train_x, train_y, test_x, test_y):
    """Train a 100-round binary-logistic booster and return its test AUC.

    Predictions are also thresholded at 0.5 into 0/1 labels, which are passed
    to ``evaluate`` together with ``test_y``.
    """
    rounds = 100
    booster_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eta': 0.03, 'max_depth': 3,
                      'eval_metric': 'auc', 'silent': 1, 'min_child_weight': 0.1, 'subsample': 0.7,
                      'colsample_bytree': 0.8, 'nthread': 4, 'max_delta_step': 0}
    train_matrix = xgb.DMatrix(train_x, train_y)
    test_matrix = xgb.DMatrix(test_x)
    booster = xgb.train(booster_params, train_matrix, rounds)
    pred = booster.predict(test_matrix)
    # Hard labels: below 0.5 -> 0, everything else -> 1.
    predict_y = [0 if pred[i] < 0.5 else 1 for i in range(len(pred))]
    auc = evaluate_auc(pred, test_y)
    evaluate(predict_y, test_y)
    return auc
项目:bank-product-recommender    作者:rohansapre    | 项目源码 | 文件源码
def runXGB(train_X, train_y, seed_val=123):
    """Train and return a 22-class ``multi:softprob`` booster (115 rounds).

    ``seed_val`` is passed to xgboost as the ``seed`` parameter.
    """
    booster_params = {
        'objective': 'multi:softprob',
        'eta': 0.05,
        'max_depth': 6,
        'silent': 1,
        'num_class': 22,
        'eval_metric': "mlogloss",
        'min_child_weight': 2,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'seed': seed_val,
    }
    rounds = 115

    param_pairs = list(booster_params.items())
    train_matrix = xgb.DMatrix(train_X, label=train_y)
    return xgb.train(param_pairs, train_matrix, rounds)
项目:KaggleExeter    作者:detomo    | 项目源码 | 文件源码
def cross_validate(train):
    """Train and score one multi-class booster per train/validation fold.

    ``split_train_validation(train)`` supplies two parallel sequences of
    frames.  For every pair a booster is trained on the training frame and
    scored with log-loss on the validation frame.

    Returns:
        ``(scores, preds)``: the per-fold log-loss values and the per-fold
        probability frames (indexed like the validation frame, columns from
        the module-level ``target_encoder.classes_``).
    """
    #separate training and validation set
    X_train, X_valid = split_train_validation(train)
    feature_cols = ['phone_brand', 'device_model', 'timestamp']
    # Booster parameters only.  ``num_boost_round`` and
    # ``early_stopping_rounds`` are arguments of ``xgb.train``; placed in
    # this dict they were ignored and training ran the default 10 rounds.
    parameters = {'max_depth':4,'eta':0.1,'silent':1, 'subsample':0.8,'colsample_bytree':0.8,
                  'objective':'multi:softprob','booster':'gbtree',
                  'num_class':12,'eval_metric':'mlogloss'}
    scores = []
    preds = []
    for train_part, valid_part in zip(X_train, X_valid):
        #convert the fold to xgboost matrices
        dtrain = xgb.DMatrix(train_part[feature_cols], label=train_part['group'], missing=np.nan)
        dvalid = xgb.DMatrix(valid_part[feature_cols], label=valid_part['group'], missing=np.nan)

        # Early stopping needs an evaluation set.
        # NOTE(review): the fold's validation frame is used both for
        # stopping and for the reported score, which makes the score
        # slightly optimistic.
        bst = xgb.train(parameters, dtrain, num_boost_round=1000,
                        evals=[(dvalid, 'valid')], early_stopping_rounds=50,
                        verbose_eval=False)
        pred = bst.predict(dvalid)

        scores.append(log_loss(valid_part['group'].tolist(), pred))
        pred = pd.DataFrame(pred, index=valid_part.index, columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
项目:dask-xgboost    作者:dask    | 项目源码 | 文件源码
def test_basic(c, s, a, b):
    """Distributed training on dask dataframes matches local xgboost.

    Generator-style test: the ``yield`` expressions hand futures back to the
    test harness (presumably a cluster decorator outside this view).
    """
    local_matrix = xgb.DMatrix(df, label=labels)
    local_bst = xgb.train(param, local_matrix)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dist_bst = yield dxgb._train(c, param, ddf, dlabels)
    dist_bst = yield dxgb._train(c, param, ddf, dlabels)  # we can do this twice

    local_scores = local_bst.predict(local_matrix)
    dist_scores = dist_bst.predict(local_matrix)

    # The distributed model must classify at least as many rows correctly.
    local_hits = (local_scores > 0.5) == labels
    dist_hits = (dist_scores > 0.5) == labels
    assert dist_hits.sum() >= local_hits.sum()

    lazy_predictions = dxgb.predict(c, dist_bst, ddf)
    assert isinstance(lazy_predictions, dd.Series)
    computed = yield c.compute(lazy_predictions)._result()
    assert isinstance(computed, pd.Series)

    assert ((computed > 0.5) != labels).sum() < 2
项目:dask-xgboost    作者:dask    | 项目源码 | 文件源码
def test_dmatrix_kwargs(c, s, a, b):
    """DMatrix keyword arguments given to distributed training take effect.

    A model trained with ``missing=0.0`` must agree with the local model on a
    matrix built the same way, and disagree on one built without it.
    """
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dist_bst = yield dxgb._train(c, param, dX, dy, {"missing": 0.0})

    # Distributed model matches local model with dmatrix kwargs
    matching_matrix = xgb.DMatrix(X, label=y, missing=0.0)
    local_bst = xgb.train(param, matching_matrix)
    local_scores = local_bst.predict(matching_matrix)
    dist_scores = dist_bst.predict(matching_matrix)
    assert np.abs(local_scores - dist_scores).sum() < 0.02

    # Distributed model gives bad predictions without dmatrix kwargs
    plain_matrix = xgb.DMatrix(X, label=y)
    dist_scores_plain = dist_bst.predict(plain_matrix)
    assert np.abs(local_scores - dist_scores_plain).sum() > 0.02
项目:dask-xgboost    作者:dask    | 项目源码 | 文件源码
def test_numpy(c, s, a, b):
    """Distributed training on dask arrays matches local xgboost.

    Generator-style test: the ``yield`` expressions hand futures back to the
    test harness (presumably a cluster decorator outside this view).
    """
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice

    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    # The distributed model must classify at least as many rows correctly.
    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)

    # Compare against ``y``, the array this test trained on, rather than the
    # pandas ``labels`` used by the dataframe tests.
    assert ((predictions > 0.5) != y).sum() < 2
项目:dask-xgboost    作者:dask    | 项目源码 | 文件源码
def test_synchronous_api(loop):  # noqa
    """The blocking ``dxgb.train`` entry point matches local xgboost."""
    local_matrix = xgb.DMatrix(df, label=labels)
    local_bst = xgb.train(param, local_matrix)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:

            dist_bst = dxgb.train(c, param, ddf, dlabels)

            local_scores = local_bst.predict(local_matrix)
            dist_scores = dist_bst.predict(local_matrix)

            # The distributed model must classify at least as many rows
            # correctly as the local one.
            local_hits = (local_scores > 0.5) == labels
            dist_hits = (dist_scores > 0.5) == labels
            assert dist_hits.sum() >= local_hits.sum()
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.

    Returns ``(dbuild, dvalid, watchlist)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist

#==============================================================================
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.

    Returns ``(dbuild, dvalid, watchlist)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist

#==============================================================================
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.  ``scale_pos_weight`` is the ratio of zero labels to one labels
    in the build set.

    Returns ``(dbuild, dvalid, watchlist, scale_pos_weight)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_labels = dbuild.get_label()
    n_negative = float(np.sum(build_labels == 0))
    n_positive = np.sum(build_labels == 1)
    scale_pos_weight = n_negative / n_positive
    print('scale_pos_weight', scale_pos_weight)

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist, scale_pos_weight
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.  ``scale_pos_weight`` is the ratio of zero labels to one labels
    in the build set.

    Returns ``(dbuild, dvalid, watchlist, scale_pos_weight)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_labels = dbuild.get_label()
    n_negative = float(np.sum(build_labels == 0))
    n_positive = np.sum(build_labels == 1)
    scale_pos_weight = n_negative / n_positive
    print('scale_pos_weight', scale_pos_weight)

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist, scale_pos_weight
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.  ``scale_pos_weight`` is the ratio of zero labels to one labels
    in the build set.

    Returns ``(dbuild, dvalid, watchlist, scale_pos_weight)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_labels = dbuild.get_label()
    n_negative = float(np.sum(build_labels == 0))
    n_positive = np.sum(build_labels == 1)
    scale_pos_weight = n_negative / n_positive
    print('scale_pos_weight', scale_pos_weight)

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist, scale_pos_weight
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.  ``scale_pos_weight`` is the ratio of zero labels to one labels
    in the build set.

    Returns ``(dbuild, dvalid, watchlist, scale_pos_weight)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_labels = dbuild.get_label()
    n_negative = float(np.sum(build_labels == 0))
    n_positive = np.sum(build_labels == 1)
    scale_pos_weight = n_negative / n_positive
    print('scale_pos_weight', scale_pos_weight)

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist, scale_pos_weight
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.

    Returns ``(dbuild, dvalid, watchlist)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist

#==============================================================================
项目:Instacart    作者:KazukiOnodera    | 项目源码 | 文件源码
def split_build_valid():
    """Randomly split users into build/validation sets wrapped as DMatrix.

    Every row of the module-level ``train_user`` gets an ``is_valid`` flag
    drawn with probability ``valid_size``.  Rows of ``X_train`` / ``y_train``
    are routed by ``user_id``, and the ``user_id`` column is dropped from the
    features.

    Returns ``(dbuild, dvalid, watchlist)``.
    """
    flags = np.random.choice([0, 1], size=len(train_user),
                             p=[1-valid_size, valid_size])
    train_user['is_valid'] = flags
    n_valid = train_user['is_valid'].sum()
    n_build = train_user.shape[0] - n_valid
    print('build user:{}, valid user:{}'.format(n_build, n_valid))

    held_out_users = train_user[train_user['is_valid'] == 1].user_id
    in_valid = X_train.user_id.isin(held_out_users)
    in_build = ~in_valid

    dbuild = xgb.DMatrix(X_train[in_build].drop('user_id', axis=1),
                         label=y_train[in_build])
    dvalid = xgb.DMatrix(X_train[in_valid].drop('user_id', axis=1),
                         label=y_train[in_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    build_shape = (dbuild.num_row(), dbuild.num_col())
    valid_shape = (dvalid.num_row(), dvalid.num_col())
    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format(build_shape, valid_shape))

    return dbuild, dvalid, watchlist

#==============================================================================
项目:gestalt    作者:mpearmain    | 项目源码 | 文件源码
def fit(self, X, y, x_val=None, y_val=None):
    """Train a booster on ``X`` / ``y`` and store it on ``self.clf``.

    When ``x_val`` is given, training watches the train and validation sets
    and uses ``self.early_stopping_rounds``.  Without a validation set, early
    stopping is not requested: xgboost requires at least one evaluation set
    for it and rejects the call otherwise.
    """
    dtrain = xgb.DMatrix(X, label=y)
    train_kwargs = dict(params=self.params,
                        dtrain=dtrain,
                        num_boost_round=self.num_round)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        train_kwargs.update(
            early_stopping_rounds=self.early_stopping_rounds,
            evals=[(dtrain, 'train'), (dtest, 'validation')],
            verbose_eval=self.verbose)
    self.clf = xgb.train(**train_kwargs)
    return
项目:gestalt    作者:mpearmain    | 项目源码 | 文件源码
def fit(self, X, y, x_val=None, y_val=None):
    """Train a booster on ``X`` / ``y`` and store it on ``self.xgb``.

    When ``x_val`` is given, training watches the train and validation sets
    and uses ``self.early_stopping_rounds``.  Without a validation set, early
    stopping is not requested: xgboost requires at least one evaluation set
    for it and rejects the call otherwise.
    """
    dtrain = xgb.DMatrix(X, label=y)
    train_kwargs = dict(params=self.params,
                        dtrain=dtrain,
                        num_boost_round=self.num_round,
                        verbose_eval=self.verbose)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        train_kwargs.update(
            early_stopping_rounds=self.early_stopping_rounds,
            evals=[(dtrain, 'train'), (dtest, 'validation')])
    self.xgb = xgb.train(**train_kwargs)
    return
项目:jdata    作者:learn2Pro    | 项目源码 | 文件源码
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and print a training-set report.

    Args:
        alg: scikit-learn style xgboost estimator.
        dtrain: frame holding the ``predictors`` columns and a ``'label'``
            target column.
        predictors: feature column names.
        useTrainCV: when true, ``n_estimators`` is first set to the number of
            rounds kept by ``xgb.cv`` with early stopping.
        cv_folds: number of CV folds.
        early_stopping_rounds: patience for the CV run.

    Prints accuracy and AUC on the training data and plots the feature
    importances.
    """
    # One target column throughout: the report previously read
    # dtrain['Disbursed'] while the model was trained on dtrain['label'].
    target = 'label'
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        # NOTE(review): ``show_progress`` and ``alg.booster()`` below are
        # spellings from older xgboost releases; confirm the pinned version.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report (call form works on both Python 2 and 3):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
项目:zhihu-machine-learning-challenge-2017    作者:HouJP    | 项目源码 | 文件源码
def fit(self,
            train_fs, train_labels,
            valid_fs, valid_labels):
        """Train a grouped (ranking) booster and return validation predictions.

        Both matrices are split into consecutive groups of ``rank_k`` rows,
        where ``rank_k`` is read from the ``[RANK]`` section of ``self.config``.

        Args:
            train_fs: training features accepted by ``xgb.DMatrix``.
            train_labels: training labels; ``len()`` is used to size the groups.
            valid_fs: validation features.
            valid_labels: validation labels.

        Returns:
            Predictions for the validation matrix, limited to the model's
            ``best_ntree_limit``.
        """
        rank_k = self.config.getint('RANK', 'rank_k')

        # Floor division keeps the repeat count an int on Python 3 as well;
        # with "/" the list multiplication raises TypeError there.
        # NOTE(review): rows beyond a multiple of rank_k are not covered by
        # any group -- presumably inputs are always a multiple; confirm.
        train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
        train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))
        valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
        valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))

        watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
        self.model = xgb.train(self.params,
                               train_DMatrix,
                               self.params['num_round'],
                               watchlist,
                               early_stopping_rounds=self.params['early_stop'],
                               verbose_eval=self.params['verbose_eval'])
        LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
        valid_preds = self.model.predict(valid_DMatrix, ntree_limit=self.model.best_ntree_limit)
        return valid_preds
项目:gcforest    作者:w821881341    | 项目源码 | 文件源码
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    """Train a booster from ``train_config`` and log its test accuracy.

    ``train_config["param"]`` is passed to ``xgb.train`` as the parameter
    set and ``train_config["num_round"]`` (converted to int) as the number
    of rounds. The predicted class is the argmax over axis 1 of the
    booster's output for the test matrix. Returns ``None``; a
    ``KeyboardInterrupt`` during training is logged and ends the call early.
    """
    import xgboost as xgb
    shape_template = "X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}"
    LOGGER.info(shape_template.format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    booster_params = train_config["param"]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    n_rounds = int(train_config["num_round"])
    evals = [(dtrain, 'train'), (dtest, 'test')]
    try:
        booster = xgb.train(booster_params, dtrain, n_rounds, evals)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
        return
    class_scores = booster.predict(dtest)
    predicted = np.argmax(class_scores, axis=1)
    n_correct = np.sum(predicted == y_test)
    acc = 100. * n_correct / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
项目:JData    作者:Xls1994    | 项目源码 | 文件源码
def xgboost_make_submission(retrain = False):
    """Score the submission window and write the predicted pairs to a CSV.

    A cached booster at ``./cache/bstmodel.bin`` is loaded when it exists and
    ``retrain`` is false; otherwise ``xgboost_train()`` supplies the model.
    Rows scoring at least 0.03 are kept, reduced to the first ``sku_id`` per
    ``user_id``, and written to ``./sub/submission_<timestamp>.csv``.

    Args:
        retrain: when true, ignore any cached model and train a new one.
    """
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    if os.path.exists('./cache/bstmodel.bin') and not retrain:
        # The key must be spelled 'nthread'; the original 'ntheard' is not a
        # parameter name xgboost knows, so the thread count was never applied.
        bst = xgb.Booster({'nthread': 4})
        bst.load_model('./cache/bstmodel.bin')
    else:
        bst = xgboost_train()
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date, )
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    # One row per user: the first sku_id in the user's group.
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    dt = datetime.datetime.now()
    # Hour/minute/second are appended without zero padding, matching the
    # file names this function has always produced.
    sdt = str(dt.date())+str(dt.hour)+str(dt.minute)+str(dt.second)
    pred.to_csv('./sub/submission_%s.csv' % sdt, index=False, index_label=False)
项目:JData    作者:Xls1994    | 项目源码 | 文件源码
def xgboost_test_offline():
    """Train a model and evaluate it offline on the 2016-04-11..2016-04-16 window.

    Predictions scoring at least 0.03 are reduced to the first ``sku_id`` per
    ``user_id``. The reference labels are the rows with ``label == 1`` whose
    ``sku_id`` is in the set returned by ``get_sku_ids_in_P()``. Both tables
    are handed to ``eval.eval``; nothing is returned.
    """
    bst = xgboost_train(True)
    P = get_sku_ids_in_P()
    labels = get_labels('2016-04-11','2016-04-16')
    sub_user_index, sub_trainning_data = make_test_set('2016-04-11', '2016-04-16', )
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    # Copy the filtered rows before assigning a column so the write targets
    # an independent frame rather than a slice of the original labels table.
    labels = labels[labels['label']==1].copy()
    labels['user_id'] = labels['user_id'].astype(int)
    labels = labels[['user_id','sku_id']]
    labels = labels[labels['sku_id'].isin(P)]
    eval.eval(pred,labels)
项目:Sberbank    作者:dimaquick    | 项目源码 | 文件源码
def Features(my, prodShift):
    """Build the training and validation matrices from ``my``.

    For every (user, mcc) pair two months are used: month ``13 + prodShift``
    goes to the training set and month ``14 + prodShift`` to the validation
    set, the latter only for users in ``my.ValidUsers``. The target is
    ``log(1 + answer)`` for that user/mcc/month.

    Args:
        my: object providing ``Users``, ``MccList``, ``ValidUsers``,
            ``Answers`` (keyed by ``user + '_' + mcc``, then indexed by month)
            and a ``Features(user, mcc, month)`` method.
        prodShift: integer offset added to the month numbers.

    Returns:
        ``(train DMatrix, validation DMatrix, validation targets, keys)``,
        where ``keys`` holds ``[user, mcc]`` for each validation row in order.
    """
    Xtrain, Ytrain, Xvalid, Yvalid = [], [], [], []
    keys = []
    valid_month = 14 + prodShift
    # Start directly at the first used month instead of walking and skipping
    # all earlier ones; clamped at 0 to match the original 0-based range.
    first_month = max(0, 13 + prodShift)
    for u in my.Users:
        for m in my.MccList:
            for month in range(first_month, valid_month + 1):
                f = my.Features(u, m, month)
                ans = math.log(1.0 + my.Answers[u + '_' + m][month])
                if month == valid_month:
                    if u not in my.ValidUsers:
                        continue
                    Xvalid.append(f)
                    Yvalid.append(ans)
                    keys.append([u, m])
                else:
                    Xtrain.append(f)
                    Ytrain.append(ans)
    Xtrain, Ytrain, Xvalid, Yvalid = [np.asarray(a) for a in (Xtrain, Ytrain, Xvalid, Yvalid)]
    return xgboost.DMatrix(Xtrain, Ytrain), xgboost.DMatrix(Xvalid, Yvalid), Yvalid, keys
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Run seeded, stratified K-fold xgboost training and accumulate predictions.

    Out-of-fold predictions are summed into ``v[cname]`` and test predictions
    into ``z[cname]`` (both after ``pconvert``), then averaged over the number
    of runs. Per-fold log-loss values and their mean/std are printed.

    Args:
        train2: training features (DataFrame; rows are selected by position).
        y: training targets, indexable by the fold index arrays.
        test2: test features.
        v: table receiving out-of-fold predictions in column ``cname``.
        z: table receiving averaged test predictions in column ``cname``.
        xgb_params: booster parameters; its ``'seed'`` entry is overwritten
            in place for every seed.
        N_splits: number of folds.
        N_seeds: number of seeds to repeat the K-fold run with.
        cname: name of the output column in ``v`` and ``z``.
        base_seed: first seed value; seed ``s`` uses ``s + base_seed``.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    # No random_state is passed, so the shuffling of this splitter is not
    # tied to the booster seed; it is shared by all seeds.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # skf.split yields positional indices, so select rows with .iloc
            # (.ix is no longer available in current pandas).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Every test row is predicted once per fold per seed; every training row
    # is predicted out-of-fold once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Run seeded, stratified K-fold xgboost training and accumulate predictions.

    For each seed a new splitter is built with ``random_state`` equal to the
    booster seed. Out-of-fold predictions are summed into ``v[cname]`` and
    test predictions into ``z[cname]`` (both after ``pconvert``), then
    averaged over the number of runs. Per-fold log-loss values and their
    mean/std are printed.

    Args:
        train2: training features (DataFrame; rows are selected by position).
        y: training targets, indexable by the fold index arrays.
        test2: test features.
        v: table receiving out-of-fold predictions in column ``cname``.
        z: table receiving averaged test predictions in column ``cname``.
        xgb_params: booster parameters; its ``'seed'`` entry is overwritten
            in place for every seed.
        N_splits: number of folds.
        N_seeds: number of seeds to repeat the K-fold run with.
        cname: name of the output column in ``v`` and ``z``.
        base_seed: first seed value; seed ``s`` uses ``s + base_seed``.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # skf.split yields positional indices, so select rows with .iloc
            # (.ix is no longer available in current pandas).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Every test row is predicted once per fold per seed; every training row
    # is predicted out-of-fold once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Run seeded, stratified K-fold xgboost training and accumulate predictions.

    Out-of-fold predictions are summed into ``v[cname]`` and test predictions
    into ``z[cname]`` (both after ``pconvert``), then averaged over the number
    of runs. Per-fold log-loss values and their mean/std are printed.

    Args:
        train2: training features (DataFrame; rows are selected by position).
        y: training targets, indexable by the fold index arrays.
        test2: test features.
        v: table receiving out-of-fold predictions in column ``cname``.
        z: table receiving averaged test predictions in column ``cname``.
        xgb_params: booster parameters; its ``'seed'`` entry is overwritten
            in place for every seed.
        N_splits: number of folds.
        N_seeds: number of seeds to repeat the K-fold run with.
        cname: name of the output column in ``v`` and ``z``.
        base_seed: first seed value; seed ``s`` uses ``s + base_seed``.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    # No random_state is passed, so the shuffling of this splitter is not
    # tied to the booster seed; it is shared by all seeds.
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # skf.split yields positional indices, so select rows with .iloc
            # (.ix is no longer available in current pandas).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    # Every test row is predicted once per fold per seed; every training row
    # is predicted out-of-fold once per seed.
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:ensemble_amazon    作者:kaz-Anova    | 项目源码 | 文件源码
def build_matrix(self, X, opt_y=None, weighting=None):
    """Wrap ``X`` in an ``xgb.DMatrix`` with optional labels and weights.

    ``X`` is converted to a CSR sparse matrix and ``-999.0`` is declared as
    the missing-value marker.

    Args:
        X: feature matrix; ``X.shape[0]`` is used as the row count.
        opt_y: optional labels, converted with ``np.array``.
        weighting: optional per-row weights. They are rescaled so that they
            sum to the number of rows. The caller's sequence is left
            untouched (the original rescaled it in place).

    Returns:
        The constructed ``xgb.DMatrix``.
    """
    # `is None` rather than `== None`: the latter compares elementwise when
    # the argument is a numpy array, which cannot be used as a condition.
    dmatrix_kwargs = {'missing': -999.0}
    if opt_y is not None:
        dmatrix_kwargs['label'] = np.array(opt_y)
    if weighting is not None:
        # Work on a float copy so integer weights are not truncated.
        weights = np.asarray(weighting, dtype=float)
        dmatrix_kwargs['weight'] = weights * (float(X.shape[0]) / np.sum(weights))
    return xgb.DMatrix(csr_matrix(X), **dmatrix_kwargs)
项目:ensemble_amazon    作者:kaz-Anova    | 项目源码 | 文件源码
def predict(self, X): 
    """Predict with the single booster or with the average of the fold models.

    When ``self.k_models`` is set and holds fewer than two models,
    ``self.bst`` predicts on the matrix built by ``self.build_matrix``.
    Otherwise every model in ``self.k_models`` predicts on
    ``xgb.DMatrix(X)`` (limited to its best iteration times
    ``self.num_parallel_tree`` trees) and the element-wise mean is returned.

    Args:
        X: feature matrix.

    Returns:
        The predictions of ``self.bst``, or the mean of the per-model
        predictions.
    """
    if self.k_models is not None and len(self.k_models) < 2:
        return self.bst.predict(self.build_matrix(X))

    # NOTE(review): this branch builds the DMatrix directly rather than via
    # build_matrix, so the -999.0 missing marker is not applied -- confirm
    # this is intended.
    dtest = xgb.DMatrix(X)
    total = None
    for gbdt in self.k_models:
        fold_preds = gbdt.predict(dtest, ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
        # `total + fold_preds` allocates a new array, so no model's own
        # output is modified while accumulating.
        total = fold_preds if total is None else total + fold_preds
    # The original built its accumulator by iterating over the integer
    # X.shape[0] (a TypeError) and never returned the averaged result.
    return total / float(len(self.k_models))
项目:kaggle-review    作者:daxiongshu    | 项目源码 | 文件源码
def predict(self,Xt,Xg,load_model=None):
        """Predict on grouped data, loading a saved booster first if needed.

        ``Xt`` is wrapped in a DMatrix and ``Xg`` is set as its group
        information. A booster is loaded from ``load_model`` only when that
        argument is truthy and ``self.bst`` is still ``None``.
        """
        print("load_model",load_model)
        grouped_matrix = xgb.DMatrix(Xt)
        grouped_matrix.set_group(Xg)
        if load_model:
            if self.bst is None:
                self.bst = xgb.Booster(self.params, model_file=load_model)
        booster = self.bst
        return booster.predict(grouped_matrix)
项目:kaggle-review    作者:daxiongshu    | 项目源码 | 文件源码
def fit(self,X,y,Xt=None,yt=None,
        load_model=None,save_model=None,
        obj=None,feval=None,print_fscore=True,evalx=None):
        """Train a booster, optionally with a validation set, and store it on ``self.bst``.

        Training settings are read from ``self.params``: ``num_round``
        (default 100), ``early_stopping_rounds`` (default None), ``maximize``
        (default False) and ``verbose_eval`` (default 1). Early stopping and
        ``maximize`` are passed to ``xgb.train`` only when ``Xt`` is given.

        Args:
            X, y: training features and labels.
            Xt, yt: optional validation features and labels.
            load_model: forwarded to ``xgb.train`` as ``xgb_model``.
            save_model: when not None, the trained booster is saved there.
            obj, feval: custom objective / evaluation callables for ``xgb.train``.
            print_fscore: print each entry of ``self.feature_importance()``.
            evalx: optional ``evalx(yt, predictions)`` scoring callable.

        Returns:
            ``evalx``'s score when both ``Xt`` and ``evalx`` are given, else 0.
        """
        print(X.shape,y.shape)

        settings = self.params
        n_rounds = settings.get('num_round',100)
        stop_after = settings.get('early_stopping_rounds',None)
        maximize = settings.get('maximize',False)
        dtrain = xgb.DMatrix(X, y)
        verbosity = settings.get('verbose_eval',1)

        # Arguments common to both training modes.
        train_kwargs = dict(verbose_eval=verbosity, xgb_model=load_model,
                            obj=obj, feval=feval)
        if Xt is None:
            train_kwargs['evals'] = [(dtrain, 'train')]
        else:
            dvalid = xgb.DMatrix(Xt, yt)
            train_kwargs['evals'] = [(dtrain, 'train'), (dvalid, 'valid')]
            train_kwargs['early_stopping_rounds'] = stop_after
            train_kwargs['maximize'] = maximize
        bst = xgb.train(self.params, dtrain, n_rounds, **train_kwargs)

        self.bst = bst
        if save_model is not None:
            bst.save_model(save_model)

        importances = self.feature_importance()
        if print_fscore:
            print("Feature Importance:")
            for entry in importances:
                print(entry)

        if Xt is None or evalx is None:
            return 0
        yp = self.predict(Xt)
        score = evalx(yt,yp)
        print(score)
        return score
项目:Supply-demand-forecasting    作者:LevinJ    | 项目源码 | 文件源码
def run_croos_validation(self):
        """Cross-validate with ``xgb.cv`` and return the best score.

        Features, labels and the fold definition come from
        ``self.getFeaturesLabel()``. The returned value is the maximum of the
        CV result column named by ``self.best_score_colname_in_cv``.
        """
        features, labels, cv_folds = self.getFeaturesLabel()
        cv_matrix = xgb.DMatrix(features, label=labels, feature_names=features.columns)
        # Parameters are set after the matrix is built, as in the original order.
        self.set_xgb_parameters()

        cv_result = xgb.cv(self.xgb_params, cv_matrix, folds=cv_folds, **self.xgb_learning_params)
        score_column = cv_result[self.best_score_colname_in_cv]
        return score_column.max()