The following 49 code examples, extracted from open-source Python projects, illustrate how to use xgboost.DMatrix().
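All of the examples below share the same core pattern: wrap the feature matrix (and optionally labels) in an xgb.DMatrix, train a Booster with xgb.train, and predict on another DMatrix. As a minimal orientation sketch before the extracted examples, here it is end to end; the toy data and parameter values are illustrative placeholders, not taken from any of the projects below.

import numpy as np
import xgboost as xgb

# Illustrative toy data (placeholder, not from the projects below).
X = np.random.rand(100, 5)
y = (X.sum(axis=1) > 2.5).astype(int)

# Wrap features and labels in a DMatrix, XGBoost's internal data structure.
dtrain = xgb.DMatrix(X, label=y)

# Minimal binary-classification parameters.
param = {'objective': 'binary:logistic', 'max_depth': 3, 'eta': 0.1}
bst = xgb.train(param, dtrain, num_boost_round=10)

# Predictions also go through a DMatrix.
preds = bst.predict(xgb.DMatrix(X))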
def predict_proba(self, X):
    try:
        rows = X.shape[0]
    except AttributeError:
        rows = len(X)
    X1 = self.build_matrix(X)
    if self.k_models is not None and len(self.k_models) < 2:
        predictions = self.bst.predict(X1)
    else:
        dtest = xgb.DMatrix(X)
        predictions = None
        for gbdt in self.k_models:
            predsnew = gbdt.predict(dtest, ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
            if predictions is None:
                predictions = predsnew
            else:
                for g in range(0, predsnew.shape[0]):
                    predictions[g] += predsnew[g]
        for g in range(0, len(predictions)):
            predictions[g] /= float(len(self.k_models))
    predictions = np.array(predictions)
    if self.objective == 'multi:softprob':
        return predictions.reshape(rows, self.num_class)
    return np.vstack([1 - predictions, predictions]).T
def fit(self, X, y, Xg, Xt=None, yt=None, Xgt=None, load_model=None, save_model=None):
    print(X.shape, y.shape)
    num_round = self.params['num_round']
    early_stopping_rounds = self.params['early_stopping_rounds']
    dtrain = xgb.DMatrix(X, y)
    dtrain.set_group(Xg)
    if Xt is not None:
        dvalid = xgb.DMatrix(Xt, yt)
        dvalid.set_group(Xgt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=1,
                        xgb_model=load_model, maximize=True)
    else:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        verbose_eval=1, xgb_model=load_model)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split,
                              y_train_split, y_test_split, title_name):
    # Split the training data into an extra set of test
    # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split),
          np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def predict_with_gbm(X, y, model):
    """
    Args:
        X:
        y:
        model:

    Returns:
    """
    assert model['model_name'] == 'GBM', \
        'Wrong model name in model info: {}. Need GBM.'.format(model['model_name'])
    testData = xgb.DMatrix(data=X, label=y.nMut.values, feature_names=model['feature_names'])
    testData.set_base_margin(np.array(np.log(y.length + 1 / y.N) + np.log(y.N)))
    kfold = model['kfold']
    pred = np.zeros(y.shape[0])
    for k in range(1, kfold + 1):
        model['model'][k].set_param(model['params'])  # Bypass a bug of dumping without max_delta_step
        pred += model['model'][k].predict(testData)
    pred = pred / kfold
    return pred
def run_grid_search(self):
    """
    This method is called by derived class to start grid search process
    """
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)

    parameter_iterable = self.__get_param_iterable(self.__get_param_grid())
    kwargs = self.get_learning_params()
    for param in parameter_iterable:
        logging.info("used parameters: {}".format(param))
        bst = xgb.cv(param, dtrain_cv, folds=cv_folds, **kwargs)
        self.__add_to_resultset(param, bst)
    self.__disp_result()
    return
def predict(self, X):
    '''
        transform ASLib scenario data

        Arguments
        ---------
        X: numpy.array
            instance feature matrix

        Returns
        -------
    '''
    preds = np.array(self.model.predict(xgb.DMatrix(X)))
    preds[preds < 0.5] = 0
    preds[preds >= 0.5] = 1
    return preds
def crate_pre_train_model(x_, y_):
    (x_train, x_test) = train_test_split(x_, test_size=0.1, random_state=1)
    (y_train, y_test) = train_test_split(y_, test_size=0.1, random_state=1)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    param = {'objective': 'reg:linear', 'max_depth': 3}
    param['nthread'] = 64
    # param['min_child_weight'] = 15
    # param['subsample'] = 1
    # param['num_class'] = 7
    plst = list(param.items())
    num_round = 5000
    bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=100,
                    # obj=logregobj, feval=evalerror
                    )
    return bst

# %% main
def as_dmatrix(self):
    path = self.dmatrix_cache_path
    # xgb is not try/except friendly here
    if os.path.exists(path):
        dm = xgb.DMatrix(path,
                         feature_names=self.feature_names,
                         feature_types=(self.feature_types if FTYPES else None))
    else:
        logging.info('Cache miss on dmatrix. Building and caching.')
        dm = self._as_dmatrix()
        dm.save_binary(path)
    # We add on weights (if any) after the fact, to avoid proliferation of big
    # serialized dmatrix files.
    if self.weight_mode != 'none':
        weights = self.get_weights()
        dm.set_weight(weights)
    return dm
def predict_test_prob(bst):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_lgbm = loadCSV('data/stacking/prob_lgbm_test.csv')

    # merge the lgbm stacking probabilities onto the test set
    df_all = pd.merge(df_all, df_sta_lgbm, how='left', on='instanceID')
    del df_sta_lgbm

    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    del df_all

    dtest = xgb.DMatrix(feature_all)
    prob = bst.predict(dtest)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission2.csv', index=False)
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceled by user's Ctrl-C action")
        return
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
def data_pre_process(train_path, test_path, label, drop_list=None):
    train_dataset = pandas.read_csv(train_path)
    if drop_list:
        train_dataset = train_dataset.drop(drop_list, axis=1)
    y_train = train_dataset[label].astype(int)
    print(y_train.dtypes)
    X_train = train_dataset.drop(label, axis=1)

    test_dataset = pandas.read_csv(test_path)
    if drop_list:
        test_dataset = test_dataset.drop(drop_list, axis=1)
    y_test = test_dataset[label].astype(int)
    print(y_test.dtypes)
    X_test = test_dataset.drop(label, axis=1)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    return dtrain, dtest
def train_relatedness_classifier(trainX, trainY):
    xg_train = xgb.DMatrix(trainX, label=trainY)
    # setup parameters for xgboost
    param = {}
    # use binary logistic classification
    param['objective'] = 'binary:logistic'
    # scale weight of positive examples
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['nthread'] = 20
    num_round = 1000
    relatedness_classifier = xgb.train(param, xg_train, num_round)
    return relatedness_classifier
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    # Split the training data into an extra set of test
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)

    res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed,
                 stratified=False, early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
    best_nrounds = res.shape[0] - 1

    print(np.shape(x_train_split), np.shape(x_test_split),
          np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_XGBOOST:
        return
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    dtrain = xgboost.DMatrix(scikit_data.data, label=scikit_data.target,
                             feature_names=scikit_data.feature_names)
    xgb_model = xgboost.train({}, dtrain, 1)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.xgb_model = xgb_model
    self.feature_names = self.scikit_data.feature_names
def regression_with_xgboost_no_cv(x_train, y_train, X_test, Y_test, features=None,
                                  xgb_params=None, num_rounds=10):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(train_data, 'train'), (test_data, 'eval')]

    if xgb_params is None:
        xgb_params = get_default_xgboost_params()
        print("xgb_params not found")

    print("XGBoost, using param", xgb_params)
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist,
                     verbose_eval=True, early_stopping_rounds=5)

    isgbtree = xgb_params["booster"] == "gbtree"
    if isgbtree:
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')),
                              ntree_limit=gbdt.best_ntree_limit)
    else:
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')))
    return XGBoostModel(gbdt), y_pred
def run_train_with_model(train, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'itemID_1': train['itemID_1'].values,
                                  'itemID_2': train['itemID_2'].values,
                                  'isDuplicate': train['isDuplicate'].values,
                                  'probability': check})
    print('AUC score value: {:.6f}'.format(score))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print('Prediction time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return validation_df, score
def run_test_with_model(train, test, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'isDuplicate': train['isDuplicate'].values,
                                  'probability': check})
    # print(validation_df)
    print('AUC score value: {:.6f}'.format(score))
    # score1 = roc_auc_score(validation_df['isDuplicate'].values, validation_df['probability'])
    # print('AUC score check value: {:.6f}'.format(score1))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]))
    print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return test_prediction.tolist(), validation_df, score
def ExtGBDTEnsemblePredict(sub_clf_num, predict_x):
    """
    Ensemble prediction: average the scores of the sub-classifiers.
    :param sub_clf_num: number of sub-classifiers in the ensemble
    :param predict_x: feature matrix to predict on
    :return: score: ndarray, averaged prediction scores
    """
    total_score = np.zeros(len(predict_x))
    # accumulate the prediction of every sub-classifier, then average
    for i in range(sub_clf_num):
        predict_X = xgb.DMatrix(predict_x)
        model_file = '../model/model' + str(i)
        bst = pickle.load(open(model_file, 'rb'))
        predict_y = bst.predict(predict_X)
        total_score += predict_y
    score = total_score / sub_clf_num
    return score
def ExtGBDT(train_x, train_y, test_x, test_y):
    """ Ext-GBDT """
    num_round = 100
    param = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eta': 0.03,
             'max_depth': 3, 'eval_metric': 'auc', 'silent': 1, 'min_child_weight': 0.1,
             'subsample': 0.7, 'colsample_bytree': 0.8, 'nthread': 4, 'max_delta_step': 0}
    train_X = xgb.DMatrix(train_x, train_y)
    test_X = xgb.DMatrix(test_x)
    bst = xgb.train(param, train_X, num_round)
    pred = bst.predict(test_X)
    predict_y = []
    for i in range(len(pred)):
        if pred[i] < 0.5:
            predict_y.append(0)
        else:
            predict_y.append(1)
    auc = evaluate_auc(pred, test_y)
    evaluate(predict_y, test_y)
    return auc
def runXGB(train_X, train_y, seed_val=123):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.05
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 22
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 2
    param['subsample'] = 0.9
    param['colsample_bytree'] = 0.9
    param['seed'] = seed_val
    num_rounds = 115

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    model = xgb.train(plst, xgtrain, num_rounds)
    return model
def cross_validate(train):
    # separate training and validation set
    X_train, X_valid = split_train_validation(train)
    scores = []
    preds = []
    for i in range(len(X_train)):
        # convert X_train, Y_train etc... to xgboost matrix
        dtrain = xgb.DMatrix(X_train[i][['phone_brand', 'device_model', 'timestamp']],
                             label=X_train[i]['group'], missing=np.nan)
        dvalid = xgb.DMatrix(X_valid[i][['phone_brand', 'device_model', 'timestamp']],
                             label=X_valid[i]['group'], missing=np.nan)

        # predict with xgboost parameters
        parameters = {'max_depth': 4, 'eta': 0.1, 'silent': 1,
                      'subsample': 0.8, 'colsample_bytree': 0.8,
                      'objective': 'multi:softprob', 'booster': 'gbtree',
                      'early_stopping_rounds': 50,
                      'num_class': 12, 'num_boost_round': 1000, 'eval_metric': 'mlogloss'}
        plst = list(parameters.items())
        bst = xgb.train(plst, dtrain)
        pred = bst.predict(dvalid)
        scores.append(log_loss(X_valid[i]['group'].tolist(), pred))
        pred = pd.DataFrame(pred, index=X_valid[i].index, columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
def test_basic(c, s, a, b):
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb._train(c, param, ddf, dlabels)
    dbst = yield dxgb._train(c, param, ddf, dlabels)  # we can do this twice

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)

    assert ((predictions > 0.5) != labels).sum() < 2
def test_dmatrix_kwargs(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy, {"missing": 0.0})

    # Distributed model matches local model with dmatrix kwargs
    dtrain = xgb.DMatrix(X, label=y, missing=0.0)
    bst = xgb.train(param, dtrain)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    assert np.abs(result - dresult).sum() < 0.02

    # Distributed model gives bad predictions without dmatrix kwargs
    dtrain_incompat = xgb.DMatrix(X, label=y)
    dresult_incompat = dbst.predict(dtrain_incompat)
    assert np.abs(result - dresult_incompat).sum() > 0.02
def test_numpy(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice

    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)

    assert ((predictions > 0.5) != labels).sum() < 2
def test_synchronous_api(loop):  # noqa
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            dbst = dxgb.train(c, param, ddf, dlabels)

            result = bst.predict(dtrain)
            dresult = dbst.predict(dtrain)

            correct = (result > 0.5) == labels
            dcorrect = (dresult > 0.5) == labels
            assert dcorrect.sum() >= correct.sum()
def split_build_valid():
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1 - valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = (train_user.shape[0] - valid_n)
    print('build user:{}, valid user:{}'.format(build_n, valid_n))
    valid_user = train_user[train_user['is_valid'] == 1].user_id
    is_valid = X_train.user_id.isin(valid_user)

    dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid])
    dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()),
                                                      (dvalid.num_row(), dvalid.num_col())))
    return dbuild, dvalid, watchlist

#==============================================================================
def split_build_valid():
    train_user['is_valid'] = np.random.choice([0, 1], size=len(train_user),
                                              p=[1 - valid_size, valid_size])
    valid_n = train_user['is_valid'].sum()
    build_n = (train_user.shape[0] - valid_n)
    print('build user:{}, valid user:{}'.format(build_n, valid_n))
    valid_user = train_user[train_user['is_valid'] == 1].user_id
    is_valid = X_train.user_id.isin(valid_user)

    dbuild = xgb.DMatrix(X_train[~is_valid].drop('user_id', axis=1), y_train[~is_valid])
    dvalid = xgb.DMatrix(X_train[is_valid].drop('user_id', axis=1), label=y_train[is_valid])
    watchlist = [(dbuild, 'build'), (dvalid, 'valid')]

    label = dbuild.get_label()
    scale_pos_weight = float(np.sum(label == 0)) / np.sum(label == 1)
    print('scale_pos_weight', scale_pos_weight)

    print('FINAL SHAPE')
    print('dbuild.shape:{}  dvalid.shape:{}\n'.format((dbuild.num_row(), dbuild.num_col()),
                                                      (dvalid.num_row(), dvalid.num_col())))
    return dbuild, dvalid, watchlist, scale_pos_weight
def fit(self, X, y, x_val=None, y_val=None):
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.clf = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist, verbose_eval=self.verbose)
    else:
        self.clf = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds)
    return
def fit(self, X, y, x_val=None, y_val=None):
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.xgb = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist, verbose_eval=self.verbose)
    else:
        self.xgb = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             verbose_eval=self.verbose)
    return
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['label'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['label'], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
def fit(self, train_fs, train_labels, valid_fs, valid_labels):
    rank_k = self.config.getint('RANK', 'rank_k')
    train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
    train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))
    valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
    valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))
    watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
    # self.__lock()
    self.model = xgb.train(self.params, train_DMatrix, self.params['num_round'], watchlist,
                           early_stopping_rounds=self.params['early_stop'],
                           verbose_eval=self.params['verbose_eval'])
    LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
    # self.__unlock()
    valid_preds = self.model.predict(valid_DMatrix, ntree_limit=self.model.best_ntree_limit)
    return valid_preds
def xgboost_make_submission(retrain=False):
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    if os.path.exists('./cache/bstmodel.bin') and not retrain:
        bst = xgb.Booster({'nthread': 4})
        bst.load_model('./cache/bstmodel.bin')
    else:
        bst = xgboost_train()
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    dt = datetime.datetime.now()
    sdt = str(dt.date()) + str(dt.hour) + str(dt.minute) + str(dt.second)
    pred.to_csv('./sub/submission_%s.csv' % sdt, index=False, index_label=False)

# P = get_sku_ids_in_P()
def xgboost_test_offline():
    bst = xgboost_train(True)
    P = get_sku_ids_in_P()
    labels = get_labels('2016-04-11', '2016-04-16')
    sub_user_index, sub_trainning_data = make_test_set('2016-04-11', '2016-04-16')
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    # pred = sub_user_index
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    # pred = pred[pred['sku_id'].isin(P)]
    labels = labels[labels['label'] == 1]
    labels['user_id'] = labels['user_id'].astype(int)
    labels = labels[['user_id', 'sku_id']]
    labels = labels[labels['sku_id'].isin(P)]
    eval.eval(pred, labels)
def Features(my, prodShift):
    Xtrain, Ytrain, Xvalid, Yvalid = [], [], [], []
    keys = []
    for u in my.Users:
        for m in my.MccList:
            for month in range(15 + prodShift):
                if month < 13 + prodShift:
                    continue
                f = my.Features(u, m, month)
                ans = math.log(1.0 + my.Answers[u + '_' + m][month])
                if month == 14 + prodShift:
                    if u not in my.ValidUsers:
                        continue
                    Xvalid.append(f)
                    Yvalid.append(ans)
                    keys.append([u, m])
                else:
                    Xtrain.append(f)
                    Ytrain.append(ans)
    Xtrain, Ytrain, Xvalid, Yvalid = map(np.asarray, [Xtrain, Ytrain, Xvalid, Yvalid])
    return xgboost.DMatrix(Xtrain, Ytrain), xgboost.DMatrix(Xvalid, Yvalid), Yvalid, keys
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True,
                                              random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def build_matrix(self, X, opt_y=None, weighting=None):
    if opt_y is None:
        if weighting is None:
            return xgb.DMatrix(csr_matrix(X), missing=-999.0)
        else:
            # scale weights so they sum to the number of rows
            sumtotal = float(X.shape[0])
            sumweights = np.sum(weighting)
            for s in range(0, len(weighting)):
                weighting[s] *= sumtotal / sumweights
            return xgb.DMatrix(csr_matrix(X), missing=-999.0, weight=weighting)
    else:
        if weighting is None:
            return xgb.DMatrix(csr_matrix(X), label=np.array(opt_y), missing=-999.0)
        else:
            sumtotal = float(X.shape[0])
            sumweights = np.sum(weighting)
            for s in range(0, len(weighting)):
                weighting[s] *= sumtotal / sumweights
            return xgb.DMatrix(csr_matrix(X), label=np.array(opt_y), missing=-999.0,
                               weight=weighting)
def predict(self, X):
    if self.k_models is not None and len(self.k_models) < 2:
        X1 = self.build_matrix(X)
        return self.bst.predict(X1)
    else:
        dtest = xgb.DMatrix(X)
        preds = [0.0 for k in range(X.shape[0])]
        for gbdt in self.k_models:
            predsnew = gbdt.predict(dtest, ntree_limit=(gbdt.best_iteration + 1) * self.num_parallel_tree)
            for g in range(0, predsnew.shape[0]):
                preds[g] += predsnew[g]
        for g in range(0, len(preds)):
            preds[g] /= float(len(self.k_models))
        return preds
def predict(self, Xt, Xg, load_model=None):
    print("load_model", load_model)
    dtest = xgb.DMatrix(Xt)
    dtest.set_group(Xg)
    if load_model and self.bst is None:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
def fit(self, X, y, Xt=None, yt=None,
        load_model=None, save_model=None,
        obj=None, feval=None, print_fscore=True, evalx=None):
    print(X.shape, y.shape)
    num_round = self.params.get('num_round', 100)
    early_stopping_rounds = self.params.get('early_stopping_rounds', None)
    maximize = self.params.get('maximize', False)
    dtrain = xgb.DMatrix(X, y)
    vb = self.params.get('verbose_eval', 1)
    if Xt is not None:
        dvalid = xgb.DMatrix(Xt, yt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=vb,
                        xgb_model=load_model, obj=obj, feval=feval, maximize=maximize)
    else:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        verbose_eval=vb, xgb_model=load_model, obj=obj, feval=feval)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
    fscore = self.feature_importance()
    if print_fscore:
        print("Feature Importance:")
        for i in fscore:
            print(i)
    if Xt is not None and evalx is not None:
        yp = self.predict(Xt)
        score = evalx(yt, yp)
        print(score)
        return score
    return 0
def run_croos_validation(self):
    features, labels, cv_folds = self.getFeaturesLabel()
    dtrain_cv = xgb.DMatrix(features, label=labels, feature_names=features.columns)
    self.set_xgb_parameters()

    # specify validations set to watch performance
    model = xgb.cv(self.xgb_params, dtrain_cv, folds=cv_folds, **self.xgb_learning_params)
    best_scroe = model[self.best_score_colname_in_cv].max()
    return best_scroe