We extracted the following 21 code examples from Python open source projects to illustrate how to use xgboost.Booster().
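Before the project examples, here is a minimal sketch of the typical Booster life cycle: train with xgb.train (which returns a Booster), save it, then reload it into a fresh Booster for prediction, as most of the examples below do. The toy data, the parameter values, and the file name demo.model are illustrative assumptions, not taken from any of the projects.

import numpy as np
import xgboost as xgb

# Toy data: 100 rows, 5 features, binary labels (illustrative only).
X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)

# xgb.train returns a trained Booster; these parameters are demo assumptions.
params = {'objective': 'binary:logistic', 'max_depth': 3, 'nthread': 4}
bst = xgb.train(params, dtrain, num_boost_round=10)
bst.save_model('demo.model')

# Reload the saved model into a fresh Booster and predict.
bst2 = xgb.Booster(params)
bst2.load_model('demo.model')
preds = bst2.predict(xgb.DMatrix(X))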
def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', metavar='tag', nargs='+')
    parser.add_argument('--fold', default='test',
                        help='identifier for file with the users to test on (default: test)')
    args = parser.parse_args()
    for model_tag in args.tags:
        hps = hypers.hps_for_tag(model_tag)
        dataset = Dataset(args.fold, hps, mode=Mode.inference)
        path = common.resolve_xgboostmodel_path(model_tag)
        logging.info('Loading model with tag {}'.format(model_tag))
        model = xgb.Booster(model_file=path)
        logging.info('Computing probs for tag {}'.format(model_tag))
        with time_me('Computed probs for {}'.format(model_tag), mode='stderr'):
            pdict = get_pdict(model, dataset)
        logging.info('Got probs for {} users'.format(len(pdict)))
        # TODO: might want to enforce some namespace separation between
        # rnn-generated pdicts and ones coming from xgboost models?
        common.save_pdict_for_tag(model_tag, pdict, args.fold)
def run_train_with_model(train, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'itemID_1': train['itemID_1'].values,
                                  'itemID_2': train['itemID_2'].values,
                                  'isDuplicate': train['isDuplicate'].values,
                                  'probability': check})
    print('AUC score value: {:.6f}'.format(score))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print('Prediction time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return validation_df, score
def run_test_with_model(train, test, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'isDuplicate': train['isDuplicate'].values,
                                  'probability': check})
    # print(validation_df)
    print('AUC score value: {:.6f}'.format(score))
    # score1 = roc_auc_score(validation_df['isDuplicate'].values, validation_df['probability'])
    # print('AUC score check value: {:.6f}'.format(score1))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]))
    print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return test_prediction.tolist(), validation_df, score
def xgboost_make_submission(retrain=False):
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    if os.path.exists('./cache/bstmodel.bin') and not retrain:
        bst = xgb.Booster({'nthread': 4})
        bst.load_model('./cache/bstmodel.bin')
    else:
        bst = xgboost_train()
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    dt = datetime.datetime.now()
    sdt = str(dt.date()) + str(dt.hour) + str(dt.minute) + str(dt.second)
    pred.to_csv('./sub/submission_%s.csv' % sdt, index=False, index_label=False)
    # P = get_sku_ids_in_P()
def predict(self, Xt, Xg, load_model=None):
    print("load_model", load_model)
    dtest = xgb.DMatrix(Xt)
    dtest.set_group(Xg)
    if load_model and self.bst is None:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
def load_model(xgb_regressor, day, folder_path):
    booster = xgb.Booster()
    booster.load_model(folder_path + '%d.xgbmodel' % day)
    xgb_regressor._Booster = booster
def predict_eval_model(dtest, model_path):
    labels = dtest.get_label()
    bst = xgb.Booster(model_file=model_path)
    preds_prob = bst.predict(data=dtest)
    preds_original = bst.predict(data=dtest, output_margin=True)
    preds_label = [1 if pred_prob >= 0.5 else 0 for pred_prob in preds_prob]
    print('true label:\n', labels)
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(labels, preds_label)
    # acc_num = accuracy_score(labels, preds_label, normalize=True)
    # print('accuracy_score : %f\tnum of the predicted truly : %d/%d' % (acc, acc_num, len(labels)))
    print('accuracy_score : %f' % acc)
    print('average_precision_score : %f' % average_precision_score(labels, preds_prob))
    print(classification_report(labels, preds_label, target_names=['class 0', 'class 1']))
    print(confusion_matrix(labels, preds_label, labels=[0, 1]))
    f1_score_s = f1_score(labels, preds_label, pos_label=1)
    print('f1 score : %f' % f1_score_s)
    print('precision_score : %f' % precision_score(labels, preds_label, pos_label=1))
    print('recall_score : %f' % recall_score(labels, preds_label, pos_label=1))
    print('roc_auc_score : %f' % roc_auc_score(labels, preds_prob))
    fpr, tpr, thresholds = roc_curve(labels, preds_prob)
    print(fpr, tpr, thresholds)
    roc_auc = auc(fpr, tpr)
    precision, recall, thresholds_pr = precision_recall_curve(labels, preds_prob, pos_label=1)
    plt.plot(precision, recall, label='P-R f1 score %f' % f1_score_s)
    # plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
    # average_precision = average_precision_score(labels, preds_prob)
    # print('average_precision_score : %f' % average_precision)
def predict_user():
    print('Predicting user probabilities.')
    xgb_model = xgb.Booster({'nthread': -1})
    xgb_model.load_model('./model/xgb_user.model')
    Online = pd.read_csv('./feat/online_user_model_feat.csv')
    donline = xgb.DMatrix(Online.drop(['user_id'], axis=1))
    xgb_proba = xgb_model.predict(donline)
    online_proba = Online[['user_id']]
    online_proba.loc[:, 'proba'] = xgb_proba
    online_proba.to_csv("./online_user_proba.csv", index=False)
def predict_sku():
    print('Predicting sku probabilities.')
    xgb_model = xgb.Booster({'nthread': -1})
    xgb_model.load_model('./model/xgb_sku.model')
    Online = pd.read_csv("./feat/online_sku_feat.csv")
    Online_drop_cols = ['user_id', 'sku_id', 'cate', 'brand']
    donline = xgb.DMatrix(Online.drop(Online_drop_cols, axis=1))
    # predict probabilities
    xgb_proba = xgb_model.predict(donline)
    sku_proba = Online[['user_id', 'sku_id']]
    sku_proba.loc[:, 'sku_proba'] = xgb_proba
    # keep each user's highest-probability sku
    sku_proba = sku_proba.groupby(['user_id'], as_index=False).apply(
        lambda t: t[t.sku_proba == t.sku_proba.max()]).reset_index()[['user_id', 'sku_id', 'sku_proba']]
    # load the user-level probabilities produced by predict_user()
    user_proba = pd.read_csv("./online_user_proba.csv")
    # sort both tables by descending probability
    sku_proba.sort_values(by="sku_proba", ascending=False, inplace=True)
    user_proba.sort_values(by="proba", ascending=False, inplace=True)
    # take the top 500 users and the top 500 sku predictions
    Top_user = user_proba.iloc[:500]
    Top_sku = sku_proba.iloc[:500][['user_id', 'sku_id']]
    Top_user = sku_proba[sku_proba.user_id.isin(Top_user.user_id)]
    Top_user = Top_user.groupby(['user_id'], as_index=False).apply(
        lambda t: t[t.sku_proba == t.sku_proba.max()]).reset_index()[['user_id', 'sku_id']]
    pred = pd.concat([Top_sku, Top_user])
    pred = pred.drop_duplicates()
    pred = pred[pred.user_id.duplicated() == False]
    pred.astype(int).to_csv("online_submit.csv", index=False)
    print('Done.')
def run_test_with_model(train, test, features, target, random_state=0):
    start_time = time.time()
    test_size = 0.02
    # X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    split = round((1 - test_size) * len(train.index))
    X_train = train[0:split]
    X_valid = train[split:]
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    # watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    # gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, feval=auc_xgboost, verbose_eval=True)
    # gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
    gbm = xgb.Booster()
    gbm.load_model("models/model_0.968276662916_eta_0.2_md_5_test_size_0.02.bin")
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]))
    score = roc_auc_score(X_valid[target].values, check)
    score_kaggle = auc(X_valid[target].values, check)
    print('Check error value: {:.6f} (Kaggle: {:.6f})'.format(score, score_kaggle))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]))
    print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return test_prediction.tolist(), score
def __init__(self):
    self.clf = xgb.Booster()
def fit(self, X, y):
    d = xgb.DMatrix(X, y)
    # Build a raw Booster over the cached DMatrix, then run one
    # boosting round per update() call instead of using xgb.train.
    self.clf = xgb.Booster(param, [d])
    for i in range(50):
        self.clf.update(d, i)
def fit(self, X, y):
    d = xgb.DMatrix(X, y)
    # Same manual boosting loop as above, with 10 rounds.
    self.clf = xgb.Booster(param, [d])
    for i in range(10):
        self.clf.update(d, i)
def load(self, model_fp):
    self.model = xgb.Booster(self.params)
    self.model.load_model(model_fp)
def predict(self, Xt, load_model=None):
    dtest = xgb.DMatrix(Xt)
    if load_model and self.bst is None:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
def output_critical_tests(train, features, target, model_path, test_size):
    out_path = "cache/fails.html"
    out = open(out_path, "w", encoding='utf-8')
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }
    print("Load ItemInfo_train.csv")
    items = pd.read_csv("../input/ItemInfo_train.csv", dtype=types2)
    items.fillna(-1, inplace=True)
    split = round((1 - test_size) * len(train.index))
    X_train = train[0:split]
    X_valid = train[split:]
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]))
    # print(X_valid[features][:100])
    # print(check[:100])
    score = roc_auc_score(X_valid[target].values, check)
    print('Score: {}'.format(score))
    X_valid = append_items_info(X_valid, items)
    count = 0
    for i in range(len(X_valid[target].values)):
        if abs(X_valid[target].values[i] - check[i]) > 0.9:
            print(X_valid[target].values[i], check[i])
            if count > 100:
                break
            print_debug_data(out, X_valid, features, i, check[i], X_valid[target].values[i])
            count += 1
    print('Count critical: {} from {}'.format(count, len(check)))
    out.close()
def get_ensemble_score(name):
    if os.path.exists(util.features_prefix + name + "_XXXYYY.pkl") is False:
        print('file does not exist')
        exit()
    [X_train, X_validate, X_test, y_train, y_validate, y_test] = pd.read_pickle(
        util.features_prefix + name + '_XXXYYY.pkl')
    import xgboost as xgb
    rf_clf_2 = pd.read_pickle(util.models_prefix + name + '_rf.pkl')
    list_all = []
    rf_2_list = rf_clf_2.predict_proba(X_test)
    from sklearn.feature_selection import SelectFromModel
    list_all.append(rf_2_list)
    xgb_2 = xgb.Booster({'nthread': 4})  # init model
    xgb_2.load_model(util.models_prefix + name + '_xgb_prob.pkl')  # load model
    dtest = xgb.DMatrix(X_test)
    xgb_2_test = xgb_2.predict(dtest)
    list_all.append(xgb_2_test)
    # list_all.append(xgb_1_test)
    import copy
    [train_X, train_Y] = pd.read_pickle(util.features_prefix + name + '_XY.pkl')
    X_semantic = np.array(copy.deepcopy(X_test[:, range(95, 475)]))
    X_manual = np.array(copy.deepcopy(X_test[:, range(0, 95)]))
    X_cluster = np.array(copy.deepcopy(X_test[:, range(475, 545)]))
    X_document = np.array(copy.deepcopy(X_test[:, range(545, 547)]))
    X_document[:, [0]] = X_document[:, [0]] + train_X[:, [-1]].max()
    X_semantic = X_semantic.reshape(X_semantic.shape[0], 10, -1)
    X_semantic_1 = np.zeros((X_semantic.shape[0], X_semantic.shape[2], X_semantic.shape[1]))
    for i in range(int(X_semantic.shape[0])):
        X_semantic_1[i] = np.transpose(X_semantic[i])
    json_string = pd.read_pickle(util.models_prefix + name + '_json_string_cnn.pkl')
    model_cnn = model_from_json(json_string)
    model_cnn.load_weights(util.models_prefix + name + '_nn_weight_cnn.h5')
    cnn_list = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    # cnn_list_prob = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(cnn_list)
    list_all.append(kk)
    json_string = pd.read_pickle(util.models_prefix + name + '_json_string_lstm.pkl')
    model_lstm = model_from_json(json_string)
    model_lstm.load_weights(util.models_prefix + name + '_nn_weight_lstm.h5')
    lstm_list = model_lstm.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(lstm_list)
    list_all.append(kk)
    temp_list = []
    for i in range(len(y_test)):
        temp = np.zeros(len(list_all[0][0]))
        for z in list_all:
            temp += np.array(z[i])
        temp_list.append(temp)
    evaluate_k_recall(1, y_test, temp_list)
    print('**************************')