Python xgboost module: XGBClassifier() usage examples

The following 47 code examples, extracted from open-source Python projects, illustrate how to use xgboost.XGBClassifier().
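Before the project examples, here is a minimal, self-contained sketch of the basic workflow; the synthetic data and every hyper-parameter value are illustrative assumptions, not taken from any project below.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Synthetic binary-classification data, purely for illustration.
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))        # mean accuracy on the held-out split
print(clf.predict_proba(X_test)[:, 1])  # positive-class probabilities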

Project: JData-rank37    Author: 763337092
def xgb_model(train_data, train_label, test_data, test_label):
    clf = xgb.XGBClassifier(max_depth=7,
                           min_child_weight=1,
                           learning_rate=0.1,
                           n_estimators=500,
                           silent=True,
                           objective='binary:logistic',
                           gamma=0,
                           max_delta_step=0,
                           subsample=1,
                           colsample_bytree=1,
                           colsample_bylevel=1,
                           reg_alpha=0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1,
                           missing=None)
    clf.fit(train_data, train_label, eval_metric='auc', verbose=True,
            eval_set=[(test_data, test_label)], early_stopping_rounds=100)
    y_pre = clf.predict(test_data)
    y_pro = clf.predict_proba(test_data)[:, 1]
    #print "AUC Score : %f" % metrics.roc_auc_score(test_label, y_pro)
    #print"Accuracy : %.4g" % metrics.accuracy_score(test_label, y_pre)
    return clf
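Because the fit above uses eval_set with early_stopping_rounds, the classic xgboost sklearn wrapper records the best boosting round; a hedged follow-up sketch (the attribute names follow that older API):

# Hedged sketch: using the early-stopping results of xgb_model (old sklearn-wrapper API).
clf = xgb_model(train_data, train_label, test_data, test_label)
print(clf.best_score, clf.best_iteration)
# Cap prediction at the best round found by early stopping.
y_pro = clf.predict_proba(test_data, ntree_limit=clf.best_ntree_limit)[:, 1]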
Project: DiscourseSenser    Author: WladimirSidorenko
def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating
            hyper-parameters

        """
        classifier = a_clf
        self._gs = a_grid_search
        if a_clf is None:
            classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                       n_estimators=NTREES,
                                       learning_rate=ALPHA,
                                       objective="multi:softprob")
            self._clf = classifier
        # latest version of XGBoost cannot deal with non-sparse feature vectors
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])
Project: PEP    Author: ma-compbio
def threshold_estimate(x,y):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.1, random_state=0)
    weight = float(len(y_train[y_train == 0]))/float(len(y_train[y_train == 1]))
    w1 = np.ones(y_train.shape[0])  # float array so the class weight is not truncated to int
    w1[y_train == 1] = weight
    print("samples: %d %d %f" % (x_train.shape[0], x_test.shape[0], weight))
    estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
    estimator.fit(x_train, y_train, sample_weight=w1)
    y_scores = estimator.predict_proba(x_test)[:,1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    f1 = 2*precision[2:]*recall[2:]/(precision[2:]+recall[2:])
    m_idx = np.argmax(f1)
    m_thresh = thresholds[2+m_idx]
    print("%d %f %f" % (precision.shape[0], f1[m_idx], m_thresh))
    return m_thresh

# Estimate threshold for the classifier using inner-round cross validation
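A hedged usage sketch of threshold_estimate above: pick the F1-optimal cut-off on a training split, then binarize held-out probabilities with it (the held-out arrays x_hold/y_hold are hypothetical):

# Illustrative use of the estimated threshold (x_hold is a hypothetical held-out array).
m_thresh = threshold_estimate(x_train, y_train)
estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000)
estimator.fit(x_train, y_train)
y_hat = (estimator.predict_proba(x_hold)[:, 1] > m_thresh).astype(int)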
Project: OptML    Author: johannespetrat
def test_model_detection(self):
        sklearn_model = LogisticRegression()
        pipeline_model = Pipeline([('log', sklearn_model)])
        xgb_model = XGBClassifier()
        nn_model = NNModel(100,10)
        sklearn_opt = Optimizer(sklearn_model,[], lambda x: x)
        pipeline_opt = Optimizer(pipeline_model,[], lambda x: x)
        xgb_opt = Optimizer(xgb_model,[], lambda x: x)
        nn_opt = Optimizer(nn_model,[], lambda x: x)

        self.assertEqual(sklearn_opt.model_module, 'sklearn')
        self.assertEqual(pipeline_opt.model_module, 'pipeline')
        self.assertEqual(xgb_opt.model_module, 'xgboost')
        self.assertEqual(nn_opt.model_module, 'keras')
Project: KagglePlanetPytorch    Author: Mctigger
def objective(space):
                estimator = XGBClassifier(
                    n_estimators=n_estimators,
                    max_depth=int(space['max_depth']),
                    min_child_weight=int(space['min_child_weight']),
                    gamma=space['gamma'],
                    subsample=space['subsample'],
                    colsample_bytree=space['colsample_bytree']
                )

                estimator.fit(
                    x_train,
                    y_train,
                    eval_set=[(x_train, y_train), (x_val, y_val)],
                    early_stopping_rounds=30,
                    verbose=False,
                    eval_metric='error'
                )

                score = accuracy_score(y_val, estimator.predict(x_val))

                return {'loss': 1 - score, 'status': STATUS_OK}
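The objective above follows hyperopt's contract (it returns a loss and STATUS_OK); a minimal sketch of driving it with fmin, where the search-space bounds are assumptions:

# Minimal hyperopt driver for the objective above (bounds are assumptions).
from hyperopt import fmin, tpe, hp, Trials

space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
}
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)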
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_xgb_cv(params_untuned,scoring='roc_auc', n_jobs=4, cv=5):
    # global  dtrain_whole
    global  num_boost_round
    global  params_sklearn

    # global x
    # global y
    for param_untuned in params_untuned:
        print('==========  ', param_untuned, '  ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k,v in grid_search.best_params_.items():
            params_sklearn[k] = v
            if len(params_untuned)==1:
                return v
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_xgb_cv(params_untuned,scoring='roc_auc', n_jobs=1, cv=5):
    # global  dtrain_whole
    global  num_boost_round
    global  params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print('==========  ', param_untuned, '  ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k,v in grid_search.best_params_.items():
            params_sklearn[k] = v
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_xgb_cv(params_untuned,params_sklearn,scoring='roc_auc', n_jobs=4, cv=5,verbose=10):

    for param_untuned in params_untuned:
        print('==========  ', param_untuned, '  ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # if(param_untuned.keys()[0] == 'n_estimators'):
        #     cv = 1
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k,v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator,params_sklearn
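A hedged sketch of how these sequential tuners are typically driven: one parameter group per step, each grid search starting from the best values found so far (the grids and the starting params_sklearn are assumptions; x and y are the module-level training data the function reads):

# Illustrative driver for tune_xgb_cv above (grids and starting params are assumptions).
params_sklearn = {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 200}
params_untuned = [
    {'max_depth': [3, 5, 7, 9]},
    {'min_child_weight': [1, 3, 5]},
    {'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0]},
]
estimator, params_sklearn = tune_xgb_cv(params_untuned, params_sklearn,
                                        scoring='roc_auc', n_jobs=4, cv=5)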
Project: gcForest    Author: kingfengji
def prec_xgb(n_trees, max_depth, X_train, y_train, X_test, y_test, learning_rate=0.1):
    """
    XGBoost
    """
    import xgboost as xgb
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=max_depth, objective='multi:softprob',
            seed=0, silent=True, nthread=-1, learning_rate=learning_rate)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror")
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred
Project: magic    Author: pan-webis-de
def get_classifier(method='logistic_regression'):
    if 'logistic_regression' == method:
        return LogisticRegression(C=1e3,
                                  tol=0.01,
                                  multi_class='ovr',
                                  solver='liblinear',
                                  n_jobs=-1,
                                  random_state=123)
    if 'random_forest' == method:
        return RandomForestClassifier(n_estimators=250,
                                      bootstrap=False,
                                      n_jobs=-1,
                                      random_state=123)

    if 'gradient_boosting' == method:
        return xgb.XGBClassifier(max_depth=10,
                                 subsample=0.7,
                                 n_estimators=500,
                                 min_child_weight=0.05,
                                 colsample_bytree=0.3,
                                 learning_rate=0.1)
Project: gcforest    Author: w821881341
def prec_xgb(n_trees, max_depth, X_train, y_train, X_test, y_test, learning_rate=0.1):
    """
    XGBoost
    """
    import xgboost as xgb
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=max_depth, objective='multi:softprob',
            seed=0, silent=True, nthread=-1, learning_rate=learning_rate)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror")
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred
Project: hyperband    Author: zygmuntz
def try_params( n_iterations, params, get_predictions = False ):

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = XGB(n_estimators=n_estimators, nthread=-1, **params)

    return train_and_eval_sklearn_classifier( clf, data )
Project: OptML    Author: johannespetrat
def test_build_new_model_xgboost(self):
        xgb_model = XGBClassifier(max_depth=3)
        xgb_opt = Optimizer(xgb_model,[], lambda x: x)
        new_model = xgb_opt.build_new_model({'max_depth': 2})
        self.assertEqual(new_model.get_params()['max_depth'], 2)
Project: avito-contest    Author: fmilepe
def XGBoost(X, y):
    print("Iniciando treinamento do XGBoost")
    start_time = time.time()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=1)
    clf = xgb.XGBClassifier(learning_rate=0.15, n_estimators=170, nthread=6, max_depth=8, seed=0, silent=True,
                            subsample=0.85, colsample_bytree=0.85)
    clf.fit(X_train, y_train)  # fit on the training split so the held-out score is meaningful
    score = clf.score(X_test, y_test)
    print("XGBoost score: ", score, "(", (time.time()-start_time)/60.0, "minutes )")

    return clf
Project: gcForest    Author: kingfengji
def __init__(self,name,kwargs):
        import xgboost as xgb
        kwargs = kwargs.copy()
        if "random_state" in kwargs:
            kwargs["seed"] = kwargs["random_state"]
            kwargs.pop("random_state")
        super(GCXGBClassifier,self).__init__(name,xgb.XGBClassifier,kwargs)
Project: CCIT    Author: rajatsen91
def XGBOUT2(bp, all_samples,train_samp,Xcoords, Ycoords, Zcoords,k,threshold,nthread,bootstrap = True):
    '''Function that takes a CI test data-set and returns classification accuracy after Nearest-Neighbor Bootstrap'''

    num_samp = len(all_samples)
    if bootstrap:
        np.random.seed()
        random.seed()
        I = np.random.choice(num_samp,size = num_samp, replace = True)
        samples = all_samples[I,:]
    else:
        samples = all_samples
    # Use the bootstrapped samples (the original passed all_samples here, leaving the bootstrap unused).
    Xtrain, Ytrain, Xtest, Ytest, CI_data = CI_sampler_conditional_kNN(samples[:, Xcoords], samples[:, Ycoords], samples[:, Zcoords], train_samp, k)
    model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=bp['n_estimator'],
                              max_depth=bp['max_depth'], min_child_weight=1, gamma=0, subsample=0.8,
                              colsample_bytree=bp['colsample_bytree'], objective='binary:logistic',
                              scale_pos_weight=1, seed=11)
    gbm = model.fit(Xtrain,Ytrain)
    pred = gbm.predict_proba(Xtest)
    pred_exact = gbm.predict(Xtest)
    acc1 = accuracy_score(Ytest, pred_exact)
    AUC1 = roc_auc_score(Ytest,pred[:,1])
    del gbm
    gbm = model.fit(Xtrain[:,len(Xcoords)::],Ytrain)
    pred = gbm.predict_proba(Xtest[:,len(Xcoords)::])
    pred_exact = gbm.predict(Xtest[:,len(Xcoords)::])
    acc2 = accuracy_score(Ytest, pred_exact)
    AUC2 = roc_auc_score(Ytest,pred[:,1])
    del gbm
    if AUC1 > AUC2 + threshold:
        return [0.0, AUC1 - AUC2 , AUC2 - 0.5, acc1 - acc2, acc2 - 0.5]
    else:
        return [1.0, AUC1 - AUC2, AUC2 - 0.5, acc1 - acc2, acc2 - 0.5]
Project: CCIT    Author: rajatsen91
def XGBOUT_Independence(bp, all_samples,train_samp,Xcoords, Ycoords, k,threshold,nthread,bootstrap = True):
    '''Function that takes a CI test data-set and returns classification accuracy after Nearest-Neighbor Bootstrap'''

    num_samp = len(all_samples)
    if bootstrap:
        np.random.seed()
        random.seed()
        I = np.random.choice(num_samp,size = num_samp, replace = True)
        samples = all_samples[I,:]
    else:
        samples = all_samples
    # Use the bootstrapped samples (the original passed all_samples here, leaving the bootstrap unused).
    Xtrain, Ytrain, Xtest, Ytest, CI_data = CI_sampler_conditional_kNN(samples[:, Xcoords], samples[:, Ycoords], None, train_samp, k)
    s1, s2 = Xtrain.shape
    if s2 >= 4:
        model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=bp['n_estimator'],
                                  max_depth=bp['max_depth'], min_child_weight=1, gamma=0, subsample=0.8,
                                  colsample_bytree=bp['colsample_bytree'], objective='binary:logistic',
                                  scale_pos_weight=1, seed=11)
    else:
        model = xgb.XGBClassifier()
    gbm = model.fit(Xtrain,Ytrain)
    pred = gbm.predict_proba(Xtest)
    pred_exact = gbm.predict(Xtest)
    acc1 = accuracy_score(Ytest, pred_exact)
    AUC1 = roc_auc_score(Ytest,pred[:,1])
    del gbm
    if AUC1 > 0.5 + threshold:
        return [0.0, AUC1 - 0.5 , acc1- 0.5]
    else:
        return [1.0, AUC1 - 0.5 , acc1- 0.5]
Project: MENGEL    Author: CodeSpaceHQ
def train_xgboost_classifier():
    return mp.ModelProperties(), xgboost.XGBClassifier()
Project: jingjuSingingPhraseMatching    Author: ronggong
def buildEstimators(mode):
    if mode == 'train' or mode == 'cv':
        # best parameters obtained by GridSearchCV; best score: 1
        estimators = [('anova_filter', SelectKBest(f_classif, k='all')),
                      ('xgb', xgb.XGBClassifier(learning_rate=0.1,n_estimators=300,max_depth=3))]
        clf = Pipeline(estimators)
    elif mode == 'test':
        clf = pickle.load(open(join(classifier_path, "xgb_classifier.plk"), "rb"))  # binary mode for pickle
    return clf
Project: PEP    Author: ma-compbio
def threshold_estimate_cv(x,y,k_fold):
    print "%d %d %d" % (y.shape[0], sum(y==1), sum(y==0))
    kf1 = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    threshold = np.zeros((k_fold),dtype="float32")
    cnt = 0
    for train_index, test_index in kf1:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        weight = float(len(y_train[y_train == 0]))/float(len(y_train[y_train == 1]))
        w1 = np.ones(y_train.shape[0])  # float array so the class weight is not truncated to int
        w1[y_train == 1] = weight

        estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        estimator.fit(x_train, y_train, sample_weight=w1)
        y_scores = estimator.predict_proba(x_test)[:,1]
        precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
        f1 = 2*precision[2:]*recall[2:]/(precision[2:]+recall[2:])
        m_idx = np.argmax(f1)
        threshold[cnt] = thresholds[2+m_idx]
        cnt += 1
        print("%d %f %f" % (precision.shape[0], f1[m_idx], thresholds[2+m_idx]))
    return np.mean(threshold), threshold

# Cross validation using gradient tree boosting
Project: PEP    Author: ma-compbio
def parametered_single(x_train,y_train,x_test,y_test,thresh_opt):
    print("samples: %d %d %d %d" % (x_train.shape[0],x_train.shape[1],x_test.shape[0],x_test.shape[1]))

    metrics = np.zeros((1,5),dtype="float32")
    thresh = 0.5

    # estimate the threshold
    if thresh_opt==1:
        thresh = threshold_estimate(x_train,y_train)

    clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=500, nthread=50)
    weight = float(sum(y_train < 1))/float(sum(y_train == 1))
    w1 = np.ones(y_train.shape[0])  # float array so the class weight is not truncated to int
    w1[y_train == 1] = weight
    clf.fit(x_train, y_train, sample_weight=w1)

    prob = clf.predict_proba(x_test)
    yfit = (prob[:,1]>thresh)

    precision, recall, f1, mcc = score_function(y_test,yfit)
    metrics = np.array((thresh,precision,recall,f1,mcc))
    print(metrics)

    importances = clf.feature_importances_
    indices1 = np.argsort(importances)[::-1]
    features1 = np.transpose(np.array((indices1,importances[indices1])))

    pred = np.transpose(np.array((y_test,yfit)))
    return metrics, pred, prob, features1

# Cross validation for PEP-Word
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def xgb0(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb0 model")
    clf = xgb.XGBClassifier()
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def xgb150opt(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb150opt model")
    clf = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=3, subsample=0.667, colsample_bytree=1)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def xgb150opt2(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        return df.drop(['time'], axis=1)
    logging.info("train xgb150opt2 model")
    clf = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1, subsample=0.85263, colsample_bytree=0.657894, reg_alpha=1.55556, reg_lambda=1.22222, gamma=0.3333333)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: data-science-bowl-2017    Author: tondonia
def __init__(self, trainX, trainY):
        self.trainX = trainX
        self.trainY = trainY

        self.level0 = xgb.XGBClassifier(learning_rate=0.325,
                                       silent=True,
                                       objective="binary:logistic",
                                       nthread=-1,
                                       gamma=0.85,
                                       min_child_weight=5,
                                       max_delta_step=1,
                                       subsample=0.85,
                                       colsample_bytree=0.55,
                                       colsample_bylevel=1,
                                       reg_alpha=0.5,
                                       reg_lambda=1,
                                       scale_pos_weight=1,
                                       base_score=0.5,
                                       seed=0,
                                       missing=None,
                                       n_estimators=1920, max_depth=6)
        self.h_param_grid = {'max_depth': hp.quniform('max_depth', 1, 13, 1),
                        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
                        'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
                        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
                        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
                        'n_estimators': hp.quniform('n_estimators', 10, 200, 5),
                        }
        self.to_int_params = ['n_estimators', 'max_depth']
Project: Brain_Tumor_Segmentation    Author: KarthikRevanuru
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0)
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])

    y=np.array([])
    t=0
    z=np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]),'Class')
            y=np.append(y,temp)
            temp = df.get_value(str(folder_names_train[ind]),'Age')
            z=np.append(z,np.array([temp]))
        except Exception as e:
            t += 1
            print(t, str(e), "Label not found, using default values")
            y = np.append(y, 0)
            z = np.append(z, 0)  # keep the age vector aligned with y; otherwise the later concatenate fails

    z=np.array([[v] for v in z])

    t=np.concatenate((p,q),axis=1)
    u=np.concatenate((r,s),axis=1)
    x=np.concatenate((t,u),axis=1) 
    x = np.concatenate((x, z), axis=1)
    # Alternative classifiers that were tried:
    # clf = linear_model.LogisticRegression(C=1e5)
    # clf = RandomForestClassifier()
    clf = xgb.XGBClassifier()
    clf.fit(x,y)
    return clf
Project: mriqc    Author: poldracklab
def _get_model(self):
        if self._model == 'xgb':
            return XGBClassifier()

        if self._model == 'svc_rbf':
            return SVC()

        if self._model == 'svc_lin':
            return LinearSVC()

        return RFC()
Project: kaggle_bnp-paribas    Author: ArdalanM
def models():
    params = {'n_jobs':nthread,'random_state':seed,'class_weight':None}

    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000,max_features='auto',criterion= 'entropy',min_samples_split= 2, max_depth= None, min_samples_leaf= 1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000,max_features=60,criterion= 'gini',min_samples_split= 4, max_depth= 40, min_samples_leaf= 2, **params)

    # rf = ensemble.RandomForestClassifier(n_estimators=1000,max_features= 'auto',criterion= 'gini',min_samples_split= 2, max_depth= None, min_samples_leaf= 1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000,max_features=60,criterion= 'entropy',min_samples_split= 4, max_depth= 40, min_samples_leaf= 2, **params)

    # xgb_binlog = XGBClassifier(objective="binary:logistic" ,max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)

    rf_params = {'n_estimators':850,'max_features':60,'criterion':'entropy','min_samples_split': 4,'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}

    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic" ,max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1,XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1,XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),

    ]
    for clf in clfs:
        yield clf
Project: dask-xgboost    Author: dask
def test_classifier(loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = dxgb.XGBClassifier()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            a.fit(X2, y2)
            p1 = a.predict(X2)

    b = xgb.XGBClassifier()
    b.fit(X, y)
    np.testing.assert_array_almost_equal(a.feature_importances_,
                                         b.feature_importances_)
    assert_eq(p1, b.predict(X))
Project: dask-xgboost    Author: dask
def fit(self, X, y=None):
        """Fit a gradient boosting classifier

        Parameters
        ----------
        X : array-like [n_samples, n_features]
            Feature Matrix. May be a dask.array or dask.dataframe
        y : array-like
            Labels

        Returns
        -------
        self : XGBClassifier

        Notes
        -----
        This differs from the XGBoost version in three ways

        1. The ``sample_weight``, ``eval_set``, ``eval_metric``,
          ``early_stopping_rounds`` and ``verbose`` fit kwargs are not
          supported.
        2. The labels are not automatically label-encoded
        3. The ``classes_`` and ``n_classes_`` attributes are not learned
        """
        client = default_client()
        xgb_options = self.get_xgb_params()
        self._Booster = train(client, xgb_options, X, y,
                              num_boost_round=self.n_estimators)
        return self
Project: Yelp    Author: alexander-rakhlin
def process_fold(X_train, X_val, y_train, y_val, X_test):
    # XGBoost
    clf = OneVsRestClassifier(xgb.XGBClassifier(learning_rate=0.005, n_estimators=500))
    clf.fit(X_train, y_train)
    y_p_x = clf.predict_proba(X_val)
    y_p_x_tst = clf.predict_proba(X_test)

    # Keras
    y_p_k, y_p_k_tst = KerasClassifier(X_train, y_train, X_val, y_val, X_test)

    return (y_p_x+y_p_k) / 2.0, (y_p_x_tst+y_p_k_tst) / 2.0
Project: gcforest    Author: w821881341
def train_xgb(X_train, y_train, X_test, y_test):
    n_trees = 1000
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=5, objective='multi:softprob',
            seed=0, silent=True, nthread=-1, learning_rate=0.1)
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror", early_stopping_rounds=10)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred
Project: bnp    Author: mpearmain
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              subsample,
              colsample_bytree,
              gamma,
              min_child_weight,
              silent=True,
              nthread=-1,
              seed=1234):

    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight = min_child_weight,
                        seed=seed,
                        objective="binary:logistic")

    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)], early_stopping_rounds=25)
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
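Functions with this shape (hyper-parameters in, a score to maximize out) are typically handed to the bayes_opt package; a hedged sketch using its classic pre-1.0 API, with bounds chosen purely for illustration:

# Hedged sketch: driving xgboostcv with bayes_opt (bounds are illustrative).
from bayes_opt import BayesianOptimization

bo = BayesianOptimization(xgboostcv, {
    'max_depth': (3, 12),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (100, 1000),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0.0, 1.0),
    'min_child_weight': (1, 10),
})
bo.maximize(init_points=5, n_iter=25)
print(bo.res['max'])  # best score and parameters in the pre-1.0 bayes_opt API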
Project: LeaguePredictor    Author: dgarwin
def xgboo():
    # Gradient Boosted Trees to grid search
    model = XGBClassifier(seed=random_state, nthread=8)
    parameters = {'max_depth': [3, 6, 9], 'n_estimators': [50, 100, 200, 400]}
    grid = GridSearchCV(model, parameters, n_jobs=4, verbose=2)
    return grid
Project: parisfellows_anonymize    Author: armgilles
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
#              gamma,
#              min_child_weight,
#              max_delta_step,
              subsample,
              colsample_bytree,
              ratio=131.708,
              silent=True,
              nthread=-1,
              seed=42):
    return cross_val_score(XGBClassifier(max_depth = int(max_depth),
                                         learning_rate = learning_rate,
                                         n_estimators = int(n_estimators),
                                         silent = silent,
                                         nthread = nthread,
#                                         gamma = gamma,
#                                         min_child_weight = min_child_weight,
#                                         max_delta_step = max_delta_step,
                                         subsample = subsample,
                                         colsample_bytree = colsample_bytree,
                                         scale_pos_weight = ratio,
                                         seed = seed),
                           X,
                           y,
                           scoring='f1',
                           cv=5).mean()
Project: kaggle-pipeline    Author: randxie
def select_mdl(self, mdl_type, param):
        """
        # define classifier and parameters
        :param mdl_type: specify which model to initialize
        :param param: a dict storing model parameters
        """
        if (mdl_type == 'xgb'):
            self.mdl = xgb.XGBClassifier(**param)
        elif (mdl_type == 'lr'):
            self.mdl = LogisticRegression(**param)
        elif (mdl_type == 'rf'):
            self.mdl = RandomForestClassifier(**param)
Project: hh-page-classifier    Author: TeamHG-Memex
def pipeline(self):
        # This is a property for serialization support with xgboost,
        # because we change self.clf after __init__.
        pipeline = [self.vec]
        if isinstance(self.clf, XGBClassifier):
            # Work around xgboost issue:
            # https://github.com/dmlc/xgboost/issues/1238#issuecomment-243872543
            pipeline.append(CSCTransformer())
        pipeline.append(self.clf)
        return make_pipeline(*pipeline)
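CSCTransformer itself is defined elsewhere in that project; a minimal sketch of such a transformer, assuming all it must do is hand XGBoost a CSC matrix:

# Assumed minimal CSCTransformer (the project's real one may differ).
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class CSCTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Converting to CSC sidesteps the xgboost sparse-input issue linked above.
        return sparse.csc_matrix(X)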
Project: hh-page-classifier    Author: TeamHG-Memex
def explain_predictions(self, docs, top=30):
        if not isinstance(self.clf, XGBClassifier):
            raise NotImplementedError
        booster = self.clf.booster()
        xgb_feature_names = {f: i for i, f in enumerate(booster.feature_names)}
        feature_names = get_feature_names(self.clf, self.vec,
                                          num_features=len(xgb_feature_names))
        feature_names.bias_name = '<BIAS>'
        X = self.vec.transform(docs)
        X = X.tocsc()
        dmatrix = DMatrix(X, missing=self.clf.missing)
        leaf_ids = booster.predict(dmatrix, pred_leaf=True)
        tree_dumps = booster.get_dump(with_stats=True)
        docs_weights = []
        for i, _leaf_ids in enumerate(leaf_ids):
            all_weights = _target_feature_weights(
                _leaf_ids, tree_dumps,
                feature_names=feature_names,
                xgb_feature_names=xgb_feature_names)[1]
            weights = np.zeros_like(all_weights)
            idx = X[i].nonzero()[1]
            bias_idx = feature_names.bias_idx
            weights[idx] = all_weights[idx]
            weights[bias_idx] = all_weights[bias_idx]
            docs_weights.append(weights)
        weights = np.mean(docs_weights, axis=0)
        feature_weights = get_top_features(
            feature_names=np.array(
                [_prettify_feature(f) for f in feature_names]),
            coef=weights,
            top=top)
        return Explanation(
            estimator=type(self.clf).__name__,
            targets=[TargetExplanation('y', feature_weights=feature_weights)],
        )
Project: hh-page-classifier    Author: TeamHG-Memex
def get_attributes(obj):
    if isinstance(obj, TfidfVectorizer):
        return get_tfidf_attributes(obj)
    elif isinstance(obj, XGBClassifier):
        return pickle.dumps(obj)
    elif isinstance(obj, BaseEstimator):
        return {attr: getattr(obj, attr) for attr in dir(obj)
                if not attr.startswith('_') and attr.endswith('_')
                and attr not in skip_attributes}
    elif obj is not None:
        raise TypeError(type(obj))
Project: hh-page-classifier    Author: TeamHG-Memex
def set_attributes(parent, field, attributes):
    obj = getattr(parent, field)
    if isinstance(obj, TfidfVectorizer):
        set_ifidf_attributes(obj, attributes)
    elif isinstance(obj, XGBClassifier):
        setattr(parent, field, pickle.loads(attributes))
    elif isinstance(obj, BaseEstimator):
        for k, v in attributes.items():
            try:
                setattr(obj, k, v)
            except AttributeError:
                raise AttributeError(
                    'can\'t set attribute {} on {}'.format(k, obj))
    elif obj is not None:
        raise TypeError(type(obj))
Project: AutoML-Challenge    Author: postech-mlg-exbrain
def fit(self, X, y):
        import xgboost as xgb

        self.learning_rate = float(self.learning_rate)
        self.n_estimators = int(self.n_estimators)
        self.subsample = float(self.subsample)
        self.max_depth = int(self.max_depth)

        # (TODO) GradientBoosting used at most half of the features; here we use all
        self.colsample_bylevel = float(self.colsample_bylevel)

        self.colsample_bytree = float(self.colsample_bytree)
        self.gamma = float(self.gamma)
        self.min_child_weight = int(self.min_child_weight)
        self.max_delta_step = int(self.max_delta_step)
        self.reg_alpha = float(self.reg_alpha)
        self.reg_lambda = float(self.reg_lambda)
        self.nthread = int(self.nthread)
        self.base_score = float(self.base_score)
        self.scale_pos_weight = float(self.scale_pos_weight)

        # We don't support multilabel, so we only need 1 objective function
        if len(numpy.unique(y)) == 2:
            # We probably have binary classification
            self.objective = 'binary:logistic'
        else:
            self.objective = 'multi:softprob'

        self.estimator = xgb.XGBClassifier(
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                n_estimators=self.n_estimators,
                silent=self.silent,
                objective=self.objective,
                nthread=self.nthread,
                gamma=self.gamma,
                scale_pos_weight=self.scale_pos_weight,
                min_child_weight=self.min_child_weight,
                max_delta_step=self.max_delta_step,
                subsample=self.subsample,
                colsample_bytree=self.colsample_bytree,
                colsample_bylevel=self.colsample_bylevel,
                reg_alpha=self.reg_alpha,
                reg_lambda=self.reg_lambda,
                base_score=self.base_score,
                seed=self.seed
                )
        self.estimator.fit(X, y, eval_metric='auc')

        return self
Project: PEP    Author: ma-compbio
def parametered_cv(x,y,k_fold,k_fold1):
    print("samples: %d %d %d %d" % (x.shape[0],x.shape[1],k_fold,k_fold1))
    kf = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    index = []
    label = []
    yfit = []
    metrics = np.zeros((k_fold,5),dtype="float32")
    thresholds = []
    predicted = np.array([[0,0]])
    features1 = np.array([[0,0]])
    thresh = 0.5
    cnt = 0
    print "Positive: %d Negative: %d" % (sum(y==1), sum(y==0))
    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(y_train.shape)
        print("%d %d %d %d" % (x_train.shape[0], x_train.shape[1], x_test.shape[0], x_test.shape[1]))
        if k_fold1>1:
            thresh, thresh_vec = threshold_estimate_cv(x_train,y_train,k_fold1)
        elif k_fold1==1:
            thresh = threshold_estimate(x_train,y_train)
        else:
            thresh = 0.5
        print("%d %f" % (x_train.shape[0], thresh))
        weight = float(len(y_train[y_train == 0]))/float(len(y_train[y_train == 1]))
        w1 = np.ones(y_train.shape[0])  # float array so the class weight is not truncated to int
        w1[y_train == 1] = weight
        weight1 = float(len(y_test[y_test == 0]))/float(len(y_test[y_test == 1]))
        clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        clf.fit(x_train, y_train, sample_weight=w1)
        prob = clf.predict_proba(x_test)
        yfit1 = (prob[:,1]>thresh)
        index = np.concatenate((index,test_index),axis=0)
        label = np.concatenate((label,y_test),axis=0)
        yfit = np.concatenate((yfit,yfit1),axis=0)
        precision, recall, f1, mcc = score_function(y_test,yfit1)
        metrics[cnt,:] = np.array((thresh,precision,recall,f1,mcc))
        print(metrics[cnt, :])
        cnt += 1
        predicted = np.concatenate((predicted,prob),axis=0) 
        importances = clf.feature_importances_
        indices1 = np.argsort(importances)[::-1]
        feature_1 = np.transpose(np.array((indices1,importances[indices1])))
        features1 = np.concatenate((features1,feature_1),axis=0)

    pred = np.transpose(np.array((index,label,yfit)))
    aver_metrics = np.mean(metrics,axis=0)
    aver_metrics = np.reshape(aver_metrics,(1,metrics.shape[1]))
    metrics_1 = np.concatenate((metrics,aver_metrics),axis=0)
    print(aver_metrics)
    return metrics_1, pred, predicted[1:,], features1[1:,]

# Single run using gradient tree boosting
Project: FinancialRiskControl    Author: XierHacker
def online(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
        ]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

    print "Blending."
    # clf = LogisticRegression(C=2, penalty='l2', class_weight='balanced', n_jobs=-1)
    clf = linear_model.RidgeCV(
            alphas=np.linspace(0, 200), cv=LM_CV_NUM)
    # clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=100)
    clf.fit(dataset_blend_train, y)
    # y_submission = clf.predict_proba(dataset_blend_test)[:,1]
    print(clf.coef_, clf.intercept_)
    y_submission = clf.predict(dataset_blend_test)  # for RidgeCV

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    print "blend result"
    save_submission(os.path.join(consts.SUBMISSION_PATH,
                                     MODEL_NAME + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                        test_uid, y_submission)
Project: FinancialRiskControl    Author: XierHacker
def online2(X_org, y_org, test_x, test_uid):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
        ]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            clf.fit(X_train, y_train)
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)
        save_submission(os.path.join(consts.SUBMISSION_PATH,
                                         clf.__class__.__name__ + '_' + strftime("%m_%d_%H_%M_%S", localtime()) + '.csv'),
                            test_uid, dataset_blend_test[:, j])
Project: kaggle_bnp-paribas    Author: ArdalanM
def models():

    extra_params_kaggle_cla = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'entropy',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}

    extra_params_kaggle_reg = {'n_estimators': 1200, 'max_features': 30, 'criterion': 'mse',
                               'min_samples_leaf': 2, 'min_samples_split': 2, 'max_depth': 30,
                               'n_jobs': nthread, 'random_state': seed}


    xgb_reg = {'objective':'reg:linear', 'max_depth': 11, 'learning_rate':0.01, 'subsample':.9,
           'n_estimators':10000, 'colsample_bytree':0.45, 'nthread':nthread, 'seed':seed}

    xgb_cla = {'objective':'binary:logistic', 'max_depth': 11, 'learning_rate':0.01, 'subsample':.9,
           'n_estimators':10000, 'colsample_bytree':0.45, 'nthread':nthread, 'seed':seed}


    #NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402

    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(),advanced_activations.PReLU(),core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(),advanced_activations.PReLU(),core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),

        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),

        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),

        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),

    # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
    # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
    # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
    #
    # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
    # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
    # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2))

    ]
    for clf in clfs:
        yield clf
Project: xgboost-tuner    Author: cwerner87
def tune_xgb_params_segment_by_grid(estimator_cls: Type[Union[xgb.XGBClassifier, xgb.XGBRegressor]],
                                    label: np.ndarray,
                                    metric_sklearn: str,
                                    n_jobs: int,
                                    param_grid: dict,
                                    params: dict,
                                    strat_folds: StratifiedKFold,
                                    train: np.ndarray,
                                    verbosity_level: int = 10) -> Tuple[dict, float]:
    """
    Grid search over a segment of XGBoost parameters.

    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param param_grid:
        A dictionary of the grid of parameters to be searched over - e.g. {'colsample_bytree': range(0.5, 0.9, 0.1)} to search
        values [0.5, 0.6, 0.7, 0.8].
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)

    grid = GridSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_jobs=n_jobs,
        param_grid=param_grid,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    grid.fit(train, label)
    best_score = grid.best_score_
    # Massage the score to be in line with what xgboost reports
    if metric_sklearn == 'neg_mean_squared_error':
        best_score = abs(best_score) ** 0.5
    elif metric_sklearn == 'neg_log_loss':
        best_score = abs(best_score)
    return {k: grid.best_params_[k] for k in param_grid.keys()}, best_score
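A hedged sketch of calling the grid-segment tuner above; X, y, the params dict, and the grid values are assumptions:

# Illustrative call (X, y and all values are assumptions, not project defaults).
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

params = {'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 200}
best_segment, best_score = tune_xgb_params_segment_by_grid(
    estimator_cls=xgb.XGBClassifier,
    label=y,
    metric_sklearn='neg_log_loss',
    n_jobs=4,
    param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
    params=params,
    strat_folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    train=X,
)
params.update(best_segment)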
Project: xgboost-tuner    Author: cwerner87
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7 for
        colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
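And a matching hedged sketch for the randomized tuner, showing how the keyword arguments reshape a sampling distribution (again, X, y and all values are assumptions):

# Illustrative call of the randomized tuner (values are assumptions).
best_params, history = tune_xgb_params_randomized(
    estimator_cls=xgb.XGBClassifier,
    label=y,
    metric_sklearn='neg_log_loss',
    n_jobs=4,
    params={'objective': 'binary:logistic', 'learning_rate': 0.1, 'n_estimators': 200},
    strat_folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    train=X,
    n_iter=30,
    colsample_bytree_loc=0.5, colsample_bytree_scale=0.4,  # sample colsample_bytree uniformly in [0.5, 0.9]
)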