Python xgboost module: XGBRegressor() example source code

From open-source Python projects, we have extracted the following 31 code examples that illustrate how to use xgboost.XGBRegressor().

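Before the project excerpts, here is a minimal, self-contained usage sketch; the synthetic data and hyperparameter values are illustrative only and are not taken from any of the projects below:

import numpy as np
import xgboost as xgb

# Synthetic regression data (illustrative only)
rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = 3.0 * X[:, 0] + rng.normal(scale=0.1, size=200)

# Fit a gradient-boosted regression model with a few common hyperparameters
model = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1)
model.fit(X, y)

print(model.predict(X[:5]))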
Project: Brain_Tumor_Segmentation    Author: KarthikRevanuru
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0, encoding = 'UTF-7')
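    # Mean-pool the saved feature arrays (axis 0) for each MRI modality (FLAIR, T1, T1ce, T2), one row per training case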
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])

    y=np.array([])
    t=0
    z=np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]),'Survival')
            y=np.append(y,temp)
            temp = df.get_value(str(folder_names_train[ind]),'Age')
            z=np.append(z,np.array([temp]))
        except Exception as e:
            t+=1 
            print (t,str(e),"Label Not found, deleting entry")
            y=np.append(y,0)

    z=np.array([[v] for v in z])

    t=np.concatenate((p,q),axis=1)
    u=np.concatenate((r,s),axis=1)
    x=np.concatenate((t,u),axis=1) 
    #print(x.shape)
    #print (x)
    #print (x.shape,z.shape)
    x=np.concatenate((x,z),axis=1)
    #print (x)
    #clf=linear_model.LogisticRegression(C=1e5)
    #clf = RandomForestRegressor()
    clf = xgb.XGBRegressor()
    clf.fit(x,y)
    return clf
Project: coremltools    Author: apple
def _train_convert_evaluate(self, bt_params = {}, **params):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        # Train a model
        xgb_model = xgboost.XGBRegressor(**params)
        xgb_model.fit(self.X, self.target)

        # Convert the model (feature_names can't be given because of XGboost)
        spec = xgb_converter.convert(xgb_model, self.feature_names, self.output_name, force_32bit_float = False)

        # Get predictions
        df = pd.DataFrame(self.X, columns=self.feature_names)
        df['prediction'] = xgb_model.predict(self.X)

        # Evaluate it
        metrics = evaluate_regressor(spec, df, target = 'target', verbose = False)
        return metrics
Project: tpai_comp    Author: luuuyi
def xgb_model_select(file_name):  
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]

    print 'Select Model...'
    start_time  = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor() 
    parameters = {'n_estimators': [120, 100, 140], 'max_depth':[3,5,7,9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters=grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
Project: tpai_comp    Author: luuuyi
def train_model_for_appcounts(df):
    app_df = df[['appCount','age','gender','education','marriageStatus','haveBaby']]
    known_app = app_df[app_df.appCount.notnull()].as_matrix()
    unknown_app = app_df[app_df.appCount.isnull()].as_matrix()
    y = known_app[:, 0]
    X = known_app[:, 1:]

    print 'Train Xgboost Model(For Missing AppCount)...'
    start_time  = datetime.datetime.now()
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=3)
    xgb_reg.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)

    predicted_app = xgb_reg.predict(unknown_app[:, 1:])
    df.loc[ (df.appCount.isnull()), 'appCount' ] = predicted_app 

    return df, xgb_reg
Project: tpai_comp    Author: luuuyi
def train_model_for_age(df):
    age_df = df[['age', 'appCount','gender','education','marriageStatus','haveBaby']]
    known_age = age_df[age_df.age != 0].as_matrix()
    unknown_age = age_df[age_df.age == 0].as_matrix()
    y = known_age[:, 0]
    X = known_age[:, 1:]

    print 'Train Xgboost Model(For Missing Age)...'
    start_time  = datetime.datetime.now()
    xgb_reg = XGBRegressor(n_estimators=100, max_depth=3)
    xgb_reg.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)

    predicted_age = xgb_reg.predict(unknown_age[:, 1:])
    df.loc[ (df.age == 0), 'age' ] = predicted_age 

    return df, xgb_reg
Project: tpai_comp    Author: luuuyi
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    '''print 'Train And Fix Missing Age Value...'
    train_df, xgb_age = train_model_for_age(train_df)
    joblib.dump(xgb_age, 'XGB_age.model')'''
    train_df.drop(['marriageStatus','haveBaby','sitesetID', 'positionType'], axis=1, inplace=True)
    print 'Done'
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()
    train_np = train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Xgboost Model...'
    start_time  = datetime.datetime.now()
    xgb_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False)
    xgb_clf.fit(X,y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)
    model_df = pd.DataFrame({'columns':list(train_df.columns)[1:], 'values':xgb_clf.feature_importances_})
    print model_df
    return xgb_clf
Project: huaat_ml_dl    Author: ieee820
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())

    x = np.array([np.mean(np.load('npy_result/%s.npy' % str(id)), axis=0) for id in df['id'].tolist()])
    y = df['cancer'].as_matrix()

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)

    clf = xgb.XGBRegressor(max_depth=10,
                           n_estimators=1500,
                           min_child_weight=9,
                           learning_rate=0.05,
                           nthread=8,
                           subsample=0.80,
                           colsample_bytree=0.80,
                           seed=4242)

    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
    return clf
Project: hyperband    Author: zygmuntz
def try_params( n_iterations, params, get_predictions = False ):

    n_estimators = int( round( n_iterations * trees_per_iteration ))
    print "n_estimators:", n_estimators
    pprint( params )

    model = XGB( n_estimators = n_estimators, nthread = -1, **params )

    return train_and_eval_sklearn_regressor( model, data )
Project: auto_ml    Author: doordash
def print_results(self, model_name):
        if self.ml_for_analytics and model_name in ('LogisticRegression', 'RidgeClassifier', 'LinearRegression', 'Ridge'):
            self._print_ml_analytics_results_linear_model()

        elif self.ml_for_analytics and model_name in ['RandomForestClassifier', 'RandomForestRegressor', 'XGBClassifier', 'XGBRegressor', 'GradientBoostingRegressor', 'GradientBoostingClassifier', 'LGBMRegressor', 'LGBMClassifier']:
            self._print_ml_analytics_results_random_forest()
Project: auto_ml    Author: doordash
def _get_xgb_feat_importances(self, clf):

        try:
            # Handles case when clf has been created by calling
            # xgb.XGBClassifier.fit() or xgb.XGBRegressor().fit()
            fscore = clf.booster().get_fscore()
        except:
            # Handles case when clf has been created by calling xgb.train.
            # Thus, clf is an instance of xgb.Booster.
            fscore = clf.get_fscore()

        trained_feature_names = self._get_trained_feature_names()

        feat_importances = []

        # Somewhat annoying. XGBoost only returns importances for the features it finds useful.
        # So we have to go in, get the index of the feature from the "feature name" by removing the f before the feature name, and grabbing the rest of that string, which is actually the index of that feature name.
        fscore_list = [[int(k[1:]), v] for k, v in fscore.items()]


        feature_infos = []
        sum_of_all_feature_importances = 0.0

        for idx_and_result in fscore_list:
            idx = idx_and_result[0]
            # Use the index that we grabbed above to find the human-readable feature name
            feature_name = trained_feature_names[idx]
            feat_importance = idx_and_result[1]

            # If we sum up all the feature importances and then divide by that sum, each feature importance becomes its relative feature importance, and they all sum to 1, just as in scikit-learn.
            sum_of_all_feature_importances += feat_importance
            feature_infos.append([feature_name, feat_importance])

        sorted_feature_infos = sorted(feature_infos, key=lambda x: x[1])

        print('Here are the feature_importances from the tree-based model:')
        print('The printed list will only contain at most the top 50 features.')
        for feature in sorted_feature_infos[-50:]:
            print(str(feature[0]) + ': ' + str(round(feature[1] / sum_of_all_feature_importances, 4)))
Project: auto_ml    Author: doordash
def _print_ml_analytics_results_random_forest(self):
        try:
            final_model_obj = self.trained_final_model.named_steps['final_model']
        except:
            final_model_obj = self.trained_final_model

        print('\n\nHere are the results from our ' + final_model_obj.model_name)
        if self.name is not None:
            print(self.name)
        print('predicting ' + self.output_column)

        # XGB's Classifier has a proper .feature_importances_ property, while the XGBRegressor does not.
        if final_model_obj.model_name in ['XGBRegressor', 'XGBClassifier']:
            self._get_xgb_feat_importances(final_model_obj.model)

        else:
            trained_feature_names = self._get_trained_feature_names()

            try:
                trained_feature_importances = final_model_obj.model.feature_importances_
            except AttributeError as e:
                # An older version of LightGBM misnamed this attribute, dropping the trailing "s"
                trained_feature_importances = final_model_obj.model.feature_importance_

            feature_infos = zip(trained_feature_names, trained_feature_importances)

            sorted_feature_infos = sorted(feature_infos, key=lambda x: x[1])

            print('Here are the feature_importances from the tree-based model:')
            print('The printed list will only contain at most the top 50 features.')
            for feature in sorted_feature_infos[-50:]:
                print(feature[0] + ': ' + str(round(feature[1], 4)))
Project: Supply-demand-forecasting    Author: LevinJ
def setClf(self):
        self.clf = XGBRegressor(max_depth=7, learning_rate=0.01, n_estimators=100)

        return
Project: tianchi_power    Author: lvniqi
def create_features(user_id,is_exp,
                         feature_cloumn_func = lambda day:get_feature_cloumn(None,day,has_user_type=False),
                         load_exp_func = load_user_exp_model,
                         load_func = load_user_model,
                         is_exp_power = False
                        ):
    print user_id
    dataset = get_month_by_id(user_id)
    result = []
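    # A separate pre-trained model is loaded and applied for each day of the month (1..31)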
    for day in range(1,32):
        feature_column = feature_cloumn_func(day)
        x_ = dataset[feature_column]
        trainer = xgb.XGBRegressor()
        if is_exp:
            if is_exp_power:
                x_ = exp_power(x_)
            load_exp_func(trainer,day,user_id)
        else:
            load_func(trainer,day,user_id)
        y_p = trainer.predict(x_)
        y_p = pd.Series(y_p,name='y_p#%d'%(day-1))
        if not is_exp:
            y_p = np.exp(y_p)
        result.append(y_p)
    result = pd.DataFrame(result).T
    result.index = dataset.index
    for day in range(31):
        result['real#%d'%day] = dataset['y#%d'%day].apply(np.exp)
    sys.stdout.flush()
    return result
Project: MENGEL    Author: CodeSpaceHQ
def train_xgboost_regressor():
    return mp.ModelProperties(regression=True), xgboost.XGBRegressor()
Project: MENGEL    Author: CodeSpaceHQ
def apply_filler(self, x_train, y_train, x_test):
        model = xgboost.XGBRegressor()
        model = model.fit(x_train, y_train)
        return model.predict(x_test)
Project: coremltools    Author: apple
def test_unsupported_conversion(self):

        feature_names = self.scikit_data.feature_names
        output_name = 'target'
        xgb_model = xgboost.XGBRegressor(objective = 'reg:gamma')
        xgb_model.fit(self.scikit_data.data, self.scikit_data.target)
        with self.assertRaises(ValueError):
            spec = xgb_converter.convert(xgb_model, feature_names, 'target')

        xgb_model = xgboost.XGBRegressor(objective = 'reg:tweedie')
        xgb_model.fit(self.scikit_data.data, self.scikit_data.target)
        with self.assertRaises(ValueError):
            spec = xgb_converter.convert(xgb_model, feature_names, 'target')
Project: tpai_comp    Author: luuuyi
def test():
    iris = load_iris()  
    xgb_model = xgb.XGBRegressor(n_estimators=300000, max_depth=2)
    xgb_model.fit(iris.data[:120],iris.target[:120])

    predict = xgb_model.predict(iris.data[:120])
    print mean_squared_error(iris.target[:120], predict)

    pred = xgb_model.predict(iris.data[120:])
    print mean_squared_error(iris.target[120:], pred)
Project: mlprojects-py    Author: srinathperera
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklean=False, xgb_params=None):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist  = [(test_data,'eval'), (train_data,'train')]

    #if xgb_params == None:
    #    xgb_params = get_default_xgboost_params()

    if not use_cv:
        num_rounds = 10
    else:
        cvresult = xgb.cv(xgb_params, train_data, num_boost_round=100, nfold=5,
            metrics={'rmse'}, show_progress=True)
        print cvresult
        num_rounds = len(cvresult)
    gbdt = None
    if(use_sklean):
        #gbdt = xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
        xgb_params['n_estimators'] = num_rounds
        gbdt = xgboost.XGBRegressor(**xgb_params)

        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)

        return gbdt, y_pred
    else:
        #gbdt = xgb.train( xgb_params, train_data, num_rounds, evallist, verbose_eval = True, early_stopping_rounds=5)
        gbdt = xgb.train( xgb_params, train_data, num_rounds, evallist, verbose_eval = True)

        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt, feature_names=features)

        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
        return XGBoostModel(gbdt), y_pred
Project: Kaggle-DS-Bowl-17    Author: Zephyr-D
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
#    print df.head()

    x = []
    y = []
    did = df['id'].tolist()
    cancer = df['cancer'].tolist()
    for i in range(len(df)):
        if os.path.isfile('data/stage1/%s.npy' % did[i]):
            f = np.load('data/stage1/%s.npy' % did[i])
            f = f.reshape(f.shape[0], 2048)
            x.append(np.mean(f, axis=0))
            y.append(cancer[i])

    x = np.array(x)
    print x.shape
    y = np.array(y)

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=822, stratify=y, test_size=0.1)

    clfs = []
    for s in range(5):
    # Some parameters were taken from discussion.
        clf = xgb.XGBRegressor(n_estimators=1000, max_depth=10, min_child_weight=10,
                               learning_rate=0.01, subsample=0.80, colsample_bytree=0.70,
                               seed=822 + s, reg_alpha=0.1)

        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=100)
        clfs.append(clf)
    return clfs
Project: groot    Author: zhpmatrix
def xgbr(X,y):
    X_train,X_validation,y_train,y_validation = train_test_split(X,y,random_state=0)
    xgbr_boost = xgb.XGBRegressor(seed=1)
    xgbr_boost.fit(X_train,y_train.ravel())
    print 'training error:',1.0 - xgbr_boost.score(X_train,y_train)
    print 'validation error:',1.0 - xgbr_boost.score(X_validation,y_validation)
    time_fit(xgbr_boost,X_train,y_train.ravel())
Project: mars_express    Author: wsteitz
def __init__(self):
        self.name = "onegbm"
        self.m = Pipeline([
        ("drop", FeatureRemover(["UPBS", "UPBE", "SCMN", "earthmars_km", "OCC_MARS_200KM_START_", "sa_monthly"])),
        ("gbm", xgboost.XGBRegressor(max_depth=7, n_estimators=1000, learning_rate=0.05, silent=1, seed=42))
        ])
Project: kaggle_bnp-paribas    Author: ArdalanM
def models():
    params = {'n_jobs':nthread,'random_state':seed,'class_weight':None}

    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000,max_features='auto',criterion= 'entropy',min_samples_split= 2, max_depth= None, min_samples_leaf= 1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000,max_features=60,criterion= 'gini',min_samples_split= 4, max_depth= 40, min_samples_leaf= 2, **params)

    # rf = ensemble.RandomForestClassifier(n_estimators=1000,max_features= 'auto',criterion= 'gini',min_samples_split= 2, max_depth= None, min_samples_leaf= 1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000,max_features=60,criterion= 'entropy',min_samples_split= 4, max_depth= 40, min_samples_leaf= 2, **params)

    # xgb_binlog = XGBClassifier(objective="binary:logistic" ,max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)

    rf_params = {'n_estimators':850,'max_features':60,'criterion':'entropy','min_samples_split': 4,'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}

    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic" ,max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1,XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1,XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),

    ]
    for clf in clfs:
        yield clf
Project: dask-xgboost    Author: dask
def test_regressor(loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = dxgb.XGBRegressor()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            a.fit(X2, y2)
            p1 = a.predict(X2)

    b = xgb.XGBRegressor()
    b.fit(X, y)
    assert_eq(p1, b.predict(X))
Project: real_estate    Author: cooperoelrichs
def make_model(params):
        return xgb.XGBRegressor(**params)
Project: GZ_travelTime    Author: zhilonglu
def xgb_Fit(knownX,knownY,preX):
    xlf = xgb.XGBRegressor(max_depth=11,
                           learning_rate=0.01,
                           n_estimators=301,
                           silent=True,
                           objective=mape,
                           gamma=0,
                           min_child_weight=5,
                           max_delta_step=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           colsample_bylevel=1,
                           reg_alpha=1e0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=9,
                           missing=None)
    x_train, x_test, y_train, y_test = train_test_split(knownX, knownY, test_size=0.5, random_state=1)
    for i in range(y_train.shape[1]):
        xlf.fit(x_train, y_train[:, i].reshape(-1, 1), eval_metric=mape, verbose=False)
                # eval_set=[(x_test, y_test[:, i].reshape(-1, 1))], early_stopping_rounds=2)
        tempPre = xlf.predict(preX).reshape(-1, 1)
        if i == 0:
            Y_pre = tempPre
        else:
            Y_pre = np.c_[Y_pre, tempPre]
    Y_pre = Y_pre.reshape(-1, 1)
    return Y_pre

# grid search over the model parameters
Project: GZ_travelTime    Author: zhilonglu
def xgb_Fit(knownX,knownY,preX):
    xlf = xgb.XGBRegressor(max_depth=7,#11
                           learning_rate=0.06,#0.01
                           n_estimators=1000,
                           silent=True,
                           objective=mapeobj,
                           gamma=0,
                           min_child_weight=5,
                           max_delta_step=0,
                           subsample=1,#0.8
                           colsample_bytree=0.8,
                           colsample_bylevel=1,
                           reg_alpha=1e0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1850,
                           missing=None)
    x_train, x_test, y_train, y_test = train_test_split(knownX, knownY, test_size=0.5, random_state=1)
    for i in range(y_train.shape[1]):
        xlf.fit(x_train, y_train[:, i].reshape(-1,1))
        # print('Training Error: {:.3f}'.format(1 - xlf.score(x_train,y_train[:,i].reshape(-1,1))))
        # print('Validation Error: {:.3f}'.format(1 - xlf.score(x_test,y_test[:,i].reshape(-1,1))))
        #predict value for output
        tempPre = xlf.predict(preX).reshape(-1, 1)
        if i == 0:
            Y_pre = tempPre
        else:
            Y_pre = np.c_[Y_pre, tempPre]
    Y_pre = Y_pre.reshape(-1, 1)
    return Y_pre

#sklearn???????
Project: svm-prediction    Author: zhengze
def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train,Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red') 

    #clf = GradientBoostingRegressor(n_estimators=100,max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000,max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000,max_depth=26,n_jobs=7)
    predict_list = []
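    # Walk-forward prediction: refit on a sliding window of TRAIN_SIZE points, then predict the next point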
    for i in xrange(TEST_SIZE):
        X = [ [x] for x in xrange(i, TRAIN_SIZE+i)]
        clf.fit(X, Y[i:TRAIN_SIZE+i])
        y_pred = clf.predict([TRAIN_SIZE+1+i])
        predict_list.append(y_pred)

    print "mean_squared_error:%s"%mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s"%np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s"%origin_data
    plt.plot([ x for x in xrange(TRAIN_SIZE+1, TRAIN_SIZE+TEST_SIZE+1)], predict_list, linestyle='-', color='red', label='prediction model')  
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model') 
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
Project: AutoML-Challenge    Author: postech-mlg-exbrain
def fit(self, X, y, refit=False):
        import xgboost as xgb

        self.learning_rate = float(self.learning_rate)
        self.n_estimators = int(self.n_estimators)
        self.subsample = float(self.subsample)
        self.max_depth = int(self.max_depth)

        # (TODO) Gb used at most half of the features, here we use all
        self.colsample_bylevel = float(self.colsample_bylevel)

        self.colsample_bytree = float(self.colsample_bytree)
        self.gamma = float(self.gamma)
        self.min_child_weight = int(self.min_child_weight)
        self.max_delta_step = int(self.max_delta_step)
        self.reg_alpha = float(self.reg_alpha)
        self.reg_lambda = float(self.reg_lambda)
        self.nthread = int(self.nthread)
        self.base_score = float(self.base_score)
        self.scale_pos_weight = float(self.scale_pos_weight)

        self.objective = 'reg:linear'

        self.estimator = xgb.XGBRegressor(
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                n_estimators=self.n_estimators,
                silent=self.silent,
                objective=self.objective,
                nthread=self.nthread,
                gamma=self.gamma,
                scale_pos_weight=self.scale_pos_weight,
                min_child_weight=self.min_child_weight,
                max_delta_step=self.max_delta_step,
                subsample=self.subsample,
                colsample_bytree=self.colsample_bytree,
                colsample_bylevel=self.colsample_bylevel,
                reg_alpha=self.reg_alpha,
                reg_lambda=self.reg_lambda,
                base_score=self.base_score,
                seed=self.seed
                )

        self.estimator.fit(X, y)

        return self
Project: kaggle_bnp-paribas    Author: ArdalanM
def models():

    extra_params_kaggle_cla = {'n_estimators':1200,'max_features':30,'criterion':'entropy',
                           'min_samples_leaf': 2, 'min_samples_split': 2,'max_depth': 30,
                           'min_samples_leaf': 2, 'n_jobs':nthread, 'random_state':seed}

    extra_params_kaggle_reg = {'n_estimators':1200,'max_features':30,'criterion':'mse',
                           'min_samples_leaf': 2, 'min_samples_split': 2,'max_depth': 30,
                           'min_samples_leaf': 2, 'n_jobs':nthread, 'random_state':seed}


    xgb_reg = {'objective':'reg:linear', 'max_depth': 11, 'learning_rate':0.01, 'subsample':.9,
           'n_estimators':10000, 'colsample_bytree':0.45, 'nthread':nthread, 'seed':seed}

    xgb_cla = {'objective':'binary:logistic', 'max_depth': 11, 'learning_rate':0.01, 'subsample':.9,
           'n_estimators':10000, 'colsample_bytree':0.45, 'nthread':nthread, 'seed':seed}


    #NN params
    nb_epoch = 3
    batch_size = 128
    esr = 402

    param1 = {
        'hidden_units': (256, 256),
        'activation': (advanced_activations.PReLU(),advanced_activations.PReLU(),core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    param2 = {
        'hidden_units': (1024, 1024),
        'activation': (advanced_activations.PReLU(),advanced_activations.PReLU(),core.activations.sigmoid),
        'dropout': (0., 0.), 'optimizer': RMSprop(), 'nb_epoch': nb_epoch,
    }
    clfs = [
        (D2, XGBClassifier(**xgb_cla)),
        (D11, XGBClassifier(**xgb_cla)),

        (D2, XGBRegressor(**xgb_reg)),
        (D11, XGBRegressor(**xgb_reg)),

        (D2, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),
        (D11, ensemble.ExtraTreesClassifier(**extra_params_kaggle_cla)),

        (D2, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),
        (D11, ensemble.ExtraTreesRegressor(**extra_params_kaggle_reg)),

    # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2, loss='binary_crossentropy', class_mode='binary', **param1)),
    # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
    # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param1)),
    #
    # (D1, NN(input_dim=D1[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
    # (D3, NN(input_dim=D3[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2)),
    # (D5, NN(input_dim=D5[0].shape[1], output_dim=1, batch_size=batch_size, early_stopping_epoch=esr, verbose=2,loss='binary_crossentropy', class_mode='binary', **param2))

    ]
    for clf in clfs:
        yield clf
Project: xgboost-tuner    Author: cwerner87
def tune_xgb_params_segment_by_grid(estimator_cls: Type[Union[xgb.XGBClassifier, xgb.XGBRegressor]],
                                    label: np.ndarray,
                                    metric_sklearn: str,
                                    n_jobs: int,
                                    param_grid: dict,
                                    params: dict,
                                    strat_folds: StratifiedKFold,
                                    train: np.ndarray,
                                    verbosity_level: int = 10) -> Tuple[dict, float]:
    """
    Grid search over a segment of XGBoost parameters.

    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param param_grid:
        A dictionary of the grid of parameters to be searched over - e.g. {'colsample_bytree': [0.5, 0.6, 0.7, 0.8]} to search
        exactly those values.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)

    grid = GridSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_jobs=n_jobs,
        param_grid=param_grid,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    grid.fit(train, label)
    best_score = grid.best_score_
    # Massage the score to be in line with what xgboost reports
    if metric_sklearn == 'neg_mean_squared_error':
        best_score = abs(best_score) ** 0.5
    elif metric_sklearn == 'neg_log_loss':
        best_score = abs(best_score)
    return {k: grid.best_params_[k] for k in param_grid.keys()}, best_score
Project: xgboost-tuner    Author: cwerner87
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param n_jobs:
        An optional parameter to control the amount of parallel jobs - defaults to the amount of CPUs available.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7 for
        colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]