我们从 Python 开源项目中提取了以下 50 个代码示例,用于说明如何使用 sklearn.ensemble.RandomForestRegressor()。
def cross_validate_best_known():
    '''
    Import and clean the tractor data, then cross-validate each of the three
    models trained here: a RandomForest, a GradientBoost, and an AdaBoost
    backed by a DecisionTree. Print the scores.

    The parameters used here are the "best" found so far via grid search.
    '''
    tractor_data = pd.read_csv('data/train.csv')
    tractor_data = cln.clean_all(tractor_data)
    X = tractor_data
    y = tractor_data.pop('SalePrice')

    rf = RandomForestRegressor(max_features=2, min_samples_split=4,
                               n_estimators=50, min_samples_leaf=2)
    gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001,
                                   n_estimators=50, max_features='log2',
                                   min_samples_split=2, max_depth=1)
    ada_tree_backing = DecisionTreeRegressor(max_features='sqrt',
                                             splitter='random',
                                             min_samples_split=4, max_depth=3)
    ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1,
                           loss='square', n_estimators=1000)

    validate.cross_v_scores([rf, gb, ab], X, y)
    # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
    # GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
    # AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393
def test_stacked_regressor(self):
    """StackedRegressor on Friedman #1 should reach out-of-sample MSE < 6.0."""
    base_learner = LinearRegression()
    level0 = [RandomForestRegressor(n_estimators=50, random_state=1),
              GradientBoostingRegressor(n_estimators=25, random_state=1),
              Ridge(random_state=1)]

    # Friedman1 synthetic regression problem; 200 train / 1000 test rows.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    sr = StackedRegressor(base_learner, level0, n_folds=3, verbose=0,
                          oob_score_flag=True)
    sr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, sr.predict(X_test))
    assert_less(mse, 6.0)
def test_fwls_regressor(self):
    """FWLSRegressor with a constant meta-feature should reach MSE < 6.0."""
    # Constant (all-ones) meta-feature for feature-weighted linear stacking.
    feature_func = lambda x: np.ones(x.shape)
    base_learner = LinearRegression()
    level0 = [RandomForestRegressor(n_estimators=50, random_state=1),
              GradientBoostingRegressor(n_estimators=25, random_state=1),
              Ridge(random_state=1)]

    # Friedman1 synthetic regression problem; 200 train / 1000 test rows.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    sr = FWLSRegressor(base_learner, level0, feature_func, n_folds=3,
                       verbose=0, oob_score_flag=True)
    sr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, sr.predict(X_test))
    assert_less(mse, 6.0)
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return the feature-selection transformer registered under
    (`type_of_estimator`, `model_name`).

    `type_of_estimator` is 'classifier' or 'regressor'; raises KeyError for
    unknown estimator types or model names. 'KeepAll' is a sentinel string,
    not a transformer.
    """
    classifier_models = {
        'SelectFromModel': SelectFromModel(
            RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15),
            threshold='20*mean'),
        'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'KeepAll': 'KeepAll',
    }
    regressor_models = {
        'SelectFromModel': SelectFromModel(
            RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15),
            threshold='0.7*mean'),
        'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'KeepAll': 'KeepAll',
    }
    model_map = {'classifier': classifier_models, 'regressor': regressor_models}
    return model_map[type_of_estimator][model_name]
def model_random_forecast(Xtrain, Xtest, ytrain):
    """Grid-search a random forest on the training data and predict Xtest.

    Returns (test predictions, best CV score negated to a positive number;
    the RMSE scorer reports negative values).
    """
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000]}
    # Grids tried previously:
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]
    model = GridSearchCV(estimator=rfr, param_grid=param_grid,
                         n_jobs=1, cv=10, scoring=RMSE)
    model.fit(Xtrain, ytrain)

    print('Random forecast regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    return model.predict(Xtest), -model.best_score_
def rfr_feature_select():
    """Score each Boston feature individually with a small random forest.

    For every single feature, cross-validates a RandomForestRegressor on
    that feature alone (r2 scoring) and prints the (score, name) pairs
    sorted best-first.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    # NOTE(review): sklearn.cross_validation was removed in sklearn 0.20;
    # this code targets an older sklearn where ShuffleSplit(n, n_iter,
    # test_size) is the valid signature.
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]

    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))
    # Fixed: Python 2 print statement -> Python 3 print() call.
    print(sorted(scores, reverse=True))
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return the feature-selection transformer registered under
    (`type_of_estimator`, `model_name`).

    `type_of_estimator` is 'classifier' or 'regressor'; raises KeyError for
    unknown estimator types or model names. 'KeepAll' is a sentinel string,
    not a transformer.
    """
    classifier_models = {
        'SelectFromModel': SelectFromModel(
            RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15),
            threshold='20*mean'),
        'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLogisticRegression(),
        'KeepAll': 'KeepAll',
    }
    regressor_models = {
        'SelectFromModel': SelectFromModel(
            RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15),
            threshold='0.7*mean'),
        'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLasso(),
        'KeepAll': 'KeepAll',
    }
    model_map = {'classifier': classifier_models, 'regressor': regressor_models}
    return model_map[type_of_estimator][model_name]
def rforest2(train, test, tunings=None, smoteit=True, duplicate=True):
    """Random-forest regression to predict the number of bugs.

    When `smoteit` is true the training set is first rebalanced with SMOTE.
    `tunings`, when given, is [n_estimators, max_features(%), min_samples_leaf,
    min_samples_split]; otherwise fixed defaults are used. Returns the
    predictions for `test`.
    """
    if smoteit:
        train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)

    if tunings:
        clf = RandomForestRegressor(n_estimators=int(tunings[0]),
                                    max_features=tunings[1] / 100,
                                    min_samples_leaf=int(tunings[2]),
                                    min_samples_split=int(tunings[3]))
    else:
        clf = RandomForestRegressor(n_estimators=100, random_state=1)

    train_DF = formatData(train)
    test_DF = formatData(test)
    # Last two columns are target + id; everything before is a feature.
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]
    clf.fit(train_DF[features], klass)
    return clf.predict(test_DF[test_DF.columns[:-2]])
def test_regressor(self):
    """JoblibedRegressor must cache: a second estimator under the same name
    returns the cached predictions unchanged."""
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    index = list(range(200))

    rf = RandomForestRegressor()
    jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
    jrf.fit(X_train, y_train, index)
    prediction = jrf.predict(X_train, index)
    mse = mean_squared_error(y_train, prediction)
    assert_less(mse, 6.0)

    # Different estimator, same cache key -> must reuse cached predictions.
    rf = RandomForestRegressor(n_estimators=20)
    jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
    jrf.fit(X_train, y_train, index)
    prediction2 = jrf.predict(X_train, index)
    assert_allclose(prediction, prediction2)
def unscaled_pipelines():
    """Build single-step pipelines for models that need no feature scaling."""
    # Random forest parameters (kept for the commented-out model below).
    random_forest_kwargs = {
        'n_estimators': 10,
        'criterion': 'mse',
        'random_state': _RANDOM_STATE,
        'n_jobs': cpu_count(),
        'verbose': True,
    }
    # Gradient boosting parameters (kept for the commented-out model below).
    gradient_boost_kwargs = {
        'random_state': _RANDOM_STATE,
        'verbose': 1,
    }

    models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]
    return [make_pipeline(model) for model in models]
def parameterChoosing(self):
    """Grid-search RandomForestRegressor hyper-parameters with 5-fold CV,
    then report grid scores and the MSE on the held-out test set."""
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20, 60),
                         'n_estimators': range(10, 40),
                         'max_features': ['sqrt', 'log2', None]}]

    clf = GridSearchCV(RandomForestRegressor(n_estimators=30),
                       tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    clf.fit(self.X_train, self.y_train.ravel())

    # Fixed: Python 2 print statements -> Python 3 print() calls.
    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n"
              % (mean_score, scores.std() * 2, params))

    print("MSE for test data set:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(mean_squared_error(y_true, y_pred))
def train_xgboost():
    """Train an XGBoost survival regressor on averaged MRI-volume features.

    Features per case: mean activations of the flair/t1/t1ce/t2 volumes
    (concatenated) plus the patient age; target is the 'Survival' column
    of survival_data.csv.
    """
    df = pd.read_csv('survival_data.csv', index_col=0, encoding='UTF-7')

    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0)
                  for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0)
                  for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0)
                  for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0)
                  for id in folder_names_train])

    y = np.array([])
    misses = 0
    z = np.array([])
    for ind in range(len(folder_names_train)):
        try:
            y = np.append(y, df.get_value(str(folder_names_train[ind]), 'Survival'))
            z = np.append(z, np.array([df.get_value(str(folder_names_train[ind]), 'Age')]))
        except Exception as e:
            misses += 1
            print(misses, str(e), "Label Not found, deleting entry")
            y = np.append(y, 0)
            # NOTE(review): nothing is appended to z here, so y and z can end
            # up with different lengths when a label is missing -- confirm.

    z = np.array([[v] for v in z])  # age as a (n, 1) column

    x = np.concatenate((np.concatenate((p, q), axis=1),
                        np.concatenate((r, s), axis=1),
                        z), axis=1)

    clf = xgb.XGBRegressor()
    clf.fit(x, y)
    return clf
def fit(self, X, y):
    """
    Fit a Random Forest model to data `X` and targets `y`.

    Parameters
    ----------
    X : array-like
        Input values.
    y : array-like
        Target values.
    """
    self.X = X
    self.y = y
    # Number of training samples, kept for later use.
    self.n = X.shape[0]
    self.model = RandomForestRegressor(**self.params)
    self.model.fit(X, y)
def test_random_forest_regressor(self):
    """Converted CoreML model must match sklearn predictions for each dtype."""
    for dtype in self.number_data_type.keys():
        scikit_model = RandomForestRegressor(random_state=1)
        data = self.scikit_data['data'].astype(dtype)
        target = self.scikit_data['target'].astype(dtype)
        scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
        test_data = data[0].reshape(1, -1)
        self._check_tree_model(spec, 'multiArrayType', 'doubleType', 1)
        coreml_model = create_model(spec)
        try:
            sk_pred = scikit_model.predict(test_data)[0]
            cm_pred = coreml_model.predict({'data': test_data})['target']
            self.assertEqual(sk_pred.dtype, type(cm_pred))
            self.assertAlmostEqual(sk_pred, cm_pred,
                                   msg="{} != {} for Dtype: {}".format(
                                       sk_pred, cm_pred, dtype))
        except RuntimeError:
            # Some dtypes are not supported by the CoreML runtime.
            print("{} not supported. ".format(dtype))
def _train_convert_evaluate(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML.
    """
    scikit_model = RandomForestRegressor(random_state=1, **scikit_params)
    scikit_model.fit(self.X, self.target)

    # Convert the fitted model to a CoreML spec.
    spec = skl_converter.convert(scikit_model, self.feature_names, self.output_name)

    # Attach sklearn predictions as the reference column.
    df = pd.DataFrame(self.X, columns=self.feature_names)
    df['prediction'] = scikit_model.predict(self.X)

    return evaluate_regressor(spec, df, verbose=False)
def build_ensemble(**kwargs):
    """Build a two-layer SuperLearner: preprocessed base learners plus a
    gradient-boosting meta learner."""
    ens = SuperLearner(**kwargs)

    # Preprocessing pipelines, keyed by case name.
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    # Base learners grouped by the preprocessing they require.
    est = {'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling': [SVR()],
           'No Preprocessing': [RandomForestRegressor(random_state=SEED),
                                GradientBoostingRegressor()]}

    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
def train_model(self, train_file_path, model_path):
    """Train a random-forest regressor on max-abs-scaled features and pickle
    both the model (to `model_path`) and the scaler (to the matching
    '.scaler.pkl' path). Returns the fitted regressor."""
    print("==> Load the data ...")
    X_train, Y_train = self.load_file(train_file_path)
    print(train_file_path, shape(X_train))

    print("==> Train the model ...")
    min_max_scaler = preprocessing.MaxAbsScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train)
    clf = RandomForestRegressor(n_estimators=self.n_estimators)
    # .toarray(): assumes load_file returns a sparse matrix -- TODO confirm.
    clf.fit(X_train_minmax.toarray(), Y_train)

    print("==> Save the model ...")
    # Fixed: use context managers so the file handles are closed (the
    # original leaked both open() handles).
    with open(model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    scaler_path = model_path.replace('.pkl', '.scaler.pkl')
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(min_max_scaler, scaler_file)
    return clf
def trainModel(featureCount, imageCount, save):
    """Train a single-tree random forest over `imageCount` images.

    Relies on module-level X/Y accumulated by train(). Returns the
    (classifier, features) pair, optionally persisted to model.pkl.
    """
    clf = RandomForestRegressor(n_estimators=1, n_jobs=-1)
    features = generateFeatures(featureCount)
    for image in range(imageCount):
        # Fixed: Python 2 print statement -> Python 3 print() call.
        print("Image " + str(image))
        train(clf, features, image)
    clf = clf.fit(X, Y)
    model = (clf, features)
    if save:
        joblib.dump(model, "model.pkl")
    return model
def set_missing_ages(df):
    """Fill missing 'Age' values by regressing age on Fare/Parch/SibSp/Pclass.

    Returns the updated frame and the fitted regressor (so the same model can
    be reused elsewhere, e.g. on test data).
    """
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # .as_matrix() is the old pandas spelling of .values.
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    y = known_age[:, 0]   # target: the known ages
    X = known_age[:, 1:]  # predictors

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predict and backfill the missing ages in place.
    predictedAges = rfr.predict(unknown_age[:, 1:])
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr
def set_missing_ages(df):
    """Impute missing 'Age' with a random forest fitted on the rows where
    age is known (features: Fare, Parch, SibSp, Pclass).

    Returns (updated frame, fitted regressor).
    """
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    # Column 0 is Age (the target); the rest are predictors.
    y, X = known_age[:, 0], known_age[:, 1:]

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    predictedAges = rfr.predict(unknown_age[:, 1:])
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr

# processing the column : Cabin
def test_gbrt_base_estimator():
    """GradientBoostingQuantileRegressor must validate its base estimator."""
    rng = np.random.RandomState(1)
    N = 10000
    X = np.ones((N, 1))
    y = rng.normal(size=N)

    # Wrong estimator type is rejected.
    rgr = GradientBoostingQuantileRegressor(base_estimator=RandomForestRegressor())
    assert_raise_message(ValueError, 'type GradientBoostingRegressor',
                         rgr.fit, X, y)

    # Right type but non-quantile loss is rejected.
    rgr = GradientBoostingQuantileRegressor(base_estimator=GradientBoostingRegressor())
    assert_raise_message(ValueError, 'quantile loss', rgr.fit, X, y)

    # Quantile-loss estimator fits; predicted quantiles of N(0,1) data match
    # the normal ppf at the regressor's quantile levels.
    base = GradientBoostingRegressor(loss='quantile', n_estimators=20)
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    rgr.fit(X, y)

    estimates = rgr.predict(X, return_quantiles=True)
    assert_almost_equal(stats.norm.ppf(rgr.quantiles),
                        np.mean(estimates, axis=0),
                        decimal=2)
def rf1(train2, y, test2, v, z):
    """300-fold stratified-CV random forest.

    Accumulates out-of-fold predictions into v[<fn name>] and averaged
    log1p test predictions into z[<fn name>]; prints per-fold log-loss.
    """
    cname = sys._getframe().f_code.co_name  # column named after this function
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for fold, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d' % (fold + 1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])
        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname] += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score' % (fold + 1), score, now())
        scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits  # average the accumulated test predictions
def test_RandomForestRegressor_num(*data):
    '''
    Plot train/test scores of RandomForestRegressor as n_estimators grows.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    nums = np.arange(1, 100, step=2)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    train_curve, test_curve = [], []
    for num in nums:
        model = ensemble.RandomForestRegressor(n_estimators=num)
        model.fit(X_train, y_train)
        train_curve.append(model.score(X_train, y_train))
        test_curve.append(model.score(X_test, y_test))

    ax.plot(nums, train_curve, label="Training Score")
    ax.plot(nums, test_curve, label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1, 1)
    plt.suptitle("RandomForestRegressor")
    plt.show()
def test_RandomForestRegressor_max_depth(*data):
    '''
    Plot train/test scores of RandomForestRegressor as max_depth grows.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    maxdepths = range(1, 20)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    train_curve, test_curve = [], []
    for max_depth in maxdepths:
        model = ensemble.RandomForestRegressor(max_depth=max_depth)
        model.fit(X_train, y_train)
        train_curve.append(model.score(X_train, y_train))
        test_curve.append(model.score(X_test, y_test))

    ax.plot(maxdepths, train_curve, label="Training Score")
    ax.plot(maxdepths, test_curve, label="Testing Score")
    ax.set_xlabel("max_depth")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("RandomForestRegressor")
    plt.show()
def test_RandomForestRegressor_max_features(*data):
    '''
    Plot train/test scores of RandomForestRegressor as max_features varies.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    max_features = np.linspace(0.01, 1.0)  # fractions of the feature count

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    train_curve, test_curve = [], []
    for max_feature in max_features:
        model = ensemble.RandomForestRegressor(max_features=max_feature)
        model.fit(X_train, y_train)
        train_curve.append(model.score(X_train, y_train))
        test_curve.append(model.score(X_test, y_test))

    ax.plot(max_features, train_curve, label="Training Score")
    ax.plot(max_features, test_curve, label="Testing Score")
    ax.set_xlabel("max_feature")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("RandomForestRegressor")
    plt.show()
def rf(train_sample, validation_sample, features, seed):
    """Fit a random forest on log-transformed volume; report MAPE on the
    validation sample and return the back-transformed predictions."""
    log_base = np.e
    rf_est = RandomForestRegressor(n_estimators=500,
                                   criterion='mse',
                                   max_features=4,
                                   max_depth=None,
                                   bootstrap=True,
                                   min_samples_split=4,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0,
                                   max_leaf_nodes=None,
                                   random_state=seed)
    # Train on log1p(volume) in base `log_base`; invert the transform below.
    rf_est.fit(train_sample[features],
               np.log1p(train_sample['volume']) / np.log(log_base))
    rf_prob = np.power(log_base, rf_est.predict(validation_sample[features])) - 1
    print_mape(validation_sample['volume'], rf_prob, 'RF')
    return rf_prob
def test_check_consistent_length():
    """check_consistent_length accepts equal-length args and raises with a
    helpful message otherwise."""
    # Equal lengths of various container types are accepted.
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))

    # Mismatched lengths and non-sized objects raise.
    assert_raises_regexp(ValueError, 'inconsistent numbers of samples',
                         check_consistent_length, [1, 2], [1])
    assert_raises_regexp(TypeError, 'got <\w+ \'int\'>',
                         check_consistent_length, [1, 2], 1)
    assert_raises_regexp(TypeError, 'got <\w+ \'object\'>',
                         check_consistent_length, [1, 2], object())
    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))

    # Despite ensembles having __len__ they must raise TypeError
    assert_raises_regexp(TypeError, 'estimator', check_consistent_length,
                         [1, 2], RandomForestRegressor())
    # XXX: We should have a test with a string, but what is correct behaviour?
def try_params(n_iterations, params):
    """Build a random forest sized by the iteration budget, then train and
    evaluate it with the module-level `data`.

    The ensemble size is n_iterations * trees_per_iteration, rounded.
    """
    n_estimators = int(round(n_iterations * trees_per_iteration))
    # Fixed: Python 2 print statement -> Python 3 print() call.
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = RF(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    return train_and_eval_sklearn_regressor(clf, data)
def __init__(self, task: Task, scorer: Scorer,
             opt_logger: OptimizationLogger = VoidLogger(None)):
    """Pick the RF classifier or regressor (and its matching search space)
    according to the task type, then defer to the base optimizer."""
    is_classification = task.task == "classification"
    params = RandomForestOptimizer.Params
    if is_classification:
        space = params.classification_space
        model = ensemble.RandomForestClassifier()
    else:
        space = params.regression_space
        model = ensemble.RandomForestRegressor()
    super().__init__(model, task, space, scorer, opt_logger)
def setClf(self):
    """Configure the random-forest regressor used by this learner."""
    self.clf = RandomForestRegressor(n_estimators=100, max_features=0.8)
    return
def machine_learning_RF(x_train, y_train, x_test, y_test):
    """Random-forest regression with NaN-row filtering.

    Drops training rows where any feature or the target is NaN, and test
    rows where any feature is NaN, then fits a 100-tree forest and predicts.

    Returns (y_test restricted to valid test rows, predictions).
    """
    import numpy as np
    # Fixed: reduce is not a builtin in Python 3.
    from functools import reduce

    # Build a training row mask that is True only where every feature and
    # the target are non-NaN.
    mask = []
    for i in range(np.shape(x_train)[1]):
        mask.append(~np.isnan(x_train[:, i]))
    mask.append(~np.isnan(np.transpose(y_train)))
    mask = np.transpose(reduce(np.logical_and, mask))
    mask = mask.reshape(len(mask),)
    inputs = x_train[mask, :]
    targets = y_train[mask]

    # Same filtering for the test features (no target needed here).
    mask2 = []
    for i in range(np.shape(x_test)[1]):
        mask2.append(~np.isnan(x_test[:, i]))
    mask2 = np.transpose(reduce(np.logical_and, mask2))
    inputs_test = x_test[mask2, :]

    # n_estimators is how many trees the forest samples.
    from sklearn.ensemble import RandomForestRegressor
    rfc_new = RandomForestRegressor(n_estimators=100, random_state=42,
                                    max_features=2)
    rfc_new = rfc_new.fit(inputs, targets)

    predicted_y = rfc_new.predict(inputs_test)
    # Fixed: Python 2 print statement -> Python 3 print() call.
    print(rfc_new.feature_importances_)
    return y_test[mask2], predicted_y
def convert(model, feature_names, target):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    model : RandomForestRegressor
        A trained scikit-learn tree model.
    feature_names: [str]
        Name of the input columns.
    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.RandomForestRegressor)

    def is_rf_model(m):
        # Fixed: check that `estimators_` exists before calling
        # len(m.estimators_); the original dereferenced it first, so an
        # unfitted model raised AttributeError instead of failing the check.
        if not hasattr(m, 'estimators_') or m.estimators_ is None:
            return False
        if len(m.estimators_) == 0:
            return False
        for t in m.estimators_:
            if not hasattr(t, 'tree_') or t.tree_ is None:
                return False
        return True

    _sklearn_util.check_fitted(model, is_rf_model)
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
def persist_pipelines(pipelines):
    """Serialize every fitted pipeline to models/<name>-<yy-mm-dd>.pkl."""
    Path('models').mkdir(exist_ok=True)
    fp_fmt = 'models/{}-{:%y-%m-%d}.pkl'
    now = dt.datetime.now()
    for pipe in pipelines:
        print(utils.pipeline_name(pipe))
        fp_name = fp_fmt.format(utils.pipeline_name(pipe), now)
        # joblib is used because plain pickle failed on RandomForestRegressor.
        joblib.dump(pipe, fp_name)
def model_cross_valid(X, Y):
    """10-fold cross-validate several baseline regressors on (X, Y),
    printing each model's mean negative-MSE score."""
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    scoring = 'neg_mean_squared_error'
    # Candidates tried previously also included Ridge, Lasso,
    # KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor,
    # AdaBoostRegressor and GradientBoostingRegressor.
    for model_name in [LinearRegression, ElasticNet]:
        model = model_name()
        results = model_selection.cross_val_score(model, X, Y,
                                                  cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def __init__(self, isTrain):
    """Random-forest regression model with pre-tuned hyper-parameters."""
    super(RegressionRandomForest, self).__init__(isTrain)
    # Hyper-parameters found by earlier tuning.
    self.model = RandomForestRegressor(max_features='sqrt',
                                       n_estimators=32,
                                       max_depth=39)
def __init__(self, nr_events, case_id_col, encoder_kwargs, cls_kwargs,
             cls_method="rf"):
    """Regressor over sequences encoded from the first `nr_events` events
    of each case; `cls_method` selects 'rf' or 'gbm'."""
    self.case_id_col = case_id_col
    self.nr_events = nr_events
    self.encoder = SequenceEncoder(nr_events=nr_events,
                                   case_id_col=case_id_col,
                                   **encoder_kwargs)
    if cls_method == "gbm":
        self.cls = GradientBoostingRegressor(**cls_kwargs)
    elif cls_method == "rf":
        self.cls = RandomForestRegressor(**cls_kwargs)
    else:
        # NOTE(review): self.cls is left unset in this branch; later use
        # of the object will raise AttributeError.
        print("Classifier method not known")
def fastLapModel(xList, labels, names, multiple=0, full_set=0):
    """Train random forests of increasing ensemble size on a fixed 30%
    holdout, plot test MSE vs ensemble size, and return the splits plus
    the last fitted model.

    `multiple` scales the ensemble-size range by 10**multiple; `full_set`
    trains the final model on all rows (no holdout for fitting).
    """
    X = numpy.array(xList)
    y = numpy.array(labels)
    featureNames = numpy.array(names)

    # take fixed holdout set 30% of data rows
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.30, random_state=531)
    # for final model (no CV)
    if full_set:
        xTrain = X
        yTrain = y
    check_set(xTrain, xTest, yTrain, yTest)
    # Fixed: Python 2 print statements -> Python 3 print() calls.
    print("Fitting the model to the data set...")

    # train random forest at a range of ensemble sizes in order to see how
    # the mse changes
    mseOos = []
    m = 10 ** multiple
    nTreeList = range(500 * m, 1000 * m, 100 * m)
    for iTrees in nTreeList:
        depth = None
        maxFeat = int(np.sqrt(np.shape(xTrain)[1])) + 1  # try tweaking
        RFmd = ensemble.RandomForestRegressor(n_estimators=iTrees,
                                              max_depth=depth,
                                              max_features=maxFeat,
                                              oob_score=False,
                                              random_state=531,
                                              n_jobs=-1)
        RFmd.fit(xTrain, yTrain)
        # Accumulate mse on test set
        prediction = RFmd.predict(xTest)
        mseOos.append(mean_squared_error(yTest, prediction))

    # plot training and test errors vs number of trees in ensemble
    plot.plot(nTreeList, mseOos)
    plot.xlabel('Number of Trees in Ensemble')
    plot.ylabel('Mean Squared Error')
    plot.show()
    print("MSE")
    print(mseOos[-1])
    return xTrain, xTest, yTrain, yTest, RFmd
def fit_forest(X, y, window=100000, estimators=100, samples_leaf=250,
               validate=True):
    '''
    Fit a random forest to (X, y).

    With validate=True, runs windowed cross-validation and returns its
    result; otherwise returns the forest fitted on all of the data.
    '''
    model = RandomForestRegressor(n_estimators=estimators,
                                  min_samples_leaf=samples_leaf,
                                  random_state=42,
                                  n_jobs=-1)
    if not validate:
        return model.fit(X, y)
    return cross_validate(X, y, model, window)
def __init__(self, **params):
    """
    Wrapper around sklearn's Random Forest implementation for pyGPGO.
    Random Forests can also be used for surrogate models in Bayesian
    Optimization; an estimate of 'posterior' variance can be obtained by
    using the `impurity` criterion value in each subtree.

    Parameters
    ----------
    params: tuple, optional
        Any parameters to pass to `RandomForestRegressor`. Defaults to
        sklearn's.
    """
    self.params = params
def generate_RF_model(file_name):
    """Train a random-forest regressor on the ad-campaign features, persist
    it to 'RF.model' and return it. The first selected column is the label."""
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]   # label column
    X = train_np[:, 1:]  # everything else is a feature

    # Fixed: Python 2 print statements -> Python 3 print() calls.
    print('Train Random Forest Regression Model...')
    start_time = datetime.datetime.now()
    rf = RandomForestRegressor(n_estimators=25, n_jobs=-1)  # , class_weight='balanced')
    rf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    print((end_time - start_time).seconds)

    print('Save Model...')
    joblib.dump(rf, 'RF.model')
    return rf
def rf_from_cfg(cfg, seed):
    """
    Creates a random forest regressor from sklearn and fits the given data
    on it. This is the function-call we try to optimize. Chosen values are
    stored in the configuration (cfg).

    Parameters:
    -----------
    cfg: Configuration
        configuration chosen by smac
    seed: int or RandomState
        used to initialize the rf's random generator

    Returns:
    -----------
    float
        -1 times the mean cross-validated score (the scorer reports RMSE
        sign-flipped, so this is the mean RMSE across the 11 folds)
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        """Root mean square error, used as the CV scoring function."""
        return np.sqrt(np.mean((y_pred - y) ** 2))

    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11,
                            scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_validation sign-flips the score
def train(self, x, y, n_estimators=10, max_depth=None, min_samples_leaf=1):
    """Fit a random forest (classifier or regressor) matching the
    configured problem type; raises NotImplementedError otherwise."""
    n_estimators = self.to_int(n_estimators)
    max_depth = self.to_int(max_depth)
    min_samples_leaf = self.pos_int(min_samples_leaf)

    if self.problem_type == ProblemType.BINARY_CLAS:
        forest_cls = RandomForestClassifier
    elif self.problem_type == ProblemType.REGRESSION:
        forest_cls = RandomForestRegressor
    else:
        raise NotImplementedError(
            'Problem type {0} not implemented'.format(self.problem_type))

    self.model = forest_cls(n_estimators, max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf)
    self.model.fit(x, y)
def define_model(self):
    """Instantiate the estimator named by self.modeltype, configured from
    self.parameters.

    Raises ConfigError for unsupported model types.
    """
    if self.modeltype == "RandomForest":
        return ensemble.RandomForestRegressor(
            n_estimators=self.parameters['n_estimators'])
    elif self.modeltype == "LinearRegression":
        return linear_model.LinearRegression()
    elif self.modeltype == "Lasso":
        return linear_model.Lasso(alpha=self.parameters['alpha'])
    elif self.modeltype == "ElasticNet":
        return linear_model.ElasticNet(
            alpha=self.parameters['alpha'],
            l1_ratio=self.parameters['l1_ratio'])
    elif self.modeltype == "SVR":
        return SVR(C=self.parameters['C'],
                   epsilon=self.parameters['epsilon'],
                   kernel=self.parameters['kernel'])
    else:
        raise ConfigError("Unsupported model {0}".format(self.modeltype))
def predict(self, X, return_std=False):
    """Predict with the forest.

    With return_std=True, returns (mean, std) of the individual tree
    predictions per sample; otherwise defers to the parent class.
    """
    if not return_std:
        return super(RandomForestRegressor, self).predict(X)

    # Stack one row of predictions per tree -> shape (n_trees, n_samples).
    per_tree = np.concatenate(
        [tree.predict(X)[np.newaxis, :] for tree in self.estimators_],
        axis=0)
    return per_tree.mean(axis=0), per_tree.std(axis=0)
def greedy_elim(df):
    """Select 150 features by recursive feature elimination with a
    gradient-boosting estimator; returns the selected column names.

    The target column is 'SalePrice'; all other columns are candidates.
    """
    # do feature selection using boruta
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    # model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to be the best at the moment. Why this is is unclear.
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)

    # find all relevant features
    feat_selector.fit_transform(X.as_matrix(), y.as_matrix())

    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    # Fixed: removed dead code that computed (and never used) the feature
    # ranking arrays behind commented-out prints.
    return features[features_bool]
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    """Fit baseline regressors and run residual diagnostics on the test set.

    For each model: fit, plot residuals vs predictions, then test the
    residuals for normality (Jarque-Bera), heteroskedasticity
    (Breusch-Pagan) and autocorrelation (Ljung-Box), printing each p-value.

    Fixed: the original mixed Python 2 print statements with Python 3
    print() calls (a syntax error in either version); all prints are now
    Python 3 calls.
    """
    def bulid_model(model_name):
        model = model_name()
        return model

    # Candidates tried previously also included Ridge, Lasso,
    # KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor,
    # AdaBoostRegressor and GradientBoostingRegressor.
    for model_name in [LinearRegression, ElasticNet]:
        model = bulid_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        print("Residual sum of squares: %f" % np.mean(resid ** 2))

        # Residuals vs predictions: should scatter evenly around zero.
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid,
                                                            xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        print("Test Autocorrelation P-values:", ljung_box[1])
        if any(ljung_box[1] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def __init__(self, **params):
    """
    Wrapper around sklearn's ExtraTreesRegressor implementation for pyGPGO.
    Extra-trees forests can also be used for surrogate models in Bayesian
    Optimization; an estimate of 'posterior' variance can be obtained by
    using the `impurity` criterion value in each subtree.

    Parameters
    ----------
    params: tuple, optional
        Any parameters to pass to `ExtraTreesRegressor`. Defaults to
        sklearn's.
    """
    # Fixed docstring: the parameters are forwarded to ExtraTreesRegressor
    # (this class wraps extra-trees), not RandomForestRegressor.
    self.params = params
def random_forest_grid_search():
    """Return (param_grid, estimator) for a random-forest grid search."""
    param_grid = {
        'n_estimators': [50, 100, 1000],
        'max_features': ['sqrt', 'log2', 'auto'],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2],
    }
    return param_grid, RandomForestRegressor()
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor

    scikit_data = load_boston()
    scikit_model = RandomForestRegressor(random_state=1)
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model