The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.tree.DecisionTreeRegressor().
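Before the project-specific examples, here is a minimal self-contained sketch of the basic fit/predict workflow; the synthetic sine data and the max_depth=3 setting are illustrative assumptions, not taken from any of the projects below.

# Minimal usage sketch (assumptions: synthetic data, max_depth=3 chosen arbitrarily).
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = np.sort(5 * rng.rand(80, 1), axis=0)       # 80 samples, 1 feature
y = np.sin(X).ravel() + 0.1 * rng.randn(80)    # noisy sine target

regr = DecisionTreeRegressor(max_depth=3)
regr.fit(X, y)
print(regr.predict([[2.5]]))   # prediction for a new sample
print(regr.score(X, y))        # R^2 on the training data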
def cross_validate_best_known():
    '''
    Import and clean the tractor data, then do a cross validation on each of the three
    models we are training here: a RandomForest, a GradientBoost, and an AdaBoost backed
    by a DecisionTree. Print the scores.

    The parameters we're using here are the "best" that we've found so far using a grid search.
    '''
    tractor_data = pd.read_csv('data/train.csv')
    tractor_data = cln.clean_all(tractor_data)
    X = tractor_data
    y = tractor_data.pop('SalePrice')

    rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
    gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50,
                                   max_features='log2', min_samples_split=2, max_depth=1)
    ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random',
                                             min_samples_split=4, max_depth=3)
    ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)

    validate.cross_v_scores([rf, gb, ab], X, y)
    # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
    # GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
    # AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393
def test_regression():
    # Check regression for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [0.5, 1.0],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyRegressor(),
                           DecisionTreeRegressor(),
                           KNeighborsRegressor(),
                           SVR()]:
        for params in grid:
            BaggingRegressor(base_estimator=base_estimator,
                             random_state=rng,
                             **params).fit(X_train, y_train).predict(X_test)
def load(file_path):
    with open(file_path + '.params', 'r') as params_file:
        params = json.load(params_file)
    weak_learners = list()
    for wl_id in range(params['n_round']):
        # wl = DecisionTreeRegressor(max_depth=params['max_depth'],
        #                            max_features=params['max_features'],
        #                            min_samples_leaf=params['min_samples_leaf'])
        wl = joblib.load(file_path + '.wl%d' % wl_id)
        weak_learners.append(wl)
    rankgbm = RankGBM(params['vote_k'],
                      n_round=params['n_round'],
                      max_depth=params['max_depth'],
                      max_features=params['max_features'],
                      min_samples_leaf=params['min_samples_leaf'],
                      learn_rate=params['learn_rate'])
    rankgbm.weak_learners = weak_learners
    return rankgbm
def model_cross_valid(X, Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)

    def build_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt
    for model_name in [LinearRegression, ElasticNet]:
    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_features': ['sqrt', 'log2', None],
                         'max_depth': range(2, 1000),
                         }
                        ]

    # Note: scoring='mean_squared_error' and grid_scores_ are from older scikit-learn
    # releases; current versions use 'neg_mean_squared_error' and cv_results_ instead.
    reg = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)

    print("Best parameters set found on development set:\n")
    print(reg.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in reg.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("MSE for test data set:\n")
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print(mean_squared_error(y_true, y_pred))
def convert(model, feature_names, target):
    """Convert a decision tree model to protobuf format.

    Parameters
    ----------
    model : DecisionTreeRegressor
        A trained scikit-learn tree model.

    feature_names : [str]
        Name of the input columns.

    target : str
        Name of the output column.

    Returns
    -------
    model_spec : An object of type Model_pb.
        Protobuf representation of the model
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _tree.DecisionTreeRegressor)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'tree_') and model.tree_ is not None)
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
def test_tree_regressor(self):
    for dtype in self.number_data_type.keys():
        scikit_model = DecisionTreeRegressor(random_state=1)
        data = self.scikit_data['data'].astype(dtype)
        target = self.scikit_data['target'].astype(dtype)
        scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
        test_data = data[0].reshape(1, -1)
        self._check_tree_model(spec, 'multiArrayType', 'doubleType', 1)
        coreml_model = create_model(spec)
        try:
            self.assertEqual(scikit_model.predict(test_data)[0].dtype,
                             type(coreml_model.predict({'data': test_data})['target']))
            self.assertEqual(scikit_model.predict(test_data)[0],
                             coreml_model.predict({'data': test_data})['target'],
                             msg="{} != {} for Dtype: {}".format(
                                 scikit_model.predict(test_data)[0],
                                 coreml_model.predict({'data': test_data})['target'],
                                 dtype
                             )
                             )
        except RuntimeError:
            print("{} not supported. ".format(dtype))
def decision_tree(X, y, regression, max_depth=3):
    from sklearn.tree import export_graphviz
    from sklearn.externals.six import StringIO
    from IPython.core.pylabtools import figsize
    from IPython.display import Image
    figsize(12.5, 6)
    import pydot

    if regression:
        clf = DecisionTreeRegressor(max_depth=max_depth)
    else:
        clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(X, y)

    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data,
                    feature_names=list(X.columns),
                    filled=True, rounded=True,)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())
def test_DecisionTreeRegressor(*data):
    '''
    test DT regression
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    regr = DecisionTreeRegressor()
    regr.fit(X_train, y_train)
    print("Training score:{0}".format(regr.score(X_train, y_train)))
    print("Testing score:{0}".format(regr.score(X_test, y_test)))

    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    X = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    Y = regr.predict(X)
    ax.scatter(X_train, y_train, label="train sample", c='g')
    ax.scatter(X_test, y_test, label="test sample", c='r')
    ax.plot(X, Y, label="predict_value", linewidth=2, alpha=0.5)
    ax.set_xlabel("data")
    ax.set_ylabel("target")
    ax.set_title("Decision Tree Regression")
    ax.legend(framealpha=0.5)
    plt.show()
def bench_scikit_tree_regressor(X, Y):
    """Benchmark with scikit-learn decision tree regressor"""
    from sklearn.tree import DecisionTreeRegressor

    gc.collect()

    # start time
    tstart = datetime.now()
    clf = DecisionTreeRegressor()
    clf.fit(X, Y).predict(X)
    delta = (datetime.now() - tstart)
    # stop time

    scikit_regressor_results.append(
        delta.seconds + delta.microseconds / mu_second)
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for binary output variable

    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
def test_friedman_mse_in_graphviz():
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    for finding in finditer("\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())
def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=False,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_equal(boston.data.shape[1], np.unique(features).shape[0])

    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=True,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_greater(boston.data.shape[1], np.unique(features).shape[0])
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=1,
                                random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def _get_shape_for_attribute(attribute_data, labels, class_weights, feature_name, criterion, splitter,
                             max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
                             max_features, random_state, max_leaf_nodes, presort):
    dtr = DecisionTreeRegressor(criterion=criterion,
                                splitter=splitter,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features,
                                random_state=random_state,
                                max_leaf_nodes=max_leaf_nodes,
                                presort=presort)
    dtr.fit(attribute_data.reshape(-1, 1), labels)
    return feature_name, _get_sum_of_gamma_correction(dtr.tree_, attribute_data, labels,
                                                      class_weights, feature_name)
def create_model(list_of_features):
    n_estimators = 10000
    n_jobs = 4
    x_train = data_frame[list_of_features]
    y_train = data_frame.iloc[:, -1]
    x_test = data_frame_test[list_of_features]
    random_state = 0

    forest = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                              n_estimators=n_estimators,
                              random_state=random_state,
                              n_jobs=n_jobs)
    forest.fit(x_train[list_of_features], y_train)
    Y_pred = forest.predict(data_frame_test[list_of_features].as_matrix())

    i = 0
    file = open('submission.csv', 'w')
    header = "Id,SalePrice"
    header = header + '\n'
    file.write(header)
    for id in (data_frame_test['Id']):
        str = "{},{}".format(id, Y_pred[i])
        str = str + '\n'
        file.write(str)
        i += 1
def RunTestor():
    # Create Volume test
    vXs, vYs = generateBaseVector(volume_filename, "volume")
    vXs_more = createVolumeVector(vXs, weather_filename)
    vXs_fin = generateProcessedVolumeVector(vXs_more)
    vX_train, vX_test, vy_train, vy_test = train_test_split(vXs_fin, vYs, test_size=0.1)

    volume_reg = DecisionTreeRegressor()
    volume_reg.fit(vX_train, vy_train)
    vResult = volume_reg.predict(vX_test)
    v_mape = VolumeMAPE(vX_test, vResult, vy_test)

    # Create Travel Time test
    tXs, tYs = generateBaseVector(travel_filename, "travel_time")
    tXs_more = createTravelTimeVector(tXs, weather_filename)
    tXs_fin = generateProcessedTravelTimeVector(tXs_more)
    tX_train, tX_test, ty_train, ty_test = train_test_split(tXs_fin, tYs, test_size=0.1)

    travelTime_reg = DecisionTreeRegressor()
    travelTime_reg.fit(tX_train, ty_train)
    tResult = travelTime_reg.predict(tX_test)
    t_mape = TravelTimeMAPE(tX_test, tResult, ty_test)

    print("MAPE of Volume Prediction: " + str(v_mape) + "\n")
    print("MAPE of Travel Prediction: " + str(t_mape) + "\n")
    return True
def setClf(self):
    min_samples_split = 10
    self.clf = DecisionTreeRegressor(random_state=0, min_samples_split=min_samples_split)
    return
def test_logitboost_musk_fitting():
    c = LogitBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        n_estimators=30,
        learning_rate=1.0
    )
    data = MUSK1()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.6)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.05
def test_logitboost_hastie_fitting():
    c = LogitBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        n_estimators=30,
        learning_rate=1.0
    )
    data = Hastie_10_2()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.5)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
def test_gentleboost_musk_fitting():
    c = GentleBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        n_estimators=30,
        learning_rate=1.0
    )
    data = MUSK1()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.5)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.1
def test_gentleboost_hastie_fitting():
    c = GentleBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        n_estimators=30,
        learning_rate=1.0
    )
    data = Hastie_10_2()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.5)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
def fit(self, X, y, sample_weight=None):
    from sklearn.tree import DecisionTreeRegressor

    self.max_features = float(self.max_features)
    if self.max_depth == "None":
        self.max_depth = None
    else:
        num_features = X.shape[1]
        max_depth = max(1, int(np.round(self.max_depth * num_features, 0)))
    self.min_samples_split = int(self.min_samples_split)
    self.min_samples_leaf = int(self.min_samples_leaf)
    if self.max_leaf_nodes == "None":
        self.max_leaf_nodes = None
    else:
        self.max_leaf_nodes = int(self.max_leaf_nodes)
    self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

    self.estimator = DecisionTreeRegressor(
        criterion=self.criterion,
        max_depth=max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        max_leaf_nodes=self.max_leaf_nodes,
        random_state=self.random_state)
    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
def test_cart_d1_agrees_with_scikit():
    d_cart = GaussCART(X, y, 1)
    d_pred = d_cart.predict(X)

    sk_cart = tree.DecisionTreeRegressor(max_depth=1)
    sk_cart = sk_cart.fit(X, y)
    sk_pred = sk_cart.predict(X)

    d_error = np.round(sose(y, d_pred), 6)
    sk_error = np.round(sose(y, sk_pred), 6)
    assert d_error == sk_error
def test_cart_d3_agrees_with_scikit():
    d_cart = GaussCART(X, y, 3)
    d_pred = d_cart.predict(X)

    sk_cart = tree.DecisionTreeRegressor(max_depth=3)
    sk_cart = sk_cart.fit(X, y)
    sk_pred = sk_cart.predict(X)

    d_error = np.round(sose(y, d_pred), 6)
    sk_error = np.round(sose(y, sk_pred), 6)
    assert d_error == sk_error
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    def build_model(model_name):
        model = model_name()
        return model

    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print resid
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        #print model.predict(TestX)
        #print TestY
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1, X2, X3, X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        #print("Lagrange Multiplier Statistics:", ljung_box[0])
        print("Test Autocorrelation P-values:", ljung_box[1])
        if any(ljung_box[1] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def __init__(self, isTrain):
    super(RegressionAdaBoost, self).__init__(isTrain)
    # data preprocessing
    #self.dataPreprocessing()

    # Create AdaBoost regression object
    decisionReg = DecisionTreeRegressor(max_depth=10)
    rng = np.random.RandomState(1)
    self.adaReg = AdaBoostRegressor(decisionReg,
                                    n_estimators=400,
                                    random_state=rng)
def __init__(self, isTrain):
    super(RegressionDecisionTree, self).__init__(isTrain)
    # data preprocessing
    #self.dataPreprocessing()

    # Create decision tree regression object
    self.model = DecisionTreeRegressor(max_depth=7, max_features=None)
def drawValidationCurve(self):
    """
    To draw the validation curve
    :return: NA
    """
    X, y = self.X_train, self.y_train.ravel()
    indices = np.arange(y.shape[0])
    #np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    train_sizes = range(2, 60)
    train_scores, valid_scores = validation_curve(DecisionTreeRegressor(max_features=None), X, y,
                                                  "max_depth", train_sizes, cv=5,
                                                  scoring='mean_squared_error')
    train_scores = -1.0/5 * train_scores
    valid_scores = -1.0/5 * valid_scores

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training MSE")
    plt.plot(train_sizes, valid_scores_mean, '*-', color="g", label="Cross-validation MSE")

    plt.legend(loc="best")
    plt.xlabel('Max Depth')
    plt.ylabel('MSE')
    plt.title('Validation Curve with Decision \nTree Regression on the parameter of Max Depth')
    plt.grid(True)
    plt.show()
def test_search_cv_results_none_param():
    X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
    estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
    est_parameters = {"random_state": [0, None]}
    cv = KFold(random_state=0)

    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv).fit(X, y)
        assert_array_equal(grid_search.cv_results_['param_random_state'],
                           [0, None])
def regress(y, x, test_x=[]):
    if len(test_x) == 0:
        test_x = x
    clf = DecisionTreeRegressor()
    clf.fit(x, y)
    y_p = clf.predict(test_x)
    plt.scatter(y, y_p)
def ada_boost_tree_grid_search():
    ada_boost_tree_grid = {
        'base_estimator__max_features': ['sqrt'],
        'base_estimator__splitter': ['best', 'random'],
        'base_estimator__min_samples_split': [2, 4],
        'base_estimator__max_depth': [1, 3],
        'n_estimators': [50, 100, 1000],
        'learning_rate': [.001, .01, .1],
        'loss': ['linear', 'square', 'exponential']
    }
    abr = AdaBoostRegressor(DecisionTreeRegressor())
    return ada_boost_tree_grid, abr
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeRegressor

    scikit_data = load_boston()
    scikit_model = DecisionTreeRegressor(random_state=1)
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = DecisionTreeRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeRegressor

    # Load data and train model
    scikit_data = load_boston()
    self.scikit_data = scikit_data
    self.X = scikit_data['data']
    self.target = scikit_data['target']
    self.feature_names = scikit_data.feature_names
    self.output_name = 'target'
def spot_check(X, y):
    if type == 'regression':
        models = [
            (LinearRegression(), 'Ordinary Least Squares'),
            (Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
            (Ridge(), 'Ridge (alpha 1.0)'),
            (Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
            (Lasso(), 'Lasso (alpha 1.0)'),
            (ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
            (ElasticNet(), 'ElasticNet (alpha 1.0)'),
            (DecisionTreeRegressor(), 'Decision Tree'),
            (KNeighborsRegressor(), 'K-Nearest Neighbors'),
            # (RandomForestRegressor(), 'Random Forest Regressor'),
            # (BaggingRegressor(), 'Bagging Regressor'),
            # (GradientBoostingRegressor(), 'Gradient Boosted Regression'),
            # (SVR(), 'Support Vector Regression')
        ]

    splits = 5
    scores = []

    for model, model_name in models:
        score = check_model(model, splits, X, y)
        # get average score
        scores.append(score)

    model_names = map(lambda x: x[1], models)
    for name, score in zip(model_names, scores):
        print('%s: %f' % (name, score))
def get_classifier(self, X, Y):
    """
    Fit and return the regressor.
    :param X: feature matrix
    :param Y: target values
    :return: fitted model
    """
    # rng = np.random.RandomState(1)
    clf = AdaBoostRegressor(DecisionTreeRegressor())
    clf.fit(X, Y)
    return clf
def get_classifier(self, X, Y):
    """
    Fit and return the regressor.
    :param X: feature matrix
    :param Y: target values
    :return: fitted model
    """
    clf = DecisionTreeRegressor()
    clf.fit(X, Y)
    return clf
def get_classifier(self, X, Y):
    """
    Fit and return the regressor.
    :param X: feature matrix
    :param Y: target values
    :return: fitted model
    """
    clf = DecisionTreeRegressor(max_depth=4)
    clf.fit(X, Y)
    return clf
def test_boston(self):
    from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn
    model = DecisionTreeRegressor(max_n_splits=3)
    model_sklearn = DecisionTreeRegressorSklearn()

    dataset = load_boston()
    mse = []
    mse_sklearn = []

    for fold in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.33)

        model.fit(X_train, y_train)
        y = model.predict(X_test)
        mse.append(mean_squared_error(y, y_test))

        model_sklearn.fit(X_train, y_train)
        y = model_sklearn.predict(X_test)
        mse_sklearn.append(mean_squared_error(y, y_test))

    mean_mse = np.mean(mse)
    mean_mse_sklearn = np.mean(mse_sklearn)
    print(mean_mse, mean_mse_sklearn)
    # Check that our model differs in MSE no worse than 20%
    self.assertTrue(np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn < 0.2)
def test_boston(self):
    from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn
    model = DecisionTreeRegressor(tree_type='oblivious', max_n_splits=3)
    model_sklearn = DecisionTreeRegressorSklearn()

    dataset = load_boston()
    mse = []
    mse_sklearn = []

    for fold in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.33)

        model.fit(X_train, y_train)
        y = model.predict(X_test)
        mse.append(mean_squared_error(y, y_test))

        model_sklearn.fit(X_train, y_train)
        y = model_sklearn.predict(X_test)
        mse_sklearn.append(mean_squared_error(y, y_test))

    mean_mse = np.mean(mse)
    mean_mse_sklearn = np.mean(mse_sklearn)
    print(mean_mse, mean_mse_sklearn)
    # Check that our model differs in MSE no worse than 50%
    self.assertTrue(np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn < 0.5)

# def test_check_estimators(self):
#     """
#     Tests that models adhere to scikit-learn Estimator interface.
#     """
#     check_estimator(DecisionTreeClassifier)
def __init__(self, problem_type):
    self.problem_type = problem_type
    if self._is_classification():
        self.model = DecisionTreeClassifier(random_state=RANDOM_STATE + 1)
    elif self._is_regression():
        self.model = DecisionTreeRegressor(random_state=RANDOM_STATE + 2)
    else:
        raise NotImplementedError
def evaluate_decision_tree_regression(X, y):
    tree = DecisionTreeRegressor(max_depth=3)
    tree.fit(X, y)

    sort_index = X.flatten().argsort()

    lin_regplot(X[sort_index], y[sort_index], tree)
    plt.xlabel('% lower status of the population [LSTAT]')
    plt.ylabel("Price in $1000's [MEDV]")
    plt.show()
def __init__(self, base_estimator=None, n_estimators=50, max_features=1.0,
             max_depth=6, learning_rate=1.0, loss='linear', random_state=None):
    if base_estimator and base_estimator == 'etr':
        base_estimator = ExtraTreeRegressor(max_depth=max_depth,
                                            max_features=max_features)
    else:
        base_estimator = DecisionTreeRegressor(max_depth=max_depth,
                                               max_features=max_features)

    self.model = sklearn.ensemble.AdaBoostRegressor(
        base_estimator=base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=random_state,
        loss=loss)
def test_DecisionTreeRegressor_splitter(*data):
    '''
    test the performance with different splitters
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    splitters = ['best', 'random']
    for splitter in splitters:
        regr = DecisionTreeRegressor(splitter=splitter)
        regr.fit(X_train, y_train)
        print("Splitter {0}".format(splitter))
        print("Training score:{0}".format(regr.score(X_train, y_train)))
        print("Testing score:{0}".format(regr.score(X_test, y_test)))
def test_DecisionTreeRegressor_depth(*data, maxdepth):
    '''
    test the score with different max_depth
    :param data: train_data, test_data, train_value, test_value
    :param maxdepth: an integer
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth)
    training_scores = []
    testing_scores = []
    for depth in depths:
        regr = DecisionTreeRegressor(max_depth=depth)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))

    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(depths, training_scores, label="training score")
    ax.plot(depths, testing_scores, label="testing score")
    ax.set_xlabel("maxdepth")
    ax.set_ylabel("score")
    ax.set_title("Decision Tree Regression")
    ax.legend(framealpha=0.5)
    plt.show()
def test_presort_sparse():
    ests = (DecisionTreeClassifier(presort=True),
            DecisionTreeRegressor(presort=True))
    sparse_matrices = (csr_matrix, csc_matrix, coo_matrix)

    y, X = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50,
                                                   n_features=1,
                                                   n_classes=20)
    y = y[:, 0]

    for est, sparse_matrix in product(ests, sparse_matrices):
        yield check_presort_sparse, est, sparse_matrix(X), y
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train, y_train)