The following 35 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.make_scorer().
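Before the project examples, here is a minimal usage sketch of the typical pattern: wrap a plain metric function with make_scorer() and pass the resulting scorer to cross_val_score() or GridSearchCV(). The toy dataset and LogisticRegression estimator below are illustrative choices, not taken from any of the projects that follow.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score

# Toy binary-classification data and a simple estimator (illustrative only).
X, y = make_classification(n_samples=200, random_state=0)
clf = LogisticRegression(max_iter=1000)

# Wrap a plain metric into a scorer; extra keyword arguments
# (here: average) are forwarded to the metric at scoring time.
f1_scorer = make_scorer(f1_score, average='binary')

scores = cross_val_score(clf, X, y, cv=5, scoring=f1_scorer)
print(np.mean(scores))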
def learn_decision_tree(data):
    DT = tree.DecisionTreeClassifier(max_depth=7)
    scorer = make_scorer(matthews_corrcoef)
    for i in range(5):
        scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
        print("iteration", i, "dt mean:", scores.mean())
    scores = list(scores)
    print("Decision Tree train scores:\n", scores)
    return DT

# DT = DT.fit(train_data[:, :-1], train_data[:, -1])
# predictionsDT = DT.predict(validation_data[:, :-1])
# validating predicions
# dtError = 0
# for i in range(0, len(validation_data)):
#     if(validation_data[i][20] != predictionsDT[i]):
#         dtError = dtError + 1
# print("DT Error : ", float(dtError)/len(validation_data)*100.0)
def fit_model(X, y):
    classifier = svm.SVC()
    parameters = {'kernel': ['poly', 'rbf', 'sigmoid'],
                  'degree': [1, 2, 3],
                  'C': [0.1, 1, 10]}
    f1_scorer = make_scorer(performance_metric, greater_is_better=True)
    clf = GridSearchCV(classifier, param_grid=parameters, scoring=f1_scorer)
    clf.fit(X, y)
    return clf

# Read student data
def rf_from_cfg(cfg, seed):
    """
    Creates a random forest regressor from sklearn and fits the given data on it.
    This is the function-call we try to optimize. Chosen values are stored in
    the configuration (cfg).

    Parameters:
    -----------
    cfg: Configuration
        configuration chosen by smac
    seed: int or RandomState
        used to initialize the rf's random generator

    Returns:
    -----------
    np.mean(rmses): float
        mean of root mean square errors of random-forest test predictions per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))

    # Creating root mean square error for sklearns crossvalidation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_validation sign-flips the score
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                              #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              #('clf', SGDClassifier())
                              #('clf', svm.NuSVC(nu=0.01 ))
                              #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              #('clf', DummyClassifier(strategy="constant", constant=True))
                             ])
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                                     oob_score=True,
                                                     n_jobs=-1,
                                                     verbose=1))
    model_tunning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                 n_jobs=-1, cv=10,
                                 scoring=make_scorer(f1_score))
    model_tunning.fit(X, y)
    test_score = model_tunning.best_score_
    print 'The best test score: ', test_score
    y_score = model_tunning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tunning
def __init__(self, name, classifier=None, number_gen=20, verbose=0, repeat=1,
             parallel=False, make_logbook=False, random_state=None,
             cv_metric_fuction=make_scorer(matthews_corrcoef),
             features_metric_function=None):
    self._name = name
    self.estimator = SVC(kernel='linear', max_iter=10000) if classifier is None else clone(classifier)
    self.number_gen = number_gen
    self.verbose = verbose
    self.repeat = repeat
    self.parallel = parallel
    self.make_logbook = make_logbook
    self.random_state = random_state
    self.cv_metric_function = cv_metric_fuction
    self.features_metric_function = features_metric_function
    self._random_object = check_random_state(self.random_state)
    random.seed(self.random_state)
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cross_val_score(reg, X, y, cv=5, scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def main():
    import sys
    import numpy as np
    from sklearn import cross_validation
    from sklearn import svm
    import cPickle

    data_dir = sys.argv[1]
    fet_list = load_list(osp.join(data_dir, 'c3d.list'))
    pos_list = load_list(osp.join(data_dir, 'pos.urls'))

    features = np.load(osp.join(data_dir, 'c3d.npy'))
    fet_set = set(fet_list)
    pos_idx = [fet_list.index(i) for i in pos_list if i in fet_set]

    y = np.zeros(features.shape[0])
    y[pos_idx] = 1

    print 'n_pos', np.sum(y), 'n_neg', np.sum(1 - y)

    params = {'n_estimators': [2, 4, 5, 6, 8, 10, 30]}
    #params = {'n_estimators':[50, 70, 100, 120, 150, 200]}
    clf = grid_search.GridSearchCV(
        RandomForestClassifier(n_estimators=2, n_jobs=4),
        params,
        scoring=metrics.make_scorer(lambda yt, yp: metrics.f1_score(yt, yp, pos_label=0)),
        cv=5)
    clf.fit(features, y)
    print clf.best_score_
    print clf.best_estimator_

    cPickle.dump(clf.best_estimator_, open(osp.join(data_dir, 'c3d-models-rfc.pkl'), 'w'))
def opt_classifier(clf, params, features_train, labels_train, optimize=True):
    '''
    GridSearchCV to find optimal parameters of the classifier.
    '''
    if optimize:
        scorer = make_scorer(f1_score)
        clf = GridSearchCV(clf, params, scoring=scorer)
        clf = clf.fit(features_train, labels_train)
        clf = clf.best_estimator_
    else:
        clf = clf.fit(features_train, labels_train)
    return clf
def cross_validate(self):
    clf = self._clf[self._learner]
    (X_train, y_train) = self._train_data

    print " + Cross-validating classifier (learner = %s)..." \
        % self._learner,; stdout.flush()

    scores = cross_val_score(
        self._clf[self._learner], X_train, y_train,
        scoring=make_scorer(roc_auc_score), cv=3)
    print "done.\n * Scores: %r" % scores
def hierarchical_f_measure_scorer(graph):
    measure = partial(hierarchical_f_measure, graph)
    return make_scorer(measure)
def make_scoring(scoring):
    """
    Score is reversed if greater_is_better is False.
    """
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
def make_scoring(scoring):
    if scoring == 'r2':
        return make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
def _make_scoring_r0(scoring):
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
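The three helpers above rely on make_scorer() negating the metric when greater_is_better=False, so higher scorer values always mean better models. A minimal sketch of that sign convention; the regression data and LinearRegression model are illustrative assumptions, not part of the examples above.

from sklearn import metrics
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Illustrative data and model.
X, y = make_regression(n_samples=50, noise=10.0, random_state=0)
est = LinearRegression().fit(X, y)

# greater_is_better=False makes the scorer return the negated metric.
mse_scorer = metrics.make_scorer(metrics.mean_squared_error,
                                 greater_is_better=False)

raw_mse = metrics.mean_squared_error(y, est.predict(X))
via_scorer = mse_scorer(est, X, y)
print(raw_mse, via_scorer)  # via_scorer == -raw_mse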
def my_custom_log_loss_func(ground_truth, p_predicitons, penalty=list(), eps=1e-15):
    # As a general rule, the first parameter of your function should be the
    # actual answer (ground_truth) and the second should be the predictions
    # or the predicted probabilities (p_predicitons).
    adj_p = np.clip(p_predicitons, eps, 1 - eps)
    lb = LabelBinarizer()
    g = lb.fit_transform(ground_truth)
    if g.shape[1] == 1:
        g = np.append(1 - g, g, axis=1)
    if penalty:
        g[:, penalty] = g[:, penalty] * 2
    summation = np.sum(g * np.log(adj_p))
    return summation * (-1.0 / len(ground_truth))

# my_custom_scorer = make_scorer(my_custom_log_loss_func, greater_is_better=False,
#                                needs_proba=True, penalty=[4, 9])
# Here we set the penalty on for the highly confusable numbers 4 and 9 (you can
# change it or even leave it empty to check whether the resulting loss is the
# same as that of the previous experiment with the sklearn.metrics.log_loss
# function).
# This new loss function will double log_loss when evaluating the results of
# the classes of numbers 4 and 9.
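The commented-out scorer above forwards penalty=[4, 9] to the custom loss and asks for predicted probabilities. A hedged sketch of how such a scorer could be wired into cross-validation, assuming the digits dataset, a LogisticRegression model, a scikit-learn version that still accepts the needs_proba keyword, and the my_custom_log_loss_func definition (with its numpy and LabelBinarizer imports) from the example above; none of these choices come from the original snippet.

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

# Build the scorer described in the comment above; penalty=[4, 9] doubles the
# log-loss contribution of the digit classes 4 and 9.
my_custom_scorer = make_scorer(my_custom_log_loss_func,
                               greater_is_better=False,
                               needs_proba=True,
                               penalty=[4, 9])

# Illustrative data and estimator (assumptions, not from the original code).
X, y = load_digits(return_X_y=True)
clf = LogisticRegression(max_iter=2000)

# Scores come back negated because greater_is_better=False.
print(cross_val_score(clf, X, y, cv=3, scoring=my_custom_scorer))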
def search(X, y):
    rmse = make_scorer(RMSE, greater_is_better=False)
    param_test1 = {'n_estimators': range(150, 401, 50)}
    gsearch1 = GridSearchCV(
        estimator=RandomForestRegressor(min_samples_split=30,
                                        min_samples_leaf=20,
                                        max_depth=8,
                                        max_features='sqrt',
                                        random_state=10),
        param_grid=param_test1, scoring=rmse, cv=5)
    gsearch1.fit(X, y)
    print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
def crossV(model, X, y, folds=5):
    rmse = make_scorer(RMSE, greater_is_better=False)
    scores = cross_val_score(model, X, y, cv=folds, scoring=rmse, n_jobs=1)
    print scores
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def test_make_scorer():
    # Sanity check on the make_scorer factory function.
    f = lambda *args: 0
    assert_raises(ValueError, make_scorer, f,
                  needs_threshold=True, needs_proba=True)
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
def fit_cv(self, data, labels, cv_params, epochs=10, **kwargs):
    n_jobs = kwargs.get('n_jobs', 1)
    iid = kwargs.get('iid', True)
    refit = kwargs.get('refit', True)
    cv = kwargs.get('cv', None)
    verbose = kwargs.get('verbose', 0)
    pre_dispatch = kwargs.get('pre_dispatch', '2*n_jobs')
    error_score = kwargs.get('error_score', 'raise')
    return_train_score = kwargs.get('return_train_score', True)

    param_dct = self.get_params()
    param_dct.update({'bootstrap_fraction': 1.0})

    rscv = GridSearchCV(SGDBolasso(**param_dct),
                        scoring=make_scorer(accuracy_score),
                        verbose=verbose,
                        param_grid=cv_params,
                        fit_params={'epochs': 1, 'verbose': 0},
                        cv=cv,
                        return_train_score=return_train_score,
                        n_jobs=n_jobs,
                        iid=iid,
                        refit=refit,
                        pre_dispatch=pre_dispatch,
                        error_score=error_score)
    rscv.fit(data, labels)

    param_dct = rscv.best_params_.copy()
    param_dct.update({'bootstrap_fraction': self.bootstrap_fraction})
    best_estim = SGDBolasso(**param_dct)
    best_estim.fit(data, labels, epochs=epochs)
    return best_estim, rscv
def __grid_search_model(self, clf_factory, documents, labels, pos_label):
    boolndarr = labels.values == pos_label
    n = documents.size
    n_pos = labels[boolndarr].size
    n_neg = n - n_pos

    param_grid = {
        'vect__binary'      : [False, True],
        'vect__min_df'      : [1, 2],
        'vect__ngram_range' : [(1, 1), (1, 2), (1, 3)],
        'vect__smooth_idf'  : [False, True],
        'vect__stop_words'  : [None, 'english'],
        'vect__sublinear_tf': [False, True],
        'vect__use_idf'     : [False, True],
        'clf__alpha'        : [0, 0.01, 0.05, 0.1, 0.5, 1]
    }

    k = 5
    cv = ShuffleSplit(n, n_iter=k, test_size=1 / k, random_state=0)

    pos_weight = n_neg / n_pos
    sample_weight = np.ones(n)
    sample_weight[boolndarr] *= pos_weight
    fit_params = {'clf__sample_weight': sample_weight}

    f1_scorer = make_scorer(f1_score, pos_label=pos_label)

    grid_search = GridSearchCV(clf_factory,
                               param_grid,
                               cv=cv,
                               fit_params=fit_params,
                               n_jobs=-1,
                               scoring=f1_scorer)
    grid_search.fit(documents, labels)

    best_estimator = grid_search.best_estimator_
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    print("Best F1 score: {0:04.3f}".format(best_score))
    print("Parameters: {0}".format(best_params))

    return best_estimator
def train(self, a_train_data, a_dev_data=None, a_n_y=-1, a_i=-1,
          a_train_out=None, a_dev_out=None):
    """Method for training the model.

    Args:
      a_train_data (tuple[list, dict]): list of training JSON data
      a_dev_data (tuple[list, dict] or None): list of development JSON data
      a_n_y (int): number of distinct classes
      a_i (int): row index for the output predictions
      a_train_out (np.array or None): predictions for the training set
      a_dev_out (np.array or None): predictions for the training set

    Returns:
      void:

    Note:
      updates ``a_train_out`` and ``a_dev_out`` in place

    """
    self.n_y = a_n_y
    x_train, y_train = self._generate_ts(a_train_data)
    x_dev, y_dev = self._generate_ts(a_dev_data)
    # determine cross-validation and grid-search strategy and fit the model
    if self._gs:
        if a_dev_data is None or not a_dev_data[0]:
            cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True)
        else:
            cv = self._devset_cv(y_train, len(y_dev), NFOLDS)
            x_train = x_train + x_dev
            y_train = y_train + y_dev
        scorer = make_scorer(f1_score, average="macro")
        self._model = GridSearchCV(self._model, self.PARAM_GRID,
                                   scoring=scorer, cv=cv,
                                   n_jobs=self.N_JOBS, verbose=1)
    self._model.fit([el[-1] for el in x_train], y_train)
    # output best hyper-parameters
    if self._gs:
        print("Best params:", repr(self._model.best_params_),
              file=sys.stderr)
    if a_i >= 0:
        if a_train_out is not None:
            if self._gs and a_dev_data and a_dev_data[0]:
                x_train = x_train[:-len(x_dev)]
            for i, x_i in x_train:
                self._predict(x_i, a_train_out[i], a_i)
        if a_dev_out is not None:
            for i, x_i in x_dev:
                self._predict(x_i, a_dev_out[i], a_i)
def greedy_select_features(self):
    print('initial shapes:', self.train_.shape, self.test_.shape)
    saved = None if self.debug_ else self.load('chosen_features')
    if saved == None:
        g_best_score = 1e9
        g_best_features = []
        current = set()
        finished = False
    else:
        g_best_features, g_best_score, finished = saved
        current = set(g_best_features)
        print('SFS REUSE:', g_best_score, len(current), sorted(g_best_features), self.now())

    if not finished:
        col_names = self.train_.columns
        y = self.y_.ravel()
        scorer = metrics.make_scorer(metrics.log_loss)

        loop_count = len(col_names) - len(g_best_features)
        for _ in range(loop_count):
            avail = set(col_names).difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                score, _ = self.ccv(linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                if best_score > score:
                    best_score = score
                    best_features = newf
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, sorted(g_best_features), self.now())
            else:
                print('no luck', len(current), self.now())
                if len(best_features) - len(g_best_features) >= 5:
                    break
            self.save('chosen_features', (g_best_features, g_best_score, False))  # now
        self.save('chosen_features', (g_best_features, g_best_score, True))

    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]
def greedy_select_features(self):
    saved = None if self.debug_ else self.load('chosen_features')
    if saved == None:
        print('initial shapes:', self.train_.shape, self.test_.shape)
        num_columns = self.train_.shape[1]
        col_names = [str(c) for c in range(num_columns)]
        self.train_.columns = col_names
        self.test_.columns = col_names

        g_best_score = 1e9
        g_best_features = None
        y = self.y_.ravel()
        current = set()
        scorer = metrics.make_scorer(metrics.log_loss)
        for _ in enumerate(col_names):
            avail = set(col_names).difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                cv = model_selection.cross_val_score(linear_model.BayesianRidge(),
                                                     self.train_[newf], y,
                                                     cv=self.n_fold_, n_jobs=-2,
                                                     scoring=scorer)
                score = np.mean(cv)
                if best_score > score:
                    best_score = score
                    best_features = newf
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, g_best_features, self.now())
            if len(best_features) - len(g_best_features) > 15:
                break
        self.save('chosen_features', (g_best_features, None))
    else:
        g_best_features, _ = saved

    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]
def greedy_select_features(self):
    print('initial shapes:', self.train_.shape, self.test_.shape)
    saved = None if self.debug_ else self.load('chosen_features')
    if saved == None:
        g_best_score = 1e9
        g_best_features = []
        current = set()
        finished = False
    else:
        g_best_features, g_best_score, finished = saved
        current = set(g_best_features)
        print('SFS REUSE:', g_best_score, g_best_features, self.now())

    num_columns = self.train_.shape[1]
    col_names = [str(c) for c in range(num_columns)]
    self.train_.columns = col_names
    self.test_.columns = col_names

    if not finished:
        y = self.y_.ravel()
        scorer = metrics.make_scorer(metrics.log_loss)

        loop_count = len(col_names) - len(g_best_features)
        for _ in range(loop_count):
            avail = set(col_names).difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                score, _ = self.ccv(linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                if best_score > score:
                    best_score = score
                    best_features = newf
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, g_best_features, self.now())
            if len(best_features) - len(g_best_features) > 5:
                break
            self.save('chosen_features', (g_best_features, g_best_score, False))  # now
        self.save('chosen_features', (g_best_features, g_best_score, True))

    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_label, _, pvalue_label = permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)