The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.grid_search.GridSearchCV().
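Before the project examples, here is a minimal, self-contained sketch of the typical GridSearchCV workflow on a toy data set. It is not taken from any of the projects below; the parameter values are illustrative only. Note that the sklearn.grid_search module was deprecated in scikit-learn 0.18 and removed in 0.20; in newer releases the same class is available as sklearn.model_selection.GridSearchCV.

from sklearn import datasets, svm
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection.GridSearchCV in newer releases

# Toy data set bundled with scikit-learn.
iris = datasets.load_iris()

# Exhaustively evaluate every (C, gamma) combination with 5-fold cross-validation.
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1]}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid, cv=5, scoring='accuracy')
search.fit(iris.data, iris.target)

print(search.best_params_)     # parameter combination with the highest mean CV score
print(search.best_score_)      # the corresponding mean cross-validated accuracy
print(search.best_estimator_)  # estimator refitted on the full data with the best parameters

The examples below follow the same pattern: build an estimator (often inside a Pipeline), define a parameter grid, fit the search, and read best_params_, best_score_, and best_estimator_.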
def model_random_forecast(Xtrain, Xtest, ytrain):
    X_train = Xtrain
    y_train = ytrain
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000]}
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]}
    model = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Random forecast regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_
def model_extra_trees_regression(Xtrain, Xtest, ytrain):
    X_train = Xtrain
    y_train = ytrain
    etr = ExtraTreesRegressor(n_jobs=1, random_state=0)
    param_grid = {}  # 'n_estimators': [500], 'max_features': [10,15,20]}
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_

# read data, build model and do prediction
# read train data
def evaluate_svm(train_data, train_labels, test_data, test_labels, n_jobs=-1):
    """
    Evaluates a representation using a Linear SVM
    It uses 3-fold cross validation for selecting the C parameter
    :param train_data:
    :param train_labels:
    :param test_data:
    :param test_labels:
    :param n_jobs:
    :return: the test accuracy
    """
    # Scale data to 0-1
    scaler = MinMaxScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    parameters = {'kernel': ['linear'],
                  'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
    model = svm.SVC(max_iter=10000)
    clf = grid_search.GridSearchCV(model, parameters, n_jobs=n_jobs, cv=3)
    clf.fit(train_data, train_labels)
    lin_svm_test = clf.score(test_data, test_labels)

    return lin_svm_test
def model_gradient_boosting_tree(Xtrain, Xtest, ytrain):
    X_train = Xtrain
    y_train = ytrain
    gbr = GradientBoostingRegressor(random_state=0)
    param_grid = {
        'n_estimators': [800, 1500],
        'max_features': [20, 15],
        'max_depth': [8, 10],
        'learning_rate': [0.1],
        'subsample': [1]
    }
    model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Gradient boosted tree regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_

# read data, build model and do prediction
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20, 60),
                         'n_estimators': range(10, 40),
                         'max_features': ['sqrt', 'log2', None]
                         }
                        ]

    clf = GridSearchCV(RandomForestRegressor(n_estimators=30), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "MSE for test data set:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print mean_squared_error(y_true, y_pred)
def train_classifier(self, trainvectors, labels, no_label_encoding=False, alpha='default', fit_prior=True, iterations=10):
    if alpha == '':
        paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(),
                                   param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:]), n_jobs=6)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = float(alpha)
    if fit_prior == 'False':
        fit_prior = False
    else:
        fit_prior = True
    self.model = naive_bayes.MultinomialNB(alpha=selected_alpha, fit_prior=fit_prior)
    if no_label_encoding:
        self.model.fit(trainvectors, labels)
    else:
        self.model.fit(trainvectors, self.label_encoder.transform(labels))
def training(X, y, model, param_grid={}, cv=None, n_jobs=1):
    print "Training the model..."
    gs = grid_search.GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        verbose=5,
        cv=cv,
        n_jobs=n_jobs,
        refit=True
    )
    gs.fit(X, y)
    print "#"
    print "# Best score:", gs.best_score_
    best_parameters = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print "# {0}: {1}".format(param_name, best_parameters[param_name])
    print "#"
    return gs.best_estimator_
def grid_search_gamma(rbf_svm, X, y):
    ## grid search - gamma only
    # use a full grid over all parameters
    param_grid = {'gamma': np.logspace(-15, 4, num=5000, base=2.0)}
    grid_search = GridSearchCV(rbf_svm, param_grid=param_grid, scoring='roc_auc',
                               cv=10, pre_dispatch='2*n_jobs', n_jobs=-1)
    # re-fit on the whole training data
    grid_search.fit(X, y)
    grid_search_scores = [score[1] for score in grid_search.grid_scores_]
    print('Best parameters : {}'.format(grid_search.best_params_))
    print('Best score : {}'.format(grid_search.best_score_))
    # set canvas
    fig, ax = plt.subplots(1, 1)
    # ax.scatter(X[:, 0], X[:, 1], c = y)
    ax.plot(param_grid['gamma'], grid_search_scores)
    ax.set_title('AUC = f(gamma, C = 1.0)', fontsize='large')
    ax.set_xlabel('gamma', fontsize='medium')
    ax.set_ylabel('AUC', fontsize='medium')
    return fig
def gridSearch(data, params, true_k):
    tfidf = TfidfVectorizer(strip_accents=None, lowercase=True, sublinear_tf=True, analyzer='word')
    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++', n_jobs=-1, random_state=0, verbose=0))])

    gsTfIdf = GridSearchCV(lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_features': ['sqrt', 'log2', None],
                         'max_depth': range(2, 1000),
                         }
                        ]

    reg = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)

    print "Best parameters set found on development set:\n"
    print reg.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "MSE for test data set:\n"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_true, y_pred)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 100)
                         }
                        ]

    reg = GridSearchCV(neighbors.KNeighborsRegressor(), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)

    print "Best parameters set found on development set:\n"
    print reg.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print reg.scorer_

    print "MSE for test data set:"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_pred, y_true)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'alpha': np.logspace(-5, 5)
                         }
                        ]

    reg = GridSearchCV(linear_model.Ridge(alpha=0.5), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)

    print "Best parameters set found on development set:\n"
    print reg.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print reg.scorer_

    print "MSE for test data set:"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_pred, y_true)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': np.logspace(-4, 3, 30),
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
                        {'kernel': ['poly'], 'degree': [1, 2, 3, 4],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                         'coef0': np.logspace(-4, 3, 30)},
                        {'kernel': ['linear'],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]

    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 60)
                         }
                        ]

    clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parameters, cv=5,
                       scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20, 60),
                         'n_estimators': range(10, 40),
                         'max_features': ['sqrt', 'log2', None]
                         }
                        ]

    clf = GridSearchCV(RandomForestClassifier(n_estimators=30), tuned_parameters, cv=5,
                       scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(2, 60),
                         'max_features': ['sqrt', 'log2', None]
                         }
                        ]

    clf = GridSearchCV(DecisionTreeClassifier(max_depth=5), tuned_parameters, cv=5,
                       scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
def grid_search(clf, X, y):
    params = dict(anova_filter__k=[50, 100, 'all'],
                  xgb__max_depth=[3, 5, 10],
                  xgb__n_estimators=[50, 100, 300, 500],
                  xgb__learning_rate=[0.05, 0.1])
    gs = GridSearchCV(clf, param_grid=params, n_jobs=4, cv=10, verbose=2)
    gs.fit(X, y)
    print "Best estimator:"
    print gs.best_estimator_
    print "Best parameters:"
    print gs.best_params_
    print "Best score:"
    print gs.best_score_
    y_pred = gs.predict(X)
    y_test = y
def quic_graph_lasso_cv(X, metric):
    '''Run QuicGraphLassoCV on data with metric of choice.

    Compare results with GridSearchCV + quic_graph_lasso.  The number of
    lambdas tested should be much lower with similar final lam_ selected.
    '''
    print('QuicGraphLassoCV with:')
    print('   metric: {}'.format(metric))
    model = QuicGraphLassoCV(
        cv=2,  # cant deal w more folds at small size
        n_refinements=6,
        n_jobs=1,
        init_method='cov',
        score_metric=metric)
    model.fit(X)
    print('   len(cv_lams): {}'.format(len(model.cv_lams_)))
    print('   lam_scale_: {}'.format(model.lam_scale_))
    print('   lam_: {}'.format(model.lam_))
    return model.covariance_, model.precision_, model.lam_
def kde_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = (1 + df["hour"]) * 3.92105
        df_new["weekday"] = (1 + df["weekday"]) * 4.28947
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x)) * 9.44736
        df_new["x"] = df["x"] * 424.489
        df_new["y"] = df["y"] * 959.183
        return df_new

    logging.info("train kde_opt1 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    Xte = df_cell_test_feats_kde.values
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i].values
        cstd = np.std(np.sum(np.abs(X), axis=1))
        gridcv = GridSearchCV(KernelDensity(kernel='gaussian', metric='manhattan'),
                              {'bandwidth': cstd * np.logspace(-1, 1, 10)}, cv=5)
        gridcv.fit(X)
        y_test_pred[:, i] += np.exp(gridcv.best_estimator_.score_samples(Xte))
    return y_test_pred
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()

        pred_y = h_.predict(test_x)
        print len(x), q, sm.accuracy_score(test_y, pred_y)
def grid_retrain_in_f(self, n_dim=500):
    rbf_map = RBFSampler(n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])

    # C_range = np.logspace(-5, 15, 21, base=2)
    # gamma_range = np.logspace(-15, 3, 19, base=2)
    # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
    # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
    # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
    # grid.fit(X, Y)
    #
    # rbf_svc2 = grid.best_estimator_

    rbf_svc2 = fourier_approx_svm
    rbf_svc2.fit(self.X_ex, self.y_ex)

    self.set_clf2(rbf_svc2)
    return self.benchmark()
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='poly', max_iter=10000), param_grid=param_grid,
                        cv=cv, n_jobs=1, verbose=0)
    logger.info('start grid search for Linear')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Linear')

    scores = [x[1] for x in grid.grid_scores_]

    # final train
    clf = grid.best_estimator_
    pred_train = clf.predict(self.X_ex)
    pred_val = clf.predict(self.val_x)
    pred_test = clf.predict(self.test_x)

    r = Result(self.name + ' (X)', 'Poly', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def fit_model(X, y):
    classifier = svm.SVC()
    parameters = {'kernel': ['poly', 'rbf', 'sigmoid'],
                  'degree': [1, 2, 3],
                  'C': [0.1, 1, 10]}
    f1_scorer = make_scorer(performance_metric, greater_is_better=True)
    clf = GridSearchCV(classifier, param_grid=parameters, scoring=f1_scorer)
    clf.fit(X, y)
    return clf

# Read student data
def get_model():
    if FLAGS.model == 'logistic':
        return linear_model.LogisticRegressionCV(class_weight='balanced',
                                                 scoring='roc_auc',
                                                 n_jobs=FLAGS.n_jobs,
                                                 max_iter=10000,
                                                 verbose=1)
    elif FLAGS.model == 'random_forest':
        return ensemble.RandomForestClassifier(n_estimators=100,
                                               n_jobs=FLAGS.n_jobs,
                                               class_weight='balanced',
                                               verbose=1)
    elif FLAGS.model == 'svm':
        return grid_search.GridSearchCV(
            estimator=svm.SVC(kernel='rbf', gamma='auto', class_weight='balanced'),
            param_grid={'C': np.logspace(-4, 4, 10)},
            scoring='roc_auc',
            n_jobs=FLAGS.n_jobs,
            verbose=1)
    else:
        raise ValueError('Unrecognized model %s' % FLAGS.model)
def transform_pca(clf_list):
    '''
    From classifier list to pipeline list of the same classifiers and PCA.
    '''
    pca = PCA()
    params_pca = {"pca__n_components": [2, 3, 4, 5, 10, 15, 20],
                  "pca__whiten": [False]}

    for j in range(len(clf_list)):
        name = "clf_" + str(j)
        clf, params = clf_list[j]

        # Parameters in GridSearchCV need to have double underscores
        # between specific classifiers.
        new_params = {}
        for key, value in params.iteritems():
            new_params[name + "__" + key] = value
        new_params.update(params_pca)

        clf_list[j] = (Pipeline([("pca", pca), (name, clf)]), new_params)

    return clf_list
def do(data_file_path, outputs_dir, xy_class, model_class, params):
    data = ModelAnalysis.read_data(data_file_path)
    xy = xy_class(data, exclude_suburb=False)
    model = model_class(
        xy.X.values,
        xy.y.values,
        xy.X.columns.values,
        data,
    )
    grid = GridSearchCV(
        model.model,
        params,
        scoring='mean_squared_error',
        cv=KFold(xy.y.values.shape[0]),
        verbose=0,
        n_jobs=4
    )
    grid.fit(xy.X.values, xy.y.values)
    model.model = grid.best_estimator_
    return grid.best_params_, model
def getKernelDensityEstimation(nodes, metric='euclidean', metric_params=None, bbox=None,
                               bandwidth=0.002, optimizeBandwidth=False, bwmin=0.0001,
                               bwmax=0.01, crossValidation=20):
    lon = []
    lat = []
    for nlon, nlat in nodes:
        lon.append(nlon)
        lat.append(nlat)
    lon = np.array(lon)
    lat = np.array(lat)

    if bbox is None:
        xmin, xmax = min(lon), max(lon)
        ymin, ymax = min(lat), max(lat)
        bbox = [xmin, xmax, ymin, ymax]
    else:
        xmin, ymin, xmax, ymax = bbox[0], bbox[1], bbox[2], bbox[3]
        bbox = [xmin, xmax, ymin, ymax]

    x, y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([x.ravel(), y.ravel()])
    values = np.vstack([lon, lat])

    if optimizeBandwidth:
        grid = GridSearchCV(KernelDensity(kernel='gaussian', metric=metric,
                                          metric_params=metric_params, algorithm='ball_tree'),
                            {'bandwidth': np.linspace(bwmin, bwmax, 30)},
                            cv=crossValidation)  # 20-fold cross-validation
        grid.fit(zip(*values))
        bandwidth = grid.best_params_['bandwidth']
        kernel = grid.best_estimator_
    else:
        kernel = KernelDensity(kernel='gaussian', metric=metric, metric_params=metric_params,
                               algorithm='ball_tree', bandwidth=bandwidth)
        kernel.fit(zip(*values))

    return kernel, positions, x, y, bbox, bandwidth
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2], "gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                             param_grid=Grid_Dict, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    KRR_MSE = KernelRidge(kernel='rbf', alpha=krr_Tuned.best_params_['alpha'],
                          gamma=krr_Tuned.best_params_['gamma'])
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', KRR_Time)
    MSEs_KRR = cross_validation.cross_val_score(KRR_MSE, Scaled_Input_Data, Output_Data,
                                                cv=cross_validation.LeaveOneOut(n),
                                                scoring="mean_absolute_error")
    MeanMSE_KRR = np.mean(list(MSEs_KRR))
    print('The average MSE of Kernel Ridge Regression for ', n, ' examples is: ', (-1 * MeanMSE_KRR))
    return(MeanMSE_KRR, krr_Tuned)
def _gs_SVC_r0(xM, yVc, params):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5 = cross_validation.KFold(xM.shape[0], n_folds=5, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)

    gs.fit(xM, yVc)

    return gs
def gs_SVC(xM, yVc, params, n_folds=5):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5 = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)

    gs.fit(xM, yVc)

    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.
    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, parmas, scoring='r2', cv=kf_n, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)

    return gs
def gs_Ridge(xM, yV, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1, scoring='r2'):
    """
    Parameters
    -------------
    scoring: mean_absolute_error, mean_squared_error, median_absolute_error, r2
    """
    print(xM.shape, yV.shape)

    clf = linear_model.Ridge()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, parmas, scoring=scoring, cv=kf_n, n_jobs=n_jobs)

    gs.fit(xM, yV)

    return gs
def regressorOp(x, y):
    """
    This will optimize the parameters for the algo
    """
    regr_rbf = svm.SVR(kernel="rbf")
    C = [1000, 10, 1]
    gamma = [0.005, 0.004, 0.003, 0.002, 0.001]
    epsilon = [0.1, 0.01]
    parameters = {"C": C, "gamma": gamma, "epsilon": epsilon}

    gs = grid_search.GridSearchCV(regr_rbf, parameters, scoring="r2")
    gs.fit(x, y)

    print "Best Estimator:\n", gs.best_estimator_
    print "Type: ", type(gs.best_estimator_)

    return gs.best_estimator_
def testIrisDNN(self):
    if HAS_SKLEARN:
        random.seed(42)
        iris = datasets.load_iris()
        feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
        classifier = learn.DNNClassifier(
            feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
        grid_search = GridSearchCV(
            classifier, {'hidden_units': [[5, 5], [10, 10]]},
            scoring='accuracy', fit_params={'steps': [50]})
        grid_search.fit(iris.data, iris.target)
        score = accuracy_score(iris.target, grid_search.predict(iris.data))
        self.assertGreater(score, 0.5, 'Failed with score = {0}'.format(score))
def gridSearchPipeline(pipeline, paramsGrid, Xtrain, Ytrain, **cvParams):
    print("Grid Searching pipeline:")
    print(pipeline)

    # use 5-fold stratified cross-validation by default to maintain
    # consistent class balance across training and testing
    if 'cv' not in cvParams:
        # print "Ytrain: ", Ytrain
        # numClasses = len(np.unique(Ytrain))
        # examplesPerClass = len(Ytrain) / numClasses
        # nFolds = max(5, examplesPerClass / 5)
        # if nFolds < 5:
        # if True:
        #     r, c = Ytrain.shape
        #     print "tiny Ytrain size: (%d, %d)" % Ytrain.shape  # (r, c)
        #     for row in Ytrain: print row
        # cvParams['cv'] = StratifiedKFold(Ytrain, n_folds=nFolds)
        cvParams['cv'] = StratifiedKFold(Ytrain, n_folds=5)

    cv = GridSearchCV(pipeline, paramsGrid, **cvParams)
    cv.fit(Xtrain, Ytrain)

    return cv
def fit(self, X, y, test_size=0.3):
    # Grid search cross-val (best C param)
    cv = ShuffleSplit(len(X), n_iter=1, test_size=0.3, random_state=self.seed_)
    clf_cv = GridSearchCV(self.clf_base_, self.clf_hyparams_, cv=cv, n_jobs=-1, verbose=4)

    print('====> Training Classifier (with grid search hyperparam tuning) .. ')
    print('====> BATCH Training (in-memory): {:4.3f} MB'.format(X.nbytes / 1024.0 / 1024.0))
    clf_cv.fit(X, y)
    print('BEST: {}, {}'.format(clf_cv.best_score_, clf_cv.best_params_))

    # Setting clf to best estimator
    self.clf_ = clf_cv.best_estimator_

    # # Calibrating classifier
    # print('Calibrating Classifier ... ')
    # self.clf_prob_ = CalibratedClassifierCV(self.clf_, cv=cv, method='sigmoid')
    # self.clf_prob_.fit(X, y)

    # # Setting clf to best estimator
    # self.clf_ = clf_cv.best_estimator_
    # pred_targets = self.clf_.predict(X)

    if self.epoch_no_ % 10 == 0:
        self.save(self.filename_.replace('.h5', '_iter_{}.h5'.format(self.epoch_no_)))
    self.save(self.filename_)
    self.epoch_no_ += 1
def main():
    import sys
    import numpy as np
    from sklearn import cross_validation
    from sklearn import svm
    import cPickle

    data_dir = sys.argv[1]
    fet_list = load_list(osp.join(data_dir, 'c3d.list'))
    pos_list = load_list(osp.join(data_dir, 'pos.urls'))

    features = np.load(osp.join(data_dir, 'c3d.npy'))
    fet_set = set(fet_list)
    pos_idx = [fet_list.index(i) for i in pos_list if i in fet_set]

    y = np.zeros(features.shape[0])
    y[pos_idx] = 1

    print 'n_pos', np.sum(y), 'n_neg', np.sum(1 - y)

    params = {'n_estimators': [2, 4, 5, 6, 8, 10, 30]}
    # params = {'n_estimators': [50, 70, 100, 120, 150, 200]}
    clf = grid_search.GridSearchCV(
        RandomForestClassifier(n_estimators=2, n_jobs=4),
        params,
        scoring=metrics.make_scorer(lambda yt, yp: metrics.f1_score(yt, yp, pos_label=0)),
        cv=5)
    clf.fit(features, y)
    print clf.best_score_
    print clf.best_estimator_

    cPickle.dump(clf.best_estimator_, open(osp.join(data_dir, 'c3d-models-rfc.pkl'), 'w'))
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))

    features, labels, cv = model.getFeaturesLabel()
    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv,
                                       n_jobs=self.n_jobs,
                                       scoring=mean_absolute_percentage_error_scoring,
                                       verbose=500,
                                       n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv,
                                 n_jobs=-self.n_jobs,
                                 fit_params=model.get_fit_params(),
                                 scoring=mean_absolute_percentage_error_scoring,
                                 verbose=500)

    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()

    # model.dispFeatureImportance()
    logging.debug('estimator parameters: {}'.format(estimator.get_params))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores)))

    return
def train_classifier(self, trainvectors, labels, alpha='default', fit_prior=True, iterations=10):
    if alpha == '':
        paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(),
                                   param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:]), n_jobs=6)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = alpha
    if fit_prior == 'False':
        fit_prior = False
    else:
        fit_prior = True
    self.model = naive_bayes.MultinomialNB(alpha=selected_alpha, fit_prior=fit_prior)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def train_classifier(self, trainvectors, labels, alpha='', iterations=50, jobs=10):
    iterations = int(iterations)
    jobs = int(jobs)
    if alpha == '':
        paramsearch = GridSearchCV(estimator=Perceptron(),
                                   param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:],
                                                   n_iter=[iterations]), n_jobs=jobs)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = alpha
    # train a perceptron with the settings that led to the best performance
    self.model = Perceptron(alpha=selected_alpha, n_iter=iterations, n_jobs=jobs)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):
    labels = [x.severity for x in data]

    generatePrimaryFeats(data, featTypes)
    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a bow word, the old code
            # would overwrite one of the features.
            instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})

        featurized.append(instance)

    d = DictVectorizer()
    x_train = d.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True,
                                             random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
def grid_search(model, tuned_parameters, X, Y):
    startTime = datetime.now()
    model_gs = GridSearchCV(model, tuned_parameters, cv=10)
    model_gs.fit(X, Y.values.ravel())
    print("Best parameters set found on development set:")
    print(model_gs.best_params_)
    print "\n" + "Task Completed! Completion time: " + str(datetime.now() - startTime)
    return

# Load data
def __gridSearch(self, est, X, y):
    if isinstance(est, RandomForestClassifier):
        param_grid = {
            'n_estimators': [200, 300, 400, 500, 800, 1000],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [5, 15, 25]
        }
    CV_rfc = GridSearchCV(estimator=est, param_grid=param_grid, cv=5)
    CV_rfc.fit(X, y)
    logger.info('[%s] : [INFO] Best parameters are: %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                CV_rfc.best_params_)
    print 'Best parameters are: %s' % CV_rfc.best_params_
    return CV_rfc.best_params_
def __init__(self, *models, **kwargs):
    """ Initializes the grid search

        :param list models: List of models to use. Each one should be a tuple
         with a model instance or class and a dictionary for the search space.
        :param kwargs: additional initialization arguments
         for `sklearn.grid_search.GridSearchCV`
    """
    self.models = filter(None, models)
    kwargs['refit'] = True
    self.kwargs = kwargs
def fit(self, training_sets):
    """ Searches for the best estimator and its arguments as well as the best
        training set amongst those specified.

        :param generator training_sets: Training set to use. Should be a sequence
         of tuples (x, y, metadata) where x is the training set, y is the
         correct answer for each chunk and metadata contains additional data
         that will be returned back
        :return: the metadata of the training set which yielded the best score,
         the best score obtained by the model, parameters of the model and
         fitted model itself
        :rtype: tuple
    """
    best_training, best_score, best_params, best_model = None, None, None, None
    for i, (metadata, extractor) in enumerate(training_sets):
        for model, grid in self.models:
            assert isclass(model)

            x, y = extractor.get_features(refit=True)
            grid['model_cls'] = [model]
            grid['selector_column'] = [None, extractor.lu_column()]

            search = GridSearchCV(
                FeatureSelectedClassifier(model), param_grid=grid, **self.kwargs
            )
            search.fit(x, y)

            score, params, model = search.best_score_, search.best_params_, search.best_estimator_
            logger.debug('%s with parameters %s and training meta %s has score %s',
                         type(model), params, metadata, score)
            if best_score is None or score > best_score:
                best_training, best_score, best_params, best_model = (x, y, metadata), score, params, model

    return best_training, best_score, best_params, best_model

# needs to be pickleable and callable