The following 11 code examples, extracted from open-source Python projects, illustrate how to use sklearn.grid_search.RandomizedSearchCV().
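As a quick orientation before the project examples, here is a minimal, self-contained sketch of the basic call pattern. The estimator, parameter distributions, and synthetic data below are illustrative assumptions, not code from any of the projects that follow. Note that the sklearn.grid_search module was deprecated in scikit-learn 0.18 and removed in 0.20, so recent versions expose RandomizedSearchCV through sklearn.model_selection instead.

# Minimal usage sketch (illustrative only: synthetic data and assumed parameter ranges).
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
try:
    from sklearn.grid_search import RandomizedSearchCV      # old import path used in the examples below
except ImportError:
    from sklearn.model_selection import RandomizedSearchCV  # scikit-learn >= 0.20

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# Unlike GridSearchCV, the distributions are sampled n_iter times rather than enumerated exhaustively.
param_distributions = {
    'n_estimators': randint(10, 100),
    'max_depth': randint(2, 10),
}

search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            param_distributions,
                            n_iter=10, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)
print(search.best_score_)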
def randomsearch_xgboost(df):
    param_distributions = {'max_depth': sp.stats.randint(1, 11),
                           'subsample': sp.stats.uniform(0.25, 0.75),
                           'colsample_bytree': sp.stats.uniform(0.25, 0.75)}
    xgb_model = XGBClassifier()
    rs = RandomizedSearchCV(xgb_model, param_distributions, cv=10, n_iter=20,
                            scoring="log_loss", n_jobs=1, verbose=2)
    rs.fit(train_X, train_y.transpose()[0])
    predict = rs.predict_proba(test_X)
    return predict[:, 1]
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))

    features, labels, cv = model.getFeaturesLabel()

    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv,
                                       n_jobs=self.n_jobs,
                                       scoring=mean_absolute_percentage_error_scoring,
                                       verbose=500, n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv,
                                 n_jobs=-self.n_jobs, fit_params=model.get_fit_params(),
                                 scoring=mean_absolute_percentage_error_scoring, verbose=500)

    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()

    # model.dispFeatureImportance()
    logging.debug('estimator parameters: {}'.format(estimator.get_params))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores)))
    return
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far supercede the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def random_search_cv(clf, param_distribution, n_iter_search, X_train, y_train):
    '''
    random search with optimization with nested resampling
    @return: random search object
    '''
    rnd_search = RandomizedSearchCV(clf, param_distributions=param_distribution,
                                    n_iter=n_iter_search,
                                    pre_dispatch='2*n_jobs', n_jobs=4)
    rnd_search.fit(X_train, y_train)
    return rnd_search
def rf_cv(fv_train, target_train, fv_test, target_test):
    ####---- cross validation of train dataset, gridsearch the best parameters for random forest

    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto", "log2", None],
                        "class_weight": [None, 'balanced']}

    scores = ['recall_macro']
    n_iter_search = 20

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        mycv = StratifiedKFold(target_train, n_folds=5)
        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), tuned_parameters,
                                 cv=mycv, n_iter=n_iter_search, scoring='%s' % score)
        clf.fit(fv_train, target_train)
        report_cv(clf, fv_test, target_test)
def train_classifier(self, trainvectors, labels, c='', kernel='', gamma='', degree='', class_weight='', iterations=10):
    if len(self.label_encoder.classes_) > 2:  # more than two classes to distinguish
        parameters = ['estimator__C', 'estimator__kernel', 'estimator__gamma', 'estimator__degree']
        multi = True
    else:  # only two classes to distinguish
        parameters = ['C', 'kernel', 'gamma', 'degree']
        multi = False
    c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == '' else [float(x) for x in c.split()]
    kernel_values = ['linear', 'rbf', 'poly'] if kernel == '' else [k for k in kernel.split()]
    gamma_values = [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048] if gamma == '' else [float(x) for x in gamma.split()]
    degree_values = [1, 2, 3, 4] if degree == '' else [int(x) for x in degree.split()]
    grid_values = [c_values, kernel_values, gamma_values, degree_values]
    if not False in [len(x) == 1 for x in grid_values]:  # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
        if class_weight == '':
            class_weight = 'balanced'
    else:
        iterations = int(iterations)
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = svm.SVC(probability=True)
        if multi:
            model = OutputCodeClassifier(model)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,
                                         n_iter=iterations, n_jobs=10, pre_dispatch=4)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        settings = paramsearch.best_params_
    # train an SVC classifier with the settings that led to the best performance
    self.model = svm.SVC(
        probability=True,
        C=settings[parameters[0]],
        kernel=settings[parameters[1]],
        gamma=settings[parameters[2]],
        degree=settings[parameters[3]],
        class_weight=class_weight,
        cache_size=1000,
        verbose=2
    )
    # if multi:
    #     self.model = OutputCodeClassifier(self.model)
    #     trainvectors = trainvectors.todense()
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def train_classifier(self, trainvectors, labels, c='', solver='', dual='', penalty='', multiclass='', max_iterations=1000, iterations=10):
    if len(self.label_encoder.classes_) > 2:  # more than two classes to distinguish
        parameters = ['estimator__C', 'estimator__solver', 'estimator__penalty', 'estimator__dual', 'estimator__multi_class']
        # multi = True
    else:  # only two classes to distinguish
        parameters = ['C', 'solver', 'penalty', 'dual', 'multi_class']
        # multi = False
    c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == '' else [float(x) for x in c.split()]
    solver_values = ['newton-cg', 'lbfgs', 'liblinear', 'sag'] if solver == '' else [s for s in solver.split()]
    if penalty == '':
        if not set(['newton-cg', 'lbfgs', 'sag']) & set(solver_values):
            penalty_values = ['l1', 'l2']
        else:
            penalty_values = ['l2']
    else:
        penalty_values = [penalty]
    if dual == '':
        if len(solver_values) == 1 and solver_values[0] == 'liblinear':
            if len(penalty_values) == 1 and penalty_values[0] == 'l2':
                dual_values = [True, False]
            else:
                dual_values = [False]
    else:
        dual_values = [int(dual)]  # 1 or 0
    if multiclass == '':
        if 'liblinear' not in solver_values:
            multiclass_values = ['ovr', 'multinomial']
        else:
            multiclass_values = ['ovr']
    else:
        multiclass_values = [multiclass]
    grid_values = [c_values, solver_values, penalty_values, dual_values, multiclass_values]
    max_iterations = int(max_iterations)
    if not False in [len(x) == 1 for x in grid_values]:  # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
    else:  # try different parameter combinations
        iterations = int(iterations)
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = LogisticRegression(max_iter=max_iterations)
        # if multi:
        #     model = OutputCodeClassifier(model)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,
                                         n_iter=iterations, n_jobs=10, pre_dispatch=4)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        settings = paramsearch.best_params_
    # train a logistic regression classifier with the settings that led to the best performance
    self.model = LogisticRegression(
        C=settings[parameters[0]],
        solver=settings[parameters[1]],
        penalty=settings[parameters[2]],
        dual=settings[parameters[3]],
        multi_class=settings[parameters[4]],
        max_iter=max_iterations,
        verbose=2
    )
    # if multi:
    #     self.model = OutputCodeClassifier(self.model)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def search_best_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=do_nothing)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ])

    ############# initialize the search
    parameters = {
        'vect__max_features': (2000, 3000, 4000),
        'rf__n_estimators': range(300, 1200, 100),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(10, 100, 10),
        'rf__min_samples_split': range(10, 100, 10),
    }
    validate_split = PredefinedSplit(test_fold=make_train_validate_split(len(ytrain_raw)))
    scoring_method = "roc_auc"
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=parameters,
                                  n_iter=200,
                                  scoring=scoring_method,
                                  n_jobs=-1,
                                  verbose=1,
                                  cv=validate_split)

    ############# search
    print "#################### search cv begins"
    searchcv.fit(Xtrain_raw, ytrain_raw)
    print "#################### search cv ends"
    print "best {}: {}".format(scoring_method, searchcv.best_score_)
    print "best parameters: ", searchcv.best_params_

    ############# check the best model
    bestpipeline = searchcv.best_estimator_
    common.dump_predictor("pipeline_rf.pkl", bestpipeline)

    rf = bestpipeline.steps[-1][1]
    print "RF's OOB score: {}".format(rf.oob_score_)

    # words = bestpipeline.steps[0][1].get_feature_names()
    # feat_importances = zip(words, rf.feature_importances_)
    # feat_importances.sort(key=lambda t: -t[1])
    # print feat_importances

    ############# training error analysis
    ytrain_predict = bestpipeline.predict(Xtrain_raw)
    print_classification_report('Training Data', ytrain_raw, ytrain_predict)

    ############# test error analysis
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = bestpipeline.predict(Xtest_raw)
    print_classification_report('Testing Data', ytest_raw, ytest_predict)
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    # split our data into training and test datasets
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.33, random_state=8)

    classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    # for simplicity's sake, we could train a single random forest:
    # classifier.fit(xTrain, yTrain)
    # print classifier.score(xTest, yTest)

    # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV
    parametersToTry = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False]
    }

    # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more
    # efficient and comprehensive than GridSearchCV.
    # run on all cores, fail gracefully if a combination of hyperparameters fails to converge,
    # try 10 different combinations of hyperparameters, train on all the training data when
    # finished, and use a third of the dataset for cross-validation while searching for the
    # best hyperparameters
    searchCV = RandomizedSearchCV(classifier, parametersToTry, n_jobs=-1, error_score=0,
                                  n_iter=10, refit=True, cv=3)

    print 'shape of this training data set:'
    print xTrain.shape
    searchCV.fit(xTrain, yTrain)
    print 'the best hyperparameters from this search are:'
    print searchCV.best_params_
    print 'best score from hyperparameter search is: ' + str(searchCV.best_score_)
    print 'score on the holdout portion of the training set: ' + str(searchCV.score(xTest, yTest))
    print 'score on the ensemble data: ' + str(searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n'

    testPredictions = searchCV.predict_proba(testTweetsAll)
    ensemblePredictions = searchCV.predict_proba(ensembleTweets)

    def singlePrediction(predictions):
        cleanedPredictions = []
        for predictionRow in predictions:
            cleanedPredictions.append(predictionRow[1])
        return cleanedPredictions

    # the classifier gives us a predicted probability for both the 0 and the 1 case.
    # Given that they're mutually exclusive, we can simplify down to a single number
    # (the predicted probability of the 1 case)
    testPredictions = singlePrediction(testPredictions)
    ensemblePredictions = singlePrediction(ensemblePredictions)

    return testPredictions, ensemblePredictions
def tune(insights, x_train, y_train, x_test, y_test, models='all', requirements=None, maximize=False):
    if requirements is None:
        requirements = requirements_bare_minimum(y_train)

    # do vanilla models satisfy the requirements?
    # assuming decision tree is the most intuitive, then logistic regression and then random forest
    # TODO: extend this to metrics other than accuracy using the confusion matrix
    for model_name in ['dt', 'lr', 'rf']:
        model_insights = insights[model_name]
        model_variation = np.std(model_insights['accuracy_folds'])
        if check_requirements(model_insights, requirements) and not maximize:
            pass
            # TODO: turn this back on
            # return model_name

    # model selection and tuning loop
    models_to_train = []
    if models == 'all':
        models_to_train += models_linear + models_nonlinear_cheap + models_nonlinear_expensive
    elif models == 'linear':
        models_to_train += models_online
    elif models == 'cheap':
        models_to_train += models_linear + models_nonlinear_cheap

    # TODO: using all of the training data, need to use less data if runtime for insights models is large (how large?)
    for model in models_to_train:
        # TODO: add the looping logic
        if model == LogisticRegression:
            number_configurations = np.prod(np.array([len(_) for _ in hyperparameters[model]]))
            random_search_iterations = np.min([random_search_iterations_max, number_configurations])
            random_search = RandomizedSearchCV(model(n_jobs=-1, random_state=random_state),
                                               param_distributions=hyperparameters[model],
                                               n_iter=random_search_iterations,
                                               n_jobs=-1,
                                               random_state=0)
            runtime = time()
            random_search.fit(x_train, y_train)
            runtime = time() - runtime

            info = dict()
            info['runtime'] = runtime
            # info['accuracy'] = min(scores)
            # info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
            # info['accuracy_folds'] = scores
            # info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)
            # clf.fit(x_train, y_train)
            # fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
            # info['fpr'] = fpr
            # info['tpr'] = tpr
            # info['auc'] = auc(fpr, tpr)

            return random_search

    return None
def generate_model(data, classes, args):
    # Define the parameters
    tuned_parameters = {'C': C_RANGE, 'class_weight': CLASS_WEIGHTS}

    # Define the classifier
    if args.kernel == 'rbf':
        clf = svm.SVC(cache_size=CACHE_SIZE)
        tuned_parameters['gamma'] = GAMMA_RANGE
    else:
        clf = svm.LinearSVC(dual=False)

    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)

    # Generate the K-fold development
    skf = cross_validation.StratifiedKFold(classes, n_folds=K_FOLD, shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)

    # Generate the grid search
    if args.search == 'grid':
        gscv = grid_search.GridSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                        n_jobs=1, verbose=get_verbose_level())
    else:
        gscv = grid_search.RandomizedSearchCV(clf, tuned_parameters, cv=skf, scoring='f1',
                                              n_jobs=1, verbose=get_verbose_level(),
                                              n_iter=args.iter)

    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)

    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose("%0.6f (+/-%0.06f) for %r"
                      % (mean_score, scores.std() / 2, params), 5)

    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)

    return gscv