Python sklearn.grid_search module: GridSearchCV() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.grid_search.GridSearchCV().
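Note that sklearn.grid_search is the legacy home of this class: it was deprecated in scikit-learn 0.18 and removed in 0.20, where sklearn.model_selection.GridSearchCV takes its place with an essentially identical interface. Many of the examples below therefore use older APIs (Python 2 print statements, sklearn.cross_validation, grid_scores_). A minimal usage sketch against the legacy API (the estimator, grid, and dataset here are illustrative, not taken from any project below):

# Minimal GridSearchCV sketch (legacy sklearn.grid_search API; estimator,
# grid, and dataset are illustrative).
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris

iris = load_iris()
grid = GridSearchCV(SVC(), param_grid={'C': [0.1, 1, 10]}, cv=5)
grid.fit(iris.data, iris.target)            # cross-validates, then refits on all data
print(grid.best_params_, grid.best_score_)  # best grid point and its mean CV score
# the refit model is available as grid.best_estimator_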

Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def model_random_forecast(Xtrain,Xtest,ytrain):

    X_train = Xtrain
    y_train = ytrain
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000]}
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]}
    model = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Random forest regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_
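The RMSE object passed as scoring above is a custom scorer defined elsewhere in the project's file, outside this extract. A plausible definition (an assumption, not confirmed by the extract) that matches the -model.best_score_ sign handling:

# Assumed definition of the RMSE scorer referenced above; the original
# file defines it outside this extract.
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

RMSE = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)),
                   greater_is_better=False)  # scores come back negated, hence -model.best_score_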
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def model_extra_trees_regression(Xtrain,Xtest,ytrain):

    X_train = Xtrain
    y_train = ytrain

    etr = ExtraTreesRegressor(n_jobs=1, random_state=0)
    param_grid = {}  # 'n_estimators': [500], 'max_features': [10,15,20]
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_


# read data, build model and do prediction
# read train data
Project: sef    Author: passalis    | project source | file source
def evaluate_svm(train_data, train_labels, test_data, test_labels, n_jobs=-1):
    """
    Evaluates a representation using a Linear SVM
    It uses 3-fold cross validation for selecting the C parameter
    :param train_data:
    :param train_labels:
    :param test_data:
    :param test_labels:
    :param n_jobs:
    :return: the test accuracy
    """

    # Scale data to 0-1
    scaler = MinMaxScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    parameters = {'kernel': ['linear'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
    model = svm.SVC(max_iter=10000)
    clf = grid_search.GridSearchCV(model, parameters, n_jobs=n_jobs, cv=3)
    clf.fit(train_data, train_labels)
    lin_svm_test = clf.score(test_data, test_labels)
    return lin_svm_test
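A minimal driver for evaluate_svm, assuming the module's own imports (MinMaxScaler, svm, grid_search); the dataset and split below are illustrative:

# Hypothetical usage of evaluate_svm on a built-in dataset.
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split  # legacy module, like sklearn.grid_search

digits = load_digits()
X_tr, X_te, y_tr, y_te = train_test_split(digits.data, digits.target,
                                          test_size=0.3, random_state=0)
print(evaluate_svm(X_tr, y_tr, X_te, y_te))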
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def model_gradient_boosting_tree(Xtrain,Xtest,ytrain):

    X_train = Xtrain
    y_train = ytrain 
    gbr = GradientBoostingRegressor(random_state=0)
    param_grid = {
        'n_estimators': [800,1500],
        'max_features': [20,15],
        'max_depth': [8,10],
        'learning_rate': [0.1],
       'subsample': [1]
    }
    model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Gradient boosted tree regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_


# read data, build model and do prediction
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        #Set the parameters by cross-validation
        tuned_parameters = [{'max_depth': range(20,60),
                             'n_estimators': range(10,40),
                             'max_features': ['sqrt', 'log2', None]
                             }
                            ]

        clf = GridSearchCV(RandomForestRegressor(n_estimators=30), tuned_parameters, cv=5, scoring='mean_squared_error')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "MSE for test data set:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print mean_squared_error(y_true, y_pred)
Project: quoll    Author: LanguageMachines    | project source | file source
def train_classifier(self, trainvectors, labels, no_label_encoding=False, alpha='default', fit_prior=True, iterations=10):
        if alpha == '':
            paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=numpy.linspace(0,2,20)[1:]), n_jobs=6)
            paramsearch.fit(trainvectors,self.label_encoder.transform(labels))
            selected_alpha = paramsearch.best_estimator_.alpha
        elif alpha == 'default':
            selected_alpha = 1.0
        else:
            selected_alpha = float(alpha)
        if fit_prior == 'False':
            fit_prior = False
        else:
            fit_prior = True
        self.model = naive_bayes.MultinomialNB(alpha=selected_alpha,fit_prior=fit_prior)
        if no_label_encoding:
            self.model.fit(trainvectors, labels)
        else:
            self.model.fit(trainvectors, self.label_encoder.transform(labels))
Project: kaggle-cooking    Author: fpoli    | project source | file source
def training(X, y, model, param_grid={}, cv=None, n_jobs=1):
    print "Training the model..."

    gs = grid_search.GridSearchCV(
        estimator = model,
        param_grid = param_grid,
        verbose = 5,
        cv = cv,
        n_jobs = n_jobs,
        refit = True
    )

    gs.fit(X, y)

    print "#"
    print "# Best score:", gs.best_score_
    best_parameters = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print "#  {0}: {1}".format(param_name, best_parameters[param_name])
    print "#"

    return gs.best_estimator_
Project: DataMining    Author: lidalei    | project source | file source
def grid_search_gamma(rbf_svm, X, y):
    ## grid search - gamma only
    # search a fine logarithmic grid over gamma
    param_grid = {'gamma': np.logspace(-15, 4, num = 5000, base = 2.0)}
    grid_search = GridSearchCV(rbf_svm, param_grid = param_grid, scoring = 'roc_auc',
                               cv = 10, pre_dispatch = '2*n_jobs', n_jobs = -1)
    # re-fit on the whole training data
    grid_search.fit(X, y)
    grid_search_scores = [score[1] for score in grid_search.grid_scores_]
    print('Best parameters : {}'.format(grid_search.best_params_))
    print('Best score : {}'.format(grid_search.best_score_))

    # set canvas
    fig, ax = plt.subplots(1, 1)
    # ax.scatter(X[:, 0], X[:, 1], c = y)
    ax.plot(param_grid['gamma'], grid_search_scores)
    ax.set_title('AUC = f(gamma, C = 1.0)', fontsize = 'large')
    ax.set_xlabel('gamma', fontsize = 'medium')
    ax.set_ylabel('AUC', fontsize = 'medium')

    return fig
Project: hugo_similar_posts    Author: elbaulp    | project source | file source
def gridSearch(data, params, true_k):

    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])
    gsTfIdf = GridSearchCV(
        lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'max_features': ['sqrt', 'log2', None],
                             'max_depth': range(2,1000),
                             }
                            ]


        reg = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
        reg.fit(self.X_train, self.y_train)

        print "Best parameters set found on development set:\n"
        print reg.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in reg.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "MSE for test data set:\n"
        y_true, y_pred = self.y_test, reg.predict(self.X_test)
        print mean_squared_error(y_true, y_pred)
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'weights': ['uniform', 'distance'],
                             'n_neighbors': range(2,100)
                             }
                            ]


        reg = GridSearchCV(neighbors.KNeighborsRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
        reg.fit(self.X_train, self.y_train)

        print "Best parameters set found on development set:\n"
        print reg.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in reg.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print reg.scorer_

        print "MSE for test data set:"
        y_true, y_pred = self.y_test, reg.predict(self.X_test)
        print mean_squared_error(y_true, y_pred)
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'alpha': np.logspace(-5,5)
                             }
                            ]


        reg = GridSearchCV(linear_model.Ridge(alpha = 0.5), tuned_parameters, cv=5, scoring='mean_squared_error')
        reg.fit(self.X_train, self.y_train)

        print "Best parameters set found on development set:\n"
        print reg.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in reg.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print reg.scorer_

        print "MSE for test data set:"
        y_true, y_pred = self.y_test, reg.predict(self.X_test)
        print mean_squared_error(y_true, y_pred)
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'kernel': ['rbf'],
                             'gamma': np.logspace(-4, 3, 30),
                             'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
                             {'kernel': ['poly'],
                              'degree': [1, 2, 3, 4],
                              'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                              'coef0': np.logspace(-4, 3, 30)},
                            {'kernel': ['linear'],
                             'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]

        clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "Detailed classification report:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print classification_report(y_true, y_pred)
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'weights': ['uniform', 'distance'],
                             'n_neighbors': range(2,60)
                             }
                            ]


        clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "Detailed classification report:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print classification_report(y_true, y_pred)
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'max_depth': range(20,60),
                             'n_estimators': range(10,40),
                             'max_features': ['sqrt', 'log2', None]
                             }
                            ]

        clf = GridSearchCV(RandomForestClassifier(n_estimators=30), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "Detailed classification report:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print classification_report(y_true, y_pred)
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'max_depth': range(2,60),
                             'max_features': ['sqrt', 'log2', None]
                             }
                            ]

        clf = GridSearchCV(DecisionTreeClassifier(max_depth=5), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "Detailed classification report:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print classification_report(y_true, y_pred)
Project: jingjuSingingPhraseMatching    Author: ronggong    | project source | file source
def grid_search(clf, X, y):
    params = dict(anova_filter__k=[50, 100, 'all'],
                  xgb__max_depth=[3, 5, 10], xgb__n_estimators=[50, 100, 300, 500],
                  xgb__learning_rate=[0.05, 0.1])
    gs = GridSearchCV(clf, param_grid=params, n_jobs=4, cv=10, verbose=2)
    gs.fit(X, y)

    print "Best estimator:"
    print gs.best_estimator_
    print "Best parameters:"
    print gs.best_params_
    print "Best score:"
    print gs.best_score_

    y_pred = gs.predict(X)
    y_test = y
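The anova_filter__k and xgb__* keys imply that clf is a two-step Pipeline with steps named anova_filter and xgb; a plausible construction (assuming xgboost's scikit-learn wrapper, which the extract does not show) would be:

# Assumed construction of the clf expected by grid_search above.
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier  # assumes the xgboost package is installed

clf = Pipeline([('anova_filter', SelectKBest(f_classif)),
                ('xgb', XGBClassifier())])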
Project: skggm    Author: skggm    | project source | file source
def quic_graph_lasso_cv(X, metric):
    '''Run QuicGraphLassoCV on data with metric of choice.

    Compare results with GridSearchCV + quic_graph_lasso.  The number of
    lambdas tested should be much lower with similar final lam_ selected.
    '''
    print('QuicGraphLassoCV with:')
    print('   metric: {}'.format(metric))
    model = QuicGraphLassoCV(
            cv=2,  # can't handle more folds at this small sample size
            n_refinements=6,
            n_jobs=1,
            init_method='cov',
            score_metric=metric)
    model.fit(X)
    print('   len(cv_lams): {}'.format(len(model.cv_lams_)))
    print('   lam_scale_: {}'.format(model.lam_scale_))
    print('   lam_: {}'.format(model.lam_))
    return model.covariance_, model.precision_, model.lam_
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard    | project source | file source
def kde_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = (1 + df["hour"]) * 3.92105
        df_new["weekday"] = (1 + df["weekday"]) * 4.28947
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x)) * 9.44736
        df_new["x"] = df["x"] * 424.489
        df_new["y"] = df["y"] * 959.183
        return df_new
    logging.info("train kde_opt1 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    Xte = df_cell_test_feats_kde.values
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i].values
        cstd = np.std(np.sum(np.abs(X), axis=1))
        gridcv = GridSearchCV(KernelDensity(kernel='gaussian', metric='manhattan'), {'bandwidth': cstd * np.logspace(-1, 1, 10)}, cv=5)
        gridcv.fit(X)
        y_test_pred[:, i] += np.exp(gridcv.best_estimator_.score_samples(Xte))
    return y_test_pred
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | project source | file source
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | project source | file source
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf
Project: Steal-ML    Author: ftramer    | project source | file source
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)

        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print len(x), q, sm.accuracy_score(test_y, pred_y)
Project: Steal-ML    Author: ftramer    | project source | file source
def grid_retrain_in_f(self, n_dim=500):
        rbf_map = RBFSampler(n_dim, random_state=1)
        fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                                ("svm", LinearSVC())])

        # C_range = np.logspace(-5, 15, 21, base=2)
        # gamma_range = np.logspace(-15, 3, 19, base=2)
        # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
        # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
        # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
        # grid.fit(X, Y)
        #
        # rbf_svc2 = grid.best_estimator_

        rbf_svc2 = fourier_approx_svm
        rbf_svc2.fit(self.X_ex, self.y_ex)

        self.set_clf2(rbf_svc2)
        return self.benchmark()
Project: Steal-ML    Author: ftramer    | project source | file source
def grid_search(self):
        C_range = np.logspace(-5, 15, 21, base=2)
        param_grid = dict(C=C_range)
        cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(SVC(kernel='poly', max_iter=10000), param_grid=param_grid, cv=cv, n_jobs=1, verbose=0)

        logger.info('start grid search for Poly')
        grid.fit(self.X_ex, self.y_ex)
        logger.info('end grid search for Poly')

        scores = [x[1] for x in grid.grid_scores_]

        # final train
        clf = grid.best_estimator_

        pred_train = clf.predict(self.X_ex)
        pred_val = clf.predict(self.val_x)
        pred_test = clf.predict(self.test_x)

        r = Result(self.name + ' (X)', 'Poly', len(self.X_ex),
                   sm.accuracy_score(self.y_ex, pred_train),
                   sm.accuracy_score(self.val_y, pred_val),
                   sm.accuracy_score(self.test_y, pred_test))
        return r
Project: Parkinsons-Vocal-Analysis-Model    Author: WilliamY97    | project source | file source
def fit_model(X, y):

    classifier = svm.SVC()

    parameters = {'kernel':['poly', 'rbf', 'sigmoid'], 'degree':[1, 2, 3], 'C':[0.1, 1, 10]}


    f1_scorer = make_scorer(performance_metric,
                                   greater_is_better=True)

    clf = GridSearchCV(classifier,
                       param_grid=parameters,
                       scoring=f1_scorer)

    clf.fit(X, y)

    return clf


# Read student data
Project: color-features    Author: skearnes    | project source | file source
def get_model():
    if FLAGS.model == 'logistic':
        return linear_model.LogisticRegressionCV(class_weight='balanced',
                                                 scoring='roc_auc',
                                                 n_jobs=FLAGS.n_jobs,
                                                 max_iter=10000, verbose=1)
    elif FLAGS.model == 'random_forest':
        return ensemble.RandomForestClassifier(n_estimators=100,
                                               n_jobs=FLAGS.n_jobs,
                                               class_weight='balanced',
                                               verbose=1)
    elif FLAGS.model == 'svm':
        return grid_search.GridSearchCV(
            estimator=svm.SVC(kernel='rbf', gamma='auto',
                              class_weight='balanced'),
            param_grid={'C': np.logspace(-4, 4, 10)}, scoring='roc_auc',
            n_jobs=FLAGS.n_jobs, verbose=1)
    else:
        raise ValueError('Unrecognized model %s' % FLAGS.model)
Project: machine-learning    Author: cinserra    | project source | file source
def transform_pca(clf_list):
    '''
    From classifier list to pipeline list of the same classifiers and PCA.
    '''

    pca = PCA()
    params_pca = {"pca__n_components":[2, 3, 4, 5, 10, 15, 20], "pca__whiten": [False]}

    for j in range(len(clf_list)):

        name = "clf_" + str(j)
        clf, params = clf_list[j]

        # Parameters in GridSearchCV need to have double underscores
        # between specific classifiers.
        new_params = {}
        for key, value in params.iteritems():
            new_params[name + "__" + key] = value

        new_params.update(params_pca)
        clf_list[j] = (Pipeline([("pca", pca), (name, clf)]), new_params)

    return clf_list
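For example (illustrative input), passing a single SVC with a small grid through transform_pca yields a PCA pipeline whose grid keys carry the step-name prefixes:

# Illustrative input/output for transform_pca (assumes the imports above).
from sklearn.svm import SVC

clf_list = [(SVC(), {'C': [1, 10]})]
clf_list = transform_pca(clf_list)
# clf_list[0] is now (Pipeline([('pca', PCA()), ('clf_0', SVC())]),
#                     {'clf_0__C': [1, 10],
#                      'pca__n_components': [2, 3, 4, 5, 10, 15, 20],
#                      'pca__whiten': [False]})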
Project: real_estate    Author: cooperoelrichs    | project source | file source
def do(data_file_path, outputs_dir, xy_class, model_class, params):
        data = ModelAnalysis.read_data(data_file_path)
        xy = xy_class(data, exclude_suburb=False)

        model = model_class(
            xy.X.values, xy.y.values,
            xy.X.columns.values, data,
        )

        grid = GridSearchCV(
            model.model,
            params,
            scoring='mean_squared_error',
            cv=KFold(xy.y.values.shape[0]),
            verbose=0,
            n_jobs=4
        )

        grid.fit(xy.X.values, xy.y.values)
        model.model = grid.best_estimator_
        return grid.best_params_, model
Project: policosm    Author: ComplexCity    | project source | file source
def getKernelDensityEstimation(nodes, metric='euclidean', metric_params=None, bbox=None, bandwidth=0.002, optimizeBandwidth=False, bwmin=0.0001, bwmax=0.01, crossValidation=20):
    lon = []
    lat = []
    for nlon,nlat in nodes:
        lon.append(nlon)
        lat.append(nlat)
    lon = np.array(lon)
    lat = np.array(lat)

    if bbox is None:
        xmin, xmax = min(lon), max(lon)
        ymin, ymax = min(lat), max(lat)
        bbox = [xmin, xmax, ymin, ymax]
    else:
        xmin, ymin, xmax, ymax = bbox[0],bbox[1],bbox[2],bbox[3]
        bbox = [xmin, xmax, ymin, ymax]

    x, y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([x.ravel(), y.ravel()])
    values = np.vstack([lon, lat])

    if optimizeBandwidth:
        grid = GridSearchCV(KernelDensity(kernel='gaussian', metric=metric, metric_params=metric_params, algorithm='ball_tree'), {'bandwidth': np.linspace(bwmin, bwmax, 30)}, cv=crossValidation)  # 20-fold cross-validation by default
        grid.fit(values.T)  # values is (2, n); fit expects (n_samples, n_features)

        bandwidth = grid.best_params_['bandwidth']
        kernel = grid.best_estimator_
    else:
        kernel = KernelDensity(kernel='gaussian', metric=metric, metric_params=metric_params, algorithm='ball_tree', bandwidth=bandwidth)
        kernel.fit(values.T)

    return kernel, positions, x, y, bbox, bandwidth
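A minimal call, assuming nodes is an iterable of (lon, lat) pairs; the coordinates below are illustrative:

# Hypothetical usage of getKernelDensityEstimation.
import random
import numpy as np

nodes = [(random.uniform(2.2, 2.4), random.uniform(48.8, 48.9)) for _ in range(200)]
kernel, positions, x, y, bbox, bw = getKernelDensityEstimation(nodes, optimizeBandwidth=True)
densities = np.exp(kernel.score_samples(positions.T))  # density over the 100x100 grid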
Project: Stock-Prediction-Time-Series-Analysis-Python    Author: Nekooeimehr    | project source | file source
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2],"gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5 ,param_grid=Grid_Dict, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    KRR_MSE = KernelRidge(kernel='rbf', alpha=krr_Tuned.best_params_['alpha'], gamma=krr_Tuned.best_params_['gamma'])
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', KRR_Time)
    MSEs_KRR = cross_validation.cross_val_score(KRR_MSE, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    MeanMSE_KRR = np.mean(list(MSEs_KRR))
    print('The average MSE of Kernel Ridge Regression for ', n, ' examples is: ', (-1*MeanMSE_KRR))
    return(MeanMSE_KRR, krr_Tuned)
Project: jamespy_py3    Author: jskDr    | project source | file source
def _gs_SVC_r0(xM, yVc, params):
    """
    Since classification is considered, we use yVc, which contains discrete
    class labels, whereas yV may contain floating-point values.
    """

    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    kf5 = cross_validation.KFold(xM.shape[0], n_folds=5, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)

    gs.fit(xM, yVc)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_SVC(xM, yVc, params, n_folds=5):
    """
    Since classification is considered, we use yVc, which contains discrete
    class labels, whereas yV may contain floating-point values.
    """

    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    kf5 = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)

    gs.fit(xM, yVc)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1):
    """
    A_list is a list of similarity matrices A.
    XX holds the concatenated linear descriptors; pass None if none are used.
    """

    clf = binary_model.BIKE_Ridge(A_list, XX)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules

    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, scoring='r2', cv=kf_n, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_Ridge(xM, yV, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1, scoring='r2'):
    """
    Parameters
    ----------
    scoring: mean_absolute_error, mean_squared_error, median_absolute_error, r2
    """
    print(xM.shape, yV.shape)

    clf = linear_model.Ridge()
    params = {'alpha': np.logspace(*alphas_log)}
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, scoring=scoring, cv=kf_n, n_jobs=n_jobs)

    gs.fit(xM, yV)

    return gs
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def regressorOp(x, y):
    """
    This will optimize the parameters for the algo
    """
    regr_rbf = svm.SVR(kernel="rbf")
    C = [1000, 10, 1]
    gamma = [0.005, 0.004, 0.003, 0.002, 0.001]
    epsilon = [0.1, 0.01]
    parameters = {"C":C, "gamma":gamma, "epsilon":epsilon}

    gs = grid_search.GridSearchCV(regr_rbf, parameters, scoring="r2")   
    gs.fit(x, y)

    print "Best Estimator:\n", gs.best_estimator_
    print "Type: ", type(gs.best_estimator_)

    return gs.best_estimator_
Project: DeepLearning_VirtualReality_BigData_Project    Author: rashmitripathi    | project source | file source
def testIrisDNN(self):
    if HAS_SKLEARN:
      random.seed(42)
      iris = datasets.load_iris()
      feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
      classifier = learn.DNNClassifier(
          feature_columns=feature_columns,
          hidden_units=[10, 20, 10],
          n_classes=3)
      grid_search = GridSearchCV(
          classifier, {'hidden_units': [[5, 5], [10, 10]]},
          scoring='accuracy',
          fit_params={'steps': [50]})
      grid_search.fit(iris.data, iris.target)
      score = accuracy_score(iris.target, grid_search.predict(iris.data))
      self.assertGreater(score, 0.5, 'Failed with score = {0}'.format(score))
Project: extract    Author: dblalock    | project source | file source
def gridSearchPipeline(pipeline, paramsGrid, Xtrain, Ytrain, **cvParams):
    print("Grid Searching pipeline:")
    print(pipeline)

    # use 5-fold stratified cross-validation by default to maintain
    # consistent class balance across training and testing
    if 'cv' not in cvParams:
        # print "Ytrain: ", Ytrain
        # numClasses = len(np.unique(Ytrain))
        # examplesPerClass = len(Ytrain) / numClasses
        # nFolds = max(5, examplesPerClass / 5)
        # if nFolds < 5:
        # if True:
            # r, c = Ytrain.shape
            # print "tiny Ytrain size: (%d, %d)" % Ytrain.shape # (r, c)
            # for row in Ytrain: print row
        # cvParams['cv'] = StratifiedKFold(Ytrain, n_folds=nFolds)
        cvParams['cv'] = StratifiedKFold(Ytrain, n_folds=5)

    cv = GridSearchCV(pipeline, paramsGrid, **cvParams)
    cv.fit(Xtrain, Ytrain)
    return cv
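A sketch of how gridSearchPipeline might be called, assuming a scaler-plus-SVM pipeline; the pipeline, grid, and data below are illustrative:

# Hypothetical call to gridSearchPipeline.
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

digits = load_digits()
pipe = Pipeline([('scale', StandardScaler()), ('svm', LinearSVC())])
cv = gridSearchPipeline(pipe, {'svm__C': [0.1, 1, 10]}, digits.data, digits.target)
print(cv.best_params_)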
Project: pybot    Author: spillai    | project source | file source
def fit(self, X, y, test_size=0.3):
        # Grid search cross-val (best C param)
        cv = ShuffleSplit(len(X), n_iter=1, test_size=test_size, random_state=self.seed_)
        clf_cv = GridSearchCV(self.clf_base_, self.clf_hyparams_, cv=cv, n_jobs=-1, verbose=4)

        print('====> Training Classifier (with grid search hyperparam tuning) .. ')
        print('====> BATCH Training (in-memory): {:4.3f} MB'.format(X.nbytes / 1024.0 / 1024.0) )
        clf_cv.fit(X, y)
        print('BEST: {}, {}'.format(clf_cv.best_score_, clf_cv.best_params_))

        # Setting clf to best estimator
        self.clf_ = clf_cv.best_estimator_

        # # Calibrating classifier
        # print('Calibrating Classifier ... ')
        # self.clf_prob_ = CalibratedClassifierCV(self.clf_, cv=cv, method='sigmoid')
        # self.clf_prob_.fit(X, y)        

        # # Setting clf to best estimator
        # self.clf_ = clf_cv.best_estimator_
        # pred_targets = self.clf_.predict(X)

        if self.epoch_no_ % 10 == 0: 
            self.save(self.filename_.replace('.h5', '_iter_{}.h5'.format(self.epoch_no_)))
        self.save(self.filename_)
        self.epoch_no_ += 1
Project: TGIF-Release    Author: raingo    | project source | file source
def main():

    import sys
    import os.path as osp
    import numpy as np
    from sklearn import cross_validation, grid_search, metrics, svm
    from sklearn.ensemble import RandomForestClassifier
    import cPickle

    data_dir = sys.argv[1]

    fet_list = load_list(osp.join(data_dir, 'c3d.list'))
    pos_list = load_list(osp.join(data_dir, 'pos.urls'))

    features = np.load(osp.join(data_dir, 'c3d.npy'))
    fet_set = set(fet_list)

    pos_idx = [fet_list.index(i) for i in pos_list if i in fet_set]

    y = np.zeros(features.shape[0])
    y[pos_idx] = 1

    print 'n_pos', np.sum(y), 'n_neg', np.sum(1 - y)

    params = {'n_estimators':[2, 4, 5, 6, 8, 10, 30]}
    #params = {'n_estimators':[50, 70, 100, 120, 150, 200]}
    clf = grid_search.GridSearchCV(RandomForestClassifier(n_estimators = 2, n_jobs = 4), params, scoring = metrics.make_scorer(lambda yt, yp: metrics.f1_score(yt, yp, pos_label = 0)), cv = 5)
    clf.fit(features, y)
    print clf.best_score_
    print clf.best_estimator_
    cPickle.dump(clf.best_estimator_, open(osp.join(data_dir, 'c3d-models-rfc.pkl'), 'w'))
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def runGridSearch(self, model):
        logging.debug("run grid search on model: {}".format(model.__class__.__name__))
        logging.debug("cross validation strategy: {}".format(model.holdout_split))
        logging.debug("used features: {}".format(model.usedFeatures))
        logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))

        features,labels,cv = model.getFeaturesLabel()
        # do grid search
        if self.do_random_gridsearch:
            estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                       scoring=mean_absolute_percentage_error_scoring, verbose = 500, n_iter=self.n_iter_randomsearch)
        else:
            estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                                     fit_params=model.get_fit_params(),
                                     scoring=mean_absolute_percentage_error_scoring, verbose=500)
        estimator.fit(features, labels)
        model.clf = estimator.best_estimator_
        model.save_final_model = True
        model.save_model()

#         model.dispFeatureImportance()
        logging.debug('estimator parameters: {}'.format(estimator.get_params()))
        logging.debug('Best parameters: {}'.format(estimator.best_params_))
        logging.debug('Best Scores: {}'.format(-estimator.best_score_))
        logging.debug('Score grid: {}'.format(estimator.grid_scores_ ))
        for i in estimator.grid_scores_ :
            logging.debug('parameters: {}'.format(i.parameters ))
            logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
            logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores) ))



        return
Project: quoll    Author: LanguageMachines    | project source | file source
def train_classifier(self, trainvectors, labels, alpha='default', fit_prior=True, iterations=10):
        if alpha == '':
            paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=numpy.linspace(0,2,20)[1:]), n_jobs=6)
            paramsearch.fit(trainvectors,self.label_encoder.transform(labels))
            selected_alpha = paramsearch.best_estimator_.alpha
        elif alpha == 'default':
            selected_alpha = 1.0
        else:
            selected_alpha = float(alpha)
        if fit_prior == 'False':
            fit_prior = False
        else:
            fit_prior = True
        self.model = naive_bayes.MultinomialNB(alpha=selected_alpha,fit_prior=fit_prior)
        self.model.fit(trainvectors, self.label_encoder.transform(labels))
Project: quoll    Author: LanguageMachines    | project source | file source
def train_classifier(self, trainvectors, labels, alpha='', iterations=50, jobs=10):
        iterations = int(iterations)
        jobs = int(jobs)
        if alpha == '':
            paramsearch = GridSearchCV(estimator=Perceptron(), param_grid=dict(alpha=numpy.linspace(0,2,20)[1:],n_iter=[iterations]), n_jobs=jobs)
            paramsearch.fit(trainvectors,self.label_encoder.transform(labels))
            selected_alpha = paramsearch.best_estimator_.alpha
        elif alpha == 'default':
            selected_alpha = 1.0
        else:
            selected_alpha = float(alpha)
        # train a perceptron with the settings that led to the best performance
        self.model = Perceptron(alpha=selected_alpha,n_iter=iterations,n_jobs=jobs)
        self.model.fit(trainvectors, self.label_encoder.transform(labels))
Project: rdocChallenge    Author: Elyne    | project source | file source
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):

    labels = [x.severity for x in data]

    generatePrimaryFeats(data, featTypes)

    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a bow word, the old code
            # would overwrite one of the features.
            instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})

        featurized.append(instance)

    d = DictVectorizer()
    x_train = d.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True, random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
Project: TFMicrobiome    Author: alifar76    | project source | file source
def grid_search(model,tuned_parameters,X,Y):
    startTime = datetime.now()
    model_gs = GridSearchCV(model, tuned_parameters, cv=10)
    model_gs.fit(X, Y.values.ravel())
    print("Best parameters set found on development set:")
    print(model_gs.best_params_)
    print("\n" + "Task Completed! Completion time: " + str(datetime.now() - startTime))
    return


# Load data
Project: dmon-adp    Author: igabriel85    | project source | file source
def __gridSearch(self, est, X, y):
        if isinstance(est, RandomForestClassifier):
            param_grid = {
                'n_estimators': [200, 300, 400, 500, 800, 1000],
                'max_features': ['auto', 'sqrt', 'log2'],
                'max_depth': [5, 15, 25]
            }
        # note: param_grid is only defined for RandomForestClassifier;
        # any other estimator will raise a NameError below

        CV_rfc = GridSearchCV(estimator=est, param_grid=param_grid, cv=5)
        CV_rfc.fit(X, y)
        logger.info('[%s] : [INFO] Best parameters are: %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), CV_rfc.best_params_)
        print 'Best parameters are: %s'  % CV_rfc.best_params_
        return CV_rfc.best_params_
Project: StrepHit    Author: Wikidata    | project source | file source
def __init__(self, *models, **kwargs):
        """ Initializes the grid search

            :param list models: List of models to use. Each one should be a tuple
             with a model instance or class and a dictionary for the search space.
            :param kwargs: additional initialization arguments
             for `sklearn.grid_search.GridSearchCV`
        """
        self.models = filter(None, models)
        kwargs['refit'] = True
        self.kwargs = kwargs
Project: StrepHit    Author: Wikidata    | project source | file source
def fit(self, training_sets):
        """ Searches for the best estimator and its arguments as well as the best
            training set amongst those specified.

            :param generator training_sets: Training sets to use. Should be a sequence
             of tuples (metadata, extractor), where the extractor yields the features x
             and the correct answer y for each chunk, and metadata contains additional
             data that will be returned back
            :return: the metadata of the training set which yielded the best score,
             the best score obtained by the model, parameters of the model and
             fitted model itself
            :rtype: tuple
        """
        best_training, best_score, best_params, best_model = None, None, None, None
        for i, (metadata, extractor) in enumerate(training_sets):
            for model, grid in self.models:
                assert isclass(model)

                x, y = extractor.get_features(refit=True)

                grid['model_cls'] = [model]
                grid['selector_column'] = [None, extractor.lu_column()]

                search = GridSearchCV(
                    FeatureSelectedClassifier(model), param_grid=grid, **self.kwargs
                )
                search.fit(x, y)

                score, params, model = search.best_score_, search.best_params_, search.best_estimator_
                logger.debug('%s with parameters %s and training meta %s has score %s',
                             type(model), params, metadata, score)
                if best_score is None or score > best_score:
                    best_training, best_score, best_params, best_model = (x, y, metadata), score, params, model

        return best_training, best_score, best_params, best_model


# needs to be pickleable and callable