The following 3 code examples, extracted from open-source Python projects, illustrate how to use sklearn.pipeline.Pipeline.
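All three examples follow the same pattern: a Pipeline chains preprocessing steps with a final estimator, and GridSearchCV tunes hyperparameters through the step__parameter naming convention. As a reference point before the project code, here is a minimal, self-contained sketch of that pattern; the random data, parameter values, and cv setting are placeholders rather than anything taken from the projects below.

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Placeholder data: 100 samples, 20 features, one continuous target.
rng = np.random.RandomState(0)
X = rng.rand(100, 20)
y = rng.rand(100)

# Each step is a (name, transformer/estimator) pair; the last step is the estimator.
pipeline = Pipeline([
    ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
    ('estimator', Lasso()),
])

# Parameters of a named step are addressed as '<step name>__<parameter>'.
param_grid = {'estimator__alpha': [0.1, 0.3, 0.5, 0.7, 0.8]}
model = GridSearchCV(pipeline, param_grid=param_grid, cv=3)
model.fit(X, y)
print(model.best_params_)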
Example 1:

def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    for symptom in symptoms:
        print(symptom)
        # Select this symptom's target columns from Y (same selection as in Example 3).
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
        model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
        model.fit(X, s_Y.values)

        print("dumping...")
        data_dir = 'data'
        cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
        c_length = {k: v for k, v in zip(cycles0.user_id.values,
                                         cycles0.expected_cycle_length)}
        dump(symptom, model, X_all, c_length, data['users'].user_id)
Example 2:

def trainGP(df, dstPath, featureset="facefeatures", train_on_PCA=True,
            generate_PCA=True, transformer_func=None):
    # we need to train for both male and female
    grouped = df.groupby("gender")
    for gender, group in grouped:
        print("training regression for {}'s on {} features".format(gender, featureset))

        # as_matrix() was removed in pandas 1.0; use .to_numpy() on newer pandas
        X = np.array(group[featureset].as_matrix().tolist())
        Y = np.array(group["attractiveness"].as_matrix().tolist())

        if featureset == "facefeaturesCNN":
            X = X[:, 0:99]

        pipe = []
        if transformer_func == "facefeatures3D":
            pipe.append(('custom_transformer', CustomTransformer(transformer_func)))

        if generate_PCA or train_on_PCA:
            pca = fitPCA(X)
            if train_on_PCA:
                pipe.append(('pca', pca))
        else:
            pca = None

        # scale the data
        # pipe.append(('scaling', sklearn.preprocessing.StandardScaler()))

        estimator = sklearn.svm.SVR(kernel='rbf')
        # estimator = sklearn.linear_model.LinearRegression()
        # estimator = sklearn.ensemble.RandomForestRegressor()
        pipe.append(('estimator', estimator))

        pipeline = sklearn.pipeline.Pipeline(pipe)

        parameters_to_search = {'estimator__C': np.logspace(0, 2, 3),
                                'estimator__epsilon': np.logspace(-2, 2, 5),
                                'estimator__gamma': np.logspace(-2, 2, 5)}
        if train_on_PCA:
            parameters_to_search['pca__n_components'] = np.arange(10, int(X.shape[1]), step=2)

        gridsearch = sklearn.model_selection.GridSearchCV(pipeline, parameters_to_search)
        gridsearch.fit(X, Y)

        print("Best parameters set found on development set:")
        print(gridsearch.best_params_)

        pipeline = gridsearch.best_estimator_

        score = sklearn.model_selection.cross_val_score(pipeline, X, Y).mean()
        print("Score with the entire dataset = %.2f" % score)

        # plot_learning_curve(pipeline, "learning curve for linear regression", X, Y,
        #                     train_sizes=np.linspace(.1, 1.0, 5))
        # plt.draw()

        pickle.dump((pca, pipeline), open(os.path.join(dstPath, "GP_%s.p" % gender), "wb"))
Example 3:

def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    labels = final_labels['labels']
    for symptom in symptoms:
        print(symptom)
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        for cluster in range(3):  # number of clusters
            print(cluster)
            param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
            model.fit(X[labels == cluster], s_Y.values[labels == cluster])

            print("dumping...")
            data_dir = 'data'
            cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
            c_length = {k: v for k, v in zip(cycles0.user_id.values[labels == cluster],
                                             cycles0.expected_cycle_length[labels == cluster])}
            dump(symptom, model, X_all[labels == cluster], c_length,
                 data['users'].user_id[labels == cluster])