The following 3 code examples, extracted from open-source Python projects, illustrate how to use sklearn.pipeline.Pipeline.
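All three examples follow the same pattern: a Pipeline chains preprocessing steps with a final estimator, and GridSearchCV tunes hyperparameters through the step__parameter naming convention. As a reference point before the project code, here is a minimal, self-contained sketch of that pattern; the random data, parameter values, and cv setting are placeholders rather than anything taken from the projects below.

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Placeholder data: 100 samples, 20 features, one continuous target.
rng = np.random.RandomState(0)
X = rng.rand(100, 20)
y = rng.rand(100)

# Each step is a (name, transformer/estimator) pair; the last step is the estimator.
pipeline = Pipeline([
    ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
    ('estimator', Lasso()),
])

# Parameters of a named step are addressed as '<step name>__<parameter>'.
param_grid = {'estimator__alpha': [0.1, 0.3, 0.5, 0.7, 0.8]}
model = GridSearchCV(pipeline, param_grid=param_grid, cv=3)
model.fit(X, y)
print(model.best_params_)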
Example 1:

def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    for symptom in symptoms:
        print(symptom)
        # Select this symptom's target columns from Y (same selection as in Example 3).
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
        model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
        model.fit(X, s_Y.values)

        print("dumping...")
        data_dir = 'data'
        cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
        c_length = {k: v for k, v in zip(cycles0.user_id.values,
                                         cycles0.expected_cycle_length)}
        dump(symptom, model, X_all, c_length, data['users'].user_id)
Example 2:

def trainGP(df, dstPath, featureset="facefeatures", train_on_PCA=True,
            generate_PCA=True, transformer_func=None):
    # we need to train for both male and female
    grouped = df.groupby("gender")
    for gender, group in grouped:
        print("training regression for {}'s on {} features".format(gender, featureset))

        # as_matrix() was removed in pandas 1.0; use .to_numpy() on newer pandas
        X = np.array(group[featureset].as_matrix().tolist())
        Y = np.array(group["attractiveness"].as_matrix().tolist())

        if featureset == "facefeaturesCNN":
            X = X[:, 0:99]

        pipe = []
        if transformer_func == "facefeatures3D":
            pipe.append(('custom_transformer', CustomTransformer(transformer_func)))

        if generate_PCA or train_on_PCA:
            pca = fitPCA(X)
            if train_on_PCA:
                pipe.append(('pca', pca))
        else:
            pca = None

        # scale the data
        # pipe.append(('scaling', sklearn.preprocessing.StandardScaler()))

        estimator = sklearn.svm.SVR(kernel='rbf')
        # estimator = sklearn.linear_model.LinearRegression()
        # estimator = sklearn.ensemble.RandomForestRegressor()
        pipe.append(('estimator', estimator))

        pipeline = sklearn.pipeline.Pipeline(pipe)

        parameters_to_search = {'estimator__C': np.logspace(0, 2, 3),
                                'estimator__epsilon': np.logspace(-2, 2, 5),
                                'estimator__gamma': np.logspace(-2, 2, 5)}
        if train_on_PCA:
            parameters_to_search['pca__n_components'] = np.arange(10, int(X.shape[1]), step=2)

        gridsearch = sklearn.model_selection.GridSearchCV(pipeline, parameters_to_search)
        gridsearch.fit(X, Y)

        print("Best parameters set found on development set:")
        print(gridsearch.best_params_)

        pipeline = gridsearch.best_estimator_

        score = sklearn.model_selection.cross_val_score(pipeline, X, Y).mean()
        print("Score with the entire dataset = %.2f" % score)

        # plot_learning_curve(pipeline, "learning curve for linear regression", X, Y,
        #                     train_sizes=np.linspace(.1, 1.0, 5))
        # plt.draw()

        pickle.dump((pca, pipeline), open(os.path.join(dstPath, "GP_%s.p" % gender), "wb"))
Example 3:

def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    labels = final_labels['labels']
    for symptom in symptoms:
        print(symptom)
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        for cluster in range(3):  # number of clusters
            print(cluster)
            param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
            model.fit(X[labels == cluster], s_Y.values[labels == cluster])

            print("dumping...")
            data_dir = 'data'
            cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
            c_length = {k: v for k, v in zip(cycles0.user_id.values[labels == cluster],
                                             cycles0.expected_cycle_length[labels == cluster])}
            dump(symptom, model, X_all[labels == cluster], c_length,
                 data['users'].user_id[labels == cluster])