Python sklearn.pipeline 模块，make_pipeline() 实例源码

我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用sklearn.pipeline.make_pipeline()。

项目：time_series_modeling 作者：rheineke | 项目源码 | 文件源码

def unscaled_pipelines():
    # Random forest parameters
    random_forest_kwargs = {
        'n_estimators': 10,
        'criterion': 'mse',
        'random_state': _RANDOM_STATE,
        'n_jobs': cpu_count(),
        'verbose': True,
    }
    # Gradient boosting parameters
    gradient_boost_kwargs = {
        'random_state': _RANDOM_STATE,
        'verbose': 1,
    }
    models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]
    pipelines = []
    for m in models:
        # Steps
        pipelines.append(make_pipeline(m))
    return pipelines

项目：decoding-brain-challenge-2016 作者：alexandrebarachant | 项目源码 | 文件源码

def fit(self, X, y):
        """Fit TSclassifier.

        Parameters
        ----------
        X : ndarray, shape (n_trials, n_channels, n_channels)
            ndarray of SPD matrices.
        y : ndarray shape (n_trials, 1)
            labels corresponding to each trial.

        Returns
        -------
        self : TSclassifier. instance
            The TSclassifier. instance.
        """
        ts = TangentSpace(metric=self.metric, tsupdate=self.tsupdate)
        self._pipe = make_pipeline(ts, self.clf)
        self._pipe.fit(X, y)
        return self

项目：time_series_modeling 作者：rheineke | 项目源码 | 文件源码

def sample_pipelines(pca_kernels=None, svr_kernels=None):
    """
    Pipelines that can't be fit in a reasonable amount of time on the whole
    dataset
    """
    # Model instances
    model_steps = []
    if pca_kernels is None:
        pca_kernels = ['poly', 'rbf', 'sigmoid', 'cosine']
    for pca_kernel in pca_kernels:
        model_steps.append([
            KernelPCA(n_components=2, kernel=pca_kernel),
            LinearRegression(),
        ])
    if svr_kernels is None:
        svr_kernels = ['poly', 'rbf', 'sigmoid']
    for svr_kernel in svr_kernels:
        model_steps.append(SVR(kernel=svr_kernel, verbose=True, cache_size=1000))

    # Pipelines
    pipelines = []
    for m in model_steps:
        # Steps
        common_steps = [
            StandardScaler(),
        ]
        model_steps = m if isinstance(m, list) else [m]
        steps = common_steps + model_steps
        pipelines.append(make_pipeline(*steps))
    return pipelines

项目：Optimus 作者：Yatoom | 项目源码 | 文件源码

def build_pipeline(base_estimator, parameters):
        """
        Builds a pipeline where the base estimator is initialized with given parameters. The `@preprocessor` parameter
        is a special parameter that will determine which pre-processing steps to use.
        :param base_estimator: The base estimator of the pipeline
        :param parameters: The parameters for the base estimator, includes special parameters for the pipeline itself
        :return: The (pipeline with the) base estimator, initialized with given parameters
        """
        params = copy(parameters)
        preprocessors = Builder.extract_preprocessors(params)
        estimator = Builder.setup_estimator(base_estimator, params)

        if preprocessors is None:
            return estimator

        return make_pipeline(*preprocessors, estimator)

项目：document_classification 作者：scotthlee | 项目源码 | 文件源码

def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    svd = TruncatedSVD(n_features)  
    if normalize:   
        if flip:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            lsa = make_pipeline(svd, Normalizer(copy=False))        
            doc_mat = lsa.fit_transform(doc_vecs)
        return doc_mat
    else:
        if flip:
            doc_mat = svd.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            doc_mat = svd.fit_transform(doc_vecs)
        return doc_mat

项目：scikit-mdr 作者：EpistasisLab | 项目源码 | 文件源码

def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.

项目：scikit-mdr 作者：EpistasisLab | 项目源码 | 文件源码

def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
    assert np.mean(cv_scores) > 0.

项目：decoding_challenge_cortana_2016_3rd 作者：kingjr | 项目源码 | 文件源码

def fit(self, X, y):
        """Fit TSclassifier.

        Parameters
        ----------
        X : ndarray, shape (n_trials, n_channels, n_channels)
            ndarray of SPD matrices.
        y : ndarray shape (n_trials, 1)
            labels corresponding to each trial.

        Returns
        -------
        self : TSclassifier. instance
            The TSclassifier. instance.
        """
        ts = TangentSpace(metric=self.metric, tsupdate=self.tsupdate)
        self._pipe = make_pipeline(ts, self.clf)
        self._pipe.fit(X, y)
        return self

项目：bitcoin-forecast 作者：roksela | 项目源码 | 文件源码

def __init__(self, model_type=DEFAULT_MODEL_TYPE):
        """
        Set ups model and pipeline for learning and predicting.

        :param model_type: only 'SVR' model is supported for now
        """
        assert (model_type == 'SVR'), "Model '{}' is not supported. " \
                                      "We support only SVR for now.".format(model_type)
        self._model_type = model_type
        self._model_params = BTCForecast.DEFAULT_SVR_MODEL_PARAMS

        # set up SVR pipeline
        self._scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
        self._model = SVR(kernel=self._model_params['kernel'],
                          epsilon=self._model_params['epsilon'],
                          C=self._model_params['c'],
                          gamma=self._model_params['gamma'])
        self._pipeline = make_pipeline(self._scaler, self._model)
        self.has_learned = False

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]})
    scorer = check_scoring(grid, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer())
    assert_array_equal(scores, 1)

项目：hh-page-classifier 作者：TeamHG-Memex | 项目源码 | 文件源码

def LDAPageVctorizer(*,
                     n_topics: int,
                     min_df: int,
                     max_features: int,
                     max_iter: int,
                     ngram_range: Tuple[int, int],
                     vocabulary=None,
                     batch_size: int=4096,
                     verbose=1):
    vec = _vectorizer(min_df=min_df, max_features=max_features,
                      ngram_range=ngram_range, vocabulary=vocabulary)
    lda = LatentDirichletAllocation(
        learning_method='online',
        n_topics=n_topics,
        batch_size=batch_size,
        evaluate_every=2,
        verbose=verbose,
        max_iter=max_iter,
        n_jobs=1,
    )
    return make_pipeline(vec, lda)

项目：dask-ml 作者：dask | 项目源码 | 文件源码

def test_in_pipeline():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    pipe.fit(X, y)

项目：dask-ml 作者：dask | 项目源码 | 文件源码

def test_gridsearch():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    grid = {
        'logisticregression__C': [1000, 100, 10, 2]
    }
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    search = dcv.GridSearchCV(pipe, grid, cv=3)
    search.fit(X, y)

项目：whereami 作者：kootenpv | 项目源码 | 文件源码

def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
    return make_pipeline(DictVectorizer(sparse=False), clf)

项目：nlp-lt 作者：minven | 项目源码 | 文件源码

def truncated_svd(self):
        # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
        svd = TruncatedSVD(self.dimensions)   
        lsa = make_pipeline(svd, Normalizer(copy=False))
        X_reduced = lsa.fit_transform(self.bag_of_words_matrix)
        print(svd.components_[0])
        print(svd.explained_variance_ratio_) 
        print(svd.explained_variance_ratio_.sum())

项目：time_series_modeling 作者：rheineke | 项目源码 | 文件源码

def scaled_pipelines():
    # Model parameters
    # RANSAC parameters
    # 500 max trials takes 90s
    ransac_kwargs = {
        'max_trials': 1000,
        'min_samples': 5000,
        'loss': 'absolute_loss',
        'residual_threshold': 2.0,
        'random_state': _RANDOM_STATE,
    }
    # Ridge CV parameters
    alphas = [.01, .1, 1, 10]
    # Model instances
    model_steps = [
        LinearRegression(),
        # [PolynomialFeatures(degree=2), LinearRegression()],
        # [PolynomialFeatures(degree=3), LinearRegression()],
        # RANSACRegressor(base_estimator=LinearRegression(), **ransac_kwargs),
        # RANSACRegressor with polynomial regression?
        # RidgeCV(alphas=alphas),
        # LassoCV(),  # Alphas set automatically by default
        # ElasticNetCV(l1_ratio=0.5),  # Same as default
        # [PolynomialFeatures(degree=2), ElasticNetCV(l1_ratio=0.5)],
        # SGDRegressor(),
    ]
    # Pipelines
    pipelines = []
    for m in model_steps:
        # Steps
        common_steps = [
            StandardScaler(),
            PCA(**_PCA_KWARGS)
        ]
        model_steps = m if isinstance(m, list) else [m]
        steps = common_steps + model_steps
        pipelines.append(make_pipeline(*steps))
    return pipelines

项目：sktransformers 作者：TomAugspurger | 项目源码 | 文件源码

def fit():
    X, y = generate()
    dX = dd.from_pandas(X, npartitions=10)
    y = dd.from_pandas(y, npartitions=10)

    pre_pipe = make_pipeline(
        CategoricalEncoder(),
        DummyEncoder(),
        Imputer(),
        SGDRegressor(),
    )

    pipe = make_pipeline(
        SelectFromModel(pre_pipe),
        GradientBoostingRegressor(),
    )
    X_ = pre_pipe.fit_transform(dX)

    for i in range(X_.npartitions):
        for j in range(5):
            print(i, j)
            X_sub = X_.get_partition(i).compute()
            y_sub = y.get_partition(i).compute()
            clf.partial_fit(X_sub, y_sub)

    sfm = SelectFromModel(clf, prefit=True)
    return pipe, clf, sfm

项目：postlearn 作者：TomAugspurger | 项目源码 | 文件源码

def regression_pipeline(regression_model):
    return make_pipeline([StandardScaler(), regression_model])

项目：false-friends 作者：pln-fing-udelar | 项目源码 | 文件源码

def build_classifier(base_clf=svm.SVC()):
    # The imputer is for "use_taxonomy", and shouldn't affect if it's False.
    # TODO: should also try with other imputer strategies
    return pipeline.make_pipeline(preprocessing.Imputer(strategy='most_frequent'), preprocessing.StandardScaler(),
                                  base_clf)


# noinspection PyPep8Naming

项目：TextStageProcessor 作者：mhyhre | 项目源码 | 文件源码

def make_ward_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'WARD/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
        predict_result = ward.fit_predict(X)

        self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('??????? ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('????????? ?:' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)

项目：TextStageProcessor 作者：mhyhre | 项目源码 | 文件源码

def make_spectral_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'spectral/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
        predict_result = spectral.fit_predict(X)
        self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('??????? ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('????????? ?:' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)

    # aa = Affinity Propagation

项目：TextStageProcessor 作者：mhyhre | 项目源码 | 文件源码

def make_aa_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'affinity_propagation/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                               max_iter=self.aa_max_iter,
                                               convergence_iter=self.aa_no_change_stop)

        predict_result = aa_clusterizator.fit_predict(X)
        self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('??????? ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('????????? ?:' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)

项目：TextStageProcessor 作者：mhyhre | 项目源码 | 文件源码

def make_birch_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)

        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('??????? ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('????????? ?:' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)

项目：mlprojects-py 作者：srinathperera | 项目源码 | 文件源码

def doPCA(X, output_columns_count):
    #DO PCA on the data and use it to transform
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)
    return X

项目：PredictiveServer 作者：KeyboardNerd | 项目源码 | 文件源码

def test_train():
        # define input streams
        names = ['v', 'p', 't', 'w', 'a']
        # define first transformation
        units = ["knot","in_Hg","celsius","force_pound","degree"]
        tounits = ["m/s", "pascal", "kelvin", "newton", "radian"]
        tuple_units = []
        for i, unit in enumerate(units):
            tuple_units.append((unit, tounits[i]))
        s1 = UnitTransformer(tuple_units)
        # second layer of transformation
        constants = {"s": 61.0, "R": 286.9}
        labels = ["2*w/(v**2*(p/R/t)*s)"]
        s2 = FormulaTransformer(labels, names, constants)
        # sink ( any sink transformation could be used to predict )
        # no fit_transform rule, can only predict
        features = ["a"]
        s3 = make_pipeline(FormulaTransformer(features, names), LinearRegression())
        # train the shit outof it
        with (open("data/training.csv")) as f:
            df = pd.read_csv(f, names=names, header=0)
            # awkward transformation from dataframe to numpy matrix
            # could use panda sklearn to solve
            ndarray = df.as_matrix(names)
            rawX = s1.fit_transform(ndarray)
            y = s2.fit_transform(rawX)
            X = rawX
            s3.fit(X, y)
            y_ = s3.predict(X)
            print X.shape, y_.shape
            #plt.scatter(FormulaTransformer(features, names).fit_transform(X), y_)
            #plt.show()
        # wrap the process as StreamPipeline for learning machine
        sp = StreamPipeline(names, s3)
        sp.predict(v=1.0, p=2.0, t=3.0, w=4.0, a=5.0)

项目：stegasawus 作者：rokkuran | 项目源码 | 文件源码

def _get_pipeline(self, name):
        return make_pipeline(self.pipeline, classifiers[name])

项目：DocumentClassification 作者：bahmanh | 项目源码 | 文件源码

def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer =  Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa

项目：yellowbrick 作者：DistrictDataLabs | 项目源码 | 文件源码

def fit_quadratic(X, y):
    """
    Uses OLS with Polynomial order 2.
    """
    model = make_pipeline(
        PolynomialFeatures(2), linear_model.LinearRegression()
    )
    model.fit(X, y)
    return model