Python sklearn.pipeline module: Pipeline() code examples

We extracted the following 49 code examples from open-source Python projects to illustrate how to use sklearn.pipeline.Pipeline().
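Before the project examples, here is a minimal, self-contained sketch (ours, not drawn from any project below) of the pattern they all share: a Pipeline is a list of (name, estimator) steps, and nested hyper-parameters are addressed as 'step__param'.

# Standalone sketch (not from the projects on this page): chain a scaler and
# a classifier into one estimator.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=100, n_features=10, random_state=0)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', LogisticRegression())])
pipe.set_params(clf__C=0.1)        # nested params use the 'step__param' syntax
pipe.fit(X, y)
print(pipe.named_steps['scaler'])  # individual steps are reachable by name
print(pipe.predict(X[:5]))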

Project: dask-searchcv    Author: dask
def test_feature_union_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])

    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=None)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    check_scores_all_nan(gs, 'union__bad__parameter')
Project: SecuML    Author: ANSSI-FR
def computeNeighboursScores(self):
        all_instances = self.iteration.datasets.instances
        # Connectivity matrix
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', NearestNeighbors(self.num_neighbours, n_jobs = -1))])
        pipeline.fit(all_instances.getFeatures())
        # Labels
        labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
        # Compute neighbour scores
        scores = []
        all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance = False)
        for i, label in enumerate(labels):
            # skip instances whose label is non-zero
            if label != 0:
                continue
            neighbours = all_neighbours[i]
            score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
            scores.append(score)
        return np.array(scores)
Project: DiscourseSenser    Author: WladimirSidorenko
def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating
            hyper-parameters

        """
        classifier = a_clf
        self._gs = a_grid_search
        if a_clf is None:
            classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                       n_estimators=NTREES,
                                       learning_rate=ALPHA,
                                       objective="multi:softprob")
            self._clf = classifier
        # latest version of XGBoost cannot deal with non-sparse feature vectors
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])
Project: MorphoBabushka    Author: nvanva
def nbsvm(base_clf, fit_scaler=None, transform_scaler='bin', multi_class='ovr'):
    """
    NB-SVM classifier: pipeline of MNBScaler+base_clf wrapped in OneVsRestClassifier / OneVsOneClassifier to support
    multiclass (MNBScaler supports only binary problems itself!).
    :param base_clf: classifier to use after MNBScaler, LogisticRegression or LinearSVC are usually used
    :param fit_scaler: look at MNBScaler class
    :param transform_scaler: look at MNBScaler class
    :param multi_class: ovr for OneVsRestClassifier, ovo for OneVsOneClassifier
    :return: OneVsRestClassifier / OneVsOneClassifier
    """
    mnb_scaler = MNBScaler(fit_scaler=fit_scaler, transform_scaler=transform_scaler)
    pipe = Pipeline([('mnbscaler', mnb_scaler), ('clf', base_clf)])
    if multi_class == 'ovr':
        return OneVsRestClassifier(pipe)
    elif multi_class == 'ovo':
        return OneVsOneClassifier(pipe)
    else:
        raise ValueError('Unsupported multi_class=%s, should be one of %r.' % (multi_class, ['ovr', 'ovo']))
Project: OptML    Author: johannespetrat
def test_model_detection(self):
        sklearn_model = LogisticRegression()
        pipeline_model = Pipeline([('log', sklearn_model)])
        xgb_model = XGBClassifier()
        nn_model = NNModel(100, 10)
        sklearn_opt = Optimizer(sklearn_model, [], lambda x: x)
        pipeline_opt = Optimizer(pipeline_model, [], lambda x: x)
        xgb_opt = Optimizer(xgb_model, [], lambda x: x)
        nn_opt = Optimizer(nn_model, [], lambda x: x)

        self.assertEqual(sklearn_opt.model_module, 'sklearn')
        self.assertEqual(pipeline_opt.model_module, 'pipeline')
        self.assertEqual(xgb_opt.model_module, 'xgboost')
        self.assertEqual(nn_opt.model_module, 'keras')
Project: probablyPOTUS    Author: jjardel
def train(self, train_size=0.8, k_folds=5):

        # retrieve data from DB and pre-process
        self._get_data()

        # perform train/test split
        self._get_train_test_split(train_size=train_size)

        # define text pre-processing pipeline
        text_pipeline = Pipeline([
            ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
            ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
        ])

        # define pipeline for pre-processing of numeric features
        numeric_pipeline = Pipeline([
            ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
            ('scaler', MinMaxScaler())
        ])

        # combine both steps into a single pipeline
        pipeline = Pipeline([
            ('features', FeatureUnion([
                ('text_processing', text_pipeline),
                ('num_processing', numeric_pipeline)
            ])),
            ('clf', self._estimator)
        ])

        self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
        gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

        X = self.data.iloc[self.train_inds_, :]
        y = self.data[LABEL].values[self.train_inds_]

        gs.fit(X, y)

        self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

        self.gs_ = gs
        self.model_ = gs.best_estimator_
Project: triage    Author: dssg
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    X_fake_new_data = data['X_test'][-1, :].reshape(1, -1) + 0.5

    mms = preprocessing.MinMaxScaler().fit(data['X_train'])

    assert np.all((mms.transform(X_fake_new_data) > 1) == (pipeline.transform(X_fake_new_data) == 1))
Project: triage    Author: dssg
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
Project: uncover-ml    Author: GeoscienceAustralia
def test_pipeline(get_models, get_transform, get_kernel):

    alg, model = get_models
    trans = get_transform()
    kernel = get_kernel() + WhiteKernel()

    pipe = Pipeline(steps=[(alg, model())])
    param_dict = {}
    if hasattr(model(), 'n_estimators'):
        param_dict[alg + '__n_estimators'] = [5]
    if hasattr(model(), 'kernel'):
        param_dict[alg + '__kernel'] = [kernel]
    param_dict[alg + '__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
    np.random.seed(10)
    estimator.fit(X=1 + np.random.rand(10, 3), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -15.0
Project: uncover-ml    Author: GeoscienceAustralia
def test_svr_pipeline(get_transform, get_svr_kernel):
    trans = get_transform()
    pipe = Pipeline(steps=[('svr', svr())])
    param_dict = {'svr__kernel': [get_svr_kernel]}
    param_dict['svr__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
    np.random.seed(1)
    estimator.fit(X=1 + np.random.rand(10, 5), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -10.0
Project: uncover-ml    Author: GeoscienceAustralia
def test_krige_pipeline(get_krige_method, get_variogram_model):
    pipe = Pipeline(steps=[('krige', Krige(method=get_krige_method))])
    param_dict = {'krige__variogram_model': [get_variogram_model]}

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True
                            )
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 2)).astype(float)
    y = 5*np.random.rand(20)
    estimator.fit(X=X, y=y)
    assert estimator.cv_results_['mean_train_score'][0] > -1.0
Project: searchgrid    Author: jnothman
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
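The list-of-dicts grid that build_param_grid produces is the same structure plain GridSearchCV accepts, because a Pipeline step is itself a settable parameter. A minimal sketch of the same idea using standard scikit-learn only (the names here are illustrative):

# Standalone sketch (not searchgrid code): swap whole estimators by treating
# the 'clf' step as a grid parameter.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

pipe = Pipeline([('sel', SelectKBest()), ('clf', SVC())])
param_grid = [{'clf': [SVC()], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
              {'clf': [LogisticRegression()], 'sel__k': [2, 3]}]

X, y = make_classification(n_samples=60, n_features=5, random_state=0)
search = GridSearchCV(pipe, param_grid, cv=3).fit(X, y)
print(search.best_params_)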
Project: UrbanSearch    Author: urbansearchTUD
def get_binary(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      epsilon=0.1,
                                                      eta0=0.0,
                                                      fit_intercept=True,
                                                      l1_ratio=0.15,
                                                      learning_rate='optimal',
                                                      loss='log',
                                                      n_iter=10,
                                                      n_jobs=1,
                                                      penalty='l2',
                                                      power_t=0.5,
                                                      random_state=None,
                                                      shuffle=True,
                                                      verbose=0,
                                                      warm_start=False
            )))
        ])
Project: UrbanSearch    Author: urbansearchTUD
def get_sgdc(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', SGDClassifier(alpha=0.0001,
                                  average=False,
                                  class_weight=None,
                                  epsilon=0.1,
                                  eta0=0.0,
                                  fit_intercept=True,
                                  l1_ratio=0.15,
                                  learning_rate='optimal',
                                  loss='log',
                                  n_iter=10,
                                  n_jobs=1,
                                  penalty='l2',
                                  power_t=0.5,
                                  random_state=None,
                                  shuffle=True,
                                  verbose=0,
                                  warm_start=False))
        ])
Project: marseille    Author: vene
def custom_fnames(union):
    feature_names = []
    for name, trans, weight in union._iter():
        if hasattr(trans, 'get_feature_names'):
            this_fn = trans.get_feature_names()

        elif isinstance(trans, Pipeline):
            # we use pipelines to scale only specific attributes.
            # In this case, the vectorizer is first in the pipe.
            this_fn = trans.steps[0][-1].get_feature_names()

        else:
            raise AttributeError("Transformer %s (type %s) does not "
                                 "provide get_feature_names." % (
                                     str(name), type(trans).__name__))
        feature_names.extend([name + "__" + f for f in this_fn])
    return feature_names
Project: SecuML    Author: ANSSI-FR
def createPipeline(self):
        self.pipeline = Pipeline([
            ('model', GradientBoostingClassifier(
                loss = self.conf.loss,
                learning_rate = self.conf.learning_rate,
                n_estimators = self.conf.n_estimators,
                criterion = self.conf.criterion,
                max_depth = self.conf.max_depth,
                min_samples_split = self.conf.min_samples_split,
                min_samples_leaf = self.conf.min_samples_leaf,
                min_weight_fraction_leaf = self.conf.min_weight_fraction_leaf,
                subsample = self.conf.subsample,
                max_features = self.conf.max_features,
                max_leaf_nodes = self.conf.max_leaf_nodes,
                min_impurity_split = self.conf.min_impurity_decrease,
                presort = self.conf.presort))])
Project: hugo_similar_posts    Author: elbaulp
def gridSearch(data, params, true_k):

    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])
    gsTfIdf = GridSearchCV(
        lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Project: DiscourseSenser    Author: WladimirSidorenko
def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Initialize classifier.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating hyper-parameters

        """
        classifier = a_clf or LinearSVC(C=DFLT_C,
                                        **DFLT_PARAMS)
        self._gs = a_grid_search
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])
Project: postlearn    Author: TomAugspurger
def model_from_pipeline(pipe):
    '''
    Extract the model from the last stage of a pipeline.

    Parameters
    ----------
    pipe : Pipeline or Estimator

    Returns
    -------

    model: Estimator
    '''
    if isinstance(pipe, Pipeline):
        return pipe.steps[-1][1]  # steps is a list of (name, estimator) pairs
    else:
        return pipe
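For reference, pipe.steps[-1][1] and named_steps reach the same object; a small standalone check:

# Standalone sketch: both accessors return the final estimator itself.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])
assert pipe.steps[-1][1] is pipe.named_steps['clf']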
Project: SPHERE-HyperStream    Author: IRC-SPHERE
def _execute(self, sources, alignment_stream, interval):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        param_doc = sources[0].window(time_interval, force_calculation=True).last()
        if param_doc is None:
            logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
            return

        steps = deserialise_json_pipeline({
            'vectorisation': DictVectorizer(sparse=False),
            'fill_missing': FillZeros(),
            'classifier': LinearDiscriminantAnalysis(),
            'label_encoder': LabelEncoder()
        }, param_doc.value)

        clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
        locations = steps['label_encoder'].classes_

        data = sources[1].window(interval, force_calculation=True)
        for tt, dd in data:
            yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
Project: elm    Author: ContinuumIO
def test_ea_search_sklearn_elm_steps(label, do_predict):
    '''Test that EaSearchCV can work with numpy, dask.array,
    pandas.DataFrame, xarray.Dataset, xarray_filters.MLDataset
    '''
    from scipy.stats import lognorm
    est, make_data, sel, kw = args[label]
    parameters = {'kernel': ['linear', 'rbf'],
                  'C': lognorm(4),}
    if isinstance(est, (sk_Pipeline, Pipeline)):
        parameters = {'est__{}'.format(k): v
                      for k, v in parameters.items()}
    ea = EaSearchCV(est, parameters,
                    n_iter=4,
                    ngen=2,
                    model_selection=sel,
                    model_selection_kwargs=kw)
    X, y = make_data()
    ea.fit(X, y)
    if do_predict:
        pred = ea.predict(X)
        assert isinstance(pred, type(y))
Project: atap    Author: foxbook
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
Project: senti    Author: stevenxxiu
def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
        if embedding_type == 'google':
            embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
            embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
        elif embedding_type == 'twitter':
            estimator = Pipeline([
                ('tokenize', MapCorporas(tokenize_)),
                ('word2vec', MergeSliceCorporas(CachedFitTransform(Word2Vec(
                    sg=1, size=d, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
                ), self.memory))),
            ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs])
            embeddings_ = estimator.named_steps['word2vec'].estimator
            embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
        else:
            embeddings_ = SimpleNamespace(X=np.empty((0, d)), vocab={})
        estimator = Pipeline([
            ('tokenize', MapCorporas(tokenize_)),
            # 0.25 is chosen so the unknown vectors have approximately the same variance as google pre-trained ones
            ('embeddings', MapCorporas(Embeddings(
                embeddings_, rand=lambda shape: get_rng().uniform(-0.25, 0.25, shape).astype('float32'),
                include_zero=True
            ))),
        ])
        estimator.fit(construct_docs)
        return estimator.named_steps['embeddings'].estimator
Project: dask-searchcv    Author: dask
def test_feature_union_fit_failure_multiple_metrics():
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])

    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    for key in scoring:
        check_scores_all_nan(gs, 'union__bad__parameter', score_key=key)
Project: dask-searchcv    Author: dask
def test_feature_union_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    union = FeatureUnion([('tr0', MockClassifier()),
                          ('tr1', MockClassifier())])
    pipe = Pipeline([('union', union), ('est', MockClassifier())])

    grid = {'union__tr2__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {'union__transformer_list': [[('one', MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)
Project: dask-searchcv    Author: dask
def test_hyperparameter_searcher_with_fit_params(cls, kwargs):
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    clf = CheckingClassifier(expected_fit_params=['spam', 'eggs'])
    pipe = Pipeline([('clf', clf)])
    searcher = cls(pipe, {'clf__foo_param': [1, 2, 3]}, cv=2, **kwargs)

    # The CheckingClassifier generates an assertion error if
    # a parameter is missing or has length != len(X).
    with pytest.raises(AssertionError) as exc:
        searcher.fit(X, y, clf__spam=np.ones(10))
    assert "Expected fit parameter(s) ['eggs'] not seen." in str(exc.value)

    searcher.fit(X, y, clf__spam=np.ones(10), clf__eggs=np.zeros(10))
    # Test with dask objects as parameters
    searcher.fit(X, y, clf__spam=da.ones(10, chunks=2),
                 clf__eggs=dask.delayed(np.zeros(10)))
Project: coremltools    Author: apple
def test_boston_OHE_plus_normalizer(self): 

        data = load_boston()

        pl = Pipeline([
            ("OHE", OneHotEncoder(categorical_features = [8], sparse=False)), 
            ("Scaler",StandardScaler())])

        pl.fit(data.data, data.target)

        # Convert the model
        spec = convert(pl, data.feature_names, 'out')

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out" : row} for row in pl.transform(data.data)]

        result = evaluate_transformer(spec, input_data, output_data)

        assert result["num_errors"] == 0
Project: coremltools    Author: apple
def test_boston_OHE_plus_trees(self): 

        data = load_boston()

        pl = Pipeline([
            ("OHE", OneHotEncoder(categorical_features = [8], sparse=False)), 
            ("Trees",GradientBoostingRegressor(random_state = 1))])

        pl.fit(data.data, data.target)

        # Convert the model
        spec = convert(pl, data.feature_names, 'target')

        # Get predictions
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df['prediction'] = pl.predict(data.data)

        # Evaluate it
        result = evaluate_regressor(spec, df, 'target', verbose = False)

        assert result["max_error"] < 0.0001
Project: coremltools    Author: apple
def test_boston_OHE_pipeline(self): 
        data = load_boston()

        for categorical_features in [[3], [8], [3, 8], [8, 3]]:

            # Put it in a pipeline so that we can test whether the output dimension
            # handling is correct. 

            model = Pipeline([("OHE", OneHotEncoder(categorical_features = categorical_features)),
                 ("Normalizer", Normalizer())])

            model.fit(data.data.copy(), data.target)

            # Convert the model
            spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

            input_data = [dict(zip(data.feature_names, row)) for row in data.data]
            output_data = [{"out" : row} for row in model.transform(data.data.copy())]

            result = evaluate_transformer(spec, input_data, output_data)

            assert result["num_errors"] == 0
Project: magic    Author: pan-webis-de
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
Project: magic    Author: pan-webis-de
def __init__(self, lang=None, method=None, features=None):
        fs = []
        if 'unigram' in features:
            fs.append(word_unigrams())
        if 'bigram' in features:
            fs.append(word_bigrams())
        if 'spelling' in features:
            fs.append(avg_spelling_error(lang=lang))
        if 'punctuation' in features:
            fs.append(punctuation_features())
        if 'char' in features:
            fs.append(char_ngrams())

        fu = FeatureUnion(fs, n_jobs=1)
        self.pipeline = Pipeline([('features', fu),
                                  ('scale', Normalizer()),
                                  ('classifier', get_classifier(method=method))])
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Project: wende    Author: h404bi
def init_model():
        # "question trunk" features
        f_trunk = QuestionTrunkVectorizer(tokenizer=tokenize)

        # Word2Vec features
        f_word2vec = Question2VecVectorizer(tokenizer=tokenize)

        # combined features (400 dimensions)
        union_features = FeatureUnion([
            ('f_trunk_lsa', Pipeline([
                ('trunk', f_trunk),
                # dimensionality reduction: latent semantic analysis (LSA)
                ('lsa', TruncatedSVD(n_components=200, n_iter=10))
            ])),
            ('f_word2vec', f_word2vec),
        ])

        model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])
        return model
Project: healthcareai-py    Author: HealthCatalyst
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.

    Note advanced users may wish to use their own custom pipeline.
    """

    # Note: this could be done more elegantly using FeatureUnions _if_ you are not using pandas dataframes for
    #   inputs of the later pipelines as FeatureUnion intrinsically converts outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline
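The comment in full_pipeline about FeatureUnion is easy to demonstrate: whatever goes in, the union's output is a stacked numpy array, so pandas column labels are lost downstream. A standalone sketch (not healthcareai code):

# Standalone sketch: FeatureUnion hstacks transformer outputs into an ndarray,
# even when the input is a pandas DataFrame.
import pandas as pd
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
union = FeatureUnion([('mm', MinMaxScaler()), ('std', StandardScaler())])
out = union.fit_transform(df)
print(type(out), out.shape)  # <class 'numpy.ndarray'> (3, 4)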
Project: Steal-ML    Author: ftramer
def grid_retrain_in_f(self, n_dim=500):
        rbf_map = RBFSampler(n_dim, random_state=1)
        fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                                ("svm", LinearSVC())])

        # C_range = np.logspace(-5, 15, 21, base=2)
        # gamma_range = np.logspace(-15, 3, 19, base=2)
        # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
        # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
        # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
        # grid.fit(X, Y)
        #
        # rbf_svc2 = grid.best_estimator_

        rbf_svc2 = fourier_approx_svm
        rbf_svc2.fit(self.X_ex, self.y_ex)

        self.set_clf2(rbf_svc2)
        return self.benchmark()
Project: odin    Author: imito
def test_transform_then_prediction(self):
        with TemporaryDirectory() as temp:
            from sklearn.pipeline import Pipeline
            path = os.path.join(temp, 'audio.sph')
            urlretrieve(filename=path,
                        url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
            f = Pipeline([
                ('mspec', model.SpeechTransform('mspec', fs=8000, vad=False)),
                ('slice', model.Transform(lambda x: x[:, :40])),
                ('pred', model.SequentialModel(N.Dropout(0.3),
                                               N.Dense(20, activation=K.relu),
                                               N.Dense(10, activation=K.softmax))
                )
            ])
            x1 = f.predict(path)
            x2 = f.predict_proba(path)

            f = cPickle.loads(cPickle.dumps(f))
            y1 = f.predict(path)
            y2 = f.predict_proba(path)
            self.assertEqual(np.array_equal(x1, y1), True)
            self.assertEqual(np.array_equal(x2, y2), True)
Project: odin    Author: imito
def test_complex_transform(self):
        with TemporaryDirectory() as temp:
            from sklearn.pipeline import Pipeline
            path = os.path.join(temp, 'audio.sph')
            urlretrieve(filename=path,
                        url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
            f = Pipeline([
                ('step1', model.SpeechTransform('mspec', fs=8000, vad=True)),
                ('step2', model.Transform(lambda x: (x[0][:, :40],
                                                     x[1].astype(str)))),
                ('step3', model.Transform(lambda x: (np.sum(x[0]),
                                                    ''.join(x[1].tolist()))))
            ])
            x = f.transform(path)
            f = cPickle.loads(cPickle.dumps(f))
            y = f.transform(path)
            self.assertEqual(x[0], y[0])
            self.assertEqual(y[0], -3444229.0)
            self.assertEqual(x[1], y[1])
Project: machine-learning    Author: cinserra
def transform_pca(clf_list):
    '''
    Turn a list of (classifier, params) pairs into (Pipeline, params) pairs,
    prepending PCA to each classifier and merging the PCA grid into each params dict.
    '''

    pca = PCA()
    params_pca = {"pca__n_components":[2, 3, 4, 5, 10, 15, 20], "pca__whiten": [False]}

    for j in range(len(clf_list)):

        name = "clf_" + str(j)
        clf, params = clf_list[j]

        # Parameter names in GridSearchCV need a double underscore
        # between the step name and the parameter name.
        new_params = {}
        for key, value in params.iteritems():
            new_params[name + "__" + key] = value

        new_params.update(params_pca)
        clf_list[j] = (Pipeline([("pca", pca), (name, clf)]), new_params)

    return clf_list
Project: yellowbrick    Author: DistrictDataLabs
def __init__(self, X=None, y=None, ax=None, scale=True, color=None, proj_dim=2,
                 colormap=palettes.DEFAULT_SEQUENCE, **kwargs):
        super(PCADecomposition, self).__init__(ax=ax, **kwargs)
        # Data Parameters
        if proj_dim not in (2, 3):
            raise YellowbrickValueError("proj_dim object is not 2 or 3.")

        self.color = color
        self.pca_features_ = None
        self.scale = scale
        self.proj_dim = proj_dim
        self.pca_transformer = Pipeline([('scale', StandardScaler(with_std=self.scale)),
                                         ('pca', PCA(self.proj_dim))])
        # Visual Parameters
        self.colormap = colormap
Project: yellowbrick    Author: DistrictDataLabs
def test_select_best(self):
        """
        Test the select best fit estimator
        """
        X, y = ANSCOMBE[1]
        X = np.array(X)
        y = np.array(y)
        X = X[:, np.newaxis]

        model = fit_select_best(X, y)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, Pipeline)

        X, y = ANSCOMBE[3]
        X = np.array(X)
        y = np.array(y)
        X = X[:, np.newaxis]

        model = fit_select_best(X, y)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, LinearRegression)
Project: fake-news-detection    Author: aldengolab
def run(self):
        '''
        Runs a model with params p.
        '''
        self.clf.set_params(**self.params)
        # f = get_feature_transformer(self.parser)
        # self.X_train_fts = f.fit_transform(self.X_train)
        # self.X_test_fts = f.transform(self.X_test)
        self.pipeline = Pipeline([
            # ('feature_gen', f),
            ('clf', self.clf),
        ])
        self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
        if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
            self.importances = self.clf.feature_importances_
        elif self.model_type in ['SVM', 'LR', 'SGD']:
            self.importances = self.clf.coef_[0]
Project: IBRel    Author: lasigeBioTM
def __init__(self, path, etype, **kwargs):
        super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
        self.basedir = "models/ensemble/"
        self.goldstd = kwargs.get("goldstd")
        self.data = {}
        self.offsets = []
        self.pipeline = Pipeline(
            [
                #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                #('clf', SGDClassifier())
                # ('clf', svm.NuSVC(nu=0.01 ))
                ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
                # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
                # ('clf', MultinomialNB())
                # ('clf', GaussianNB())
                #('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
                #('clf', svm.SVC(kernel="linear", C=2))
                #('clf', DummyClassifier(strategy="constant", constant=True))
            ])
Project: IBRel    Author: lasigeBioTM
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
Project: IBRel    Author: lasigeBioTM
def __init__(self, path, goldset, base_model, features=None, types=None):
        self.ensemble_pipeline = Pipeline([
            ('clf', ensemble.RandomForestClassifier(criterion="gini", n_estimators=1000))
            ])
        self.base_model = base_model
        self.path = path
        self.predicted = []
        self.res = None
        self.ids, self.data, self.labels = [], [], []
        self.goldset = goldset
        if types:  # types is a list of classifier names
            self.types = types
        else:
            self.types = []
        self.feature_names = []
        for t in self.types:
            self.feature_names.append(t)
            self.feature_names.append(t + "_ssm")
        for f in features:
            self.feature_names.append(f)
Project: catwalk    Author: dssg
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    X_fake_new_data = data['X_test'][-1, :].reshape(1, -1) + 0.5

    mms = preprocessing.MinMaxScaler().fit(data['X_train'])

    assert np.all((mms.transform(X_fake_new_data) > 1) == (pipeline.transform(X_fake_new_data) == 1))
Project: catwalk    Author: dssg
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
Project: sparsereg    Author: Ohjeah
def fit(self, x, y=None):
        if y is not None:
            xdot = y
        else:
            xdot = self.derivative.transform(x)

        if self.operators is not None:
            feature_transformer = SymbolicFeatures(exponents=np.linspace(1, self.degree, self.degree), operators=self.operators)
        else:
            feature_transformer = PolynomialFeatures(degree=self.degree, include_bias=False)

        steps = [("features", feature_transformer),
                 ("model", STRidge(alpha=self.alpha, threshold=self.threshold, **self.kw))]
        self.model = MultiOutputRegressor(Pipeline(steps), n_jobs=self.n_jobs)
        self.model.fit(x, xdot)

        self.n_input_features_ = self.model.estimators_[0].steps[0][1].n_input_features_
        self.n_output_features_ = self.model.estimators_[0].steps[0][1].n_output_features_
        return self
Project: OptML    Author: johannespetrat
def get_best_params_and_model(self):
        """
        Returns the best parameters and model after optimization.
        Keyword arguments:
            None
        """
        best_params_idx = np.argmax([score for score, params in self.hyperparam_history])
        best_params = self.hyperparam_history[best_params_idx][1]
        if isinstance(self.model, Pipeline):
            all_params = self.model.get_params()
            all_params.update(best_params)
            best_model = self.model.set_params(**all_params)
        else:
            best_model = self.model.__class__(**dict(self.model.get_params(), **best_params))
        return best_params, best_model