The following 49 code examples, extracted from open-source Python projects, illustrate how to use sklearn.pipeline.Pipeline().
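Before the extracted examples, here is a minimal sketch of the basic Pipeline pattern they all build on: steps are (name, estimator) pairs, every step except the last must support fit/transform, and step parameters are addressed as `step_name__parameter` in a grid search. The dataset and parameter values below are illustrative only and do not come from any of the projects listed.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Illustrative data; any (X, y) pair works here.
X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# Steps are (name, estimator) tuples; all but the last must implement fit/transform.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])
pipe.fit(X, y)

# Step parameters are addressed as <step_name>__<parameter> in a grid search.
grid = GridSearchCV(pipe, {'clf__C': [0.1, 1.0, 10.0]}, cv=3)
grid.fit(X, y)
print(grid.best_params_)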
def test_feature_union_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])

    grid = {'union__bad__parameter': [0, 1, 2]}

    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=None)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    check_scores_all_nan(gs, 'union__bad__parameter')
def computeNeighboursScores(self):
    all_instances = self.iteration.datasets.instances
    # Connectivity matrix
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', NearestNeighbors(self.num_neighbours, n_jobs=-1))])
    pipeline.fit(all_instances.getFeatures())
    # Labels
    labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
    # Compute neighbour scores
    scores = []
    all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance=False)
    for i, label in enumerate(labels):
        if label != 0:
            continue
        else:
            neighbours = all_neighbours[i]
            score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
            scores.append(score)
    return np.array(scores)
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Args:
      a_clf (classifier or None): classifier to use or None for default
      a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf
    self._gs = a_grid_search
    if a_clf is None:
        classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                   n_estimators=NTREES,
                                   learning_rate=ALPHA,
                                   objective="multi:softprob")
    self._clf = classifier
    # latest version of XGBoost cannot deal with non-sparse feature vectors
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def nbsvm(base_clf, fit_scaler=None, transform_scaler='bin', multi_class='ovr'):
    """
    NB-SVM classifier: pipeline of MNBScaler+base_clf wrapped in OneVsRestClassifier /
    OneVsOneClassifier to support multiclass (MNBScaler supports only binary problems itself!).
    :param base_clf: classifier to use after MNBScaler, LogisticRegression or LinearSVC are usually used
    :param fit_scaler: look at MNBScaler class
    :param transform_scaler: look at MNBScaler class
    :param multi_class: ovr for OneVsRestClassifier, ovo for OneVsOneClassifier
    :return: OneVsRestClassifier / OneVsOneClassifier
    """
    mnb_scaler = MNBScaler(fit_scaler=fit_scaler, transform_scaler=transform_scaler)
    pipe = Pipeline([('mnbscaler', mnb_scaler), ('clf', base_clf)])
    if multi_class == 'ovr':
        return OneVsRestClassifier(pipe)
    elif multi_class == 'ovo':
        return OneVsOneClassifier(pipe)
    else:
        raise ValueError('Unsupported multi_class=%s, should be one of %r.'
                         % (multi_class, ['ovr', 'ovo']))
def test_model_detection(self):
    sklearn_model = LogisticRegression()
    pipeline_model = Pipeline([('log', sklearn_model)])
    xgb_model = XGBClassifier()
    nn_model = NNModel(100, 10)

    sklearn_opt = Optimizer(sklearn_model, [], lambda x: x)
    pipeline_opt = Optimizer(pipeline_model, [], lambda x: x)
    xgb_opt = Optimizer(xgb_model, [], lambda x: x)
    nn_opt = Optimizer(nn_model, [], lambda x: x)

    self.assertEqual(sklearn_opt.model_module, 'sklearn')
    self.assertEqual(pipeline_opt.model_module, 'pipeline')
    self.assertEqual(xgb_opt.model_module, 'xgboost')
    self.assertEqual(nn_opt.model_module, 'keras')
def train(self, train_size=0.8, k_folds=5):
    # retrieve data from DB and pre-process
    self._get_data()

    # perform train/test split
    self._get_train_test_split(train_size=train_size)

    # define text pre-processing pipeline
    text_pipeline = Pipeline([
        ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
        ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
    ])

    # define pipeline for pre-processing of numeric features
    numeric_pipeline = Pipeline([
        ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
        ('scaler', MinMaxScaler())
    ])

    # combine both steps into a single pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_processing', text_pipeline),
            ('num_processing', numeric_pipeline)
        ])),
        ('clf', self._estimator)
    ])

    self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
    gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

    X = self.data.iloc[self.train_inds_, :]
    y = self.data[LABEL].values[self.train_inds_]

    gs.fit(X, y)
    self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

    self.gs_ = gs
    self.model_ = gs.best_estimator_
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    X_fake_new_data = data['X_test'][-1, :].reshape(1, -1) + 0.5
    mms = preprocessing.MinMaxScaler().fit(data['X_train'])

    assert np.all((mms.transform(X_fake_new_data) > 1) ==
                  (pipeline.transform(X_fake_new_data) == 1))
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
def test_pipeline(get_models, get_transform, get_kernel):
    alg, model = get_models
    trans = get_transform()
    kernel = get_kernel() + WhiteKernel()

    pipe = Pipeline(steps=[(alg, model())])
    param_dict = {}
    if hasattr(model(), 'n_estimators'):
        param_dict[alg + '__n_estimators'] = [5]
    if hasattr(model(), 'kernel'):
        param_dict[alg + '__kernel'] = [kernel]
    param_dict[alg + '__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )

    np.random.seed(10)
    estimator.fit(X=1 + np.random.rand(10, 3), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -15.0
def test_svr_pipeline(get_transform, get_svr_kernel):
    trans = get_transform()
    pipe = Pipeline(steps=[('svr', svr())])
    param_dict = {'svr__kernel': [get_svr_kernel]}
    param_dict['svr__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )

    np.random.seed(1)
    estimator.fit(X=1 + np.random.rand(10, 5), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -10.0
def test_krige_pipeline(get_krige_method, get_variogram_model):
    pipe = Pipeline(steps=[('krige', Krige(method=get_krige_method))])
    param_dict = {'krige__variogram_model': [get_variogram_model]}

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True
                             )

    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 2)).astype(float)
    y = 5 * np.random.rand(20)
    estimator.fit(X=X, y=y)
    assert estimator.cv_results_['mean_train_score'][0] > -1.0
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
def get_binary(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                  average=False,
                                                  class_weight=None,
                                                  epsilon=0.1,
                                                  eta0=0.0,
                                                  fit_intercept=True,
                                                  l1_ratio=0.15,
                                                  learning_rate='optimal',
                                                  loss='log',
                                                  n_iter=10,
                                                  n_jobs=1,
                                                  penalty='l2',
                                                  power_t=0.5,
                                                  random_state=None,
                                                  shuffle=True,
                                                  verbose=0,
                                                  warm_start=False)))
    ])
def get_sgdc(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', SGDClassifier(alpha=0.0001,
                              average=False,
                              class_weight=None,
                              epsilon=0.1,
                              eta0=0.0,
                              fit_intercept=True,
                              l1_ratio=0.15,
                              learning_rate='optimal',
                              loss='log',
                              n_iter=10,
                              n_jobs=1,
                              penalty='l2',
                              power_t=0.5,
                              random_state=None,
                              shuffle=True,
                              verbose=0,
                              warm_start=False))
    ])
def custom_fnames(union):
    feature_names = []

    for name, trans, weight in union._iter():
        if hasattr(trans, 'get_feature_names'):
            this_fn = trans.get_feature_names()
        elif isinstance(trans, Pipeline):
            # we use pipelines to scale only specific attributes.
            # In this case, the vectorizer is first in the pipe.
            this_fn = trans.steps[0][-1].get_feature_names()
        else:
            raise AttributeError("Transformer %s (type %s) does not "
                                 "provide get_feature_names." % (
                                     str(name), type(trans).__name__))
        feature_names.extend([name + "__" + f for f in this_fn])

    return feature_names
def createPipeline(self):
    self.pipeline = Pipeline([
        ('model', GradientBoostingClassifier(
            loss=self.conf.loss,
            learning_rate=self.conf.learning_rate,
            n_estimators=self.conf.n_estimators,
            criterion=self.conf.criterion,
            max_depth=self.conf.max_depth,
            min_samples_split=self.conf.min_samples_split,
            min_samples_leaf=self.conf.min_samples_leaf,
            min_weight_fraction_leaf=self.conf.min_weight_fraction_leaf,
            subsample=self.conf.subsample,
            max_features=self.conf.max_features,
            max_leaf_nodes=self.conf.max_leaf_nodes,
            min_impurity_split=self.conf.min_impurity_decrease,
            presort=self.conf.presort))])
def gridSearch(data, params, true_k):
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])

    gsTfIdf = GridSearchCV(lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Initialize classifier.

    Args:
      a_clf (classifier or None): classifier to use or None for default
      a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf or LinearSVC(C=DFLT_C, **DFLT_PARAMS)
    self._gs = a_grid_search
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def model_from_pipeline(pipe):
    '''
    Extract the model from the last stage of a pipeline.

    Parameters
    ----------
    pipe : Pipeline or Estimator

    Returns
    -------
    model: Estimator
    '''
    if isinstance(pipe, Pipeline):
        # steps is a list of (name, estimator) pairs; take the estimator of the last step
        return pipe.steps[-1][1]
    else:
        return pipe
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)

    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(
            sources[0].stream_id, time_interval))
        return

    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)

    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_

    data = sources[1].window(interval, force_calculation=True)

    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp
                                  for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def test_ea_search_sklearn_elm_steps(label, do_predict):
    '''Test that EaSearchCV can work with numpy, dask.array,
    pandas.DataFrame, xarray.Dataset, xarray_filters.MLDataset
    '''
    from scipy.stats import lognorm
    est, make_data, sel, kw = args[label]
    parameters = {'kernel': ['linear', 'rbf'],
                  'C': lognorm(4)}
    if isinstance(est, (sk_Pipeline, Pipeline)):
        parameters = {'est__{}'.format(k): v
                      for k, v in parameters.items()}
    ea = EaSearchCV(est, parameters,
                    n_iter=4,
                    ngen=2,
                    model_selection=sel,
                    model_selection_kwargs=kw)
    X, y = make_data()
    ea.fit(X, y)
    if do_predict:
        pred = ea.predict(X)
        assert isinstance(pred, type(y))
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
    if embedding_type == 'google':
        embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
        embeddings_ = SimpleNamespace(
            X=embeddings_.syn0,
            vocab={w: v.index for w, v in embeddings_.vocab.items()})
    elif embedding_type == 'twitter':
        estimator = Pipeline([
            ('tokenize', MapCorporas(tokenize_)),
            ('word2vec', MergeSliceCorporas(CachedFitTransform(Word2Vec(
                sg=1, size=d, window=10, hs=0, negative=5, sample=1e-3,
                min_count=1, iter=20, workers=16
            ), self.memory))),
        ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs])
        embeddings_ = estimator.named_steps['word2vec'].estimator
        embeddings_ = SimpleNamespace(
            X=embeddings_.syn0,
            vocab={w: v.index for w, v in embeddings_.vocab.items()})
    else:
        embeddings_ = SimpleNamespace(X=np.empty((0, d)), vocab={})

    estimator = Pipeline([
        ('tokenize', MapCorporas(tokenize_)),
        # 0.25 is chosen so the unknown vectors have approximately the same
        # variance as google pre-trained ones
        ('embeddings', MapCorporas(Embeddings(
            embeddings_,
            rand=lambda shape: get_rng().uniform(-0.25, 0.25, shape).astype('float32'),
            include_zero=True
        ))),
    ])
    estimator.fit(construct_docs)
    return estimator.named_steps['embeddings'].estimator
def test_feature_union_fit_failure_multiple_metrics():
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])

    grid = {'union__bad__parameter': [0, 1, 2]}

    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    for key in scoring:
        check_scores_all_nan(gs, 'union__bad__parameter', score_key=key)
def test_feature_union_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)

    union = FeatureUnion([('tr0', MockClassifier()),
                          ('tr1', MockClassifier())])
    pipe = Pipeline([('union', union), ('est', MockClassifier())])

    grid = {'union__tr2__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {'union__transformer_list': [[('one', MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)
def test_hyperparameter_searcher_with_fit_params(cls, kwargs):
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    clf = CheckingClassifier(expected_fit_params=['spam', 'eggs'])
    pipe = Pipeline([('clf', clf)])
    searcher = cls(pipe, {'clf__foo_param': [1, 2, 3]}, cv=2, **kwargs)

    # The CheckingClassifier generates an assertion error if
    # a parameter is missing or has length != len(X).
    with pytest.raises(AssertionError) as exc:
        searcher.fit(X, y, clf__spam=np.ones(10))
    assert "Expected fit parameter(s) ['eggs'] not seen." in str(exc.value)

    searcher.fit(X, y, clf__spam=np.ones(10), clf__eggs=np.zeros(10))
    # Test with dask objects as parameters
    searcher.fit(X, y, clf__spam=da.ones(10, chunks=2),
                 clf__eggs=dask.delayed(np.zeros(10)))
def test_boston_OHE_plus_normalizer(self):

    data = load_boston()

    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Scaler", StandardScaler())])

    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, 'out')

    input_data = [dict(zip(data.feature_names, row)) for row in data.data]
    output_data = [{"out": row} for row in pl.transform(data.data)]

    result = evaluate_transformer(spec, input_data, output_data)
    assert result["num_errors"] == 0
def test_boston_OHE_plus_trees(self):

    data = load_boston()

    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1))])

    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, 'target')

    # Get predictions
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['prediction'] = pl.predict(data.data)

    # Evaluate it
    result = evaluate_regressor(spec, df, 'target', verbose=False)
    assert result["max_error"] < 0.0001
def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output dimension
        # handling is correct.
        model = Pipeline([("OHE", OneHotEncoder(categorical_features=categorical_features)),
                          ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
def __init__(self, lang=None, method=None, features=None):
    fs = []
    if 'unigram' in features:
        fs.append(word_unigrams())
    if 'bigram' in features:
        fs.append(word_bigrams())
    if 'spelling' in features:
        fs.append(avg_spelling_error(lang=lang))
    if 'punctuation' in features:
        fs.append(punctuation_features())
    if 'char' in features:
        fs.append(char_ngrams())
    fu = FeatureUnion(fs, n_jobs=1)
    self.pipeline = Pipeline([('features', fu),
                              ('scale', Normalizer()),
                              ('classifier', get_classifier(method=method))])
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def init_model():
    # Question-trunk features
    f_trunk = QuestionTrunkVectorizer(tokenizer=tokenize)

    # Word2Vec features
    f_word2vec = Question2VecVectorizer(tokenizer=tokenize)

    # Combined features (400 dims)
    union_features = FeatureUnion([
        ('f_trunk_lsa', Pipeline([
            ('trunk', f_trunk),
            # Dimensionality reduction: latent semantic analysis (LSA)
            ('lsa', TruncatedSVD(n_components=200, n_iter=10))
        ])),
        ('f_word2vec', f_word2vec),
    ])

    model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])
    return model
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.

    Note advanced users may wish to use their own custom pipeline.
    """

    # Note: this could be done more elegantly using FeatureUnions _if_ you are not using pandas dataframes for
    #   inputs of the later pipelines as FeatureUnion intrinsically converts outputs to numpy arrays.

    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),

        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables',
         hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])

    return pipeline
def grid_retrain_in_f(self, n_dim=500):
    rbf_map = RBFSampler(n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])

    # C_range = np.logspace(-5, 15, 21, base=2)
    # gamma_range = np.logspace(-15, 3, 19, base=2)
    # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
    # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
    # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
    # grid.fit(X, Y)
    #
    # rbf_svc2 = grid.best_estimator_

    rbf_svc2 = fourier_approx_svm
    rbf_svc2.fit(self.X_ex, self.y_ex)

    self.set_clf2(rbf_svc2)
    return self.benchmark()
def test_transform_then_prediction(self):
    with TemporaryDirectory() as temp:
        from sklearn.pipeline import Pipeline
        path = os.path.join(temp, 'audio.sph')
        urlretrieve(filename=path,
                    url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
        f = Pipeline([
            ('mspec', model.SpeechTransform('mspec', fs=8000, vad=False)),
            ('slice', model.Transform(lambda x: x[:, :40])),
            ('pred', model.SequentialModel(N.Dropout(0.3),
                                           N.Dense(20, activation=K.relu),
                                           N.Dense(10, activation=K.softmax))
             )
        ])
        x1 = f.predict(path)
        x2 = f.predict_proba(path)
        f = cPickle.loads(cPickle.dumps(f))
        y1 = f.predict(path)
        y2 = f.predict_proba(path)
        self.assertEqual(np.array_equal(x1, y1), True)
        self.assertEqual(np.array_equal(x2, y2), True)
def test_complex_transform(self):
    with TemporaryDirectory() as temp:
        from sklearn.pipeline import Pipeline
        path = os.path.join(temp, 'audio.sph')
        urlretrieve(filename=path,
                    url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
        f = Pipeline([
            ('step1', model.SpeechTransform('mspec', fs=8000, vad=True)),
            ('step2', model.Transform(lambda x: (x[0][:, :40],
                                                 x[1].astype(str)))),
            ('step3', model.Transform(lambda x: (np.sum(x[0]),
                                                 ''.join(x[1].tolist()))))
        ])
        x = f.transform(path)
        f = cPickle.loads(cPickle.dumps(f))
        y = f.transform(path)
        self.assertEqual(x[0], y[0])
        self.assertEqual(y[0], -3444229.0)
        self.assertEqual(x[1], y[1])
def transform_pca(clf_list):
    '''
    From classifier list to pipeline list of the same classifiers and PCA.
    '''
    pca = PCA()
    params_pca = {"pca__n_components": [2, 3, 4, 5, 10, 15, 20],
                  "pca__whiten": [False]}

    for j in range(len(clf_list)):
        name = "clf_" + str(j)
        clf, params = clf_list[j]

        # Parameters in GridSearchCV need to have double underscores
        # between specific classifiers.
        new_params = {}
        for key, value in params.iteritems():
            new_params[name + "__" + key] = value
        new_params.update(params_pca)

        clf_list[j] = (Pipeline([("pca", pca), (name, clf)]), new_params)

    return clf_list
def __init__(self, X=None, y=None, ax=None, scale=True, color=None,
             proj_dim=2, colormap=palettes.DEFAULT_SEQUENCE, **kwargs):
    super(PCADecomposition, self).__init__(ax=ax, **kwargs)

    # Data Parameters
    if proj_dim not in (2, 3):
        raise YellowbrickValueError("proj_dim object is not 2 or 3.")

    self.color = color
    self.pca_features_ = None
    self.scale = scale
    self.proj_dim = proj_dim
    self.pca_transformer = Pipeline([('scale', StandardScaler(with_std=self.scale)),
                                     ('pca', PCA(self.proj_dim, ))
                                     ])

    # Visual Parameters
    self.colormap = colormap
def test_select_best(self):
    """
    Test the select best fit estimator
    """
    X, y = ANSCOMBE[1]
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis]

    model = fit_select_best(X, y)
    self.assertIsNotNone(model)
    self.assertIsInstance(model, Pipeline)

    X, y = ANSCOMBE[3]
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis]

    model = fit_select_best(X, y)
    self.assertIsNotNone(model)
    self.assertIsInstance(model, LinearRegression)
def run(self):
    '''
    Runs a model with params p.
    '''
    self.clf.set_params(**self.params)

    # f = get_feature_transformer(self.parser)
    # self.X_train_fts = f.fit_transform(self.X_train)
    # self.X_test_fts = f.transform(self.X_test)

    self.pipeline = Pipeline([
        # ('feature_gen', f),
        ('clf', self.clf),
    ])
    self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]

    if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
        self.importances = self.clf.feature_importances_
    elif self.model_type in ['SVM', 'LR', 'SGD']:
        self.importances = self.clf.coef_[0]
def __init__(self, path, etype, **kwargs):
    super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
    self.basedir = "models/ensemble/"
    self.goldstd = kwargs.get("goldstd")
    self.data = {}
    self.offsets = []
    self.pipeline = Pipeline(
        [
            #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
            #('clf', SGDClassifier())
            # ('clf', svm.NuSVC(nu=0.01 ))
            ('clf', RandomForestClassifier(class_weight={False: 1, True: 1}, n_jobs=-1,
                                           criterion="entropy", warm_start=True))
            # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
            # ('clf', MultinomialNB())
            # ('clf', GaussianNB())
            #('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
            #('clf', svm.SVC(kernel="linear", C=2))
            #('clf', DummyClassifier(strategy="constant", constant=True))
        ])
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20),
                                                       min_df=0.0, max_df=0.7)),
                              #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              #('clf', SGDClassifier())
                              #('clf', svm.NuSVC(nu=0.01 ))
                              #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              #('clf', DummyClassifier(strategy="constant", constant=True))
                              ])
def __init__(self, path, goldset, base_model, features=None, types=None):
    self.ensemble_pipeline = Pipeline([
        ('clf', ensemble.RandomForestClassifier(criterion="gini", n_estimators=1000))
    ])
    self.base_model = base_model
    self.path = path
    self.predicted = []
    self.res = None
    self.ids, self.data, self.labels = [], [], []
    self.goldset = goldset
    if types:  # features is a list of classifier names
        self.types = types
    else:
        self.types = []
    self.feature_names = []
    for t in self.types:
        self.feature_names.append(t)
        self.feature_names.append(t + "_ssm")
    for f in features:
        self.feature_names.append(f)
def fit(self, x, y=None):
    if y is not None:
        xdot = y
    else:
        xdot = self.derivative.transform(x)

    if self.operators is not None:
        feature_transformer = SymbolicFeatures(
            exponents=np.linspace(1, self.degree, self.degree),
            operators=self.operators)
    else:
        feature_transformer = PolynomialFeatures(degree=self.degree, include_bias=False)

    steps = [("features", feature_transformer),
             ("model", STRidge(alpha=self.alpha, threshold=self.threshold, **self.kw))]
    self.model = MultiOutputRegressor(Pipeline(steps), n_jobs=self.n_jobs)
    self.model.fit(x, xdot)

    self.n_input_features_ = self.model.estimators_[0].steps[0][1].n_input_features_
    self.n_output_features_ = self.model.estimators_[0].steps[0][1].n_output_features_
    return self
def get_best_params_and_model(self):
    """
    Returns the best parameters and model after optimization.

    Keyword arguments:
    None
    """
    best_params_idx = np.argmax([score for score, params in self.hyperparam_history])
    best_params = self.hyperparam_history[best_params_idx][1]
    if isinstance(self.model, Pipeline):
        all_params = self.model.get_params()
        all_params.update(best_params)
        best_model = self.model.set_params(**all_params)
    else:
        best_model = self.model.__class__(**dict(self.model.get_params(), **best_params))
    return best_params, best_model