def unscaled_pipelines():
    """Return one single-step pipeline per model that needs no feature scaling.

    Only the decision tree is currently enabled; the keyword dictionaries
    below belong to the models that are commented out in the list.
    """
    # Random forest parameters
    random_forest_kwargs = dict(
        n_estimators=10,
        criterion='mse',
        random_state=_RANDOM_STATE,
        n_jobs=cpu_count(),
        verbose=True,
    )

    # Gradient boosting parameters
    gradient_boost_kwargs = dict(
        random_state=_RANDOM_STATE,
        verbose=1,
    )

    enabled_models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]

    # Each model becomes its own pipeline with no preprocessing step.
    return [make_pipeline(model) for model in enabled_models]
def fit(self, X, y):
    """Fit TSclassifier.

    Parameters
    ----------
    X : ndarray, shape (n_trials, n_channels, n_channels)
        ndarray of SPD matrices.
    y : ndarray shape (n_trials, 1)
        labels corresponding to each trial.

    Returns
    -------
    self : TSclassifier instance
        The TSclassifier instance.
    """
    # Tangent-space mapping followed by the user-supplied classifier.
    tangent_space = TangentSpace(metric=self.metric, tsupdate=self.tsupdate)
    pipe = make_pipeline(tangent_space, self.clf)
    self._pipe = pipe
    pipe.fit(X, y)
    return self
def sample_pipelines(pca_kernels=None, svr_kernels=None):
    """
    Pipelines that can't be fit in a reasonable amount of time on the whole
    dataset.

    :param pca_kernels: kernels for the KernelPCA + LinearRegression
        candidates; defaults to poly, rbf, sigmoid and cosine
    :param svr_kernels: kernels for the SVR candidates; defaults to poly,
        rbf and sigmoid
    :return: list of pipelines, each starting with a StandardScaler
    """
    if pca_kernels is None:
        pca_kernels = ['poly', 'rbf', 'sigmoid', 'cosine']
    if svr_kernels is None:
        svr_kernels = ['poly', 'rbf', 'sigmoid']

    # A candidate is either a single estimator or a list of chained steps.
    candidates = [
        [KernelPCA(n_components=2, kernel=kernel), LinearRegression()]
        for kernel in pca_kernels
    ]
    candidates += [
        SVR(kernel=kernel, verbose=True, cache_size=1000)
        for kernel in svr_kernels
    ]

    pipelines = []
    for candidate in candidates:
        tail = candidate if isinstance(candidate, list) else [candidate]
        pipelines.append(make_pipeline(StandardScaler(), *tail))
    return pipelines
def build_pipeline(base_estimator, parameters):
    """
    Builds a pipeline where the base estimator is initialized with given
    parameters.

    The `@preprocessor` parameter is a special parameter that will determine
    which pre-processing steps to use.

    :param base_estimator: The base estimator of the pipeline
    :param parameters: The parameters for the base estimator, includes
        special parameters for the pipeline itself
    :return: The (pipeline with the) base estimator, initialized with given
        parameters
    """
    # Work on a shallow copy so the helpers receive their own dictionary.
    working_params = copy(parameters)
    preprocessors = Builder.extract_preprocessors(working_params)
    estimator = Builder.setup_estimator(base_estimator, working_params)

    if preprocessors is not None:
        steps = list(preprocessors) + [estimator]
        return make_pipeline(*steps)
    return estimator
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    """Reduce `doc_vecs` with a truncated SVD.

    :param doc_vecs: matrix to decompose
    :param n_features: passed to TruncatedSVD as its first positional argument
    :param normalize: when true, chain a Normalizer after the SVD
    :param flip: when true, transpose the input before fitting and transpose
        the result back afterwards
    :return: the transformed matrix
    """
    svd = TruncatedSVD(n_features)
    reducer = make_pipeline(svd, Normalizer(copy=False)) if normalize else svd

    if not flip:
        return reducer.fit_transform(doc_vecs)

    flipped = reducer.fit_transform(doc_vecs.transpose())
    return flipped.transpose()
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                        0, 0, 0, 0, 0])

    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=splitter)

    assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                        0, 0, 0, 0, 0])

    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    clf = make_pipeline(MDR(), LogisticRegression())
    # n_jobs=-1 is the only difference from the non-parallel variant.
    cv_scores = cross_val_score(clf, features, classes, cv=splitter, n_jobs=-1)

    assert np.mean(cv_scores) > 0.
def __init__(self, model_type=DEFAULT_MODEL_TYPE):
    """
    Sets up the model and pipeline for learning and predicting.

    :param model_type: only 'SVR' model is supported for now
    """
    assert model_type == 'SVR', (
        "Model '{}' is not supported. "
        "We support only SVR for now.".format(model_type)
    )

    self._model_type = model_type
    params = BTCForecast.DEFAULT_SVR_MODEL_PARAMS
    self._model_params = params

    # set up SVR pipeline: standardize the inputs, then regress with SVR
    self._scaler = preprocessing.StandardScaler(
        copy=True,
        with_mean=True,
        with_std=True,
    )
    self._model = SVR(
        kernel=params['kernel'],
        epsilon=params['epsilon'],
        C=params['c'],
        gamma=params['gamma'],
    )
    self._pipeline = make_pipeline(self._scaler, self._model)

    self.has_learned = False
def test_check_scoring_gridsearchcv():
    """check_scoring works on GridSearchCV and on a pipeline.

    Slightly redundant non-regression test.
    """
    search = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]})
    search_scorer = check_scoring(search, "f1")
    assert_true(isinstance(search_scorer, _PredictScorer))

    pipeline = make_pipeline(LinearSVC())
    pipeline_scorer = check_scoring(pipeline, "f1")
    assert_true(isinstance(pipeline_scorer, _PredictScorer))

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having
    # a fit.
    scores = cross_val_score(
        EstimatorWithFit(),
        [[1], [2], [3]],
        [1, 0, 1],
        scoring=DummyScorer(),
    )
    assert_array_equal(scores, 1)
def LDAPageVctorizer(*, n_topics: int, min_df: int, max_features: int,
                     max_iter: int, ngram_range: Tuple[int, int],
                     vocabulary=None, batch_size: int=4096, verbose=1):
    """Build a vectorizer -> online LDA pipeline.

    All arguments are keyword-only. ``min_df``, ``max_features``,
    ``ngram_range`` and ``vocabulary`` are forwarded to ``_vectorizer``; the
    remaining ones configure ``LatentDirichletAllocation``.
    """
    # NOTE(review): `n_topics` appears to have been renamed `n_components` in
    # later scikit-learn releases - confirm against the pinned version.
    lda_options = dict(
        learning_method='online',
        n_topics=n_topics,
        batch_size=batch_size,
        evaluate_every=2,
        verbose=verbose,
        max_iter=max_iter,
        n_jobs=1,
    )
    vectorizer = _vectorizer(
        min_df=min_df,
        max_features=max_features,
        ngram_range=ngram_range,
        vocabulary=vocabulary,
    )
    return make_pipeline(vectorizer, LatentDirichletAllocation(**lda_options))
def test_in_pipeline():
    """A pipeline ending in LogisticRegression fits on chunked data without raising."""
    data, target = make_classification(n_samples=100, n_features=5, chunks=50)
    model = make_pipeline(DoNothingTransformer(), LogisticRegression())
    model.fit(data, target)
def test_gridsearch():
    """A grid search over the pipeline's LogisticRegression `C` fits without raising."""
    data, target = make_classification(n_samples=100, n_features=5, chunks=50)
    param_grid = {'logisticregression__C': [1000, 100, 10, 2]}
    model = make_pipeline(DoNothingTransformer(), LogisticRegression())
    searcher = dcv.GridSearchCV(model, param_grid, cv=3)
    searcher.fit(data, target)
def get_pipeline(clf=None):
    """Return a DictVectorizer -> classifier pipeline.

    :param clf: final estimator of the pipeline. When omitted, a new
        ``RandomForestClassifier(n_estimators=100, class_weight="balanced")``
        is created for this call.
    :return: the assembled pipeline (not fitted)
    """
    # The default used to be an estimator instance built once at definition
    # time. make_pipeline does not clone its steps, so every pipeline created
    # with the default shared (and re-fitted) the same classifier object.
    if clf is None:
        clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    return make_pipeline(DictVectorizer(sparse=False), clf)
def truncated_svd(self):
    """Fit an SVD + Normalizer pipeline on the bag-of-words matrix and print SVD diagnostics.

    Prints the first SVD component, the explained-variance ratios and their
    sum. The reduced matrix itself is discarded.
    """
    # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
    svd = TruncatedSVD(self.dimensions)
    make_pipeline(svd, Normalizer(copy=False)).fit_transform(self.bag_of_words_matrix)

    print(svd.components_[0])
    print(svd.explained_variance_ratio_)
    print(svd.explained_variance_ratio_.sum())
def scaled_pipelines():
    """Return pipelines that standardize and PCA-reduce the input before the model.

    Only plain linear regression is currently enabled; the parameter
    definitions below belong to the models that are commented out.
    """
    # Model parameters

    # RANSAC parameters
    # 500 max trials takes 90s
    ransac_kwargs = dict(
        max_trials=1000,
        min_samples=5000,
        loss='absolute_loss',
        residual_threshold=2.0,
        random_state=_RANDOM_STATE,
    )

    # Ridge CV parameters
    alphas = [.01, .1, 1, 10]

    # Model instances: a single estimator or a list of chained steps
    candidates = [
        LinearRegression(),
        # [PolynomialFeatures(degree=2), LinearRegression()],
        # [PolynomialFeatures(degree=3), LinearRegression()],
        # RANSACRegressor(base_estimator=LinearRegression(), **ransac_kwargs),
        # RANSACRegressor with polynomial regression?
        # RidgeCV(alphas=alphas),
        # LassoCV(),  # Alphas set automatically by default
        # ElasticNetCV(l1_ratio=0.5),  # Same as default
        # [PolynomialFeatures(degree=2), ElasticNetCV(l1_ratio=0.5)],
        # SGDRegressor(),
    ]

    # Pipelines
    pipelines = []
    for candidate in candidates:
        tail = candidate if isinstance(candidate, list) else [candidate]
        # Every pipeline gets its own scaler and PCA instance.
        pipelines.append(
            make_pipeline(StandardScaler(), PCA(**_PCA_KWARGS), *tail)
        )
    return pipelines
def fit():
    """Generate data, run the preprocessing pipeline and incrementally fit `clf`.

    Returns the tuple ``(pipe, clf, sfm)``.

    NOTE(review): `clf` is never assigned inside this function, so it must
    come from an enclosing/module scope - confirm it exists, otherwise the
    first ``partial_fit`` call raises NameError.
    NOTE(review): `pipe` is returned without ``fit`` being called on it here.
    """
    X, y = generate()
    # Wrap both the features and the target as dask collections (10 partitions).
    dX = dd.from_pandas(X, npartitions=10)
    y = dd.from_pandas(y, npartitions=10)
    pre_pipe = make_pipeline(
        CategoricalEncoder(),
        DummyEncoder(),
        Imputer(),
        SGDRegressor(),
    )
    pipe = make_pipeline(
        SelectFromModel(pre_pipe),
        GradientBoostingRegressor(),
    )
    # NOTE(review): pre_pipe ends in SGDRegressor; fit_transform on it relies
    # on that final step exposing a transform - verify for the sklearn
    # version in use.
    X_ = pre_pipe.fit_transform(dX)
    # Five partial_fit passes over each partition, one partition at a time.
    for i in range(X_.npartitions):
        for j in range(5):
            print(i, j)
            X_sub = X_.get_partition(i).compute()
            y_sub = y.get_partition(i).compute()
            clf.partial_fit(X_sub, y_sub)
    # prefit=True: the selector wraps `clf` as already fitted.
    sfm = SelectFromModel(clf, prefit=True)
    return pipe, clf, sfm
def regression_pipeline(regression_model):
    """Return a pipeline that standardizes features before `regression_model`.

    :param regression_model: estimator used as the final pipeline step
    :return: ``StandardScaler -> regression_model`` pipeline (not fitted)
    """
    # make_pipeline takes the steps as separate positional arguments. The
    # previous call wrapped them in one list, which made the list itself the
    # pipeline's only "step" instead of two estimators.
    return make_pipeline(StandardScaler(), regression_model)
def build_classifier(base_clf=None):
    """Return an imputer -> scaler -> classifier pipeline.

    :param base_clf: final estimator of the pipeline. When omitted, a new
        ``svm.SVC()`` is created for this call.
    :return: the assembled pipeline (not fitted)
    """
    # The default used to be a single SVC instance created at definition
    # time; make_pipeline does not clone its steps, so every default
    # pipeline shared (and re-fitted) that one object.
    if base_clf is None:
        base_clf = svm.SVC()

    # The imputer is for "use_taxonomy", and shouldn't affect if it's False.
    # TODO: should also try with other imputer strategies
    return pipeline.make_pipeline(
        preprocessing.Imputer(strategy='most_frequent'),
        preprocessing.StandardScaler(),
        base_clf,
    )


# noinspection PyPep8Naming
def make_ward_clustering(self, short_filenames, input_texts):
    """Cluster `input_texts` with Ward agglomerative clustering and report the result.

    Texts are count-vectorized, reduced to two SVD components and normalized
    before clustering. The per-cluster listing of `short_filenames` is
    emitted through ``self.signals.PrintInfo``, written to ``clusters.txt``
    in the ``WARD/`` output sub-directory and passed to the plot helper.
    """
    target_dir = self.output_dir + 'WARD/'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("?????? TF-IDF...")
        tf_idf_msg = self.calculate_and_write_tf_idf(target_dir + 'tf_idf.csv', input_texts)
        self.signals.PrintInfo.emit(tf_idf_msg)

    # Bag of words -> 2-component LSA embedding.
    X = CountVectorizer().fit_transform(input_texts)
    X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)

    clusterer = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
    labels = clusterer.fit_predict(X)

    self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

    # One header per cluster id, then its documents, then a blank line.
    report_parts = []
    for cluster_id in range(max(labels) + 1):
        report_parts.append('??????? ' + str(cluster_id) + ':\n')
        report_parts.extend(
            ' ' + str(document) + '\n'
            for label, document in zip(labels, short_filenames)
            if label == cluster_id
        )
        report_parts.append('\n')
    report = ''.join(report_parts)

    clusters_path = target_dir + 'clusters.txt'
    self.signals.PrintInfo.emit(report)
    self.signals.PrintInfo.emit('????????? ?:' + str(clusters_path))
    writeStringToFile(report, clusters_path)
    self.draw_clusters_plot(X, labels, short_filenames)
def make_spectral_clustering(self, short_filenames, input_texts):
    """Cluster `input_texts` with spectral clustering and report the result.

    Texts are count-vectorized, reduced to two SVD components and normalized
    before clustering. The per-cluster listing of `short_filenames` is
    emitted through ``self.signals.PrintInfo``, written to ``clusters.txt``
    in the ``spectral/`` output sub-directory and passed to the plot helper.
    """
    target_dir = self.output_dir + 'spectral/'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("?????? TF-IDF...")
        tf_idf_msg = self.calculate_and_write_tf_idf(target_dir + 'tf_idf.csv', input_texts)
        self.signals.PrintInfo.emit(tf_idf_msg)

    # Bag of words -> 2-component LSA embedding.
    X = CountVectorizer().fit_transform(input_texts)
    X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)

    clusterer = SpectralClustering(n_clusters=self.spectral_clusters_count)
    labels = clusterer.fit_predict(X)

    self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

    # One header per cluster id, then its documents, then a blank line.
    report_parts = []
    for cluster_id in range(max(labels) + 1):
        report_parts.append('??????? ' + str(cluster_id) + ':\n')
        report_parts.extend(
            ' ' + str(document) + '\n'
            for label, document in zip(labels, short_filenames)
            if label == cluster_id
        )
        report_parts.append('\n')
    report = ''.join(report_parts)

    clusters_path = target_dir + 'clusters.txt'
    self.signals.PrintInfo.emit(report)
    self.signals.PrintInfo.emit('????????? ?:' + str(clusters_path))
    writeStringToFile(report, clusters_path)
    self.draw_clusters_plot(X, labels, short_filenames)


# aa = Affinity Propagation
def make_aa_clustering(self, short_filenames, input_texts):
    """Cluster `input_texts` with affinity propagation and report the result.

    Texts are count-vectorized, reduced to two SVD components and normalized
    before clustering. The per-cluster listing of `short_filenames` is
    emitted through ``self.signals.PrintInfo``, written to ``clusters.txt``
    in the ``affinity_propagation/`` output sub-directory and passed to the
    plot helper.
    """
    target_dir = self.output_dir + 'affinity_propagation/'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("?????? TF-IDF...")
        tf_idf_msg = self.calculate_and_write_tf_idf(target_dir + 'tf_idf.csv', input_texts)
        self.signals.PrintInfo.emit(tf_idf_msg)

    # Bag of words -> 2-component LSA embedding.
    X = CountVectorizer().fit_transform(input_texts)
    X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)

    clusterer = AffinityPropagation(
        damping=self.aa_damping,
        max_iter=self.aa_max_iter,
        convergence_iter=self.aa_no_change_stop,
    )
    labels = clusterer.fit_predict(X)

    self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

    # One header per cluster id, then its documents, then a blank line.
    report_parts = []
    for cluster_id in range(max(labels) + 1):
        report_parts.append('??????? ' + str(cluster_id) + ':\n')
        report_parts.extend(
            ' ' + str(document) + '\n'
            for label, document in zip(labels, short_filenames)
            if label == cluster_id
        )
        report_parts.append('\n')
    report = ''.join(report_parts)

    clusters_path = target_dir + 'clusters.txt'
    self.signals.PrintInfo.emit(report)
    self.signals.PrintInfo.emit('????????? ?:' + str(clusters_path))
    writeStringToFile(report, clusters_path)
    self.draw_clusters_plot(X, labels, short_filenames)
def make_birch_clustering(self, short_filenames, input_texts):
    """Cluster `input_texts` with Birch and report the result.

    Texts are count-vectorized, reduced to two SVD components and normalized
    before clustering. The per-cluster listing of `short_filenames` is
    emitted through ``self.signals.PrintInfo``, written to ``clusters.txt``
    in the ``birch/`` output sub-directory and passed to the plot helper.
    """
    target_dir = self.output_dir + 'birch/'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("?????? TF-IDF...")
        tf_idf_msg = self.calculate_and_write_tf_idf(target_dir + 'tf_idf.csv', input_texts)
        self.signals.PrintInfo.emit(tf_idf_msg)

    # Bag of words -> 2-component LSA embedding.
    X = CountVectorizer().fit_transform(input_texts)
    X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)

    clusterer = Birch(
        threshold=self.birch_threshold,
        branching_factor=self.birch_branching_factor,
        n_clusters=self.birch_clusters_count,
    )
    labels = clusterer.fit_predict(X)

    self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

    # One header per cluster id, then its documents, then a blank line.
    report_parts = []
    for cluster_id in range(max(labels) + 1):
        report_parts.append('??????? ' + str(cluster_id) + ':\n')
        report_parts.extend(
            ' ' + str(document) + '\n'
            for label, document in zip(labels, short_filenames)
            if label == cluster_id
        )
        report_parts.append('\n')
    report = ''.join(report_parts)

    clusters_path = target_dir + 'clusters.txt'
    self.signals.PrintInfo.emit(report)
    self.signals.PrintInfo.emit('????????? ?:' + str(clusters_path))
    writeStringToFile(report, clusters_path)
    self.draw_clusters_plot(X, labels, short_filenames)
def doPCA(X, output_columns_count):
    """Reduce `X` to `output_columns_count` columns and return the result.

    Despite the name, the reduction is a TruncatedSVD followed by a
    Normalizer, fitted on `X` itself.
    """
    reducer = make_pipeline(
        TruncatedSVD(output_columns_count),
        Normalizer(copy=False),
    )
    return reducer.fit_transform(X)
def test_train():
    """Train the formula/regression pipeline on the training CSV and wrap it in a StreamPipeline.

    Reads ``data/training.csv``, converts units, derives the target with a
    formula transformer, fits a ``FormulaTransformer -> LinearRegression``
    sink and finally issues one prediction through ``StreamPipeline``.
    """
    # define input streams
    names = ['v', 'p', 't', 'w', 'a']

    # define first transformation: pair every source unit with its target
    units = ["knot", "in_Hg", "celsius", "force_pound", "degree"]
    tounits = ["m/s", "pascal", "kelvin", "newton", "radian"]
    tuple_units = list(zip(units, tounits))
    s1 = UnitTransformer(tuple_units)

    # second layer of transformation
    constants = {"s": 61.0, "R": 286.9}
    labels = ["2*w/(v**2*(p/R/t)*s)"]
    s2 = FormulaTransformer(labels, names, constants)

    # sink ( any sink transformation could be used to predict )
    # no fit_transform rule, can only predict
    features = ["a"]
    s3 = make_pipeline(FormulaTransformer(features, names), LinearRegression())

    # train the sink on the recorded data
    with open("data/training.csv") as f:
        df = pd.read_csv(f, names=names, header=0)

    # awkward transformation from dataframe to numpy matrix
    # could use panda sklearn to solve
    # DataFrame.as_matrix was removed from pandas; column selection plus
    # .values yields the same ndarray on old and new versions.
    ndarray = df[names].values
    rawX = s1.fit_transform(ndarray)
    y = s2.fit_transform(rawX)
    X = rawX
    s3.fit(X, y)
    y_ = s3.predict(X)
    # Same text as the former Python 2 print statement, but valid on
    # Python 2 and Python 3.
    print("%s %s" % (X.shape, y_.shape))
    # plt.scatter(FormulaTransformer(features, names).fit_transform(X), y_)
    # plt.show()

    # wrap the process as StreamPipeline for learning machine
    sp = StreamPipeline(names, s3)
    sp.predict(v=1.0, p=2.0, t=3.0, w=4.0, a=5.0)
def _get_pipeline(self, name):
    """Return `self.pipeline` chained with the classifier registered under `name`."""
    front = self.pipeline
    classifier = classifiers[name]
    return make_pipeline(front, classifier)
def featuresByLSA(features, ncomponents=100):
    """Return `features` transformed by a TruncatedSVD + Normalizer pipeline.

    :param features: matrix to fit and transform
    :param ncomponents: ``n_components`` of the TruncatedSVD step
    """
    lsa = make_pipeline(
        TruncatedSVD(n_components=ncomponents),
        Normalizer(copy=False),
    )
    return lsa.fit_transform(features)
def fit_quadratic(X, y):
    """
    Uses OLS with Polynomial order 2.

    Returns the fitted ``PolynomialFeatures -> LinearRegression`` pipeline.
    """
    expansion = PolynomialFeatures(2)
    regressor = linear_model.LinearRegression()
    quadratic = make_pipeline(expansion, regressor)
    quadratic.fit(X, y)
    return quadratic
def test_relieff_pipeline():
    """Ensure that ReliefF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=100, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(scores) > 0.7
def test_relieff_pipeline_parallel():
    """Ensure that ReliefF works in a sklearn pipeline where cross_val_score is parallelized"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=100)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7
def test_relieffpercent_pipeline():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline when it is parallelized"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=0.1, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(scores) > 0.7
def test_relieffpercent_pipeline_parallel():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline where cross_val_score is parallelized"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=0.1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7
def test_surf_pipeline():
    """Ensure that SURF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(240932)

    selector = SURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(scores) > 0.7
def test_surfstar_pipeline():
    """Ensure that SURF* works in a sklearn pipeline when it is parallelized"""
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(scores) > 0.7
def test_surfstar_pipeline_parallel():
    """Ensure that SURF* works in a sklearn pipeline where cross_val_score is parallelized"""
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7
def test_multisurf_pipeline():
    """Ensure that MultiSURF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(320931)

    selector = MultiSURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(scores) > 0.7
def test_multisurf_pipeline_parallel():
    """Ensure that MultiSURF works in a sklearn pipeline where cross_val_score is parallelized"""
    np.random.seed(320931)

    selector = MultiSURF(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7
def test_multisurfstar_pipeline():
    """Ensure that MultiSURF* works in a sklearn pipeline when it is parallelized"""
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(scores) > 0.7
def test_relieff_pipeline_cont_endpoint():
    """Ensure that ReliefF works in a sklearn pipeline with continuous endpoint data"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=100, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3)
    assert abs(np.mean(scores)) < 0.5
def test_relieffpercent_pipeline_cont_endpoint():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline with continuous endpoint data"""
    # Previously named test_relieff_pipeline_cont_endpoint, the same name as
    # the fixed-neighbour test defined just before it. The second definition
    # replaced the first at import time, so that test was never collected.
    # The new name follows the "relieffpercent" convention of the sibling
    # tests that use a fractional n_neighbors.
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=0.1, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3)
    assert abs(np.mean(scores)) < 0.5
def test_surf_pipeline_cont_endpoint():
    """Ensure that SURF works in a sklearn pipeline with continuous endpoint data"""
    np.random.seed(240932)

    selector = SURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3)
    assert abs(np.mean(scores)) < 0.5
def test_surfstar_pipeline_cont_endpoint():
    """Ensure that SURF* works in a sklearn pipeline with continuous endpoint data"""
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3)
    assert abs(np.mean(scores)) < 0.5
def test_multisurf_pipeline_cont_endpoint():
    """Ensure that MultiSURF works in a sklearn pipeline with continuous endpoint data"""
    np.random.seed(320931)

    selector = MultiSURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3)
    assert abs(np.mean(scores)) < 0.5
def test_relieff_pipeline_mixed_attributes():
    """Ensure that ReliefF works in a sklearn pipeline with mixed attributes"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=100, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3)
    assert np.mean(scores) > 0.7
def test_relieffpercent_pipeline_mixed_attributes():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline with mixed attributes"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=0.1, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3)
    assert np.mean(scores) > 0.7
def test_surf_pipeline_mixed_attributes():
    """Ensure that SURF works in a sklearn pipeline with mixed attributes"""
    np.random.seed(240932)

    selector = SURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3)
    assert np.mean(scores) > 0.7
def test_surfstar_pipeline_mixed_attributes():
    """Ensure that SURF* works in a sklearn pipeline with mixed attributes"""
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3)
    assert np.mean(scores) > 0.7
def test_multisurf_pipeline_mixed_attributes():
    """Ensure that MultiSURF works in a sklearn pipeline with mixed attributes"""
    np.random.seed(320931)

    selector = MultiSURF(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    scores = cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3)
    assert np.mean(scores) > 0.7
def test_relieff_pipeline_missing_values():
    """Ensure that ReliefF works in a sklearn pipeline with missing values"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=100, n_jobs=-1)
    # The imputer sits between feature selection and the classifier.
    imputer = Imputer()
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, imputer, forest)

    scores = cross_val_score(clf, features_missing_values, labels_missing_values, cv=3)
    assert np.mean(scores) > 0.7
def test_relieffpercent_pipeline_missing_values():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline with missing values"""
    np.random.seed(49082)

    selector = ReliefF(n_features_to_select=2, n_neighbors=0.1, n_jobs=-1)
    # The imputer sits between feature selection and the classifier.
    imputer = Imputer()
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, imputer, forest)

    scores = cross_val_score(clf, features_missing_values, labels_missing_values, cv=3)
    assert np.mean(scores) > 0.7