Python sklearn.datasets 模块,make_classification() 实例源码

我们从 Python 开源项目中提取了以下 50 个代码示例,用于说明如何使用 sklearn.datasets.make_classification()。

项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_feature_union_fit_failure():
    """Grid-search a pipeline whose FeatureUnion contains a FailingClassifier.

    The search must raise ``ValueError`` first, and after ``error_score`` is
    set to NaN it must warn and record NaN scores instead.
    """
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)

    # One well-behaved member and one failing member, the latter weighted.
    union = FeatureUnion([('good', MockClassifier()),
                          ('bad', FailingClassifier())],
                         transformer_weights={'bad': 0.5})
    pipeline = Pipeline([('union', union), ('clf', MockClassifier())])

    param_grid = {'union__bad__parameter': [0, 1, 2]}
    search = dcv.GridSearchCV(pipeline, param_grid, refit=False, scoring=None)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        search.fit(features, labels)

    # Check that grid scores were set to error_score on failure
    search.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        search.fit(features, labels)
    check_scores_all_nan(search, 'union__bad__parameter')
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_improvement(self):
        """HyperoptOptimizer must return a model that outscores the initial fit.

        Both scores are computed on the same data the models were fitted on,
        and every recorded hyperopt trial must have status ``'ok'``.
        """
        np.random.seed(4)
        dataset_kwargs = dict(n_samples=100,
                              n_features=45,
                              n_informative=15,
                              n_redundant=5,
                              class_sep=1,
                              n_clusters_per_class=4,
                              flip_y=0.4)
        X, y = make_classification(**dataset_kwargs)

        # Reference score of the forest before any tuning.
        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)
        baseline_score = clf_score(y, forest.predict(X))

        depth_param = Parameter('max_depth', 'integer', lower=1, upper=10)
        optimizer = HyperoptOptimizer(forest, [depth_param], clf_score)
        _, tuned = optimizer.fit(X_train=X, y_train=y, n_iters=10)
        tuned.fit(X, y)
        tuned_score = clf_score(y, tuned.predict(X))
        self.assertTrue(tuned_score > baseline_score)

        for trial_status in optimizer.trials.statuses():
            self.assertEqual(trial_status, 'ok')
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_improvement(self):
        """GridSearchOptimizer must return a model that outscores the initial fit.

        Both scores are computed on the same data the models were fitted on.
        """
        np.random.seed(4)
        dataset_kwargs = dict(n_samples=100,
                              n_features=45,
                              n_informative=15,
                              n_redundant=5,
                              class_sep=1,
                              n_clusters_per_class=4,
                              flip_y=0.4)
        X, y = make_classification(**dataset_kwargs)

        # Reference score of the forest before any tuning.
        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)
        baseline_score = clf_score(y, forest.predict(X))

        depth_param = Parameter('max_depth', 'integer', lower=1, upper=10)
        optimizer = GridSearchOptimizer(forest, [depth_param], clf_score,
                                        {'max_depth': 5})
        _, tuned = optimizer.fit(X_train=X, y_train=y)
        tuned.fit(X, y)
        tuned_score = clf_score(y, tuned.predict(X))
        self.assertTrue(tuned_score > baseline_score)
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_objective_function(self):
        """``objective`` must report a score of 1 on a widely separated dataset."""
        np.random.seed(4)
        dataset_kwargs = dict(n_samples=100,
                              n_features=10,
                              n_informative=10,
                              n_redundant=0,
                              class_sep=100,
                              n_clusters_per_class=1,
                              flip_y=0.0)
        X, y = make_classification(**dataset_kwargs)

        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)

        # model should fit the data perfectly; the same arrays are passed
        # twice, so the objective is evaluated on the data it was given to fit
        outcome = objective(forest, 'sklearn', clf_score,
                            X, y, X, y,
                            forest.get_params())
        self.assertEqual(outcome[0], 1)
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_improvement(self):
        """GeneticOptimizer must return a model that outscores the initial fit.

        Both scores are computed on the same data the models were fitted on.
        """
        np.random.seed(4)
        dataset_kwargs = dict(n_samples=100,
                              n_features=45,
                              n_informative=15,
                              n_redundant=5,
                              class_sep=1,
                              n_clusters_per_class=4,
                              flip_y=0.4)
        X, y = make_classification(**dataset_kwargs)

        # Reference score of the forest before any tuning.
        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)
        baseline_score = clf_score(y, forest.predict(X))

        depth_param = Parameter('max_depth', 'integer', lower=1, upper=10)
        initial_population = 4
        noise_by_param = {'max_depth': 0.4,
                          'learning_rate': 0.05,
                          'reg_lambda': 0.5}
        optimizer = GeneticOptimizer(forest, [depth_param], clf_score,
                                     initial_population, 'RouletteWheel',
                                     noise_by_param)

        _, tuned = optimizer.fit(X_train=X, y_train=y, n_iters=30)
        tuned.fit(X, y)
        tuned_score = clf_score(y, tuned.predict(X))
        self.assertTrue(tuned_score > baseline_score)
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_expected_improvement_tractable(self):
        """BayesianOptimizer with ``expected_improvement`` must succeed and improve.

        The optimizer's ``success`` flag is checked before the returned model
        is refitted and compared against the initial fit on the same data.
        """
        np.random.seed(5)
        dataset_kwargs = dict(n_samples=100,
                              n_features=45,
                              n_informative=15,
                              n_redundant=5,
                              class_sep=1,
                              n_clusters_per_class=4,
                              flip_y=0.4)
        X, y = make_classification(**dataset_kwargs)

        # Reference score of the forest before any tuning.
        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)
        baseline_score = clf_score(y, forest.predict(X))

        depth_param = Parameter('max_depth', 'integer', lower=1, upper=10)
        optimizer = BayesianOptimizer(forest, [depth_param], clf_score,
                                      method='expected_improvement')
        _, tuned = optimizer.fit(X_train=X, y_train=y, n_iters=10)
        self.assertTrue(optimizer.success)

        tuned.fit(X, y)
        tuned_score = clf_score(y, tuned.predict(X))
        self.assertTrue(tuned_score > baseline_score)
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_upper_confidence_bound_tractable(self):
        """BayesianOptimizer with ``upper_confidence_bound`` must succeed and improve.

        The optimizer's ``success`` flag is checked before the returned model
        is refitted and compared against the initial fit on the same data.
        """
        np.random.seed(5)
        dataset_kwargs = dict(n_samples=100,
                              n_features=45,
                              n_informative=15,
                              n_redundant=5,
                              class_sep=1,
                              n_clusters_per_class=4,
                              flip_y=0.4)
        X, y = make_classification(**dataset_kwargs)

        # Reference score of the forest before any tuning.
        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)
        baseline_score = clf_score(y, forest.predict(X))

        depth_param = Parameter('max_depth', 'integer', lower=1, upper=10)
        optimizer = BayesianOptimizer(forest, [depth_param], clf_score,
                                      method='upper_confidence_bound')
        _, tuned = optimizer.fit(X_train=X, y_train=y, n_iters=10)
        self.assertTrue(optimizer.success)

        tuned.fit(X, y)
        tuned_score = clf_score(y, tuned.predict(X))
        self.assertTrue(tuned_score > baseline_score)
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_improvement(self):
        """RandomSearchOptimizer must return a model that outscores the initial fit.

        Both scores are computed on the same data the models were fitted on.
        """
        np.random.seed(4)
        dataset_kwargs = dict(n_samples=100,
                              n_features=45,
                              n_informative=15,
                              n_redundant=5,
                              class_sep=1,
                              n_clusters_per_class=4,
                              flip_y=0.4)
        X, y = make_classification(**dataset_kwargs)

        # Reference score of the forest before any tuning.
        forest = RandomForestClassifier(max_depth=5)
        forest.fit(X, y)
        baseline_score = clf_score(y, forest.predict(X))

        depth_param = Parameter('max_depth', 'integer', lower=1, upper=10)
        optimizer = RandomSearchOptimizer(forest, [depth_param], clf_score)
        _, tuned = optimizer.fit(X_train=X, y_train=y, n_iters=10)
        tuned.fit(X, y)
        tuned_score = clf_score(y, tuned.predict(X))
        self.assertTrue(tuned_score > baseline_score)
项目:stacker    作者:bamine    | 项目源码 | 文件源码
def setUp(self):
        """Create the datasets, optimizers, scorers and tasks shared by the tests.

        Each task is built from the dataset that matches its declared type:
        classification tasks use the ``make_classification`` sample and
        regression tasks use the ``make_regression`` sample.
        """
        os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")
        self.X_class, self.y_class = datasets.make_classification(random_state=42)
        self.X_reg, self.y_reg = datasets.make_regression(random_state=42)
        self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
        self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
        # NOTE(review): the lambda forwards its arguments positionally, while
        # roc_auc_score expects (y_true, y_score) -- confirm the order in
        # which Scorer invokes its function.
        self.class_scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
        self.reg_scorer = Scorer("mse", metrics.mean_squared_error)

        # Hold-out split tasks.
        self.classification_task_split = \
            Task("class_split", self.X_class, self.y_class, "classification", test_size=0.1, random_state=42)
        self.regression_task_split = \
            Task("reg_split", self.X_reg, self.y_reg, "regression", test_size=0.1, random_state=42)

        # Cross-validated tasks.
        self.classification_task_cv = \
            Task("class_cv", self.X_class, self.y_class, "classification", cv=5, random_state=42)
        self.regression_task_cv = \
            Task("reg_cv", self.X_reg, self.y_reg, "regression", cv=5, random_state=42)
项目:base_function    作者:Rockyzsu    | 项目源码 | 文件源码
def case2():
    """Generate a 1000-sample, 2-feature dataset and print it with an 800/200 split.

    Every ``print`` call takes a single argument, so the output is the same
    on Python 2 and Python 3.
    """
    from sklearn.datasets import make_classification

    x, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                               n_informative=1, n_clusters_per_class=1)

    print(len(x))
    print(len(y))
    print(x)
    print(y)
    # One line per sample: the feature row followed by its label.
    for sample, label in zip(x, y):
        print('%s %s' % (sample, label))

    # First 800 rows are the training part, the remaining 200 the test part.
    x_data_train = x[:800, :]
    x_data_test = x[800:, :]
    y_data_train = y[:800]
    y_data_test = y[800:]

    print('*' * 20)
    print(x_data_train)
    print(x_data_test)
    print(y_data_train)
    print(y_data_test)

    print(x[0, 0])
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_visualize():
    """A fitted search exposes ``dask_graph_`` and can render it to a PNG.

    Skipped when ``graphviz`` cannot be imported.
    """
    pytest.importorskip('graphviz')

    features, labels = make_classification(n_samples=100, n_classes=2,
                                           flip_y=.2, random_state=0)
    svc = SVC(random_state=0)
    param_grid = {'C': [.1, .5, .9]}
    fitted = dcv.GridSearchCV(svc, param_grid).fit(features, labels)

    assert hasattr(fitted, 'dask_graph_')

    with tmpdir() as workdir:
        target = os.path.join(workdir, 'mydask')
        fitted.visualize(filename=target)
        assert os.path.exists(target + '.png')

    # Doesn't work if not fitted
    unfitted = dcv.GridSearchCV(svc, param_grid)
    with pytest.raises(NotFittedError):
        unfitted.visualize()
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_feature_union_fit_failure_multiple_metrics():
    """Failing FeatureUnion member with two scorers: raise, then NaN per metric."""
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)

    # One well-behaved member and one failing member, the latter weighted.
    union = FeatureUnion([('good', MockClassifier()),
                          ('bad', FailingClassifier())],
                         transformer_weights={'bad': 0.5})
    pipeline = Pipeline([('union', union), ('clf', MockClassifier())])

    param_grid = {'union__bad__parameter': [0, 1, 2]}
    search = dcv.GridSearchCV(pipeline, param_grid, refit=False,
                              scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        search.fit(features, labels)

    # Check that grid scores were set to error_score on failure
    search.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        search.fit(features, labels)

    for metric_name in scoring:
        check_scores_all_nan(search, 'union__bad__parameter',
                             score_key=metric_name)
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_pipeline_fit_failure():
    """Pipeline whose first step is a FailingClassifier: raise, then NaN scores."""
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)

    steps = [('bad', FailingClassifier()),
             ('good1', MockClassifier()),
             ('good2', MockClassifier())]
    pipeline = Pipeline(steps)

    param_grid = {'bad__parameter': [0, 1, 2]}
    search = dcv.GridSearchCV(pipeline, param_grid, refit=False)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        search.fit(features, labels)

    # Check that grid scores were set to error_score on failure
    search.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        search.fit(features, labels)

    check_scores_all_nan(search, 'bad__parameter')
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_feature_union_raises():
    """Invalid FeatureUnion grids must raise.

    A grid naming the missing member ``tr2`` raises ``ValueError``; a grid
    over ``transformer_list`` itself raises ``NotImplementedError``.
    """
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)

    members = [('tr0', MockClassifier()), ('tr1', MockClassifier())]
    pipeline = Pipeline([('union', FeatureUnion(members)),
                         ('est', MockClassifier())])

    # 'tr2' is not one of the union's members.
    unknown_member_grid = {'union__tr2__parameter': [0, 1, 2]}
    search = dcv.GridSearchCV(pipeline, unknown_member_grid, refit=False)
    with pytest.raises(ValueError):
        search.fit(features, labels)

    # Searching over the list of transformers itself.
    transformer_list_grid = {
        'union__transformer_list': [[('one', MockClassifier())]],
    }
    search = dcv.GridSearchCV(pipeline, transformer_list_grid, refit=False)
    with pytest.raises(NotImplementedError):
        search.fit(features, labels)
项目:Gaussian_process    作者:happyjin    | 项目源码 | 文件源码
def dataset_generator(index=0):
    """
    Generate a dataset for binary classification.

    Three candidate datasets are built: ``make_moons``, ``make_circles`` and
    a ``make_classification`` sample shifted by uniform noise. The one at
    position ``index`` is selected, its label 0 is replaced by -1 and its
    features are standardised with ``StandardScaler``.

    :param index: position of the dataset to return; 0 = moons (the default,
        matching the previous hard-coded behaviour), 1 = circles,
        2 = the shifted ``make_classification`` sample
    :return: tuple ``(X, y)`` of standardised features and relabelled targets
    :raises IndexError: if ``index`` does not address one of the datasets
    """
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    # Shift every feature by a reproducible uniform offset.
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable
                ]

    X, y = datasets[index]
    # Relabel in place so the targets use -1 instead of 0.
    y[y == 0] = -1
    X = StandardScaler().fit_transform(X)
    return X, y
项目:MLAlgorithms    作者:rushter    | 项目源码 | 文件源码
def classification():
    """Fit a gradient boosting classifier on random data and print its ROC AUC.

    Prints the test predictions, their minimum and maximum, and the score.
    """
    seed = 1111

    # Generate a random binary classification problem.
    features, labels = make_classification(
        n_samples=350, n_features=15, n_informative=10,
        random_state=seed, n_classes=2,
        class_sep=1., n_redundant=0)
    split = train_test_split(features, labels, test_size=0.15,
                             random_state=seed)
    X_train, X_test, y_train, y_test = split

    booster = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                         max_features=8, learning_rate=0.1)
    booster.fit(X_train, y_train)
    predicted = booster.predict(X_test)
    print(predicted)
    print(predicted.min())
    print(predicted.max())

    auc = roc_auc_score(y_test, predicted)
    print('classification, roc auc score: %s' % auc)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_importances_gini_equal_mse():
    """Check that gini is equivalent to mse for binary output variable."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occurs at
    # high tree depth, we restrict this maximal depth.
    shared_kwargs = dict(max_depth=5, random_state=0)
    gini_tree = DecisionTreeClassifier(criterion="gini",
                                       **shared_kwargs).fit(X, y)
    mse_tree = DecisionTreeRegressor(criterion="mse",
                                     **shared_kwargs).fit(X, y)

    assert_almost_equal(gini_tree.feature_importances_,
                        mse_tree.feature_importances_)

    # Both trees must also have the same structure.
    for attribute in ('feature', 'children_left', 'children_right',
                      'n_node_samples'):
        assert_array_equal(getattr(gini_tree.tree_, attribute),
                           getattr(mse_tree.tree_, attribute))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_importances():
    """Check variable importances.

    With ``shuffle=False`` the test asserts that each of the first three
    importances is at least as large as every remaining one.
    """
    n_features, n_informative = 10, 3
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=n_features,
                                        n_informative=n_informative,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for algorithm in ['SAMME', 'SAMME.R']:
        booster = AdaBoostClassifier(algorithm=algorithm)
        booster.fit(X, y)
        importances = booster.feature_importances_

        assert_equal(importances.shape[0], 10)

        # Column vector of the leading importances against the remaining ones.
        leading = importances[:3, np.newaxis]
        trailing = importances[3:]
        assert_equal((leading >= trailing).all(), True)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_grid_search_labels():
    """Check how GridSearchCV handles the ``labels`` argument.

    Label-based splitters must raise ``ValueError`` when labels is None and
    accept the labels when given; the other splitters must fit without them.
    """
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    svc = LinearSVC(random_state=0)
    param_grid = {'C': [1]}

    needs_labels = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                    LabelShuffleSplit()]
    for splitter in needs_labels:
        search = GridSearchCV(svc, param_grid, cv=splitter)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             search.fit, X, y)
        search.fit(X, y, labels)

    ignores_labels = [StratifiedKFold(), StratifiedShuffleSplit()]
    for splitter in ignores_labels:
        search = GridSearchCV(svc, param_grid, cv=splitter)
        # Should not raise an error
        search.fit(X, y)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices.

    The two runs must agree on at least 90% of the held-out predictions and
    select the same ``C``.
    """
    dense_X, labels = make_classification(n_samples=200, n_features=100,
                                          random_state=0)

    # Dense run: fit on the first 180 rows, predict the rest.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    dense_search.fit(dense_X[:180], labels[:180])
    dense_pred = dense_search.predict(dense_X[180:])
    dense_C = dense_search.best_estimator_.C

    # Sparse run: same rows, fitted from a COO view of the CSR matrix.
    sparse_X = sp.csr_matrix(dense_X)
    sparse_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    sparse_search.fit(sparse_X[:180].tocoo(), labels[:180])
    sparse_pred = sparse_search.predict(sparse_X[180:])
    sparse_C = sparse_search.best_estimator_.C

    assert_true(np.mean(dense_pred == sparse_pred) >= .9)
    assert_equal(dense_C, sparse_C)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_learning_curve():
    """``learning_curve`` must return the expected sizes and scores, warning-free."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    # Record warnings so that any emitted during the call fails the test.
    with warnings.catch_warnings(record=True) as caught:
        curve = learning_curve(estimator, X, y, cv=3,
                               train_sizes=np.linspace(0.1, 1.0, 10))
        train_sizes, train_scores, test_scores = curve
    if caught:
        raise RuntimeError("Unexpected warning: %r" % caught[0].message)

    # Ten train sizes, three folds each.
    assert_equal(train_scores.shape, (10, 3))
    assert_equal(test_scores.shape, (10, 3))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))

    mean_train = train_scores.mean(axis=1)
    mean_test = test_scores.mean(axis=1)
    assert_array_almost_equal(mean_train, np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(mean_test, np.linspace(0.1, 1.0, 10))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_learning_curve_verbose():
    """With ``verbose=1``, ``learning_curve`` must print a ``[learning_curve]`` tag."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    # Swap stdout for an in-memory buffer while the curve is computed.
    saved_stdout = sys.stdout
    capture = StringIO()
    sys.stdout = capture
    try:
        curve = learning_curve(estimator, X, y, cv=3, verbose=1)
        train_sizes, train_scores, test_scores = curve
    finally:
        # Always read the buffer and put the real stdout back.
        out = capture.getvalue()
        capture.close()
        sys.stdout = saved_stdout

    assert "[learning_curve]" in out
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_learning_curve_batch_and_incremental_learning_are_equal():
    """Incremental and batch ``learning_curve`` runs must give matching results."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)

    # Same estimator and sizes; only the learning mode differs.
    incremental = learning_curve(estimator, X, y, train_sizes=train_sizes,
                                 cv=3, exploit_incremental_learning=True)
    sizes_inc, train_inc, test_inc = incremental

    batch = learning_curve(estimator, X, y, cv=3, train_sizes=train_sizes,
                           exploit_incremental_learning=False)
    sizes_batch, train_batch, test_batch = batch

    assert_array_equal(sizes_inc, sizes_batch)
    assert_array_almost_equal(train_inc.mean(axis=1),
                              train_batch.mean(axis=1))
    assert_array_almost_equal(test_inc.mean(axis=1),
                              test_batch.mean(axis=1))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices.

    The two runs must agree on at least 90% of the held-out predictions and
    select the same ``C``.
    """
    dense_X, labels = make_classification(n_samples=200, n_features=100,
                                          random_state=0)

    # Dense run: fit on the first 180 rows, predict the rest.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    dense_search.fit(dense_X[:180], labels[:180])
    dense_pred = dense_search.predict(dense_X[180:])
    dense_C = dense_search.best_estimator_.C

    # Sparse run: same rows, fitted from a COO view of the CSR matrix.
    sparse_X = sp.csr_matrix(dense_X)
    sparse_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    sparse_search.fit(sparse_X[:180].tocoo(), labels[:180])
    sparse_pred = sparse_search.predict(sparse_X[180:])
    sparse_C = sparse_search.best_estimator_.C

    assert_true(np.mean(dense_pred == sparse_pred) >= .9)
    assert_equal(dense_C, sparse_C)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_learning_curve():
    """``learning_curve`` must return the expected sizes and scores, warning-free."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    # Record warnings so that any emitted during the call fails the test.
    with warnings.catch_warnings(record=True) as caught:
        curve = learning_curve(estimator, X, y, cv=3,
                               train_sizes=np.linspace(0.1, 1.0, 10))
        train_sizes, train_scores, test_scores = curve
    if caught:
        raise RuntimeError("Unexpected warning: %r" % caught[0].message)

    # Ten train sizes, three folds each.
    assert_equal(train_scores.shape, (10, 3))
    assert_equal(test_scores.shape, (10, 3))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))

    mean_train = train_scores.mean(axis=1)
    mean_test = test_scores.mean(axis=1)
    assert_array_almost_equal(mean_train, np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(mean_test, np.linspace(0.1, 1.0, 10))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_learning_curve_verbose():
    """With ``verbose=1``, ``learning_curve`` must print a ``[learning_curve]`` tag."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    # Swap stdout for an in-memory buffer while the curve is computed.
    saved_stdout = sys.stdout
    capture = StringIO()
    sys.stdout = capture
    try:
        curve = learning_curve(estimator, X, y, cv=3, verbose=1)
        train_sizes, train_scores, test_scores = curve
    finally:
        # Always read the buffer and put the real stdout back.
        out = capture.getvalue()
        capture.close()
        sys.stdout = saved_stdout

    assert "[learning_curve]" in out
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_learning_curve_batch_and_incremental_learning_are_equal():
    """Incremental and batch ``learning_curve`` runs must give matching results."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)

    # Same estimator and sizes; only the learning mode differs.
    incremental = learning_curve(estimator, X, y, train_sizes=train_sizes,
                                 cv=3, exploit_incremental_learning=True)
    sizes_inc, train_inc, test_inc = incremental

    batch = learning_curve(estimator, X, y, cv=3, train_sizes=train_sizes,
                           exploit_incremental_learning=False)
    sizes_batch, train_batch, test_batch = batch

    assert_array_equal(sizes_inc, sizes_batch)
    assert_array_almost_equal(train_inc.mean(axis=1),
                              train_batch.mean(axis=1))
    assert_array_almost_equal(test_inc.mean(axis=1),
                              test_batch.mean(axis=1))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_l1_ratio():
    """Test if l1 ratio extremes match L1 and L2 penalty settings."""
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=100, n_informative=20,
                                        random_state=1234)

    # elasticnet with l1_ratio near 1 must give the same result as pure l1,
    # and with l1_ratio near 0 the same result as pure l2
    extremes = ((0.9999999999, 'l1'), (0.0000000001, 'l2'))
    for ratio, pure_penalty in extremes:
        elastic = SGDClassifier(alpha=0.001, penalty='elasticnet',
                                l1_ratio=ratio, random_state=42).fit(X, y)
        pure = SGDClassifier(alpha=0.001, penalty=pure_penalty,
                             random_state=42).fit(X, y)
        assert_array_almost_equal(elastic.coef_, pure.coef_)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_liblinear_dual_random_state():
    """random_state is relevant for liblinear solver only if dual=True."""
    X, y = make_classification(n_samples=20)

    def fitted_with_seed(seed):
        # Same settings for every model; only the random state varies.
        model = LogisticRegression(random_state=seed, dual=True, max_iter=1,
                                   tol=1e-15)
        model.fit(X, y)
        return model

    first = fitted_with_seed(0)
    same_seed = fitted_with_seed(0)
    other_seed = fitted_with_seed(8)

    # same result for same random state
    assert_array_almost_equal(first.coef_, same_seed.coef_)
    # different results for different random states
    expected_message = "Arrays are not almost equal to 6 decimals"
    assert_raise_message(AssertionError, expected_message,
                         assert_array_almost_equal,
                         first.coef_, other_seed.coef_)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_logistic_regression_solvers():
    """All four solvers must agree on the coefficients to 3 decimals."""
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    newton = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbfgs = LogisticRegression(solver='lbfgs', fit_intercept=False)
    default = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)
    for model in (newton, lbfgs, sag, default):
        model.fit(X, y)

    # Every pair of solvers is compared once.
    pairs = [(newton, default), (default, lbfgs), (newton, lbfgs),
             (sag, default), (sag, newton), (sag, lbfgs)]
    for left, right in pairs:
        assert_array_almost_equal(left.coef_, right.coef_, decimal=3)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_logistic_regression_solvers_multiclass():
    """On a 3-class problem all four solvers must agree to 4 decimals."""
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-6
    newton = LogisticRegression(solver='newton-cg', fit_intercept=False,
                                tol=tol)
    lbfgs = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    default = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    for model in (newton, lbfgs, sag, default):
        model.fit(X, y)

    # Every pair of solvers is compared once.
    pairs = [(newton, default), (default, lbfgs), (newton, lbfgs),
             (sag, default), (sag, newton), (sag, lbfgs)]
    for left, right in pairs:
        assert_array_almost_equal(left.coef_, right.coef_, decimal=4)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_logreg_predict_proba_multinomial():
    """The multinomial model's probabilities must give the smallest log loss."""
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilites using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    multinomial = LogisticRegression(multi_class="multinomial",
                                     solver="lbfgs")
    multinomial.fit(X, y)
    multinomial_loss = log_loss(y, multinomial.predict_proba(X))

    one_vs_rest = LogisticRegression(multi_class="ovr", solver="lbfgs")
    one_vs_rest.fit(X, y)
    one_vs_rest_loss = log_loss(y, one_vs_rest.predict_proba(X))
    assert_greater(one_vs_rest_loss, multinomial_loss)

    # Predicted probabilites using the soft-max function should give a
    # smaller loss than those using the logistic function.
    multinomial_loss = log_loss(y, multinomial.predict_proba(X))
    logistic_loss = log_loss(y, multinomial._predict_proba_lr(X))
    assert_greater(logistic_loss, multinomial_loss)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_mean_variance_illegal_axis():
    """Axes -3, 2 and -1 must be rejected with ``ValueError`` by both helpers."""
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    for row, col in ((0, 0), (2, 1), (4, 3)):
        X[row, col] = 0
    X_csr = sp.csr_matrix(X)

    illegal_axes = (-3, 2, -1)

    for axis in illegal_axes:
        assert_raises(ValueError, mean_variance_axis, X_csr, axis=axis)

    for axis in illegal_axes:
        assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=axis,
                      last_mean=None, last_var=None, last_n=None)
项目:palladio    作者:slipguru    | 项目源码 | 文件源码
def test_model_assessment():
    """ModelAssessment over a grid with ``l1_ratio`` 2 must leave ``cv_results_`` empty."""
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    steps = [('enet', ElasticNetFeatureSelection()),
             ('ridge', RidgeClassifier())]
    pipeline = Pipeline(steps)

    search = GridSearchCV(pipeline, {'enet__l1_ratio': [2]})
    assessment = ModelAssessment(search).fit(X, y)
    assert len(assessment.cv_results_) == 0
项目:stacker    作者:bamine    | 项目源码 | 文件源码
def test_db_logger(self):
        """Ten optimization evaluations must produce ten results in the DBLogger."""
        features, labels = datasets.make_classification(random_state=42)
        task = Task("class_split", features, labels, "classification",
                    test_size=0.1, random_state=42)
        scorer = Scorer(
            "auc_error",
            lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
        logger = DBLogger(task, self.engine)

        optimizer = XGBoostOptimizer(task, scorer, logger)
        optimizer.start_optimization(max_evals=10)

        stored_results = list(logger.load_all_results())
        self.assertEqual(len(stored_results), 10)
项目:stacker    作者:bamine    | 项目源码 | 文件源码
def test_file_logger(self):
        """Ten optimization evaluations must produce ten results in the FileLogger.

        The log file ``<task name>.log`` is removed even when the run or the
        assertions fail, so a failing test leaves nothing behind.
        """
        X, y = datasets.make_classification(random_state=42)
        task = Task("class_split", X, y, "classification", test_size=0.1, random_state=42)
        scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
        log_path = task.name + ".log"
        try:
            logger = FileLogger(task)
            optimizer = XGBoostOptimizer(task, scorer, logger)
            optimizer.start_optimization(max_evals=10)
            self.assertEqual(len(list(logger.load_all_results())), 10)
            # A missing file used to surface as an error from os.remove;
            # keep that check explicit now that removal is guarded.
            self.assertTrue(os.path.exists(log_path))
        finally:
            # Guarded so a failure before the file exists is not masked.
            if os.path.exists(log_path):
                os.remove(log_path)
项目:sci-pype    作者:jay-johnson    | 项目源码 | 文件源码
def sk_generate_random_classification_set(self, samples, features, classes, informative, rds, dbs, debug=False):
        """Generate a random classification set and slice it into Train/Test.

        Calls ``sklearn.datasets.make_classification`` with the given sizes and
        stores the first 9000 rows under ``"Train"`` and the remaining rows
        under ``"Test"``.

        Args:
            samples: forwarded as ``n_samples``.
            features: forwarded as ``n_features``.
            classes: forwarded as ``n_classes``.
            informative: forwarded as ``n_informative``.
            rds: not used by this method.
            dbs: not used by this method.
            debug: not used by this method.

        Returns:
            The value of ``self.build_def_hash(...)``: built with ``"SUCCESS"``
            and the filled record on success, or with ``"Display Error"``, the
            error message and an empty dict when any exception is raised.
        """

        record = {
            "Test": {
                "X": {},
                "Y": {},
            },
            "Train": {
                "X": {},
                "Y": {},
            },
        }
        results = self.build_def_hash("Display Error", "Not Run", record)

        try:

            from sklearn.datasets import make_classification

            self.lg("Processing ROC", 6)

            X, Y = make_classification(n_samples=samples,
                                       n_features=features,
                                       n_classes=classes,
                                       n_informative=informative)

            # NOTE(review): the split index is fixed, so the Test slices are
            # empty whenever samples <= 9000 -- confirm callers always request
            # more than 9000 samples.
            split_at = 9000
            record["Test"]["X"] = X[split_at:]
            record["Test"]["Y"] = Y[split_at:]
            record["Train"]["X"] = X[:split_at]
            record["Train"]["Y"] = Y[:split_at]

            results = self.build_def_hash("SUCCESS", "", record)

        # "except X as name" is valid on Python 2.6+ and Python 3; the previous
        # "except X, name" form is a SyntaxError on Python 3.
        except Exception as k:
            err_msg = "Unable to Generate Random Classification set with Ex(" + str(k) + ")"
            self.lg("ERROR: " + str(err_msg), 0)
            results = self.build_def_hash("Display Error", err_msg, {})
        # end of try/ex

        return results
    # end of sk_generate_random_classification_set
项目:RFHO    作者:lucfra    | 项目源码 | 文件源码
def generate_multiclass_dataset(n_samples=100, n_features=10,
                                n_informative=5, n_redundant=3, n_repeated=2,
                                n_classes=2, n_clusters_per_class=2,
                                weights=None, flip_y=0.01, class_sep=1.0,
                                hypercube=True, shift=0.0, scale=1.0,
                                shuffle=True, random_state=None, hot_encoded=True, partitions_proportions=None,
                                negative_labels=-1.):
    """Build a synthetic classification ``Dataset`` via ``make_classification``.

    All generator arguments up to ``scale`` and ``random_state`` are forwarded
    unchanged to ``sk_dt.make_classification``.

    :param shuffle: only forwarded to ``redivide_data`` when
        ``partitions_proportions`` is given; the generator itself is always
        called with ``shuffle=True`` (and ``info`` records ``True``).
    :param random_state: seed passed to the generator and, afterwards, to
        ``np.random.seed`` (this reseeds NumPy's global RNG as a side effect).
    :param hot_encoded: if true, labels go through ``to_one_hot_enc``;
        otherwise labels equal to 0 are replaced by ``negative_labels``.
    :param partitions_proportions: if truthy, the dataset is split with
        ``redivide_data`` and wrapped with ``Datasets.from_list``.
    :param negative_labels: replacement value for label 0 when
        ``hot_encoded`` is false.
    :return: a ``Dataset`` with float32 data and targets, or the result of
        ``Datasets.from_list`` when partitions are requested.
    """
    X, y = sk_dt.make_classification(n_samples=n_samples, n_features=n_features,
                                     n_informative=n_informative, n_redundant=n_redundant, n_repeated=n_repeated,
                                     n_classes=n_classes, n_clusters_per_class=n_clusters_per_class,
                                     weights=weights, flip_y=flip_y, class_sep=class_sep,
                                     hypercube=hypercube, shift=shift, scale=scale,
                                     shuffle=True, random_state=random_state)
    if hot_encoded:
        y = to_one_hot_enc(y)
    else:
        # Cast before assigning: writing a float into the integer label array
        # would silently truncate a non-integer negative_labels (e.g. -0.5 -> 0).
        y = y.astype(np.float32)
        y[y == 0] = negative_labels

    info = {'n_informative': n_informative, 'n_redundant': n_redundant,
            'n_repeated': n_repeated,
            'n_classes': n_classes, 'n_clusters_per_class': n_clusters_per_class,
            'weights': weights, 'flip_y': flip_y, 'class_sep': class_sep,
            'hypercube': hypercube, 'shift': shift, 'scale': scale,
            'shuffle': True, 'random_state': random_state}
    res = Dataset(data=np.array(X, dtype=np.float32), target=np.array(y, dtype=np.float32),
                  info=info)
    np.random.seed(random_state)
    if partitions_proportions:
        res = redivide_data([res], shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    return res
项目:datanode    作者:jay-johnson    | 项目源码 | 文件源码
def sk_generate_random_classification_set(self, samples, features, classes, informative, rds, dbs, debug=False):
        """Generate a random classification set and slice it into Train/Test.

        Calls ``sklearn.datasets.make_classification`` with the given sizes and
        stores the first 9000 rows under ``"Train"`` and the remaining rows
        under ``"Test"``.

        Args:
            samples: forwarded as ``n_samples``.
            features: forwarded as ``n_features``.
            classes: forwarded as ``n_classes``.
            informative: forwarded as ``n_informative``.
            rds: not used by this method.
            dbs: not used by this method.
            debug: not used by this method.

        Returns:
            The value of ``self.build_def_hash(...)``: built with ``"SUCCESS"``
            and the filled record on success, or with ``"Display Error"``, the
            error message and an empty dict when any exception is raised.
        """

        record = {
            "Test": {
                "X": {},
                "Y": {},
            },
            "Train": {
                "X": {},
                "Y": {},
            },
        }
        results = self.build_def_hash("Display Error", "Not Run", record)

        try:

            from sklearn.datasets import make_classification

            self.lg("Processing ROC", 6)

            X, Y = make_classification(n_samples=samples,
                                       n_features=features,
                                       n_classes=classes,
                                       n_informative=informative)

            # NOTE(review): the split index is fixed, so the Test slices are
            # empty whenever samples <= 9000 -- confirm callers always request
            # more than 9000 samples.
            split_at = 9000
            record["Test"]["X"] = X[split_at:]
            record["Test"]["Y"] = Y[split_at:]
            record["Train"]["X"] = X[:split_at]
            record["Train"]["Y"] = Y[:split_at]

            results = self.build_def_hash("SUCCESS", "", record)

        # "except X as name" is valid on Python 2.6+ and Python 3; the previous
        # "except X, name" form is a SyntaxError on Python 3.
        except Exception as k:
            err_msg = "Unable to Generate Random Classification set with Ex(" + str(k) + ")"
            self.lg("ERROR: " + str(err_msg), 0)
            results = self.build_def_hash("Display Error", err_msg, {})
        # end of try/ex

        return results
    # end of sk_generate_random_classification_set
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_grid_search_dask_inputs():
    """Fit the grid search with every mix of numpy / dask-array / delayed inputs.

    For each combination of X, y and groups containers, fitting without
    ``groups`` must raise ValueError, and fitting with ``groups`` must yield
    the same support vectors as a plain ``SVC`` fitted on the numpy data.
    """
    # Plain numpy inputs.
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)

    # The same data as chunked dask arrays.
    da_X, da_y, da_groups = (
        da.from_array(np_X, chunks=5),
        da.from_array(np_y, chunks=5),
        da.from_array(np_groups, chunks=5),
    )

    # The same data wrapped as delayed objects.
    del_X, del_y, del_groups = (
        delayed(np_X),
        delayed(np_y),
        delayed(np_groups),
    )

    splitter = GroupKFold()
    estimator = SVC(random_state=0)
    param_grid = {'C': [1]}

    expected = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    X_variants = [np_X, da_X, del_X]
    y_variants = [np_y, da_y, del_y]
    group_variants = [np_groups, da_groups, del_groups]

    for X, y, groups in product(X_variants, y_variants, group_variants):
        search = dcv.GridSearchCV(estimator, param_grid, cv=splitter)

        # GroupKFold cannot split without groups.
        with pytest.raises(ValueError) as exc:
            search.fit(X, y)
        assert "parameter should not be None" in str(exc.value)

        search.fit(X, y, groups=groups)
        np.testing.assert_allclose(expected,
                                   search.best_estimator_.support_vectors_)
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_bad_error_score():
    """Fitting with ``error_score='badparam'`` is expected to raise ValueError."""
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)
    param_grid = {'foo_param': [0, 1, 2]}
    search = dcv.GridSearchCV(MockClassifier(), param_grid,
                              error_score='badparam')

    with pytest.raises(ValueError):
        search.fit(features, labels)
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_cache_cv():
    """Count how often the input is accessed with and without ``cache_cv``.

    The features are viewed as ``CountTakes`` and its ``count`` attribute is
    checked after each fit.
    """
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)
    param_grid = {'foo_param': [0, 1, 2]}
    n_params = 3
    n_splits = 3

    # Without caching: (1 train + 1 test) * n_params * n_splits
    counted = features.view(CountTakes)
    search = dcv.GridSearchCV(MockClassifier(), param_grid,
                              cv=n_splits, cache_cv=False, scheduler='sync')
    search.fit(counted, labels)
    assert counted.count == 2 * n_params * n_splits

    # With caching: (1 test + 1 train) * n_splits
    counted = features.view(CountTakes)
    assert counted.count == 0
    search.cache_cv = True
    search.fit(counted, labels)
    assert counted.count == 2 * n_splits
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_scheduler_param(scheduler, n_jobs, get):
    """Check scheduler normalisation, then fit a grid search with that scheduler.

    For the ``'multiprocessing'`` scheduler the expected ``get`` is taken from
    ``dask.multiprocessing`` (the test is skipped if it cannot be imported).
    """
    expected_get = get
    if scheduler == 'multiprocessing':
        multiprocessing_mod = pytest.importorskip('dask.multiprocessing')
        expected_get = multiprocessing_mod.get

    assert _normalize_scheduler(scheduler, n_jobs) is expected_get

    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)
    param_grid = {'foo_param': [0, 1, 2]}
    search = dcv.GridSearchCV(MockClassifier(), param_grid, cv=3,
                              scheduler=scheduler, n_jobs=n_jobs)
    search.fit(features, labels)
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_scheduler_param_distributed(loop):
    """Fit a grid search using a distributed ``Client`` as the scheduler."""
    features, labels = make_classification(n_samples=100, n_features=10,
                                           random_state=0)
    param_grid = {'foo_param': [0, 1, 2]}

    # Unpacking into [a, b] requires the cluster to report exactly two workers.
    with cluster() as (s, [a, b]), \
            Client(s['address'], loop=loop, set_as_default=False) as client:
        search = dcv.GridSearchCV(MockClassifier(), param_grid,
                                  cv=3, scheduler=client)
        search.fit(features, labels)
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_cv_multiplemetrics_requires_refit_metric():
    """With two scorers and ``refit=True``, ``fit`` is expected to raise ValueError."""
    features, labels = make_classification(random_state=0)

    scorers = {'score1': 'accuracy', 'score2': 'accuracy'}
    depth_grid = {'max_depth': [1, 5]}
    search = dcv.GridSearchCV(RandomForestClassifier(), depth_grid,
                              refit=True, scoring=scorers)

    with pytest.raises(ValueError):
        search.fit(features, labels)
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_cv_multiplemetrics_no_refit():
    """Compare ``best_*`` attribute presence between dcv and sklearn searches.

    Both searches are built with two scorers and ``refit=False``; neither is
    fitted. Each must agree on whether the attribute exists.
    """
    X, y = make_classification(random_state=0)

    depth_grid = {'max_depth': [1, 5]}
    dask_search = dcv.GridSearchCV(
        RandomForestClassifier(), depth_grid, refit=False,
        scoring={'score1': 'accuracy', 'score2': 'accuracy'})
    sklearn_search = GridSearchCV(
        RandomForestClassifier(), depth_grid, refit=False,
        scoring={'score1': 'accuracy', 'score2': 'accuracy'})

    for attr in ('best_index_', 'best_estimator_', 'best_score_'):
        assert hasattr(dask_search, attr) is hasattr(sklearn_search, attr)
项目:tidml    作者:tidchile    | 项目源码 | 文件源码
def make_test_data():
    """Write a 3-row, 4-column random feature table to ``test_data_file``.

    The columns are named A-D and the file is tab-separated with no index
    column. The labels returned by ``make_classification`` are discarded.
    """
    from sklearn.datasets import make_classification
    import pandas as pd

    features, _labels = make_classification(n_samples=3, n_features=4)
    frame = pd.DataFrame(features, columns=["A", "B", "C", "D"])

    prepare_path(test_data_file)
    frame.to_csv(test_data_file, sep='\t', index=False)
项目:scikit-garden    作者:scikit-garden    | 项目源码 | 文件源码
def test_partial_fit_equivalence():
    """Run ``check_partial_fit_equivalence`` for Mondrian trees at several batch sizes.

    Covers the regressor first, then the classifier (with ``is_clf=True``).
    """
    batch_sizes = (10, 20, 25, 50, 90)

    # Regression
    reg_X, reg_y = make_regression(random_state=0, n_samples=100)
    regressor = MondrianTreeRegressor(random_state=0)
    regressor.partial_fit(reg_X, reg_y)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, regressor, 0, reg_X, reg_y)

    # Classification
    clf_X, clf_y = make_classification(random_state=0, n_samples=100)
    classifier = MondrianTreeClassifier(random_state=0)
    classifier.partial_fit(clf_X, clf_y)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, classifier, 0, clf_X, clf_y,
                                      is_clf=True)
项目:scikit-garden    作者:scikit-garden    | 项目源码 | 文件源码
def test_partial_fit_equivalence():
    """Run ``check_partial_fit_equivalence`` for Mondrian forests at several batch sizes.

    Covers the regressor first, then the classifier (with ``is_clf=True``).
    """
    batch_sizes = (10, 20, 25, 50, 90)

    # Regression
    reg_X, reg_y = make_regression(random_state=0, n_samples=100)
    regressor = MondrianForestRegressor(random_state=0)
    regressor.partial_fit(reg_X, reg_y)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, regressor, 0, reg_X, reg_y)

    # Classification
    clf_X, clf_y = make_classification(random_state=0, n_samples=100)
    classifier = MondrianForestClassifier(random_state=0)
    classifier.partial_fit(clf_X, clf_y)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, classifier, 0, clf_X, clf_y,
                                      is_clf=True)
项目:xcessiv    作者:reiinakano    | 项目源码 | 文件源码
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators. Its ``'type'``
            entry selects the dataset; for ``'multiclass'`` the remaining
            entries are forwarded to ``make_classification``.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.

    Raises:
        exceptions.UserError: If the type is unknown, or if generating the
            ``'multiclass'`` dataset fails.
        KeyError: If ``dataset_properties`` has no ``'type'`` entry.
    """
    # No random_state on the splitters: without shuffle=True it never affected
    # the splits, and newer scikit-learn releases raise ValueError when it is
    # set while shuffle is False.
    def stratified_splits(X, y):
        return model_selection.StratifiedKFold(n_splits=2).split(X, y)

    def plain_splits(X):
        return model_selection.KFold(n_splits=2).split(X)

    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = stratified_splits(X, y)
        except Exception as e:
            # User-supplied generator arguments may be invalid; report them
            # as a user-facing error.
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = stratified_splits(X, y)
    elif data_type == 'mnist':
        # The 'mnist' type is served by scikit-learn's digits loader.
        X, y = datasets.load_digits(return_X_y=True)
        splits = stratified_splits(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = stratified_splits(X, y)
    elif data_type == 'boston':
        # NOTE(review): load_boston was removed in scikit-learn 1.2; this
        # branch only works on older releases.
        X, y = datasets.load_boston(return_X_y=True)
        splits = plain_splits(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = plain_splits(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits