Python sklearn.datasets module: make_blobs() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.datasets.make_blobs().
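
As a quick orientation, here is a minimal, self-contained call; the parameter values are illustrative and not taken from any of the projects below:

from sklearn.datasets import make_blobs

# 100 two-dimensional points drawn around 3 Gaussian centers
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=1.0, random_state=0)
# X has shape (100, 2); y holds each point's integer cluster label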

Project: microbiome-summer-school-2017 | Author: aldro61 | project source | file source
def make_classification_example(axis, random_state):
    X, y = make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=2.7, random_state=random_state)

    axis.scatter(X[y == 0, 0], X[y == 0, 1], color="red", s=10, label="Disease")
    axis.scatter(X[y == 1, 0], X[y == 1, 1], color="blue", s=10, label="Healthy")

    clf = LinearSVC().fit(X, y)

    # get the separating hyperplane
    w = clf.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-5, 7)
    yy = a * xx - (clf.intercept_[0]) / w[1]

    # plot the separating line
    axis.plot(xx, yy, 'k-', label="Model")

    axis.tick_params(labelbottom=False, labelleft=False)
    axis.set_xlabel("Gene 1")
    axis.set_ylabel("Gene 2")
    axis.legend()
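
The function draws onto a caller-supplied Matplotlib axis; a plausible invocation (assuming the usual matplotlib import, which the snippet above elides) is:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
make_classification_example(ax, random_state=42)
plt.show()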
Project: pairwise_distance | Author: oliviaguest | project source | file source
def generate_data(N, seed=10):
    """ This generates some test data that we can use to test our pairwise-
    distance functions.

    Required arguments:
    N       -- The number of datapoints in the test data.

    Optional arguments:
    seed    -- The seed for NumPy's random module.
    """

    # Generate some data:
    np.random.seed(seed)
    n_samples1 = N * 3 // 4  # same as floor(3/4 * N)
    n_samples2 = N - n_samples1

    # Blob set 1
    centers1 = [[0., 0.],
                [1., 0.],
                [0.5, np.sqrt(0.75)]]
    cluster_std1 = [0.3] * len(centers1)
    data, _ = make_blobs(n_samples=n_samples1,
                         centers=centers1,
                         cluster_std=cluster_std1)

    # Make sure Blob 1 checks out

    # Blob set 2
    centers2 = [[0.5, np.sqrt(0.75)]]
    cluster_std2 = [0.3] * len(centers2)
    extra, _ = make_blobs(n_samples=n_samples2,
                          centers=centers2,
                          cluster_std=cluster_std2)

    return np.concatenate((data, extra), axis=0)
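
A quick usage sketch; the expected shape follows from the two-dimensional centers and the 3:1 split above:

data = generate_data(200, seed=10)
assert data.shape == (200, 2)  # ~150 points in the blob triangle, ~50 in the extra blob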
Project: ML-From-Scratch | Author: eriklindernoren | project source | file source
def main():
    # Load the dataset
    X, y = datasets.make_blobs()

    # Cluster the data
    clf = GaussianMixtureModel(k=3)
    y_pred = clf.predict(X)

    p = Plot()
    p.plot_in_2d(X, y_pred, title="GMM Clustering")
    p.plot_in_2d(X, y, title="Actual Clustering")
Project: ML-From-Scratch | Author: eriklindernoren | project source | file source
def main():
    # Load the dataset
    X, y = datasets.make_blobs()

    # Cluster the data using K-Medoids
    clf = PAM(k=3)
    y_pred = clf.predict(X)

    # Project the data onto the 2 primary principal components
    p = Plot()
    p.plot_in_2d(X, y_pred, title="PAM Clustering")
    p.plot_in_2d(X, y, title="Actual Clustering")
Project: ML-From-Scratch | Author: eriklindernoren | project source | file source
def main():
    # Load the dataset
    X, y = datasets.make_blobs()

    # Cluster the data using K-Means
    clf = KMeans(k=3)
    y_pred = clf.predict(X)

    # Project the data onto the 2 primary principal components
    p = Plot()
    p.plot_in_2d(X, y_pred, title="K-Means Clustering")
    p.plot_in_2d(X, y, title="Actual Clustering")
Project: postlearn | Author: TomAugspurger | project source | file source
def data_labels():
    return make_blobs(random_state=2)
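
Because only random_state is passed, make_blobs falls back on its defaults (100 samples, 2 features, 3 centers), so tests consuming this fixture can rely on:

X, y = data_labels()
assert X.shape == (100, 2)
assert set(y) == {0, 1, 2}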
Project: FreeDiscovery | Author: FreeDiscovery | project source | file source
def test_n_samples_leaves_roots():
    # Sanity check for the number of samples in leaves and roots
    X, y = make_blobs(n_samples=10)
    brc = Birch()
    brc.fit(X)
    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
    n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
                            for sc in leaf.subclusters_])
    assert_equal(n_samples_leaves, X.shape[0])
    assert_equal(n_samples_root, X.shape[0])
Project: FreeDiscovery | Author: FreeDiscovery | project source | file source
def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert_greater(len(brc1.subcluster_centers_), 10)
    assert_equal(len(np.unique(brc1.labels_)), 10)

    # Test that passing an AgglomerativeClustering instance as n_clusters
    # gives the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    assert_raises(ValueError, brc3.fit, X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(UserWarning, brc4.fit, X)
Project: FreeDiscovery | Author: FreeDiscovery | project source | file source
def test_sparse_X():
    # Test that sparse and dense data give same results
    X, y = make_blobs(n_samples=100, centers=10)
    brc = Birch(n_clusters=10)
    brc.fit(X)

    csr = sparse.csr_matrix(X)
    brc_sparse = Birch(n_clusters=10)
    brc_sparse.fit(csr)

    assert_array_equal(brc.labels_, brc_sparse.labels_)
    assert_array_almost_equal(brc.subcluster_centers_,
                              brc_sparse.subcluster_centers_)
Project: FreeDiscovery | Author: FreeDiscovery | project source | file source
def test_branching_factor():
    # Test that nodes have at max branching_factor number of subclusters
    X, y = make_blobs()
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)

    # Raises error when branching_factor is set to one.
    brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
    assert_raises(ValueError, brc.fit, X)
Project: FreeDiscovery | Author: FreeDiscovery | project source | file source
def test_threshold():
    # Test that the leaf subclusters have a radius no greater than the threshold
    X, y = make_blobs(n_samples=80, centers=4)
    brc = Birch(threshold=0.5, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 0.5)

    brc = Birch(threshold=5.0, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 5.)
Project: FreeDiscovery | Author: FreeDiscovery | project source | file source
def test_birch_example_reproducibility(example_id):
    # check reproducibility of the Birch example
    rng = np.random.RandomState(42)

    X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

    cluster_model = Birch(threshold=0.9, branching_factor=20,
                          compute_sample_indices=True)
    cluster_model.fit(X)
    #assert len(cluster_model.root_.subclusters_[1].child_.subclusters_) == 3

    htree, n_subclusters = birch_hierarchy_wrapper(cluster_model)

    assert htree.tree_size == n_subclusters

    # same random seed as in the birch hierarchy example
    assert htree.tree_size == 78
    sc = htree.flatten()[example_id]
    if example_id == 34:
        # this is true in both cases, but example_id fails on circle ci
        assert sc.current_depth == 1
        assert len(sc.children) == 3

    assert_array_equal([sc['cluster_id'] for sc in htree.flatten()],
                       np.arange(htree.tree_size))
Project: simec | Author: cod3licious | project source | file source
def load_dataset(dataset, n_samples, random_state=1, n_features=3):
    # wrapper function to load one of the 3d datasets
    if dataset == 's_curve':
        return make_s_curve(n_samples, random_state=random_state)
    elif dataset == 'swiss_roll':
        return make_swiss_roll(n_samples, random_state=random_state)
    elif dataset == 'broken_swiss_roll':
        return make_broken_swiss_roll(n_samples, random_state=random_state)
    elif dataset == 'sphere':
        return make_sphere(n_samples, random_state=random_state)
    elif dataset == '3_circles':
        return make_3_circles(n_samples, random_state=random_state)
    elif dataset == 'peaks':
        return make_peaks(n_samples, random_state=random_state)
    elif dataset == 'blobs':
        return make_blobs(n_samples, n_features=n_features, centers=3, random_state=random_state)
    else:
        print("unknown dataset")
Project: Gaussian_process | Author: happyjin | project source | file source
def dataset_generator():
    """
    generate multi-class dataset
    :return: data X and its labels
    """
    plt.title("Three blobs", fontsize='small')
    X, y = make_blobs(n_features=2, centers=3)
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plt.show()
    #np.save('X_multi.npy', X)
    #np.save('y_multi.npy', y)
    return X, y
Project: sdp_kmeans | Author: simonsfoundation | project source | file source
def gaussian_blobs(n_samples=200, return_centers=False):
    random_state = 0
    centers = [(-10, -10), (-10, 0), (0, -10)]
    centers.extend([(10, 10), (10, 0), (0, 10)])
    centers = np.array(centers)
    X, gt = sk_datasets.make_blobs(n_samples=n_samples, centers=centers,
                                   n_features=2, shuffle=False,
                                   random_state=random_state)
    if return_centers:
        return X, gt, centers
    else:
        return X, gt
Project: MLAlgorithms | Author: rushter | project source | file source
def make_clusters(skew=True, *args, **kwargs):
    X, y = datasets.make_blobs(*args, **kwargs)
    if skew:
        n_features = X.shape[1]
        for i in np.unique(y):
            # skew each cluster with a random linear transform
            X[y == i] = X[y == i].dot(np.random.random((n_features, n_features)) - 0.5)
    return X, y
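
Positional and keyword arguments pass straight through to make_blobs, so a sketch call looks like:

# three anisotropic (skewed) clusters
X, y = make_clusters(True, n_samples=300, centers=3, random_state=0)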
Project: MLAlgorithms | Author: rushter | project source | file source
def kmeans_example(plot=False):
    X, y = make_blobs(centers=4, n_samples=500, n_features=2,
                      shuffle=True, random_state=42)
    clusters = len(np.unique(y))
    k = KMeans(K=clusters, max_iters=150, init='++')
    k.fit(X)
    k.predict()

    if plot:
        k.plot()
Project: yellowbrick | Author: DistrictDataLabs | project source | file source
def test_integrated_kmeans_elbow(self):
        """
        Test no exceptions for kmeans k-elbow visualizer on blobs dataset

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Generate a blobs data set
        X, y = make_blobs(
            n_samples=1000, n_features=12, centers=6, shuffle=True
        )

        try:
            visualizer = KElbowVisualizer(KMeans(), k=4)
            visualizer.fit(X)
            visualizer.poof()
        except Exception as e:
            self.fail("error during k-elbow: {}".format(e))
Project: yellowbrick | Author: DistrictDataLabs | project source | file source
def test_integrated_mini_batch_kmeans_elbow(self):
        """
        Test no exceptions for mini-batch kmeans k-elbow visualizer

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Generate a blobs data set
        X, y = make_blobs(
            n_samples=1000, n_features=12, centers=6, shuffle=True
        )

        try:
            visualizer = KElbowVisualizer(MiniBatchKMeans(), k=4)
            visualizer.fit(X)
            visualizer.poof()
        except Exception as e:
            self.fail("error during k-elbow: {}".format(e))
Project: yellowbrick | Author: DistrictDataLabs | project source | file source
def test_integrated_kmeans_silhouette(self):
        """
        Test no exceptions for kmeans silhouette visualizer on blobs dataset

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Generate a blobs data set
        X, y = make_blobs(
            n_samples=1000, n_features=12, centers=8, shuffle=True,
        )

        try:
            visualizer = SilhouetteVisualizer(KMeans())
            visualizer.fit(X)
            visualizer.poof()
        except Exception as e:
            self.fail("error during silhouette: {}".format(e))
Project: yellowbrick | Author: DistrictDataLabs | project source | file source
def test_integrated_mini_batch_kmeans_silhouette(self):
        """
        Test no exceptions for mini-batch kmeans silhouette visualizer

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Generate a blobs data set
        X, y = make_blobs(
            n_samples=1000, n_features=12, centers=8, shuffle=True,
        )

        try:
            visualizer = SilhouetteVisualizer(MiniBatchKMeans())
            visualizer.fit(X)
            visualizer.poof()
        except Exception as e:
            self.fail("error during silhouette: {}".format(e))
Project: Parallel-SGD | Author: angadgill | project source | file source
def generate_data(n_samples, n_features):
    """Generate random blob-ish data with noisy features.

    This returns an array of input data with shape `(n_samples, n_features)`
    and an array of `n_samples` target labels.

    Only one feature contains discriminative information, the other features
    contain only noise.
    """
    X, y = make_blobs(n_samples=n_samples, n_features=1, centers=[[-2], [2]])

    # add non-discriminative features
    if n_features > 1:
        X = np.hstack([X, np.random.randn(n_samples, n_features - 1)])
    return X, y
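
For example, the shape follows from stacking the single blob feature with n_features - 1 noise columns:

X, y = generate_data(n_samples=500, n_features=10)
assert X.shape == (500, 10)  # column 0 is discriminative, the rest are noise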
Project: Parallel-SGD | Author: angadgill | project source | file source
def make_data(n_samples, n_features, n_queries, random_state=0):
    """Create index and query data."""
    print('Generating random blob-ish data')
    X, _ = make_blobs(n_samples=n_samples + n_queries,
                      n_features=n_features, centers=100,
                      shuffle=True, random_state=random_state)

    # Keep the last samples as held out query vectors: note since we used
    # shuffle=True we have ensured that index and query vectors are
    # samples from the same distribution (a mixture of 100 gaussians in this
    # case)
    return X[:n_samples], X[n_samples:]
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params, cv=5)
    search.fit(X)
    assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_grid_search_no_score():
    # Test grid-search on classifier that has no score function.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    clf_no_score = LinearSVCNoScore(random_state=0)
    grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
    grid_search.fit(X, y)

    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
                                        scoring='accuracy')
    # smoketest grid search
    grid_search_no_score.fit(X, y)

    # check that best params are equal
    assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
    # check that we can call score and that it gives the correct result
    assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))

    # giving no scoring function raises an error
    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
    assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
                         [[1]])
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_grid_search_iid():
    # test the iid parameter
    # noise-free simple 2d-data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0,
                      cluster_std=0.1, shuffle=False, n_samples=80)
    # split dataset into two folds that are not iid
    # first one contains data of all 4 blobs, second only from two.
    mask = np.ones(X.shape[0], dtype=bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel='linear')
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (default)
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert_equal(first.parameters['C'], 1)
    assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
    # for first split, 1/4 of dataset is in test, for second 3/4.
    # take weighted average
    assert_almost_equal(first.mean_validation_score,
                        1 * 1. / 4. + 1. / 3. * 3. / 4.)

    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert_equal(first.parameters['C'], 1)
    # scores are the same as above
    assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.])
    # averaged score is just mean of scores
    assert_almost_equal(first.mean_validation_score,
                        np.mean(first.cv_validation_scores))
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_gridsearch_no_predict():
    # test grid-search with an estimator without predict.
    # slight duplication of a test from KDE
    def custom_scoring(estimator, X):
        return 42 if estimator.bandwidth == .1 else 0
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    search = GridSearchCV(KernelDensity(),
                          param_grid=dict(bandwidth=[.01, .1, 1]),
                          scoring=custom_scoring)
    search.fit(X)
    assert_equal(search.best_params_['bandwidth'], .1)
    assert_equal(search.best_score_, 42)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_splits=3)
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # get the separate runs from grid scores
            i = 0
            for train, test in cv.split(X, y):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, scores[i])
                i += 1
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # get the separate runs from grid scores
            i = 0
            for train, test in cv:
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, scores[i])
                i += 1
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_lda_coefs():
    # Test if the coefficients of the solvers are approximately the same.
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")

    clf_lda_svd.fit(X, y)
    clf_lda_lsqr.fit(X, y)
    clf_lda_eigen.fit(X, y)

    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
    assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_lda_explained_variance_ratio():
    # Test that the explained variance ratios from the eigen solver sum
    # to 1, and that the eigen and svd solvers produce the same
    # explained_variance_ratio_
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)
    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_svd.fit(X, y)
    assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)
    assert_array_almost_equal(clf_lda_svd.explained_variance_ratio_,
                              clf_lda_eigen.explained_variance_ratio_)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_covariance():
    x, y = make_blobs(n_samples=100, n_features=5,
                      centers=1, random_state=42)

    # make features correlated
    x = np.dot(x, np.arange(x.shape[1] ** 2).reshape(x.shape[1], x.shape[1]))

    c_e = _cov(x, 'empirical')
    assert_almost_equal(c_e, c_e.T)

    c_s = _cov(x, 'auto')
    assert_almost_equal(c_s, c_s.T)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_sag_pobj_matches_logistic_regression():
    """tests if the sag pobj matches log reg"""
    n_samples = 100
    alpha = 1.0
    max_iter = 20
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)

    clf1 = LogisticRegression(solver='sag', fit_intercept=False, tol=.0000001,
                              C=1. / alpha / n_samples, max_iter=max_iter,
                              random_state=10)
    clf2 = clone(clf1)
    clf3 = LogisticRegression(fit_intercept=False, tol=.0000001,
                              C=1. / alpha / n_samples, max_iter=max_iter,
                              random_state=10)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    clf3.fit(X, y)

    pobj1 = get_pobj(clf1.coef_, alpha, X, y, log_loss)
    pobj2 = get_pobj(clf2.coef_, alpha, X, y, log_loss)
    pobj3 = get_pobj(clf3.coef_, alpha, X, y, log_loss)

    assert_array_almost_equal(pobj1, pobj2, decimal=4)
    assert_array_almost_equal(pobj2, pobj3, decimal=4)
    assert_array_almost_equal(pobj3, pobj1, decimal=4)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_sag_classifier_computed_correctly():
    """tests if the binary classifier is computed correctly"""
    alpha = .1
    n_samples = 50
    n_iter = 50
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=n_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    spweights, spintercept = sag_sparse(X, y, step_size, alpha, n_iter=n_iter,
                                        dloss=log_dloss,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=n_iter,
                                          dloss=log_dloss, sparse=True,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights.ravel(),
                              decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_unsupervised_scorers():
    # Test clustering scorers against gold standard labeling.
    # We don't have any real unsupervised Scorers yet.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    score1 = get_scorer('adjusted_rand_score')(km, X_test, y_test)
    score2 = adjusted_rand_score(y_test, km.predict(X_test))
    assert_almost_equal(score1, score2)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_class_weights():
    # check that the class weights are updated
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)
    for Model in [DPGMM, VBGMM]:
        dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50)
        dpgmm.fit(X)
        # get indices of components that are used:
        indices = np.unique(dpgmm.predict(X))
        active = np.zeros(10, dtype=bool)
        active[indices] = True
        # used components are important
        assert_array_less(.1, dpgmm.weights_[active])
        # others are not
        assert_array_less(dpgmm.weights_[~active], .05)
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_verbose_boolean():
    # check that the verbose output is the same for the flag values
    # 1 and True
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)
    for Model in [DPGMM, VBGMM]:
        dpgmm_bool = Model(n_components=10, random_state=1, alpha=20,
                           n_iter=50, verbose=True)
        dpgmm_int = Model(n_components=10, random_state=1, alpha=20,
                          n_iter=50, verbose=1)

        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            # generate output with the boolean flag
            dpgmm_bool.fit(X)
            verbose_output = sys.stdout
            verbose_output.seek(0)
            bool_output = verbose_output.readline()
            # generate output with the int flag
            dpgmm_int.fit(X)
            verbose_output = sys.stdout
            verbose_output.seek(0)
            int_output = verbose_output.readline()
            assert_equal(bool_output, int_output)
        finally:
            sys.stdout = old_stdout
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_verbose_second_level():
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)
    for Model in [DPGMM, VBGMM]:
        dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50,
                      verbose=2)

        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            dpgmm.fit(X)
        finally:
            sys.stdout = old_stdout
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_optimization_minimizes_kl_divergence():
    """t-SNE should give a lower KL divergence with more iterations."""
    random_state = check_random_state(0)
    X, _ = make_blobs(n_features=3, random_state=random_state)
    kl_divergences = []
    for n_iter in [200, 250, 300]:
        tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                    n_iter=n_iter, random_state=0)
        tsne.fit_transform(X)
        kl_divergences.append(tsne.kl_divergence_)
    assert_less_equal(kl_divergences[1], kl_divergences[0])
    assert_less_equal(kl_divergences[2], kl_divergences[1])
Project: Parallel-SGD | Author: angadgill | project source | file source
def test_pipeline():
    # check that LocallyLinearEmbedding works fine as a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    from sklearn import pipeline, datasets
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('filter', manifold.LocallyLinearEmbedding(random_state=0)),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert_less(.9, clf.score(X, y))


Project: Parallel-SGD | Author: angadgill | project source | file source
def test_pipeline():
    # check that Isomap works fine as a transformer in a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('isomap', manifold.Isomap()),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert_less(.9, clf.score(X, y))
Project: Parallel-SGD | Author: angadgill | project source | file source
def check_transformer_general(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
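
Worth noting: when centers is given as an explicit array, make_blobs infers the dimensionality from it and ignores n_features, so the data above is three-dimensional despite n_features=2:

X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                  n_features=2, random_state=0)
assert X.shape == (30, 3)  # dimensionality comes from the centers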
Project: Parallel-SGD | Author: angadgill | project source | file source
def check_pipeline_consistency(name, Estimator):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # on scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)
    estimator = Estimator()
    set_testing_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    if name in DEPRECATED_TRANSFORM:
        funcs = ["score"]
    else:
        funcs = ["score", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_array_almost_equal(result, result_pipe)
Project: Parallel-SGD | Author: angadgill | project source | file source
def check_estimators_pickle(name, Estimator):
    """Test that we can pickle all estimators"""
    if name in DEPRECATED_TRANSFORM:
        check_methods = ["predict", "decision_function", "predict_proba"]
    else:
        check_methods = ["predict", "transform", "decision_function",
                         "predict_proba"]

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)

    # some estimators can't do features less than 0
    X -= X.min()

    # some estimators only take multioutputs
    y = multioutput_estimator_convert_y_2d(name, y)

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()

    set_random_state(estimator)
    set_testing_parameters(estimator)
    estimator.fit(X, y)

    result = dict()
    for method in check_methods:
        if hasattr(estimator, method):
            result[method] = getattr(estimator, method)(X)

    # pickle and unpickle!
    pickled_estimator = pickle.dumps(estimator)
    unpickled_estimator = pickle.loads(pickled_estimator)

    for method in result:
        unpickled_result = getattr(unpickled_estimator, method)(X)
        assert_array_almost_equal(result[method], unpickled_result)
Project: Parallel-SGD | Author: angadgill | project source | file source
def check_estimators_partial_fit_n_features(name, Alg):
    # check that partial_fit raises if the number of features changes
    # between calls.
    if not hasattr(Alg, 'partial_fit'):
        return
    X, y = make_blobs(n_samples=50, random_state=1)
    X -= X.min()
    with warnings.catch_warnings(record=True):
        alg = Alg()
    if not hasattr(alg, 'partial_fit'):
        # check again as for mlp this depends on algorithm
        return

    set_testing_parameters(alg)
    try:
        if isinstance(alg, ClassifierMixin):
            classes = np.unique(y)
            alg.partial_fit(X, y, classes=classes)
        else:
            alg.partial_fit(X, y)
    except NotImplementedError:
        return

    assert_raises(ValueError, alg.partial_fit, X[:, :-1], y)