Python sklearn.cluster 模块,MiniBatchKMeans() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.cluster.MiniBatchKMeans()

项目:gif-enc    作者:DavidBuchanan314    | 项目源码 | 文件源码
def palettise(data, n_entries=256):
    """Reduce an image to a k-means palette using Floyd-Steinberg dithering.

    Parameters
    ----------
    data : list of rows, each row a list of colour vectors
        The image. It is modified in place: every pixel ends up replaced by
        the index of its palette entry.
    n_entries : int
        Number of palette entries, i.e. the number of k-means clusters.

    Returns
    -------
    tuple
        ``(data, pallete)`` where ``data`` is the same object that was passed
        in (now holding palette indices) and ``pallete`` is a list of
        integer-valued colour lists, one per cluster center.
    """
    height = len(data)
    width = len(data[0])
    # Flatten in one pass. ``sum(data, [])`` builds a brand new list for every
    # row, which is quadratic in the number of pixels.
    all_colours = [pixel for row in data for pixel in row]
    print("Calculating pallete...")
    kmeans = MiniBatchKMeans(n_clusters=n_entries, random_state=0).fit(all_colours)
    pallete = [list(map(int, rgb)) for rgb in kmeans.cluster_centers_]

    # (dx, dy, weight) of the neighbours that receive the quantisation error.
    diffusion = ((1, 0, 7/16), (-1, 1, 3/16), (0, 1, 5/16), (1, 1, 1/16))

    print("Dithering...")  # Floyd-Steinberg dithering
    for y in range(height):
        print("\r{:.1f}%".format((y/height)*100), end="")
        for x in range(width):
            pixel = data[y][x]
            bucket = kmeans.predict([pixel])[0]
            error = [a-b for a, b in zip(pixel, pallete[bucket])]
            data[y][x] = bucket
            for dx, dy, coef in diffusion:
                xn = x + dx
                yn = y + dy
                # Neighbours outside the image simply receive no error.
                if 0 <= xn < width and 0 <= yn < height:
                    data[yn][xn] = [a+b*coef for a, b in zip(data[yn][xn], error)]

    print("\r100%     ")
    return data, pallete
项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def k_means(self, n_clusters, batch_size=1000):
        """
        Run mini-batch K-means clustering through ``self._cluster_func``.

        Parameters
        ----------
        n_clusters : int
           number of clusters
        batch_size : int
           the batch size for the MiniBatchKMeans algorithm; it is also used
           as the estimator's ``init_size``

        Returns
        -------
        Whatever ``self._cluster_func`` returns for the configured estimator.
        """
        from sklearn.cluster import MiniBatchKMeans

        # Bookkeeping handed to the shared clustering routine with the model.
        run_parameters = dict(
            batch_size=batch_size,
            is_hierarchical=False,
            metric=self.metric,
        )
        estimator = MiniBatchKMeans(
            n_clusters=n_clusters,
            init='k-means++',
            n_init=10,
            batch_size=batch_size,
            init_size=batch_size,
        )
        return self._cluster_func(n_clusters, estimator, run_parameters)
项目:elm    作者:ContinuumIO    | 项目源码 | 文件源码
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means for model selection

    Computed as ``inertia_ + 2 * m * k`` where ``(k, m)`` is the shape of the
    fitted ``cluster_centers_``.

    Parameters:
        :model:  An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans as final step in Pipeline
        :X:      The X data that were just given to "fit", or "partial_fit";
                 accepted for interface compatibility, the score does not use it
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float

    Side effects:
        Deletes the ``labels_`` attribute from ``model._estimator``.

    '''
    estimator = model._estimator
    k, m = estimator.cluster_centers_.shape
    # The sample count of X used to be computed here but never entered the
    # formula, so that dead computation has been dropped.
    aic = estimator.inertia_ + 2 * m * k
    delattr(estimator, 'labels_')
    return aic
项目:yellowbrick    作者:DistrictDataLabs    | 项目源码 | 文件源码
def test_clusterer_enforcement(self):
        """
        Only clustering estimators may be wrapped by the cluster visualizer.
        """
        # Classifiers and regressors must be rejected at construction time.
        rejected = (
            SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
        )
        for estimator_cls in rejected:
            with self.assertRaises(YellowbrickTypeError):
                ClusteringScoreVisualizer(estimator_cls())

        # Every clusterer must be accepted without a type error.
        accepted = (
            KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
        )
        for estimator_cls in accepted:
            try:
                ClusteringScoreVisualizer(estimator_cls())
            except YellowbrickTypeError:
                self.fail("could not pass clustering estimator to visualizer")
项目:Shoe-Shape-Classifier    作者:jrzaurin    | 项目源码 | 文件源码
def avg_within_ss(X, k):
    """
    Average within-cluster sum of squares of a mini-batch k-means fit.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    avgwithinss: sum of squared distances between each observation and its
                 closest centroid, divided by the number of observations
    """

    clusterer = MiniBatchKMeans(n_clusters=k, init='k-means++', batch_size=50,
                                n_init=3, max_no_improvement=10, verbose=0)
    clusterer.fit(X)

    # Distance of every observation to its nearest fitted centroid.
    to_every_center = cdist(X, clusterer.cluster_centers_, 'euclidean')
    nearest = to_every_center.min(axis=1)

    return sum(nearest**2)/X.shape[0]
项目:QScode    作者:PierreHao    | 项目源码 | 文件源码
def fit(self, descs, MiniBatchKMeans=True, batch_size=10000, preprocess=False):
        """Learn cluster centers from ``descs`` and store them on the instance.

        ``MiniBatchKMeans`` selects ``self.MiniBatchKMeans(descs, batch_size)``
        over ``self.Kmeans(descs)``. With ``preprocess`` a StandardScaler is
        fitted on ``descs`` afterwards and kept in ``self.stdSlr`` (the
        descriptors themselves are not transformed); otherwise ``self.stdSlr``
        is set to None. Returns ``self.centers``.
        """
        if not MiniBatchKMeans:
            self.centers = self.Kmeans(descs)
        else:
            self.centers = self.MiniBatchKMeans(descs, batch_size)

        # The scaler is only fitted and stored, never applied to descs here.
        self.stdSlr = StandardScaler().fit(descs) if preprocess else None
        return self.centers
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_k_means_explicit_init_shape():
    """Explicit inits with a wrong shape must raise a descriptive ValueError.

    Both array and callable inits are checked, once with the wrong number of
    features and once with the wrong number of clusters.
    """
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    feature_msg = "does not match the number of features of the data"
    cluster_msg = "does not match the number of clusters"
    for Class in [KMeans, MiniBatchKMeans]:
        # (expected message, init, n_clusters), in the original check order.
        cases = [
            (feature_msg, X[:, :2], len(X)),
            (feature_msg, lambda X_, k, random_state: X_[:, :2], len(X)),
            (cluster_msg, X[:2, :], 3),
            (cluster_msg, lambda X_, k, random_state: X_[:2, :], 3),
        ]
        for msg, init, n_clusters in cases:
            km = Class(n_init=1, init=init, n_clusters=n_clusters)
            assert_raises_regex(ValueError, msg, km.fit, X)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_sensible_reassign_fit():
    """Identical initial clusters must get reassigned during ``fit``.

    Half of the samples are zeroed so that many random initial centers
    coincide. The second batch size (201) exceeds the number of samples and
    acts as a regression test for requesting more reassignments than samples.
    """
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    for batch_size in (10, 201):
        mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=batch_size,
                                     random_state=42, init="random")
        mb_k_means.fit(zeroed_X)
        # there should not be too many exact zero cluster centers
        non_zero_centers = mb_k_means.cluster_centers_.any(axis=1).sum()
        assert_greater(non_zero_centers, 10)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_sparse_mb_k_means_callable_init():
    """A callable init must be validated and usable on the sparse fixture."""

    def fixed_centers_init(X, k, random_state):
        # Always hand back the module-level reference centers.
        return centers

    # Without an explicit n_clusters the estimator's own default is used; the
    # test expects that to mismatch the returned centers and raise a
    # meaningful error.
    default_k_model = MiniBatchKMeans(init=fixed_centers_init, random_state=42)
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg, default_k_model.fit, X_csr)

    # With a matching number of clusters the fit actually works.
    matching_model = MiniBatchKMeans(n_clusters=3, init=fixed_centers_init,
                                     random_state=42)
    _check_fitted_model(matching_model.fit(X_csr))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_mini_batch_k_means_random_init_partial_fit():
    """Online learning via ``partial_fit`` must recover the true labelling."""
    online_model = MiniBatchKMeans(init="random", n_clusters=n_clusters,
                                   random_state=42)

    # Feed the data in ten consecutive chunks through the partial_fit API.
    for chunk in np.array_split(X, 10):
        online_model.partial_fit(chunk)

    # Score the labelling of the complete dataset against the ground truth.
    score = v_measure_score(true_labels, online_model.predict(X))
    assert_equal(score, 1.0)
项目:nlp-playground    作者:jamesmishra    | 项目源码 | 文件源码
def test__ClusteringWithSupervision_clusters():
    """
    Check that fitting changes the number of clusters only on the internal copy.

    The wrapper exposes ``n_clusters`` itself, but the value actually used
    lives on ``_cluster_instance``, which the test expects to exist only
    after ``fit``.
    """
    features, targets = make_X_y()
    wrapper = ClusteringWithSupervision(cluster_instance=MiniBatchKMeans())

    def assert_public_view_untouched():
        # The user-facing configuration must be identical before and after fit.
        assert wrapper.n_clusters is None
        assert wrapper.get_params()['n_clusters'] is None
        assert wrapper.cluster_instance.n_clusters == 8

    assert_public_view_untouched()
    assert wrapper._cluster_instance is None
    wrapper.fit(features, targets)
    assert_public_view_untouched()
    assert wrapper._cluster_instance.n_clusters == 4
项目:pybot    作者:spillai    | 项目源码 | 文件源码
def bow_codebook(data, K=64):
    """Fit a mini-batch k-means on ``data`` and return its ``K`` cluster centers."""
    clusterer = MiniBatchKMeans(
        n_clusters=K,
        init='k-means++',
        batch_size=1000,
        max_iter=150,
        max_no_improvement=30,
        compute_labels=False,
        verbose=False,
    )
    clusterer.fit(data)
    return clusterer.cluster_centers_
项目:dask-ml    作者:dask    | 项目源码 | 文件源码
def test_basic(self, single_chunk_blobs):
        """The partial wrapper's ``fit`` must match the reference ``partial_fit``."""
        X, y = single_chunk_blobs
        wrapped = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
        reference = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)
        wrapped.fit(X)
        reference.partial_fit(X)
        # random_state_ is excluded from the attribute comparison.
        assert_estimator_equal(wrapped, reference, exclude=['random_state_'])
项目:base_function    作者:Rockyzsu    | 项目源码 | 文件源码
def mini_batch(fig):
    """Cluster the module-level ``X_iris`` into 3 groups and plot them in 3D.

    A subplot titled 'mini-batch' is added to ``fig`` at position ``geo + 2``
    and every sample is drawn using its first three values, coloured by its
    cluster label. Returns the fitted labels.
    """
    ax = fig.add_subplot(geo + 2, projection='3d', title='mini-batch')
    estimator = cluster.MiniBatchKMeans(init='random', n_clusters=3)
    estimator.fit(X_iris)
    res = estimator.labels_

    palette = 'bgrcmyk'
    for sample, label in zip(X_iris, res):
        ax.scatter(*sample[: 3], c=palette[label % 7], marker='o')

    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
项目:PPRE    作者:MaoYuwei    | 项目源码 | 文件源码
def train(X, y, true_k=50, minibatch=False, showLable=True):
    """Cluster ``X`` with (mini-batch) k-means and dump each cluster's members.

    Parameters
    ----------
    X : array-like
        Feature vectors, one row per sample.
    y : array-like of str
        One text entry per row of ``X``; must expose ``dtype``. The entries
        are what gets written to the output file.
    true_k : int
        Number of clusters.
    minibatch : bool
        Use MiniBatchKMeans instead of KMeans.
    showLable : bool
        When true, print the cluster size distribution and write the members
        of every non-empty cluster to ``pro1_cluster.txt``, one per line, each
        cluster followed by a separator line.

    Returns
    -------
    The negated ``km.score(X)``.
    """
    from collections import Counter

    # The output file is (re)created even when showLable is false, as before.
    # The context manager closes it on every exit path; the old trailing
    # ``fout.close()`` sat after the ``return`` and never ran.
    with open('pro1_cluster.txt', 'w+') as fout:
        if minibatch:
            km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                                 init_size=1000, batch_size=1000, verbose=False)
        else:
            km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                        verbose=False)
        km.fit(X)
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(y.dtype)
        if showLable:
            print("Top terms per cluster:")
            result = list(km.predict(X))
            print('Cluster distribution:')
            # One counting pass instead of result.count() per element.
            print(dict(Counter(result)))
            cluster_list = {i: [] for i in range(true_k)}
            for j, label in enumerate(result):
                cluster_list[label].append(y[j])
            for i in range(true_k):
                members = cluster_list[i]
                if members:
                    for term in members:
                        fout.write(term + '\n')
                    fout.write('-------------------\n')

        return -km.score(X)
项目:oss-github-analysis-project    作者:itu-oss-project-team    | 项目源码 | 文件源码
def minibatchs_k_means_clustering(self, out_path, pd_data, number_of_clusters):
        """Cluster the repositories with mini-batch k-means and export the result.

        The data is split by ``self.__fetch_data`` into headers, repos and
        features. After fitting on the features, the repos are grouped by
        label into one list per cluster and handed to
        ``self.__export_k_means_results`` together with an output path built
        from ``out_path`` and the cluster count.
        """
        headers, repos, features = self.__fetch_data(pd_data)

        mb_kmeans = MiniBatchKMeans(n_clusters=number_of_clusters)
        mb_kmeans.fit(features)

        # clusters[c] holds every repo whose label equals c, in input order.
        clusters = [
            [repos[idx]
             for idx, label in enumerate(mb_kmeans.labels_)
             if label == cluster_id]
            for cluster_id in range(0, number_of_clusters)
        ]

        file_name = "mb_kmeans_noOfClusters" + str(number_of_clusters)  # avoid ".csv"
        out_file_path = os.path.join(out_path, file_name)
        self.__export_k_means_results(mb_kmeans, headers, clusters, out_file_path)
项目:feature-aggregation    作者:paschalidoud    | 项目源码 | 文件源码
def __init__(self, n_codewords, normalization=3, inner_batch=128,
                 dimension_ordering="tf"):
        """Store the encoder settings and create the codebook clusterer.

        ``n_codewords`` becomes the number of clusters of the internal
        MiniBatchKMeans; ``normalization`` and ``inner_batch`` are only stored
        here. ``dimension_ordering`` is forwarded to the parent initialiser.
        """
        self.n_codewords = n_codewords
        self.normalization = normalization
        self.inner_batch = inner_batch

        # A single initialisation, and no labels computed during fitting.
        clusterer_options = dict(
            n_clusters=self.n_codewords,
            n_init=1,
            compute_labels=False,
        )
        self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

        # NOTE(review): super(self.__class__, self) recurses endlessly as soon
        # as this class is subclassed; naming the class explicitly avoids that.
        super(self.__class__, self).__init__(dimension_ordering)
项目:feature-aggregation    作者:paschalidoud    | 项目源码 | 文件源码
def __init__(self, n_codewords, neighbors=5, beta=1e-4, dimension_ordering="tf"):
        """Store the encoder settings and create the codebook clusterer.

        ``n_codewords`` becomes the number of clusters of the internal
        MiniBatchKMeans; ``neighbors`` and ``beta`` are only stored here.
        ``dimension_ordering`` is forwarded to the parent initialiser.
        """
        self.n_codewords = n_codewords
        self.beta = beta
        self.neighbors = neighbors

        # A single initialisation, and no labels computed during fitting.
        clusterer_options = dict(
            n_clusters=self.n_codewords,
            n_init=1,
            compute_labels=False,
        )
        self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

        # NOTE(review): super(self.__class__, self) recurses endlessly as soon
        # as this class is subclassed; naming the class explicitly avoids that.
        super(self.__class__, self).__init__(dimension_ordering)
项目:feature-aggregation    作者:paschalidoud    | 项目源码 | 文件源码
def __init__(self, n_codewords, l1_norm=True, dimension_ordering="tf"):
        """Store the encoder settings and create the codebook clusterer.

        ``n_codewords`` becomes the number of clusters of the internal
        MiniBatchKMeans; ``l1_norm`` is only stored here.
        ``dimension_ordering`` is forwarded to the parent initialiser.
        """
        self.l1_norm = l1_norm
        self.n_codewords = n_codewords

        # A single initialisation, and no labels computed during fitting.
        clusterer_options = dict(
            n_clusters=self.n_codewords,
            n_init=1,
            compute_labels=False,
        )
        self._clusterer = cluster.MiniBatchKMeans(**clusterer_options)

        # NOTE(review): super(self.__class__, self) recurses endlessly as soon
        # as this class is subclassed; naming the class explicitly avoids that.
        super(self.__class__, self).__init__(dimension_ordering)
项目:deephash    作者:caoyue10    | 项目源码 | 文件源码
def initial_centers(self, img_output):
        """Initialise the codebook with one k-means run per subspace.

        The columns of ``img_output`` are cut into ``self.subspace_num``
        contiguous slices. The ``self.subcenter_num`` centers fitted on slice
        ``i`` are written into rows ``i * subcenter_num`` to
        ``(i + 1) * subcenter_num`` and the same column slice of the result;
        all other entries stay zero.

        Parameters
        ----------
        img_output : 2-D array
            Outputs to cluster, indexed as ``[:, columns]``.

        Returns
        -------
        numpy array of shape (subspace_num * subcenter_num, output_dim)
        """
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        # Single-argument parenthesised print is identical on Python 2 and 3.
        print("#DVSQ train# initilizing Centers")
        all_output = img_output
        for i in range(self.subspace_num):
            # Floor division keeps the slice bounds integral under Python 3's
            # true division and equals the old Python 2 integer division.
            col_start = i * self.output_dim // self.subspace_num
            col_stop = (i + 1) * self.output_dim // self.subspace_num
            row_start = i * self.subcenter_num
            row_stop = (i + 1) * self.subcenter_num
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_start:col_stop])
            C_init[row_start:row_stop, col_start:col_stop] = kmeans.cluster_centers_
            print("step:  %s  finish" % (i,))
        return C_init
项目:PySCUBA    作者:GGiecold    | 项目源码 | 文件源码
def KMEANS(data, k):
    """Cluster ``data`` into ``k`` groups, switching algorithm by sample count.

    Fewer than 20000 rows are handled by ``k_means``; larger inputs use
    MiniBatchKMeans with a batch size of ``data.shape[0] // k``.

    Parameters
    ----------
    data : 2-D array, one row per sample
    k : int
        Number of clusters.

    Returns
    -------
    (centroids, cluster_IDs)
        The cluster centers and the cluster label of every sample.
    """

    if data.shape[0] < 20000:
        # NOTE(review): precompute_distances was removed from recent
        # scikit-learn releases; confirm against the pinned version.
        centroids, cluster_IDs, _ = k_means(data, k, init = 'k-means++', precompute_distances = 'auto', n_init = 20, max_iter = 200)
    else:
        # batch_size must be an integer: floor division keeps it one under
        # Python 3 true division. Keyword arguments avoid relying on the
        # positional order of the constructor parameters.
        mbkm = MiniBatchKMeans(n_clusters = k, init = 'k-means++', max_iter = 100, batch_size = data.shape[0] // k, n_init = 20)
        mbkm.fit(data)

        centroids = mbkm.cluster_centers_
        cluster_IDs = mbkm.labels_

    return centroids, cluster_IDs
项目:elm    作者:ContinuumIO    | 项目源码 | 文件源码
def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    """Derive example labels by clustering ``X.flat.values`` into 5 groups.

    The incoming ``y`` and any extra keyword arguments are ignored. Returns
    ``(X, y, sample_weight)`` where ``y`` holds the labels predicted for
    ``X.flat.values``.
    """
    clusterer = MiniBatchKMeans(n_clusters=5)
    clusterer.fit(X.flat.values)
    return (X, clusterer.predict(X.flat.values), sample_weight)
项目:aaai17-cdq    作者:caoyue10    | 项目源码 | 文件源码
def initial_centers(self, img_output, txt_output):
        """Initialise the codebook with one k-means run per subspace.

        Image and text outputs are stacked row-wise and the columns are cut
        into ``self.subspace_num`` contiguous slices. The
        ``self.subcenter_num`` centers fitted on slice ``i`` are written into
        rows ``i * subcenter_num`` to ``(i + 1) * subcenter_num`` and the same
        column slice of the result; all other entries stay zero.

        Parameters
        ----------
        img_output, txt_output : 2-D arrays with the same number of columns

        Returns
        -------
        numpy array of shape (subspace_num * subcenter_num, output_dim)
        """
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        # Single-argument parenthesised print is identical on Python 2 and 3.
        print("#cdq train# initilizing Centers")
        all_output = np.vstack([img_output, txt_output])
        for i in range(self.subspace_num):
            # Floor division keeps the slice bounds integral under Python 3's
            # true division and equals the old Python 2 integer division.
            col_start = i * self.output_dim // self.subspace_num
            col_stop = (i + 1) * self.output_dim // self.subspace_num
            row_start = i * self.subcenter_num
            row_stop = (i + 1) * self.subcenter_num
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_start:col_stop])
            C_init[row_start:row_stop, col_start:col_stop] = kmeans.cluster_centers_
            print("step:  %s  finish" % (i,))
        return C_init
项目:TFFRCNN    作者:InterVideo    | 项目源码 | 文件源码
def _kmeans_clustering(self, X, n_clusters, batch_size=128):
        """Fit a mini-batch k-means on ``X`` and return its cluster centers."""
        clusterer = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            max_no_improvement=10,
            n_init=10,
        )
        clusterer.fit(X)
        return clusterer.cluster_centers_
项目:TFFRCNN    作者:InterVideo    | 项目源码 | 文件源码
def generate_codebook(image, detectAndCompute=SIFT_create().detectAndCompute):
    """Build a 2048-entry codebook from the descriptors of ``image``.

    ``detectAndCompute`` is called as ``detectAndCompute(image,
    window_size=None)``; its result is clustered with mini-batch k-means and
    a full slice of the fitted cluster centers is returned.
    """
    features = detectAndCompute(image, window_size=None)
    clusterer = MiniBatchKMeans(
        n_clusters=2048,
        batch_size=128,
        max_no_improvement=10,
        n_init=10,
    )
    clusterer.fit(features)
    return clusterer.cluster_centers_[:]
项目:mmfeat    作者:douwekiela    | 项目源码 | 文件源码
def cluster(self):
        """Cluster the stored descriptors and return the ``self.K`` centers.

        ``self.data`` maps a file name to its descriptor array (or None).
        When ``self.subsample`` is None every non-None entry is used;
        otherwise a random subset of ``int(self.subsample * number_of_files)``
        files is drawn and only their non-None entries are stacked.

        Returns
        -------
        The ``cluster_centers_`` of the fitted MiniBatchKMeans.
        """
        mbk = MiniBatchKMeans(n_clusters=self.K, batch_size=self.K*2, verbose=self.verbose, compute_labels=False)
        if self.subsample is None:
            selected = self.data.keys()
        else: # sample number of files
            # random.sample needs a real sequence; a dict keys view is
            # rejected by current Python 3 releases.
            fnames = list(self.data.keys())
            selected = random.sample(fnames, int(self.subsample * len(fnames)))
        stacked = np.vstack([self.data[k] for k in selected if self.data[k] is not None])
        mbk.fit(stacked)
        return mbk.cluster_centers_
项目:histonets-cv    作者:sul-cidr    | 项目源码 | 文件源码
def test_kmeans(self):
        """``utils.kmeans`` must return as many labels and centers as sklearn."""
        n_clusters = 5
        X, y = make_blobs(n_samples=1000, centers=n_clusters, random_state=0)
        centers, labels = utils.kmeans(X, n_clusters)

        # Reference model fitted directly on the same data.
        reference = MiniBatchKMeans(n_clusters=n_clusters)
        reference_labels = reference.fit_predict(X)
        assert len(labels) == len(reference_labels)
        assert len(centers) == len(reference.cluster_centers_)
项目:histonets-cv    作者:sul-cidr    | 项目源码 | 文件源码
def kmeans(X, n_clusters, **kwargs):
    """Cluster the vectors in X into n_clusters groups with MiniBatchKMeans.

    Any extra keyword arguments are forwarded to the scikit-learn estimator.
    Returns a tuple ``(centers, labels)``; the centers are cast to unsigned
    bytes."""
    model = MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    assignments = model.fit_predict(X)
    return model.cluster_centers_.astype(np.ubyte), assignments
项目:BugClustering    作者:w-garcia    | 项目源码 | 文件源码
def kmeans_classifier(prediction, ticket_predict_weights, ticket_target_list, tickets_to_weights_matrix):
    """Predict a target for a ticket from a k-means fit of the known tickets.

    One cluster per entry of ``ticket_target_list`` is fitted on
    ``tickets_to_weights_matrix``. The cluster predicted for the first row of
    ``ticket_predict_weights`` is used as an index into
    ``ticket_target_list``; the chosen target is printed and, when
    ``prediction`` is not None, appended to it as a one-element list.

    NOTE(review): cluster indices are used directly as positions in
    ``ticket_target_list``; confirm that this correspondence is intended.
    """
    kmeans = MiniBatchKMeans(n_clusters=len(ticket_target_list), init_size=len(tickets_to_weights_matrix) + 1)
    kmeans.fit(tickets_to_weights_matrix)

    predicted_class = kmeans.predict(ticket_predict_weights)[0]
    predicted_target = ticket_target_list[predicted_class]
    # Single-argument parenthesised print is identical on Python 2 and 3.
    print("kmeans prediction: {}".format(predicted_target))
    if prediction is not None:
        prediction.append([predicted_target])
项目:yellowbrick    作者:DistrictDataLabs    | 项目源码 | 文件源码
def test_integrated_mini_batch_kmeans_elbow(self):
        """
        The k-elbow visualizer must run end to end with mini-batch kmeans.

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Synthetic blobs stand in for the occupancy dataset.
        blob_options = dict(n_samples=1000, n_features=12, centers=6, shuffle=True)
        X, y = make_blobs(**blob_options)

        try:
            elbow = KElbowVisualizer(MiniBatchKMeans(), k=4)
            elbow.fit(X)
            elbow.poof()
        except Exception as err:
            self.fail("error during k-elbow: {}".format(err))
项目:yellowbrick    作者:DistrictDataLabs    | 项目源码 | 文件源码
def test_integrated_mini_batch_kmeans_silhouette(self):
        """
        The silhouette visualizer must run end to end with mini-batch kmeans.

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Synthetic blobs stand in for the occupancy dataset.
        blob_options = dict(n_samples=1000, n_features=12, centers=8, shuffle=True)
        X, y = make_blobs(**blob_options)

        try:
            silhouette = SilhouetteVisualizer(MiniBatchKMeans())
            silhouette.fit(X)
            silhouette.poof()
        except Exception as err:
            self.fail("error during silhouette: {}".format(err))
项目:Shoe-Shape-Classifier    作者:jrzaurin    | 项目源码 | 文件源码
def perc_var_explained(X,k):
    """
    Compute the percentage of variance explained, defined as the between-cluster
    sum of squares divided by the total sum of squares.
    The code here can be found "almost" anywhere online.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    pve: percentage of variance explained
    """

    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                          n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)

    centroids = model.cluster_centers_
    dist_c = cdist(X, centroids, 'euclidean')
    dist   = np.min(dist_c, axis=1)
    tot_withinss = np.sum(dist**2)
    # Total sum of squares taken around the global mean. This equals the sum
    # of all squared pairwise distances divided by the number of observations,
    # but avoids materialising and summing the O(n^2) pairwise distances.
    points = np.asarray(X, dtype=float)
    totss = np.sum((points - points.mean(axis=0))**2)
    betweenss = totss - tot_withinss
    pve = (betweenss/totss  *100)

    return pve
项目:Shoe-Shape-Classifier    作者:jrzaurin    | 项目源码 | 文件源码
def bic(X, k):
    """
    Compute the BIC score.
    Implementation from here:
    http://www.aladdin.cs.cmu.edu/papers/pdfs/y2000/xmeans.pdf
    with corrections from here:
    https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    BIC: bic score
    """

    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                          n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)

    centers = model.cluster_centers_
    # Insert an axis so that centers[c] is a 2-D, single-row array, which is
    # the shape cdist expects for its second argument.
    centers = np.expand_dims(centers, axis=1)
    labels  = model.labels_
    # Number of observations assigned to each cluster label.
    # NOTE(review): a cluster with zero members yields log(0) below - confirm
    # that empty clusters cannot occur for the inputs used.
    N_C = np.bincount(labels)
    # R observations, M features.
    R, M = X.shape

    # Within-cluster sum of squared distances to the cluster's own center,
    # accumulated over all clusters.
    wcss = sum([sum(cdist(X[np.where(labels == c)], centers[c], 'euclidean')**2) for c in range(k)])
    # Variance estimate: wcss divided by (R - k) and by M.
    var = (1.0/(R-k)/M) * wcss
    # Penalty term that grows with k, log(R) and the feature count.
    const_term = 0.5 * k * np.log(R) * (M+1)

    # Sum of the per-cluster terms minus the penalty.
    # NOTE(review): the "/ 2" divisions operate on integer counts; under
    # Python 2 without true division they would truncate - confirm interpreter.
    BIC = np.sum([ ( Rn * np.log(Rn) ) -
                   ( Rn * np.log(R) ) -
                   ( ((Rn * M) / 2) * np.log(2*np.pi*var) )  -
                   ( (Rn - 1) * M/ 2 )
                   for Rn in N_C]) - const_term

    return BIC
项目:kmc2    作者:obachem    | 项目源码 | 文件源码
def test_scenarios():
    """Every scenario's kmc2 seeding must be usable as a MiniBatchKMeans init."""
    for scenario in scenarios():
        seeds = kmc2.kmc2(**scenario)
        with warnings.catch_warnings():
            # disable sklearn warnings while fitting
            warnings.simplefilter("ignore")
            fitted = MiniBatchKMeans(scenario["k"], init=seeds).fit(scenario["X"])
        # Reading the centers only works on a successfully fitted model.
        centers_after_fit = fitted.cluster_centers_
项目:aliMusic    作者:wangqingbaidu    | 项目源码 | 文件源码
def gen_cluster(keys = None, cluster_matrix = None):
    """Cluster the rows of ``cluster_matrix`` into 50 groups.

    Parameters
    ----------
    keys : pandas object
        Identifier column(s), concatenated with the labels.
    cluster_matrix : pandas object exposing ``.values``
        Feature matrix that gets clustered.

    Returns
    -------
    pandas.DataFrame
        ``keys`` and the predicted labels side by side, with the column
        index reset.
    """
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    # Single-argument parenthesised print is identical on Python 2 and 3.
    print("Clustering data...")
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
    res = pd.concat([keys, labels], axis = 1, ignore_index=True)
    return res
项目:aliMusic    作者:wangqingbaidu    | 项目源码 | 文件源码
def gen_cluster(keys = None, cluster_matrix = None):
    """Cluster the rows of ``cluster_matrix`` into 50 groups.

    Parameters
    ----------
    keys : pandas object
        Identifier column(s), concatenated with the labels. Required.
    cluster_matrix : pandas object exposing ``.values``
        Feature matrix that gets clustered. Required.

    Returns
    -------
    pandas.DataFrame
        ``keys`` and the predicted labels side by side, with the column
        index reset.

    Raises
    ------
    AssertionError
        If either argument is left as None.
    """
    # Compare against None explicitly: taking the truth value of a DataFrame
    # raises ValueError, so the old ``assert cluster_matrix and keys`` could
    # never pass for real inputs.
    assert cluster_matrix is not None and keys is not None
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))

    res = pd.concat([keys, labels], axis = 1, ignore_index=True)
    return res
项目:QScode    作者:PierreHao    | 项目源码 | 文件源码
def fit(self, descs, preprocess=True):
        """Cluster ``descs`` and keep the centers and labels on the instance.

        With ``preprocess`` the descriptors are standardised by a freshly
        fitted StandardScaler, which is stored in ``self.stdSlr``; otherwise
        they are used as given and ``self.stdSlr`` is None. The fitted centers
        go to ``self.centers`` (also returned) and the labels to
        ``self.clusters``.
        """
        if not preprocess:
            training_data = descs
            self.stdSlr = None
        else:
            self.stdSlr = StandardScaler()
            self.stdSlr.fit(descs)
            training_data = self.stdSlr.transform(descs)

        clusterer = MiniBatchKMeans(n_clusters=self.num_clusters, init='k-means++',
                                    batch_size=10000)
        clusterer.fit(training_data)
        self.centers = clusterer.cluster_centers_
        self.clusters = clusterer.labels_
        return self.centers
项目:QScode    作者:PierreHao    | 项目源码 | 文件源码
def MiniBatchKMeans(self, X, batch=10000):
        """Fit a mini-batch k-means with ``self.k`` clusters and return its centers.

        ``batch`` is passed on as the estimator's batch size. The shape of the
        input and of the resulting centers is printed for inspection.
        """
        print("in fit method", X.shape, self.k)
        model = MiniBatchKMeans(n_clusters=self.k, init='k-means++', batch_size=batch)
        model.fit(X)
        centers = model.cluster_centers_
        # The labels are read but not used any further here.
        assignments = model.labels_
        print("shape of centers is ", centers.shape)
        return centers
项目:cvpr17-dvsq    作者:caoyue10    | 项目源码 | 文件源码
def initial_centers(self, img_output):
        """Initialise the codebook with one k-means run per subspace.

        The columns of ``img_output`` are cut into ``self.subspace_num``
        contiguous slices. The ``self.subcenter_num`` centers fitted on slice
        ``i`` are written into rows ``i * subcenter_num`` to
        ``(i + 1) * subcenter_num`` and the same column slice of the result;
        all other entries stay zero.

        Parameters
        ----------
        img_output : 2-D array
            Outputs to cluster, indexed as ``[:, columns]``.

        Returns
        -------
        numpy array of shape (subspace_num * subcenter_num, output_dim)
        """
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        # Single-argument parenthesised print is identical on Python 2 and 3.
        print("#DVSQ train# initilizing Centers")
        all_output = img_output
        for i in range(self.subspace_num):
            # Floor division keeps the slice bounds integral under Python 3's
            # true division and equals the old Python 2 integer division.
            col_start = i * self.output_dim // self.subspace_num
            col_stop = (i + 1) * self.output_dim // self.subspace_num
            row_start = i * self.subcenter_num
            row_stop = (i + 1) * self.subcenter_num
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_start:col_stop])
            C_init[row_start:row_stop, col_start:col_stop] = kmeans.cluster_centers_
            print("step:  %s  finish" % (i,))
        return C_init
项目:cvpr17-dvsq    作者:caoyue10    | 项目源码 | 文件源码
def initial_centers(self, img_output):
        """Initialise the codebook with one k-means run per subspace.

        The columns of ``img_output`` are cut into ``self.subspace_num``
        contiguous slices. The ``self.subcenter_num`` centers fitted on slice
        ``i`` are written into rows ``i * subcenter_num`` to
        ``(i + 1) * subcenter_num`` and the same column slice of the result;
        all other entries stay zero.

        Parameters
        ----------
        img_output : 2-D array
            Outputs to cluster, indexed as ``[:, columns]``.

        Returns
        -------
        numpy array of shape (subspace_num * subcenter_num, output_dim)
        """
        C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
        # Single-argument parenthesised print is identical on Python 2 and 3.
        print("#ZDQ train# initilizing Centers")
        all_output = img_output
        for i in range(self.subspace_num):
            # Floor division keeps the slice bounds integral under Python 3's
            # true division and equals the old Python 2 integer division.
            col_start = i * self.output_dim // self.subspace_num
            col_stop = (i + 1) * self.output_dim // self.subspace_num
            row_start = i * self.subcenter_num
            row_stop = (i + 1) * self.subcenter_num
            kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(all_output[:, col_start:col_stop])
            C_init[row_start:row_stop, col_start:col_stop] = kmeans.cluster_centers_
            print("step:  %s  finish" % (i,))
        return C_init
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_mb_k_means_plus_plus_init_dense_array():
    """k-means++ seeding on the dense fixture must yield a valid fitted model."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++",
                                random_state=42)
    estimator.fit(X)
    _check_fitted_model(estimator)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_mb_kmeans_verbose():
    """Fitting with ``verbose=1`` must not fail; its output is swallowed."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++",
                                verbose=1, random_state=42)
    # Redirect stdout for the duration of the fit and always restore it.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        estimator.fit(X)
    finally:
        sys.stdout = saved_stdout
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_mb_k_means_plus_plus_init_sparse_matrix():
    """k-means++ seeding on the sparse fixture must yield a valid fitted model."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++",
                                random_state=42)
    estimator.fit(X_csr)
    _check_fitted_model(estimator)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_init_with_large_k():
    """Requesting more clusters than ``init_size`` must emit a RuntimeWarning."""
    # 20 clusters cannot be seeded from an init sample of only 10 points.
    oversized_k_model = MiniBatchKMeans(n_clusters=20, init_size=10,
                                        init='k-means++')
    assert_warns(RuntimeWarning, oversized_k_model.fit, X)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_k_means_random_init_sparse_csr():
    """Random init on the sparse fixture must yield a valid fitted model."""
    # n_init is raised to make the random initialisation stable enough.
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init="random",
                                n_init=10, random_state=42)
    estimator.fit(X_csr)
    _check_fitted_model(estimator)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_k_means_perfect_init_dense_array():
    """Starting from a copy of the reference centers must fit the dense fixture."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init=centers.copy(),
                                n_init=1, random_state=42)
    estimator.fit(X)
    _check_fitted_model(estimator)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    """Explicit centers combined with ``n_init=10`` must emit a RuntimeWarning."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init=centers.copy(),
                                n_init=10, random_state=42)
    assert_warns(RuntimeWarning, estimator.fit, X)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_k_means_perfect_init_sparse_csr():
    """Starting from a copy of the reference centers must fit the sparse fixture."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init=centers.copy(),
                                n_init=1, random_state=42)
    estimator.fit(X_csr)
    _check_fitted_model(estimator)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_with_many_reassignments():
    """Fitting must work when more clusters need reassigning than fit in a batch.

    550 clusters on 550 samples are used because these values turned out to
    make the number of clusters to reassign always exceed the batch size.
    """
    n_samples = 550
    n_clusters = 550
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(n_samples, 10))
    # The fit itself is the check: it must succeed with n_clusters > batch_size.
    estimator = MiniBatchKMeans(n_clusters=n_clusters,
                                init_size=n_samples,
                                batch_size=100,
                                random_state=42)
    estimator.fit(X)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_minibatch_default_init_size():
    """Without an explicit ``init_size`` the fitted value must be 3 * batch_size."""
    estimator = MiniBatchKMeans(n_clusters=n_clusters, init=centers.copy(),
                                batch_size=10, n_init=1, random_state=42)
    estimator.fit(X)
    expected_init_size = 3 * estimator.batch_size
    assert_equal(estimator.init_size_, expected_init_size)
    _check_fitted_model(estimator)