The following are 50 code examples, extracted from open-source Python projects, showing how to use sklearn.cluster.MiniBatchKMeans().
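Before the project examples, here is a minimal, self-contained sketch of the typical MiniBatchKMeans workflow. The data is synthetic (make_blobs) and the parameter values (n_clusters=5, batch_size=256) are illustrative choices, not taken from any of the projects below:

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.datasets import make_blobs

    # Synthetic data: 10,000 points drawn around 5 centers (illustrative only).
    X, _ = make_blobs(n_samples=10000, centers=5, n_features=8, random_state=0)

    # Fit on mini-batches of 256 samples instead of the full dataset at once.
    km = MiniBatchKMeans(n_clusters=5, batch_size=256, random_state=0)
    labels = km.fit_predict(X)      # cluster index for each sample
    centers = km.cluster_centers_   # array of shape (5, 8) with the centroids

    # partial_fit supports true online/streaming updates, one batch at a time.
    km_online = MiniBatchKMeans(n_clusters=5, random_state=0)
    for chunk in np.array_split(X, 10):
        km_online.partial_fit(chunk)

Most of the examples that follow use exactly these pieces: the n_clusters and batch_size constructor arguments, fit/fit_predict/partial_fit, and the cluster_centers_ and labels_ attributes.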
def palettise(data, n_entries=256):
    height = len(data)
    width = len(data[0])
    all_colours = sum(data, [])

    print("Calculating palette...")
    kmeans = MiniBatchKMeans(n_clusters=n_entries, random_state=0).fit(all_colours)
    palette = [list(map(int, rgb)) for rgb in kmeans.cluster_centers_]

    print("Dithering...")
    # Floyd–Steinberg dithering
    for y in range(height):
        print("\r{:.1f}%".format((y/height)*100), end="")
        for x in range(width):
            bucket = kmeans.predict([data[y][x]])[0]
            error = [a-b for a, b in zip(data[y][x], palette[bucket])]
            data[y][x] = bucket
            for dx, dy, coef in [(1, 0, 7/16), (-1, 1, 3/16),
                                 (0, 1, 5/16), (1, 1, 1/16)]:
                xn = x + dx
                yn = y + dy
                if 0 <= xn < width and 0 <= yn < height:
                    data[yn][xn] = [a+b*coef for a, b in zip(data[yn][xn], error)]
    print("\r100% ")

    return data, palette
def k_means(self, n_clusters, batch_size=1000):
    """Perform K-means clustering

    Parameters
    ----------
    n_clusters : int
        number of clusters
    batch_size : int
        the batch size for the MiniBatchKMeans algorithm
    """
    from sklearn.cluster import MiniBatchKMeans
    pars = {"batch_size": batch_size, 'is_hierarchical': False,
            "metric": self.metric}
    km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                         n_init=10, init_size=batch_size,
                         batch_size=batch_size)
    return self._cluster_func(n_clusters, km, pars)
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means for model selection

    Parameters:
        :model: An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans
                as final step in Pipeline
        :X: The X data that were just given to "fit", or "partial_fit"
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float
    '''
    k, m = model._estimator.cluster_centers_.shape
    if isinstance(X, xr.DataArray):
        n = X.flat.values.shape[0]
    else:
        n = X.shape[0]
    d = model._estimator.inertia_
    aic = d + 2 * m * k
    delattr(model._estimator, 'labels_')
    return aic
def test_clusterer_enforcement(self):
    """
    Assert that only clustering estimators can be passed to cluster viz
    """
    nomodels = [
        SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
    ]

    for nomodel in nomodels:
        with self.assertRaises(YellowbrickTypeError):
            visualizer = ClusteringScoreVisualizer(nomodel())

    models = [
        KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
    ]

    for model in models:
        try:
            visualizer = ClusteringScoreVisualizer(model())
        except YellowbrickTypeError:
            self.fail("could not pass clustering estimator to visualizer")
def avg_within_ss(X, k):
    """
    Compute the average within-cluster sum of squares. The code here can
    be found "almost" anywhere online

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    avgwithinss: average within-cluster sum of squares
    """
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)
    centroids = model.cluster_centers_
    dist_c = cdist(X, centroids, 'euclidean')
    dist = np.min(dist_c, axis=1)
    avgwithinss = sum(dist**2) / X.shape[0]
    return avgwithinss
def fit(self, descs, MiniBatchKMeans=True, batch_size=10000, preprocess=False):
    """Training"""
    """
    if preprocess:
        self.stdSlr = StandardScaler().fit(descs)
        descs = self.stdSlr.transform(descs)
    else:
        self.stdSlr = None
    """
    if MiniBatchKMeans:
        self.centers = self.MiniBatchKMeans(descs, batch_size)
    else:
        self.centers = self.Kmeans(descs)

    if preprocess:
        self.stdSlr = StandardScaler().fit(descs)
        #descs = self.stdSlr.transform(descs)
    else:
        self.stdSlr = None

    return self.centers
def test_k_means_explicit_init_shape():
    # test for sensible errors when giving explicit init
    # with wrong number of features or clusters
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    for Class in [KMeans, MiniBatchKMeans]:
        # mismatch of number of features
        km = Class(n_init=1, init=X[:, :2], n_clusters=len(X))
        msg = "does not match the number of features of the data"
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:, :2],
                   n_clusters=len(X))
        assert_raises_regex(ValueError, msg, km.fit, X)
        # mismatch of number of clusters
        msg = "does not match the number of clusters"
        km = Class(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:2, :],
                   n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
def test_minibatch_sensible_reassign_fit():
    # check that identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments
    # than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg,
                        MiniBatchKMeans(init=test_init, random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
def test__ClusteringWithSupervision_clusters():
    """
    Check that we change the number of clusters properly.

    We have a weird interface here where we sort of overload
    `n_clusters` but try to hide it.
    """
    train, classes = make_X_y()
    model = ClusteringWithSupervision(cluster_instance=MiniBatchKMeans())
    assert model.n_clusters is None
    assert model.get_params()['n_clusters'] is None
    assert model.cluster_instance.n_clusters == 8
    assert model._cluster_instance is None
    model.fit(train, classes)
    assert model.n_clusters is None
    assert model.get_params()['n_clusters'] is None
    assert model.cluster_instance.n_clusters == 8
    assert model._cluster_instance.n_clusters == 4
def bow_codebook(data, K=64):
    km = MiniBatchKMeans(n_clusters=K, init='k-means++',
                         compute_labels=False, batch_size=1000,
                         max_iter=150, max_no_improvement=30,
                         verbose=False).fit(data)
    return km.cluster_centers_
def test_basic(self, single_chunk_blobs):
    X, y = single_chunk_blobs
    a = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
    b = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)
    a.fit(X)
    b.partial_fit(X)
    assert_estimator_equal(a, b, exclude=['random_state_'])
def mini_batch(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 2, projection='3d', title='mini-batch')
    mini_batch = cluster.MiniBatchKMeans(init='random', n_clusters=3)
    mini_batch.fit(X_iris)
    res = mini_batch.labels_
    for n, i in enumerate(X_iris):
        ax.scatter(*i[:3], c='bgrcmyk'[res[n] % 7], marker='o')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
def train(X, y, true_k=50, minibatch=False, showLable=True):
    # cluster the documents with k-means
    fout = open('pro1_cluster.txt', 'w+')
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300,
                    n_init=1, verbose=False)
    km.fit(X)
    print(y.dtype)
    if showLable:
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = y
        # print(y)
        result = list(km.predict(X))
        print('Cluster distribution:')
        print(dict([(i, result.count(i)) for i in result]))
        cluster_list = {}
        for i in range(true_k):
            cluster_list[i] = []
        for j in range(len(result)):
            # print(terms[j])
            # print(result[j])
            cluster_list[result[j]].append([terms[j], X[j]])
        for i in cluster_list.keys():
            cluster = cluster_list[i]
            if len(cluster) > 0:
                for bet in cluster:
                    vec = bet[1].tolist()
                    # fout.write(bet[0] + str(vec) + '\n')
                    # print(bet)
                    fout.write(bet[0] + '\n')
                fout.write('-------------------\n')
    fout.close()
    return -km.score(X)
def minibatchs_k_means_clustering(self, out_path, pd_data, number_of_clusters):
    headers, repos, features = self.__fetch_data(pd_data)
    mb_kmeans = MiniBatchKMeans(n_clusters=number_of_clusters)
    mb_kmeans.fit(features)

    clusters = []
    for i in range(0, number_of_clusters):  # for each of the k clusters
        repo_list = []
        for j in range(0, len(mb_kmeans.labels_)):  # a label for each repo
            if i == mb_kmeans.labels_[j]:  # if repo's label equals the cluster number
                repo_list.append(repos[j])  # add repo to cluster i's list
        clusters.append(repo_list)

    out_file_path = os.path.join(out_path,
                                 "mb_kmeans_noOfClusters" + str(number_of_clusters))
    self.__export_k_means_results(mb_kmeans, headers, clusters, out_file_path)  # avoid ".csv"
def __init__(self, n_codewords, normalization=3, inner_batch=128,
             dimension_ordering="tf"):
    self.n_codewords = n_codewords
    self.inner_batch = inner_batch
    self.normalization = normalization
    self._clusterer = cluster.MiniBatchKMeans(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False
    )

    super(self.__class__, self).__init__(dimension_ordering)
def __init__(self, n_codewords, neighbors=5, beta=1e-4,
             dimension_ordering="tf"):
    self.n_codewords = n_codewords
    self.neighbors = neighbors
    self.beta = beta
    self._clusterer = cluster.MiniBatchKMeans(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False
    )

    super(self.__class__, self).__init__(dimension_ordering)
def __init__(self, n_codewords, l1_norm=True, dimension_ordering="tf"):
    self.n_codewords = n_codewords
    self.l1_norm = l1_norm
    self._clusterer = cluster.MiniBatchKMeans(
        n_clusters=self.n_codewords,
        n_init=1,
        compute_labels=False
    )

    super(self.__class__, self).__init__(dimension_ordering)
def initial_centers(self, img_output):
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    print("#DVSQ train# initializing centers")
    all_output = img_output
    sub_dim = self.output_dim // self.subspace_num  # width of each subspace block
    for i in range(self.subspace_num):
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
            all_output[:, i * sub_dim: (i + 1) * sub_dim])
        C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num,
               i * sub_dim: (i + 1) * sub_dim] = kmeans.cluster_centers_
        print("step: ", i, " finish")
    return C_init
def KMEANS(data, k):
    if data.shape[0] < 20000:
        centroids, cluster_IDs, _ = k_means(data, k, init='k-means++',
                                            precompute_distances='auto',
                                            n_init=20, max_iter=200)
    else:
        mbkm = MiniBatchKMeans(k, 'k-means++', max_iter=100,
                               batch_size=data.shape[0] // k, n_init=20)
        mbkm.fit(data)
        centroids = mbkm.cluster_centers_
        cluster_IDs = mbkm.labels_
    return centroids, cluster_IDs
def make_example_y_data(X, y=None, sample_weight=None, **kwargs):
    fitted = MiniBatchKMeans(n_clusters=5).fit(X.flat.values)
    y = fitted.predict(X.flat.values)
    return (X, y, sample_weight)
def initial_centers(self, img_output, txt_output):
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    print("#cdq train# initializing centers")
    all_output = np.vstack([img_output, txt_output])
    sub_dim = self.output_dim // self.subspace_num  # width of each subspace block
    for i in range(self.subspace_num):
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
            all_output[:, i * sub_dim: (i + 1) * sub_dim])
        C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num,
               i * sub_dim: (i + 1) * sub_dim] = kmeans.cluster_centers_
        print("step: ", i, " finish")
    return C_init
def _kmeans_clustering(self, X, n_clusters, batch_size=128):
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                             n_init=10, max_no_improvement=10)
    kmeans.fit(X)
    return kmeans.cluster_centers_
def generate_codebook(image, detectAndCompute=SIFT_create().detectAndCompute):
    descriptors = detectAndCompute(image, window_size=None)
    kmeans = MiniBatchKMeans(n_clusters=2048, batch_size=128,
                             n_init=10, max_no_improvement=10)
    kmeans.fit(descriptors)
    codebook = kmeans.cluster_centers_[:]
    return codebook
def cluster(self):
    mbk = MiniBatchKMeans(n_clusters=self.K, batch_size=self.K*2,
                          verbose=self.verbose, compute_labels=False)
    if self.subsample is None:
        data = np.vstack([self.data[k] for k in self.data.keys()
                          if self.data[k] is not None])
        mbk.fit(data)
    else:
        # sample a subset of the files
        fnames = list(self.data.keys())
        subset = random.sample(fnames, int(self.subsample * len(fnames)))
        subdata = np.vstack([self.data[k] for k in subset
                             if self.data[k] is not None])
        mbk.fit(subdata)
    return mbk.cluster_centers_
def test_kmeans(self):
    n_clusters = 5
    X, y = make_blobs(n_samples=1000, centers=n_clusters, random_state=0)
    centers, labels = utils.kmeans(X, n_clusters)
    clf = MiniBatchKMeans(n_clusters=n_clusters)
    assert len(labels) == len(clf.fit_predict(X))
    assert len(centers) == len(clf.cluster_centers_)
def kmeans(X, n_clusters, **kwargs):
    """Classify vectors in X using K-Means algorithm with n_clusters.
    Arguments in kwargs are passed to scikit-learn MiniBatchKMeans.
    Returns a tuple of cluster centers and predicted labels."""
    clf = MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    labels = clf.fit_predict(X)
    centers = clf.cluster_centers_.astype(np.ubyte)
    return centers, labels
def kmeans_classifier(prediction, ticket_predict_weights, ticket_target_list,
                      tickets_to_weights_matrix):
    kmeans = MiniBatchKMeans(n_clusters=len(ticket_target_list),
                             init_size=len(tickets_to_weights_matrix) + 1)
    kmeans.fit(tickets_to_weights_matrix)
    predicted_class = kmeans.predict(ticket_predict_weights)[0]
    print("kmeans prediction: {}".format(ticket_target_list[predicted_class]))
    if prediction is not None:
        prediction.append([ticket_target_list[predicted_class]])
def test_integrated_mini_batch_kmeans_elbow(self):
    """
    Test no exceptions for mini-batch kmeans k-elbow visualizer

    See #182: cannot use occupancy dataset because of memory usage
    """
    # Generate a blobs data set
    X, y = make_blobs(
        n_samples=1000, n_features=12, centers=6, shuffle=True
    )

    try:
        visualizer = KElbowVisualizer(MiniBatchKMeans(), k=4)
        visualizer.fit(X)
        visualizer.poof()
    except Exception as e:
        self.fail("error during k-elbow: {}".format(e))
def test_integrated_mini_batch_kmeans_silhouette(self):
    """
    Test no exceptions for mini-batch kmeans silhouette visualizer

    See #182: cannot use occupancy dataset because of memory usage
    """
    # Generate a blobs data set
    X, y = make_blobs(
        n_samples=1000, n_features=12, centers=8, shuffle=True,
    )

    try:
        visualizer = SilhouetteVisualizer(MiniBatchKMeans())
        visualizer.fit(X)
        visualizer.poof()
    except Exception as e:
        self.fail("error during silhouette: {}".format(e))
def perc_var_explained(X, k):
    """
    Compute the percentage of variance explained, defined as the
    between-cluster sum of squares divided by the total sum of squares.

    WARNING: It will take a while. The code here can be found "almost"
    anywhere online.

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    pve: percentage of variance explained
    """
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)
    centroids = model.cluster_centers_
    dist_c = cdist(X, centroids, 'euclidean')
    dist = np.min(dist_c, axis=1)
    tot_withinss = sum(dist**2)
    totss = sum(pdist(X)**2) / X.shape[0]
    betweenss = totss - tot_withinss
    pve = (betweenss / totss * 100)
    return pve
def bic(X, k):
    """
    Compute the BIC score.

    Implementation from here:
    http://www.aladdin.cs.cmu.edu/papers/pdfs/y2000/xmeans.pdf
    with corrections from here:
    https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans

    Params:
    --------
    X: numpy array with observations and features to be clustered
    k: number of clusters

    Returns:
    --------
    BIC: bic score
    """
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0)
    model.fit(X)
    centers = model.cluster_centers_
    centers = np.expand_dims(centers, axis=1)
    labels = model.labels_
    N_C = np.bincount(labels)
    R, M = X.shape

    wcss = sum([sum(cdist(X[np.where(labels == c)], centers[c],
                          'euclidean')**2) for c in range(k)])
    var = (1.0 / (R - k) / M) * wcss
    const_term = 0.5 * k * np.log(R) * (M + 1)

    BIC = np.sum([(Rn * np.log(Rn))
                  - (Rn * np.log(R))
                  - (((Rn * M) / 2) * np.log(2 * np.pi * var))
                  - ((Rn - 1) * M / 2)
                  for Rn in N_C]) - const_term
    return BIC
def test_scenarios():
    """Test that everything works"""
    for s in scenarios():
        seeding = kmc2.kmc2(**s)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # disable sklearn warnings
            model = MiniBatchKMeans(s["k"], init=seeding).fit(s["X"])
            new_centers = model.cluster_centers_
def gen_cluster(keys=None, cluster_matrix=None):
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    # km = KMeans(n_jobs=-1, n_clusters=50)
    print("Clustering data...")
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
    res = pd.concat([keys, labels], axis=1, ignore_index=True)
    return res
def gen_cluster(keys=None, cluster_matrix=None):
    assert cluster_matrix is not None and keys is not None
    km = MiniBatchKMeans(n_clusters=50, batch_size=1000)
    labels = pd.DataFrame(km.fit_predict(cluster_matrix.values))
    res = pd.concat([keys, labels], axis=1, ignore_index=True)
    return res
def fit(self, descs, preprocess=True):
    if preprocess:
        self.stdSlr = StandardScaler()
        self.stdSlr.fit(descs)
        tmp = self.stdSlr.transform(descs)
    else:
        tmp = descs
        self.stdSlr = None
    kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.num_clusters,
                             batch_size=10000)
    kmeans.fit(tmp)
    self.centers = kmeans.cluster_centers_
    self.clusters = kmeans.labels_
    return self.centers
def MiniBatchKMeans(self, X, batch=10000):
    print("in fit method", X.shape, self.k)
    kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.k,
                             batch_size=batch)
    kmeans.fit(X)
    centers = kmeans.cluster_centers_
    clusters = kmeans.labels_
    print("shape of centers is ", centers.shape)
    return centers
def initial_centers(self, img_output):
    C_init = np.zeros([self.subspace_num * self.subcenter_num, self.output_dim])
    print("#ZDQ train# initializing centers")
    all_output = img_output
    sub_dim = self.output_dim // self.subspace_num  # width of each subspace block
    for i in range(self.subspace_num):
        kmeans = MiniBatchKMeans(n_clusters=self.subcenter_num).fit(
            all_output[:, i * sub_dim: (i + 1) * sub_dim])
        C_init[i * self.subcenter_num: (i + 1) * self.subcenter_num,
               i * sub_dim: (i + 1) * sub_dim] = kmeans.cluster_centers_
        print("step: ", i, " finish")
    return C_init
def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)
def test_mb_kmeans_verbose():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42, verbose=1)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        mb_k_means.fit(X)
    finally:
        sys.stdout = old_stdout
def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_minibatch_k_means_random_init_sparse_csr():
    # increase n_init to make random init stable enough
    mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters,
                                 random_state=42, n_init=10).fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_perfect_init_dense_array():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_with_many_reassignments():
    # Test for the case that the number of clusters to reassign is bigger
    # than the batch_size
    n_samples = 550
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(n_samples, 10))

    # Check that the fit works if n_clusters is bigger than the batch_size.
    # Run the test with 550 clusters and 550 samples, because it turned out
    # that these values ensure that the number of clusters to reassign
    # is always bigger than the batch_size
    n_clusters = 550
    MiniBatchKMeans(n_clusters=n_clusters,
                    batch_size=100,
                    init_size=n_samples,
                    random_state=42).fit(X)
def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 batch_size=10, random_state=42,
                                 n_init=1).fit(X)
    assert_equal(mb_k_means.init_size_, 3 * mb_k_means.batch_size)
    _check_fitted_model(mb_k_means)