The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.AgglomerativeClustering().
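Before the project examples, here is a minimal self-contained sketch of the basic fit/predict workflow they all share (the synthetic data and parameter values are chosen only for illustration):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs

# three well-separated Gaussian blobs as toy input
X, _ = make_blobs(n_samples=150, centers=3, random_state=0)
model = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = model.fit_predict(X)   # one cluster label per sample
print(np.bincount(labels))      # sizes of the three clusters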
def perform_clustering(X, connectivity, title, num_clusters=3, linkage='ward'):
    plt.figure()
    model = AgglomerativeClustering(linkage=linkage,
                                    connectivity=connectivity,
                                    n_clusters=num_clusters)
    model.fit(X)

    # extract labels
    labels = model.labels_

    # specify marker shapes for different clusters
    markers = '.vx'

    for i, marker in zip(range(num_clusters), markers):
        # plot the points belonging to the current cluster
        plt.scatter(X[labels == i, 0], X[labels == i, 1],
                    s=50, marker=marker, color='k', facecolors='none')

    plt.title(title)
def agglomerative_clustering(self, out_path, pd_data, number_of_clusters):
    headers, repos, features = self.__fetch_data(pd_data)
    agglomerative_clustering = AgglomerativeClustering(n_clusters=number_of_clusters,
                                                       linkage="complete")
    agglomerative_clustering.fit(features)

    # form clusters
    clusters = []
    for i in range(0, number_of_clusters):  # k-th cluster
        repo_list = []
        for j in range(0, len(agglomerative_clustering.labels_)):  # a label for each repo
            if i == agglomerative_clustering.labels_[j]:  # if the repo's label equals the cluster number
                repo_list.append(repos[j])  # add repo to cluster i's list
        clusters.append(repo_list)

    out_file_path = os.path.join(out_path,
                                 "agglomerative_noOfClusters" + str(number_of_clusters))
    self.__export_agglomerative_results(agglomerative_clustering, clusters, out_file_path)
def __init__(self, edges, branching_factor=50, threshold=0.1):
    # Make features list.
    features = []
    for i in range(len(edges)):
        edge = edges[i]
        features.append([edge['perimeter'],
                         edge['area'],
                         edge['shape_factor'],
                         edge['radius_deviation']])
    features = np.array(features)

    # Normalize features: center on the median, scale by the standard
    # deviation. (The original divided by the median while leaving std
    # unused, which looks like a bug.)
    normed_features = features.copy().astype(float)
    for i in range(features.shape[1]):
        med = np.median(features[:, i])
        std = np.std(features[:, i])
        normed_features[:, i] -= med
        normed_features[:, i] /= std

    self.features = features
    self.normed_features = normed_features
    self.branching_factor = branching_factor
    self.threshold = threshold

    #self.run(Birch, branching_factor=50, threshold=0.1, n_clusters=2)
    self.run(KMeans, n_clusters=2)
    #self.run(AgglomerativeClustering, n_clusters=2)
def cluster_ward(self, image_cols):
    # Connectivity
    # TODO optional connectivity
    connectivity = grid_to_graph(*self.image.shape[:2])
    ward = AgglomerativeClustering(
        n_clusters=self.params.num_clusters,
        linkage='ward',
        connectivity=connectivity
    )
    ward.fit(image_cols)

    self.number_of_clusters = len(np.unique(ward.labels_))
    print('number of clusters', self.number_of_clusters)

    centers = np.zeros((self.number_of_clusters, 3))
    for i in range(0, self.number_of_clusters):
        cluster_points = image_cols[ward.labels_ == i]
        cluster_mean = np.mean(cluster_points, axis=0)
        centers[i, :] = cluster_mean

    return centers, ward.labels_
def agglomerative_clustering(X, k=10):
    """Run an agglomerative clustering on X.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        k: the number of clusters we want (default: 10).

    Returns:
        An AgglomerativeClustering model trained on X.
    """
    model = AgglomerativeClustering(n_clusters=k)
    model.fit(X)
    # Note: all the other functions are the same except we use
    # 'AgglomerativeClustering' instead of 'KMeans'.
    return model


# Ex4.1
def cluster_agglomerative(X_train, model_args=None, gridsearch=True,
                          connectivity_graph=True, connectivity_graph_neighbors=10):
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    print('AgglomerativeClustering')
    if model_args is None:
        # guard against mutating/indexing None below
        model_args = {}

    if connectivity_graph:
        print('Creating k-neighbors graph for connectivity restraint')
        connectivity = kneighbors_graph(X_train, n_neighbors=connectivity_graph_neighbors)
        model_args['connectivity'] = connectivity

    if gridsearch is True:
        ## TODO:
        # add hyperparameter searching. No scoring method is available for
        # this model, so we can't easily use gridsearching.
        raise NotImplementedError('No hyperparameter optimization available yet '
                                  'for this model. Set gridsearch to False')
        # prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for AgglomerativeClustering')
        param_grid = None

    return ModelWrapper(AgglomerativeClustering, X=X_train, model_args=model_args,
                        param_grid=param_grid, unsupervised=True)
def test_AgglomerativeClustering_nclusters(*data):
    '''
    test the performance with different n_clusters

    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    nums = range(1, 50)
    ARIs = []
    for num in nums:
        clst = cluster.AgglomerativeClustering(n_clusters=num)
        predicted_labels = clst.fit_predict(X)
        ARIs.append(adjusted_rand_score(labels_true, predicted_labels))

    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(nums, ARIs, marker="+")
    ax.set_xlabel("n_clusters")
    ax.set_ylabel("ARI")
    fig.suptitle("AgglomerativeClustering")
    plt.show()
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
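The test above exercises precomputed distances through the private linkage_tree helper; the same idea is available through the public estimator. A minimal sketch, assuming a scikit-learn version where the parameter is still named affinity (newer releases rename it to metric), and remembering that 'ward' only accepts raw euclidean input:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances

rng = np.random.RandomState(42)
X = rng.normal(size=(5, 5))
dis = cosine_distances(X)              # square (n_samples, n_samples) matrix
# 'ward' does not support precomputed distances, so use 'complete' linkage
model = AgglomerativeClustering(n_clusters=2, affinity='precomputed',
                                linkage='complete')
labels = model.fit(dis).labels_        # fit on the distance matrix, not on X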
def agglomerativeClustering(sourceFiles, fileExtension):
    """ Performs agglomerative hierarchical clustering using files with
    <fileExtension> in the <sourceFiles> directory and returns an accuracy
    measure """
    try:
        accuracy = 0
        # Step 1 - Check the required algorithm to specify the data type to load
        dataFiles = glob.glob("%s/*.%s" % (arguments.sourcedir, arguments.datatype))  # Get the paths of files to load
        dataSamples, dataLabels, loadedClusters = [], [], []
        for dataPoint in dataFiles:
            dataSamples.append([float(x) for x in open(dataPoint).read()[1:-1].split(",")])
            # Also load its cluster
            clusterName, paramNames = loadLabelFromFile(dataPoint.replace(".%s" % arguments.datatype, ".metadata"))
            if clusterName not in loadedClusters:
                loadedClusters.append(clusterName)
            dataLabels.append(loadedClusters.index(clusterName))
        prettyPrint("Successfully retrieved %s instances for clustering" % len(dataSamples))
        # Step 2 - Perform clustering
        clusterer = AgglomerativeClustering(n_clusters=len(loadedClusters))
        predicted = clusterer.fit_predict(numpy.array(dataSamples), dataLabels)
        accuracy = round(metrics.accuracy_score(dataLabels, predicted), 2)
    except Exception as e:
        prettyPrint("Error encountered: %s" % e, "error")

    return accuracy
def agglom(reduced_data, n_clusters):
    # ---- Do agglomerative clustering and return relevant performance data
    clustering = cluster.AgglomerativeClustering(n_clusters=n_clusters)
    clustering = clustering.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, clustering.labels_,
                                         metric='euclidean')
    return {
        "labels": clustering.labels_,
        "silhouette_score": sil_score
    }
def _cluster_documents(self):
    method = self.params['cluster_method']
    n_clusters = int(self.params['cluster_n_clusters'])

    n_samples = len(self.document_vectors)
    if n_clusters > n_samples:
        n_clusters = n_samples

    if method == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters, init='k-means++',
                           max_iter=100, n_init=1)
    else:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='complete',
                                            affinity='cosine')

    clustering = clusterer.fit(self.document_vectors)
    cluster_labels = clustering.labels_
    # Only KMeans exposes cluster_centers_; AgglomerativeClustering does not,
    # so fall back to None instead of raising a KeyError.
    cluster_centers = getattr(clustering, 'cluster_centers_', None)

    clusters = {}
    for document_id, cluster_label in enumerate(cluster_labels):
        if cluster_label not in clusters:
            clusters[cluster_label] = []
        clusters[cluster_label].append(document_id)

    return clusters, cluster_centers
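Since AgglomerativeClustering exposes labels_ but no cluster_centers_, callers that need centroids can derive them from the labels. A small hypothetical helper (label_means is our name, not part of the project above):

import numpy as np

def label_means(vectors, labels):
    # Per-cluster mean vectors for estimators (such as
    # AgglomerativeClustering) that have labels_ but no cluster_centers_.
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)
    return np.array([vectors[labels == k].mean(axis=0)
                     for k in np.unique(labels)])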
def test_compute_centers(self, data_labels):
    data, _ = data_labels
    ac = cluster.AgglomerativeClustering()
    fit = ac.fit(data)
    result = compute_centers(fit, data)
    assert result.shape == (data.shape[1], len(set(fit.labels_)))
def _get_htree(self, X=None, metric='cosine'):
    km = self.km
    method_name = type(km).__name__

    if method_name == 'AgglomerativeClustering':
        htree = {'n_leaves': km.n_leaves_,
                 'n_components': km.n_components_,
                 'children': km.children_.tolist()}
    elif method_name in ['Birch', '_BirchDummy'] \
            and self._pars['n_clusters'] is None:
        hmod = _BirchHierarchy(km, metric=metric)
        hmod.fit(X)
        htree = hmod.htree
    else:
        htree = {}
    return htree
def ward_hc(self, n_clusters, n_neighbors=10):
    """ Perform Ward hierarchical clustering

    Parameters
    ----------
    n_clusters : int
        number of clusters
    n_neighbors : int
        number of nearest neighbors used for computing the
        connectivity matrix
    """
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph

    pars = {'n_neighbors': n_neighbors, 'is_hierarchical': True,
            "metric": self.metric}

    if 'lsi' not in self.pipeline:
        raise ValueError("you must use lsi with ward clustering "
                         "for scaling reasons.")

    # This is really not efficient, as it is done a second time
    # in _cluster_func
    X = self.pipeline.data
    connectivity = kneighbors_graph(X, n_neighbors=n_neighbors,
                                    include_self=False)

    km = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                 connectivity=connectivity)

    return self._cluster_func(n_clusters, km, pars)
def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
    # collect embeddings for mfi:
    X = np.asarray([self.w2v_model[w] for w in self.mfi
                    if w in self.w2v_model], dtype='float32')
    # dimension reduction:
    tsne = TSNE(n_components=2)
    coor = tsne.fit_transform(X)  # unsparsify

    plt.clf()
    sns.set_style('dark')
    plt.rcParams['axes.linewidth'] = 0.4  # modern seaborn no longer aliases plt as sns.plt
    fig, ax1 = plt.subplots()

    labels = self.mfi
    # first plot slices:
    x1, x2 = coor[:, 0], coor[:, 1]
    ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
    # clustering on top (add some colouring):
    clustering = AgglomerativeClustering(linkage='ward',
                                         affinity='euclidean',
                                         n_clusters=nb_clusters)
    clustering.fit(coor)
    # add names:
    for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
        ax1.text(x, y, name, ha='center', va='center',
                 color=plt.cm.Spectral(cluster_label / 10.),
                 fontdict={'family': 'Arial', 'size': 8})
    # control aesthetics:
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    ax1.set_xticklabels([])
    ax1.set_xticks([])
    ax1.set_yticklabels([])
    ax1.set_yticks([])
    plt.savefig(outputfile, bbox_inches=0)
def clustering(docs, n_clusters):
    # cluster the documents into n_clusters groups
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(docs)  # k-means clustering
    labels = kmeans_model.labels_
    # hmodel = AgglomerativeClustering(n_clusters=n_clusters).fit(docs)  # hierarchical clustering
    # labels = hmodel.labels_
    score = metrics.silhouette_score(np.array(docs), labels, metric='euclidean')  # euclidean distance
    return labels, score
def make_ward_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'WARD/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count,
                                   linkage='ward')
    predict_result = ward.fit_predict(X)
    self.signals.PrintInfo.emit('\nClusters by document:\n')

    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def ex4_agglomerative_clustering(X, y):
    """
    This does the same thing as ex2_kmeans but with an agglomerative
    clustering and K=2.
    """
    # AgglomerativeClustering needs a non-sparse matrix
    X = X.toarray()

    k = 2
    model = AgglomerativeClustering(k).fit(X, y)
    print("Silhouette score: %f" % metrics.silhouette_score(X, model.labels_))


# Ex 5
def Learning(X):
    from sklearn.cluster import AgglomerativeClustering
    learner = AgglomerativeClustering(n_clusters=3)
    y = learner.fit_predict(X)
    yield 'Agglomerative clusters(n=3)', y

#=================================================
def test_AgglomerativeClustering(*data):
    '''
    test AGG method

    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    clst = cluster.AgglomerativeClustering()
    predicted_labels = clst.fit_predict(X)
    print("ARI:{0}".format(adjusted_rand_score(labels_true, predicted_labels)))
def test_AgglomerativeClustering_linkage(*data):
    '''
    test the performance with different linkages

    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    nums = range(1, 50)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    linkages = ['ward', 'complete', 'average']
    markers = "+o*"
    for i, linkage in enumerate(linkages):
        ARIs = []
        for num in nums:
            clst = cluster.AgglomerativeClustering(n_clusters=num, linkage=linkage)
            predicted_labels = clst.fit_predict(X)
            ARIs.append(adjusted_rand_score(labels_true, predicted_labels))
        ax.plot(nums, ARIs, marker=markers[i], label="linkage:{0}".format(linkage))

    ax.set_xlabel("n_clusters")
    ax.set_ylabel("ARI")
    ax.legend(loc="best")
    fig.suptitle("AgglomerativeClustering")
    plt.show()
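The comparison above scores flat cuts of each linkage with ARI; the merge hierarchy itself can also be inspected directly. A minimal sketch using scipy.cluster.hierarchy (scipy is not used in the examples above; random data stands in for real input):

import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
X = rng.rand(30, 4)
Z = linkage(X, method='ward')  # same merge criterion as linkage='ward' above
dendrogram(Z)                  # draw the full merge tree
plt.xlabel("sample index")
plt.ylabel("merge distance")
plt.show()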
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly during
    # merging.
    X = np.array([(.014, .120), (.014, .099), (.014, .097),
                  (.017, .153), (.017, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .152), (.018, .149), (.018, .144)])
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage='ward')
    # If changes are not propagated correctly, fit crashes with an IndexError
    ward.fit(X)
def test_connectivity_fixing_non_lil():
    # Check non-regression of a bug if a non item assignable connectivity is
    # provided with more than one component.
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = AgglomerativeClustering(connectivity=c, linkage='ward')
    assert_warns(UserWarning, w.fit, x)
def test_connectivity_callable():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(
        connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False))
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_ignores_diagonal():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_agg_n_clusters():
    # Test that an error is raised when n_clusters <= 0
    rng = np.random.RandomState(0)
    X = rng.rand(20, 10)
    for n_clus in [-1, 0]:
        agc = AgglomerativeClustering(n_clusters=n_clus)
        msg = ("n_clusters should be an integer greater than 0."
               " %s was provided." % str(agc.n_clusters))
        assert_raise_message(ValueError, msg, agc.fit, X)
def makeWard(X, k=2):
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(n_clusters=k, linkage='ward',
                                           connectivity=connectivity)
def makeAvgLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="average",
                                           affinity="cityblock",
                                           n_clusters=k,
                                           connectivity=connectivity)
def makeMaxLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="complete",
                                           affinity="cityblock",
                                           n_clusters=k,
                                           connectivity=connectivity)
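The three factories above return unfitted estimators. A hedged usage sketch (the two-moons data and k=2 are our choices for illustration, and it assumes the factories are in scope in the same module):

import numpy as np
from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
for factory in (makeWard, makeAvgLinkage, makeMaxLinkage):
    model = factory(X, k=2)        # build an unfitted estimator
    labels = model.fit_predict(X)  # then cluster the same data
    print(factory.__name__, np.bincount(labels))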
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index)
                    for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k,
                                                                linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                # AgglomerativeClustering has no cluster_centers_, so compute them
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index)
                    for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps)) + 1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('only the db and gap statistics are supported')
def plot_cluster(reduced_data, cluster_type, k_clusters, plot_title):
    if cluster_type.lower() == "kmeans":
        clus = KMeans(init='k-means++', n_clusters=k_clusters, n_init=10)
    elif cluster_type.lower() == "agglom":
        clus = AgglomerativeClustering(n_clusters=k_clusters)
    clus.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    plt.figure(1, figsize=(15, 10))
    plt.clf()

    if cluster_type.lower() == "kmeans":
        # Obtain labels for each point in the mesh using the trained model.
        # Only KMeans has a predict() method, so the colored decision
        # boundary is drawn for k-means only.
        Z = clus.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.imshow(Z, interpolation='nearest',
                   extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                   cmap=plt.cm.Paired,
                   aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=10)

    if cluster_type.lower() == "kmeans":
        # Plot the centroids as a white X
        centroids = clus.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1],
                    marker='x', s=169, linewidths=3,
                    color='w', zorder=10)

    plt.title(plot_title)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()