我们从Python开源项目中,提取了以下29个代码示例,用于说明如何使用sklearn.cluster.SpectralClustering()。
def compare_clusters(X, Y, method='spectral', s=10000):
    """Score how similarly the columns of ``X`` and ``Y`` cluster.

    Each input has its columns scaled to unit L2 norm and is transposed so
    that every row is one item.  A shared random subset of at most ``s`` items
    is kept, a Gaussian kernel of the cosine distance is built for each input,
    both affinity matrices are clustered with the chosen method, and the two
    labelings are compared.

    Parameters
    ----------
    X, Y : 2-D arrays
        One item per column.  Both must have the same number of columns
        because the same boolean sample mask is applied to each.
    method : {'spectral', 'ap'}
        ``'spectral'`` uses SpectralClustering, ``'ap'`` AffinityPropagation,
        both on the precomputed affinities.
    s : int
        Maximum number of items to sample.

    Returns
    -------
    The adjusted mutual information score between the two labelings.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('spectral', 'ap'):
        # Fail early with a clear message instead of hitting an unbound
        # local variable after all the expensive work has been done.
        raise ValueError(
            "method must be 'spectral' or 'ap', got %r" % (method,))

    A = (X / np.linalg.norm(X, axis=0)).T
    A[np.isnan(A)] = 0  # zero-norm columns divide to NaN
    B = (Y / np.linalg.norm(Y, axis=0)).T
    B[np.isnan(B)] = 0

    # Boolean mask with min(s, n_items) True entries at random positions.
    # The builtin ``bool`` replaces the ``np.bool`` alias removed from NumPy.
    random_samples = np.zeros(A.shape[0], dtype=bool)
    random_samples[:min(s, A.shape[0])] = True
    np.random.shuffle(random_samples)
    A = A[random_samples]
    B = B[random_samples]

    # Rows are unit vectors, so 1 - A.A^T is the pairwise cosine distance.
    dA = 1 - A.dot(A.T)
    dA = np.exp(-dA**2 / 2.)
    dB = 1 - B.dot(B.T)
    dB = np.exp(-dB**2 / 2.)
    del A, B

    if method == 'spectral':
        # Floor division keeps the cluster count an int on Python 3 as well.
        n = max(5, min(30, X.shape[1] // 50))
        lA = SpectralClustering(n_clusters=n, affinity='precomputed').fit_predict(dA)
        lB = SpectralClustering(n_clusters=n, affinity='precomputed').fit_predict(dB)
    else:  # method == 'ap'
        lA = AffinityPropagation(affinity='precomputed').fit_predict(dA)
        lB = AffinityPropagation(affinity='precomputed').fit_predict(dB)
    return adjusted_mutual_info_score(lA, lB)
def spectral(data):
    """Cluster ``data`` with spectral clustering, print a summary and plot it.

    The estimator uses an RBF affinity, the ARPACK eigen solver and the
    'discretize' label assignment; the number of clusters is left at the
    estimator's default.  The function prints the cluster sizes and the
    silhouette score, then projects the data to two dimensions with
    ``reduce_with_pca`` and hands it to ``plot_2d_data`` together with the
    labels.  Nothing is returned.
    """
    model = SpectralClustering(
        eigen_solver='arpack',
        affinity='rbf',
        assign_labels='discretize'
    ).fit(data)
    labels = model.labels_

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Spectral')
    print(collections.Counter(labels))
    print(metrics.silhouette_score(data, labels))

    reduced_data = reduce_with_pca(data, 2)
    plot_2d_data(reduced_data, labels)
def post_proC(C, K, d, alpha):
    """Post-process a coefficient matrix and cluster it spectrally.

    Parameters
    ----------
    C : coefficient matrix (square).
    K : number of clusters.
    d : dimension of each subspace.
    alpha : exponent applied element-wise to the non-negative affinity.

    Returns
    -------
    grp : cluster labels shifted to start at 1.
    L : the symmetric affinity matrix, scaled by its maximum, that was clustered.
    """
    C = 0.5 * (C + C.T)
    # svds needs a rank strictly below the matrix size, so clamp it.
    r = min(d * K + 1, C.shape[0] - 1)
    U, S, _ = svds(C, r, v0=np.ones(C.shape[0]))
    # Reverse the order in which svds returned the singular triplets.
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)  # zero out negative similarities
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed',
                                          assign_labels='discretize')
    # fit_predict already fits the model; a separate fit() beforehand would
    # only repeat the eigen-decomposition.
    grp = spectral.fit_predict(L) + 1
    return grp, L
def post_proC(C, K, d, alpha):
    """Post-process a coefficient matrix and cluster it spectrally.

    Parameters
    ----------
    C : coefficient matrix (square).
    K : number of clusters.
    d : dimension of each subspace.
    alpha : exponent applied element-wise to the non-negative affinity.

    Returns
    -------
    grp : cluster labels shifted to start at 1.
    L : the symmetric affinity matrix, scaled by its maximum, that was clustered.
    """
    C = 0.5 * (C + C.T)
    # Rank of the truncated SVD, capped just below the matrix size.
    r = min(d * K + 1, C.shape[0] - 1)
    U, S, _ = svds(C, r, v0=np.ones(C.shape[0]))
    # Reverse the order in which svds returned the singular triplets.
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)  # zero out negative similarities
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed',
                                          assign_labels='discretize')
    # fit_predict already fits the model; a separate fit() beforehand would
    # only repeat the eigen-decomposition.
    grp = spectral.fit_predict(L) + 1
    return grp, L
def post_proC(C, K, d, alpha):
    """Post-process a coefficient matrix and cluster it spectrally.

    Parameters
    ----------
    C : coefficient matrix (square).
    K : number of clusters.
    d : dimension of each subspace.
    alpha : exponent applied element-wise to the non-negative affinity.

    Returns
    -------
    grp : cluster labels shifted to start at 1.
    L : the symmetric affinity matrix, scaled by its maximum, that was clustered.
    """
    n = C.shape[0]
    C = 0.5 * (C + C.T)
    # Replace the diagonal with ones: for sparse C, this step will make the
    # algorithm more numerically stable.
    C = C - np.diag(np.diag(C)) + np.eye(n, n)
    # svds needs a rank strictly below the matrix size, so clamp it.
    r = min(d * K + 1, n - 1)
    U, S, _ = svds(C, r, v0=np.ones(n))
    # Reverse the order in which svds returned the singular triplets.
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)  # zero out negative similarities
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed',
                                          assign_labels='discretize')
    # fit_predict already fits the model; a separate fit() beforehand would
    # only repeat the eigen-decomposition.
    grp = spectral.fit_predict(L) + 1
    return grp, L
def compute_meta_centroid_set(self, C):
    """Merge intermediate centroids into unit-norm meta-centroids.

    The rows of ``C`` are clustered with spectral clustering on their cosine
    affinity, using ``int(self.subcluster_pcut * self.subcluster_kn)``
    clusters.  Each meta-centroid is the mean of the rows sharing a label,
    scaled to unit L2 norm.

    Returns an array with one meta-centroid per row.
    """
    print("Intermediate clusters", C.shape)

    # By eye, it looks like the top 60%-80% of the
    # remaining clusters are stable...
    nc = int(self.subcluster_pcut * self.subcluster_kn)
    clf = SpectralClustering(n_clusters=nc, affinity="precomputed")

    S = cosine_affinity(C)
    labels = clf.fit_predict(S)

    meta_clusters = []
    for i in range(labels.max() + 1):
        mu = C[labels == i].mean(axis=0)
        mu /= np.linalg.norm(mu)
        meta_clusters.append(mu)

    return np.array(meta_clusters)
def cluster(aff_matrix, records, n_clusters, medoid_indexes):
    """Spectrally cluster an affinity matrix and pick one medoid per cluster.

    ``medoid_indexes`` is first restricted to ``records`` via ``.loc``.  For
    every cluster label the member whose column of within-cluster affinities
    has the largest product is taken as the medoid.

    Returns
    -------
    t_records : list with the ``medoid_indexes`` index label of each medoid.
    indexes : list with the first value of each medoid's row.
    """
    model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = model.fit_predict(aff_matrix)
    medoid_indexes = medoid_indexes.loc[records]

    t_records, indexes = [], []
    for label in range(n_clusters):
        members = np.nonzero(labels == label)[0]
        # Affinities restricted to the members of this cluster.
        member_rows = aff_matrix[members, :]
        within = member_rows[:, members]
        best = np.argmax(np.prod(within, axis=0))
        position = members[best]
        t_records.append(medoid_indexes.index[position])
        indexes.append(medoid_indexes.iloc[position].values[0])
    return t_records, indexes
def _clusterAffinity(aff, k, imdb, cls_idx):
    """
    Cluster error correlation matrix using spectral clustering into k cluster,
    show the class labels in each cluster.

    ``aff`` is used as a precomputed affinity matrix and ``imdb.classes`` is
    indexed with the positions that fall into each cluster.  ``cls_idx`` is
    accepted but not used.  Returns the array of cluster labels.
    """
    # clustering model
    spectral = SpectralClustering(n_clusters=k,
                                  eigen_solver='arpack',
                                  affinity="precomputed")
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Performing clustering...')
    labels = spectral.fit_predict(aff)

    # print out all labels
    for i in range(k):
        find_idx = np.where(labels == i)[0]
        print('The list of classes in cluster {}'.format(i))
        print([imdb.classes[idx] for idx in find_idx])
        print('--------------------------------------------')

    return labels


if __name__ == '__main__':
    # TODO: debug code if necessary
    pass
def cluster_spectral(X_train, model_args=None, gridsearch=True):
    """Wrap SpectralClustering in a ModelWrapper for unsupervised use.

    Parameters
    ----------
    X_train : training data handed to the wrapper as ``X``.
    model_args : dict of estimator arguments; must contain ``'n_clusters'``.
    gridsearch : must be set to False; hyperparameter search is not
        implemented for this model.

    Raises
    ------
    NotImplementedError
        If ``gridsearch`` is True.
    KeyError
        If ``model_args`` is missing or does not define ``'n_clusters'``.
    """
    print('SpectralClustering')

    if gridsearch is True:
        # TODO: add hyperparameter searching. No scoring method available for
        # this model, so we can't easily use gridsearching.
        raise NotImplementedError('No hyperparameter optimization available yet for this model. Set gridsearch to False')

    # A missing dict is treated like a dict without the key, so callers get
    # the intended KeyError instead of a TypeError from ``in None``.
    if model_args is None or 'n_clusters' not in model_args:
        raise KeyError('Need to define n_clusters for SpectralClustering')

    # Imported only once the arguments are known to be usable.
    from sklearn.cluster import SpectralClustering

    return ModelWrapper(SpectralClustering,
                        X=X_train,
                        model_args=model_args,
                        param_grid=None,
                        unsupervised=True)
def clustering(points, k=2, name='kmeans'):
    '''
    Cluster ``points`` and report the silhouette score of the result.

    points: N_samples * N_features
    k: number of clusters
    name: 'kmeans' (KMeans with n_init=100) or 'spec' (SpectralClustering
          with a cosine affinity)

    Returns a ``(labels, silhouette)`` tuple.  The silhouette is reported as
    0 when every point received the same label, because the score is not
    defined for a single cluster.  Any other ``name`` returns None.
    '''
    if name == 'kmeans':
        labels = KMeans(n_clusters=k, n_init=100).fit(points).labels_
    elif name == 'spec':
        labels = SpectralClustering(n_clusters=k, affinity='cosine').fit(points).labels_
    else:
        return None

    # silhouette_score needs at least two distinct labels; guard both
    # algorithms the same way.
    if len(np.unique(labels)) > 1:
        si = silhouette_score(points, labels)
    else:
        si = 0
    print("Silhouette:" + str(si))
    return labels, si
def find_spectral_alphas(self, n_alphas, max_log_alpha, n_alphas_to_return):
    """Pick representative alpha values by clustering the affinity matrix.

    ``self.create_affinity_matrix`` is called first and ``self.affinity_matrix``
    is then clustered into ``n_alphas_to_return`` groups.  The candidate
    alphas are 0 followed by ``n_alphas`` log-spaced values from 10**-1 to
    10**max_log_alpha.  For every cluster that does not contain index 0, the
    member with the largest summed within-cluster affinity is its exemplar.

    Returns
    -------
    tuple of (sorted exemplar alphas, all candidate alphas,
    first row of the affinity matrix, cluster labels).
    """
    self.create_affinity_matrix(max_log_alpha, n_alphas)
    affinity = self.affinity_matrix

    model = cluster.SpectralClustering(n_clusters=n_alphas_to_return,
                                       affinity='precomputed')
    log_grid = np.logspace(-1, max_log_alpha, n_alphas)
    alphas = np.concatenate(([0], log_grid))
    model.fit(affinity)
    labels = model.labels_

    best_alphas = []
    for label in range(n_alphas_to_return):
        members = np.where(labels == label)[0]
        if 0 in members:
            # because we don't want to include the cluster that includes alpha=0
            continue
        within = affinity[members][:, members]
        totals = np.sum(within, axis=0)
        exemplar = members[np.argmax(totals)]
        best_alphas.append(alphas[exemplar])

    return np.sort(best_alphas), alphas, affinity[0, :], labels
def spectral(k, D, rs):
    """Return spectral-clustering labels for a precomputed matrix ``D``.

    From clustering_on_transcript_compatibility_counts, see github for MIT
    license.

    ``k`` is the number of clusters and ``rs`` the random state.  When the
    diagonal entry ``D[1, 1]`` is below 1, ``D`` is taken to be a distance
    matrix and ``1 - D`` is clustered instead.
    """
    # Convert distance to similarity matrix when needed
    affinity = 1 - D if D[1, 1] < 1 else D

    model = cluster.SpectralClustering(n_clusters=k,
                                       affinity='precomputed',
                                       random_state=rs)
    model.fit(affinity)
    return model.labels_


# gets max weight matching of a bipartite graph with row_label x column_label
# (weights are given by weight_matrix)
def spectral_clustering(S, X, config):
    '''
    Runs spectral clustering on X with a cosine affinity and returns the
    labels associated with the clustering.

    S is accepted but not used: the affinity is computed from X by the
    estimator.  The number of clusters is int(config["n_clusters"]).
    '''
    from sklearn.cluster import SpectralClustering

    n_clusters = int(config["n_clusters"])
    model = SpectralClustering(n_clusters=n_clusters, affinity='cosine')
    labels = model.fit_predict(X)
    return labels
def compute_centroid_set(self):
    """Cluster repeated subsets of ``self.docv`` and stack their centroids.

    For every subset yielded by ``subset_iterator`` the all-zero rows are
    dropped, the cosine affinity is clustered into ``self.subcluster_kn``
    groups, and the unit-norm mean of each group is recorded.

    Returns the centroids of all subsets stacked vertically.
    """
    batches = subset_iterator(
        X=self.docv,
        m=self.subcluster_m,
        repeats=self.subcluster_repeats,
    )
    kn = self.subcluster_kn
    clf = SpectralClustering(n_clusters=kn, affinity="precomputed")

    collected = []
    for X in batches:
        # Remove any rows that have zero vectors
        zero_rows = ((X ** 2).sum(axis=1) == 0)
        X = X[~zero_rows]

        # "Force" symmetry due to rounding errors
        affinity = cosine_affinity(X)
        affinity = np.maximum(affinity, affinity.transpose())

        labels = clf.fit_predict(affinity)

        # Compute the centroids
        _, dim = X.shape
        centroids = np.zeros((kn, dim))
        for label in range(kn):
            mean_vec = X[labels == label].mean(axis=0)
            mean_vec /= np.linalg.norm(mean_vec)
            centroids[label] = mean_vec

        collected.append(centroids)

    return np.vstack(collected)
def spectral_clustering_clusters(similarity_matrix):
    """Fit a 10-cluster SpectralClustering on a precomputed similarity matrix.

    Returns whatever ``fit`` returns for the estimator.
    """
    model = SpectralClustering(n_clusters=10, affinity='precomputed')
    return model.fit(similarity_matrix)
def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
             n_init=10, gamma=1., affinity='rbf', n_neighbors=10,
             eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1,
             kernel_params=None, norm_laplacian=True):
    """Initialise the estimator and remember the ``norm_laplacian`` flag.

    Every argument except ``norm_laplacian`` is forwarded by keyword to the
    parent class initialiser; ``norm_laplacian`` is stored on the instance.
    """
    parent_kwargs = dict(
        n_clusters=n_clusters,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        gamma=gamma,
        affinity=affinity,
        n_neighbors=n_neighbors,
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        degree=degree,
        coef0=coef0,
        kernel_params=kernel_params,
    )
    super(SpectralClustering, self).__init__(**parent_kwargs)
    self.norm_laplacian = norm_laplacian
def make_spectral_clustering(self, short_filenames, input_texts):
    """Cluster the input texts spectrally, then report, save and plot them.

    Steps: ensure ``<self.output_dir>spectral/`` exists; optionally write the
    TF-IDF table when ``self.need_tf_idf`` is set; vectorise the texts with
    word counts, reduce them to two components and normalise; cluster into
    ``self.spectral_clusters_count`` groups; emit a per-cluster listing of
    ``short_filenames`` through ``self.signals.PrintInfo``, write it to
    ``clusters.txt`` and draw the cluster plot.

    NOTE(review): the runs of '?' in the emitted messages look like
    mis-encoded text in this copy of the source; they are kept unchanged.
    """
    output_dir = self.output_dir + 'spectral/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("?????? TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    # Bag-of-words counts reduced to two normalised components.
    counts = CountVectorizer().fit_transform(input_texts)
    lsa = make_pipeline(TruncatedSVD(2), Normalizer(copy=False))
    X = lsa.fit_transform(counts)

    spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
    predict_result = spectral.fit_predict(X)
    self.signals.PrintInfo.emit('\n??????? ?? ??????????:\n')

    # Build the listing: one header per cluster followed by its documents.
    pieces = []
    for cluster_index in range(max(predict_result) + 1):
        pieces.append('??????? ' + str(cluster_index) + ':\n')
        for predicted, document in zip(predict_result, short_filenames):
            if predicted == cluster_index:
                pieces.append(' ' + str(document) + '\n')
        pieces.append('\n')
    clasters_output = ''.join(pieces)

    self.signals.PrintInfo.emit(clasters_output)
    self.signals.PrintInfo.emit('????????? ?:' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)


# aa = Affinity Propagation
def SpectralAccuracy():
    """Print the F1 score of 2-cluster spectral clustering on the BOW matrix.

    The matrix is unpickled from ``DATASET_PATH + "BOW_TDIDF.p"`` and the true
    labels from the first element of the pickle at ``OUTFILE_STANCE``.
    Cluster ids are arbitrary, so the score is computed for both possible
    mappings of the "Israeli" label to 0/1 and the larger one is printed.
    """
    clusterer = SpectralClustering(n_clusters=2)

    # NOTE: pickle.load can execute arbitrary code; these files must come
    # from a trusted source.
    # Context managers make sure both files are closed after reading.
    with open(DATASET_PATH + "BOW_TDIDF.p", "rb") as tdm_file:
        tdm = pickle.load(tdm_file)
    predictions = clusterer.fit_predict(tdm)

    with open(OUTFILE_STANCE, "rb") as stance_file:
        true_labels = pickle.load(stance_file)[0]

    numerical_mapped_1 = [0 if i == "Israeli" else 1 for i in true_labels]
    numerical_mapped_2 = [1 if i == "Israeli" else 0 for i in true_labels]
    one = f1_score(numerical_mapped_1, predictions)
    two = f1_score(numerical_mapped_2, predictions)
    print("The F1 score of Spectral Clustering on BOW (w/Tdidf) is: " + str(max(one, two)))
def split(self, node):
    """Split a tree node in two with a normalized cut of its affinity matrix.

    ``node`` is a dict with at least the keys 'affinity', 'items', 'depth',
    'height', 'size', 'leafs' and 'parent'.  On success the two children are
    attached to ``node['children']`` and the 'height', 'size' and 'leafs'
    counters of ``node`` and all of its ancestors are updated.

    Returns
    -------
    (left, right, ncut) : the two child dicts and the value of
        ``self.ncut_value`` for the split, or ``(None, None, 0)`` when the
        clustering fails or puts every item on the same side.
    """
    # Perform normalized cut
    try:
        ind = SpectralClustering(2, affinity='precomputed',
                                 assign_labels='discretize').fit_predict(node['affinity'])
    except Exception:
        # Clustering failures mean "cannot split".  KeyboardInterrupt and
        # SystemExit are not Exception subclasses, so they still propagate.
        return None, None, 0

    # Create left and right node
    mask1, mask2 = (ind == 0), (ind == 1)
    if not (np.any(mask1) and np.any(mask2)):
        return None, None, 0

    def make_child(label, mask):
        # Fresh leaf holding the items and sub-affinity of one side.
        return {
            'depth': node['depth'] + 1,
            'height': 0,
            'size': 0,
            'leafs': 1,
            'children': [],
            'parent': node,
            'items': [f for i, f in enumerate(node['items']) if ind[i] == label],
            'affinity': node['affinity'][np.ix_(mask, mask)],
        }

    left = make_child(0, mask1)
    right = make_child(1, mask2)

    # Force the node with the lower minimum distance to the query to be the
    # left node.  Items are already sorted when passed to fit(), so we just
    # need to look at the first item instead of re-computing all distances.
    if ind[0] == 1:
        left, right = right, left

    # Modify parent
    node['children'] = [left, right]

    # Modify parent chain
    parent = node
    while parent is not None:
        parent['height'] += 1
        parent['size'] += 2
        parent['leafs'] += 1
        parent = parent['parent']

    return left, right, self.ncut_value(node['affinity'], ind)
def _get_spectral(parameters):
    """Build a SpectralClustering estimator from a dict of keyword arguments.

    When ``parameters`` is None, two clusters with a nearest-neighbours
    affinity are used.
    """
    fallback = {
        'n_clusters': 2,
        'affinity': 'nearest_neighbors'
    }
    chosen = fallback if parameters is None else parameters
    return SpectralClustering(**chosen)
def test_spectral_clustering():
    """Check labels and pickling for every solver / assignment / input kind.

    A 7x7 affinity with two obvious blocks is clustered as a dense array and
    as a CSR matrix; the labels must split 3/4 and the fitted model must
    survive a dumps/loads round trip.
    """
    S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
                  [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                  [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]])
    expected = [1, 1, 1, 0, 0, 0, 0]

    for eigen_solver in ('arpack', 'lobpcg'):
        for assign_labels in ('kmeans', 'discretize'):
            for mat in (S, sparse.csr_matrix(S)):
                estimator = SpectralClustering(random_state=0,
                                               n_clusters=2,
                                               affinity='precomputed',
                                               eigen_solver=eigen_solver,
                                               assign_labels=assign_labels)
                model = estimator.fit(mat)

                # Cluster ids are arbitrary: flip them so the first block is 1.
                labels = model.labels_
                if labels[0] == 0:
                    labels = 1 - labels
                assert_array_equal(labels, expected)

                model_copy = loads(dumps(model))
                assert_equal(model_copy.n_clusters, model.n_clusters)
                assert_equal(model_copy.eigen_solver, model.eigen_solver)
                assert_array_equal(model_copy.labels_, model.labels_)
def test_spectral_amg_mode():
    """Test the amg mode of SpectralClustering.

    With pyamg installed the clustering must run and reach a minimal
    accuracy; without it, requesting the "amg" solver must raise ValueError.
    """
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    distances = pairwise_distances(X)  # Distance matrix
    similarity = np.max(distances) - distances  # Similarity matrix
    S = sparse.coo_matrix(similarity)

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        amg_loaded = False
    else:
        amg_loaded = True

    if not amg_loaded:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers),
                      random_state=0, eigen_solver="amg")
        return

    labels = spectral_clustering(S, n_clusters=len(centers),
                                 random_state=0, eigen_solver="amg")
    # We don't care too much that it's good, just that it *worked*.
    # There does have to be some lower limit on the performance though.
    assert_greater(np.mean(labels == true_labels), .3)
def test_spectral_unknown_mode():
    """Test that SpectralClustering fails with an unknown mode set."""
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, _ = make_blobs(n_samples=100, centers=centers,
                      cluster_std=1., random_state=42)
    distances = pairwise_distances(X)  # Distance matrix
    similarity = np.max(distances) - distances  # Similarity matrix
    S = sparse.coo_matrix(similarity)

    assert_raises(ValueError, spectral_clustering, S,
                  n_clusters=2, random_state=0, eigen_solver="<unknown>")
def test_spectral_unknown_assign_labels():
    """Test that SpectralClustering fails with an unknown assign_labels set."""
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, _ = make_blobs(n_samples=100, centers=centers,
                      cluster_std=1., random_state=42)
    distances = pairwise_distances(X)  # Distance matrix
    similarity = np.max(distances) - distances  # Similarity matrix
    S = sparse.coo_matrix(similarity)

    assert_raises(ValueError, spectral_clustering, S,
                  n_clusters=2, random_state=0, assign_labels="<unknown>")
def test_spectral_clustering_sparse():
    """Two tight blobs must be recovered exactly from a sparse RBF affinity."""
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    kernel = rbf_kernel(X, gamma=1)
    # Shift and clip so that very small affinities become exact zeros.
    kernel = np.maximum(kernel - 1e-4, 0)
    S = sparse.coo_matrix(kernel)

    model = SpectralClustering(random_state=0, n_clusters=2,
                               affinity='precomputed')
    labels = model.fit(S).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
def makeSpectral(X=None, k=2):
    """Return an unfitted SpectralClustering with ``k`` clusters.

    The estimator uses the ARPACK eigen solver and a nearest-neighbours
    affinity.  ``X`` is accepted but not used.
    """
    options = {
        'n_clusters': k,
        'eigen_solver': 'arpack',
        'affinity': "nearest_neighbors",
    }
    return cluster.SpectralClustering(**options)
def makeClusterers(X, k=2):
    """Return a list of ``(name, clusterer)`` pairs built by the make* helpers.

    ``X`` and ``k`` are passed on to the helpers that accept them.
    """
    clusterers = []
    clusterers.append(('MiniBatchKMeans', makeKMeans(X, k)))
    clusterers.append(('AffinityPropagation', makeAffinityProp()))
    clusterers.append(('MeanShift', makeMeanShift(X)))
    clusterers.append(('SpectralClustering', makeSpectral(X, k)))
    clusterers.append(('Ward', makeWard(X, k)))
    clusterers.append(('AgglomerativeAvg', makeAvgLinkage(X, k)))
    clusterers.append(('AgglomerativeMax', makeMaxLinkage(X, k)))
    clusterers.append(('AgglomerativeWard', makeWardLinkage(X, k)))
    clusterers.append(('DBSCAN', makeDBScan()))
    return clusterers