We have collected the following 20 code examples from open-source Python projects to illustrate how to use scipy.cluster.hierarchy.fcluster().
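Before the project code, a minimal sketch of the typical workflow may help orient the reader: build a condensed distance matrix with pdist, turn it into a linkage matrix, then cut it with fcluster. The data and parameter values below are illustrative and are not taken from any of the projects.

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

points = np.random.rand(20, 2)              # 20 observations with 2 features (made-up data)
dists = pdist(points, metric='euclidean')   # condensed pairwise distance matrix
Z = linkage(dists, method='average')        # hierarchical clustering -> linkage matrix
# Cut the dendrogram at distance 0.5: one flat cluster label per observation
labels = fcluster(Z, t=0.5, criterion='distance')
# Alternatively, ask for a fixed number of clusters
labels_k = fcluster(Z, t=3, criterion='maxclust')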
def formClusters(dists, link, distance):
    """Form clusters based on hierarchical clustering of input distance matrix
    with linkage type and cutoff distance

    :param dists: numpy matrix of distances
    :param link: linkage type for hierarchical clustering
    :param distance: distance at which to cut into clusters
    :return: list of cluster assignments
    """
    # Make distance matrix square
    dists = squareform(dists)
    # Compute linkage
    links = linkage(dists, link)
    # import matplotlib.pyplot as plt
    # from scipy.cluster import hierarchy
    # plt.figure(figsize=(15,5))
    # p = hierarchy.dendrogram(links)
    # Break into clusters based on cutoff
    clusters = fcluster(links, distance, criterion='distance')
    return clusters
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = nx.all_pairs_shortest_path_length(G)
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
def fcluster(df, Z, n_groups, n_clusters):
    """
    """
    # create flat cluster, i.e. maximal number of clusters...
    T = hac.fcluster(Z, criterion='maxclust', depth=2, t=n_clusters)

    # add cluster id to original dataframe
    df['cluster_id'] = np.nan
    # group is either days (1-365) or weeks (1-52)
    #for d in df.index.get_level_values('group').unique():
    for g in range(1, n_groups + 1):
        # T[g-1] because df.index is e.g. 1-365 (g) while T is indexed 0...364
        df.loc[g, 'cluster_id'] = T[g - 1]
    # add the cluster id to the index
    df.set_index(['cluster_id'], append=True, inplace=True)
    # set cluster id as first index level for easier looping through cluster_ids
    df.index = df.index.swaplevel(0, 'cluster_id')
    # just to have datetime at the last level of the multiindex df
    df.index = df.index.swaplevel('datetime', 'group')
    return df
def hierarchicalClustering(X, y, Maxclust, C, Method='single', Metric='euclidean'):
    # Perform hierarchical/agglomerative clustering on data matrix
    Z = linkage(X, method=Method, metric=Metric)

    # Compute and display clusters by thresholding the dendrogram
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    figure()
    #clusterplot(X, cls.reshape(cls.shape[0],1), y=y)
    clusterPlot(X, cls.reshape(cls.shape[0], 1), Maxclust, C, y=y)

    # Display dendrogram
    max_display_levels = 7
    figure()
    dendrogram(Z, truncate_mode='level', p=max_display_levels,
               color_threshold=0.5 * np.max(Z[:, 2]))
    title("Dendrogram of the Hierarchical Clustering")
    show()
def thrEstimation(self):
    x = 0.00
    dx = 0.05
    countsList = []
    x_list = []
    while x < 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        countsList.append(counter[Best])
        x += dx
        x_list.append(x)
    dy = np.diff(countsList)
    for a, b in zip(x_list, dy):
        if b == max(dy):
            return a
def get_cluster_assignments(sim_matrix, parameters):
    """
    (np.array, list of int) -> list of int
    sim_matrix: list of list of float -- similarity matrix between exemplars
    parameters: list of parameters in the format
        ["method:method_name", "algo:algo_name", "k:num_clusters", "damping:damping"]
        where order doesn't matter (k and damping are only relevant for certain
        clustering methods). The possible values for each parameter are listed
        in the function below.

    Returns a list of integers. The integer at each index of the list corresponds
    to the cluster number of the exemplar at the same index in sim_matrix.
    """
    algorithm = next((re.split(':', f)[1] for f in parameters if f[:4] == 'algo'), 'ap')
    # from {'hierarchical', 'kmeans', 'ap', 'ward'}
    method = next((re.split(':', f)[1] for f in parameters if f[:6] == 'method'), 'single')
    # from {'single', 'complete', 'average'} (only relevant for hierarchical clustering)
    kMk = next((int(re.split(':', f)[1]) for f in parameters if f[:1] == 'k'), 8)
    # any integer <= the data length
    damping = next((re.split(':', f)[1] for f in parameters if f[:7] == 'damping'), 0.5)
    # only relevant for AP -- in [0.5, 1]

    if algorithm == 'hierarchical':
        clustering = hierarchy.linkage(sim_matrix, method)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    elif algorithm == 'kmeans':
        cluster_assignments = KMeans(n_clusters=kMk).fit_predict(sim_matrix)
    elif algorithm == 'ap':
        cluster_assignments = AffinityPropagation().fit_predict(sim_matrix)
    elif algorithm == 'ward':
        clustering = hierarchy.ward(sim_matrix)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    return cluster_assignments
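A hypothetical call to the function above, showing the parameter-list format described in its docstring. The similarity matrix and the choice of k are made up, and the kmeans branch is used here because it does not depend on the project-specific helper get_k:

sim = [[1.0, 0.8, 0.1],
       [0.8, 1.0, 0.2],
       [0.1, 0.2, 1.0]]
# "algo:kmeans" selects the KMeans branch; "k:2" sets the number of clusters
labels = get_cluster_assignments(sim, ["algo:kmeans", "k:2"])
# labels holds one cluster id per row of sim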
def single_silhouette_dendrogram(dist_matrix, Z, threshold, mode='clusters',
                                 method='single', sample_names=None):
    """Compute the average silhouette at a given threshold.

    Parameters
    ----------
    dist_matrix : array-like
        Precomputed distance matrix between points.
    Z : array-like
        Linkage matrix, results of scipy.cluster.hierarchy.linkage.
    threshold : float
        Specifies where to cut the dendrogram.
    mode : ('clusters', 'thresholds'), optional
        Choose what to visualise on the x-axis.

    Returns
    -------
    x : float
        Based on mode, it can contain the number of clusters or the threshold.
    silhouette_avg : float
        The average silhouette.
    """
    cluster_labels = fcluster(Z, threshold, 'distance')
    nclusts = np.unique(cluster_labels).shape[0]
    save_results_clusters("res_{}_{:03d}_clust.csv".format(method, nclusts),
                          sample_names, cluster_labels)
    try:
        silhouette_list = silhouette_samples(dist_matrix, cluster_labels,
                                             metric="precomputed")
        silhouette_avg = np.mean(silhouette_list)
        x = max(cluster_labels) if mode == 'clusters' else threshold
    except ValueError as e:
        if max(cluster_labels) == 1:
            x = 1 if mode == 'clusters' else threshold
            silhouette_avg = 0
        else:
            raise(e)
    return x, silhouette_avg
def anglesCluster(angles):
    # this function uses hierarchical clustering to find the clusters of a set of angle values
    Dists = dist.pdist(angles, angleDiff)
    linkageMatrix = hier.linkage(Dists, metric=angleDiff)
    C = hier.fcluster(linkageMatrix, 5, 'maxclust')
    return C
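The example above relies on an angleDiff callable defined elsewhere in its project. A hypothetical distance function with the two-row signature that pdist expects could look like the sketch below, assuming angles are given in degrees as an (n, 1) array; this is an illustration of passing a custom metric to pdist, not the original project's code:

import numpy as np

def angleDiff(u, v):
    # hypothetical circular difference: e.g. 359 and 1 degrees are 2 degrees apart
    d = abs(float(u[0]) - float(v[0])) % 360.0
    return min(d, 360.0 - d)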
def clustering(partSeqList, partData):
    print('clustering for the separated dataset')
    #simiMatrix = simiMatrixCal(partData)
    '''Invoke the clustering method in library'''
    data_dist = pdist(partData, metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering' % (len(np.unique(clusterLabels))))

    clusNum = len(set(clusterLabels))
    instIndexPerClus = [[] for i in range(clusNum)]  # initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i] - 1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels, instIndexPerClus
def clustering(partSeqList, partData):
    '''Invoke the clustering method in library'''
    print('clustering for the separated dataset')
    data_dist = pdist(partData, metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering' % (len(np.unique(clusterLabels))))

    clusNum = len(set(clusterLabels))
    instIndexPerClus = [[] for i in range(clusNum)]  # initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i] - 1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels, instIndexPerClus
def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the similarity ratio as given by
    fuzzywuzzy.ratio is used as the distance measure

    Input:
      transactions: list of tuples representing items on each transaction
      match: minimum similarity ratio (0 to 100) for clustering

    Output:
      transactions: new version of the transactions, where each item has been
        replaced by the first item on its corresponding cluster
      word_clusters: dictionary that maps the cluster for each item in the
        transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)

    l = [((a, b), 100 - Levenshtein.ratio(str(a), str(b)))
         for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')

    clusters_index = fcluster(r, 100 - match, "distance")

    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]

    word_clusters = {word: clusters[clusters_index[i]] for i, word in enumerate(words)}

    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set([word_clusters[word][0] for word in transaction]))
        new_transactions.append(new_transaction)

    return new_transactions, word_clusters
def chc(X, K, params=()):
    pnames = ['linkage_method', 'distance']
    dflts  = ['ward',           'euclidean']
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)

    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')

    return clustVec2partMat(C, K)


# Other related functions
def cluster_helices(helices, cluster_distance=12.0):
    """Clusters helices according to the minimum distance between the line segments representing their backbone.

    Notes
    -----
    Each helix is represented as a line segment joining the CA of its first
    Residue to the CA of its final Residue. The minimal distance between
    pairwise line segments is calculated and stored in a
    condensed_distance_matrix. This is clustered using the 'single' linkage
    metric (all members of cluster i are at < cluster_distance away from at
    least one other member of cluster i). Helices belonging to the same
    cluster are grouped together as values of the returned cluster_dict.

    Parameters
    ----------
    helices: Assembly
    cluster_distance: float

    Returns
    -------
    cluster_dict: dict
        Keys: int
            cluster number
        Values: [Polymer]
    """
    condensed_distance_matrix = []
    for h1, h2 in itertools.combinations(helices, 2):
        md = minimal_distance_between_lines(h1[0]['CA']._vector, h1[-1]['CA']._vector,
                                            h2[0]['CA']._vector, h2[-1]['CA']._vector,
                                            segments=True)
        condensed_distance_matrix.append(md)
    z = linkage(condensed_distance_matrix, method='single')
    clusters = fcluster(z, t=cluster_distance, criterion='distance')
    cluster_dict = {}
    for h, k in zip(helices, clusters):
        if k not in cluster_dict:
            cluster_dict[k] = [h]
        else:
            cluster_dict[k].append(h)
    return cluster_dict
def checkMultiplicity(self, thr):
    FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
    counter = collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]
    print('You are clustering with a threshold of %s' % (thr))
    print('The biggest cluster contains %s datasets from a total of %s' % (counter[Best], len(self.labelList)))
def completenessEstimation(self):
    x = 0.00
    dx = 0.05
    while x > 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
def minimalForCompleteness(self):
    print("Running estimator for minimal threshold for completeness")
    labels = self.createLabels()
    x = 0.00
    dx = 0.05
    countsList = {}
    x_list = []
    while x < 1:
        Arrays = {}
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        toProcess = [Best]
        y = 0
        for cluster, filename in zip(FlatC, labels):
            if cluster in toProcess:
                hklFile = any_reflection_file(filename)
                b = hklFile.as_miller_arrays()
                for column in b:
                    if column.is_xray_intensity_array():
                        Arrays[y] = column
                        break
                y += 1
        try:
            Arr = Arrays[0]
        except KeyError:
            # no intensity array found at this threshold
            countsList[x] = 0
            x += dx
            continue
        for label in range(1, y):
            try:
                Arr = Arr.concatenate(Arrays[label])
            except Exception:
                pass
        countsList[x] = Arr.completeness()
        x += dx

    # return the minimal threshold whose completeness exceeds 0.98
    L = []
    for key in countsList:
        if countsList[key] > 0.98:
            L.append(key)
    L.sort()
    return L[0]
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200, method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))
def community_detection(self, cutoff_threshold=0.55):
    '''
    Finding communities of features using pairwise differences of solutions
    acquired in the main LP step.
    '''
    svm_solution = self._svm_coef
    abs_svm_sol = np.abs(svm_solution)
    om = self._omegas
    mins = om[:, 0, :]
    maxs = om[:, 1, :]
    abs_mins = np.abs(mins)
    abs_maxs = np.abs(maxs)

    # Aggregate min and max solution values to obtain the absolute variation
    lower_variation = abs_svm_sol - abs_mins
    upper_variation = abs_maxs - abs_svm_sol
    variation = np.abs(lower_variation) + np.abs(upper_variation)

    # add up lower triangular matrix to upper one
    collapsed_variation = np.triu(variation) + np.tril(variation).T
    np.fill_diagonal(collapsed_variation, 0)
    #collapsed_variation = pd.DataFrame(collapsed_variation)

    # Create distance matrix
    dist_mat = np.triu(collapsed_variation).T + collapsed_variation
    # normalize
    dist_mat = 1 - dist_mat / np.max(dist_mat)
    # get numpy array
    #dist_mat = dist_mat.values[:]
    # feature with itself has no distance
    np.fill_diagonal(dist_mat, 0)
    # convert to squareform for scipy compat.
    dist_mat_square = squareform(dist_mat)

    # Execute clustering
    link = linkage(dist_mat_square, method="ward")

    # Set cutoff at which threshold the linkage gets flattened (clustering)
    RATIO = cutoff_threshold
    threshold = RATIO * np.max(link[:, 2])  # max of branch lengths (distances)
    feature_clustering = fcluster(link, threshold, criterion="distance")

    return feature_clustering, link
def cluster_contigs(outbase, d, bin_chr, bin_position, threads=4, frac=0.66, iterations=30,
                    maxnumchr=500, seed=0, dpi=100, minchr=3, nchr=0):
    """Estimate number of chromosomes and assign contigs to chromosomes"""
    logger(" Estimating number of chromosomes...")
    prng = np.random  #.RandomState(seed)
    Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method="ward")

    # estimate chromosome number if not specified
    if not nchr:
        dZ = []
        n = d.shape[0]
        subset = int(round(n * frac))
        if subset < maxnumchr:
            maxnumchr = subset - 1
        '''
        24 vs 31s on 4 cores - it's not worth as need to pickle large matrix and pass it through
        args = (d[prng.permutation(n)[:subset], :][:, prng.permutation(n)[:subset]] for i in range(iterations))
        p = Pool(threads)
        for i, _dZ in enumerate(p.imap_unordered(worker, args), 1):
            sys.stderr.write(' %s / %s \r'%(i, iterations))
            dZ.append(_dZ)
        '''
        for i in range(iterations):
            sys.stderr.write(' %s / %s \r' % (i + 1, iterations))
            selection = prng.permutation(n)[:subset]
            iZ = fastcluster.linkage(d[selection, :][:, selection][np.triu_indices(subset, 1)],
                                     method="ward")
            dZ.append(np.diff(iZ[:, 2]))
        #'''
        # get number of chromosomes
        mean_step_len = np.mean(np.array(dZ), 0)
        nchr = np.argmax(mean_step_len[-minchr+1::-1][:maxnumchr]) + minchr
        logger(" number of chromosomes: %s" % nchr)

        plt.figure(figsize=(15, 5))
        plt.title("Number of clusters estimation")
        plt.plot(np.arange(maxnumchr, 1, -1), mean_step_len[-maxnumchr+1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outbase + '.avg_step_len.svg', dpi=dpi)

        plt.figure()
        plt.title("Number of clusters estimation")
        plt.plot(np.arange(50, 1, -1), mean_step_len[-50+1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outbase + '.avg_step_len_50.svg', dpi=dpi)

    logger(" Assigning contigs to chromosomes...")
    clusters = [[] for n in range(nchr)]
    assignments = sch.fcluster(Z, t=nchr, criterion='maxclust')
    for i, c in zip(assignments - 1, bin_chr):
        clusters[i].append(c)
    return clusters
def merge(self, anomFlag, thr):
    FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
    Log = open(self.CurrentDir + '/.cc_cluster.log', 'a')
    counter = collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]
    Process = True
    # change checkboxes to standard variables
    if Process:
        ToProcess = [Best]
    else:
        ToProcess = set(Clusters)

    for key in ToProcess:
        if counter[key] == 1:
            ToProcess = [x for x in ToProcess if x != key]

    # Processing pipeline,
    # Does all the XSCALE run
    for x in ToProcess:
        if [thr, x, anomFlag] not in self.alreadyDone:
            os.mkdir(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s' % (float(thr), x, anomFlag))
            Xscale = open(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/XSCALE.INP' % (float(thr), x, anomFlag), 'a')
            Pointless = open(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/launch_pointless.sh' % (float(thr), x, anomFlag), 'a')
            print('OUTPUT_FILE=scaled.hkl', file=Xscale)
            print('MERGE= TRUE', file=Xscale)
            print('pointless hklout clustered.mtz << eof', file=Pointless)
            if anomFlag == 'ano':
                print('FRIEDEL\'S_LAW= FALSE', file=Xscale)
            elif anomFlag == 'no_ano':
                print('FRIEDEL\'S_LAW= TRUE', file=Xscale)
            Xscale.close()
            Pointless.close()

    for cluster, filename in zip(FlatC, self.labelList):
        if cluster in ToProcess:
            OUT = open(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/XSCALE.INP' % (float(thr), cluster, anomFlag), 'a')
            Pointless = open(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/launch_pointless.sh' % (float(thr), cluster, anomFlag), 'a')
            print('INPUT_FILE= ../%s' % (filename), file=OUT)
            #print('INCLUDE_RESOLUTION_RANGE=20, 1.8', file=OUT)
            print('MINIMUM_I/SIGMA= 0', file=OUT)
            print('XDSIN ../%s' % (filename), file=Pointless)
            OUT.close()
            Pointless.close()

    # optional run of XSCALE
    newProcesses = []
    for x in ToProcess:
        if [thr, x, anomFlag] not in self.alreadyDone:
            plt.savefig(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/Dendrogram.png' % (float(thr), x, anomFlag))
            P = subprocess.Popen('/opt/pxsoft/xds/vdefault/linux-x86_64/xscale_par',
                                 cwd=self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/' % (float(thr), x, anomFlag))
            P.wait()
            print('Cluster, %s , %s , %s' % (float(thr), x, anomFlag), file=Log)
            Pointless = open(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/launch_pointless.sh' % (float(thr), x, anomFlag), 'a')
            print('COPY \n bg\n TOLERANCE 4 \n eof', file=Pointless)
            Pointless.close()
            os.chmod(self.CurrentDir + '/cc_Cluster_%.2f_%s_%s/launch_pointless.sh' % (self.threshold, x, self.anomFlag),
                     st.st_mode | 0o111)
            newProcesses.append([thr, x, anomFlag])