The following 50 code examples, extracted from open-source Python projects, illustrate how to use scipy.cluster.hierarchy.linkage().
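Before the project examples, here is a minimal, self-contained sketch of the call pattern most of them share (the data and parameter values are illustrative, not taken from any project below):

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

X = np.random.rand(20, 3)                        # 20 observations, 3 features
y = pdist(X, metric='euclidean')                 # condensed distance vector
Z = linkage(y, method='average')                 # (n-1) x 4 linkage matrix
labels = fcluster(Z, t=3, criterion='maxclust')  # cut the tree into 3 flat clusters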
def test_cuttreeHybrid():
    from dynamicTreeCut import cutreeHybrid
    d = np.transpose(np.arange(1, 10001).reshape(100, 100))
    distances = pdist(d, "euclidean")
    link = linkage(distances, "average")
    test = cutreeHybrid(link, distances)
    true = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1]
    assert (test['labels'] == true).all()
def formClusters(dists, link, distance):
    """Form clusters based on hierarchical clustering of input distance
    matrix with linkage type and cutoff distance

    :param dists: numpy matrix of distances
    :param link: linkage type for hierarchical clustering
    :param distance: distance at which to cut into clusters
    :return: list of cluster assignments
    """
    # Make distance matrix square
    dists = squareform(dists)
    # Compute linkage
    links = linkage(dists, link)

    # import matplotlib.pyplot as plt
    # from scipy.cluster import hierarchy
    # plt.figure(figsize=(15,5))
    # p = hierarchy.dendrogram(links)

    # Break into clusters based on cutoff
    clusters = fcluster(links, distance, criterion='distance')
    return clusters
def generate_graphs(clusters_list, output, size, linkage, cutoff, distances):
    """
    DESCRIPTION
    Create a linear cluster mapping graph where every frame is printed as a
    colored barplot
    Args:
        clusters_list (list): list of cluster numbers per frame
        output (string): output name for the graph
    Return:
        colors_list (list): to be used with the 2D distance projection graph
    """
    colors_list = plot_barplot(clusters_list, output, size)
    plot_dendro(linkage, output, cutoff, colors_list, clusters_list)
    plot_hist(clusters_list, output, colors_list)
    if (distances.shape[0] < 10000):
        implot(distances, output)
    else:
        printScreenLogfile("Too many frames! The RMSD distance matrix will not be generated")
    return colors_list
def linkage(df, n_groups):
    # Create the distance matrix based on the Frobenius norm: |A-B|_F, where A
    # is a 24 x N matrix with N the number of time series inside the dataframe df.
    # TODO: We could save half the time, as we only need the upper triangle once
    # because the distance matrix is symmetric.
    Y = np.empty((n_groups, n_groups,))
    Y[:] = np.NAN
    for i in range(len(Y)):
        for j in range(len(Y[i, :])):
            A = df.loc[i+1].values
            B = df.loc[j+1].values
            #print('Computing distance of:{},{}'.format(i,j))
            Y[i, j] = norm(A-B, ord='fro')

    # condensed distance matrix as vector for linkage (upper triangle as a vector)
    y = Y[np.triu_indices(n_groups, 1)]
    # create linkage matrix with Ward's algorithm and the Euclidean norm
    Z = hac.linkage(y, method='ward', metric='euclidean')
    # R = hac.inconsistent(Z, d=10)
    return Z
def hierarchicalClustering(X, y, Maxclust, C, Method='single', Metric='euclidean'):
    # Perform hierarchical/agglomerative clustering on data matrix
    Z = linkage(X, method=Method, metric=Metric)

    # Compute and display clusters by thresholding the dendrogram
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    figure()
    #clusterplot(X, cls.reshape(cls.shape[0],1), y=y)
    clusterPlot(X, cls.reshape(cls.shape[0], 1), Maxclust, C, y=y)

    # Display dendrogram
    max_display_levels = 7
    figure()
    dendrogram(Z, truncate_mode='level', p=max_display_levels,
               color_threshold=0.5*np.max(Z[:, 2]))
    title("Dendrogram of the Hierarchical Clustering")
    show()
def array2tree(d, names, outbase="", method="ward"):
    """Return tree representation for array"""
    # cluster
    Z = sch.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
    # get ete Tree
    t = distance_matrix2tree(Z, names)
    # save tree & newick
    if outbase:
        pdf, nw = outbase+".nw.pdf", outbase+".nw"
        with open(nw, "w") as out:
            out.write(t.write())
        ts = ete3.TreeStyle()
        ts.show_leaf_name = False
        ts.layout_fn = mylayout
        t.render(pdf, tree_style=ts)
    return t
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_structured_linkage_tree():
    # Check that we obtain the correct solution for structured linkage trees.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)
        # Check that ward_tree raises a ValueError with a connectivity matrix
        # of the wrong shape
        assert_raises(ValueError, tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError, tree_builder, X.T[:0], connectivity)
def test_unstructured_linkage_tree():
    # Check that we obtain the correct solution for unstructured linkage trees.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    for this_X in (X, X[0]):
        # With a specified number of clusters, just for the sake of
        # raising a warning and testing the warning code
        with ignore_warnings():
            children, n_nodes, n_leaves, parent = assert_warns(
                UserWarning, ward_tree, this_X.T, n_clusters=10)
        n_nodes = 2 * X.shape[1] - 1
        assert_equal(len(children) + n_leaves, n_nodes)

    for tree_builder in _TREE_BUILDERS.values():
        for this_X in (X, X[0]):
            with ignore_warnings():
                children, n_nodes, n_leaves, parent = assert_warns(
                    UserWarning, tree_builder, this_X.T, n_clusters=10)
            n_nodes = 2 * X.shape[1] - 1
            assert_equal(len(children) + n_leaves, n_nodes)
def test_scikit_vs_scipy():
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            children_ = out[:, :2].astype(int)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def tree(self):
    data = self.ccTable
    Matrix = np.zeros((self.Dimension, self.Dimension))
    reducedArray = []
    for line in data:
        #print line
        if line is not None and len(line) != 0:
            Matrix[line[0], line[1]] = line[2]
            Matrix[line[1], line[0]] = line[2]
    for x in range(0, self.Dimension):
        for y in range(x+1, self.Dimension):
            reducedArray.append(Matrix[x, y])
    Distances = np.array(reducedArray, dtype=float)
    self.Tree = hierarchy.linkage(Distances, 'complete')
    return self.Tree

# new function, choosing average linkage
def avgTree(self):
    data = self.ccTable
    Matrix = np.zeros((self.Dimension, self.Dimension))
    reducedArray = []
    for line in data:
        #print line
        if line is not None and len(line) != 0:
            Matrix[line[0], line[1]] = line[2]
            Matrix[line[1], line[0]] = line[2]
    for x in range(0, self.Dimension):
        for y in range(x+1, self.Dimension):
            reducedArray.append(Matrix[x, y])
    Distances = np.array(reducedArray, dtype=float)
    self.Tree = hierarchy.linkage(Distances, 'average')
    return self.Tree

# Function added to plot dendrogram in shell mode only.
# Still not functioning; uncomment when needed.
def cluster_words(words, service_name, size):
    stopwords = ["GET", "POST", "total", "http-requests", service_name, "-", "_"]
    cleaned_words = []
    for word in words:
        for stopword in stopwords:
            word = word.replace(stopword, "")
        cleaned_words.append(word)

    def distance(coord):
        i, j = coord
        # jaro_distance is assumed to come from a string-similarity package
        # such as jellyfish (the import is not shown in this snippet)
        return 1 - jaro_distance(cleaned_words[i], cleaned_words[j])

    indices = np.triu_indices(len(words), 1)
    distances = np.apply_along_axis(distance, 0, indices)
    return cluster_of_size(linkage(distances), size)
def cluster_sequences(sequences, minsize=5):
    """
    Cluster the given sequences into groups of similar sequences.

    Return a triple that contains a pandas.DataFrame with the edit distances,
    the linkage result, and a list that maps sequence ids to their cluster id.
    If an entry is zero in that list, it means that the sequence is not part
    of a cluster.
    """
    matrix = distances(sequences)
    linkage = hierarchy.linkage(distance.squareform(matrix), method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(linkage))
    prev = linkage[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    cl = 1
    for n in inner:
        if n.dist > 0 and prev / n.dist < 0.8 \
                and n.left.count >= minsize and n.right.count >= minsize:
            for id in collect_ids(n.left):
                # Do not overwrite previously assigned ids
                if clusters[id] == 0:
                    clusters[id] = cl
            cl += 1
        prev = n.dist
    # At the end of the above loop, we have not processed the rightmost
    # subtree. In our experiments, it never contains true novel sequences,
    # so we omit it.
    return pd.DataFrame(matrix), linkage, clusters
def get_cell_data(n=50, seed=0):
    np.random.seed(seed)
    cells_data = np.load('./data/cells_data.npy')
    sample_cells = np.random.choice(cells_data.shape[0], n, replace=False)
    D = pdist(cells_data[sample_cells, :], 'euclidean')
    Z = linkage(D, 'ward')
    return cells_data, Z, D
def get_random_data(n=50, seed=0):
    np.random.seed(seed)
    data = np.random.choice(10000, (n, 1), replace=False)
    D = pdist(data, 'euclidean')
    Z = linkage(D, 'ward')
    return data, Z, D
def plotDend(esd, filename=None):
    """Summary
    Function to display an electrostatic similarity dendrogram from a
    previously run ElecSimilarity class.

    Parameters
    ----------
    esd : ElecSimilarity class
        ElecSimilarity class containing final esd matrix.
    filename : str, optional
        If the resulting plot should be written to disk, specify a filename.
        Otherwise, the image will only be displayed.

    Returns
    -------
    None
        Writes image to disk, if desired.
    """
    # plt.style.use('seaborn-talk')
    fig, ax = plt.subplots(sharey=True)
    Z = cluster.linkage(esd.esd)
    cluster.dendrogram(
        Z,
        labels=esd.ids,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
        ax=ax)
    plt.xlabel('Variants')
    plt.ylabel('ESD')
    plt.tight_layout()
    if filename is not None:
        fig.savefig(filename)
def build_clusters(predicted_scores, method='centroid'):
    """agglomerative clustering using predicted scores as distances

    Args:
        predicted_scores: predicted scores for all mentions in documents
        method: method for calculating distance between clusters; see the
            scipy.cluster.hierarchy.linkage documentation

    Returns:
        clustering, min_score and max_score in predicted_scores
    """
    print('building clusters')
    min_score = 1e10
    max_score = 0
    clustering = []
    for doc_id in tqdm(range(len(predicted_scores))):
        scores = predicted_scores[doc_id]
        if len(scores) > 0:
            distances = []
            for i in range(len(scores)):
                for j in range(i + 1, len(scores)):
                    distances.append((scores[i, j] + scores[j, i]) / 2)
            c = linkage(distances, method=method)
            clustering.append(c)
            min_score = min(min(c[:, 2]), min_score)
            max_score = max(max(c[:, 2]), max_score)
    print('clusters are built: min_score: {} max_score: {}'.format(min_score, max_score))
    return clustering, min_score, max_score
def tree_from_linkage_matrix(linkage, leaf_labels):
    """
    Form an ete3.Tree from hierarchical linkage matrix.

    Linkage should be the matrix returned by hierarchy.linkage.
    leaf_labels should be a vector of names for the nodes corresponding
    to the clustered items.

    Internal nodes will be named node0, node1, etc, in the order in which
    the clusters they represent were formed.

    returns: new Tree
    """
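The body of this example did not survive extraction. Below is a hypothetical sketch of one way the documented conversion could be implemented; the helper name tree_from_linkage_sketch is ours, not the project's, and branch lengths are ignored for brevity:

import ete3
from scipy.cluster import hierarchy

def tree_from_linkage_sketch(linkage_matrix, leaf_labels):
    # to_tree(..., rd=True) returns the root plus a list of ClusterNodes,
    # ordered so that every child id is smaller than its parent id
    root, nodes = hierarchy.to_tree(linkage_matrix, rd=True)
    n_leaves = len(leaf_labels)
    ete_nodes = {}
    for node in nodes:
        name = (leaf_labels[node.id] if node.id < n_leaves
                else "node%d" % (node.id - n_leaves))
        t = ete3.Tree(name=name)
        if not node.is_leaf():
            # children were already created in an earlier iteration
            t.add_child(ete_nodes[node.left.id])
            t.add_child(ete_nodes[node.right.id])
        ete_nodes[node.id] = t
    return ete_nodes[root.id]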
def cluster(target_sequence_ids, fasta_filename, method='average'):
    """
    Form distance-based hierarchical clustering of sequences.

    Looks up each entry in target_sequence_ids in the file specified by
    fasta_filename to obtain an associated DNA sequence.

    In principle, we could just work with the Hamming distance, but the
    sequences may be of different lengths (mostly small differences.)
    So we need a more sophisticated approach: we use pairwise global
    alignment, scoring 0 for a match, -1 for a mismatch, and -1.5 for
    opening or extending a gap. We then take the distance to be
    -1.0*(score).

    UPGMA clustering is used when method='average', the default.

    Returns the distance matrix and the linkage matrix returned by the
    clustering routine.
    """
    # globalms arguments: seq1, seq2, match, mismatch, open, extend
    distance = lambda seq1, seq2: -1.0*(
        pairwise2.align.globalms(seq1, seq2, 0, -1, -1.5, -1.5,
                                 score_only=True)
    )
    sequences = fasta_to_dict(fasta_filename)
    N = len(target_sequence_ids)
    distances = np.zeros((N, N))
    # fill in the upper triangle
    for i, seqid1 in enumerate(target_sequence_ids):
        seq1 = sequences[seqid1]
        for j_offset, seqid2 in enumerate(target_sequence_ids[i+1:]):
            j = j_offset + i + 1
            seq2 = sequences[seqid2]
            distances[i][j] = distance(seq1, seq2)
    # convert to the form expected by the scipy clustering routines
    y = squareform(distances, checks=False)
    return distances, hierarchy.linkage(y, method)
def hierarchy(data, axis, method, metric):
    if axis == 'columns':
        data = data.transpose()
    clusters = range(len(data.index), 2*len(data.index) - 1)
    result = pd.DataFrame(
        linkage(data, method=method, metric=metric),
        columns=['child1', 'child2', 'distance', 'size'],
        index=clusters)
    for col in ['child1', 'child2', 'size']:
        result[col] = result[col].astype(int)
    return result
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    row_linkmat, col_linkmat = None, None
    if row:
        distmat = dist.pdist(df, metric)
        row_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]
    if column:
        df = df.T
        distmat = dist.pdist(df, metric)
        col_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(col_linkmat), :].T
    return df, row_linkmat, col_linkmat
def docv_centroid_order_idx(meta_clusters):
    dist = cdist(meta_clusters, meta_clusters, metric='cosine')
    # Compute the linkage and the order
    # (note: linkage treats this square matrix as a set of observations;
    # pass squareform(dist) instead to cluster on the pairwise distances themselves)
    linkage = hierarchy.linkage(dist, method='average')
    d_idx = hierarchy.dendrogram(linkage, no_plot=True)["leaves"]
    return d_idx
def cluster_2d_array_rows(array_2d,
                          linkage_method='average',
                          distance_function='euclidean'):
    """
    Cluster array_2d rows.

    Arguments:
        array_2d (array): (n_rows, n_columns)
        linkage_method (str): linkage method compatible with
            scipy.cluster.hierarchy.linkage
        distance_function (str | callable): distance function compatible with
            scipy.cluster.hierarchy.linkage

    Returns:
        array: (n_rows); clustered row indices
    """
    clustered_indices = dendrogram(
        linkage(array_2d, method=linkage_method, metric=distance_function),
        no_plot=True)['leaves']
    return array(clustered_indices)
def __init__(self):
    """
    From https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html:

    A (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with
    indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster
    with an index less than n corresponds to one of the original observations.
    The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The
    fourth value Z[i, 3] represents the number of original observations in the
    newly formed cluster.
    """
    self.linkage = None
    self.series = None
    self._series_y = None
    self.ts_height_factor = None
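To make that matrix format concrete, here is a small standalone sketch (not part of the original class) that prints each merge step of a linkage matrix:

import numpy as np
from scipy.cluster.hierarchy import linkage

X = np.array([[0.0], [0.1], [5.0], [5.1]])  # four 1-D observations
Z = linkage(X, method='single')
n = len(X)
for i, (a, b, d, c) in enumerate(Z):
    print("step %d: merge clusters %d and %d at distance %.2f "
          "-> new cluster %d with %d observations"
          % (i, int(a), int(b), d, n + i, int(c)))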
def maxnode(self):
    return len(self.series) - 1 + len(self.linkage)
def get_linkage(self, node):
    if node < len(self.series):
        return None
    idx = int(node - len(self.series))
    return self.linkage[idx]
def fit(self, series, *args, **kwargs):
    self.series = series
    self.linkage = []
    new_nodes = {i: i for i in range(len(series))}
    if self._model.merge_hook:
        old_merge_hook = self._model.merge_hook
    else:
        old_merge_hook = None

    def merge_hook(from_idx, to_idx, distance):
        # print('merge_hook', from_idx, to_idx)
        new_idx = len(self.series) + len(self.linkage)
        # print('adding to linkage: ', new_nodes[from_idx], new_nodes[to_idx], distance, 0)
        if new_nodes[from_idx] is None:
            raise Exception('Trying to merge series that is already merged')
        self.linkage.append((new_nodes[from_idx], new_nodes[to_idx], distance, 0))
        new_nodes[to_idx] = new_idx
        new_nodes[from_idx] = None
        if old_merge_hook:
            old_merge_hook(from_idx, to_idx, distance)

    self._model.merge_hook = merge_hook
    result = self._model.fit(series, *args, **kwargs)
    self._model.merge_hook = old_merge_hook
    return result
def __init__(self, dists_fun, dists_options):
    """Hierarchical clustering using the SciPy linkage function.

    This is the same algorithm as in Hierarchical, but roughly 10 times
    faster, with fewer options to steer the clustering (e.g. no possibility
    to give weights). It still computes the entire distance matrix first and
    is thus not ideal for extremely large data sets.
    """
    super().__init__()
    self.dists_fun = dists_fun
    self.dists_options = dists_options
def example_naivehierarchicalclustering():
    """Naive hierarchical clustering algorithm using DTW.

    For a more efficient approach, check:
    Mueen, A and Keogh, E, Extracting Optimal Performance from Dynamic
    Time Warping, Tutorial, KDD 2016
    http://www.cs.unm.edu/~mueen/DTW.pdf

    :return: None
    """
    series = [
        np.array([0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0]),
        np.array([0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0]),
        np.array([2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]),
        np.array([2.0, 1.0, 1.0, 0.0, 0.0, 2.0, 3.0]),
        np.array([4.0, 2.0, 1.0, 0.0, 0.0, 1.0, 3.0])
    ]
    dists = dtw.distance_matrix_fast(series)
    print("Distance matrix:\n{}".format(dists))

    dists_cond = np.zeros(size_cond(len(series)))
    idx = 0
    for r in range(len(series)-1):
        dists_cond[idx:idx+len(series)-r-1] = dists[r, r+1:]
        idx += len(series)-r-1

    z = linkage(dists_cond, method='complete', metric='euclidean')
    print(z)

    fig, axes = plt.subplots(2, 1, figsize=(8, 3))
    for idx, serie in enumerate(series):
        serie += idx * 0.1
        axes[0].plot(serie, label=str(idx))
        axes[0].text(0 + 0.15 * (-1)**idx * idx, serie[0] + 0.15 * idx, idx)
        axes[0].add_line(Line2D([0, 0 + 0.15 * (-1)**idx * idx],
                                [serie[0], serie[0] + 0.15 * idx],
                                linewidth=1, color='gray'))
    axes[0].legend(loc=1)
    dendrogram(z, ax=axes[1])
    plt.show(block=True)
def __init__(self, words, vectors, number_of_steps=21, metric="cosine", linkage="complete"):
    self.words = words
    self.vectors = vectors
    self.number_of_steps = number_of_steps
    self.metric = metric
    self.linkage = linkage
def __call__(self):
    if len(self.words) == 0 or len(self.vectors) == 0:
        return []
    distance_matrix = scidist.pdist(np.array(self.vectors), self.metric)
    linkage_matrix = hier.linkage(distance_matrix, self.linkage)
    dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix, self.words, self.vectors)
    clusterings = self._create_clusterings(dendrogram)
    return [[(node.label, node.vector) for node in _get_cluster_nodes(cluster)]
            for cluster in self._find_optimal_clustering(clusterings)]
def test_dendrogram_children():
    # temporary solution for
    # https://stackoverflow.com/questions/40239956/node-indexing-in-hierarachical-clustering-dendrograms
    import numpy as np
    from scipy.cluster.hierarchy import dendrogram, linkage
    from freediscovery.cluster import _DendrogramChildren

    # generate two clusters: a with 10 points, b with 5:
    np.random.seed(1)
    a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[10, ])
    b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[5, ])
    X = np.concatenate((a, b),)
    Z = linkage(X, 'ward')
    # make distances between pairs of children uniform
    # (re-scales the horizontal (distance) axis when plotting)
    Z[:, 2] = np.arange(Z.shape[0]) + 1
    ddata = dendrogram(Z, no_plot=True)

    dc = _DendrogramChildren(ddata)
    idx = 0
    # check that we can compute children for all nodes
    for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        node_children = dc.query(idx)
        idx += 1
    # last level node should encompass all samples
    assert len(node_children) == X.shape[0]
    assert_allclose(sorted(node_children), np.arange(X.shape[0]))
def test_binary_linkage2clusters():
    from freediscovery.cluster.utils import _binary_linkage2clusters
    from sklearn.metrics import v_measure_score

    n_samples = 10
    linkage = np.array([[1, 2], [2, 3], [5, 7], [6, 9]])

    cluster_id = _binary_linkage2clusters(linkage, n_samples)
    cluster_id_ref = np.array([0, 1, 1, 1, 2, 3, 4, 3, 5, 4])

    assert cluster_id.shape == cluster_id_ref.shape
    # i.e. same clusters
    assert v_measure_score(cluster_id, cluster_id_ref) == 1.0
def get_k(clustering, depth=10):
    """
    (ndarray, int) -> int

    clustering: ndarray -- linkage matrix representing hierarchical clustering
    depth: int -- the maximum depth to traverse clustering

    Returns the number of clusters to extract from the hierarchical
    clustering using the elbow method.
    """
    last = clustering[-depth:, 2]
    acceleration = np.diff(last, 2)
    acceleration_rev = acceleration[::-1]
    k = acceleration_rev.argmax() + 2
    return k
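A quick illustration of get_k on synthetic data with well-separated groups (our own sketch; it assumes the get_k defined above is in scope):

import numpy as np
from scipy.cluster.hierarchy import linkage

rng = np.random.RandomState(0)
blobs = np.vstack([rng.randn(20, 2),
                   rng.randn(20, 2) + 10,
                   rng.randn(20, 2) - 10])
Z = linkage(blobs, method='ward')
print(get_k(Z, depth=10))  # prints the elbow-method estimate of the cluster count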
def get_cluster_assignments(sim_matrix, parameters):
    """
    (np.array, list of int) -> list of int

    sim_matrix: list of list of float -- similarity matrix between exemplars
    parameters: list of parameters in the format
        ["method:method_name", "algo:algo_name", "k:num_clusters", "damping:damping"]
        where order doesn't matter (k and damping are only relevant for certain
        clustering methods); the possible values for each parameter are listed
        in the function below.

    Returns a list of integers. The integer at each index of the list
    corresponds to the cluster number of the exemplar at the same index
    in sim_matrix.
    """
    algorithm = next((re.split(':', f)[1] for f in parameters if f[:4] == 'algo'), 'ap')
    # from {'hierarchical', 'kmeans', 'ap', 'ward'}
    method = next((re.split(':', f)[1] for f in parameters if f[:6] == 'method'), 'single')
    # from {'single', 'complete', 'average'} (only relevant for hierarchical clustering)
    kMk = next((int(re.split(':', f)[1]) for f in parameters if f[:1] == 'k'), 8)
    # any integer <= the data length
    damping = next((re.split(':', f)[1] for f in parameters if f[:7] == 'damping'), 0.5)
    # only relevant for AP -- in [0.5, 1]

    if algorithm == 'hierarchical':
        clustering = hierarchy.linkage(sim_matrix, method)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    elif algorithm == 'kmeans':
        cluster_assignments = KMeans(n_clusters=kMk).fit_predict(sim_matrix)
    elif algorithm == 'ap':
        cluster_assignments = AffinityPropagation().fit_predict(sim_matrix)
    elif algorithm == 'ward':
        clustering = hierarchy.ward(sim_matrix)
        k = get_k(clustering, 20)
        cluster_assignments = hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    return cluster_assignments
def HierarchicalClustering(self, data):
    distances = nx.to_numpy_matrix(data)
    hierarchy = linkage(distances)
    print(hierarchy, "HIERARCHY")
    Z = dendrogram(hierarchy)
    print(Z)
    return hierarchy
def main():
    D = 2  # so we can visualize it more easily
    s = 4  # separation so we can control how far apart the means are
    mu1 = np.array([0, 0])
    mu2 = np.array([s, s])
    mu3 = np.array([0, s])

    N = 900  # number of samples
    X = np.zeros((N, D))
    X[:300, :] = np.random.randn(300, D) + mu1
    X[300:600, :] = np.random.randn(300, D) + mu2
    X[600:, :] = np.random.randn(300, D) + mu3

    Z = linkage(X, 'ward')
    print("Z.shape:", Z.shape)
    # Z has the format [idx1, idx2, dist, sample_count]
    # therefore, its size will be (N-1, 4)

    plt.title("Ward")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'single')
    plt.title("Single")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'complete')
    plt.title("Complete")
    dendrogram(Z)
    plt.show()
def single_silhouette_dendrogram(dist_matrix, Z, threshold, mode='clusters',
                                 method='single', sample_names=None):
    """Compute the average silhouette at a given threshold.

    Parameters
    ----------
    dist_matrix : array-like
        Precomputed distance matrix between points.
    Z : array-like
        Linkage matrix, result of scipy.cluster.hierarchy.linkage.
    threshold : float
        Specifies where to cut the dendrogram.
    mode : ('clusters', 'thresholds'), optional
        Choose what to visualise on the x-axis.

    Returns
    -------
    x : float
        Based on mode, it contains the number of clusters or the threshold.
    silhouette_avg : float
        The average silhouette.
    """
    cluster_labels = fcluster(Z, threshold, 'distance')
    nclusts = np.unique(cluster_labels).shape[0]

    save_results_clusters("res_{}_{:03d}_clust.csv".format(method, nclusts),
                          sample_names, cluster_labels)

    try:
        silhouette_list = silhouette_samples(dist_matrix, cluster_labels,
                                             metric="precomputed")
        silhouette_avg = np.mean(silhouette_list)
        x = max(cluster_labels) if mode == 'clusters' else threshold
    except ValueError:
        if max(cluster_labels) == 1:
            x = 1 if mode == 'clusters' else threshold
            silhouette_avg = 0
        else:
            raise
    return x, silhouette_avg
def hierarchical_ordering_indices(columns, correlation_matrix):
    """Return array with hierarchical cluster ordering of columns

    Parameters
    ----------
    columns: iterable of str
        Names of columns.
    correlation_matrix: np.ndarray
        Matrix of correlation coefficients between columns.

    Returns
    -------
    indices: iterable of int
        Indices with order of columns
    """
    if len(columns) > 2:
        pairwise_dists = distance.pdist(
            np.where(np.isnan(correlation_matrix), 0, correlation_matrix),
            metric='euclidean')
        linkage = hierarchy.linkage(pairwise_dists, method='average')
        dendrogram = hierarchy.dendrogram(
            linkage, no_plot=True, color_threshold=-np.inf)
        idx = dendrogram['leaves']
    else:
        idx = list(range(len(columns)))
    return idx
def anglesCluster(angles):
    # this function uses hierarchical clustering to find the clusters
    # of a set of angle values
    Dists = dist.pdist(angles, angleDiff)
    # Dists is already a condensed distance vector, so linkage needs no metric
    linkageMatrix = hier.linkage(Dists)
    C = hier.fcluster(linkageMatrix, 5, 'maxclust')
    return C
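angleDiff itself is not shown in this snippet; a plausible stand-in (our assumption, not the project's actual implementation) would be a circular distance on angles in degrees:

import numpy as np

def angle_diff(u, v):
    # hypothetical circular distance between two angles given in degrees;
    # pdist passes each pair of rows as the 1-D arrays u and v
    d = abs(float(u[0]) - float(v[0])) % 360.0
    return min(d, 360.0 - d)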
def clustering(partSeqList, partData):
    # Invoke the clustering method from the library
    print('clustering for the separated dataset')
    #simiMatrix = simiMatrixCal(partData)
    data_dist = pdist(partData, metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering'
          % (len(np.unique(clusterLabels))))
    clusNum = len(set(clusterLabels))
    instIndexPerClus = [[] for i in range(clusNum)]  # initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i] - 1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels, instIndexPerClus
def clustering(partSeqList, partData):
    # Invoke the clustering method from the library
    print('clustering for the separated dataset')
    data_dist = pdist(partData, metric=distCalculate)
    Z = linkage(data_dist, 'complete')
    clusterLabels = fcluster(Z, para['max_d'], criterion='distance')
    print('there are altogether %d clusters in this initial clustering'
          % (len(np.unique(clusterLabels))))
    clusNum = len(set(clusterLabels))
    instIndexPerClus = [[] for i in range(clusNum)]  # initialization
    for i in range(len(clusterLabels)):
        lab = clusterLabels[i] - 1
        instIndexPerClus[lab].append(partSeqList[i])
    return clusterLabels, instIndexPerClus
def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in
    the transactions. For clustering, the similarity ratio as given by
    fuzzywuzzy.ratio is used as the distance measure.

    Input:
        transactions: list of tuples representing items on each transaction
        match: minimum similarity ratio (0 to 100) for clustering

    Output:
        transactions: new version of the transactions, where each item has
            been replaced by the first item on its corresponding cluster
        word_clusters: dictionary that maps the cluster for each item in
            the transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)

    # Levenshtein.ratio returns a similarity in [0, 1]; scale it to the
    # 0-100 range described above before turning it into a distance
    l = [((a, b), 100 - 100 * Levenshtein.ratio(str(a), str(b)))
         for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')
    clusters_index = fcluster(r, 100 - match, "distance")

    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]

    word_clusters = {word: clusters[clusters_index[i]]
                     for i, word in enumerate(words)}

    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set([word_clusters[word][0] for word in transaction]))
        new_transactions.append(new_transaction)

    return new_transactions, word_clusters
def get_subtrees0(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
    """Recomputing linkage is slower than pruning the tree for large matrices"""
    maxtdist = 0
    i = 0
    subtrees = []
    for i in range(1, nchrom):
        Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
        names = get_names(bin_chr, bin_position)
        t = distance_matrix2tree(Z, names)
        #tree = sch.to_tree(Z, False); t = ete3.Tree(getNewick(tree, "", tree.dist, names))
        # get longest branch
        n, bestdist = get_longest(t)
        tname, tdist = t.get_farthest_leaf()
        if maxtdist < tdist:
            maxtdist = t.get_farthest_leaf()[1]
        # break if small subtree
        if tdist / maxtdist < 1.1 * bestdist / tdist or tdist < maxtdist * distfrac:
            break
        pruned = n.get_leaf_names()
        subtrees.append(pruned)
        c = Counter(get_chr_name(n) for n in pruned)
        print(i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5))
        t = truncate(t, maxd=5)
        t.render('tree_%s.pdf' % i)
        # prune array
        indices = np.array([False if name in set(pruned) else True
                            for _i, name in enumerate(names)])
        d = d[indices, :]
        d = d[:, indices]
        bin_chr = bin_chr[indices]
        bin_position = bin_position[indices, :]

    if i:
        subtrees.append(t.get_leaf_names())
        pruned = t.get_leaf_names()
        c = Counter(get_chr_name(n) for n in pruned)
        print(i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5))
    return subtrees
def array2scaffolds(outbase, infile, infile2, fasta, threads, minWindows, scaffolds=[]):
    """Return scaffolds computed for given matrix"""
    logger("Loading FastA...")
    faidx = FastaIndex(fasta)
    logger(" %s bp in %s contigs" % (faidx.genomeSize, len(faidx)))

    logger("Loading matrix from %s ..." % infile)
    d, bin_chr, bin_position, contig2size = load_matrix(infile, scaffolds=scaffolds, verbose=1)
    logger(" matrix of %s windows for %s contigs summing %s bp"
           % (d.shape[0], len(contig2size), sum(contig2size.values())))
    # make sure all contigs from matrix are present in FastA
    diff = set(contig2size.keys()).difference(faidx)
    if diff:
        sys.stderr.write("[ERROR] %s / %s contigs are missing from provided FastA!\n"
                         % (len(diff), len(contig2size)))
        sys.exit(1)

    logger("Calculating linkage matrix & tree...")
    names = ["%s %7sk" % (get_name(c), s/1000) for c, (s, e) in zip(bin_chr, bin_position)]
    d = transform(d)
    t = array2tree(d, names, outbase)
    del d

    logger("Assigning contigs to clusters/scaffolds...")
    clusters = get_clusters(outbase, t, contig2size, bin_chr)

    logger("Constructing scaffolds...")
    if threads > 1:
        scaffolds = clusters2scaffolds_multi(clusters, infile2, minWindows, scaffolds, threads)
    else:
        scaffolds = clusters2scaffolds(clusters, infile2, minWindows, scaffolds)

    logger("Reporting %s scaffolds..." % len(scaffolds))
    fastafn = report_scaffolds(outbase, infile, scaffolds, faidx)
    return scaffolds, fastafn
def chc(X, K, params=()):
    pnames = ['linkage_method', 'distance']
    dflts = ['ward', 'euclidean']
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)

    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')
    return clustVec2partMat(C, K)

# Other related functions
def cluster_helices(helices, cluster_distance=12.0):
    """Clusters helices according to the minimum distance between the line
    segments representing their backbone.

    Notes
    -----
    Each helix is represented as a line segment joining the CA of its first
    Residue to the CA of its final Residue. The minimal distance between
    pairwise line segments is calculated and stored in a
    condensed_distance_matrix. This is clustered using the 'single' linkage
    metric (all members of cluster i are at < cluster_distance away from at
    least one other member of cluster i). Helices belonging to the same
    cluster are grouped together as values of the returned cluster_dict.

    Parameters
    ----------
    helices: Assembly
    cluster_distance: float

    Returns
    -------
    cluster_dict: dict
        Keys: int
            cluster number
        Values: [Polymer]
    """
    condensed_distance_matrix = []
    for h1, h2 in itertools.combinations(helices, 2):
        md = minimal_distance_between_lines(h1[0]['CA']._vector, h1[-1]['CA']._vector,
                                            h2[0]['CA']._vector, h2[-1]['CA']._vector,
                                            segments=True)
        condensed_distance_matrix.append(md)
    z = linkage(condensed_distance_matrix, method='single')
    clusters = fcluster(z, t=cluster_distance, criterion='distance')
    cluster_dict = {}
    for h, k in zip(helices, clusters):
        if k not in cluster_dict:
            cluster_dict[k] = [h]
        else:
            cluster_dict[k].append(h)
    return cluster_dict
def make_multi_hierarch_cluster_figs(model, field_input):
    def make_multi_cat_clust_fig(cats, freq_thr=500):  # TODO make into config
        """
        Returns fig showing hierarchical clustering of probes from multiple
        categories
        """
        start = time.time()
        sns.set_style('white')
        # make cat_acts_mat
        acts_mats = []
        cats_probe_list = []
        for cat in cats:
            bool_index = [True if sum(model.term_doc_freq_dict[probe]) > freq_thr else False
                          for probe in model.probe_store.cat_probe_list_dict[cat]]
            cat_probe_acts_df = model.get_single_cat_acts_df(cat)
            filtered_cat_probes_acts_mat = cat_probe_acts_df[bool_index].values
            acts_mats.append(filtered_cat_probes_acts_mat)
            cats_probe_list += [model.probe_store.probe_set[probe_id]
                                for probe_id in cat_probe_acts_df[bool_index].index.tolist()]
        cat_acts_mat = np.vstack(acts_mats)
        # fig
        rcParams['lines.linewidth'] = 2.0
        fig, ax = plt.subplots(figsize=(FigsConfigs.MAX_FIG_WIDTH, 5 * len(cats)),
                               dpi=FigsConfigs.DPI)
        # dendrogram
        dist_matrix = pdist(cat_acts_mat, 'euclidean')
        linkages = linkage(dist_matrix, method='complete')
        dendrogram(linkages, ax=ax, labels=cats_probe_list, orientation='right',
                   leaf_font_size=10)
        ax.tick_params(axis='both', which='both', top='off', right='off', left='off')
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        print('{} completed in {:.1f} secs'.format(sys._getframe().f_code.co_name,
                                                   time.time() - start))
        return fig

    figs = [make_multi_cat_clust_fig(field_input)]
    return figs
def test_height_linkage_tree():
    # Check that the height of the results of linkage tree is sorted.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for linkage_func in _TREE_BUILDERS.values():
        children, n_nodes, n_leaves, parent = linkage_func(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)