Python scipy.cluster.hierarchy 模块，linkage() 实例源码

我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用scipy.cluster.hierarchy.linkage()。

项目：dynamicTreeCut 作者：kylessmith | 项目源码 | 文件源码

def test_cuttreeHybrid():
    """Check the labels produced by ``cutreeHybrid`` on a fixed data set.

    The data set is the transpose of ``arange(1, 10001)`` reshaped to
    100 x 100, clustered with average linkage on Euclidean distances.
    """
    from dynamicTreeCut import cutreeHybrid

    d = np.transpose(np.arange(1, 10001).reshape(100, 100))
    distances = pdist(d, "euclidean")
    link = linkage(distances, "average")
    test = cutreeHybrid(link, distances)

    # Expected labels: 32 observations in cluster 2, 32 in cluster 3 and
    # the remaining 36 in cluster 1 (100 in total).
    true = [2] * 32 + [3] * 32 + [1] * 36

    assert (test['labels'] == true).all()
    # The former trailing ``assert False`` made this test fail regardless
    # of the result above, so it has been removed.

项目：icing 作者：slipguru | 项目源码 | 文件源码

def formClusters(dists, link, distance):
    """Form flat clusters by hierarchical clustering of a distance matrix.

    :param dists: pairwise distances, either as a square symmetric matrix
        or as an already condensed 1-D distance vector
    :param link: linkage type for hierarchical clustering
    :param distance: distance at which to cut into clusters
    :return: array of cluster assignments, one per observation
    """
    import numpy as np

    arr = np.asarray(dists)
    # ``linkage`` interprets a 2-D input as raw observations, so it must be
    # given the condensed (1-D) form. ``squareform`` toggles between the two
    # representations; only apply it when the input is actually square,
    # otherwise a condensed vector would be expanded into a square matrix
    # and clustered as if its rows were observation vectors.
    condensed = squareform(arr) if arr.ndim == 2 else arr

    # Compute linkage
    links = linkage(condensed, link)

    # Break into clusters based on cutoff
    return fcluster(links, distance, criterion='distance')

项目：TTClust 作者：tubiana | 项目源码 | 文件源码

def generate_graphs(clusters_list, output, size, linkage, cutoff, distances):
    """
    DESCRIPTION
    Produce the graphs for a clustering run by delegating to the plotting
    helpers: barplot, dendrogram, histogram and, for fewer than 10000
    frames, the distance-matrix image.
    Args:
        clusters_list (list): list of cluster number per frame
        output (string): output name for graph
        size: forwarded unchanged to plot_barplot
        linkage: forwarded unchanged to plot_dendro
        cutoff: forwarded unchanged to plot_dendro
        distances: object with a ``shape``; its first dimension is taken as
            the number of frames
    Return:
        colors_list (list) as returned by plot_barplot
    """
    colors_list = plot_barplot(clusters_list, output, size)
    plot_dendro(linkage, output, cutoff, colors_list, clusters_list)
    plot_hist(clusters_list, output, colors_list)

    frame_count = distances.shape[0]
    if frame_count >= 10000:
        # Skip the distance-matrix image for large inputs.
        printScreenLogfile("Too many frames! The RMSD distance matrix will not be generated")
    else:
        implot(distances, output)
    return colors_list

项目：eTraGo 作者：openego | 项目源码 | 文件源码

def linkage(df, n_groups):
    """Build a Ward linkage matrix from Frobenius distances between groups.

    Group ``k`` (1-based) is read as ``df.loc[k].values``; the distance
    between two groups A and B is the Frobenius norm ``|A - B|_F``.

    :param df: dataframe indexed so that ``df.loc[1] .. df.loc[n_groups]``
        each yield a 2-D block (``ord='fro'`` requires matrices)
    :param n_groups: number of groups to compare
    :return: linkage matrix as returned by ``hac.linkage``
    """
    from itertools import combinations

    # Look every group up once instead of once per pair.
    groups = [df.loc[k + 1].values for k in range(n_groups)]

    # The distance matrix is symmetric and only its strict upper triangle
    # is needed, so build the condensed vector directly. ``combinations``
    # yields pairs in the same row-major order as ``np.triu_indices(n, 1)``.
    y = np.array(
        [norm(a - b, ord='fro') for a, b in combinations(groups, 2)],
        dtype=float,
    )

    # Create linkage matrix with Ward's algorithm. ``y`` is already a
    # condensed distance vector, so ``metric`` has no influence here; it is
    # kept to leave the call unchanged.
    Z = hac.linkage(y, method='ward', metric='euclidean')
    return Z

项目：South-African-Heart-Disease-data-analysis-using-python 作者：khushi4tiwari | 项目源码 | 文件源码

def hierarchicalClustering(X, y, Maxclust, C, Method='single', Metric='euclidean'):
    """Cluster the rows of X hierarchically and plot clusters and dendrogram.

    X is passed to ``linkage`` with the given Method and Metric; the tree is
    cut into at most Maxclust flat clusters. The assignments (as a column
    vector) are handed to ``clusterPlot`` together with Maxclust, C and y,
    and a level-truncated dendrogram is shown afterwards.
    """
    Z = linkage(X, metric=Metric, method=Method)

    # Flat cluster assignment limited to Maxclust clusters
    cls = fcluster(Z, t=Maxclust, criterion='maxclust')
    cls_column = cls.reshape(-1, 1)

    figure()
    clusterPlot(X, cls_column, Maxclust, C, y=y)

    # Dendrogram, truncated to a fixed number of levels and coloured below
    # half of the largest merge distance
    max_display_levels = 7
    figure()
    colour_cut = 0.5 * np.max(Z[:, 2])
    dendrogram(Z, truncate_mode='level', p=max_display_levels,
               color_threshold=colour_cut)
    title("Dendrgram of the Hierarchical Clustering")
    show()

项目：HiCembler 作者：lpryszcz | 项目源码 | 文件源码

def array2tree(d, names, outbase="", method="ward"):
    """Return tree representation for array"""
    # cluster
    Z = sch.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)

    # get ete Tree
    t = distance_matrix2tree(Z, names)

    # save tree & newick
    if outbase:
        pdf, nw = outbase+".nw.pdf", outbase+".nw"
        with open(nw, "w") as out:
            out.write(t.write())

        ts = ete3.TreeStyle()
        ts.show_leaf_name = False
        ts.layout_fn = mylayout
        t.render(pdf, tree_style=ts)

    return t

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_linkage_misc():
    """Miscellaneous checks on linkage handling."""
    prng = np.random.RandomState(42)
    X = prng.normal(size=(5, 5))

    # Unknown linkage names and a wrongly shaped connectivity are rejected
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # A precomputed cosine distance matrix must give the same children as
    # asking for the cosine affinity directly
    precomputed = cosine_distances(X)
    from_precomputed = linkage_tree(precomputed, affinity="precomputed")
    assert_array_equal(from_precomputed[0],
                       linkage_tree(X, affinity="cosine")[0])

    # A callable affinity must give the same children as its string name
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(from_callable[0],
                       linkage_tree(X, affinity="manhattan")[0])

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_structured_linkage_tree():
    """Check the node counts and error handling of structured linkage trees."""
    rng = np.random.RandomState(0)
    # ``np.bool`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``bool`` is the equivalent dtype.
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    # Only the shape of the mask is used to build the grid connectivity.
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)
        # Check that the builder raises a ValueError with a connectivity
        # matrix of the wrong shape
        assert_raises(ValueError,
                      tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError,
                      tree_builder, X.T[:0], connectivity)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_unstructured_linkage_tree():
    """Check the node counts of unstructured linkage trees.

    ``ward_tree`` is exercised first, followed by every registered tree
    builder, each on the full matrix and on a single row.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    expected_nodes = 2 * X.shape[1] - 1

    builders = [ward_tree] + list(_TREE_BUILDERS.values())
    for builder in builders:
        for this_X in (X, X[0]):
            # A number of clusters is specified only to raise the warning
            # and exercise the warning code path
            with ignore_warnings():
                children, _, n_leaves, _ = assert_warns(
                    UserWarning, builder, this_X.T, n_clusters=10)
            assert_equal(len(children) + n_leaves, expected_nodes)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_scikit_vs_scipy():
    """Compare scikit linkage with full connectivity against scipy.

    For each registered tree builder, five random data sets are clustered
    with both implementations and the cuts into ``k`` clusters must agree.
    """
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # ``np.int`` was only an alias of the builtin and has been
            # removed from NumPy; the builtin ``int`` is equivalent.
            children_ = out[:, :2].astype(int)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut (uses the tree from the last iteration)
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)

项目：ccCluster 作者：gsantoni | 项目源码 | 文件源码

def tree(self):
    """Build and return the complete-linkage tree for ``self.ccTable``.

    Each non-empty entry of ``self.ccTable`` is read as ``(i, j, value)``
    and stored symmetrically in a ``Dimension x Dimension`` matrix; pairs
    that never appear keep the value 0. The strict upper triangle of that
    matrix is the condensed distance vector given to
    ``hierarchy.linkage``. The result is also stored in ``self.Tree``.
    """
    size = self.Dimension
    matrix = np.zeros((size, size))

    for line in self.ccTable:
        # Compare the length by value: ``is not 0`` tests object identity
        # and only worked by accident of small-integer caching.
        if line is not None and len(line) != 0:
            matrix[line[0], line[1]] = line[2]
            matrix[line[1], line[0]] = line[2]

    # Row-major strict upper triangle, i.e. the condensed distance vector.
    distances = matrix[np.triu_indices(size, 1)]
    self.Tree = hierarchy.linkage(distances, 'complete')

    return self.Tree

#new function, chose the average linkage

项目：ccCluster 作者：gsantoni | 项目源码 | 文件源码

def avgTree(self):
    """Build and return the average-linkage tree for ``self.ccTable``.

    Each non-empty entry of ``self.ccTable`` is read as ``(i, j, value)``
    and stored symmetrically in a ``Dimension x Dimension`` matrix; pairs
    that never appear keep the value 0. The strict upper triangle of that
    matrix is the condensed distance vector given to
    ``hierarchy.linkage``. The result is also stored in ``self.Tree``.
    """
    size = self.Dimension
    matrix = np.zeros((size, size))

    for line in self.ccTable:
        # Compare the length by value: ``is not 0`` tests object identity
        # and only worked by accident of small-integer caching.
        if line is not None and len(line) != 0:
            matrix[line[0], line[1]] = line[2]
            matrix[line[1], line[0]] = line[2]

    # Row-major strict upper triangle, i.e. the condensed distance vector.
    distances = matrix[np.triu_indices(size, 1)]
    self.Tree = hierarchy.linkage(distances, 'average')

    return self.Tree

#Function added to plot the dendrogram in shell mode only.
#Still not functioning.
#Uncomment when needed.

项目：rca-evaluation 作者：sieve-microservices | 项目源码 | 文件源码

def cluster_words(words, service_name, size):
    """Cluster words by Jaro distance after stripping common substrings.

    A fixed list of substrings plus service_name is removed from every
    word; the pairwise distance is ``1 - jaro_distance`` of the cleaned
    words. The linkage of those distances is passed to ``cluster_of_size``
    together with size, and its result is returned.
    """
    stopwords = ["GET", "POST", "total", "http-requests", service_name, "-", "_"]

    def strip_stopwords(word):
        # Remove the stopwords one after another, in list order
        for stopword in stopwords:
            word = word.replace(stopword, "")
        return word

    cleaned_words = [strip_stopwords(word) for word in words]

    def distance(coord):
        first, second = coord
        return 1 - jaro_distance(cleaned_words[first], cleaned_words[second])

    # One (i, j) column per pair of the strict upper triangle
    pair_indices = np.triu_indices(len(words), 1)
    condensed = np.apply_along_axis(distance, 0, pair_indices)
    return cluster_of_size(linkage(condensed), size)

项目：IgDiscover 作者：NBISweden | 项目源码 | 文件源码

def cluster_sequences(sequences, minsize=5):
    """
    Group the given sequences into clusters of similar sequences.

    Returns a triple: a pandas.DataFrame built from the distance matrix,
    the linkage result, and a list with one cluster id per sequence. An
    entry of zero in that list means the sequence was not assigned to any
    cluster.
    """
    matrix = distances(sequences)
    condensed = distance.squareform(matrix)
    link_matrix = hierarchy.linkage(condensed, method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(link_matrix))
    prev = link_matrix[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    next_cluster = 1
    for node in inner:
        starts_cluster = (
            node.dist > 0
            and prev / node.dist < 0.8
            and node.left.count >= minsize
            and node.right.count >= minsize
        )
        if starts_cluster:
            for seq_id in collect_ids(node.left):
                # Keep ids that were assigned earlier
                if clusters[seq_id] == 0:
                    clusters[seq_id] = next_cluster
            next_cluster += 1
        prev = node.dist
    # The rightmost subtree is not processed by the loop above. Per the
    # original authors' experiments it never contains true novel sequences,
    # so it is omitted.

    return pd.DataFrame(matrix), link_matrix, clusters

项目：polo 作者：adrianveres | 项目源码 | 文件源码

def get_cell_data(n=50, seed=0):
    """Load the cell data and build a Ward linkage on a random sample of rows.

    Seeds NumPy's global random state with seed, loads
    './data/cells_data.npy', draws n distinct row indices and returns
    ``(cells_data, Z, D)`` where D holds the condensed Euclidean distances
    of the sampled rows and Z is their Ward linkage.
    """
    np.random.seed(seed)
    cells_data = np.load('./data/cells_data.npy')

    n_rows = cells_data.shape[0]
    sample_cells = np.random.choice(n_rows, n, replace=False)
    sampled = cells_data[sample_cells, :]

    D = pdist(sampled, 'euclidean')
    return cells_data, linkage(D, 'ward'), D

项目：polo 作者：adrianveres | 项目源码 | 文件源码

def get_random_data(n=50, seed=0):
    """Return n distinct random integers below 10000 with their Ward linkage.

    Seeds NumPy's global random state with seed and returns
    ``(data, Z, D)``: data has shape (n, 1), D holds the condensed
    Euclidean distances and Z is the Ward linkage of D.
    """
    np.random.seed(seed)
    data = np.random.choice(10000, size=(n, 1), replace=False)
    D = pdist(data, metric='euclidean')
    return data, linkage(D, method='ward'), D

项目：aesop 作者：BioMoDeL | 项目源码 | 文件源码

def plotDend(esd, filename=None):
    """Summary
    Draw an electrostatic similarity dendrogram from ``esd.esd``, labelled
    with ``esd.ids``.

    Parameters
    ----------
    esd : ElecSimilarity class
        Object providing the ``esd`` matrix and the ``ids`` labels.
    filename : str, optional
        If given, the figure is saved to this path. Otherwise nothing is
        written to disk.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(sharey=True)
    Z = cluster.linkage(esd.esd)

    dendrogram_options = dict(
        labels=esd.ids,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
        ax=ax,
    )
    cluster.dendrogram(Z, **dendrogram_options)

    plt.xlabel('Variants')
    plt.ylabel('ESD')
    plt.tight_layout()

    if filename is None:
        return
    fig.savefig(filename)

项目：deeppavlov 作者：deepmipt | 项目源码 | 文件源码

def build_clusters(predicted_scores, method='centroid'):
    """Agglomerative clustering with predicted scores used as distances.

    For every document with a non-empty score matrix, the distance between
    mentions i and j is the mean of ``scores[i, j]`` and ``scores[j, i]``.
    Documents with empty scores are skipped, so the returned list may be
    shorter than predicted_scores.

    NOTE(review): a document with exactly one mention produces an empty
    distance list -- confirm that ``linkage`` is never called that way.

    Args:
        predicted_scores: predicted scores for all mentions in documents
        method: linkage method, see the scipy.cluster.hierarchy.linkage
            documentation

    Returns:
        list of linkage matrices, and the smallest and largest merge
        distance seen across them (starting from 1e10 and 0 respectively)

    """
    print('building clusters')
    min_score = 1e10
    max_score = 0
    clustering = []
    for doc_id in tqdm(range(len(predicted_scores))):
        scores = predicted_scores[doc_id]
        if len(scores) == 0:
            continue
        n_mentions = len(scores)
        # Condensed vector of symmetrised scores, upper triangle row by row
        distances = [
            (scores[i, j] + scores[j, i]) / 2
            for i in range(n_mentions)
            for j in range(i + 1, n_mentions)
        ]
        c = linkage(distances, method=method)
        clustering.append(c)
        merge_heights = c[:, 2]
        min_score = min(min(merge_heights), min_score)
        max_score = max(max(merge_heights), max_score)
    print('clusters are built: min_score: {} max_score: {}'.format(min_score, max_score))
    return clustering, min_score, max_score

项目：mitre 作者：gerberlab | 项目源码 | 文件源码

def tree_from_linkage_matrix(linkage, leaf_labels):
    """ Form an ete3.Tree from a hierarchical linkage matrix.

    linkage should be the matrix returned by hierarchy.linkage.
    leaf_labels should be a vector of names for the nodes
    corresponding to the clustered items. Internal nodes will be
    named node0, node1, etc., in the order in which the
    clusters they represent were formed.

    NOTE(review): only this docstring is present in the excerpt; the
    statements above are the original author's description and could not
    be checked against an implementation.

    returns: new Tree

    """

项目：mitre 作者：gerberlab | 项目源码 | 文件源码

def cluster(target_sequence_ids, fasta_filename, method='average'):
    """ Distance-based hierarchical clustering of sequences.

    Each entry of target_sequence_ids is looked up in the sequences read
    from fasta_filename.

    Hamming distance is not used because the sequences may differ in
    length (mostly by small amounts). Instead each pair is scored by
    pairwise global alignment: 0 for a match, -1 for a mismatch and -1.5
    for opening or extending a gap. The distance is -1.0*(score).

    method is passed to the linkage routine; 'average' (UPGMA) is the
    default.

    Returns the N x N distance array, of which only the entries above the
    diagonal are filled in, and the linkage matrix returned by the
    clustering routine.

    """
    def distance(seq1, seq2):
        # globalms arguments: seq1, seq2, match, mismatch, open, extend
        score = pairwise2.align.globalms(
            seq1, seq2, 0, -1, -1.5, -1.5, score_only=True
        )
        return -1.0 * score

    sequences = fasta_to_dict(fasta_filename)
    N = len(target_sequence_ids)
    distances = np.zeros((N, N))
    # fill in the upper triangle
    for i, seqid1 in enumerate(target_sequence_ids):
        seq1 = sequences[seqid1]
        later_ids = target_sequence_ids[i + 1:]
        for j, seqid2 in enumerate(later_ids, start=i + 1):
            distances[i, j] = distance(seq1, sequences[seqid2])
    # convert to the condensed form expected by the scipy clustering
    # routines; checks are disabled because the array is not symmetric
    y = squareform(distances, checks=False)
    return distances, hierarchy.linkage(y, method)

项目：pysciencedock 作者：Kitware | 项目源码 | 文件源码

def hierarchy(data, axis, method, metric):
    """Return the linkage of data as a DataFrame.

    Rows are clustered unless axis is 'columns', in which case data is
    transposed first. The result has the columns 'child1', 'child2',
    'distance' and 'size' ('distance' stays float, the others are cast to
    int) and is indexed by n .. 2n-2, where n is the number of clustered
    items.
    """
    frame = data.transpose() if axis == 'columns' else data
    n_items = len(frame.index)
    result = pd.DataFrame(
        linkage(frame, method=method, metric=metric),
        index=range(n_items, 2 * n_items - 1),
        columns=['child1', 'child2', 'distance', 'size'])
    return result.astype({'child1': int, 'child2': int, 'size': int})

项目：bkheatmap 作者：wwliao | 项目源码 | 文件源码

def cluster(df, metric="euclidean", method="single", row=True, column=True):
    row_linkmat, col_linkmat = None, None
    if row:
        distmat = dist.pdist(df, metric)
        row_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]
    if column:
        df = df.T
        distmat = dist.pdist(df, metric)
        col_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(col_linkmat), :].T
    return df, row_linkmat, col_linkmat

项目：word2vec_pipeline 作者：NIHOPA | 项目源码 | 文件源码

def docv_centroid_order_idx(meta_clusters):
    """Return the dendrogram leaf order of meta_clusters.

    The full pairwise cosine-distance matrix of the rows is computed and
    handed to average linkage; the leaf order of the resulting dendrogram
    is returned as a list of row indices.

    NOTE(review): the square distance matrix is passed to ``linkage``
    as-is, so its rows are clustered as observation vectors rather than as
    precomputed distances -- confirm this is intended.
    """
    pairwise = cdist(meta_clusters, meta_clusters, metric='cosine')

    # Build the tree, then read the leaf order without plotting
    tree = hierarchy.linkage(pairwise, method='average')
    return hierarchy.dendrogram(tree, no_plot=True)["leaves"]

项目：nd_array 作者：KwatME | 项目源码 | 文件源码

def cluster_2d_array_rows(array_2d,
                          linkage_method='average',
                          distance_function='euclidean'):
    """
    Order the rows of array_2d by hierarchical clustering.
    Arguments:
        array_2d (array): (n_rows, n_columns)
        linkage_method (str): passed as ``method`` to
            scipy.cluster.hierarchy.linkage
        distance_function (str | callable): passed as ``metric`` to
            scipy.cluster.hierarchy.linkage
    Returns:
        array: (n_rows); row indices in dendrogram leaf order
    """

    link_matrix = linkage(
        array_2d, method=linkage_method, metric=distance_function)
    # no_plot=True: only the computed leaf order is needed
    leaf_order = dendrogram(link_matrix, no_plot=True)['leaves']

    return array(leaf_order)