Python scipy.cluster.hierarchy 模块,ward() 实例源码


项目:indefinite-pronouns    作者:dnrb    | 项目源码 | 文件源码
def _get_parameter(parameters, prefix, default, convert=str):
    """
    Return the value of the first "name:value" entry whose text starts with
    prefix, passed through convert; return default if there is no such entry.

    Entries without a ':' separator are skipped instead of raising IndexError.
    """
    for entry in parameters:
        if entry.startswith(prefix) and ':' in entry:
            return convert(entry.split(':')[1])
    return default


def get_cluster_assignments(sim_matrix, parameters):
    """
    (np.array, list of str) -> array of int
    sim_matrix: list of list of float -- similarity matrix between exemplars
    parameters: list of parameters in the format ["method:method_name",
            "algo:algo_name", "k:num_clusters", "damping:damping"]
            where order doesn't matter; entries with other names are ignored.

            algo    -- one of 'hierarchical', 'kmeans', 'ap', 'ward'
                       (default 'ap')
            method  -- linkage method, e.g. 'single', 'complete', 'average';
                       only used when algo is 'hierarchical' (default 'single')
            k       -- number of clusters, any integer <= the data length;
                       only used when algo is 'kmeans' (default 8)
            damping -- damping factor in [0.5, 1); only used when algo is 'ap'
                       (default 0.5)

    Returns the cluster labels as produced by the chosen algorithm. The label
    at each index corresponds to the cluster number of the exemplar at the
    same index in sim_matrix. For 'hierarchical' and 'ward' the number of
    clusters is chosen by get_k(clustering, 20), not by the "k" parameter, and
    the fcluster labels are shifted down by one.

    Raises ValueError if algo names an unknown algorithm.
    """
    algorithm = _get_parameter(parameters, 'algo', 'ap')
    method = _get_parameter(parameters, 'method', 'single')
    # Match "k:" exactly so that other entries starting with "k" are not
    # mistaken for the cluster count.
    n_clusters = _get_parameter(parameters, 'k:', 8, int)
    damping = _get_parameter(parameters, 'damping', 0.5, float)

    if algorithm in ('hierarchical', 'ward'):
        if algorithm == 'ward':
            clustering = hierarchy.ward(sim_matrix)
        else:
            clustering = hierarchy.linkage(sim_matrix, method)
        k = get_k(clustering, 20)
        return hierarchy.fcluster(clustering, k, criterion='maxclust') - 1
    if algorithm == 'kmeans':
        return KMeans(n_clusters=n_clusters).fit_predict(sim_matrix)
    if algorithm == 'ap':
        return AffinityPropagation(damping=damping).fit_predict(sim_matrix)
    raise ValueError('unknown clustering algorithm: %r' % algorithm)
项目:indefinite-pronouns    作者:dnrb    | 项目源码 | 文件源码
def comparative_exp():
    """
    Runs a series of clustering experiments for different parameter settings.

    Reads the data set path from sys.argv[1] and the stemming dictionary path
    from sys.argv[2]. For each ontological category ('thing', 'body') and each
    clustering configuration, prints the result of evaluate_clustering and
    then the confusion matrix against the annotations of that category.
    """
    data_path = sys.argv[1]    # path to data set
    stem_dict_path = sys.argv[2]    # path to stemming dictionary
    parameters = ['SPLIT', 'noUF']
    d = data(data_path, stem_dict_path, parameters)
    # (algorithm, linkage method, k) triples: hierarchical is run once per
    # linkage method, ward and ap once each, kmeans once per k in 2..10.
    clustering_algos = (
        [('hierarchical', m, None) for m in ('complete', 'average', 'single')]
        + [('ward', None, None), ('ap', None, None)]
        + [('kmeans', None, k) for k in range(2, 11)]
    )
    for onto_cat in ['thing', 'body']:
        parameters_i = parameters + ['onto:%s' % onto_cat]
        # row indices of the exemplars belonging to this ontological category
        oix = sorted(set(np.where(d.ontological == onto_cat)[0]))
        similarity_matrix = get_similarity_matrix(d, parameters_i, oix, association = 'associated')
        for a, m, k in clustering_algos:
            parameters_j = parameters_i + ['algo:%s' % a]
            if m is not None:
                parameters_j.append('method:%s' % m)
            if k is not None:
                parameters_j.append('k:%r' % k)
            cluster_assignments = get_cluster_assignments(similarity_matrix, parameters_j)
            print(evaluate_clustering(cluster_assignments, d.annotation[oix]))
            print_confusion_matrix(cluster_assignments, d.annotation[oix])
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def main():
    """
    Build the country data set, cluster it with ward() and draw the resulting
    dendrogram with leaves labelled by country name.
    """
    countries = dictdata(getCountrydict())
    linkage_matrix = ward(countries.getData())
    dendrogram(
        linkage_matrix,
        labels=countries.getName(),
        orientation='left',
        leaf_font_size=10,
    )
项目:text-analytics-with-python    作者:dipanjanS    | 项目源码 | 文件源码
def ward_hierarchical_clustering(feature_matrix):
    """
    Return the linkage matrix produced by ward() for feature_matrix.

    The input handed to ward() is the cosine distance between rows, computed
    as one minus their cosine similarity.
    """
    similarity = cosine_similarity(feature_matrix)
    return ward(1 - similarity)
项目:text-analytics-with-python    作者:dipanjanS    | 项目源码 | 文件源码
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12)):
    """
    Draw a left-oriented dendrogram for linkage_matrix and save it to
    'ward_hierachical_clusters.png' at 200 dpi.

    linkage_matrix: linkage matrix passed straight to dendrogram()
    movie_data: table-like object whose 'Title' column supplies the leaf labels
    figure_size: (width, height) handed to plt.subplots as figsize
    """
    # create the figure at the requested size; dendrogram() draws on it
    plt.subplots(figsize=figure_size)
    movie_titles = movie_data['Title'].values.tolist()
    # plot dendrogram
    dendrogram(linkage_matrix, orientation="left", labels=movie_titles)
    # NOTE(review): this call was truncated after axis='x' in the available
    # listing; restore the remaining tick options if the original is at hand.
    plt.tick_params(axis='x')
    plt.savefig('ward_hierachical_clusters.png', dpi=200)

# build ward's linkage matrix