Python sklearn.cluster module: AgglomerativeClustering() example source code

We extracted the following 31 code examples from open-source Python projects to illustrate how to use sklearn.cluster.AgglomerativeClustering().
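
Before the project examples, here is a minimal, self-contained sketch of the basic workflow. It is illustrative only: the make_blobs toy data and the parameter values are assumptions, not taken from any project below.

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs

# Toy data: 150 points scattered around 3 centers.
X, _ = make_blobs(n_samples=150, centers=3, random_state=0)

# Bottom-up (agglomerative) clustering with Ward linkage, the default.
model = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = model.fit_predict(X)  # one cluster label per sample
print(labels[:10])

Unlike KMeans, the fitted model exposes labels_ and the merge tree (children_) but has no cluster_centers_ and no predict() for unseen data, which several of the snippets below work around.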

Project: Python-Machine-Learning-Cookbook    Author: PacktPublishing
def perform_clustering(X, connectivity, title, num_clusters=3, linkage='ward'):
    plt.figure()
    model = AgglomerativeClustering(linkage=linkage, 
                    connectivity=connectivity, n_clusters=num_clusters)
    model.fit(X)

    # extract labels
    labels = model.labels_

    # specify marker shapes for different clusters
    markers = '.vx'

    for i, marker in zip(range(num_clusters), markers):
        # plot the points belonging to the current cluster
        plt.scatter(X[labels==i, 0], X[labels==i, 1], s=50, 
                    marker=marker, color='k', facecolors='none')

    plt.title(title)
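
A hypothetical call of perform_clustering() above. The synthetic data and the kneighbors_graph connectivity are illustrative assumptions, not part of the cookbook code:

import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

# Unconstrained: any pair of clusters may be merged.
perform_clustering(X, connectivity=None, title='Without connectivity')

# Constrained: merges are restricted to the k-nearest-neighbor graph.
knn_graph = kneighbors_graph(X, n_neighbors=10, include_self=False)
perform_clustering(X, connectivity=knn_graph, title='With k-NN connectivity')

plt.show()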
Project: oss-github-analysis-project    Author: itu-oss-project-team
def agglomerative_clustering(self, out_path, pd_data, number_of_clusters):
        headers, repos, features = self.__fetch_data(pd_data)

        agglomerative_clustering = AgglomerativeClustering(n_clusters=number_of_clusters, linkage="complete")
        agglomerative_clustering.fit(features)

        # form clusters
        clusters = []
        for i in range(0, number_of_clusters):  # for each of the k clusters
            repo_list = []
            for j in range(0, len(agglomerative_clustering.labels_)):  # a label for each repo.
                if i == agglomerative_clustering.labels_[j]:  # if repo label is equal to Cluster number
                    repo_list.append(repos[j])  # add repo to cluster i's list.
            clusters.append(repo_list)

        out_file_path = os.path.join(out_path, "agglomerative_noOfClusters" + str(number_of_clusters))
        self.__export_agglomerative_results(agglomerative_clustering, clusters, out_file_path)
Project: ASTRiDE    Author: dwkim78
def __init__(self, edges, branching_factor=50, threshold=0.1):
        # Make features list.
        features = []
        for i in range(len(edges)):
            edge = edges[i]
            features.append([edge['perimeter'], edge['area'],
                             edge['shape_factor'], edge['radius_deviation']])
        features = np.array(features)

        # Scale each feature column to its relative deviation from the median.
        # (The standard deviation is computed but never used here.)
        normed_features = features.copy()
        for i in range(features.shape[1]):
            avg = np.median(features[:, i])
            std = np.std(features[:, i])

            normed_features[:, i] -= avg
            normed_features[:, i] /= avg

        self.features = features
        self.normed_features = normed_features
        self.branching_factor = branching_factor
        self.threshold = threshold
        #self.run(Birch, branching_factor=50, threshold=0.1, n_clusters=2)
        self.run(KMeans, n_clusters=2)
        #self.run(AgglomerativeClustering, n_clusters=2)
Project: image-segmentation    Author: alexlouden
def cluster_ward(self, image_cols):

        # Connectivity
        # TODO optional connectivity
        connectivity = grid_to_graph(*self.image.shape[:2])

        ward = AgglomerativeClustering(
            n_clusters=self.params.num_clusters,
            linkage='ward',
            connectivity=connectivity
        )
        ward.fit(image_cols)

        self.number_of_clusters = len(np.unique(ward.labels_))
        print('number of clusters', self.number_of_clusters)

        centers = np.zeros((self.number_of_clusters, 3))
        for i in range(0, self.number_of_clusters):
            cluster_points = image_cols[ward.labels_ == i]
            cluster_mean = np.mean(cluster_points, axis=0)
            centers[i, :] = cluster_mean

        return centers, ward.labels_
Project: TPs    Author: DataMiningP7
def agglomerative_clustering(X, k=10):
    """ Run an agglomerative clustering on X.

    Args:
        X: the TF-IDF matrix where each row represents a document and each
           column represents a word, typically obtained by running
           transform_text() from TP2.
        k: the number of clusters we want (default: 10).
    Returns:
        An AgglomerativeClustering model trained on X.
    """
    model = AgglomerativeClustering(n_clusters=k)
    model.fit(X)

    # Note: all the other functions are the same, except that we use
    # AgglomerativeClustering instead of KMeans.
    return model
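
A hedged usage sketch for the function above. TfidfVectorizer and the four toy documents stand in for transform_text() from TP2 (an assumption), and the .toarray() call matters because, as another snippet in this listing notes, AgglomerativeClustering needs a non-sparse matrix:

from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "a dog barked at the cat",
        "stocks fell on monday", "markets rallied on friday"]

# Densify: AgglomerativeClustering rejects sparse input.
X = TfidfVectorizer().fit_transform(docs).toarray()

model = agglomerative_clustering(X, k=2)
print(model.labels_)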


# Ex4.1
Project: eezzy    Author: 3Blades
def cluster_agglomerative(X_train, model_args=None, gridsearch=True, connectivity_graph=True, connectivity_graph_neighbors=10):
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import kneighbors_graph
    print('AgglomerativeClustering')

    # model_args defaults to None; normalize it to a dict before keys are
    # added below, otherwise the connectivity assignment crashes.
    if model_args is None:
        model_args = {}

    if connectivity_graph:
        print('Creating k-neighbors graph for connectivity restraint')
        connectivity = kneighbors_graph(X_train, n_neighbors=connectivity_graph_neighbors)
        model_args['connectivity'] = connectivity

    if gridsearch is True:
        ## TODO:
        # Add hyperparameter searching. No scoring method is available for
        # this model, so we can't easily use grid searching.

        raise NotImplementedError('No hyperparameter optimization available yet for this model. Set gridsearch to False')
        # prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for AgglomerativeClustering')
        param_grid = None

    return ModelWrapper(AgglomerativeClustering, X=X_train, model_args=model_args, param_grid=param_grid, unsupervised=True)
Project: ML-note    Author: JasonK93
def test_AgglomerativeClustering_nclusters(*data):
    '''
    test the performance with different n_clusters
    :param data:  data, target
    :return: None
    '''
    X,labels_true=data
    nums=range(1,50)
    ARIs=[]
    for num in nums:
        clst=cluster.AgglomerativeClustering(n_clusters=num)
        predicted_labels=clst.fit_predict(X)
        ARIs.append(adjusted_rand_score(labels_true,predicted_labels))

    ## graph
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    ax.plot(nums,ARIs,marker="+")
    ax.set_xlabel("n_clusters")
    ax.set_ylabel("ARI")
    fig.suptitle("AgglomerativeClustering")
    plt.show()
Project: Parallel-SGD    Author: angadgill
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Project: Oedipus    Author: tum-i22
def agglomerativeClustering(sourceFiles, fileExtension):
    """ Performs agglomerative hierarchical clustering using files with <fileExtension> in the <sourceFiles> directory and return accuracy measure"""
    try:
        accuracy = 0
        # Step 1 - Check the required algorithm to specify the data type to load
        dataFiles = glob.glob("%s/*.%s" % (arguments.sourcedir, arguments.datatype)) # Get the paths of files to load
        dataSamples, dataLabels, loadedClusters = [], [], []
        for dataPoint in dataFiles:
            dataSamples.append([float(x) for x in open(dataPoint).read()[1:-1].split(",")])
            # Also load its cluster
            clusterName, paramNames = loadLabelFromFile(dataPoint.replace(".%s" % arguments.datatype, ".metadata"))
            if not clusterName in loadedClusters:
                loadedClusters.append(clusterName)
            dataLabels.append(loadedClusters.index(clusterName))
        prettyPrint("Successfully retrieved %s instances for clustering" % len(dataSamples))
        # Step 2 - Perform clustering
        clusterer = AgglomerativeClustering(n_clusters=len(loadedClusters))
        predicted = clusterer.fit_predict(numpy.array(dataSamples), dataLabels)
        accuracy = round(metrics.accuracy_score(dataLabels, predicted), 2)

    except Exception as e:
        prettyPrint("Error encountered: %s" % e, "error")

    return accuracy
Project: NBAPlayerValue    Author: TWanish
def agglom(reduced_data, n_clusters):
    #----Do Agglomerative clustering and return relevant performance data
    clustering = cluster.AgglomerativeClustering(n_clusters = n_clusters)
    clustering = clustering.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, clustering.labels_, metric='euclidean')

    return {
        "labels":clustering.labels_,
        "silhouette_score": sil_score
        }
Project: texta    Author: texta-tk
def _cluster_documents(self):

        method = self.params['cluster_method']
        n_clusters = int(self.params['cluster_n_clusters'])

        n_samples = len(self.document_vectors)

        if n_clusters > n_samples:
            n_clusters = n_samples

        if method == 'kmeans':
            clusterer = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1)
        else:
            clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete', affinity='cosine')

        clustering = clusterer.fit(self.document_vectors)
        cluster_labels = clustering.labels_

        clustering_dict = clustering.__dict__
        # Only KMeans exposes cluster_centers_; use .get() so the
        # AgglomerativeClustering branch yields None instead of a KeyError.
        cluster_centers = clustering_dict.get('cluster_centers_')

        clusters = {}

        for document_id,cluster_label in enumerate(cluster_labels):
            if cluster_label not in clusters:
                clusters[cluster_label] = []
            clusters[cluster_label].append(document_id)

        return clusters,cluster_centers
Project: postlearn    Author: TomAugspurger
def test_compute_centers(self, data_labels):
        data, _ = data_labels
        ac = cluster.AgglomerativeClustering()
        fit = ac.fit(data)
        result = compute_centers(fit, data)

        assert result.shape == (data.shape[1], len(set(fit.labels_)))
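
compute_centers() is postlearn's own helper. As a rough sketch of what it could look like, inferred only from the shape the test asserts, (n_features, n_clusters), one might write the hypothetical version below; it averages the points of each cluster, since AgglomerativeClustering has no cluster_centers_ of its own:

import numpy as np

# Hypothetical stand-in for postlearn's compute_centers (assumption: one
# column of per-feature means per cluster, matching the asserted shape).
def compute_centers_sketch(fitted_model, data):
    labels = fitted_model.labels_
    return np.stack([data[labels == k].mean(axis=0)
                     for k in sorted(set(labels))], axis=1)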
Project: FreeDiscovery    Author: FreeDiscovery
def _get_htree(self, X=None, metric='cosine'):
        km = self.km
        method_name = type(km).__name__
        if method_name == 'AgglomerativeClustering':
            htree = {'n_leaves': km.n_leaves_,
                     'n_components': km.n_components_,
                     'children': km.children_.tolist()}
        elif method_name in ['Birch', '_BirchDummy']\
                and self._pars['n_clusters'] is None:
            hmod = _BirchHierarchy(km, metric=metric)
            hmod.fit(X)
            htree = hmod.htree
        else:
            htree = {}
        return htree
Project: FreeDiscovery    Author: FreeDiscovery
def ward_hc(self, n_clusters, n_neighbors=10):
        """
        Perform Ward hierarchical clustering

        Parameters
        ----------
        n_clusters : int
            number of clusters
        n_neighbors : int
            N nearest neighbors used for computing the connectivity matrix
        """
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.neighbors import kneighbors_graph
        pars = {'n_neighbors': n_neighbors, 'is_hierarchical': True,
                "metric": self.metric}
        if 'lsi' not in self.pipeline:
            raise ValueError("you must use lsi with birch clustering "
                             "for scaling reasons.")

        # This is really not efficient as
        # it's done a second time in _cluster_func
        X = self.pipeline.data
        connectivity = kneighbors_graph(X, n_neighbors=n_neighbors,
                                        include_self=False)

        km = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                     connectivity=connectivity)

        return self._cluster_func(n_clusters, km, pars)
Project: pandora    Author: mikekestemont
def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
        # collect embeddings for mfi:
        X = np.asarray([self.w2v_model[w] for w in self.mfi \
                            if w in self.w2v_model], dtype='float32')
        # dimension reduction:
        tsne = TSNE(n_components=2)
        coor = tsne.fit_transform(X) # unsparsify

        plt.clf()
        sns.set_style('dark')
        plt.rcParams['axes.linewidth'] = 0.4  # sns.plt no longer exists in seaborn
        fig, ax1 = plt.subplots()

        labels = self.mfi  # caution: longer than coor if some mfi words had no vector
        # first plot slices:
        x1, x2 = coor[:,0], coor[:,1]
        ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
        # clustering on top (add some colouring):
        clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=nb_clusters)
        clustering.fit(coor)
        # add names:
        for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
            ax1.text(x, y, name, ha='center', va="center",
                     color=plt.cm.nipy_spectral(cluster_label / 10.),
                     fontdict={'family': 'Arial', 'size': 8})
        # control aesthetics:
        ax1.set_xlabel('')
        ax1.set_ylabel('')
        ax1.set_xticklabels([])
        ax1.set_xticks([])
        ax1.set_yticklabels([])
        ax1.set_yticks([])
        plt.savefig(outputfile, bbox_inches=0)
Project: ParseLawDocuments    Author: FanhuaandLuomu
def clustering(docs,n_clusters):  # cluster docs into n_clusters clusters
    kmeans_model=KMeans(n_clusters=n_clusters,random_state=1).fit(docs)  # KMeans clustering
    labels=kmeans_model.labels_
    # hmodel=AgglomerativeClustering(n_clusters=n_clusters).fit(docs)   # hierarchical alternative
    # labels=hmodel.labels_
    score=metrics.silhouette_score(np.array(docs),labels,metric='euclidean')  # euclidean metric
    return labels,score
Project: TextStageProcessor    Author: mhyhre
def make_ward_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'WARD/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
        predict_result = ward.fit_predict(X)

        self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: TPs    Author: DataMiningP7
def ex4_agglomerative_clustering(X, y):
    """ This does the same thing as ex2_kmeans but with an agglomerative
    clustering and K=2.
    """
    # AgglomerativeClustering needs a non-sparse matrix
    X = X.toarray()

    k = 2
    model = AgglomerativeClustering(k).fit(X, y)

    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)


# Ex 5
Project: iris-Clustering-python-PTVS    Author: mjbahmani
def Learning(X):
    from sklearn.cluster import AgglomerativeClustering

    learner = AgglomerativeClustering(n_clusters=3)
    y = learner.fit_predict(X)
    yield 'Agglomerative clusters(n=3)', y


#=================================================
Project: ML-note    Author: JasonK93
def test_AgglomerativeClustering(*data):
    '''
    test AGG method
    :param data: data, target
    :return: None
    '''
    X,labels_true=data
    clst=cluster.AgglomerativeClustering()
    predicted_labels=clst.fit_predict(X)
    print("ARI:{0}".format(adjusted_rand_score(labels_true,predicted_labels)))
Project: ML-note    Author: JasonK93
def test_AgglomerativeClustering_linkage(*data):
    '''
    test the performance with different linkages
    :param data:  data, target
    :return: None
    '''
    X,labels_true=data
    nums=range(1,50)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)

    linkages=['ward','complete','average']
    markers="+o*"
    for i, linkage in enumerate(linkages):
        ARIs=[]
        for num in nums:
            clst=cluster.AgglomerativeClustering(n_clusters=num,linkage=linkage)
            predicted_labels=clst.fit_predict(X)
            ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
        ax.plot(nums,ARIs,marker=markers[i],label="linkage:{0}".format(linkage))

    ax.set_xlabel("n_clusters")
    ax.set_ylabel("ARI")
    ax.legend(loc="best")
    fig.suptitle("AgglomerativeClustering")
    plt.show()
Project: Parallel-SGD    Author: angadgill
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly during
    # merging.
    X = np.array([(.014, .120), (.014, .099), (.014, .097),
                  (.017, .153), (.017, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .152), (.018, .149), (.018, .144)])
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage='ward')
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Project: Parallel-SGD    Author: angadgill
def test_connectivity_fixing_non_lil():
    # Check non-regression of a bug when a connectivity matrix that does not
    # support item assignment is provided with more than one component.
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = AgglomerativeClustering(connectivity=c, linkage='ward')
    assert_warns(UserWarning, w.fit, x)
Project: Parallel-SGD    Author: angadgill
def test_connectivity_callable():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(
        connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False))
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
Project: Parallel-SGD    Author: angadgill
def test_connectivity_ignores_diagonal():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
Project: Parallel-SGD    Author: angadgill
def test_agg_n_clusters():
    # Test that an error is raised when n_clusters <= 0

    rng = np.random.RandomState(0)
    X = rng.rand(20, 10)
    for n_clus in [-1, 0]:
        agc = AgglomerativeClustering(n_clusters=n_clus)
        msg = ("n_clusters should be an integer greater than 0."
               " %s was provided." % str(agc.n_clusters))
        assert_raise_message(ValueError, msg, agc.fit, X)
Project: extract    Author: dblalock
def makeWard(X, k=2):
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(n_clusters=k,
                        linkage='ward', connectivity=connectivity)
Project: extract    Author: dblalock
def makeAvgLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="average",
                                affinity="cityblock", n_clusters=k,
                                connectivity=connectivity)
Project: extract    Author: dblalock
def makeMaxLinkage(X=None, k=2):
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    return cluster.AgglomerativeClustering(linkage="complete",
                                affinity="cityblock", n_clusters=k,
                                connectivity=connectivity)
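
A hypothetical way to exercise the three factories above; the random X is an assumption for illustration, and affinity= is the spelling used by the sklearn versions these snippets target. Each factory returns an unfitted estimator, so fit_predict() is called on the same data:

import numpy as np
from sklearn import cluster
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(60, 2)

for factory in (makeWard, makeAvgLinkage, makeMaxLinkage):
    model = factory(X, k=2)
    labels = model.fit_predict(X)
    print(factory.__name__, np.bincount(labels))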
Project: sakmapper    Author: szairis
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)  # floor division keeps K_max an int
            clustering = {}
            db_index = []
            X = df.ix[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.ix[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.ix[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.ix[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.ix[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.ix[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        else:
            raise ValueError('the gap statistic is only implemented for kmeans')
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps))+1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('only db and gap statistics are supported')
Project: NBAPlayerValue    Author: TWanish
def plot_cluster(reduced_data, cluster_type, k_clusters, plot_title):
    if cluster_type.lower() == "kmeans":
        clus = KMeans(init='k-means++', n_clusters=k_clusters, n_init=10)

    elif cluster_type.lower() == "agglom":
        clus = AgglomerativeClustering(n_clusters = k_clusters)

    clus.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in the mesh using the last trained model.
    # Note: AgglomerativeClustering has no predict(), so the mesh plot below
    # only works on the KMeans branch.
    Z = clus.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(15,10))
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=10)

    if cluster_type.lower() == "kmeans":
        # Plot the centroids as a white X
        centroids = clus.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)

    plt.title(plot_title)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()