Python sklearn.cluster module: KMeans() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.KMeans().
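
As a baseline for the examples below, here is a minimal sketch of the typical KMeans workflow on synthetic data (all parameter values are illustrative):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 2)   # 100 points in 2-D
km = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = km.fit_predict(X)                  # cluster index for each row of X
centers = km.cluster_centers_               # (3, 2) array of centroids
nearest = km.predict(X[:5])                 # map new points to their nearest centroid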

Project: Python-Machine-Learning-Cookbook | Author: PacktPublishing
def compress_image(img, num_clusters):
    # Convert input image into (num_samples, num_features) 
    # array to run kmeans clustering algorithm 
    X = img.reshape((-1, 1))  

    # Run kmeans on input data
    kmeans = cluster.KMeans(n_clusters=num_clusters, n_init=4, random_state=5)
    kmeans.fit(X)
    centroids = kmeans.cluster_centers_.squeeze()
    labels = kmeans.labels_

    # Assign each value to the nearest centroid and 
    # reshape it to the original image shape
    input_image_compressed = np.choose(labels, centroids).reshape(img.shape)

    return input_image_compressed
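
A hypothetical call site for compress_image, assuming a 2-D grayscale array (the reshape((-1, 1)) above treats each pixel intensity as one sample; the module is assumed to import numpy as np and sklearn's cluster):

import numpy as np
img = np.random.randint(0, 256, (64, 64)).astype(np.uint8)  # stand-in for a real grayscale image
quantized = compress_image(img, num_clusters=8)              # image reduced to 8 intensity levels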
Project: vapor | Author: mills-lab
def k_means_cluster_Predict(data_list,info):
    array_diagnal=np.array([[data_list[0][x],data_list[1][x]] for x in range(len(data_list[0]))])
    ks = list(range(1,len(info)))
    KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(array_diagnal) for i in ks]
    BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans]
    ks_picked=ks[BIC.index(max(BIC))]
    if ks_picked==1:
        return [data_list]
    else:
        out=[]
        std_rec=[scipy.std(data_list[0]),scipy.std(data_list[1])]
        whitened = whiten(array_diagnal)
        centroids, distortion=kmeans(whitened,ks_picked)
        idx,_= vq(whitened,centroids)
        for x in range(ks_picked):
            group1=[[int(i) for i in array_diagnal[idx==x,0]],[int(i) for i in array_diagnal[idx==x,1]]]
            out.append(group1)
        return out
Project: cellranger | Author: 10XGenomics
def run_kmeans(transformed_pca_matrix, n_clusters, random_state=None):
    if random_state is None:
        random_state=cr_constants.RANDOM_STATE

    kmeans = sk_cluster.KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(transformed_pca_matrix) + 1

    cluster_score = compute_db_index(transformed_pca_matrix, kmeans)

    clusters = cr_clustering.relabel_by_size(clusters)

    clustering_key = cr_clustering.format_clustering_key(cr_clustering.CLUSTER_TYPE_KMEANS, n_clusters)

    return cr_clustering.create_clustering(clusters=clusters,
                                           num_clusters=n_clusters,
                                           cluster_score=cluster_score,
                                           clustering_type=cr_clustering.CLUSTER_TYPE_KMEANS,
                                           global_sort_key=n_clusters,
                                           description=cr_clustering.humanify_clustering_key(clustering_key))
Project: DomainDependencyMemeJsai2017 | Author: GINK03
def step4():
  key_vec = pickle.loads(open("key_vec.pkl", "rb").read()) 
  vecs = []
  for ev, vec in enumerate(key_vec.values()):
    x = np.array(vec)
    if np.isnan(x).any():
      # print(vec)
      continue
    vecs.append(x)
  vecs   = np.array(vecs)
  kmeans = KMeans(n_clusters=128, init='k-means++', n_init=10, max_iter=300,
                       tol=0.0001,precompute_distances='auto', verbose=0,
                       random_state=None, copy_x=True, n_jobs=1)
  print("now fitting...")
  kmeans.fit(vecs)

  open("kmeans.model", "wb").write( pickle.dumps(kmeans) )
  for p in kmeans.predict(vecs):
    print(p)
Project: DNGR-Keras | Author: MdAsifKhan
def cluster(data,true_labels,n_clusters=3):

    km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    km.fit(data)

    km_means_labels = km.labels_
    km_means_cluster_centers = km.cluster_centers_
    km_means_labels_unique = np.unique(km_means_labels)

    colors_ = cycle(colors.cnames.keys())

    initial_dim = np.shape(data)[1]
    data_2 = tsne(data,2,initial_dim,30)

    plt.figure(figsize=(12, 6))
    plt.scatter(data_2[:,0],data_2[:,1], c=true_labels)
    plt.title('True Labels')

    return km_means_labels
Project: brainiak | Author: brainiak
def init_centers_widths(self, R):
        """Initialize prior of centers and widths

        Returns
        -------

        centers : 2D array, with shape [K, n_dim]
            Prior of factors' centers.

        widths : 1D array, with shape [K, 1]
            Prior of factors' widths.

        """

        kmeans = KMeans(
            init='k-means++',
            n_clusters=self.K,
            n_init=10,
            random_state=100)
        kmeans.fit(R)
        centers = kmeans.cluster_centers_
        widths = self._get_max_sigma(R) * np.ones((self.K, 1))
        return centers, widths
Project: PPRE | Author: MaoYuwei
def all_cluster():
    # build the 'between' vector DataFrame (the commented-out block below creates dataframe.csv from sort_between.txt)
    # bet_dic = {}
    # fin = open('sort_between.txt', 'r')
    # while True:
    #     line = fin.readline()
    #     if line:
    #         line = line.strip()
    #         between, vec = line.split('^')
    #         vec = vec.strip('[')
    #         vec = vec.strip(']')
    #         vec = vec.split(',')
    #         bet_dic[between] = vec
    #
    #     else:
    #         break
    # bet_dic = pd.DataFrame(bet_dic)
    # bet_dic = bet_dic.T
    # bet_dic.to_csv('dataframe.csv')
    # fin.close()
    df = pd.read_csv('dataframe.csv')
    clf = KMeans(n_clusters=50)
    # df[1:, 1:] is not valid DataFrame indexing; use .iloc to drop the index column
    s = clf.fit(df.iloc[:, 1:])
    print s
Project: oss-github-analysis-project | Author: itu-oss-project-team
def k_means_clustering(self, out_path, pd_data, number_of_clusters):
        headers, repos, features = self.__fetch_data(pd_data)

        kmeans = KMeans(n_clusters=number_of_clusters, random_state=0, n_init=200).fit(features)  # apply kmeans algorithm

        # form clusters
        clusters = []
        for i in range(0, number_of_clusters): # k cluster
            repo_list = []
            for j in range (0, len(kmeans.labels_)):  # a label for each repo.
                if i == kmeans.labels_[j]:  # if repo label is equal to Cluster number
                    repo_list.append(repos[j])  # add repo to cluster i's list.
            clusters.append(repo_list)

        out_file_path = os.path.join(out_path, "kmeans_noOfClusters" + str(number_of_clusters))
        self.__export_k_means_results(kmeans, headers, clusters, out_file_path)  # avoid ".csv"
Project: ref-extract | Author: brandonrobertz
def cluster(X, seed=0, n_clusters=20, alg='kmeans'):
    """
    Perform k-means on given X data. For alg, use one of:
    'kmeans' (sklearn KMeans) or 'spherical' (SphericalKMeans)
    returns (X pred clusters, cluster centers)
    NOTE: euclidean tends to perform very poorly
    """
    # log("Clustering k-means with {} clusters".format(n_clusters))
    if alg == 'kmeans':
        Model = KMeans
    elif alg == 'spherical':
        # inplace l2 normalization (spherical k-means assumes this)
        normalize(X, 'l2', copy=False)
        Model = SphericalKMeans

    kmeans = Model(
        n_clusters=int(n_clusters), random_state=seed
    )
    pred_clusters = kmeans.fit_predict(X)
    return pred_clusters, kmeans.cluster_centers_
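
A usage sketch for the helper above; alg='kmeans' needs only scikit-learn, while 'spherical' additionally assumes SphericalKMeans from the third-party spherecluster package and normalize from sklearn.preprocessing:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first document", "second document", "another text entirely"]
X = TfidfVectorizer().fit_transform(docs).toarray()
pred_clusters, centers = cluster(X, seed=0, n_clusters=2, alg='kmeans')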
Project: ASTRiDE | Author: dwkim78
def __init__(self, edges, branching_factor=50, threshold=0.1):
        # Make features list.
        features = []
        for i in range(len(edges)):
            edge = edges[i]
            features.append([edge['perimeter'], edge['area'],
                             edge['shape_factor'], edge['radius_deviation']])
        features = np.array(features)

        # Normalize features
        normed_features = features.copy()
        for i in range(features.shape[1]):
            avg = np.median(features[::, i])
            std = np.std(features[::, i])

            normed_features[::, i] -= avg
            normed_features[::, i] /= avg

        self.features = features
        self.normed_features = normed_features
        self.branching_factor = branching_factor
        self.threshold = threshold
        #self.run(Birch, branching_factor=50, threshold=0.1, n_clusters=2)
        self.run(KMeans, n_clusters=2)
        #self.run(AgglomerativeClustering, n_clusters=2)
Project: bot2017Fin | Author: AllanYiin
def color_differenciate(img:Image,k:int):
    imgarr = img2array(img)
    imgarr_r = imgarr.reshape((imgarr.shape[0] * imgarr.shape[1], 3))
    clt =KMeans(n_clusters = k)
    clt.fit(imgarr_r)
    numLabels = np.arange(0, len(np.unique(clt.labels_)))  # one slot per cluster (the original "+ 1" produced an extra, always-white image)
    images=[]
    for i in range(len(numLabels)):
        images.append(np.ones(imgarr_r.shape,dtype=np.int32)*255)
    for idx in range(len(clt.labels_)):
            label=clt.labels_[idx]
            images[label][idx][0]=imgarr_r[idx][0]
            images[label][idx][1] = imgarr_r[idx][1]
            images[label][idx][2] = imgarr_r[idx][2]
    new_images=[]
    for i in range(len(numLabels)):
        new_img=array2img(images[i].reshape(imgarr.shape))
        new_img.save('test_'+str(i)+'.jpg')
        new_images.append(new_img)
    return new_images
Project: WebAppEx | Author: karlafej
def get_plot(x, y, k, iris=iris):
    k_means = KMeans(n_clusters= k)
    k_means.fit(iris.data) 
    colormap = rainbow(np.linspace(0, 1, k))
    fig = plt.figure()
    splt = fig.add_subplot(1, 1, 1)
    splt.scatter(iris.data[:,x], iris.data[:,y], c = colormap[k_means.labels_], s=40)
    splt.scatter(k_means.cluster_centers_[:,x], k_means.cluster_centers_[:,y], c = 'black', marker='x')
    splt.set_xlabel(iris.feature_names[x])
    splt.set_ylabel(iris.feature_names[y])

    figfile = BytesIO()
    plt.savefig(figfile, format='png')
    figfile.seek(0) 
    figdata_png = base64.b64encode(figfile.getvalue()).decode()
    return figdata_png
Project: WebAppEx | Author: karlafej
def update():
    # Get the current slider values
    N = clusters.value
    x_var = axis_map[x_axis.value]
    y_var = axis_map[y_axis.value]

    k_means = KMeans(n_clusters=N)
    k_means.fit(iris.data) 
    centroids = k_means.cluster_centers_

    palette = sns.palettes.color_palette('hls', N)
    colormap = np.array(palette.as_hex())[k_means.labels_] # as hex is necessary for bokeh to render the colors properly.

    plot.xaxis.axis_label = x_axis.value
    plot.yaxis.axis_label = y_axis.value

    source.data = dict(
        x=iris.data[:,x_var],
        y=iris.data[:,y_var],
        colors=colormap)
    centers.data = dict(
        cx=centroids[:,x_var],
        cy=centroids[:,y_var])
Project: hugo_similar_posts | Author: elbaulp
def KmeansWrapper(true_k, data, load=False):
    from sklearn.externals import joblib

    modelName = 'doc_cluster.%s.plk' % true_k

    if load:
        km = joblib.load(modelName)
        labels = km.labels_
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    # max_iter=1000,
                    n_init=10,
                    n_jobs=-1,
                    random_state=0,
                    verbose=0)
        km.fit_predict(data)
        labels = km.labels_
        joblib.dump(km,  modelName)

    return labels, km.cluster_centers_
Project: hugo_similar_posts | Author: elbaulp
def elbowMethod(X, k=21):
    distortions = []
    for i in range(1, k):
        km2 = KMeans(n_clusters=i,
                     init='k-means++',
                     n_init=10,
                     random_state=0,
                     n_jobs=-1,
                     verbose=0)
        km2.fit(X)
        distortions.append(km2.inertia_)
        print('k=%s, Distortion: %.2f' % (i, km2.inertia_))

    plt.plot(range(1, k), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()
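
A quick way to exercise elbowMethod on synthetic blobs (note that n_jobs was removed from KMeans in scikit-learn 1.0, so this function assumes an older release):

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
elbowMethod(X, k=10)  # the distortion curve should bend near k=4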
Project: hugo_similar_posts | Author: elbaulp
def gridSearch(data, params, true_k):

    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])
    gsTfIdf = GridSearchCV(
        lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Project: vapor | Author: mills-lab
def k_means_cluster(data_list):
    if max(data_list[0])-min(data_list[0])>10 and max(data_list[1])-min(data_list[1])>10:
        array_diagnal=np.array([[data_list[0][x],data_list[1][x]] for x in range(len(data_list[0]))])
        ks = list(range(1,min([5,len(data_list[0])+1])))
        KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(array_diagnal) for i in ks]
        KMeans_predict=[cluster.KMeans(n_clusters = i, init="k-means++").fit_predict(array_diagnal) for i in ks]
        BIC=[]
        BIC_rec=[]
        for x in ks:
            if KMeans_predict[x-1].max()<x-1: continue
            else:
                BIC_i=compute_bic(KMeans[x-1],array_diagnal)
                if abs(BIC_i)<10**8:
                    BIC.append(BIC_i)
                    BIC_rec.append(x)
        #BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans]
        #ks_picked=ks[BIC.index(max(BIC))]
        ks_picked=BIC_rec[BIC.index(max(BIC))]
        if ks_picked==1:
            return [data_list]
        else:
            out=[]
            std_rec=[scipy.std(data_list[0]),scipy.std(data_list[1])]
            whitened = whiten(array_diagnal)
            centroids, distortion=kmeans(whitened,ks_picked)
            idx,_= vq(whitened,centroids)
            for x in range(ks_picked):
                group1=[[int(i) for i in array_diagnal[idx==x,0]],[int(i) for i in array_diagnal[idx==x,1]]]
                out.append(group1)
            return out
    else:
        return [data_list]
Project: elm | Author: ContinuumIO
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means for model selection

    Parameters:
        :model:  An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans as final step in Pipeline
        :X:      The X data that were just given to "fit", or "partial_fit"
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float

    '''

    k, m = model._estimator.cluster_centers_.shape
    if isinstance(X, xr.DataArray):
        n = X.flat.values.shape[0]
    else:
        n = X.shape[0]
    d = model._estimator.inertia_
    aic =  d + 2 * m * k
    delattr(model._estimator, 'labels_')
    return aic
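
Outside the elm Pipeline wrapper, the same criterion reduces to the inertia plus a 2·m·k complexity penalty; a standalone sketch with plain scikit-learn:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 3)
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)
k, m = km.cluster_centers_.shape   # number of clusters, number of features
aic = km.inertia_ + 2 * m * k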
Project: NetPower_TestBed | Author: Vignesh2208
def _init(self, X, lengths=None):
        super(GaussianHMM, self)._init(X, lengths=lengths)

        _, n_features = X.shape
        if hasattr(self, 'n_features') and self.n_features != n_features:
            raise ValueError('Unexpected number of dimensions, got %s but '
                             'expected %s' % (n_features, self.n_features))

        self.n_features = n_features
        if 'm' in self.init_params or not hasattr(self, "means_"):
            kmeans = cluster.KMeans(n_clusters=self.n_components,
                                    random_state=self.random_state)
            kmeans.fit(X)
            self.means_ = kmeans.cluster_centers_
        if 'c' in self.init_params or not hasattr(self, "covars_"):
            cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
            if not cv.shape:
                cv.shape = (1, 1)
            self._covars_ = distribute_covar_matrix_to_match_covariance_type(
                cv, self.covariance_type, self.n_components).copy()
Project: ECoG-ClusterFlow | Author: sugeerth
def ConsensusCluster(self, data, subsamples, subsample_fraction, norm_var, kvalues): 
        """
        Performs consensus clustering algorithms here!!!
        """
        return  # early return left in the source: the consensus-clustering body below is never executed
        partition = dict()
        stuff = []
        nb_clusters = 0 # this is the number of cluster the dataset is supposed to be partitioned into
        distances = nx.to_numpy_matrix(data)

        for i in kvalues:
            # NOTE: this call expects a Pycluster-style kcluster API (returning clusterid, error, nfound), not sklearn's KMeans
            clusterid, error, nfound = KMeans(distances, nclusters= i, npass=300)
            uniq_ids = list(set(clusterid))
            new_ids = [ uniq_ids.index(val) for val in clusterid]

            for i,value in enumerate(new_ids):
                partition[i] = value
            stuff.append(partition)
Project: anomaly-detection-libs | Author: IceKhan13
def fit(self, data):
        """ fit model on data """
        self.data = data

        kmeans = KMeans(n_clusters=self.n_clusters)
        kmeans.fit(data)
        self.clusterer = kmeans
        logging.info('Fit has been completed')

        self.data_clusters = self.clusterer.predict(data)
        self.cluster_centers = self.clusterer.cluster_centers_
        logging.info('Cluster calculation has been completed')

        self.__clusters_separation()
        logging.info('Cluster separation has been completed')

        self.__cluster_avg_distances()
        logging.info('Cluster avg distances has been calculated')
Project: oasis | Author: ngmarchant
def stratify_by_features(features, n_strata, **kwargs):
    """Stratify by clustering the items in feature space

    Parameters
    ----------
    features : array-like, shape=(n_items,n_features)
        feature matrix for the pool, where rows correspond to items and columns
        correspond to features.

    n_strata : int
        number of strata to create.

    **kwargs :
        passed to sklearn.cluster.KMeans

    Returns
    -------
    Strata instance
    """
    n_items = features.shape[0]
    km = KMeans(n_clusters=n_strata, **kwargs)
    allocations = km.fit_predict(X=features)
    return Strata(allocations)
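
A minimal usage sketch (Strata is defined by the oasis package itself; keyword arguments are forwarded straight to KMeans):

import numpy as np

features = np.random.RandomState(0).rand(500, 4)
strata = stratify_by_features(features, n_strata=5, random_state=0)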
Project: Particle-Picking-Cryo-EM | Author: hqythu
def cluster(centers):
    n_class = int(len(centers) * 0.18)
    est = KMeans(n_clusters=n_class, max_iter=1000)
    est.fit(centers)
    new_list = []
    for x, y in est.cluster_centers_:
        min_num = 10000
        min_x = -1
        min_y = -1
        for x_, y_ in centers:
            dist = distance(x, y, x_, y_)
            if (dist < min_num) or (min_x == -1):
                min_num = dist
                min_x = x_
                min_y = y_
        new_list.append([min_x, min_y])
    return new_list
Project: AutismVoicePrint | Author: opraveen
def noise_removal(aud_sample):
    if (min(abs(aud_sample)) == 0):
      return aud_sample

    data = abs(np.copy(aud_sample))
    clf = KMeans(n_clusters = 2,n_init = 5)
    data = data.reshape(-1,1)
    clf.fit(data)
    if clf.cluster_centers_[0] < clf.cluster_centers_[1]:
      noise = 0
    else:
      noise = 1

    aud = np.copy(aud_sample)

    window = 500
    windowStride = 50
    for i in range(0,len(clf.labels_),windowStride):
        if sum(clf.labels_[i:i+window] == noise) == window:
            aud[i:i+window] = 0

    return aud
Project: coursera-machine-learning-yandex | Author: dstarcev
def calculate():
    from sklearn.metrics import mean_squared_error
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')

    for k in xrange(2, 22):
        cluster = KMeans(k, init='k-means++', random_state=241)
        cluster.fit(X)
        reduced_image = recreate_image(cluster.cluster_centers_, cluster.labels_, h, w, d)
        mse = np.mean((image - reduced_image) ** 2)
        psnr = 10 * np.log10(1.0 / mse)
        plot(reduced_image, "plots/plot%d.png" % (k))
        print "k: %d, mse: %.2f psnr: %.2f" % (k, mse, psnr)
        if psnr > 20:
            return k
Project: TPs | Author: DataMiningP7
def evaluate_kmeans(X, model):
    """ Evaluate a K-Means model that has been trained on X using the
     Silhouette score.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X.
    Returns:
        A double that corresponds to the Silhouette score of the model.
    """
    return silhouette_score(X, model.labels_)
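
For context, a sketch of how evaluate_kmeans would be driven (silhouette_score is assumed to be imported from sklearn.metrics in the surrounding module):

from sklearn.cluster import KMeans

model = KMeans(n_clusters=10, random_state=0).fit(X)  # X: the TF-IDF matrix from the TP2
print(evaluate_kmeans(X, model))  # in [-1, 1]; higher means tighter, better-separated clusters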


# Ex2
Project: TPs | Author: DataMiningP7
def agglomerative_clustering(X, k=10):
    """ Run an agglomerative clustering on X.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        k: the number of clusters we want (default: 10).
    Returns:
        An AgglomerativeClustering model trained on X.
    """
    model = AgglomerativeClustering(n_clusters=k)
    model.fit(X)

    # Note all the other functions are the same except we use
    # 'AgglomerativeClustering' instead of 'KMeans'.
    return model


# Ex4.1
Project: eezzy | Author: 3Blades
def cluster_kmeans(X_train, model_args=None, gridsearch=True):
    from sklearn.cluster import KMeans
    print('KMeans')

    if gridsearch is True:
        param_grid = {
            'n_clusters': np.arange(1, 20, 2),
            'max_iter': [50, 100, 300],
            'tol': [1e-5, 1e-4, 1e-3]
        }
        prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for Birch')
        param_grid = None

    return ModelWrapper(KMeans, X=X_train, model_args=model_args, param_grid=param_grid, unsupervised=True)
Project: autoxd | Author: nessessary
def getFlatVolume(series_volumes):
        """????????????? return: float"""
        results = np.array(series_volumes)
        results_n = np.zeros((len(results),2))
        results_n[:,0] = 1
        results_n[:,1] = np.array(results)  
        # split into 3 clusters; use the center of the most populated cluster
        k = KMeans(3)
        k.fit(results_n)
        df = pd.DataFrame(k.labels_)
        df_c = pd.DataFrame(k.cluster_centers_)
        v = []
        for i in range(3):
            v.append( df[df[0]==i].count()[0])
        df_c[2] = v
        return df_c.iloc[df_c[2].argmax()][1]


Project: pyhiro | Author: wanweiwei07
def clusterFacetSamplesKNN(self, reduceRatio=3, maxNPnts=5):
        """
        cluster the samples of each facet using k nearest neighbors
        the cluster center and their correspondent normals will be saved
        in self.objsamplepnts_refcls and self.objsamplenrmals_refcls

        :param: reduceRatio: the ratio of points to reduce
        :param: maxNPnts: the maximum number of points on a facet
        :return: None

        author: weiwei
        date: 20161129, tsukuba
        """

        self.objsamplepnts_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=np.object)
        self.objsamplenrmls_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=np.object)
        for i, facet in enumerate(self.facets):
            self.objsamplepnts_refcls[i] = np.empty(shape=(0,0))
            self.objsamplenrmls_refcls[i] = np.empty(shape=(0,0))
            X = self.objsamplepnts_ref[i]
            nX = X.shape[0]
            if nX > reduceRatio:
                kmeans = KMeans(n_clusters=maxNPnts if nX/reduceRatio>maxNPnts else nX/reduceRatio, random_state=0).fit(X)
                self.objsamplepnts_refcls[i] = kmeans.cluster_centers_
                self.objsamplenrmls_refcls[i] = np.tile(self.facetnormals[i], [self.objsamplepnts_refcls.shape[0],1])
Project: wordnet-clusters | Author: darenr
def word_cluster(data, labels, k):
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(data)
    for i, label in enumerate(labels):
        print label, k_means.labels_[i]

    d = defaultdict(list)
    for c, l in zip(k_means.labels_, labels):
        d['cluster' + str(c)].append(l.name())
    fname = 'results/clusters'
    if use_wordnet:
        fname += "_wn"
    if use_wordvectors:
        fname += "_wv"
    fname += '_k' + str(k) + '.json'
    with codecs.open(fname, 'wb', 'utf-8') as outfile:
        outfile.write(json.dumps(d, indent=True))
        print ' * Saved results to', fname
        # create histogram of cluster sizes
        histogram(d)
Project: context_predictive_words | Author: Cogitans
def KMeansAccuracy():
    clusterer = KMeans(n_clusters=2, n_init=30)
    tdm = pickle.load(open(DATASET_PATH + "BOW.p", "rb"))
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    numerical_mapped_1 = [0 if i == "Israeli" else 1 for i in true_labels]
    numerical_mapped_2 = [1 if i == "Israeli" else 0 for i in true_labels]
    one = f1_score(numerical_mapped_1, predictions)
    two = f1_score(numerical_mapped_2, predictions)
    print("The F1 score of KMeans on BOW is: " + str(max(one, two)))

    clusterer = KMeans(n_clusters=2, n_init=30)
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    accuracy = predict_accuracy(true_labels, predictions)
    print("The F1 score of KMeans on BOW (w/Tdidf) is: " + accuracy)
Project: kaggle-yelp-restaurant-photo-classification | Author: u1234x1234
def learn_color_clusters():
    samples = np.zeros((0, 3))
    cnt = 0
    with open('train_list') as f:
        for line in f:
            line = line[:-1]
            image = cv2.imread(line)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)

            points = image.reshape((-1, 3))
            np.random.permutation(points.shape[0])
            samples = np.vstack([samples, points[:50]])

            print(samples.shape)
            cnt = cnt + 1
            if cnt % 10000 == 0:
                break

    km = cluster.KMeans(n_clusters=50, n_jobs=-1)
    km.fit(samples)
    np.save('lab_clusters.npy', km.cluster_centers_)
    return

#learn_color_clusters()
Project: yellowbrick | Author: DistrictDataLabs
def test_estimator_instance(self):
        """
        Test that isestimator works for instances
        """

        models = (
            LinearRegression(),
            LogisticRegression(),
            KMeans(),
            LSHForest(),
            PCA(),
            RidgeCV(),
            LassoCV(),
            RandomForestClassifier(),
        )

        for model in models:
            self.assertTrue(isestimator(model))
Project: yellowbrick | Author: DistrictDataLabs
def test_estimator_class(self):
        """
        Test that isestimator works for classes
        """
        models = (
            LinearRegression,
            LogisticRegression,
            KMeans,
            LSHForest,
            PCA,
            RidgeCV,
            LassoCV,
            RandomForestClassifier,
        )

        for model in models:
            self.assertTrue(inspect.isclass(model))
            self.assertTrue(isestimator(model))
Project: news-shot-classification | Author: gshruti95
def get_cluster_threshold(weights):

    estimator = KMeans(n_clusters = 2)
    data = np.asarray(weights)
    data = data.reshape(-1,1)
    # print data 
    clusters_idx = estimator.fit_predict(data)
    max_idx = data.argmax()
    max_cluster = clusters_idx[max_idx]
    #print max_cluster
    low_cluster = []
    if max_cluster == 1:
        indices = np.argwhere(clusters_idx == 0)
        for idx in indices:
            low_cluster.append(data[idx])
        threshold = max(low_cluster)
        threshold = threshold[0][0]
    else:
        indices = np.argwhere(clusters_idx == 1)
        for idx in indices:
            low_cluster.append(data[idx])
        threshold = max(low_cluster)
        threshold = threshold[0][0]
    # print threshold
    return threshold
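
The function splits the weights into two groups with 2-means and returns the largest weight in the lower group; a quick sanity check with hypothetical values:

weights = [0.10, 0.12, 0.15, 0.90, 0.95]
threshold = get_cluster_threshold(weights)  # -> 0.15, the top of the lower cluster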
Project: recommendation_hybrid | Author: nsmalimov
def make_clast_books(dict_books_all, array_books_real):
    dict_books_clasters = {}

    for i in array_books_real:
        try:
            dict_books_clasters[i] = dict_books_all[i]
        except:
            dict_books_clasters[i] = [1, 1, 1, 1]

    X_array = dict_books_clasters.values()

    num_clusters = len(X_array) / 50

    k_means = cluster.KMeans(n_clusters=num_clusters)
    k_means.fit(X_array)
    # cluster label assigned to each book
    clusterized_array = list(k_means.labels_)

    for index, i in enumerate(dict_books_clasters.keys()):
        dict_books_clasters[i] = clusterized_array[index]

    return dict_books_clasters, num_clusters
Project: betasqaud | Author: AJacobs15
def __init__(self, league_df):

        stat_matrix = []
        for i in range(len(league_df)):
            stat = make_stat_vector(i, league_df)
            stat_matrix.append(stat)

        kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
        kmeans.fit(stat_matrix)

        centroid_array = kmeans.cluster_centers_

        positions = kmeans.predict(stat_matrix)

        league_df['vector'] = pd.Series(stat_matrix, index = league_df.index)
        league_df['position'] = pd.Series(positions, index = league_df.index)



        self.df = league_df
        self.centroids = kmeans.cluster_centers_

        self.map = make_position_map(centroid_array)
Project: LearnHash | Author: galad-loth
def PQTrain(data, lenSubVec,numSubCenter):
    (dataSize, dataDim)=data.shape
    if 0!=dataDim%lenSubVec:
        print "Cannot partition the feature space with the given segment number"
        return
    numSubVec=dataDim/lenSubVec
    centers=npy.zeros((numSubVec*numSubCenter,lenSubVec),dtype=npy.float32)
    distOfCenters=npy.zeros((numSubCenter,numSubCenter,numSubVec),dtype=npy.float32)
    objKmeans=KMeans(numSubCenter,'k-means++',3,100,0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing "+str(ii)+"-th sub-vector")
        objKmeans.fit(data[:,ii*lenSubVec:(ii+1)*lenSubVec]) 
        centers[ii*numSubCenter:(ii+1)*numSubCenter,:]= objKmeans.cluster_centers_
        distOfCenters[:,:,ii]=squareform(pdist(objKmeans.cluster_centers_,metric="euclidean"))
    model={"centers":centers,"distOfCenters":distOfCenters}   
    return model
Project: LearnHash | Author: galad-loth
def PQEval(data,lenSubVec,numSubCenter,centersPQ):
    (dataSize, dataDim)=data.shape
    if 0!=dataDim%lenSubVec:
        print "Cannot partition the feature space with the given segment number"
        return
    numSubVec=dataDim/lenSubVec
    codePQ=-npy.ones((dataSize, numSubVec),dtype=npy.int32)
    objKmeans=KMeans(numSubCenter)
    if (centersPQ.shape[0]!=numSubVec*numSubCenter 
        or centersPQ.shape[1]!=lenSubVec):
        print "PQ model dimension is not compatible with input data"
        return
    for ii in range(numSubVec):
        objKmeans.cluster_centers_=centersPQ[ii*numSubCenter:(ii+1)*numSubCenter,:]
        codePQ[:,ii]=objKmeans.predict(data[:,ii*lenSubVec:(ii+1)*lenSubVec])
    return codePQ
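
A hypothetical round trip through PQTrain and PQEval on random descriptors (both functions are Python 2 code, relying on print statements, integer division, and an old positional KMeans signature):

import numpy as npy

data = npy.random.rand(1000, 64).astype(npy.float32)
model = PQTrain(data, lenSubVec=8, numSubCenter=16)   # 8 sub-vectors of length 8, 16 centers each
codes = PQEval(data, 8, 16, model["centers"])         # (1000, 8) int32 PQ codes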
Project: crankshaft | Author: CartoDB
def spatial(self, query, no_clusters, no_init=20):
        """
            find centers based on clusters of latitude/longitude pairs
            query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id"}

        data = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = data[0]['xs']
        ys = data[0]['ys']
        ids = data[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)
Project: cellranger | Author: 10XGenomics
def compute_readpairs_per_umi_threshold(reads, subsample_rate):
    ''' Compute a threshold above which the UMIs are unlikely to be PCR off-products.
        reads (np.array(int)) - Read pairs for each UMI
        subsample_rate (float) - Subsample reads to this fraction.
        Returns threshold (int) - The RPPU threshold in the subsampled space '''

    if len(np.unique(reads)) < 2:
        print 'Skipping RPPU threshold calculation.'
        return 1

    print 'RPPU subsample rate: %0.4f' % subsample_rate

    reads = np.random.binomial(reads, subsample_rate)
    reads = reads[reads > 0]

    if len(np.unique(reads)) < 2:
        print 'Subsampling gave a degenerate distribution of RPPU. Skipping RPPU threshold calculation.'
        return 1

    new_n50 = tk_stats.NX(reads, 0.5)

    print 'New N50: %d:' % new_n50

    # Log-transform counts
    log_reads = np.log(reads)

    # Run K-Means. Reshape necessary because kmeans takes a matrix.
    kmeans = sk_cluster.KMeans(2).fit(log_reads.reshape((-1,1)))
    kmeans.predict(log_reads.reshape((-1,1)))

    # Take the cluster with the smallest mean
    min_cluster = np.argsort(np.ravel(kmeans.cluster_centers_))[0]

    print 'RPPU component means: ' + str(list(iter(np.exp(kmeans.cluster_centers_))))
    print 'RPPU component members: ' + str(np.bincount(kmeans.labels_))

    # Take the max element in the min-cluster
    threshold = np.max(reads[kmeans.labels_ == min_cluster])

    return threshold
Project: rosie | Author: datasciencebr
def fit(self, X):
        _X = X[self.__applicable_rows(X)]
        companies = _X.groupby('recipient_id').apply(self.__company_stats) \
            .reset_index()
        companies = companies[self.__applicable_company_rows(companies)]

        self.cluster_model = KMeans(n_clusters=3)
        self.cluster_model.fit(companies[self.CLUSTER_KEYS])
        companies['cluster'] = self.cluster_model.predict(companies[self.CLUSTER_KEYS])
        self.clusters = companies.groupby('cluster') \
            .apply(self.__cluster_stats) \
            .reset_index()
        self.clusters['threshold'] = \
            self.clusters['mean'] + 4 * self.clusters['std']
        return self
Project: SnapStitch | Author: avikj
def get_clusters_from_frames(frame_dir=None):

  # TODO: allow multiple frame directories to be processed at once
  if frame_dir is None:
    filename_to_embedding = pickle.load(open('temp/temp_vid1_290717183249/filename_to_emb.pkl')) # TODO: call get_inception_embeddings on frame dir, but for now just use the pickle
    embs = []
    filenames = []
    for filename, embedding in filename_to_embedding.iteritems():
      embs.append(embedding)
      filenames.append(filename)
    filenames = [filename[filename.rindex('/')+1:] for filename in filenames]
    embs = np.array(embs)
    candidates = [(11, 6)]
    candidates = [(eps, min_pts) for eps in range(7, 15) for min_pts in range(2, 10)]
    labels = cluster(embs, filenames, algorithm='KMeans', n_clusters=6)
Project: cg | Author: michaelhabeck
def kmeans(X, K):
        km = KMeans(K).fit(X)
        return km.cluster_centers_
Project: onionstack | Author: ntddk
def main():
    features = []

    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy = False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0

    for i in list:
        filename = i.split('/')[-1]
        print filename,
        print km.labels_[cnt]
        shutil.copyfile(i, './result/' +  str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
Project: dsbox-cleaning | Author: usc-isi-i2
def _discretize_by_kmeans(col, num_bins, random_state):
    nan_idx = col[col.isnull()].index
    kmeans = KMeans(n_clusters=num_bins, random_state=random_state)
    kmeans = kmeans.fit(col.dropna().values.T.reshape(-1, 1))
    group = kmeans.labels_
    if col.isnull().sum() > 0:
        group = group.astype(float)
        for idx in nan_idx:
            group = np.insert(group,idx,np.nan)
    return pd.Series(group)
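
A minimal sketch of the helper above on a Series with a missing value (NaNs pass through unbinned):

import numpy as np
import pandas as pd

col = pd.Series([1.0, 2.0, np.nan, 10.0, 11.0, 50.0])
bins = _discretize_by_kmeans(col, num_bins=3, random_state=0)
# -> a Series of cluster ids, float-typed because the NaN is reinserted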