The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.KMeans().
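Before the project examples, here is a minimal, self-contained sketch of the basic fit/predict pattern that most of them share. The data and parameter choices below are synthetic and purely illustrative:

import numpy as np
from sklearn.cluster import KMeans

# Synthetic 2-D data: two well-separated blobs (illustrative only).
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)),
               rng.normal(8, 1, (50, 2))])

kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
labels = kmeans.fit_predict(X)     # cluster index for each sample
centers = kmeans.cluster_centers_  # (n_clusters, n_features) centroids
print(labels[:5], centers.shape)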
def compress_image(img, num_clusters):
    # Convert input image into (num_samples, num_features)
    # array to run kmeans clustering algorithm
    X = img.reshape((-1, 1))

    # Run kmeans on input data
    kmeans = cluster.KMeans(n_clusters=num_clusters, n_init=4, random_state=5)
    kmeans.fit(X)
    centroids = kmeans.cluster_centers_.squeeze()
    labels = kmeans.labels_

    # Assign each value to the nearest centroid and
    # reshape it to the original image shape
    input_image_compressed = np.choose(labels, centroids).reshape(img.shape)
    return input_image_compressed
def k_means_cluster_Predict(data_list, info):
    array_diagnal = np.array([[data_list[0][x], data_list[1][x]] for x in range(len(data_list[0]))])
    ks = list(range(1, len(info)))
    KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(array_diagnal) for i in ks]
    BIC = [compute_bic(kmeansi, array_diagnal) for kmeansi in KMeans]
    ks_picked = ks[BIC.index(max(BIC))]
    if ks_picked == 1:
        return [data_list]
    else:
        out = []
        std_rec = [scipy.std(data_list[0]), scipy.std(data_list[1])]
        whitened = whiten(array_diagnal)
        centroids, distortion = kmeans(whitened, ks_picked)
        idx, _ = vq(whitened, centroids)
        for x in range(ks_picked):
            group1 = [[int(i) for i in array_diagnal[idx == x, 0]],
                      [int(i) for i in array_diagnal[idx == x, 1]]]
            out.append(group1)
        return out
def run_kmeans(transformed_pca_matrix, n_clusters, random_state=None):
    if random_state is None:
        random_state = cr_constants.RANDOM_STATE

    kmeans = sk_cluster.KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(transformed_pca_matrix) + 1

    cluster_score = compute_db_index(transformed_pca_matrix, kmeans)

    clusters = cr_clustering.relabel_by_size(clusters)

    clustering_key = cr_clustering.format_clustering_key(cr_clustering.CLUSTER_TYPE_KMEANS, n_clusters)

    return cr_clustering.create_clustering(clusters=clusters,
                                           num_clusters=n_clusters,
                                           cluster_score=cluster_score,
                                           clustering_type=cr_clustering.CLUSTER_TYPE_KMEANS,
                                           global_sort_key=n_clusters,
                                           description=cr_clustering.humanify_clustering_key(clustering_key))
def step4():
    key_vec = pickle.loads(open("key_vec.pkl", "rb").read())
    vecs = []
    for ev, vec in enumerate(key_vec.values()):
        x = np.array(vec)
        if np.isnan(x).any():
            # print(vec)
            continue
        vecs.append(x)
    vecs = np.array(vecs)
    kmeans = KMeans(n_clusters=128, init='k-means++', n_init=10, max_iter=300,
                    tol=0.0001, precompute_distances='auto', verbose=0,
                    random_state=None, copy_x=True, n_jobs=1)
    print("now fitting...")
    kmeans.fit(vecs)
    open("kmeans.model", "wb").write(pickle.dumps(kmeans))
    for p in kmeans.predict(vecs):
        print(p)
def cluster(data, true_labels, n_clusters=3):
    km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    km.fit(data)
    km_means_labels = km.labels_
    km_means_cluster_centers = km.cluster_centers_
    km_means_labels_unique = np.unique(km_means_labels)
    colors_ = cycle(colors.cnames.keys())

    initial_dim = np.shape(data)[1]
    data_2 = tsne(data, 2, initial_dim, 30)

    plt.figure(figsize=(12, 6))
    plt.scatter(data_2[:, 0], data_2[:, 1], c=true_labels)
    plt.title('True Labels')

    return km_means_labels
def init_centers_widths(self, R):
    """Initialize prior of centers and widths

    Returns
    -------
    centers : 2D array, with shape [K, n_dim]
        Prior of factors' centers.

    widths : 1D array, with shape [K, 1]
        Prior of factors' widths.
    """
    kmeans = KMeans(
        init='k-means++',
        n_clusters=self.K,
        n_init=10,
        random_state=100)
    kmeans.fit(R)
    centers = kmeans.cluster_centers_
    widths = self._get_max_sigma(R) * np.ones((self.K, 1))
    return centers, widths
def all_cluster():
    # The commented-out block below builds 'dataframe.csv' from the
    # 'between' vectors stored in 'sort_between.txt' (run once).
    # bet_dic = {}
    # fin = open('sort_between.txt', 'r')
    # while True:
    #     line = fin.readline()
    #     if line:
    #         line = line.strip()
    #         between, vec = line.split('^')
    #         vec = vec.strip('[')
    #         vec = vec.strip(']')
    #         vec = vec.split(',')
    #         bet_dic[between] = vec
    #     else:
    #         break
    # bet_dic = pd.DataFrame(bet_dic)
    # bet_dic = bet_dic.T
    # bet_dic.to_csv('dataframe.csv')
    # fin.close()
    df = pd.read_csv('dataframe.csv')
    clf = KMeans(n_clusters=50)
    s = clf.fit(df.iloc[1:, 1:])  # .iloc is required for positional slicing
    print(s)
def k_means_clustering(self, out_path, pd_data, number_of_clusters):
    headers, repos, features = self.__fetch_data(pd_data)
    # apply kmeans algorithm
    kmeans = KMeans(n_clusters=number_of_clusters, random_state=0, n_init=200).fit(features)

    # form clusters
    clusters = []
    for i in range(0, number_of_clusters):  # k clusters
        repo_list = []
        for j in range(0, len(kmeans.labels_)):  # a label for each repo.
            if i == kmeans.labels_[j]:  # if repo label is equal to cluster number
                repo_list.append(repos[j])  # add repo to cluster i's list.
        clusters.append(repo_list)

    out_file_path = os.path.join(out_path, "kmeans_noOfClusters" + str(number_of_clusters))
    self.__export_k_means_results(kmeans, headers, clusters, out_file_path)  # avoid ".csv"
def cluster(X, seed=0, n_clusters=20, alg='kmeans'):
    """
    Perform k-means on given X data.

    For alg, use one of: 'kmeans' (sklearn KMeans) or
    'spherical' (SphericalKMeans).

    returns (X pred clusters, cluster centers)

    NOTE: euclidean tends to perform very poorly
    """
    # log("Clustering k-means with {} clusters".format(n_clusters))
    if alg == 'kmeans':
        Model = KMeans
    elif alg == 'spherical':
        # inplace l2 normalization (spherical k-means assumes this)
        normalize(X, 'l2', copy=False)
        Model = SphericalKMeans

    kmeans = Model(n_clusters=int(n_clusters), random_state=seed)
    pred_clusters = kmeans.fit_predict(X)
    return pred_clusters, kmeans.cluster_centers_
def __init__(self, edges, branching_factor=50, threshold=0.1):
    # Make features list.
    features = []
    for i in range(len(edges)):
        edge = edges[i]
        features.append([edge['perimeter'], edge['area'],
                         edge['shape_factor'], edge['radius_deviation']])
    features = np.array(features)

    # Normalize features
    normed_features = features.copy()
    for i in range(features.shape[1]):
        avg = np.median(features[::, i])
        std = np.std(features[::, i])
        normed_features[::, i] -= avg
        normed_features[::, i] /= avg

    self.features = features
    self.normed_features = normed_features
    self.branching_factor = branching_factor
    self.threshold = threshold

    #self.run(Birch, branching_factor=50, threshold=0.1, n_clusters=2)
    self.run(KMeans, n_clusters=2)
    #self.run(AgglomerativeClustering, n_clusters=2)
def color_differenciate(img: Image, k: int):
    imgarr = img2array(img)
    imgarr_r = imgarr.reshape((imgarr.shape[0] * imgarr.shape[1], 3))
    clt = KMeans(n_clusters=k)
    clt.fit(imgarr_r)
    numLabels = np.arange(0, len(np.unique(clt.labels_)) + 1)
    images = []
    for i in range(len(numLabels)):
        images.append(np.ones(imgarr_r.shape, dtype=np.int32) * 255)
    for idx in range(len(clt.labels_)):
        label = clt.labels_[idx]
        images[label][idx][0] = imgarr_r[idx][0]
        images[label][idx][1] = imgarr_r[idx][1]
        images[label][idx][2] = imgarr_r[idx][2]
    new_images = []
    for i in range(len(numLabels)):
        new_img = array2img(images[i].reshape(imgarr.shape))
        new_img.save('test_' + str(i) + '.jpg')
        new_images.append(new_img)
    return new_images
def get_plot(x, y, k, iris=iris):
    k_means = KMeans(n_clusters=k)
    k_means.fit(iris.data)

    colormap = rainbow(np.linspace(0, 1, k))

    fig = plt.figure()
    splt = fig.add_subplot(1, 1, 1)
    splt.scatter(iris.data[:, x], iris.data[:, y], c=colormap[k_means.labels_], s=40)
    splt.scatter(k_means.cluster_centers_[:, x], k_means.cluster_centers_[:, y],
                 c='black', marker='x')
    splt.set_xlabel(iris.feature_names[x])
    splt.set_ylabel(iris.feature_names[y])

    figfile = BytesIO()
    plt.savefig(figfile, format='png')
    figfile.seek(0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode()
    return figdata_png
def update():
    # Get the current slider values
    N = clusters.value
    x_var = axis_map[x_axis.value]
    y_var = axis_map[y_axis.value]

    k_means = KMeans(n_clusters=N)
    k_means.fit(iris.data)
    centroids = k_means.cluster_centers_

    palette = sns.palettes.color_palette('hls', N)
    colormap = np.array(palette.as_hex())[k_means.labels_]
    # as_hex is necessary for bokeh to render the colors properly.

    plot.xaxis.axis_label = x_axis.value
    plot.yaxis.axis_label = y_axis.value

    source.data = dict(x=iris.data[:, x_var],
                       y=iris.data[:, y_var],
                       colors=colormap)
    centers.data = dict(cx=centroids[:, x_var],
                        cy=centroids[:, y_var])
def KmeansWrapper(true_k, data, load=False):
    from sklearn.externals import joblib

    modelName = 'doc_cluster.%s.plk' % true_k

    if load:
        km = joblib.load(modelName)
        labels = km.labels_
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    # max_iter=1000,
                    n_init=10,
                    n_jobs=-1,
                    random_state=0,
                    verbose=0)
        km.fit_predict(data)
        labels = km.labels_
        joblib.dump(km, modelName)

    return labels, km.cluster_centers_
def elbowMethod(X, k=21):
    distortions = []
    for i in range(1, k):
        km2 = KMeans(n_clusters=i,
                     init='k-means++',
                     n_init=10,
                     random_state=0,
                     n_jobs=-1,
                     verbose=0)
        km2.fit(X)
        distortions.append(km2.inertia_)
        print('k=%s, Distortion: %.2f' % (i, km2.inertia_))

    plt.plot(range(1, k), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()
def gridSearch(data, params, true_k):
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])

    gsTfIdf = GridSearchCV(lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def k_means_cluster(data_list):
    if max(data_list[0]) - min(data_list[0]) > 10 and max(data_list[1]) - min(data_list[1]) > 10:
        array_diagnal = np.array([[data_list[0][x], data_list[1][x]] for x in range(len(data_list[0]))])
        ks = list(range(1, min([5, len(data_list[0]) + 1])))
        KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(array_diagnal) for i in ks]
        KMeans_predict = [cluster.KMeans(n_clusters=i, init="k-means++").fit_predict(array_diagnal) for i in ks]
        BIC = []
        BIC_rec = []
        for x in ks:
            if KMeans_predict[x - 1].max() < x - 1:
                continue
            else:
                BIC_i = compute_bic(KMeans[x - 1], array_diagnal)
                if abs(BIC_i) < 10 ** 8:
                    BIC.append(BIC_i)
                    BIC_rec.append(x)
        # BIC = [compute_bic(kmeansi, array_diagnal) for kmeansi in KMeans]
        # ks_picked = ks[BIC.index(max(BIC))]
        ks_picked = BIC_rec[BIC.index(max(BIC))]
        if ks_picked == 1:
            return [data_list]
        else:
            out = []
            std_rec = [scipy.std(data_list[0]), scipy.std(data_list[1])]
            whitened = whiten(array_diagnal)
            centroids, distortion = kmeans(whitened, ks_picked)
            idx, _ = vq(whitened, centroids)
            for x in range(ks_picked):
                group1 = [[int(i) for i in array_diagnal[idx == x, 0]],
                          [int(i) for i in array_diagnal[idx == x, 1]]]
                out.append(group1)
            return out
    else:
        return [data_list]
def kmeans_aic(model, X, **kwargs):
    '''AIC (Akaike Information Criterion) for k-means for model selection

    Parameters:
        :model: An elm.pipeline.Pipeline with KMeans or MiniBatchKMeans
                as the final step in the Pipeline
        :X: The X data that were just given to "fit", or "partial_fit"
        :kwargs: placeholder - ignored

    Returns:
        :AIC: float
    '''
    k, m = model._estimator.cluster_centers_.shape
    if isinstance(X, xr.DataArray):
        n = X.flat.values.shape[0]
    else:
        n = X.shape[0]
    d = model._estimator.inertia_
    aic = d + 2 * m * k
    delattr(model._estimator, 'labels_')
    return aic
def _init(self, X, lengths=None):
    super(GaussianHMM, self)._init(X, lengths=lengths)

    _, n_features = X.shape
    if hasattr(self, 'n_features') and self.n_features != n_features:
        raise ValueError('Unexpected number of dimensions, got %s but '
                         'expected %s' % (n_features, self.n_features))

    self.n_features = n_features
    if 'm' in self.init_params or not hasattr(self, "means_"):
        kmeans = cluster.KMeans(n_clusters=self.n_components,
                                random_state=self.random_state)
        kmeans.fit(X)
        self.means_ = kmeans.cluster_centers_
    if 'c' in self.init_params or not hasattr(self, "covars_"):
        cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
        if not cv.shape:
            cv.shape = (1, 1)
        self._covars_ = distribute_covar_matrix_to_match_covariance_type(
            cv, self.covariance_type, self.n_components).copy()
def ConsensusCluster(self, data, subsamples, subsample_fraction, norm_var, kvalues):
    """
    Performs consensus clustering algorithms here!!!
    """
    return  # early return disables this function; the body below is unreachable

    partition = dict()
    stuff = []
    nb_clusters = 0  # the number of clusters the dataset is supposed to be partitioned into
    distances = nx.to_numpy_matrix(data)
    for i in kvalues:
        clusterid, error, nfound = KMeans(distances, nclusters=i, npass=300)
        uniq_ids = list(set(clusterid))
        new_ids = [uniq_ids.index(val) for val in clusterid]
        for i, value in enumerate(new_ids):
            partition[i] = value
        stuff.append(partition)
def fit(self, data):
    """
    fit model on data
    """
    self.data = data
    kmeans = KMeans(n_clusters=self.n_clusters)
    kmeans.fit(data)
    self.clusterer = kmeans
    logging.info('Fit has been completed')
    self.data_clusters = self.clusterer.predict(data)
    self.cluster_centers = self.clusterer.cluster_centers_
    logging.info('Cluster calculation has been completed')
    self.__clusters_separation()
    logging.info('Cluster separation has been completed')
    self.__cluster_avg_distances()
    logging.info('Cluster avg distances has been calculated')
def stratify_by_features(features, n_strata, **kwargs):
    """Stratify by clustering the items in feature space

    Parameters
    ----------
    features : array-like, shape=(n_items, n_features)
        feature matrix for the pool, where rows correspond to items and
        columns correspond to features.

    n_strata : int
        number of strata to create.

    **kwargs :
        passed to sklearn.cluster.KMeans

    Returns
    -------
    Strata instance
    """
    n_items = features.shape[0]
    km = KMeans(n_clusters=n_strata, **kwargs)
    allocations = km.fit_predict(X=features)
    return Strata(allocations)
def cluster(centers):
    n_class = int(len(centers) * 0.18)
    est = KMeans(n_clusters=n_class, max_iter=1000)
    est.fit(centers)
    new_list = []
    for x, y in est.cluster_centers_:
        min_num = 10000
        min_x = -1
        min_y = -1
        for x_, y_ in centers:
            dist = distance(x, y, x_, y_)
            if (dist < min_num) or (min_x == -1):
                min_num = dist
                min_x = x_
                min_y = y_
        new_list.append([min_x, min_y])
    return new_list
def noise_removal(aud_sample):
    if min(abs(aud_sample)) == 0:
        return aud_sample

    data = abs(np.copy(aud_sample))
    clf = KMeans(n_clusters=2, n_init=5)
    data = data.reshape(-1, 1)
    clf.fit(data)

    if clf.cluster_centers_[0] < clf.cluster_centers_[1]:
        noise = 0
    else:
        noise = 1

    aud = np.copy(aud_sample)
    window = 500
    windowStride = 50
    for i in range(0, len(clf.labels_), windowStride):
        if sum(clf.labels_[i:i + window] == noise) == window:
            aud[i:i + window] = 0

    return aud
def calculate():
    from sklearn.metrics import mean_squared_error
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')
    for k in range(2, 22):
        cluster = KMeans(k, init='k-means++', random_state=241)
        cluster.fit(X)
        reduced_image = recreate_image(cluster.cluster_centers_, cluster.labels_, h, w, d)
        mse = np.mean((image - reduced_image) ** 2)
        psnr = 10 * np.log10(1.0 / mse)  # assumes pixel values scaled to [0, 1]
        plot(reduced_image, "plots/plot%d.png" % k)
        print("k: %d, mse: %.2f psnr: %.2f" % (k, mse, psnr))
        if psnr > 20:
            return k
def evaluate_kmeans(X, model):
    """
    Evaluate a K-Means model that has been trained on X using the
    Silhouette score.

    Args:
        X: the TF-IDF matrix where each line represents a document and
           each column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X.

    Returns:
        A double that corresponds to the Silhouette score of the model.
    """
    return silhouette_score(X, model.labels_)

# Ex2
def agglomerative_clustering(X, k=10):
    """
    Run an agglomerative clustering on X.

    Args:
        X: the TF-IDF matrix where each line represents a document and
           each column represents a word, typically obtained by running
           transform_text() from the TP2.
        k: the number of clusters we want (default: 10).

    Returns:
        An AgglomerativeClustering model trained on X.
    """
    model = AgglomerativeClustering(n_clusters=k)
    model.fit(X)
    # Note all the other functions are the same except we use
    # 'AgglomerativeClustering' instead of 'KMeans'.
    return model

# Ex4.1
def cluster_kmeans(X_train, model_args=None, gridsearch=True):
    from sklearn.cluster import KMeans
    print('KMeans')

    if gridsearch is True:
        param_grid = {
            'n_clusters': np.arange(1, 20, 2),
            'max_iter': [50, 100, 300],
            'tol': [1e-5, 1e-4, 1e-3]
        }
        prune(param_grid, model_args)
    else:
        if 'n_clusters' not in model_args:
            raise KeyError('Need to define n_clusters for KMeans')
        param_grid = None

    return ModelWrapper(KMeans, X=X_train, model_args=model_args,
                        param_grid=param_grid, unsupervised=True)
def getFlatVolume(series_volumes):
    """Return the typical (flat) volume level: the center of the most
    populated of 3 k-means clusters. return: float"""
    results = np.array(series_volumes)
    results_n = np.zeros((len(results), 2))
    results_n[:, 0] = 1
    results_n[:, 1] = np.array(results)
    # cluster the volumes into 3 groups and pick the center of the
    # group with the most members
    k = KMeans(3)
    k.fit(results_n)
    df = pd.DataFrame(k.labels_)
    df_c = pd.DataFrame(k.cluster_centers_)
    v = []
    for i in range(3):
        v.append(df[df[0] == i].count()[0])
    df_c[2] = v
    return df_c.iloc[df_c[2].argmax()][1]

#----------------------------------------------------------------------
def clusterFacetSamplesKNN(self, reduceRatio=3, maxNPnts=5):
    """
    cluster the samples of each facet using k nearest neighbors
    the cluster centers and their correspondent normals will be saved
    in self.objsamplepnts_refcls and self.objsamplenrmls_refcls

    :param reduceRatio: the ratio of points to reduce
    :param maxNPnts: the maximum number of points on a facet
    :return: None

    author: weiwei
    date: 20161129, tsukuba
    """
    self.objsamplepnts_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=np.object)
    self.objsamplenrmls_refcls = np.ndarray(shape=(self.facets.shape[0],), dtype=np.object)
    for i, facet in enumerate(self.facets):
        self.objsamplepnts_refcls[i] = np.empty(shape=(0, 0))
        self.objsamplenrmls_refcls[i] = np.empty(shape=(0, 0))
        X = self.objsamplepnts_ref[i]
        nX = X.shape[0]
        if nX > reduceRatio:
            # integer division so n_clusters is an int
            kmeans = KMeans(n_clusters=maxNPnts if nX // reduceRatio > maxNPnts else nX // reduceRatio,
                            random_state=0).fit(X)
            self.objsamplepnts_refcls[i] = kmeans.cluster_centers_
            self.objsamplenrmls_refcls[i] = np.tile(self.facetnormals[i],
                                                    [self.objsamplepnts_refcls.shape[0], 1])
def word_cluster(data, labels, k):
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(data)
    for i, label in enumerate(labels):
        print(label, k_means.labels_[i])
    d = defaultdict(list)
    for c, l in zip(k_means.labels_, labels):
        d['cluster' + str(c)].append(l.name())
    fname = 'results/clusters'
    if use_wordnet:
        fname += "_wn"
    if use_wordvectors:
        fname += "_wv"
    fname += '_k' + str(k) + '.json'
    with codecs.open(fname, 'wb', 'utf-8') as outfile:
        outfile.write(json.dumps(d, indent=True))
    print(' * Saved results to', fname)

    # create histogram of cluster sizes
    histogram(d)
def KMeansAccuracy():
    clusterer = KMeans(n_clusters=2, n_init=30)
    tdm = pickle.load(open(DATASET_PATH + "BOW.p", "rb"))
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    numerical_mapped_1 = [0 if i == "Israeli" else 1 for i in true_labels]
    numerical_mapped_2 = [1 if i == "Israeli" else 0 for i in true_labels]
    one = f1_score(numerical_mapped_1, predictions)
    two = f1_score(numerical_mapped_2, predictions)
    print("The F1 score of KMeans on BOW is: " + str(max(one, two)))

    clusterer = KMeans(n_clusters=2, n_init=30)
    predictions = clusterer.fit_predict(tdm)
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
    accuracy = predict_accuracy(true_labels, predictions)
    print("The F1 score of KMeans on BOW (w/Tdidf) is: " + accuracy)
def learn_color_clusters():
    samples = np.zeros((0, 3))
    cnt = 0
    with open('train_list') as f:
        for line in f:
            line = line[:-1]
            image = cv2.imread(line)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
            points = image.reshape((-1, 3))
            # randomly shuffle the pixels, then keep the first 50
            points = points[np.random.permutation(points.shape[0])]
            samples = np.vstack([samples, points[:50]])
            print(samples.shape)
            cnt = cnt + 1
            if cnt % 10000 == 0:
                break
    km = cluster.KMeans(n_clusters=50, n_jobs=-1)
    km.fit(samples)
    np.save('lab_clusters.npy', km.cluster_centers_)
    return

#learn_color_clusters()
def test_estimator_instance(self):
    """
    Test that isestimator works for instances
    """
    models = (
        LinearRegression(),
        LogisticRegression(),
        KMeans(),
        LSHForest(),
        PCA(),
        RidgeCV(),
        LassoCV(),
        RandomForestClassifier(),
    )

    for model in models:
        self.assertTrue(isestimator(model))
def test_estimator_class(self):
    """
    Test that isestimator works for classes
    """
    models = (
        LinearRegression,
        LogisticRegression,
        KMeans,
        LSHForest,
        PCA,
        RidgeCV,
        LassoCV,
        RandomForestClassifier,
    )

    for model in models:
        self.assertTrue(inspect.isclass(model))
        self.assertTrue(isestimator(model))
def get_cluster_threshold(weights):
    estimator = KMeans(n_clusters=2)
    data = np.asarray(weights)
    data = data.reshape(-1, 1)
    # print(data)
    clusters_idx = estimator.fit_predict(data)
    max_idx = data.argmax()
    max_cluster = clusters_idx[max_idx]
    # print(max_cluster)
    low_cluster = []
    if max_cluster == 1:
        indices = np.argwhere(clusters_idx == 0)
        for idx in indices:
            low_cluster.append(data[idx])
        threshold = max(low_cluster)
        threshold = threshold[0][0]
    else:
        indices = np.argwhere(clusters_idx == 1)
        for idx in indices:
            low_cluster.append(data[idx])
        threshold = max(low_cluster)
        threshold = threshold[0][0]
    # print(threshold)
    return threshold
def make_clast_books(dict_books_all, array_books_real):
    dict_books_clasters = {}
    for i in array_books_real:
        try:
            dict_books_clasters[i] = dict_books_all[i]
        except:
            dict_books_clasters[i] = [1, 1, 1, 1]
    X_array = list(dict_books_clasters.values())
    num_clusters = len(X_array) // 50  # integer division so n_clusters is an int
    k_means = cluster.KMeans(n_clusters=num_clusters)
    k_means.fit(X_array)
    # cluster label assigned to each book
    clusterized_array = list(k_means.labels_)
    for index, i in enumerate(dict_books_clasters.keys()):
        dict_books_clasters[i] = clusterized_array[index]
    return dict_books_clasters, num_clusters
def __init__(self, league_df):
    stat_matrix = []
    for i in range(len(league_df)):
        stat = make_stat_vector(i, league_df)
        stat_matrix.append(stat)

    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
    kmeans.fit(stat_matrix)
    centroid_array = kmeans.cluster_centers_
    positions = kmeans.predict(stat_matrix)

    league_df['vector'] = pd.Series(stat_matrix, index=league_df.index)
    league_df['position'] = pd.Series(positions, index=league_df.index)

    self.df = league_df
    self.centroids = kmeans.cluster_centers_
    self.map = make_position_map(centroid_array)
def PQTrain(data, lenSubVec, numSubCenter):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
    centers = npy.zeros((numSubVec * numSubCenter, lenSubVec), dtype=npy.float32)
    distOfCenters = npy.zeros((numSubCenter, numSubCenter, numSubVec), dtype=npy.float32)
    # positional args: n_clusters, init, n_init, max_iter, tol
    objKmeans = KMeans(numSubCenter, 'k-means++', 3, 100, 0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing " + str(ii) + "-th sub-vector")
        objKmeans.fit(data[:, ii * lenSubVec:(ii + 1) * lenSubVec])
        centers[ii * numSubCenter:(ii + 1) * numSubCenter, :] = objKmeans.cluster_centers_
        distOfCenters[:, :, ii] = squareform(pdist(objKmeans.cluster_centers_, metric="euclidean"))
    model = {"centers": centers, "distOfCenters": distOfCenters}
    return model
def PQEval(data, lenSubVec, numSubCenter, centersPQ):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
    codePQ = -npy.ones((dataSize, numSubVec), dtype=npy.int32)
    objKmeans = KMeans(numSubCenter)
    if (centersPQ.shape[0] != numSubVec * numSubCenter
            or centersPQ.shape[1] != lenSubVec):
        print("PQ model dimension is not compatible with input data")
        return
    for ii in range(numSubVec):
        objKmeans.cluster_centers_ = centersPQ[ii * numSubCenter:(ii + 1) * numSubCenter, :]
        codePQ[:, ii] = objKmeans.predict(data[:, ii * lenSubVec:(ii + 1) * lenSubVec])
    return codePQ
def spatial(self, query, no_clusters, no_init=20):
    """
    find centers based on clusters of latitude/longitude pairs

    query: SQL query that has a WGS84 geometry (the_geom)
    """
    params = {"subquery": query,
              "geom_col": "the_geom",
              "id_col": "cartodb_id"}

    data = self.data_provider.get_spatial_kmeans(params)

    # Unpack query response
    xs = data[0]['xs']
    ys = data[0]['ys']
    ids = data[0]['ids']

    km = KMeans(n_clusters=no_clusters, n_init=no_init)
    labels = km.fit_predict(list(zip(xs, ys)))  # list() needed in Python 3
    return list(zip(ids, labels))
def compute_readpairs_per_umi_threshold(reads, subsample_rate):
    ''' Compute a threshold above which the UMIs are unlikely to be PCR off-products.
        reads (np.array(int)) - Read pairs for each UMI
        subsample_rate (float) - Subsample reads to this fraction.
        Returns threshold (int) - The RPPU threshold in the subsampled space '''
    if len(np.unique(reads)) < 2:
        print('Skipping RPPU threshold calculation.')
        return 1

    print('RPPU subsample rate: %0.4f' % subsample_rate)

    reads = np.random.binomial(reads, subsample_rate)
    reads = reads[reads > 0]

    if len(np.unique(reads)) < 2:
        print('Subsampling gave a degenerate distribution of RPPU. Skipping RPPU threshold calculation.')
        return 1

    new_n50 = tk_stats.NX(reads, 0.5)

    print('New N50: %d:' % new_n50)

    # Log-transform counts
    log_reads = np.log(reads)

    # Run K-Means. Reshape necessary because kmeans takes a matrix.
    kmeans = sk_cluster.KMeans(2).fit(log_reads.reshape((-1, 1)))
    kmeans.predict(log_reads.reshape((-1, 1)))

    # Take the cluster with the smallest mean
    min_cluster = np.argsort(np.ravel(kmeans.cluster_centers_))[0]

    print('RPPU component means: ' + str(list(iter(np.exp(kmeans.cluster_centers_)))))
    print('RPPU component members: ' + str(np.bincount(kmeans.labels_)))

    # Take the max element in the min-cluster
    threshold = np.max(reads[kmeans.labels_ == min_cluster])

    return threshold
def fit(self, X):
    _X = X[self.__applicable_rows(X)]
    companies = _X.groupby('recipient_id').apply(self.__company_stats) \
        .reset_index()
    companies = companies[self.__applicable_company_rows(companies)]

    self.cluster_model = KMeans(n_clusters=3)
    self.cluster_model.fit(companies[self.CLUSTER_KEYS])
    companies['cluster'] = self.cluster_model.predict(companies[self.CLUSTER_KEYS])
    self.clusters = companies.groupby('cluster') \
        .apply(self.__cluster_stats) \
        .reset_index()
    self.clusters['threshold'] = \
        self.clusters['mean'] + 4 * self.clusters['std']
    return self
def get_clusters_from_frames(frame_dir=None):
    # TODO: allow multiple frame directories to be processed at once
    if frame_dir is None:
        filename_to_embedding = pickle.load(open('temp/temp_vid1_290717183249/filename_to_emb.pkl'))
    # TODO: call get_inception_embeddings on frame dir, but for now just use the pickle
    embs = []
    filenames = []
    for filename, embedding in filename_to_embedding.items():
        embs.append(embedding)
        filenames.append(filename)
    filenames = [filename[filename.rindex('/') + 1:] for filename in filenames]
    embs = np.array(embs)
    candidates = [(11, 6)]
    # the line above is immediately overwritten by the full grid below
    candidates = [(eps, min_pts) for eps in range(7, 15) for min_pts in range(2, 10)]
    labels = cluster(embs, filenames, algorithm='KMeans', n_clusters=6)
def kmeans(X, K):
    km = KMeans(K).fit(X)
    return km.cluster_centers_
def main():
    features = []

    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy=False).fit_transform(features)

    km = KMeans(init='k-means++', n_clusters=n_clusters)
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0
    for i in list:
        filename = i.split('/')[-1]
        print(filename, km.labels_[cnt])
        shutil.copyfile(i, './result/' + str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
def _discretize_by_kmeans(col, num_bins, random_state):
    nan_idx = col[col.isnull()].index
    kmeans = KMeans(n_clusters=num_bins, random_state=random_state)
    kmeans = kmeans.fit(col.dropna().values.T.reshape(-1, 1))
    group = kmeans.labels_
    if col.isnull().sum() > 0:
        group = group.astype(float)
        for idx in nan_idx:
            group = np.insert(group, idx, np.nan)
    return pd.Series(group)