The following 28 code examples, extracted from open-source Python projects, illustrate how to use the sklearn.cluster module.
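Before the project examples, here is a minimal, self-contained sketch of the typical sklearn.cluster workflow (fit an estimator, then read the learned labels and centroids). The synthetic dataset, the choice of KMeans, and all parameter values below are illustrative assumptions, not code taken from any of the projects.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Illustrative synthetic data: 300 points drawn around 4 centers.
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.7, random_state=0)

# Fit the estimator, then read the learned labels and cluster centers.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
labels = kmeans.fit_predict(X)        # cluster index for each sample
centers = kmeans.cluster_centers_     # array of shape (4, n_features)

print(np.bincount(labels))            # number of samples per cluster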
def _step5(arr):
    kmeans = pickle.loads(open("kmeans.model", "rb").read())
    key, lines, tipe = arr
    print(key)
    open("./tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key), "w").write("\n".join(lines))
    res = os.popen("./fasttext print-sentence-vectors ./models/model.bin < tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key)).read()
    w = open("tmp/tmp.{tipe}.{key}.json".format(tipe=tipe, key=key), "w")
    for line in res.split("\n"):
        try:
            vec = list(map(float, line.split()[-100:]))
        except:
            print(line)
            print(res)
            continue
        x = np.array(vec)
        if np.isnan(x).any():
            continue
        cluster = kmeans.predict([vec])
        txt = line.split()[:-100]
        obj = {"txt": txt, "cluster": cluster.tolist()}
        data = json.dumps(obj, ensure_ascii=False)
        w.write(data + "\n")
def step6():
    for tipe in ["news", "nocturne"]:
        names = [name for name in reversed(sorted(glob.glob("./tmp/tmp.{tipe}.*.json".format(tipe=tipe))))]
        size = len(names)
        for en, name in enumerate(names):
            term_clus = {}
            oss = []
            with open(name) as f:
                for line in f:
                    line = line.strip()
                    oss.append(json.loads(line))
            for i in range(3, len(oss) - 3):
                terms = set(oss[i]["txt"])
                for term in terms:
                    if term_clus.get(term) is None:
                        term_clus[term] = [0.0]*128
                    cd = [oss[i+d]["cluster"][0] for d in [-3, -2, -1, 1, 2, 3]]
                    for c in cd:
                        term_clus[term][c] += 1.0
            print("{}/{} finished {}".format(en, size, name))
            open("{tipe}.term_clus.pkl".format(tipe=tipe), "wb").write(pickle.dumps(term_clus))
def do_kmeans(data, k):
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(data)
    means = km.cluster_centers_.reshape((-1,))
    # initialize standard deviations with distances between random cluster centers
    sds = []
    for i in range(means.shape[0]):
        # choose any 2 means and take half the distance between them
        x, y = np.random.choice(means, 2, replace=False)
        sds.append((x-y)/2)
    sds = np.abs(np.array(sds))
    return (means, sds)

# expectation maximization for gmm
# use_kmeans: whether to initialize using kmeans or randomly
# use_priors: whether to model the prior distribution;
#             this attaches a weight to each distribution that tells us
#             the percentage of points generated from that distribution
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    computes the macro F1 score
    confusion matrix : requires permutation
    matching according to which matrix must be permuted
    """
    ##Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ##Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP/(TP + FP)
        recall = TP/(TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
def split_into_intervals(data, n):
    """ Split time series into n minute intervals """
    # Throw away time, bid/ask numbers
    prices = [x[1] for x in data]
    # create a len n-1 array of price differences (10 second increments)
    price_diffs = np.diff(prices)
    # m = interval length in terms of data points (6*~10sec = 1 minute)
    m = n * 6
    # each datapoint we're trying to cluster will be of the form:
    # (xi,yi) = (time series of prices, price change after series)
    intervals = np.zeros((len(prices)-1, m+1))
    for i in range(0, len(prices)-m-1):
        intervals[i, 0:m] = prices[i:i+m]
        intervals[i, m] = price_diffs[i+m]
    return intervals
def plot_data(*data):
    '''
    graph the dataset

    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    labels = np.unique(labels_true)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    colors = 'rgbyckm'
    for i, label in enumerate(labels):
        position = labels_true == label
        ax.scatter(X[position, 0], X[position, 1], label="cluster {0}".format(label),
                   color=colors[i % len(colors)])

    ax.legend(loc="best", framealpha=0.5)
    ax.set_xlabel("X[0]")
    ax.set_ylabel("Y[1]")
    ax.set_title("data")
    plt.show()
def spectral_clustering(messages, dist_func=combined, num_clusters=3):
    '''
    takes a list of conversation messages and returns `num_clusters` threads.
    '''
    m = len(messages)
    affinity = np.zeros((m, m))

    # extract message features.
    for (mi, message) in enumerate(messages):
        if type(message) != dict:
            message = {'text': message}
        if 'feat' not in message:  # extract on the fly.
            message['feat'] = extract_all(parse_body(message['text']))
        messages[mi] = message  # write back.

    # build affinity matrix.
    for mi in range(m):
        for mj in range(m):
            affinity[mi, mj] = np.exp(-1.0 * keywords_l0(
                messages[mi]['feat'], messages[mj]['feat']
            ))

    # run clustering.
    print affinity
    labels = sklearn.cluster.spectral_clustering(affinity, n_clusters=num_clusters,
                                                 eigen_solver='arpack')
    return labels
def adhoc_clustering(messages, dist_func=combined):
    '''
    an adhoc method for clustering messages
    '''
    m = len(messages)

    # extract message features.
    for (mi, message) in enumerate(messages):
        if type(message) != dict:
            message = {'text': message}
        message.update(extract_all(parse_body(message['text'])))

    # run clustering (ad hoc).
    max_label = 0
    bias = 600
    labels = []
    for (mi, message) in enumerate(messages):
        min_mj = -1
        min_dist = float('inf')
        for mj in range(mi-1, -1, -1):
            dist = dist_func(messages[mi], messages[mj])
            if dist < min_dist:
                min_dist = dist
                min_mj = mj
        if (bias - 100 * worth(messages[mi])) < min_dist:
            # create new cluster.
            labels.append(max_label)
            max_label += 1
        else:
            # assign to an old cluster.
            labels.append(labels[min_mj])
    return labels
def updateClusters(LLE_node_vals, switch_penalty=1):
    """
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path
    Note the LLE's are negative of the true LLE's actually!!!!!

    Note: switch penalty > 0
    """
    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)

    ##compute future costs
    for i in xrange(T-2, -1, -1):
        j = i+1
        indicator = np.zeros(num_clusters)
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        for cluster in xrange(num_clusters):
            total_vals = future_costs + lle_vals + switch_penalty
            total_vals[cluster] -= switch_penalty
            future_cost_vals[i, cluster] = np.min(total_vals)

    ##compute the best path
    path = np.zeros(T)

    ##the first location
    curr_location = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])
    path[0] = curr_location

    DP_start2 = time.time()

    ##compute the path
    for i in xrange(T-1):
        j = i+1
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        total_vals = future_costs + lle_vals + switch_penalty
        total_vals[int(path[i])] -= switch_penalty
        path[i+1] = np.argmin(total_vals)

    ##return the computed path
    return path
def computeF1Score_delete(num_cluster, matching_algo, actual_clusters, threshold_algo, save_matrix=False):
    """
    computes the F1 scores and returns a list of values
    """
    F1_score = np.zeros(num_cluster)
    for cluster in xrange(num_cluster):
        matched_cluster = matching_algo[cluster]
        true_matrix = actual_clusters[cluster]
        estimated_matrix = threshold_algo[matched_cluster]
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for i in xrange(num_stacked*n):
            for j in xrange(num_stacked*n):
                if estimated_matrix[i, j] == 1 and true_matrix[i, j] != 0:
                    TP += 1.0
                elif estimated_matrix[i, j] == 0 and true_matrix[i, j] == 0:
                    TN += 1.0
                elif estimated_matrix[i, j] == 1 and true_matrix[i, j] == 0:
                    FP += 1.0
                else:
                    FN += 1.0
        precision = (TP)/(TP + FP)
        print "cluster #", cluster
        print "TP,TN,FP,FN---------->", (TP, TN, FP, FN)
        recall = TP/(TP + FN)
        f1 = (2*precision*recall)/(precision + recall)
        F1_score[cluster] = f1
    return F1_score
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo):
    """
    computes a confusion matrix and returns it
    """
    seg_len = 50
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point in xrange(len(clustered_points_algo)):
        cluster = clustered_points_algo[point]

        #CASE E : ABCABC
        num = (int(sorted_indices_algo[point]/seg_len) % num_clusters)
        true_confusion_matrix[num, cluster] += 1
    return true_confusion_matrix
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    computes the macro F1 score
    confusion matrix : requires permutation
    matching according to which matrix must be permuted
    """
    ##Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ##Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP/(TP + FP)
        recall = TP/(TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score

############
##The basic folder to be created
def computeNetworkAccuracy(matching, train_cluster_inverse, num_clusters):
    """
    Takes in the matching for the clusters
    takes the computed clusters
    computes the average F1 score over the network
    """
    threshold = 1e-2
    f1 = 0
    for cluster in xrange(num_clusters):
        true_cluster_cov = np.loadtxt("Inverse Covariance cluster =" + str(cluster) + ".csv", delimiter=",")
        matched_cluster = matching[cluster]
        matched_cluster_cov = train_cluster_inverse[matched_cluster]
        (nrow, ncol) = true_cluster_cov.shape

        out_true = np.zeros([nrow, ncol])
        for i in xrange(nrow):
            for j in xrange(ncol):
                if np.abs(true_cluster_cov[i, j]) > threshold:
                    out_true[i, j] = 1
        out_matched = np.zeros([nrow, ncol])
        for i in xrange(nrow):
            for j in xrange(ncol):
                if np.abs(matched_cluster_cov[i, j]) > threshold:
                    out_matched[i, j] = 1
        np.savetxt("Network_true_cluster=" + str(cluster) + ".csv", true_cluster_cov, delimiter=",")
        np.savetxt("Network_matched_cluster=" + str(matched_cluster) + ".csv", matched_cluster_cov, delimiter=",")

        ##compute the confusion matrix
        confusion_matrix = np.zeros([2, 2])
        for i in xrange(nrow):
            for j in xrange(ncol):
                confusion_matrix[out_true[i, j], out_matched[i, j]] += 1
        f1 += computeF1_macro(confusion_matrix, [0, 1], 2)
    return f1/num_clusters

############
def computeF1Score_delete(num_cluster, matching_algo, actual_clusters, threshold_algo, save_matrix=False):
    """
    computes the F1 scores and returns a list of values
    """
    F1_score = np.zeros(num_cluster)
    for cluster in xrange(num_cluster):
        matched_cluster = matching_algo[cluster]
        true_matrix = actual_clusters[cluster]
        estimated_matrix = threshold_algo[matched_cluster]
        if save_matrix:
            np.savetxt("estimated_matrix_cluster=" + str(cluster) + ".csv", estimated_matrix, delimiter=",", fmt="%1.4f")
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for i in xrange(num_stacked*n):
            for j in xrange(num_stacked*n):
                if estimated_matrix[i, j] == 1 and true_matrix[i, j] != 0:
                    TP += 1.0
                elif estimated_matrix[i, j] == 0 and true_matrix[i, j] == 0:
                    TN += 1.0
                elif estimated_matrix[i, j] == 1 and true_matrix[i, j] == 0:
                    FP += 1.0
                else:
                    FN += 1.0
        precision = (TP)/(TP + FP)
        recall = TP/(TP + FN)
        f1 = (2*precision*recall)/(precision + recall)
        F1_score[cluster] = f1
    return F1_score
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""
    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'
    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    try:
        words, vectors, options = process_options(argv[1:])
    except Exception, e:
        if str(e):
            print >> sys.stderr, 'Error: %s' % str(e)
            return 1
        else:
            raise

    dbscan = sklearn.cluster.DBSCAN(eps=options.eps, metric=options.metric)
    dbscan.fit(numpy.array(vectors))
    noisy = sum(1 for l in dbscan.labels_ if l == -1)
    unique = len(set(dbscan.labels_))
    logging.info('%d clusters, %d noisy, %d vectors' % (unique, noisy, len(vectors)))
    if noisy >= len(vectors) / 4:
        logging.warning('%d/%d noisy (-1) labels (try higher eps?)' %
                        (noisy, len(vectors)))
    elif unique < (len(vectors)/2)**0.5:
        logging.warning('only %d clusters (try lower eps?)' % unique)

    write_cluster_ids(words, dbscan.labels_)

    return 0
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')
    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS
    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
def create_data(centers, num=100, std=0.7):
    '''
    generate data

    :param centers: cluster centers
    :param num: number of samples
    :param std: standard deviation of each cluster
    :return: data, target
    '''
    X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
    return X, labels_true
def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))

    # Initialize DBSCAN with parameters.
    # I forgot to use cosine at first!
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')

    # Time this step.
    t0 = time.time()

    # Cluster the LSI vectors.
    db.fit(ssearch.index.index)

    # Calculate the elapsed time (in seconds)
    elapsed = (time.time() - t0)
    print(" done in %.3fsec" % elapsed)

    # Get the set of unique IDs.
    cluster_ids = set(db.labels_)

    # Show the number of clusters (don't include noise label)
    print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))

    # For each of the clusters...
    for cluster_id in cluster_ids:
        # Get the list of all doc IDs belonging to this cluster.
        cluster_doc_ids = []
        for doc_id in range(0, len(db.labels_)):
            if db.labels_[doc_id] == cluster_id:
                cluster_doc_ids.append(doc_id)

        # Get the top words in this cluster
        top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)

        print(' Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
def updateClusters(LLE_node_vals, switch_penalty=1):
    """
    Uses the Viterbi path dynamic programming algorithm
    to compute the optimal cluster assignments
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path
    Note the LLE's are negative of the true LLE's actually!!!!!

    Note: switch penalty > 0
    """
    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)

    ##compute future costs
    for i in xrange(T-2, -1, -1):
        j = i+1
        indicator = np.zeros(num_clusters)
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        for cluster in xrange(num_clusters):
            total_vals = future_costs + lle_vals + switch_penalty
            total_vals[cluster] -= switch_penalty
            future_cost_vals[i, cluster] = np.min(total_vals)

    ##compute the best path
    path = np.zeros(T)

    ##the first location
    curr_location = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])
    path[0] = curr_location

    ##compute the path
    for i in xrange(T-1):
        j = i+1
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        total_vals = future_costs + lle_vals + switch_penalty
        total_vals[int(path[i])] -= switch_penalty
        path[i+1] = np.argmin(total_vals)

    ##return the computed path
    return path
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo):
    """
    computes a confusion matrix and returns it
    """
    seg_len = 200
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point in xrange(len(clustered_points_algo)):
        cluster = int(clustered_points_algo[point])

        ##CASE G: ABBACCCA
        # num = (int(sorted_indices_algo[point]/seg_len))
        # if num in [0,3,7]:
        #     true_confusion_matrix[0,cluster] += 1
        # elif num in [1,2]:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[2,cluster] += 1

        ##CASE F: ABCBA
        # num = (int(sorted_indices_algo[point]/seg_len))
        # num = min(num, 4-num)
        # true_confusion_matrix[num,cluster] += 1

        #CASE E : ABCABC
        num = (int(sorted_indices_algo[point]/seg_len) % num_clusters)
        true_confusion_matrix[num, cluster] += 1

        ##CASE D : ABABABAB
        # num = (int(sorted_indices_algo[point]/seg_len) % 2)
        # true_confusion_matrix[num,cluster] += 1

        ##CASE C:
        # num = (sorted_indices_algo[point]/seg_len)
        # if num < 15:
        #     true_confusion_matrix[0,cluster] += 1
        # elif num < 20:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[0,cluster] += 1

        ##CASE B :
        # if num > 4:
        #     num = 9 - num
        # true_confusion_matrix[num,cluster] += 1

        ##CASE A : ABA
        # if sorted_indices_algo[point] < seg_len:
        #     true_confusion_matrix[0,cluster] += 1
        # elif sorted_indices_algo[point] < 3*seg_len:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[0,cluster] += 1

    return true_confusion_matrix
def cluster(data):
    """
    Use k-means clustering on training data to find
    profitable patterns we can exploit
    """
    num_clusters = 100
    num_selected_clusters = 20

    # Split into 30, 60, and 120 min time intervals, cluster each
    split = lambda n: split_into_intervals(data, n)
    kmeans30 = sklearn.cluster.k_means(split(30), num_clusters)
    kmeans60 = sklearn.cluster.k_means(split(60), num_clusters)
    kmeans120 = sklearn.cluster.k_means(split(120), num_clusters)

    # Sort the clusters by performance
    hp30, hp60, hp120 = [], [], []
    for i in range(0, num_clusters):
        hp30.append((i, kmeans30[0][i, -1]))
        hp60.append((i, kmeans60[0][i, -1]))
        hp120.append((i, kmeans120[0][i, -1]))
    hp30 = sorted(hp30, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    hp60 = sorted(hp60, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    hp120 = sorted(hp120, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]

    # Select the highest performing clusters
    top30 = np.zeros((num_selected_clusters, 181))
    top60 = np.zeros((num_selected_clusters, 361))
    top120 = np.zeros((num_selected_clusters, 721))
    for i in range(0, num_selected_clusters):
        top30[i, 0:181] = kmeans30[0][hp30[i][0], 0:181]
        top60[i, 0:361] = kmeans60[0][hp60[i][0], 0:361]
        top120[i, 0:721] = kmeans120[0][hp120[i][0], 0:721]

    # Then normalize the clusters so we can use the faster similarity function
    # from S&Z to compare instead of L2 norm
    scaler = sklearn.preprocessing.StandardScaler()
    for i in range(0, num_selected_clusters):
        top30[i, 0:180] = scaler.fit_transform(top30[i, 0:180])
        top60[i, 0:360] = scaler.fit_transform(top60[i, 0:360])
        top120[i, 0:720] = scaler.fit_transform(top120[i, 0:720])

    return [top30, top60, top120]