我们从Python开源项目中，提取了以下4个代码示例，用于说明如何使用sklearn.metrics.silhouette_samples()。
def generateEvaluation(self, output_dir, assigned_clusters, quick=False):
    """
    Compute the silhouette evaluation of a clustering and print it.

    When ``quick`` is true, ``self.silhouette_avg`` is set to 0 and nothing
    else is computed. Otherwise the per-sample silhouette values are stored
    in ``self.sample_silhouette_values``, their mean is stored in
    ``self.silhouette_avg`` and ``self.printSilhouette`` is called.

    Args:
        output_dir: forwarded unchanged to ``self.printSilhouette``.
        assigned_clusters: cluster label of each sample, passed to
            ``silhouette_samples`` and to ``self.printSilhouette``.
        quick: skip the silhouette computation entirely.
    """
    if quick:
        self.silhouette_avg = 0
        return

    if self.distances is None:
        # No distance matrix available: work from the raw features.
        per_sample = silhouette_samples(self.instances.getFeatures(),
                                        assigned_clusters)
    else:
        # self.distances is handed over as an already computed distance matrix.
        per_sample = silhouette_samples(self.distances,
                                        assigned_clusters,
                                        metric='precomputed')

    self.sample_silhouette_values = per_sample
    self.silhouette_avg = np.mean(self.sample_silhouette_values)
    self.printSilhouette(output_dir, assigned_clusters)

# Code from a scikit-learn example:
# Selecting the number of clusters with silhouette analysis on KMeans clustering
def fit(self, X, y=None, **kwargs):
    """
    Fit the wrapped estimator and generate the silhouette visualization.

    TODO: decide to use this method or the score method to draw.

    NOTE: Probably this would be better in score, but the standard score
    is a little different and I'm not sure how it's used.

    Parameters
    ----------
    X : object exposing ``shape``
        Data handed to the wrapped estimator's ``fit`` and ``predict`` and to
        the silhouette functions; ``X.shape[0]`` is stored as ``n_samples``.
    y : optional
        Forwarded to the wrapped estimator's ``fit``.
    kwargs : dict
        Extra keyword arguments forwarded to the wrapped estimator's ``fit``.

    Returns
    -------
    self
    """
    # Train the underlying clustering model first; everything below
    # depends on it being fitted.
    self.estimator.fit(X, y, **kwargs)

    # Record the dataset size and the number of clusters of the estimator.
    self.n_samples = X.shape[0]
    self.n_clusters = self.estimator.n_clusters

    # Score the cluster assignment, both overall and per sample.
    cluster_labels = self.estimator.predict(X)
    self.silhouette_score_ = silhouette_score(X, cluster_labels)
    self.silhouette_samples_ = silhouette_samples(X, cluster_labels)

    # Render the silhouette figure from the predicted labels.
    self.draw(cluster_labels)

    return self
def ex3_kmeans(X, y, max_k=20):
    """
    Tries to find the best value for K when applying the KMeans algorithm
    on X, y.

    There are multiple ways to score a model but here we count what is the
    ratio of samples with a negative Silhouette score and try to minimize
    it, for K from 2 to ``max_k`` (inclusive). On ties the smallest K wins.

    Args:
        X: data passed to ``KMeans.fit`` and ``metrics.silhouette_samples``.
        y: passed as the second argument of ``KMeans.fit``.
        max_k: largest K to try (defaults to 20). If it is below 2 no model
            is fitted and ``(1, inf)`` is returned.

    Returns:
        best_k: the value of K that gives the best (lowest) score.
        best_score: the score associated with best_k.
    """
    best_k = 1
    # The score is a ratio to minimize, so start above any reachable value.
    best_score = float("inf")
    for k in range(2, max_k + 1):
        model = KMeans(k).fit(X, y)
        scores = metrics.silhouette_samples(X, model.labels_)
        negative_scores_count = sum(1 for score in scores if score < 0)
        model_score = negative_scores_count / float(len(scores))
        # Single parenthesized argument: valid on both Python 2 and 3.
        print("K=%d, score=%f" % (k, model_score))
        # Lower is better: fewer samples sit closer to a foreign cluster.
        if model_score < best_score:
            best_score = model_score
            best_k = k
    # Unsurprisingly the best K is usually 2 because we have two classes of
    # messages: spams and hams.
    return best_k, best_score

# Ex 4
def plot_silhouettes(X, y):
    """
    Draw a horizontal silhouette bar chart, one band of bars per cluster.

    The samples of each distinct label in ``y`` are drawn in ascending order
    of silhouette coefficient, the bands are stacked along the y axis, and a
    dashed red vertical line marks the mean coefficient over all samples.
    The figure is displayed with ``plt.show()``; nothing is returned.

    X: samples passed to ``silhouette_samples`` (euclidean metric).
    y: cluster label of each sample. Tick labels are ``label + 1``, so the
       labels are presumably numeric -- verify against the caller.
    """
    labels = np.unique(y)
    label_count = labels.shape[0]
    sample_scores = silhouette_samples(X, y, metric='euclidean')

    tick_positions = []
    band_start = 0
    for idx, label in enumerate(labels):
        # Sorted copy of the coefficients belonging to this cluster.
        band_scores = np.sort(sample_scores[y == label])
        band_end = band_start + len(band_scores)

        plt.barh(
            range(band_start, band_end),
            band_scores,
            height=1.0,
            edgecolor='none',
            color=cm.jet(idx / label_count),
        )

        # Put the cluster's tick in the middle of its band.
        tick_positions.append((band_start + band_end) / 2)
        band_start = band_end

    plt.axvline(np.mean(sample_scores), color='red', linestyle='--')
    plt.yticks(tick_positions, labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()