The following code examples, collected from open-source Python projects, illustrate how to use sklearn.metrics.silhouette_score().
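Before the project examples below, here is a minimal, self-contained sketch (illustrative only, not taken from any of the listed projects) of the basic call pattern: fit a clusterer, then pass the feature matrix and the predicted labels to silhouette_score, which returns the mean silhouette coefficient in [-1, 1], where higher values indicate better-separated clusters.

# Minimal usage sketch (illustrative, not from the projects below).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)   # toy feature matrix
labels = KMeans(n_clusters=4, random_state=0).fit_predict(X)  # cluster assignments
score = silhouette_score(X, labels)                           # mean silhouette over all samples, in [-1, 1]
print("Silhouette score: %.3f" % score)
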
def silhouette_score(series, clusters):
    distances = np.zeros((series.shape[0], series.shape[0]))
    for idx_a, metric_a in enumerate(series):
        for idx_b, metric_b in enumerate(series):
            distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]

    labels = np.zeros(series.shape[0])
    for i, (cluster, indicies) in enumerate(clusters):
        for index in indicies:
            labels[index] = i

    # silhouette is only defined, if we have 2 clusters with assignments at
    # minimum
    if len(np.unique(labels)) == 1 or (len(np.unique(labels)) >= distances.shape[0]):
        #if len(np.unique(labels)) == 1:
        return labels, -1
    else:
        return labels, _silhouette_score(distances, labels, metric='precomputed')

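The example above scores a clustering from a precomputed pairwise distance matrix. For reference, a minimal sketch of that call pattern (illustrative only; it assumes a symmetric distance matrix and at least two distinct labels) looks like this:

# Minimal sketch of the precomputed-distance call pattern (illustrative only).
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score

X = np.random.RandomState(0).rand(20, 5)       # toy feature matrix
labels = np.array([0] * 10 + [1] * 10)         # two clusters of 10 samples each
D = pairwise_distances(X, metric='euclidean')  # symmetric pairwise distance matrix
score = silhouette_score(D, labels, metric='precomputed')
print(score)
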
def spectral(data):
    spectral = SpectralClustering(
        eigen_solver='arpack',
        affinity='rbf',
        assign_labels='discretize'
    ).fit(data)
    print 'Spectral'
    print collections.Counter(spectral.labels_)
    print metrics.silhouette_score(data, spectral.labels_)
    reduced_data = reduce_with_pca(data, 2)
    plot_2d_data(reduced_data, spectral.labels_)

def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)
    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    best_score = 0
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids

def _find_optimal_clustering(self, clusterings):
    max_score = float('-inf')
    max_clustering = None

    for clustering in clusterings:
        labeled_vectors = [(node.vector, cluster_idx)
                           for cluster_idx in range(len(clustering))
                           for node in _get_cluster_nodes(clustering[cluster_idx][1])]
        vectors, labels = [np.array(x) for x in zip(*labeled_vectors)]
        if np.in1d([1], labels)[0]:
            score = silhouette_score(vectors, labels, metric='cosine')
        else:
            continue  # silhouette doesn't work with just one cluster
        if score > max_score:
            max_score = score
            max_clustering = clustering

    return zip(*max_clustering)[1] if max_clustering else zip(*clusterings[0])[1]

def evaluate_kmeans(X, model):
    """
    Evaluate a K-Means model that has been trained on X using the
    Silhouette score.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X.

    Returns:
        A double that corresponds to the Silhouette score of the model.
    """
    return silhouette_score(X, model.labels_)

# Ex2

def fit(self, X, y=None, **kwargs):
    """
    Fits the model and generates the silhouette visualization.

    TODO: decide to use this method or the score method to draw.
    NOTE: Probably this would be better in score, but the standard score
    is a little different and I'm not sure how it's used.
    """
    # Fit the wrapped estimator
    self.estimator.fit(X, y, **kwargs)

    # Get the properties of the dataset
    self.n_samples = X.shape[0]
    self.n_clusters = self.estimator.n_clusters

    # Compute the scores of the cluster
    labels = self.estimator.predict(X)
    self.silhouette_score_ = silhouette_score(X, labels)
    self.silhouette_samples_ = silhouette_samples(X, labels)

    # Draw the silhouette figure
    self.draw(labels)

    # Return the estimator
    return self

def clustering(points, k=2, name='kmeans'):
    '''
    points: N_samples * N_features
    k: number of clusters
    '''
    if name == 'kmeans':
        kmeans = KMeans(n_clusters=k, n_init=100).fit(points)
        ## print within_variance
        #cluster_distance = kmeans.transform( points )
        #within_variance = sum( np.min(cluster_distance,axis=1) ) / float( points.shape[0] )
        #print("AvgWithinSS:"+str(within_variance))
        if len(np.unique(kmeans.labels_)) > 1:
            si = silhouette_score(points, kmeans.labels_)
            #print("Silhouette:"+str(si))
        else:
            si = 0
        print("Silhouette:" + str(si))
        return kmeans.labels_, si
    if name == 'spec':
        spec = SpectralClustering(n_clusters=k, affinity='cosine').fit(points)
        si = silhouette_score(points, spec.labels_)
        print("Silhouette:" + str(si))
        return spec.labels_, si

def k_means(data, nc, req_info=None):
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)
    sdata = (data - means) / stds

    km = KMeans(init='k-means++', n_clusters=nc, n_init=10)
    km.fit(sdata)

    if req_info == 'all':
        req_info = ['silhouette', 'inertia', 'centers']
    elif req_info is None:
        req_info = []

    info = {}
    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, km.labels_)
    if 'inertia' in req_info:
        info['inertia'] = km.inertia_
    if 'centers' in req_info:
        info['centers'] = km.cluster_centers_ * stds + means

    return km.labels_, info

def internal_silhouette(self, idea_id, base_labels=None):
    labels = self.labels_for_idea(idea_id, True, False, base_labels)
    self.remove_singletons(labels, idea_id)
    idea_post_ids = self.get_posts_of_idea(idea_id)
    if base_labels:
        idea_post_ids = set(idea_post_ids)
        idea_post_ids.update(list(base_labels.keys()))
        idea_post_ids = np.array(list(idea_post_ids))
        idea_post_ids.sort()
    idea_post_ids = np.array(idea_post_ids)
    idea_post_nums = self.post_ids.searchsorted(idea_post_ids)
    # slicing one axis at a time,
    # because a simultaneous slice is interpreted as a diagonal
    distances = self.distance_matrix
    sub_distance = distances[idea_post_nums][:, idea_post_nums]
    sub_labels = labels[idea_post_nums]
    if len(set(sub_labels)) < 2:
        return 0
    return metrics.silhouette_score(sub_distance, sub_labels, 'precomputed')

def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))

def db_scan(data, eps, min_samples, metric):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
    print 'DBSCAN'
    print metrics.silhouette_score(data, dbscan.labels_)
    print collections.Counter(dbscan.labels_)
    reduced_data = reduce_with_pca(data)
    plot_2d_data(reduced_data, dbscan.labels_)

def mean_shift(data):
    mean_shift = MeanShift(cluster_all=False, n_jobs=1).fit(data)
    print 'Mean Shift'
    print metrics.silhouette_score(data, mean_shift.labels_)
    print collections.Counter(mean_shift.labels_)

def affinity_prop(data):
    af = AffinityPropagation(damping=0.5, convergence_iter=15, affinity='euclidean').fit(data)
    print 'Affinity Propagation'
    print metrics.silhouette_score(data, af.labels_)
    print collections.Counter(af.labels_)

# mean_shift(np.array(data))
# affinity_prop(np.array(data))

def cluster2d(data, n_clusters):
    reduced_data = reduce_with_pca(data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(reduced_data)
    print 'K-Means'
    print collections.Counter(kmeans.labels_)
    print metrics.silhouette_score(data, kmeans.labels_)
    plot_2d_data(reduced_data, kmeans.labels_)

def em(data):
    gmm = GaussianMixture(n_components=6, covariance_type="tied").fit(data)
    predicted_data = gmm.predict(data)
    print collections.Counter(predicted_data)
    print metrics.silhouette_score(data, predicted_data)
    reduced_data = reduce_with_pca(data, 2)
    plot_2d_data(reduced_data, predicted_data)

def kmeans(reduced_data, n_clusters):
    #----Do KMeans clustering and return relevant graphing/performance data
    kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=42)
    kmeans = kmeans.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, kmeans.labels_, metric='euclidean')
    data_dictionary = {
        "labels": kmeans.labels_,
        "centroids": kmeans.cluster_centers_,
        "silhouette_score": sil_score
    }
    return data_dictionary

def agglom(reduced_data, n_clusters):
    #----Do Agglomerative clustering and return relevant performance data
    clustering = cluster.AgglomerativeClustering(n_clusters=n_clusters)
    clustering = clustering.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, clustering.labels_, metric='euclidean')
    return {
        "labels": clustering.labels_,
        "silhouette_score": sil_score
    }

def find_best_cluster(cluster_type, data, a, b):
    #----Prints silhouette scores for all # of clusters in range
    scores = []
    for i in range(a, b):
        if cluster_type.lower() == "kmeans":
            i_clusters = kmeans(data, i)
        elif cluster_type.lower() == "agglom":
            i_clusters = agglom(data, i)
        sil_score_i = i_clusters['silhouette_score']
        scores.append(sil_score_i)
    print(scores)

def clustering(docs, n_clusters):
    # cluster the documents into n_clusters groups
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(docs)  # KMeans model
    labels = kmeans_model.labels_
    # hmodel = AgglomerativeClustering(n_clusters=n_clusters).fit(docs)  # hierarchical clustering (alternative)
    # labels = hmodel.labels_
    score = metrics.silhouette_score(np.array(docs), labels, metric='euclidean')  # euclidean distance
    return labels, score

def analyze_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f" % (
        name, time() - t0, estimator.inertia_,
        metrics.homogeneity_score(labels, estimator.labels_),
        metrics.completeness_score(labels, estimator.labels_),
        metrics.v_measure_score(labels, estimator.labels_),
        metrics.adjusted_rand_score(labels, estimator.labels_),
        metrics.adjusted_mutual_info_score(labels, estimator.labels_),
        metrics.silhouette_score(data, estimator.labels_,
                                 metric='euclidean', sample_size=samples)
    ))

def evaluate(k):
    km = kmeans[k]
    score = silhouette_score(train_offsets, km.labels_, metric='euclidean', random_state=RANDOM_SEED)
    print('Silhouette score for k=%d is %f.' % (k, score))
    return (k, score)

def try_kmeans(X):
    """
    Run the K-Means algorithm on X with different values of K, and return
    the one that gives the best score.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
    """
    best_k = 1
    best_score = -1

    for k in range(2, 20 + 1):
        model = KMeans(n_clusters=k)
        model.fit(X)
        labels = model.predict(X)
        score = silhouette_score(model.transform(X), labels)

        print(k, "->", score)

        if score > best_score:
            best_k = k
            best_score = score

    print("The best K is", best_k)
    return best_k

# Ex3

def ex2_kmeans(X, y):
    """
    Applies the KMeans algorithm on X, y using K=10 and prints the
    silhouette score of this model. X and y are returned by transform_text
    above.
    """
    model = KMeans(10).fit(X, y)
    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)

# Ex 3

def ex4_agglomerative_clustering(X, y):
    """
    This does the same thing as ex2_kmeans but with an agglomerative
    clustering and K=2.
    """
    # AgglomerativeClustering needs a non-sparse matrix
    X = X.toarray()
    k = 2
    model = AgglomerativeClustering(k).fit(X, y)
    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)

# Ex 5

def sk_kmeans(core):  #, kval=3
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    list_of_points = []
    docs = solrInstance.query_iterator(query="*:*", start=0)

    for doc in docs:
        list_of_points.append(Vector(doc['id'], doc))

    list_of_Dicts = (point.features for point in list_of_points)

    df = pd.DataFrame(list_of_Dicts)
    df = df.fillna(0)

    silhouettes = {}
    for k in range(2, 10):
        kmeans = KMeans(n_clusters=k,
                        init='k-means++',
                        max_iter=300,  # k-means convergence
                        n_init=10,     # find global minima
                        n_jobs=-2,     # parallelize
                        )
        labels = kmeans.fit_predict(df)
        silhouettes[k] = silhouette_score(df, labels)

    return str(silhouettes)

def nmf_test(df):
    X = df.drop(['Year', 'zipcode'], axis=1).values
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)
    scores = []
    for k in xrange(2, 11):
        model = NMF(n_components=k)
        W = model.fit_transform(X_sca)
        labels = W.argmax(axis=1)
        score = silhouette_score(X_sca, labels)
        scores.append(score)
    plt.plot(xrange(2, 11), scores, 'b*-')
    plt.show()

def silhouette(self):
    """
    Calculate the silhouette score for a certain clustering.

    Input: None
    Output: silhouette score (None)
    """
    return silhouette_score(self.features, self.cluster_labels)

def h_cluster(wordlist, sims, distmat, thresh=0.01):
    B_, Bs, Ms, Ts, As = hgfc(sims, thresh=thresh)
    sil_coefs = []
    for i, a in enumerate(As):
        l = labels(a)
        if len(set(l)) > 2 and len(set(l)) < len(wordlist) - 1:
            sil_coefs.append(silhouette_score(distmat, labels(a), metric='precomputed'))
        else:
            sil_coefs.append(0.0)
    ld = [labeldict(a, wordlist) for a in As]
    return ld, sil_coefs

def scores(dmat, cluster_labels):
    try:
        silhouette_avg = silhouette_score(dmat, cluster_labels, metric='precomputed', sample_size=100)
        return silhouette_avg
    except:
        return None

def silhcoeff(data, labels):
    arrdata = array(data)
    print("Silhouette coefficient: ", metrics.silhouette_score(arrdata, labels, metric='euclidean'))

################################### PHOTOS ###########################################
# LOAD PHOTOS FROM FOLDER & SAVE IN A LIST [FILENAME,PHOTO,GRAYSCALE_PHOTO]

def get_all_results(self):
    discussion = self.discussion
    idea_ids = discussion.db.query(Idea.id).filter_by(
        discussion_id=discussion.id).all()
    results = {id: self.get_cluster_info(id) for (id,) in idea_ids}
    results[None] = self.get_cluster_info()
    posres = {id: r for (id, r) in results.items() if r is not None}
    # for id, (silhouette_score, compare_with_ideas, clusters, post_info) in posres.iteritems():
    #     log.debug(" ".join((id, silhouette_score, repr([len(x['cluster']) for x in clusters]))))
    return posres

def silhouette_score(self):
    if self._silhouette_score is None:
        self._silhouette_score = metrics.silhouette_score(
            self.model_matrix,
            self.optics.as_labels(self.optics_clusters),
            metric=self.metric)
    return self._silhouette_score

def elbow_test(X, max_cluster):
    """
    This function performs the elbow test to determine the number of clusters
    for k-means clustering.

    Parameters
    ----------
    X : numpy array
        2d list of floats.

    max_cluster : int
        The maximum number of clusters desired.

    Returns
    -------
    number of clusters : int
        The number of clusters for kmeans clustering
    """
    from sklearn.cluster import KMeans
    from sklearn import metrics

    inertia_list = []
    s_list = []
    for cluster_cnt in range(max_cluster - 1):
        k_means = KMeans(n_clusters=cluster_cnt + 2)
        k_means.fit(X)
        k_means_labels = k_means.labels_
        s_factor = metrics.silhouette_score(X, k_means_labels, metric='euclidean')
        s_list.append(s_factor)
        kmeans_inertia = k_means.inertia_
        inertia_list.append(kmeans_inertia)

    inertia_cnt = 0
    i_diff_list = []
    for inertia in inertia_list:
        # look for the difference between each difference in cluster number
        if inertia_cnt != len(inertia_list) - 1:
            i_diff = inertia - inertia_list[inertia_cnt + 1]
            i_diff_list.append(i_diff)
        inertia_cnt = inertia_cnt + 1

    # find the biggest difference and use that number for the best number of clusters
    max_diff = max(i_diff_list)
    max_diff_index = i_diff_list.index(max_diff)
    # +3 because of the counting
    best_no_cluster = max_diff_index + 3
    return best_no_cluster

def distortion_score(X, labels, metric='euclidean'):
    """
    Compute the mean distortion of all samples.

    The distortion is computed as the sum of the squared distances between
    each observation and its closest centroid. Logically, this is the metric
    that K-Means attempts to minimize as it is fitting the model.

    .. seealso:: http://kldavenport.com/the-cost-function-of-k-means/

    Parameters
    ----------
    X : array, shape = [n_samples, n_features] or [n_samples_a, n_samples_a]
        Array of pairwise distances between samples if metric == "precomputed"
        or a feature array for computing distances against the labels.

    labels : array, shape = [n_samples]
        Predicted labels for each sample

    metric : string
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by `sklearn.metrics.pairwise.pairwise_distances
        <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances>`_

    .. todo:: add sample_size and random_state kwds similar to silhouette_score
    """
    # Encode labels to get unique centers and groups
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    unique_labels = le.classes_

    # Sum of the distortions
    distortion = 0

    # Loop through each label (center) to compute the centroid
    for current_label in unique_labels:
        # Mask the instances that belong to the current label
        mask = labels == current_label
        instances = X[mask]

        # Compute the center of these instances
        center = instances.mean(axis=0)

        # Compute the square distances from the instances to the center
        distances = pairwise_distances(instances, [center], metric=metric)
        distances = distances ** 2

        # Add the mean square distance to the distortion
        distortion += distances.mean()

    return distortion


##########################################################################
## Elbow Method
##########################################################################

def runClustering(cluster_df):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    Xcols = [col for col in cluster_df.columns if 'NOTMODEL' not in col.upper()]

    # Convert character columns to dummy variables
    X = cluster_df[Xcols]
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    char_cols = list(set(cols) - set(num_cols))
    for col in char_cols:
        if len(X[col].unique()) <= 20:
            dummy = pd.get_dummies(X[col], prefix='dm' + col)
            column_name = X.columns.values.tolist()
            column_name.remove(col)
            X = X[column_name].join(dummy)
        else:
            if col in X.columns:  # If more than 20 distinct values then delete
                del X[col]

    # Standardize (Z-score normalize) all continuous variables
    from scipy.stats import zscore
    for col in X:
        if len(X[col].unique()) > 2:  # Standardize non-dummy variables
            col_zscore = 'z_' + col
            X[col_zscore] = zscore(X[col])
            del X[col]

    # Fill missing values with 0 = the mean in the z-normalized data
    # Obviously missing values can be handled in many different ways
    X.fillna(0, inplace=True)

    # Convert to matrix/numpy array to use in KMeans clustering class
    data_for_clustering_matrix = X.as_matrix()

    number_of_Clusters = []
    silhouette_value = []

    # Loop through 2 and 20 clusters and identify which has the highest silhouette score
    k = range(2, 21)
    for i in k:
        clustering_method = KMeans(n_clusters=i)
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)
        silhouette_average = silhouette_score(data_for_clustering_matrix, labels)
        silhouette_value.append(silhouette_average)
        number_of_Clusters.append(int(i))

    # maxind = np.argmax(silhouette_value)
    max_value = max(silhouette_value)
    indexMaxValue = silhouette_value.index(max_value)

    # FIT KMEANS CLUSTER MODEL WITH NUMBER OF CLUSTERS WITH HIGHEST SILHOUETTE SCORE
    clustering_method = KMeans(n_clusters=number_of_Clusters[indexMaxValue])
    clustering_method.fit(data_for_clustering_matrix)
    labels = clustering_method.predict(data_for_clustering_matrix)

    # SCORE THE DATAFRAME score_df
    cluster_df['cluster'] = labels
    return cluster_df

def compute_affinity_propagation(preference_, X):
    # DATA FILLING
    #text = io.Input.local_read_text_file(inputFilePath)
    #input_array = text.split('\n')
    centers = [[1, 1], [-1, -1], [1, -1]]
    n_samples = 300
    # Make Blobs used for generating of labels_true array
    if (X == None):
        X, labels_true = make_blobs(n_samples=n_samples, centers=centers, cluster_std=1, random_state=0)
        print("Data is none!!!")
        print("Generating " + str(n_samples) + " samples")
    else:
        data, labels_true = make_blobs(n_samples=len(X), centers=centers, cluster_std=1, random_state=0)
    #slist = list()
    #for line in X:
    #    slist.append(line)
    #io.Output.write_array_to_txt_file("clustering\\Affinity_Propagation\\input_data1.txt", slist)
    #float_array = []
    #for line in input_array:
    #    float_line = [float(i) for i in line.split(' ')]
    #    float_array.append(float_line)
    #X = array(float_array)

    af = AffinityPropagation(preference=preference_).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print("Fowlkes Mallows Score: %0.3f" % metrics.fowlkes_mallows_score(labels_true, labels))

    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()

def expectation_maximization(data, nc, cv_type='full', req_info=None):
    gmm = GMM(n_components=nc, covariance_type=cv_type, thresh=1.0E-4, n_init=10)
    gmm.fit(data)
    labels = gmm.predict(data)

    if req_info == 'all':
        req_info = ['aic', 'bic', 'converged', 'weights', 'means', 'covars',
                    'silhouette', 'proba']
    elif req_info is None:
        req_info = []

    info = {}
    if 'aic' in req_info:
        info['aic'] = gmm.aic(data)
    if 'bic' in req_info:
        info['bic'] = gmm.bic(data)
    if 'converged' in req_info:
        info['converged'] = gmm.converged_
    if 'weights' in req_info:
        info['weights'] = gmm.weights_
    if 'means' in req_info:
        info['means'] = gmm.means_
    if 'covars' in req_info:
        if cv_type == 'full':
            info['covars'] = gmm.covars_
        elif cv_type == 'tied':
            cov = np.empty((nc, gmm.covars_.shape[0], gmm.covars_.shape[1]))
            for i in range(nc):
                cov[i] = gmm.covars_.copy()
            info['covars'] = cov
        else:
            cov = np.empty((nc, gmm.covars_.shape[0], gmm.covars_.shape[1]))
            for i in range(nc):
                cov[i] = np.diag(gmm.covars_[i])
            info['covars'] = cov
    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, labels)
    if 'proba' in req_info:
        info['proba'] = gmm.predict_proba(data).T

    return labels, info

def calculate_cluster_scores(x, cluster_labels, output):
    with open("%s_scores.log" % output, "w+") as fh:
        # Filter out singleton "cluster" (labeled as -1)
        filtered_x, filtered_cluster_labels, singletons = ([] for _ in range(3))
        cluster_groups = defaultdict(list)
        for vec, lab in zip(x, cluster_labels):
            if not lab == -1:
                filtered_x.append(vec)
                filtered_cluster_labels.append(lab)
                cluster_groups[lab].append(vec)
            else:
                singletons.append(vec)

        ln = "Number of clustered events: %d/%d (%f%%)\n" % (
            len(filtered_x), len(filtered_x) + len(singletons),
            (len(filtered_x) / (len(filtered_x) + len(singletons))) * 100)
        print(ln.strip("\n"))
        fh.write(ln)

        for group in cluster_groups:
            n_events = len(cluster_groups[group])
            ln = "Cluster %d contains %d events\n" % (group, n_events)
            print(ln.strip("\n"))
            fh.write(ln)

        rmsstd_scores = []
        for group in cluster_groups:
            rmsstd = calculate_rmsstd(np.array(cluster_groups[group]))
            ln = "The RMSSTD score for cluster %d is %f\n" % (group, rmsstd)
            print(ln.strip("\n"))
            fh.write(ln)
            rmsstd_scores.append(rmsstd)

        try:
            silhouette_avg = silhouette_score(np.array(filtered_x),
                                              np.array(filtered_cluster_labels))
            ln = "The average silhouette score is : %f\n" % silhouette_avg
            print(ln.strip("\n"))
            fh.write(ln)
        except:
            silhouette_avg = float("nan")
            ln = "Impossible to calculate silhouette score. Only 1 cluster group identified.\n"
            print(ln.strip("\n"))
            fh.write(ln)
        return silhouette_avg, rmsstd_scores