The following 26 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.pairwise_distances().
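As a quick orientation before the extracted examples, the sketch below shows the basic call pattern; the arrays and metric choices are illustrative placeholders, not taken from any of the projects. pairwise_distances(X) returns an (n_samples_X, n_samples_X) distance matrix, while pairwise_distances(X, Y, metric=...) returns an (n_samples_X, n_samples_Y) matrix.

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[1.0, 0.0], [1.0, 1.0], [2.0, 2.0]])  # 3 samples, 2 features
Y = np.array([[0.0, 1.0], [1.0, 0.0]])              # 2 samples, 2 features

# All pairwise distances among rows of X: shape (3, 3), zeros on the diagonal.
D_xx = pairwise_distances(X, metric='euclidean')

# Distances between rows of X and rows of Y: shape (3, 2).
D_xy = pairwise_distances(X, Y, metric='cosine')

print(D_xx.shape, D_xy.shape)  # (3, 3) (3, 2)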
def prune(self, question, paragraphs: List[ExtractedParagraph]):
    if not self.filter_dist_one and len(paragraphs) == 1:
        return paragraphs

    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph

    if self.filter_dist_one:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select]]
def dists(self, question, paragraphs: List[ExtractedParagraph]):
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph

    if self.filter_dist_one:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select]]
def find_distance_matrix(self, vector, metric='cosine'):
    '''
    Compute the distance matrix between topics using cosine or euclidean
    distance (default=cosine distance).
    '''
    if metric == 'cosine':
        distance_matrix = pairwise_distances(vector, metric='cosine')
        # diagonals should be exactly zero, so remove rounding errors
        numpy.fill_diagonal(distance_matrix, 0)
    if metric == 'euclidean':
        distance_matrix = pairwise_distances(vector, metric='euclidean')
    return distance_matrix
def find_similar_words(wordvecs):
    """
    Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    # Python 2 code: uses print statements, raw_input, and dict.iteritems.
    from sklearn.metrics import pairwise_distances
    from scipy.spatial.distance import cosine

    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine',
                                              # metric='euclidean',
                                              )
    id2word = {}
    for key, value in wordvecs.word_idx_map.iteritems():
        assert(value not in id2word)
        id2word[value] = key

    while True:
        word = raw_input("Enter a word ('STOP' to quit): ")
        if word == 'STOP':
            break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print '%s not in the vocabulary.' % word
            continue  # skip lookup for out-of-vocabulary words
        sim_w_id = pairwise_sim_mat[w_id-1].argsort()[-10:][::-1]
        for i in sim_w_id:
            print id2word[i+1],
        print ''
def sort_by_tfidf(question, paragraphs):
    tfidf = TfidfVectorizer(strip_accents="unicode",
                            stop_words=spacy.en.STOP_WORDS,
                            decode_error='replace')
    try:
        para_features = tfidf.fit_transform(paragraphs)
        q_features = tfidf.transform([question])
    except ValueError:
        return [(i, 0.0) for i in range(len(paragraphs))]

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort((paragraphs, dists))  # in case of ties, use the earlier paragraph

    return [(i, 1.0 - dists[i]) for i in sorted_ix]
def test_pairwise_distances(X_blobs):
    centers = X_blobs[::100].compute()
    result = dm.pairwise_distances(X_blobs, centers)
    expected = sm.pairwise_distances(X_blobs.compute(), centers)
    assert_eq(result, expected, atol=1e-4)
def pairwise_distances(X, Y, metric='euclidean', n_jobs=None, **kwargs):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y),))
    return X.map_blocks(metrics.pairwise_distances, Y,
                        dtype=float, chunks=chunks,
                        metric=metric, **kwargs)
def transform(self, X):
    """Compute the LLC representation of the provided data.

    Parameters
    ----------
    X : array_like or list
        The local features to aggregate. They must be either nd arrays or
        a list of nd arrays. In case of a list each item is aggregated
        separately.
    """
    # Get the local features and the number of local features per document
    X, lengths = self._reshape_local_features(X)

    # Preprocess the lengths list into indexes in the local feature array
    starts = np.cumsum([0] + lengths).astype(int)
    ends = np.cumsum(lengths).astype(int)

    # Calculate the nearest neighbors
    centroids = self._clusterer.cluster_centers_
    distances = pairwise_distances(X, centroids)
    K = self.neighbors
    neighbors = np.argpartition(distances, K)[:, :K]

    # Compute the llc representation
    llc = np.zeros((len(lengths), self.n_codewords))
    L2 = self.beta * np.eye(X.shape[1])
    for i, (s, e) in enumerate(zip(starts, ends)):
        for j in range(s, e):
            # a = argmin_{1^T a = 1} ||x - Ca||_2^2 + \beta ||a||_2^2
            C = centroids[neighbors[j]]
            a = C.dot(np.linalg.inv(C.T.dot(C) + L2)).dot(X[j])
            llc[i, neighbors[j]] = np.maximum(
                llc[i, neighbors[j]],
                a / a.sum()
            )

    return llc
def computePerformance(self, instances):
    X = instances.features
    labels = instances.true_labels
    # For unsupervised projection methods, the performance is always computed
    # with the labels (not the families).
    if hasattr(self.projection.conf, 'families_supervision'):
        if self.projection.conf.families_supervision:
            labels = instances.true_families
    unique_labels, label_inds = np.unique(labels, return_inverse=True)
    ratio = 0
    for li in xrange(len(unique_labels)):
        Xc = X[label_inds == li]
        Xnc = X[label_inds != li]
        ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
    self.class_separation = ratio / len(unique_labels)
def _compute_score(q, X, metric):
    """Internal method to compute the scores"""
    from .metrics import _scale_cosine_similarity

    dist = pairwise_distances(q, X, 'cosine')
    dist = dist[0]
    scores = 1 - dist
    scores = _scale_cosine_similarity(scores, metric=metric)
    return scores
def draw_features_and_similarity(mm, words_of_interest):
    rows, cols, xlabels = mm.filter_submatrix(words_of_interest, 25)
    ax = plt.subplot(1, 2, 1)
    plot_heat(ax, cols, xlabels, words_of_interest)
    # plot_heat(ax, abs(m), numbered)
    ax = plt.subplot(1, 2, 2)
    t = 1 - pairwise_distances(rows, metric="cosine")
    np.fill_diagonal(t, 0)
    plot_heat(ax, t, words_of_interest, words_of_interest)
    # plt.savefig("m1.pdf")
def score_paragraphs(self, question, paragraphs: List[ExtractedParagraphWithAnswers]):
    tfidf = self._tfidf
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    q_words = {x for x in question if x.lower() not in self._stop}
    q_words_lower = {x.lower() for x in q_words}
    word_matches_features = np.zeros((len(paragraphs), 2))
    for para_ix, para in enumerate(paragraphs):
        found = set()
        found_lower = set()
        for sent in para.text:
            for word in sent:
                if word in q_words:
                    found.add(word)
                elif word.lower() in q_words_lower:
                    found_lower.add(word.lower())
        word_matches_features[para_ix, 0] = len(found)
        word_matches_features[para_ix, 1] = len(found_lower)

    tfidf = pairwise_distances(q_features, para_features, "cosine").ravel()
    starts = np.array([p.start for p in paragraphs])
    log_word_start = np.log(starts / 400.0 + 1)
    first = starts == 0
    scores = tfidf * self.TFIDF_W + self.LOG_WORD_START_W * log_word_start + self.FIRST_W * first + \
        self.LOWER_WORD_W * word_matches_features[:, 1] + self.WORD_W * word_matches_features[:, 0]
    return scores
def rank(self, questions: List[List[str]], paragraphs: List[List[List[str]]]):
    tfidf = self._tfidf
    para_features = tfidf.fit_transform([" ".join(" ".join(s) for s in x) for x in paragraphs])
    q_features = tfidf.transform([" ".join(q) for q in questions])
    scores = pairwise_distances(q_features, para_features, "cosine")
    return scores
def centroid_pairwise_dist(X, centroids):
    return pairwise_distances(X, centroids, metric='euclidean')
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances**2
            heterogeneity += np.sum(squared_distances)

    return heterogeneity
def _compute_radii(self):
    """Generate RBF radii"""
    # use supplied radii if present
    radii = self._get_user_components('radii')

    # compute radii
    if (radii is None):
        centers = self.components_['centers']
        n_centers = centers.shape[0]
        max_dist = np.max(pairwise_distances(centers))
        radii = np.ones(n_centers) * max_dist / sqrt(2.0 * n_centers)

    self.components_['radii'] = radii
def parameter_distance(params, dist_metric='canberra', scale='minmax', return_scaled=False):
    """
    Computes distances between subjects' respective parameter estimates

    Parameters
    ----------
    params : ndarray(shape=(nsubjects, nparams))
        Array of parameter estimates
    dist_metric : str (default='canberra')
        Distance metric to be used. Can take any value acceptable by
        ``sklearn.metrics.pairwise_distances``.
    scale : {'minmax', 'standard', 'none'}
        How to scale the parameters for distance computation
    return_scaled : bool
        Whether to return scaled parameters
    """
    if scale != 'none':
        if scale == 'minmax':
            scaler = MinMaxScaler()
        if scale == 'standard':
            scaler = StandardScaler()

        nparams = np.shape(params)[1]
        for j in range(nparams):
            scaledparam = scaler.fit_transform(params[:, j].reshape(-1, 1))
            params[:, j] = scaledparam.flatten()

    if return_scaled is True:
        D = (pairwise_distances(params, metric=dist_metric), params)
    else:
        D = pairwise_distances(params, metric=dist_metric)

    return D
def construct_k_nearest_matrix(self, dt_matrix, k):
    tmp = np.array(1 - pairwise_distances(dt_matrix[dt_matrix.columns[1:]], metric="cosine"))
    similarity_matrix = pd.DataFrame(tmp,
                                     index=dt_matrix.index.tolist(),
                                     columns=dt_matrix.index.tolist())
    for i in similarity_matrix.index:
        tmp = [int(i), []]
        j = 0
        while j < k:
            max_col = similarity_matrix.loc[i].idxmax(axis=1)
            similarity_matrix.loc[i][max_col] = -1
            if max_col != i:
                tmp[1].append(int(max_col))  # max column name
            j += 1
        self.k_nearest.append(tmp)
def test_precomputed_cross_validation():
    # Ensure array is split correctly
    rng = np.random.RandomState(0)
    X = rng.rand(20, 2)
    D = pairwise_distances(X, metric='euclidean')
    y = rng.randint(3, size=20)
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        metric_score = cross_val_score(Est(), X, y)
        precomp_score = cross_val_score(Est(metric='precomputed'), D, y)
        assert_array_equal(metric_score, precomp_score)
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    dist_array = np.sort(dist_array)  # np.sort returns a copy, so keep the sorted result
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radiusneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A)

    # Raise error when wrong parameters are supplied
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')

    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)

    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)

    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert(silhouette > 0)
    assert(silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)

    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, eigen_solver="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers),
                      random_state=0, eigen_solver="amg")
def test_spectral_unknown_assign_labels():
    # Test that SpectralClustering fails with an unknown assign_labels set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, assign_labels="<unknown>")
def train_wordfilter_coefficient(self, seed_words, wordfilters):
    mined_words = defaultdict(lambda: defaultdict(lambda: 0))
    filter_set = {wordfilter for (rng, wordfilter) in wordfilters}
    ranges = {rng for (rng, wordfilter) in wordfilters}

    for num_doc, doc in enumerate(Word2vecCorpus(self.corpus_file)):
        len_doc = len(doc)
        for rng in ranges:
            (fb, fe) = rng
            if len_doc < (fe - fb + 1):
                continue
            words = doc[-fb:-fe]
            contexts = []
            for i, word in enumerate(doc):
                if (i + fb < 0) or (i + fe >= len_doc):
                    continue
                contexts.append(tuple([doc[i+r] for r in range(fb, fe+1) if r != 0]))
            for i, context in enumerate(contexts):
                if context in filter_set:
                    mined_words[(rng, context)][words[i]] += 1

    result = []
    seeds_idx = sorted([self.word2index[seed] for seed in seed_words])
    seeds_vec = [self.word2vec_model.syn0[idx] for idx in seeds_idx]

    for ((rng, context), word2freq) in sorted(mined_words.items(), key=lambda x: sum(x[1].values()), reverse=True):
        word_freq = [(self.word2index[word], freq) for (word, freq) in word2freq.items()]
        word_freq = [v for v in word_freq if v[0] != -1]
        word_freq = sorted(word_freq)
        idx = [pair[0] for pair in word_freq]
        word_vec = self.word2vec_model.syn0[idx]
        sum_freq = sum([v[1] for v in word_freq])

        score = 0
        for seed_vec in seeds_vec:
            sim = 1 + -1 * pairwise_distances(word_vec, seed_vec, metric='cosine')
            score += sum([wf[1] * s for wf, s in zip(word_freq, sim)]) / sum_freq
        score /= len(seed_words)
        result.append((context, rng, score, sum_freq))

    return result
def likelihood_distance(loglik_func, data, params, diff_metric='sq',
                        dist_metric='cosine', verbose=False):
    """
    Estimates the likelihood of the data from the i'th subject using the
    parameter estimates of the j'th subject, for all i and j, then computes
    the distance between subjects' likelihood difference vectors

    Parameters
    ----------
    loglik_func : function
        The log-likelihood function to be used
    data : dict
        Data formatted for input into the log-likelihood function
    params : ndarray(shape=(nsubjects, nparams))
        Array of parameter estimates
    diff_metric : {'sq', 'diff', 'abs'}
        Which type of difference measure to compute, 'diff' is simple
        subtractive difference, whereas 'sq' and 'abs' are the squared and
        absolute differences, respectively
    dist_metric : str (default='cosine')
        The pairwise distance metric to use. Any option that can be passed
        into ``sklearn.metrics.pairwise_distances`` can work.
    verbose : bool
        Whether to print out progress

    Returns
    -------
    ndarray(shape=(nsubjects, nsubjects))
    """
    nsubjects = np.shape(params)[0]
    D = np.zeros([nsubjects, nsubjects])

    for i in range(nsubjects):
        S = data[i]['S']
        A = data[i]['A']
        R = data[i]['R']

        if verbose is True:
            print('Likelihood Differences: Subject ' + str(i))

        # Compute loglikelihood for subject i with own data
        LL0 = loglik_func(params=params[i, :], states=S, actions=A, rewards=R)

        for j in range(nsubjects):
            if i != j:
                LL1 = loglik_func(params=params[j, :], states=S, actions=A, rewards=R)
                if diff_metric == 'diff':
                    D[i, j] = LL1 - LL0
                elif diff_metric == 'sq':
                    D[i, j] = (LL1 - LL0)**2
                elif diff_metric == 'abs':
                    D[i, j] = np.abs(LL1 - LL0)

    return pairwise_distances(D, metric=dist_metric)
def test_precomputed(random_state=42):
    """Tests unsupervised NearestNeighbors with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    for method in ['kneighbors']:
        # TODO: also test radius_neighbors, but requires different assertion

        # As a feature matrix (n_samples by n_features)
        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
        nbrs_X.fit(X)
        dist_X, ind_X = getattr(nbrs_X, method)(Y)

        # As a dense distance matrix (n_samples by n_samples)
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check auto works too
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check X=None in prediction
        dist_X, ind_X = getattr(nbrs_X, method)(None)
        dist_D, ind_D = getattr(nbrs_D, method)(None)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Must raise a ValueError if the matrix is not of correct shape
        assert_raises(ValueError, getattr(nbrs_D, method), X)

    target = np.arange(X.shape[0])
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        print(Est)
        est = Est(metric='euclidean')
        est.radius = est.n_neighbors = 1
        pred_X = est.fit(X, target).predict(Y)
        est.metric = 'precomputed'
        pred_D = est.fit(DXX, target).predict(DYX)
        assert_array_almost_equal(pred_X, pred_D)