The following are 50 code examples, extracted from open source Python projects, that illustrate how to use sklearn.metrics.pairwise.pairwise_distances().
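Before the examples, here is a minimal sketch of the basic call, since the array shapes and the metric argument are what the examples below vary:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.rand(4, 3)  # 4 samples, 3 features
Y = np.random.rand(2, 3)  # 2 samples, 3 features

# Distance from every row of X to every row of Y: shape (4, 2)
D = pairwise_distances(X, Y, metric='euclidean')

# With a single argument: all pairwise distances within X, shape (4, 4)
D_self = pairwise_distances(X, metric='cosine')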
def decision_function(self, X):
    """Compute the distances to the nearest centroid for an array of test vectors X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    C : array, shape = [n_samples]
    """
    from sklearn.metrics.pairwise import pairwise_distances
    from sklearn.utils.validation import check_array, check_is_fitted

    check_is_fitted(self, 'centroids_')
    X = check_array(X, accept_sparse='csr')
    return pairwise_distances(X, self.centroids_, metric=self.metric).min(axis=1)
def test_similarity_calculations():
    """
    Tests the implementation of fast similarity calculations with PyTorch
    :return:
    """
    np.random.seed(1)

    # Create random data vectors
    for sigma in [0.01, 0.1, 0.5, 1]:
        A = np.random.randn(10, 23)
        sef_sim = fast_heat_similarity_matrix(A, sigma)
        assert sef_sim.shape[0] == 10
        assert sef_sim.shape[1] == 10
        sim = np.exp(-pairwise_distances(A, A)**2 / sigma**2)
        assert np.sum((sef_sim - sim)**2) < 1e-3
def test_cosine2jaccard():
    from sklearn.metrics.pairwise import pairwise_distances
    from freediscovery.metrics import (cosine2jaccard_similarity,
                                       jaccard2cosine_similarity)

    x = np.array([[0, 0, 1., 1.]])
    y = np.array([[0, 1., 1., 0]])

    S_cos = 1 - pairwise_distances(x, y, metric='cosine')
    S_jac = cosine2jaccard_similarity(S_cos)

    S_jac_ref = 1 - pairwise_distances(x.astype('bool'), y.astype('bool'),
                                       metric='jaccard')
    assert_allclose(S_jac, S_jac_ref)

    S_cos2 = jaccard2cosine_similarity(S_jac)
    assert_allclose(S_cos2, S_cos)
def centroid_similarity(X, internal_ids, nn_metric='cosine'):
    """ Given a list of documents in a cluster, compute the cluster centroid,
    inertia and individual distances

    Parameters
    ----------
    internal_ids : list
        a list of internal ids
    nn_metric : str
        a rescaling of the metric if needed
    """
    from ..metrics import _scale_cosine_similarity
    from sklearn.metrics.pairwise import pairwise_distances

    X_sl = X[internal_ids, :]
    centroid = X_sl.mean(axis=0)

    if centroid.ndim == 1:
        centroid = centroid[None, :]

    S_cos = 1 - pairwise_distances(X_sl, centroid, metric='cosine')
    S_sim = _scale_cosine_similarity(S_cos, metric=nn_metric)
    S_sim_mean = np.mean(S_sim)
    return float(S_sim_mean), S_sim[:, 0]
def query(vec, model, k, max_search_radius):
    data = model['data']
    table = model['table']
    random_vectors = model['random_vectors']
    num_vector = random_vectors.shape[1]

    # Compute bin index for the query vector, in bit representation.
    bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()

    # Search nearby bins and collect candidates
    candidate_set = set()
    for search_radius in range(max_search_radius + 1):
        candidate_set = search_nearby_bins(bin_index_bits, table, search_radius,
                                           initial_candidates=candidate_set)

    # Sort candidates by their true distances from the query
    nearest_neighbors = pd.DataFrame({'id': list(candidate_set)})
    candidates = data[np.array(list(candidate_set)), :]
    nearest_neighbors['distance'] = pairwise_distances(candidates, vec,
                                                       metric='cosine').flatten()

    return nearest_neighbors.sort_values(by='distance').head(k), len(candidate_set)
def pre_train(train_df, test_df, train_add, test_add):
    train = train_df.values[:, 1:-1]
    t = train_add.values[:, 1:-1]
    train = np.hstack((train, t))

    dtest = test_df.values[:, 1:]
    tA = test_add.values[:, 1:]
    dtest = np.hstack((dtest, tA))

    cor_distance = pairwise.pairwise_distances(dtest, train)

    # Collect the indices of the 10 nearest training samples for each test sample
    resultset = set()
    for tmp in cor_distance:
        index = np.argsort(tmp)
        for i in range(10):
            resultset.add(index[i])

    index = []
    for i in resultset:
        index.append(i)
    return index
def predict(self, X):
    """
    Classify the input data assigning the label of the nearest prototype

    Keyword arguments:
    X -- The feature vectors
    """
    classification = np.zeros(len(X))

    # Compute distances to the prototypes (template matching)
    if self.distance_metric == "euclidean":
        distances = pairwise_distances(X, self.M_, self.distance_metric)
    elif self.distance_metric == "minkowski":
        distances = pairwise_distances(X, self.M_, self.distance_metric)
    elif self.distance_metric == "manhattan":
        distances = pairwise_distances(X, self.M_, self.distance_metric)
    elif self.distance_metric == "mahalanobis":
        distances = pairwise_distances(X, self.M_, self.distance_metric)
    else:
        distances = pairwise_distances(X, self.M_, "euclidean")

    # Choose the class belonging to the nearest prototype
    for i in range(len(X)):
        classification[i] = self.outcomes[distances[i].tolist().index(min(distances[i]))]

    return classification
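A side note on the example above: since pairwise_distances already returns the full sample-by-prototype distance matrix, the per-sample loop can be vectorized with np.argmin. A minimal standalone sketch (the random data and prototype labels are made up for illustration):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.rand(5, 4)        # samples to classify
M = np.random.rand(3, 4)        # prototypes, one per class
outcomes = np.array([0, 1, 2])  # label attached to each prototype

# Row i holds the distances from X[i] to every prototype
distances = pairwise_distances(X, M, metric='euclidean')

# Nearest-prototype label for every sample, without an explicit loop
classification = outcomes[np.argmin(distances, axis=1)]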
def test_distance_calculations():
    """
    Tests the implementation of fast distance calculations with PyTorch
    :return:
    """
    np.random.seed(1)

    # Create random data vectors
    A = np.random.randn(10, 23)
    B = np.random.randn(5, 23)

    sef_dists = fast_distance_matrix(A, B)
    assert sef_dists.shape[0] == 10
    assert sef_dists.shape[1] == 5

    dists = pairwise_distances(A, B)
    assert np.sum((sef_dists - dists)**2) < 1e-3
def mean_data_distance(data):
    """
    Calculates the mean distance between a set of data points
    :param data:
    :return:
    """
    mean_distance = np.mean(pairwise_distances(data))
    return mean_distance
def compare_pic(self, feature1, feature2):
    predicts = pw.pairwise_distances(feature2, feature1, 'cosine')
    # predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
def initialize_layer(self, data, n_samples=10000):
    """
    Initializes the layer using k-means (sigma is set to the mean pairwise distance)
    :param data: data
    :param n_samples: n_samples to keep for initializing the model
    :return:
    """
    if self.features_fn is None:
        assert False

    idx = np.arange(data.shape[0])
    np.random.shuffle(idx)

    features = []
    for i in range(idx.shape[0]):
        feats = self.features_fn([data[idx[i]]])
        feats = feats.transpose((0, 2, 3, 1))
        feats = feats.reshape((-1, feats.shape[-1]))
        features.extend(feats)
        if len(features) > n_samples:
            break
    features = np.asarray(features)

    kmeans = KMeans(n_clusters=self.n_codewords, n_jobs=4, n_init=5)
    kmeans.fit(features)
    V = kmeans.cluster_centers_.copy()

    # Initialize gamma
    mean_distance = np.sum(pairwise_distances(V)) / (self.n_codewords * (self.n_codewords - 1))
    self.gamma.set_value(self.gamma.get_value() * np.float32(mean_distance))

    # Initialize codebook
    V = V.reshape((V.shape[0], V.shape[1], 1, 1))
    self.V.set_value(np.float32(V))
def delta(X, Y, n_jobs=-1, a=1, c=0):
    """Pairwise delta function: cosine and sigmoid

    :X: TODO
    :returns: TODO
    """
    D = pairwise_distances(X, Y, metric="cosine", n_jobs=n_jobs)
    if c != 0:
        D -= c
    if a != 1:
        D *= a
    D = expit(D)
    return D
def test_euclidean2cosine():
    from sklearn.metrics.pairwise import pairwise_distances

    x = normalize([[0, 2, 3, 5]])
    y = normalize([[1, 3, 6, 7]])

    D_cos = pairwise_distances(x, y, metric='cosine')[0, 0]
    S_cos = 1 - D_cos
    D_seuc = pairwise_distances(x, y, metric='euclidean', squared=True)[0, 0]

    assert_allclose(S_cos, seuclidean_dist2cosine_sim(D_seuc))
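The identity this test relies on: for L2-normalized vectors, squared euclidean distance and cosine similarity are related by ||x - y||^2 = 2(1 - cos(x, y)), so seuclidean_dist2cosine_sim presumably computes 1 - D/2. A quick numpy check of the relation itself, independent of that helper:

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances

x = normalize([[0., 2., 3., 5.]])
y = normalize([[1., 3., 6., 7.]])

D_seuc = pairwise_distances(x, y, metric='euclidean', squared=True)[0, 0]
S_cos = 1 - pairwise_distances(x, y, metric='cosine')[0, 0]

# For unit-norm rows: cos(x, y) = 1 - ||x - y||^2 / 2
assert np.isclose(S_cos, 1 - D_seuc / 2)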
def get_distances(self):
    distances = pairwise_distances(self.query_feats, self.db_feats,
                                   self.dist_type, n_jobs=-1)
    return distances
def fit(self, data):
    """
    :param data:
    :return:
    """
    [n_samples, n_obs] = data.shape

    self.protos = data[self.rng.choice(n_samples, self.n_protos), ]  # w
    self.context = np.zeros(self.protos.shape)                       # c

    ct = np.zeros((1, n_obs))
    wr = ct
    cr = wr

    for iteration in range(self.iterations):
        sample = data[self.rng.choice(n_samples, 1), ]

        ct = (1 - self.a) * wr + self.b * cr

        t = iteration / float(self.iterations)
        lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
        epsilon = self.epsilon_i * (self.lrate_f / float(self.lrate_i)) ** t

        d = (1 - self.a) * pairwise_distances(sample, self.protos) + \
            self.a * pairwise_distances(ct, self.context)
        I = np.argsort(np.argsort(d))
        min_id = np.where(I == 0)[1]  # column index of the winning (rank-0) prototype

        H = np.exp(-I / epsilon).ravel()
        diff_w = sample - self.protos
        diff_c = ct - self.context
        for i in range(self.n_protos):
            self.protos[i, :] += lrate * H[i] * diff_w[i, :]
            self.context[i, :] += lrate * H[i] * diff_c[i, :]

        wr = self.protos[min_id]
        cr = self.context[min_id]

    return self
def encode(self, data, metric='euclidean'):
    """
    Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    metric : string
        One of the valid options as defined for
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.
        Valid options include:
         - euclidean
         - cityblock
         - l1
         - cosine

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(self.protos)
    _, self.__symbols = nbrs.kneighbors(data)
    self.__encoding = self.protos[self.__symbols]

    return (self.__encoding, self.__symbols)
def fit(self, data):
    """
    Learn data, and construct a vector codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    Returns
    -------
    self : object
        The instance itself
    """
    [n_samples, _] = data.shape
    self.protos = data[self.rng.choice(n_samples, self.n_protos), ]

    # avg_p = np.mean(data, 0)
    # dist_from_avg_p = np.sum(pairwise_distances(avg_p, data))
    # ndistortion = []

    for iteration in range(self.iterations):
        sample = data[self.rng.choice(n_samples, 1), ]

        t = iteration / float(self.iterations)
        lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
        epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t

        D = pairwise_distances(sample, self.protos, metric='euclidean', n_jobs=self.n_jobs)
        I = np.argsort(np.argsort(D))

        H = np.exp(-I / epsilon).ravel()
        diff = sample - self.protos
        for proto_id in range(self.n_protos):
            self.protos[proto_id, :] += lrate * H[proto_id] * diff[proto_id, :]

        # nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(protos)
        # distances, _ = nbrs.kneighbors(data)
        # ndistortion.append(np.sum(distances) / dist_from_avg_p)

    return self
def encode(self, data, metric='euclidean'):
    """
    Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    metric : string
        One of the valid options as defined for
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.
        Valid options include:
         - euclidean
         - cityblock
         - l1
         - cosine

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    # Perform a proposed data mining procedure as described in [Laskaris2004].
    mds = MDS(1, random_state=self.rng)
    protos_1d = mds.fit_transform(self.protos).ravel()
    sorted_protos_1d = np.argsort(protos_1d)
    sprotos = self.protos[sorted_protos_1d]

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(sprotos)
    _, self.__symbols = nbrs.kneighbors(data)
    self.__encoding = sprotos[self.__symbols]

    return (self.__encoding, self.__symbols)
def grab_articles(self, ids):
    task_id = self.request.id
    ids = ids[0]
    print("Entering Grab Articles Task: ", len(ids))
    print("Task id from self: ", task_id)

    s = select([articles_db.c.id, articles_db.c.tfidf]).where(articles_db.c.id.in_(ids))
    all_articles = pd.read_sql(s, con=connection, chunksize=350)
    all_articles = pd.concat(all_articles, ignore_index=True)

    stored_data = json.loads(r.get(task_id))
    stored_data['status'] = "creating article matrix"
    r.set(task_id, json.dumps(stored_data))

    tfidf_dict = stored_data['tfidf_dict']
    all_articles = all_articles.append({'id': 1, 'tfidf': tfidf_dict}, ignore_index=True)

    corpus = helpers.generate_sparse_matrix(all_articles)
    query_article_vector = corpus.getrow(-1)
    all_articles['distance'] = pairwise_distances(corpus, query_article_vector,
                                                  metric='cosine').flatten()

    stored_data['status'] = "computing best matches"
    r.set(task_id, json.dumps(stored_data))

    max_distance_from_query = 0.75  # on a scale of 0 (exact match) to 1.0 (not even close)
    all_articles = all_articles[all_articles['distance'] < max_distance_from_query]
    print("Done computing matrix and distances")

    s = select([articles_db.c.id, articles_db.c.headline, articles_db.c.url,
                articles_db.c.date]).where(articles_db.c.id.in_(all_articles['id'].tolist()))
    all_articles = pd.read_sql(s, connection).set_index('id').join(
        all_articles.set_index('id')).sort_values(by='date')

    query_article = {'headline': stored_data['headline'],
                     'date': datetime.strptime(stored_data['date'], "%d-%b-%Y"),
                     'distance': 0,
                     'url': stored_data['url']}
    articles = helpers.make_article_array(all_articles, query_article)

    return articles, query_article['headline']
def pairwise_distances(self, X, Y=None, metric='cosine', n_jobs=1, **kwds):
    if self.prenorm:
        if metric == 'cosine':
            return self._cosine_distances_prenorm(X, Y)
        else:
            raise Exception(
                'Vectors are normalized and will work only with cosine.')
    return smp.pairwise_distances(X, Y, metric=metric, n_jobs=n_jobs, **kwds)
def all_distances(self, l1, metric='cosine'):
    """Return distance matrix with distances to all words."""
    l1_vecs = self.word_vectors_matrix(l1)
    l1_labels = [self.label(e) for e in l1]
    sims = self.pairwise_distances(l1_vecs, self.vectors, metric=metric)
    return pd.DataFrame(sims, l1_labels, self.words)
def pair_distance(self, w1, w2, metric='cosine'):
    """Calculate distance between two words."""
    distance = self.pairwise_distances(
        self.get_vector(w1), self.get_vector(w2), metric=metric)
    return distance[0, 0]
def matrix_distances(self, l1, l2=None, metric='cosine'):
    """Return distance matrix with distances between pairs of words."""
    l1_vecs = self.word_vectors_matrix(l1)
    l1_labels = [self.label(e) for e in l1]
    if l2 is None:
        sims = self.pairwise_distances(l1_vecs, metric=metric)
        l2_labels = l1_labels
    else:
        l2_vecs = self.word_vectors_matrix(l2)
        l2_labels = [self.label(e) for e in l2]
        sims = self.pairwise_distances(l1_vecs, l2_vecs, metric=metric)
    return pd.DataFrame(sims, l1_labels, l2_labels)
def computeProbabilities(X, perplexity=30.0, tolerance=1e-5):
    # Perform an initial dimensionality reduction
    pca = PCA(n_components=50)
    X = pca.fit_transform(X)

    numSamples = X.shape[0]
    P = np.zeros((numSamples, numSamples))
    D = pairwise_distances(X, squared=True)

    for i in range(numSamples):
        indices = np.concatenate((np.arange(i), np.arange(i + 1, numSamples)))
        distancesFromI = D[i, indices]
        sigma = binarySearch(computePerplexity, distancesFromI, tolerance, perplexity)
        precision = 1.0 / sigma

        # Compute a "row" of matrix P: the probabilities wrt point I
        PwrtI = np.exp(-distancesFromI * precision)
        PwrtI /= sum(PwrtI)

        # Insert an element corresponding to I wrt I
        PwrtI = np.concatenate((PwrtI[0:i], [0.0], PwrtI[i:numSamples]))

        # Insert the row
        P[i, :] = PwrtI

    return P
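For reference, the quantity this function fills P with is the conditional probability used by SNE/t-SNE,

$$p_{j|i} = \frac{\exp(-\beta_i \lVert x_i - x_j \rVert^2)}{\sum_{k \neq i} \exp(-\beta_i \lVert x_i - x_k \rVert^2)},$$

where $\beta_i$ (the code's precision) is tuned per point by the binary search so that the perplexity of $p_{\cdot|i}$ matches the requested value.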
def main(args):
    PF, PL, GF, GL = _get_test_data(args.result_dir)
    D = pairwise_distances(GF, PF, metric=args.method, n_jobs=-2)

    gallery_labels_set = np.unique(GL)
    for label in PL:
        if label not in gallery_labels_set:
            print('Probe-id is out of Gallery-id sets.')

    Times = 100
    k = 20
    res = np.zeros(k)

    gallery_labels_map = [[] for i in range(gallery_labels_set.size)]
    for i, g in enumerate(GL):
        gallery_labels_map[g].append(i)

    for __ in range(Times):
        # Randomly select one gallery sample per label selected
        newD = np.zeros((gallery_labels_set.size, PL.size))
        for i, g in enumerate(gallery_labels_set):
            j = np.random.choice(gallery_labels_map[g])
            newD[i, :] = D[j, :]
        # Compute CMC
        res += _cmc_core(newD, gallery_labels_set, PL, k)
    res /= Times

    for topk in [1, 5, 10, 20]:
        print("{:8}{:8.1%}".format('top-' + str(topk), res[topk - 1]))
def getDist(feat1, feat2, metric):
    pair_num = len(feat1)

    import sklearn.metrics.pairwise as pw
    mt = pw.pairwise_distances(feat1, feat2, metric=metric)

    # Keep only the diagonal: the distance between each corresponding pair
    distance = np.empty((pair_num,))
    for i in range(pair_num):
        distance[i] = mt[i, i]
    return distance

# Extract feature via network.
def constructCovarianceMatrix(self):
    # This function constructs the covariance matrix for the dataset and then
    # does a label propagation over it
    self.covarianceMatrix = np.cov(self.trainVectorsPCA.T)  # as numpy treats them as column vectors
    self.inverseCovarianceMatrix = np.linalg.inv(self.covarianceMatrix)

    # Compute the Cholesky decomposition and then transform the data into the new space
    self.L_cov = np.linalg.cholesky(self.covarianceMatrix)
    self.allDataCov = np.dot(self.allDataPCA, self.L_cov.T)

    projectedDigits = TSNE(random_state=randomState).fit_transform(self.allDataCov)
    plt.figure()
    plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
    plt.title('Data projected by Covariance Matrix in Mahalanobis metric')
    plt.savefig(pp, format='pdf')
    plt.close()

    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    accs = []
    for k in ks:
        # Rebuild the distance matrix for each k, since the loop below zeroes entries in place
        self.pwdis = pairwise_distances(self.allDataCov)
        self.D = np.zeros(self.pwdis.shape)
        for i in range(0, self.pwdis.shape[0]):
            l1 = self.pwdis[i].tolist()
            # print('l1 is ', l1, '\n\n')
            allnearestNeighbours = sorted(range(len(l1)), key=lambda i: l1[i])
            # Now set all the weights except for the k nearest to 0
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = sum(self.pwdis[i] + 0.01)
        print('accuracy by using Covariance Matrix for Mahalanobis Distance for k = ', k, '\n')
        accs.append(self.labelPropogation())

    plt.figure()
    plt.plot(ks, accs)
    plt.title('Plot of accuracy vs k using Covariance Matrix in Mahalanobis metric')
    plt.savefig(pp, format='pdf')
def constructEucleadianGaussianKernel(self):
    base_dists = pairwise_distances(self.allDataPCA)
    maccs = []
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    for k in ks:
        sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
        accs = []
        for sigma in sigmas:
            # Rebuild the Gaussian kernel from the original distances for every (k, sigma) pair
            self.pwdis = np.exp(-1 * base_dists / (2 * sigma * sigma))
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                # The kernel holds similarities, so the nearest neighbours carry the largest weights
                allnearestNeighbours = sorted(range(len(l1)), key=lambda i: l1[i], reverse=True)
                # Now set all the weights except for the k nearest to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i])
            # Here we make no transformation on the dataset
            print('accuracy for constructEucleadianGaussianKernel with k =', k, 'and sigma =', sigma, 'is\n')
            accs.append(self.labelPropogation())
        maccs.append(np.mean(accs))
    plt.figure()
    plt.plot(ks, maccs)
    plt.title('Accuracy vs k for Euclidean Gaussian Kernel')
    plt.savefig(pp, format='pdf')
    plt.close()
def constructEucleadianGaussianKernelNoPca(self):
    base_dists = pairwise_distances(self.allVectors)
    maccs = []
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    for k in ks:
        sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
        accs = []
        for sigma in sigmas:
            # Rebuild the Gaussian kernel from the original distances for every (k, sigma) pair
            self.pwdis = np.exp(-1 * base_dists / (2 * sigma * sigma))
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                # The kernel holds similarities, so the nearest neighbours carry the largest weights
                allnearestNeighbours = sorted(range(len(l1)), key=lambda i: l1[i], reverse=True)
                # Now set all the weights except for the k nearest to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i])
            # Here we make no transformation on the dataset
            print('accuracy for constructEucleadianGaussianKernel with k =', k, 'and sigma =', sigma, 'is\n')
            accs.append(self.labelPropogation())
        maccs.append(np.mean(accs))
    plt.figure()
    plt.plot(ks, maccs)
    plt.title('Accuracy vs k for Euclidean Gaussian Kernel')
    plt.savefig(pp, format='pdf')
    plt.close()
def constructSimilartyMatrixCosine(self):
    # This is a simple k nearest neighbour approach based on the cosine distance:
    # find the k nearest neighbours for each node
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    accs = []
    for k in ks:
        self.pwdis = pairwise_distances(self.allVectors, metric='cosine')
        # Now we have all the pairwise cosine distances between all the sentences
        # and can do a knn neighbour search, then construct the diagonal weight
        # matrix, which has the sum of all the weights
        self.D = np.zeros(self.pwdis.shape)
        for i in range(0, self.pwdis.shape[0]):
            l1 = self.pwdis[i].tolist()
            # print('l1 is ', l1, '\n\n')
            allnearestNeighbours = sorted(range(len(l1)), key=lambda i: l1[i])
            # Now set all the weights except for the k nearest to 0
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = sum(self.pwdis[i])
        print('accuracy on non pca data using cosine and k = ', k, ' is ', '\n')
        accs.append(self.labelPropogation())
    plt.figure()
    plt.plot(ks, accs)
    plt.title('Plot of accuracy vs k using cosine non PCA data')
    plt.savefig(pp, format='pdf')
    plt.close()
def constructSimilartyMatrixCosinePCA(self):
    # This is a simple k nearest neighbour approach based on the cosine distance:
    # find the k nearest neighbours for each node
    ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
    accs = []
    for k in ks:
        self.pwdis = pairwise_distances(self.allDataPCA, metric='cosine')
        # Now we have all the pairwise cosine distances between all the sentences
        # and can do a knn neighbour search, then construct the diagonal weight
        # matrix, which has the sum of all the weights
        self.D = np.zeros(self.pwdis.shape)
        for i in range(0, self.pwdis.shape[0]):
            l1 = self.pwdis[i].tolist()
            # print('l1 is ', l1, '\n\n')
            allnearestNeighbours = sorted(range(len(l1)), key=lambda i: l1[i])
            # Now set all the weights except for the k nearest to 0
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = sum(self.pwdis[i])
        print('Now computing accuracy for cosine metric on PCA data')
        accs.append(self.labelPropogation())
    plt.figure()
    plt.plot(ks, accs)
    plt.title('Plot of accuracy vs k using cosine PCA data')
    plt.savefig(pp, format='pdf')
    plt.close()
    # Now we have the weight matrix graph based on the cosine distance
    # print('self.D is ', self.D)
def calc_cosine_dist(text_a, text_b):
    return pairwise_distances(text_a, text_b, metric='cosine')[0][0]
def calc_cosine_dist(text_a, text_b, metric='euclidean'):
    return pairwise_distances([text_a], [text_b], metric=metric)[0][0]
def predict_proba(self, X):
    """
    Returns a matrix for each of the samples to belong to each of the classes.
    The matrix has shape = [n_samples, n_classes] where n_samples is the size
    of the first dimension of the input matrix X and n_classes is the number
    of classes as determined from the parameter 'y' obtained during training.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Prediction vector, where n_samples is the number of samples and
        n_features is the number of features.
    """
    probabilities = np.zeros((X.shape[0], self.y.shape[1]), dtype=np.float64)
    distances = pairwise_distances(X, self.centroids_, metric=self.metric)

    # In order to get probability-like values, we ensure that the closer
    # the distance is to zero, the closer the probability is to 1
    if self.metric == 'cosine':
        distances = 1 - distances
    else:
        # In the case of the euclidean distance metric we need to normalize
        # by the largest distance to get a value between 0 and 1
        distances = 1 - (distances / distances.max())

    # Map back onto a matrix containing all labels
    probabilities[:, self._mem_original_mapping] = distances
    return probabilities
def assign_to_closest(X, centers, metric='euclidean'):
    return np.argmin(pairwise_distances(X, centers, metric=metric), axis=1)
def sq_cdist(A, B):
    return pairwise_distances(A, B, 'sqeuclidean')

# Sets of inputs
def sq_cdist(A, B):
    return pairwise_distances(A, B, 'sqeuclidean')

# Sets of input defining sizes
def sort(self, word):
    '''
    Use an input word to sort words using cosine distance in ascending order
    '''
    assert word in self.dictionary
    i = self.dictionary[word]
    vec = self.final_embeddings[i].reshape(1, -1)
    # Calculate pairwise cosine distance and flatten to 1-d
    pdist = pairwise_distances(self.final_embeddings, vec, metric='cosine').ravel()
    return [self.reverse_dictionary[i] for i in pdist.argsort()]