The following 50 code examples, extracted from open-source Python projects, illustrate how to use scipy.spatial.distance.cosine().
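As a quick orientation before the examples (a minimal sketch, not taken from any of the projects below): scipy.spatial.distance.cosine(u, v) returns the cosine *distance*, i.e. 1 minus the cosine similarity, which is why many examples compute 1 - cosine(...) to recover a similarity.

import numpy as np
from scipy.spatial.distance import cosine

u = np.array([1.0, 0.0])
v = np.array([0.0, 1.0])

print(cosine(u, u))      # 0.0 -- identical direction, zero distance
print(cosine(u, v))      # 1.0 -- orthogonal vectors
print(1 - cosine(u, v))  # cosine similarity, as many examples below compute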
def correlations(A, B, pc_n=100):
    p = 1 - distance.correlation(A.flatten(), B.flatten())
    spear = spearmanr(A.flatten(), B.flatten())
    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i], B[i])
    pg = np.average(dist_genes[np.isfinite(dist_genes)])
    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:, i], B[:, i])
    ps = np.average(dist_sample[np.isfinite(dist_sample)])
    pc_dist = []
    if pc_n > 0:
        u0, s0, vt0 = np.linalg.svd(A)
        u, s, vt = np.linalg.svd(B)
        for i in range(pc_n):
            pc_dist.append(abs(1 - distance.cosine(u0[:, i], u[:, i])))
        pc_dist = np.array(pc_dist)
    return p, spear[0], pg, ps, pc_dist
def evaluate1Word(wv, reference):
    """Evaluate wv against reference, return (rho, count) where rho is
    Spearman's rho and count is the number of reference word pairs
    that could be evaluated against.
    """
    count = 0
    gold, predicted = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " not in words[0] and " " not in words[1]:
            try:
                v1, v2 = wv[words[0]], wv[words[1]]
            except KeyError:
                count += 1
                continue
            gold.append((words, sim))
            predicted.append((words, cosine(v1, v2)))
    simlist = lambda ws: [s for w, s in ws]
    rho, p = spearmanr(simlist(gold), simlist(predicted))
    print("Words not found in WordVector:", count)
    return (rho, len(gold))
def get_best_label(label_list, num):
    topic_ls = get_topic_lg(topic_list[num])
    val_dict = {}
    for item in label_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]  # Extract letter trigrams for the label
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(list(topic_ls.keys()) + list(label_cnt.keys())))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))  # Cosine similarity
        val_dict[item] = val
    list_sorted = sorted(val_dict.items(), key=lambda x: x[1], reverse=True)  # Sort the labels by rank
    return [i[0] for i in list_sorted[:int(args.num_unsup_labels)]]
def texts_tfidf(ids, important_texts, citations_texts):
    '''
    Generates tf-idf vectors for each text; cosine similarity between
    the vectors is computed separately (see texts_similarity below).
    '''
    tfidf = TfidfVectorizer(strip_accents='ascii',
                            stop_words='english',
                            ngram_range=(1, 2),
                            min_df=2)

    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()

    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()

    return terms1, terms2, freqs1, freqs2
def texts_similarity(terms1, terms2, freqs1, freqs2):
    # Merge all terms
    terms = list(set(terms1 + terms2))
    npapers = freqs1.shape[0]
    sims = np.empty(npapers, float)
    for i in range(npapers):
        # If one of the vectors is nil, skip it
        if (freqs1[i].sum() == 0.0) or (freqs2[i].sum() == 0.0):
            continue
        # Change representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1.getrow(i).toarray()[0])
        fmap2 = to_dict(terms2, freqs2.getrow(i).toarray()[0])
        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)
        sims[i] = 1.0 - cosine(vec1, vec2)
    return sims
def random_similarity(terms1, terms2, freqs1, freqs2):
    # Merge all terms
    terms = list(set(terms1 + terms2))
    npapers = freqs1.shape[0]
    sims = np.empty(npapers, float)
    for i in range(npapers):
        a = random.randint(0, npapers - 1)
        b = random.randint(0, npapers - 1)
        # If one of the vectors is nil, skip it
        if (freqs1[a].sum() == 0.0) or (freqs2[b].sum() == 0.0):
            continue
        # Change representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1[a].toarray()[0])
        fmap2 = to_dict(terms2, freqs2[b].toarray()[0])
        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)
        sims[i] = 1.0 - cosine(vec1, vec2)
    return sims
def sanity_check(test_emb, train_emb, num_test):
    '''
    Sanity check on the cosine similarity calculations.
    Finds the closest vector in the space by brute force.
    '''
    correct_list = []
    for i in range(num_test):
        smallest_norm = np.inf
        index = 0
        for j in range(len(train_emb)):
            norm = np.linalg.norm(train_emb[j] - test_emb[i])
            if norm < smallest_norm:
                smallest_norm = norm
                index = j
        correct_list.append(index)
    # Pad the list to make it the same length as test_emb
    for i in range(len(test_emb) - num_test):
        correct_list.append(-1)
    return correct_list
def token_similarity(self, words, rwords):
    words = set(words)
    rwords = set(rwords)
    word_vec = np.zeros(self.word_dim)
    rword_vec = np.zeros(self.word_dim)
    word_count = 0
    rword_count = 0
    for word in words:
        if word in self.word_vec and word not in self.stopwords:
            word_vec += self.word_vec[word]
            word_count += 1
    for word in rwords:
        if word in self.word_vec:
            rword_vec += self.word_vec[word]
            rword_count += 1
    if word_count > 0:
        word_vec = word_vec / word_count
    if rword_count > 0:
        rword_vec = rword_vec / rword_count
    if word_count > 0 and rword_count > 0:
        return cosine(word_vec, rword_vec)
    else:
        return 1
def nearest_words(self, word, top=20, display=False):
    """
    Find the nearest words to the word
    according to the cosine similarity.
    """
    W = self.W / np.linalg.norm(self.W, axis=0)
    if type(word) == str:
        vec = self.word_vector(word, W)
    else:
        vec = word / np.linalg.norm(word)
    cosines = (vec.T).dot(W)
    args = np.argsort(cosines)[::-1]
    nws = []
    for i in range(1, top + 1):
        nws.append(self.inv_vocab[args[i]])
        if display:
            print(self.inv_vocab[args[i]], round(cosines[args[i]], 3))
    return nws
def argmax_fun(W, indices, argmax_type='levi'):
    """
    cosine: b* = argmax cosine(b*, b - a + a*)
    levi: b* = argmax cos(b*,a*)cos(b*,b)/(cos(b*,a)+eps)
    """
    if argmax_type == 'levi':
        W = W / np.linalg.norm(W, axis=0)
        words3 = W[:, indices]
        cosines = ((words3.T).dot(W) + 1) / 2
        obj = (cosines[1] * cosines[2]) / (cosines[0] + 1e-3)
        pred_idx = np.argmax(obj)
    elif argmax_type == 'cosine':
        words3_vec = W[:, indices].sum(axis=1) - 2 * W[:, indices[0]]
        W = W / np.linalg.norm(W, axis=0)
        words3_vec = words3_vec / np.linalg.norm(words3_vec)
        cosines = (words3_vec.T).dot(W)
        pred_idx = np.argmax(cosines)
    return pred_idx
def synonyms_by_synset(self, synset_name, topn=3):
    ssid = self.id_table[synset_name]
    doc = self.doc_matrix[ssid]
    found_indices = set([ssid])
    synonyms = []
    for _ in range(topn):
        min_index = 0
        min_val = 10
        for i in range(self.doc_matrix.shape[0]):
            cos_dist = cosine(self.doc_matrix[i], doc)
            if i not in found_indices and cos_dist < min_val:
                min_index = i
                min_val = cos_dist
        found_indices.add(min_index)
        synonyms.append((self.definitions[min_index], min_val))
    return synonyms
def get_sils_matrix(method, scores, wordlist):
    '''
    See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead
    of similarities.

    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method == 'direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1 - sims
    elif method == 'dict_cosine':  # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i, {}), scores.get(j, {}))
                          for j in wordlist] for i in wordlist])
    elif method == 'dict_JS':  # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i, {}), scores.get(j, {}))[0]
                          for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = list(scores.values())[0].shape[0]
        sils = np.array([[cosine(scores.get(i, np.zeros(d)), scores.get(j, np.zeros(d)))
                          for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s\n' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
def get_sentiment_sim(context_seqs, gen_seqs):
    '''Return the cosine similarity between the sentiment scores of each
    context and the corresponding generated sequence; the sentiment scores
    are given by spacy.'''
    gen_seqs = check_seqs_format(gen_seqs)
    emotion_types = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE',
                     'HAPPY', 'INSPIRED', 'SAD']
    gen_sentiment_sim_scores = []
    for context_seq, gen_seqs_ in zip(context_seqs, gen_seqs):
        context_sentiment = lexicon_methods.emotional_valence(encoder(context_seq))
        # add a tiny number to avoid NaN when all scores are 0
        context_sentiment = numpy.array([context_sentiment[emotion_type]
                                         for emotion_type in emotion_types]) + 1e-8
        sentiment_sim_scores = []
        for gen_seq in gen_seqs_:
            gen_sentiment = lexicon_methods.emotional_valence(encoder(gen_seq))
            gen_sentiment = numpy.array([gen_sentiment[emotion_type]
                                         for emotion_type in emotion_types]) + 1e-8
            sentiment_sim = 1 - cosine(context_sentiment, gen_sentiment)
            sentiment_sim_scores.append(sentiment_sim)
        gen_sentiment_sim_scores.append(sentiment_sim_scores)
    gen_sentiment_sim_scores = numpy.array(gen_sentiment_sim_scores)
    return {'sentiment_sim_scores': gen_sentiment_sim_scores,
            'mean_sentiment_sim_scores': numpy.mean(gen_sentiment_sim_scores)}
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
def find_similar_words(wordvecs):
    """
    Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    from sklearn.metrics import pairwise_distances
    from scipy.spatial.distance import cosine

    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine',
                                              # metric='euclidean',
                                              )
    id2word = {}
    for key, value in wordvecs.word_idx_map.items():
        assert value not in id2word
        id2word[value] = key

    while True:
        word = input("Enter a word ('STOP' to quit): ")
        if word == 'STOP':
            break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print('%s not in the vocabulary.' % word)
            continue
        sim_w_id = pairwise_sim_mat[w_id - 1].argsort()[-10:][::-1]
        for i in sim_w_id:
            print(id2word[i + 1], end=' ')
        print('')
def generate_answer(self, msg_text, chat_id):
    minimum_index = [1 - (10 ** (-5)), -1]  # min value / minimum index
    if chat_id in self.vectorizer:
        t = self.vectorizer[chat_id].transform([msg_text]).toarray()[0]
    else:
        return ""
    for i, t2 in enumerate(self.mat[chat_id].toarray()):
        w = cosine(t, t2)
        if abs(w) <= minimum_index[0]:
            if minimum_index[0] == abs(w):
                # equal weight, let's take the longer message
                if len(self.speech[chat_id][0][i]) > len(self.speech[chat_id][0][minimum_index[1]]):
                    minimum_index[1] = i
            else:
                # not equal, take the lower weight
                minimum_index[0] = w
                minimum_index[1] = i
    if minimum_index[1] == -1 or minimum_index[0] > 0.85:
        # no message found or score too bad
        return ""
    from_sent_id = self.speech[chat_id][1][minimum_index[1]]
    for i in range(1, 5):
        try:
            if from_sent_id != self.speech[chat_id][1][minimum_index[1] + i]:
                return self.speech[chat_id][0][minimum_index[1] + i]
        except IndexError:
            return ""
    return ""
def most_similar(self, word, num_similar=5):
    idx = self._w2idx[word]
    y = list(range(self._matrix.shape[0]))
    y.pop(idx)
    most_similar = [(1, 0)] * num_similar
    for i in y:
        dist = cosine(self._matrix[idx], self._matrix[i])
        if dist < most_similar[-1][0]:
            most_similar.pop()
            most_similar.append((dist, i))
            most_similar = sorted(most_similar)
    most_similar = [(distance, self._idx2w[i]) for (distance, i) in most_similar]
    return most_similar
def all_col_dist(m):
    D = m.shape[1]
    d = np.zeros((D, D))
    for i in range(D):
        div = m[:, i]
        for j in range(D):
            djv = m[:, j]
            d[j][i] = cosine(div, djv)
    return d
def choose_best_action(self, list_of_words):
    min_distance = 3
    best_matching_action = None
    tf_idf_shelve = shelve.open(self.tf_idf_shelve_file_name)
    current_sentence_centroid = self.compute_list_of_words_centroid(list_of_words)
    for action, centroid in tf_idf_shelve[CENTROID].items():
        distance = cosine(centroid, current_sentence_centroid)
        print(action, distance)
        if distance <= min_distance:
            min_distance = distance
            best_matching_action = action
    tf_idf_shelve.close()
    return current_sentence_centroid, best_matching_action, min_distance
def calAvgSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    assert len(senseVec1) == len(senseVec2)
    avgCos = []
    for t in range(len(senseVec1)):
        thisCos = []
        p1 = senseScore1[t]
        p2 = senseScore2[t]
        for i in range(len(senseVec1[t])):
            for j in range(len(senseVec2[t])):
                thisCos.append((1 - cosine(senseVec1[t][i], senseVec2[t][j])) * p1[i] * p2[j])
        avgCos.append(np.sum(thisCos))
    return spearmanr(test_score, avgCos)[0]
def calMaxSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    assert len(senseVec1) == len(senseVec2)
    avgCos = []
    for t in range(len(senseVec1)):
        i = np.argmax(senseScore1[t])
        j = np.argmax(senseScore2[t])
        thisCos = 1 - cosine(senseVec1[t][i], senseVec2[t][j])
        avgCos.append(thisCos)
    return spearmanr(test_score, avgCos)[0]
def cosine_similarity(a, b):
    # returns the cosine similarity between a and b
    return 1.0 - cosine(a, b)
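A quick illustrative check of the wrapper above (values chosen here for demonstration): since SciPy returns a distance, subtracting it from 1 recovers the similarity, which ignores magnitude and is signed by direction.

# assumes the cosine_similarity wrapper above and:
import numpy as np
from scipy.spatial.distance import cosine

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])    # same direction, different magnitude

print(cosine_similarity(a, b))   # 1.0: cosine similarity ignores magnitude
print(cosine_similarity(a, -b))  # -1.0: opposite directions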
def cosine_similarities(a, b, transform):
    """
    Returns a list of cosine similarities between the lists of vectors a and b.
    The z_score transformation is applied if transform == True.
    """
    a = numpy.stack(a)
    b = numpy.stack(b)

    # transform if requested
    if transform:
        print("transforming")
        # z_score is written to apply the same scale to a and b
        a, b = z_score(a, b)

    print("calculating cosine dists")
    cos = [cosine_similarity(a[i], b[i]) for i in range(len(a))]
    return cos
def delta(u, v):
    """
    cosine ° sigmoid

    >>> delta([0.2], [0.3])
    0.5
    >>> delta([0.3], [0.2])
    0.5
    >>> delta([0.1,0.9], [-0.9,0.1]) == delta([-0.9,0.1], [0.1,0.9])
    True
    """
    # TODO scale with a and c
    return expit(cosine(u, v))
def reduncy(sen_vec, doc_vec):
    return 1 - cosine(sen_vec, (doc_vec - sen_vec))
def relavence(sen_vec, doc_vec):
    return 1 - cosine(sen_vec, doc_vec)
def compute_distance(query_channel, channel, mean_vec, distance_type='eucos'):
    """ Compute the specified distance type between channels of the mean
    vector and the query image. In the caffe library, the FC8 layer consists
    of 10 channels. Here, we compute the distance of each channel (from the
    query image) with the respective channel of the Mean Activation Vector.
    In the paper, we considered a hybrid distance, eucos, which combines
    euclidean and cosine distance for bounding open space. Alternatively,
    other distances such as euclidean or cosine can also be used.

    Input:
    --------
    query_channel: particular FC8 channel of the query image
    channel: channel number under consideration
    mean_vec: mean activation vector

    Output:
    --------
    query_distance: distance between the respective channels
    """
    if distance_type == 'eucos':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel) / 200. \
                         + spd.cosine(mean_vec[channel, :], query_channel)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel) / 200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_vec[channel, :], query_channel)
    else:
        print("distance type not known: enter either of eucos, euclidean or cosine")
    return query_distance
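A hedged sketch of how compute_distance might be called; the shapes, the random data, and the channel index below are illustrative assumptions, not from the source project.

import numpy as np
import scipy.spatial.distance as spd

# Illustrative shapes only: 10 FC8 channels, 512-dim activation each.
mean_vec = np.random.rand(10, 512)   # Mean Activation Vector (one row per channel)
query_channel = np.random.rand(512)  # activation of one channel for a query image

channel = 3
d_eucos = compute_distance(query_channel, channel, mean_vec, distance_type='eucos')
# Equivalent by the definition above: scaled euclidean plus cosine distance.
d_check = spd.euclidean(mean_vec[channel, :], query_channel) / 200. \
          + spd.cosine(mean_vec[channel, :], query_channel)
assert np.isclose(d_eucos, d_check)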
def cmp_vectors(v1, v2):
    # c = cosine(normed(v1), normed(v2))
    # c = cosine(v1, v2)
    c = v1 @ v2
    return c
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib to support together with
        # approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
def make_dist(vectors, options):
    if options.metric != 'cosine':
        return vectors, metrics[options.metric]
    else:
        # normalize once only
        vectors = [v / numpy.linalg.norm(v) for v in vectors]
        return vectors, lambda u, v: 1 - numpy.dot(u, v)
def cosine(v1, v2):
    # Note: unlike scipy.spatial.distance.cosine, this local function
    # returns the cosine *similarity*, not the distance.
    return numpy.dot(v1 / numpy.linalg.norm(v1), v2 / numpy.linalg.norm(v2))
def get_lt_ranks(lab_list, num):
    topic_ls = get_topic_lt(topic_list[num])
    val_dict = {}
    val_list = []
    final_list = []
    for item in lab_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]  # Letter trigrams for the candidate label
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(list(topic_ls.keys()) + list(label_cnt.keys())))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))  # Cosine similarity
        val_list.append((item, val))
    rank_val = [i[1] for i in val_list]
    arr = np.array(rank_val)
    order = arr.argsort()
    ranks = order.argsort()
    for i, elem in enumerate(val_list):
        final_list.append((elem[0], ranks[i], int(num)))
    return final_list

# Generates the letter trigram feature
def get_lt_ranks(lab_list, num):
    topic_ls = get_topic_lt(topic_list[num])  # Get letter trigrams for the topic terms
    val_dict = {}
    val_list = []
    final_list = []
    for item in lab_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)]  # Get the trigrams for the label candidate
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(list(topic_ls.keys()) + list(label_cnt.keys())))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic), np.array(listlabel))  # Cosine similarity
        val_list.append((item, val))
    rank_val = [i[1] for i in val_list]
    arr = np.array(rank_val)
    order = arr.argsort()
    ranks = order.argsort()
    for i, elem in enumerate(val_list):
        final_list.append((elem[0], ranks[i], int(num)))
    return final_list

# This calls the above method to get the letter trigram feature
def find_nearest(skip_words, vec, id_to_word, df, num_results=1, method='cosine'):
    if method == 'cosine':
        minim = []  # (distance, index, vector)
        for i, v in enumerate(df):
            # skip the base word, it's usually the closest
            if id_to_word[i] in skip_words:
                continue
            dist = cosine(vec, v)
            minim.append((dist, i, v))
        minim = sorted(minim, key=lambda v: v[0])
        # return list of (word, cosine distance, vector) tuples
        return [(id_to_word[minim[i][1]], minim[i][0], minim[i][2])
                for i in range(num_results)]
    else:
        raise Exception('{} is not an accepted method parameter'.format(method))
def turn(gs, word_to_id, id_to_word, df, soft_score):
    gs['turn_number'] += 1
    names = list(gs['players'].keys())
    current_player = names[(gs['turn_number'] % len(names) - 1)]
    while True:
        expr = input('{}, please enter a word expression:\n> '.format(current_player))
        try:
            vec, skip_words = eval_expression(expr, word_to_id, word_to_id, df)
        except Exception as err:
            print(err)
            continue
        break
    answers = {}
    for name in gs['players']:
        while True:
            word = input('{}, please enter your answer: '.format(name))
            if word in word_to_id:
                answers[name] = df[word_to_id[word]]
                break
            else:
                print('{} is not in the dataset, please enter another word.'.format(word))
    answer_word, answer_dist, answer_vec = find_nearest(skip_words, vec, id_to_word, df)[0]
    # transform answers from vectors to distances
    for k, v in answers.items():
        answers[k] = cosine(v, answer_vec)
    winner = min(answers, key=answers.get)
    if not soft_score:
        gs['players'][winner] += 1
    else:
        for name in answers:
            gs['players'][name] += round(answers[name], 2)
    print('Computer says {} = {}'.format(expr, colored(answer_word, 'cyan')))
    print('{} wins this round.'.format(colored(winner, 'green')))
    print_standings(gs)
def find_nearest(words, vec, id_to_word, df, num_results, method='cosine'):
    if method == 'cosine':
        minim = []  # (distance, index)
        for i, v in enumerate(df):
            # skip the base word, it's usually the closest
            if id_to_word[i] in words:
                continue
            dist = cosine(vec, v)
            minim.append((dist, i))
        minim = sorted(minim, key=lambda v: v[0])
        # return list of (word, cosine distance) tuples
        return [(id_to_word[minim[i][1]], minim[i][0]) for i in range(num_results)]
    else:
        raise Exception('{} is not an accepted method parameter'.format(method))
def find_similar_words_by_vector(self, vector: np.ndarray, n: int = 10):
    vocabulary = self._vocabulary
    similar_ids = sorted(range(0, vocabulary.size),
                         key=lambda id: cosine(self._vectors[id], vector))[:n]
    return [vocabulary.to_word(id) for id in similar_ids]
def computeDistance(X, Y, method):
    if method == 'cosine':
        dist = spdistance.cosine(X, Y)
        if dist < 0:
            print('WARNING: distance between X {} and Y {} = {} < 0, method: '
                  '{}'.format(X, Y, dist, method))
    return dist
def runNN(descriptors, labels, parallel, nprocs):
    """ compute nearest neighbor from specific descriptors, given labels """
    distance_method = {"cosine": 'cosine'}
    ret_matrix = None
    for name, method in distance_method.items():
        dist_matrix = computeDistances(descriptors, method, parallel, nprocs)
        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix
    return ret_matrix
def compare_tweet_with_storage(tweet, storage=None, bow=False):
    if storage is None:
        if not os.path.isfile(os.path.join(config.data_folder, config.model_file)):
            raise Exception('Model was not found!')
        else:
            storage = pickle.load(open(os.path.join(config.data_folder, config.model_file), 'rb'))
    print(tweet)
    transformed_tweet = transform_tweet(tweet, bow)
    print([x[0] for x in transformed_tweet],
          [np.sum(y) for y in (x[2] for x in transformed_tweet)])
    scores = {}
    for i, (entity, entity_type, vector_array) in enumerate(transformed_tweet):
        temp_score = 0.0
        for j, (tweetid, item) in enumerate(storage[storage['Entity'] == entity].iterrows()):
            if bow:
                clusterids = np.unique(list(vector_array.keys()) + list(item['Vector array'].keys()))
                vector1 = np.zeros([len(clusterids)])
                vector2 = np.zeros([len(clusterids)])
                for k, cid in enumerate(clusterids):
                    vector1[k] = vector_array.get(cid, 0)
                    vector2[k] = item['Vector array'].get(cid, 0)
                temp_score = np.max([1.0 * np.sum(np.logical_and(vector1, vector2))
                                     / np.min([np.sum(vector1), np.sum(vector2)]),
                                     temp_score])
            else:
                if SPLIT:
                    result = [1 - cosine(vector_array[x], item['Vector array'][x]) for x in range(3)]
                    isnan = np.isnan(result)
                    res = 0.0
                    for v in range(3):
                        if not isnan[v]:
                            res += result[v]
                    res = 1.0 * res / (np.sum(isnan == False) + 10 ** (-10))
                    temp_score = np.max([res, temp_score])
                else:
                    temp_score = np.max([1 - cosine(vector_array, item['Vector array']), temp_score])
                    print(1 - cosine(vector_array, item['Vector array']), entity, tweet, str(tweetid))
        scores.update({entity: temp_score})
    return combine_scores(scores)
def calc_glove_sim(row, embedder, idf_dict):
    '''Calc glove similarities and diff of centers of query/title'''
    a2 = [x for x in remove_punctuation(row['question1']).lower().split() if x in embedder]
    b2 = [x for x in remove_punctuation(row['question2']).lower().split() if x in embedder]
    # if len(a2) > 0 and len(b2) > 0:
    #     glove_sim = embedder.n_similarity(a2, b2)
    # else:
    #     return (-1, -1, np.zeros(300))
    vectorA = np.zeros(300)
    for w in a2:
        if w in idf_dict:
            coef = idf_dict[w]
        else:
            coef = idf_dict['default_idf']
        vectorA += coef * embedder[w]
    vectorA /= len(a2)
    vectorB = np.zeros(300)
    for w in b2:
        if w in idf_dict:
            coef = idf_dict[w]
        else:
            coef = idf_dict['default_idf']
        vectorB += coef * embedder[w]
    vectorB /= len(b2)
    vector_diff = vectorA - vectorB
    glove_sim = cosine(vectorA, vectorB)
    glove_vdiff_dist = np.sqrt(np.sum(vector_diff ** 2))
    return (glove_sim, glove_vdiff_dist, vector_diff)
def cosine_similarity(a, b):
    # Note: despite the name, dis.cosine returns the cosine distance;
    # callers that want a similarity should use 1 - dis.cosine(a, b).
    return dis.cosine(a, b)
def acous_text_eval(m, sess, data, lengths, text_data, text_lengths, matches, config):
    embeddings = []
    now = 0
    while now < len(data):
        embedding = sess.run(m.final_state,
                             {m.input_x1: data[now: now + config.eval_batch_size],
                              m.input_x1_lengths: lengths[now: now + config.eval_batch_size]})
        embeddings.append(embedding)
        now += config.eval_batch_size
    X = np.vstack(embeddings)

    text_embeddings = []
    now = 0
    while now < len(data):
        text_embedding = sess.run(m.word_state,
                                  {m.input_c1: text_data[now: now + config.eval_batch_size],
                                   m.input_c1_lengths: text_lengths[now: now + config.eval_batch_size]})
        text_embeddings.append(text_embedding)
        now += config.eval_batch_size
    Y = np.vstack(text_embeddings)

    distances = []
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            distances.append(cosine(X[i], Y[j]))
    distances = np.asarray(distances)

    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    print("Average precision:", ap)
    print("Precision-recall breakeven:", prb)
    return ap
def sem_clust(self, w2p, simsdict):
    '''
    Baseline SEMCLUST method (dynamic thresholding), based on:

    Marianna Apidianaki, Emilia Verzeni, and Diana McCarthy. Semantic
    Clustering of Pivot Paraphrases. In LREC 2014.

    Builds a graph where nodes are words, and edges connect words that
    have a connection in <w2p>. Weights edges by the values given in
    <simsdict>.

    :param w2p: word -> {paraphrase: score} dictionary, used to decide
        which nodes to connect with edges
    :param simsdict: word -> {paraphrase: score} OR word -> vector,
        used for edge weights
    :return:
    '''
    self.reset_sense_clustering()
    wordlist = list(self.pp_dict.keys())

    oov = [w for w in wordlist if w not in w2p or w not in simsdict]
    if len(oov) > 0:
        sys.stderr.write('WARNING: Paraphrases %s are OOV. '
                         'Removing from ppset.\n' % str(oov))
        wordlist = list(set(wordlist) - set(oov))

    if len(wordlist) == 1:
        self.add_sense_cluster([wordlist[0]])
        return

    # Using cosine similarity of word-paraphrase vectors:
    if type(list(simsdict.values())[0]) != dict:
        similarities = np.array([[1 - cosine(simsdict[i], simsdict[j])
                                  for j in wordlist] for i in wordlist])
    else:
        similarities = np.array([[1 - dict_cosine_dist(simsdict[i], simsdict[j])
                                  for j in wordlist] for i in wordlist])

    gr = sem_clust.toGraph(similarities, wordlist, self.target_word, w2p)

    for c in nx.connected_components(gr):
        self.add_sense_cluster(c)
def dict_cosine_dist(u, v):
    features = list(set(u.keys()) | set(v.keys()))
    features.sort()
    uvec = np.array([u[f] if f in u else 0.0 for f in features])
    vvec = np.array([v[f] if f in v else 0.0 for f in features])
    return cosine(uvec, vvec)
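For illustration, calling the helper above on two sparse {feature: score} maps (values invented here): the dicts are expanded onto a shared feature axis before SciPy's cosine is applied.

# assumes dict_cosine_dist above plus:
import numpy as np
from scipy.spatial.distance import cosine

u = {'cat': 0.9, 'dog': 0.1}
v = {'cat': 0.9, 'fish': 0.1}

# The shared feature 'cat' dominates, so the distance is small.
print(dict_cosine_dist(u, v))  # ~0.012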
def get_similarity(self, w1, w2):
    if w1 not in self.wv or w2 not in self.wv:
        return -0.5
    sim = 1.0 - cos_dist(self.wv[w1], self.wv[w2])
    return sim
def predict(self, seq1, seq2, pred_method='multiply', unigram_probs=None):
    '''right now this function only handles getting the prob for one sequence pair'''
    if self.flat_input:
        if self.embedded_input:
            seq1 = seq1[None]
        else:
            seq1 = get_vector_batch([seq1], vector_length=self.lexicon_size + 1)
    else:
        seq1 = get_seq_batch([seq1], max_length=self.n_timesteps)
    probs = self.model.predict_on_batch(seq1)[0]
    if self.flat_output:
        if unigram_probs is not None:
            probs = probs / unigram_probs ** 0.66
            probs[numpy.isinf(probs)] = 0.0  # replace inf
        seq2 = get_vector_batch([seq2], vector_length=self.lexicon_size + 1)
        # prob = 1 - cosine(seq2, probs)
        probs = probs[seq2[0].astype('bool')]
    else:
        seq2 = get_seq_batch([seq2], padding='post', max_length=self.n_timesteps)
        probs = probs[numpy.arange(self.n_timesteps), seq2]
        probs = probs[seq2 > 0]
    if pred_method == 'multiply':
        prob = numpy.sum(numpy.log(probs))
    elif pred_method == 'mean':
        prob = numpy.mean(numpy.log(probs))
    elif pred_method == 'last':
        prob = numpy.log(probs[-1])
    elif pred_method == 'max':
        prob = numpy.log(numpy.max(probs))
    return prob
def predict(self, seq1, seq2):
    # smooth to avoid NaN
    seq1 = seq1 + 1e-8
    seq2 = seq2 + 1e-8
    score = 1 - cosine(seq1, seq2)
    return score
def get_word2vec_sim(context_seq, gen_seq):
    '''Return the word2vec cosine similarity between the context and each
    generated sequence (where the word2vec representation for a sequence is
    just the average of its word vectors).'''
    word_pairs = get_word_pairs(context_seq, gen_seq)
    if word_pairs:
        pair_scores = [similarity.word2vec(encoder(word1), encoder(word2))
                       for word1, word2 in word_pairs]
    else:
        # no word pairs between context and generated sequence
        # (e.g. the generated sequence might be punctuation only)
        pair_scores = [0]
    word2vec_sim = numpy.mean(pair_scores)
    return word2vec_sim