The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.metrics.pairwise.cosine_similarity().
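Before the project examples, here is a minimal sketch of the basic call (the array names are illustrative): cosine_similarity takes one or two 2-D arrays of shape (n_samples, n_features) and returns the matrix of pairwise cosine similarities.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# illustrative data: 3 and 2 samples with 4 features each
X = np.random.rand(3, 4)
Y = np.random.rand(2, 4)

# similarity of every row of X with every row of Y -> shape (3, 2)
sim_xy = cosine_similarity(X, Y)

# with a single argument, all pairwise similarities within X -> shape (3, 3)
sim_xx = cosine_similarity(X)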
def _train(self, training_frame):
    hashing_vectorizer = HashingVectorizer(analyzer="word", n_features=(2 ** 30), ngram_range=(1, 3), stop_words="english")
    training_hashing_matrix = hashing_vectorizer.fit_transform(training_frame["description"])

    self.log.info("starting kernel")
    start = time()
    cosine_similarities = cosine_similarity(training_hashing_matrix, training_hashing_matrix)
    self.log.info("finished kernel. this took {} s".format(time() - start))

    self.log.info("starting adding to redis database")
    start = time()
    i = 0
    l = len(training_frame.index)
    print_progress(i, l, prefix="Progress:", suffix="Complete", bar_length=50)
    for idx, row in training_frame.iterrows():
        similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
        similar_items = [(cosine_similarities[idx][i], training_frame['id'][i]) for i in similar_indices]

        flattened = sum(similar_items[1:], ())
        self._r.zadd(self.SIMKEY % row['id'], *flattened)

        i += 1
        print_progress(i, l, prefix="Progress:", suffix="Complete", bar_length=50)

    self.log.info("finished adding {} rows to redis database. this took {} s".format(i, time() - start))
def compar_pic(path1, path2):
    global net
    # read the first image
    X = read_image(path1)
    test_num = np.shape(X)[0]
    # forward pass with X as the network input
    out = net.forward_all(data=X)
    # use the fc7 layer output as the feature vector
    feature1 = np.float64(out['fc7'])
    feature1 = np.reshape(feature1, (test_num, 4096))
    # read the second image
    X = read_image(path2)
    # forward pass with X as the network input
    out = net.forward_all(data=X)
    # use the fc7 layer output as the feature vector
    feature2 = np.float64(out['fc7'])
    feature2 = np.reshape(feature2, (test_num, 4096))
    # the cosine similarity of the two feature vectors is the match score
    predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
def compar_pic(path1, path2):
    global net
    # read the first image
    X = read_image(path1)
    test_num = np.shape(X)[0]
    # forward pass with X as the network input
    out = net.forward_all(data=X)
    # use the fc7 layer output as the feature vector
    feature1 = np.float64(out['fc7'])
    feature1 = np.reshape(feature1, (test_num, 4096))
    #np.savetxt('feature1.txt', feature1, delimiter=',')
    # read the second image
    X = read_image(path2)
    # forward pass with X as the network input
    out = net.forward_all(data=X)
    # use the fc7 layer output as the feature vector
    feature2 = np.float64(out['fc7'])
    feature2 = np.reshape(feature2, (test_num, 4096))
    #np.savetxt('feature2.txt', feature2, delimiter=',')
    # the cosine similarity of the two feature vectors is the match score
    predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
def cos_sim(ind1, ind2=1999):
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    #val = []
    MAP = 0
    for i, j in enumerate(view1):
        val = []
        AP = 0
        for x in view2:
            val.append(cosine_similarity(j, x)[0].tolist())
        #val=val[0].tolist()
        #print val[0].tolist()
        val = [(q, p) for p, q in enumerate(val)]
        #print val
        val.sort()
        val.reverse()
        t = [w[1] for w in val[0:7]]
        for x, y in enumerate(t):
            if y in range(i, i + 5):
                AP += 1 / (x + 1)
        print(t)
        print(AP)
        MAP += AP
    print('MAP is : ', MAP / ind1)
def search_query(self, query):
    """
    search for query and find most related document for query
    http://webhome.cs.uvic.ca/~thomo/svd.pdf
    """
    def topN(similarities, N=5):
        return np.argsort(similarities)[::-1][:N]

    words = query.split(" ")
    tokens_ids = []
    for word in words:
        try:
            token_id = self.tokens_mapping[word]
        except KeyError:
            print("Token not found in tokens mapping dict")
        else:
            tokens_ids.append(token_id)

    query_representation = np.mean(self.tokens_representation[tokens_ids, :], axis=0)
    similarities = cosine_similarity(query_representation, self.documents_representation)
    topN_documents = [self.documents_mapping[index] for index in topN(similarities[0])]
    return topN_documents
def emit(id):
    if not id in hctx:
        return (id, {})

    hvector, candidates = v.transform(hctx[id]), Counter()

    for hypernym in hctx[id]:
        hsenses = Counter({hid: sim(v.transform(Counter(synsets[hid])), hvector).item(0)
                           for hid in index[hypernym]})

        for hid, cosine in hsenses.most_common(1):
            if cosine > 0:
                candidates[(hypernym, hid)] = cosine

    matches = [(hypernym, hid, cosine)
               for (hypernym, hid), cosine in candidates.most_common(len(candidates) if args.k == 0 else args.k)
               if hypernym not in synsets[id]]

    return (id, matches)
def generateCosineNeighborGraph(hin, kNeighbors=10, tf_param={'word': True, 'entity': False, 'we_weight': 1}):
    X, newIds, entIds = GraphGenerator.getTFVectorX(hin, param=tf_param)
    cosX = cosine_similarity(X)
    #return sparse.csc_matrix(X.dot(X.transpose())),newIds
    n = cosX.shape[0]
    graph = np.zeros((n, n))
    tic = time.time()
    for i in range(n):
        for j in np.argpartition(-cosX[i], kNeighbors)[:kNeighbors]:
            if j == i:
                continue
            #graph[i, j] += cosX[i, j]
            #graph[j, i] += cosX[i, j]
            graph[i, j] += 1
            graph[j, i] += 1
    toc = time.time() - tic
    return sparse.csc_matrix(graph), newIds
def generateCosineNeighborGraphfromX(X, kNeighbors=10):
    cosX = cosine_similarity(X)
    # return sparse.csc_matrix(X.dot(X.transpose())),newIds
    #print cosX.shape
    n = cosX.shape[0]
    graph = np.zeros((n, n))
    tic = time.time()
    for i in range(n):
        for j in np.argpartition(-cosX[i], kNeighbors)[:kNeighbors]:
            if j == i:
                continue
            # graph[i, j] += cosX[i, j]
            # graph[j, i] += cosX[i, j]
            graph[i, j] += 1
            graph[j, i] += 1
    toc = time.time() - tic
    #print 'graph generation done in %f seconds.' % toc
    return sparse.csc_matrix(graph)
def generate_laplacian_score_scalar(X_ent, X_word, kNeighbors):
    # Generate cosine similarity graph
    n = X_ent.shape[0]
    cosX = cosine_similarity(X_word)
    graph = np.zeros((n, n))
    for i in range(n):
        for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
            if j == i:
                continue
            graph[i, j] = cosX[i, j]
            graph[j, i] = cosX[i, j]

    D = sparse.diags([graph.sum(axis=0)], [0])
    L = D - graph

    f_tilde = X_ent - (float(X_ent.transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones((n, 1))
    score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
    laplacian_score = score
    return laplacian_score
def compar_pic(path1, path2):
    global net
    # read the first image
    X = read_image(path1)
    test_num = np.shape(X)[0]
    # forward pass with X as the network input
    out = net.forward_all(blobs=['pool5'], data=X)
    # print out.keys()
    feature1 = np.float64(out["pool5"])
    feature1 = np.reshape(feature1, (test_num, 25088))
    # read the second image
    X = read_image(path2)
    # forward pass with X as the network input
    out = net.forward_all(blobs=['pool5'], data=X)
    feature2 = np.float64(out['pool5'])
    feature2 = np.reshape(feature2, (test_num, 25088))
    # the cosine similarity of the two feature vectors is the match score
    predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
def compar_pic(path1, path2):
    global net
    # read the first image
    X = read_image(path1)
    test_num = np.shape(X)[0]
    # forward pass with X as the network input
    out = net.forward_all(blobs=['pool5'], data=X)
    # print out.keys()
    feature1 = np.float64(out["pool5"])
    feature1 = np.reshape(feature1, (test_num, 25088))
    # read the second image
    X = read_image(path2)
    # forward pass with X as the network input
    out = net.forward_all(blobs=['pool5'], data=X)
    # use the pool5 layer output as the feature vector
    feature2 = np.float64(out['pool5'])
    feature2 = np.reshape(feature2, (test_num, 25088))
    # the cosine similarity of the two feature vectors is the match score
    predicts = pw.cosine_similarity(feature1, feature2)
    return predicts
def readFace(feature):
    r = redis.Redis("localhost")
    keys = r.keys("*")
    for key in keys:
        db_feature = pickle.loads(r.lindex(key, 0))
        comple = pw.cosine_similarity(db_feature, feature)
        if comple > 0.46:  # similarity above threshold: treat as the same face
            return key
    for key in keys:
        if r.llen(key) > 1:
            db_feature = pickle.loads(r.lindex(key, 1))
            comple = pw.cosine_similarity(db_feature, feature)
            if comple > 0.46:  # similarity above threshold: treat as the same face
                return key
    return 'unknow'  # no stored face matched the query feature
def get_ranked_response(model, test_post_seg, candidate_list, similar_post_dic):
    test_post_seg_vec = get_sentence_vec(model, test_post_seg, candidate_list, similar_post_dic)
    for c in candidate_list:
        c_p_vec = get_sentence_vec(model, c[1], candidate_list, similar_post_dic)
        c_r_vec = get_sentence_vec(model, c[4], candidate_list, similar_post_dic)
        c[2] = c_p_vec
        c[5] = c_r_vec
        s2 = float(cosine_similarity(c_p_vec, c_r_vec))
        s3 = float(cosine_similarity(test_post_seg_vec, c_r_vec))
        c[7] = s2
        c[8] = s3
        # rank_score = 1000*c[6]*c[7]*c[8]
        rank_score = c[6]*0.5 + c[7]*1.5 + c[8]*2
        c[9] = rank_score
    rank_candidate = sorted(candidate_list, key=lambda l: l[-1])
    return rank_candidate
def __init__(self, dataset, save_path_queries=None, **kwargs):
    super(UncertaintyDenseSampling, self).__init__(dataset, **kwargs)
    self.model = kwargs.pop('model', None)
    if self.model is None:
        raise TypeError(
            "__init__() missing required keyword-only argument: 'model'"
        )
    self.save_path_queries = save_path_queries
    self.save_path_queries_hdf5 = os.path.join(self.save_path_queries,
                                               os.path.normpath(self.save_path_queries) + ".hdf5")
    if os.path.isfile(self.save_path_queries_hdf5):
        print "This file already exists %s" % self.save_path_queries_hdf5
        quit(0)
    self.model.train(self.dataset, first_time=True)
    unlabeled_train = self.dataset.get_unlabeled_train_data()["features"]
    print "Computing cosine similarities of", unlabeled_train.shape, "by", unlabeled_train.shape
    self.similarity_matrix = cosine_similarity(unlabeled_train, unlabeled_train)
def disambiguate_word(self, sentence, index):
    super().disambiguate_word(sentence, index)

    lemmas = self.lemmatize(sentence)

    if index not in lemmas:
        return

    svector = self.sensegram(lemmas.values())  # sentence vector

    if svector is None:
        return

    # map synset identifiers to the cosine similarity value
    candidates = Counter({id: sim(svector, self.dense[id]).item(0)
                          for id in self.inventory.index[lemmas[index]]
                          if self.dense[id] is not None})

    if not candidates:
        return

    for id, _ in candidates.most_common(1):
        return id
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
def selecttop(CF, k):
    """
    Finds cosine similarity between SC and Wi and returns index of top features
    """
    NCF = np.zeros((CF.shape[1], CF.shape[1]))
    for i in range(CF.shape[1]):
        for j in range(CF.shape[1]):
            if (CF[i, j] + CF[j, j] - CF[i, j]) != 0:
                NCF[i, j] = CF[i, j] / (CF[i, j] + CF[j, j] - CF[i, j])
            else:
                NCF[i, j] = 0
    SC = np.zeros(CF.shape[1])
    for i in range(CF.shape[1]):
        SC[i] = np.sum(NCF[i, :])
    print(np.isnan(SC).any())
    print(np.isnan(CF).any())
    cosim = cosine_similarity(SC, CF)
    return (-cosim).argsort()[0][:int(k*CF.shape[1])]

#Loading CF matrix for each cluster
def cosine_sim(x, y):
    try:
        d = cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))
        d = d[0][0]
    except:
        d = 0.0
    return d
def setKernel(self, kernel_name, kernel_param):
    self.kernel_name = kernel_name
    if kernel_name == 'rbf':
        def rbf(x1, x2):
            return rbf_kernel(x1, x2, gamma=kernel_param)  # from sklearn
        self.internal_kernel_func = rbf
    else:
        def dot_product(x1, x2):
            return cosine_similarity(x1, x2)  # from sklearn - a normalized version of dot product #np.dot(x1,x2.T)
        self.internal_kernel_func = dot_product
def closest_docs_by_index(corpus_vectors, query_vectors, n_docs):
    docs = []
    sim = pw.cosine_similarity(corpus_vectors, query_vectors)
    order = np.argsort(sim, axis=0)[::-1]
    for i in range(len(query_vectors)):
        docs.append(order[:, i][0:n_docs])
    return np.array(docs)
def compare_pic(self, feature1, feature2):
    predicts = pw.pairwise_distances(feature2, feature1, 'cosine')
    #predicts=pw.cosine_similarity(feature1, feature2)
    return predicts
def cosine(x1, x2):
    #find common ratings
    #new_x1, new_x2 = common(x1, x2)
    #compute the cosine similarity between two vectors
    sum = x1.dot(x2)
    denom = sqrt(x1.dot(x1) * x2.dot(x2))
    try:
        return float(sum) / denom
    except ZeroDivisionError:
        return 0
    #return cosine_similarity(x1,x2)[0][0]
def vec_cos_sim(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break
    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('len of vectors does not match')
                return False
            if operation_string == "=" or operation_string == "==":
                return cosine_similarity(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_similarity(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_similarity(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_similarity(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_similarity(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_similarity(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        sim = euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # get the indices of the `top` most similar rows
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
def find_nearest_word(self, represent, topk: int = 10, stopwords: list = []):
    """
    Find the topk words whose vectors are closest (by cosine similarity) to the
    given word or vector; words listed in stopwords are excluded from the result.
    :param stopwords: words to exclude from the returned neighbours
    :param represent: a word in the vocabulary, or a word vector
    :param topk: number of neighbours to return
    :return:
    """
    array1 = np.empty(200)
    if isinstance(represent, str) and represent in self:
        array1 = self[represent]
        stopwords.append(represent)
    elif isinstance(represent, np.ndarray):
        array1 = represent
    else:
        raise NotImplementedError
    result_cos = cosine_similarity(np.reshape(array1, (1, array1.shape[-1])), self._matrix)
    result_cos = np.reshape(result_cos, result_cos.shape[-1])
    result_sort = result_cos.argsort()[-1 * topk:][::-1]
    # [[self.idx2word[idx],result_cos[idx]] for idx in result_sort]
    # found={}
    # for item in result_sort:
    #     found[self.idx2word[item]]=result[item]
    # sortlist=sorted(found.items(), key=lambda d: d[1],reverse=True)
    #print(found)
    return [[self.idx2word[idx], result_cos[idx]] for idx in result_sort
            if self.idx2word[idx] not in stopwords
            and sum([1 if stop.startswith(self.idx2word[idx]) else 0 for stop in stopwords]) == 0]
    #[item for item in sortlist if sum([len(item[0].replace(stop,''))>=2 for stop in stopwords]) ==0]
def simCalcMatrix(docs):
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)  # finds the tfidf score with normalization
    cosineSimilarities = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    return cosineSimilarities
def generateSimMatrix(phraseList):
    #print 'Num elements', len(phraseList), phraseList
    all_elements = []
    #for elementlist in phraseList:
    for element in phraseList:
        if len(element.strip()) == 0:
            all_elements.append(' ')
        else:
            all_elements.append(element.strip())

    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(all_elements)  # finds the tfidf score with normalization
    cosineSimilarities = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    return cosineSimilarities
def calu_cosin_num(word_q, word_a, model):
    #print 'word_q : ' + str(word_q) + '\n'
    #print 'word_a : ' + str(word_a) + '\n'
    try:
        q_vector = model[word_q.decode('utf-8')]
        a_vector = model[word_a.decode('utf-8')]
    except KeyError:
        return 0
    cosine_similarity_num = cosine_similarity(np.array(q_vector).reshape(1, -1), np.array(a_vector).reshape(1, -1))
    return float(cosine_similarity_num)
def cosine_affinity(X):
    epsilon = 1e-8
    S = cosine_similarity(X)
    S[S > 1] = 1.0  # Rounding problems
    S += 1 + epsilon

    # Sanity checks
    assert(not (S < 0).any())
    assert(not np.isnan(S).any())
    assert(not np.isinf(S).any())

    return S
def decision_function(self, graphs):
    vecs = self.vectorizer.transform(graphs)
    return cosine_similarity(self.reference_vec, vecs)
def search():
    qry = request.args.get('query', '')
    test = np.zeros((tfidf[0].shape))
    keywords = []
    for word in qry.split(' '):
        # validate word
        if len(word) < 2 or word in stop_words:
            continue
        try:
            idx = features.index(word)
            test[0][idx] = 1
        except ValueError, e:
            pass

    cosine_similarities = cosine_similarity(test, tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-100:-1]

    # TOP 100 results
    MAX = 100
    data = []
    related_docs_indices = related_docs_indices[:MAX]
    tag_map = {}  # All tags and their counts

    for img in indices[related_docs_indices]:
        file_path = "/Users/smallya/workspace/Rekognition-personal-searchengine/" + img
        labels = d_index[img]
        word = qry.split(' ')[0]
        data.append(file_path)

    print related_docs_indices
    return json.dumps(data)
def score_topics(source_id, topics_desc_dict):
    token_dict = {}
    indices = {}
    res_dict = {}
    index = 0

    for tid, text in topics_desc_dict.iteritems():
        lowers = text.lower()
        remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
        no_punctuation = lowers.translate(remove_punctuation_map)
        token_dict[tid] = no_punctuation

    for tok in token_dict.keys():
        indices.update({tok: index})
        index += 1

    main_index = indices[source_id]

    # this can take some time
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())

    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    for tok, ind in indices.iteritems():
        if tok == main_index:
            continue
        res_dict.update({tok: res[0][ind]})

    return res_dict
def score_outlinks(main_text, title_list):
    main_title = "current_selected_topic"
    token_dict = {}
    len_titles = {}
    indices = {}
    res_dict = {}
    index = 0

    for title in title_list:
        lowers = title.lower().replace("_", " ").replace("-", " ")
        len_titles.update({title: len(lowers.split(" "))})
        token_dict[title] = lowers

    len_titles[main_title] = 1
    token_dict[main_title] = main_text

    for tok in token_dict.keys():
        indices.update({tok: index})
        index += 1

    main_index = indices[main_title]

    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())

    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    for tok, ind in indices.iteritems():
        if tok == main_title:
            continue
        res_dict.update({tok: (res[0][ind] * 100 / len_titles[tok])})

    return res_dict
def predict(self, X):
    """Predict the class labels for the provided data

    Parameters
    ----------
    X : np.ndarray, shape = [n_samples]

    Returns
    -------
    y : np.array of shape [n_samples]
        Class labels for each data sample.
    """
    if not self.fitted:
        raise NotFittedError("Estimator not fitted, call `fit` before exploiting the model.")

    n_samples = len(X)

    frequencies = np.zeros((n_samples, self.n_all_words_))
    for i in range(n_samples):
        words_unique, words_counts = np.unique(X[i], return_counts=True)
        for j, word in enumerate(self.all_words_):
            if word in words_unique:
                frequencies[i, j] = words_counts[np.where(words_unique == word)[0]]
    self.frequencies_ = frequencies

    y_pred = cosine_similarity(frequencies, self.tf_idf_array_).argmax(axis=1)
    return y_pred
def knowsim_experiment(scope, scope_name, type_list, count, newLabels, tau=1, kNeighbors=10, label_num=5):
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    repeats = 50
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]
    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param, t)

        # make similarity graph
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #
                graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #

        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # add meta-path-based similarity to the knowsim
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print 'running lp'
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}

    ssl = SSLClassifier(knowsim, newLabels, scope, lp_param, repeatTimes=50, trainNumbers=label_num, classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' + str(label_num).zfill(3) + '_', newIds=newIds)
    return ssl.get_mean()
def generate_laplacian_score(X_ent, X_word, kNeighbors):
    # Generate cosine similarity graph
    n = X_ent.shape[0]
    m = X_ent.shape[1]
    cosX = cosine_similarity(X_word)
    graph = np.zeros((n, n))
    t = cosX.sum().sum() / n / n
    for i in range(n):
        for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
            if j == i:
                continue
            # diff = (X_word[i, :] - X_word[j, :]).toarray().flatten()
            # dist = np.exp(np.dot(diff, diff) / t)
            graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #
            graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #

    D = sparse.diags([graph.sum(axis=0)], [0])
    L = D - graph

    laplacian_score = np.zeros(m)
    for i in range(m):
        f_tilde = X_ent[:, i] - (float(X_ent[:, i].transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones((n, 1))
        score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
        laplacian_score[i] = score

    return laplacian_score