Python scipy.spatial.distance module: cosine() code examples

We extracted the following 50 code examples from open-source Python projects to illustrate how to use scipy.spatial.distance.cosine().
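
Note one convention up front, since nearly every snippet below relies on it: scipy.spatial.distance.cosine() returns the cosine distance, i.e. 1 minus the cosine similarity, which is why the examples so often compute 1 - cosine(u, v) to recover a similarity. A minimal sketch:

import numpy as np
from scipy.spatial.distance import cosine

u = np.array([1.0, 0.0])
v = np.array([1.0, 1.0])
dist = cosine(u, v)  # cosine distance: 1 - u.v / (|u| |v|), ~0.2929 here
sim = 1 - dist       # cosine similarity, ~0.7071 here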

Project: CS-SMAF    Author: brian-cleary    | project source | file source
def correlations(A,B,pc_n=100):
    p = (1 - distance.correlation(A.flatten(),B.flatten()))
    spear = spearmanr(A.flatten(),B.flatten())
    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i],B[i])
    pg = (np.average(dist_genes[np.isfinite(dist_genes)]))
    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:,i],B[:,i])
    ps = (np.average(dist_sample[np.isfinite(dist_sample)]))
    pc_dist = []
    if pc_n > 0:
        u0,s0,vt0 = np.linalg.svd(A)
        u,s,vt = np.linalg.svd(B)
        for i in range(pc_n):
            pc_dist.append(abs(1 - distance.cosine(u0[:,i],u[:,i])))
        pc_dist = np.array(pc_dist)
    return p,spear[0],pg,ps,pc_dist
Project: BioNLP-2016    Author: cambridgeltl    | project source | file source
def evaluate1Word(wv, reference):
    """Evaluate wv against reference, return (rho, count) where rwo is
    Spearman's rho and count is the number of reference word pairs
    that could be evaluated against.
    """
    count=0
    gold, predicted = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " not in words[0] and " " not in words[1]:
            #print words[0],words[1]
            try:
                v1, v2 = wv[words[0]], wv[words[1]]
            except KeyError:
                count+=1
                continue
            #print words
            gold.append((words, sim))
            predicted.append((words, cosine(v1, v2)))

    simlist = lambda ws: [s for w,s in ws]
    rho, p = spearmanr(simlist(gold), simlist(predicted))
    print "Word not found in WordVector",count
    return (rho, len(gold))
Project: NETL-Automatic-Topic-Labelling-    Author: sb1992    | project source | file source
def get_best_label(label_list,num):
    topic_ls = get_topic_lg(topic_list[num])
    val_dict = {}
    for item in label_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)] #Extracting letter trigram for label
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(topic_ls.keys() + label_cnt.keys()))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic),np.array(listlabel))   # Cosine Similarity
        val_dict[item] = val
    list_sorted=sorted(val_dict.items(), key=lambda x:x[1], reverse = True) # Sorting the labels by rank
    return [i[0] for i in list_sorted[:int(args.num_unsup_labels)]]
Project: KDDCUP2016    Author: hugochan    | project source | file source
def texts_tfidf(ids, important_texts, citations_texts) :
    '''
    Generates tf-idf term lists and frequency matrices for each set of texts; cosine similarity between the vectors is computed separately (see texts_similarity).
    '''

    tfidf = TfidfVectorizer(strip_accents='ascii',
                            stop_words='english',
                            ngram_range=(1,2),
                            min_df=2)

    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()

    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()

    return terms1, terms2, freqs1, freqs2
Project: KDDCUP2016    Author: hugochan    | project source | file source
def texts_similarity(terms1, terms2, freqs1, freqs2) :

    # Merge all terms
    terms = list(set(terms1 + terms2))

    npapers = freqs1.shape[0]
    sims = np.zeros(npapers, np.float)  # zeros, not empty: entries skipped below must not hold garbage

    for i in xrange(npapers) :

        # If one of the vectors is nil, skip it
        if (freqs1[i].sum()==0.0) or (freqs2[i].sum()==0.0) :
            continue

        # Changes representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1.getrow(i).toarray()[0])
        fmap2 = to_dict(terms2, freqs2.getrow(i).toarray()[0])

        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)

        sims[i] = 1.0-cosine(vec1, vec2)

    return sims
Project: KDDCUP2016    Author: hugochan    | project source | file source
def random_similarity(terms1, terms2, freqs1, freqs2) :

    # Merge all terms
    terms = list(set(terms1 + terms2))

    npapers = freqs1.shape[0]
    sims = np.zeros(npapers, np.float)  # zeros, not empty: entries skipped below must not hold garbage

    for i in xrange(npapers) :
        a = random.randint(0,npapers-1)  #@UndefinedVariable
        b = random.randint(0,npapers-1)  #@UndefinedVariable

        # If one of the vectors is nil, skip it
        if (freqs1[a].sum()==0.0) or (freqs2[b].sum()==0.0) :
            continue

        # Changes representation to a {term: freq} map
        fmap1 = to_dict(terms1, freqs1[a].toarray()[0])
        fmap2 = to_dict(terms2, freqs2[b].toarray()[0])

        vec1, vec2 = to_same_dimension(terms, fmap1, fmap2)

        sims[i] = 1.0-cosine(vec1, vec2)

    return sims
Project: ADEM    Author: mike-n-7    | project source | file source
def sanity_check(test_emb, train_emb, num_test):
    '''
    Sanity check on the cosine similarity calculations
    Finds the closest vector in the space by brute force
    '''
    correct_list = []
    for i in xrange(num_test):
        smallest_norm = np.infty
        index = 0
        for j in xrange(len(train_emb)):
            norm = np.linalg.norm(train_emb[j] - test_emb[i])
            if norm < smallest_norm:
                smallest_norm = norm
                index = j
        correct_list.append(index)
    # Pad the list to make it the same length as test_emb
    for i in xrange(len(test_emb) - num_test):
        correct_list.append(-1)
    return correct_list
Project: scanner    Author: cheng6076    | project source | file source
def token_similarity(self, words ,rwords):
        words = set(words)
        rwords = set(rwords)
        word_vec = np.zeros(self.word_dim)
        rword_vec = np.zeros(self.word_dim)
        word_count = 0
        rword_count = 0
        for word in words:
            if self.word_vec.has_key(word) and word not in self.stopwords:
                word_vec += self.word_vec[word]
                word_count += 1
        for word in rwords:
            if self.word_vec.has_key(word):
                rword_vec += self.word_vec[word]
                rword_count += 1
        if word_count > 0:
            word_vec = word_vec / word_count
        if rword_count > 0:
            rword_vec = rword_vec / rword_count
        if word_count>0 and rword_count>0:
            return cosine(word_vec, rword_vec)
        else:
            return 1
Project: ro_sgns    Author: AlexGrinch    | project source | file source
def nearest_words(self, word, top=20, display=False):
        """
        Find the nearest words to the word 
        according to the cosine similarity.
        """

        W = self.W / np.linalg.norm(self.W, axis=0)   
        if (type(word)==str):
            vec = self.word_vector(word, W)
        else:
            vec = word / np.linalg.norm(word)

        cosines = (vec.T).dot(W)
        args = np.argsort(cosines)[::-1]       

        nws = []
        for i in xrange(1, top+1):
            nws.append(self.inv_vocab[args[i]])
            if (display):
                print self.inv_vocab[args[i]], round(cosines[args[i]],3)

        return nws
Project: ro_sgns    Author: AlexGrinch    | project source | file source
def argmax_fun(W, indices, argmax_type='levi'):
    """
    cosine: b* = argmax cosine(b*, b - a + a*) 
    levi: b* = argmax cos(b*,a*)cos(b*,b)/(cos(b*,a)+eps)
    """

    if (argmax_type == 'levi'):
        W = W / np.linalg.norm(W, axis=0)
        words3 = W[:, indices]
        cosines = ((words3.T).dot(W) + 1) / 2
        obj = (cosines[1] * cosines[2]) / (cosines[0] + 1e-3)
        pred_idx = np.argmax(obj)

    elif (argmax_type == 'cosine'):
        words3_vec = W[:, indices].sum(axis=1) - 2*W[:, indices[0]]
        W = W / np.linalg.norm(W, axis=0)
        words3_vec = words3_vec / np.linalg.norm(words3_vec)
        cosines = (words3_vec.T).dot(W)
        pred_idx = np.argmax(cosines)

    return pred_idx
Project: abc    Author: daemon    | project source | file source
def synonyms_by_synset(self, synset_name, topn=3):
    ssid = self.id_table[synset_name]
    doc = self.doc_matrix[ssid]
    found_indices = set([ssid])
    synonyms = []
    for _ in range(topn):
      min_index = 0
      min_val = 10
      for i in range(self.doc_matrix.shape[0]):
        cos_dist = cosine(self.doc_matrix[i], doc)
        if i not in found_indices and cos_dist < min_val:
          min_index = i
          min_val = cos_dist
      found_indices.add(min_index)
      synonyms.append((self.definitions[min_index], min_val))
    return synonyms
Project: cluster_paraphrases    Author: acocos    | project source | file source
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.

    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method =='direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1-sims
    elif method == 'dict_cosine': # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i,{}),scores.get(j,{})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS': # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i,{}),scores.get(j,{}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = scores.values()[0].shape[0]
        sils = np.array([[cosine(scores.get(i,np.zeros(d)), scores.get(j,np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s\n' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
Project: narrative-prediction    Author: roemmele    | project source | file source
def get_sentiment_sim(context_seqs, gen_seqs):
    '''return the cosine similarity between the sentiment scores of each context and corresponding generated sequence;
    the sentiment scores are given in spacy'''
    gen_seqs = check_seqs_format(gen_seqs)
    emotion_types = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD']
    gen_sentiment_sim_scores = []
    for context_seq, gen_seqs_ in zip(context_seqs, gen_seqs):
        context_sentiment = lexicon_methods.emotional_valence(encoder(context_seq))
        context_sentiment = numpy.array([context_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8 #add tiny number to avoid NaN when all scores are 0
        sentiment_sim_scores = []
        for gen_seq in gen_seqs_:
            gen_sentiment = lexicon_methods.emotional_valence(encoder(gen_seq))
            gen_sentiment = numpy.array([gen_sentiment[emotion_type] for emotion_type in emotion_types]) + 1e-8 #add tiny number to avoid NaN when all scores are 0
            sentiment_sim = 1 - cosine(context_sentiment, gen_sentiment)
            sentiment_sim_scores.append(sentiment_sim)
        gen_sentiment_sim_scores.append(sentiment_sim_scores)

    gen_sentiment_sim_scores = numpy.array(gen_sentiment_sim_scores)
    return {'sentiment_sim_scores': gen_sentiment_sim_scores, 'mean_sentiment_sim_scores': numpy.mean(gen_sentiment_sim_scores)}
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when the
        # data has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
Project: answer-triggering    Author: jiez-osu    | project source | file source
def find_similar_words(wordvecs):
    """ Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    from sklearn.metrics import pairwise_distances
    from scipy.spatial.distance import cosine
    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine',
                                              # metric='euclidean',
                                              )

    id2word = {}
    for key, value in wordvecs.word_idx_map.iteritems():
        assert(value not in id2word)
        id2word[value] = key
    while True:
        word = raw_input("Enter a word ('STOP' to quit): ")
        if word == 'STOP': break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print '%s not in the vocabulary.' % word
            continue
        sim_w_id  = pairwise_sim_mat[w_id-1].argsort()[-10:][::-1]
        for i in sim_w_id:
            print id2word[i+1],
        print ''
Project: Abb1t    Author: k-freeman    | project source | file source
def generate_answer(self, msg_text, chat_id):
        minimum_index=[1-(10**(-5)),-1] # min value / minimum index
        if chat_id in self.vectorizer:
            t=self.vectorizer[chat_id].transform([msg_text]).toarray()[0]
        else:
            reply=""
            return
        for i,t2 in enumerate(self.mat[chat_id].toarray()):
            w=cosine(t,t2)
            if abs(w)<=minimum_index[0]:
                if minimum_index[0] == abs(w): # equal weight, lets take the longer message
                    if len(self.speech[chat_id][0][i]) > len(self.speech[chat_id][0][minimum_index[1]]):
                        minimum_index[1] = i
                else: #not equal, take the lower weight
                    minimum_index[0] = w
                    minimum_index[1] = i

        if minimum_index[1]==-1 or minimum_index[0]>0.85: # no message found or score too bad
            return ""

        from_sent_id = self.speech[chat_id][1][minimum_index[1]]
        for i in range(1,5):
            try:
                if from_sent_id != self.speech[chat_id][1][minimum_index[1]+i]:
                    return self.speech[chat_id][0][minimum_index[1]+i]
            except IndexError:
                return ""
        return ""
Project: sota_sentiment    Author: jbarnesspain    | project source | file source
def most_similar(self, word, num_similar=5):
        idx = self._w2idx[word]
        y = list(range(self._matrix.shape[0]))
        y.pop(idx)
        most_similar = [(1,0)] * num_similar
        for i in y:
            dist = 0
            dist = cosine(self._matrix[idx], self._matrix[i])
            if dist < most_similar[-1][0]:
                most_similar.pop()
                most_similar.append((dist,i))
                most_similar = sorted(most_similar)
        most_similar = [(distance, self._idx2w[i]) for (distance, i) in most_similar]
        return most_similar
Project: uci-statnlp    Author: sameersingh    | project source | file source
def all_col_dist(m):
    D = m.shape[1]
    d = np.zeros((D,D))
    for i in xrange(D):
        div = m[:,i]
        for j in xrange(D):
            djv = m[:,j]
            d[j][i] = cosine(div,djv)
    return d
Project: Personal_AI_Assistant    Author: PratylenClub    | project source | file source
def choose_best_action(self, list_of_words):
        min_distance = 3
        best_matching_action = None
        tf_idf_shelve = shelve.open(self.tf_idf_shelve_file_name)
        current_sentence_centroid = self.compute_list_of_words_centroid(list_of_words)
        for action,centroid in tf_idf_shelve[CENTROID].iteritems():
            distance = cosine(centroid,current_sentence_centroid)
            print action,distance
            if distance <= min_distance:
                min_distance = distance
                best_matching_action = action
        tf_idf_shelve.close()
        return current_sentence_centroid, best_matching_action, min_distance
Project: MUSE    Author: MiuLab    | project source | file source
def calAvgSimC(test_score, senseVec1, senseScore1,senseVec2, senseScore2):
  assert(len(senseVec1)==len(senseVec2))
  avgCos = []
  for t in xrange(len(senseVec1)):
    thisCos = []
    p1 = (senseScore1[t])
    p2 = (senseScore2[t])
    for i in xrange(len(senseVec1[t])):
      for j in xrange(len(senseVec2[t])):
        thisCos.append((1-cosine(senseVec1[t][i],senseVec2[t][j]))*p1[i]*p2[j])
    avgCos.append(np.sum(thisCos))
  return spearmanr(test_score, avgCos)[0]
Project: MUSE    Author: MiuLab    | project source | file source
def calMaxSimC(test_score, senseVec1, senseScore1,senseVec2, senseScore2):
  assert(len(senseVec1)==len(senseVec2))
  avgCos = []
  for t in xrange(len(senseVec1)):
    i = np.argmax(senseScore1[t])
    j = np.argmax(senseScore2[t])
    thisCos = (1-cosine(senseVec1[t][i],senseVec2[t][j])) 
    avgCos.append(thisCos)
  return spearmanr(test_score, avgCos)[0]
Project: visually-grounded-speech    Author: gchrupala    | project source | file source
def cosine_similarity(a, b):
    # returns cosine similarity between a and b
    return 1.0-cosine(a, b)
Project: visually-grounded-speech    Author: gchrupala    | project source | file source
def cosine_similarities(a, b, transform):
    """
    returns list of cosine similarities between lists of vectors
    a and b. The z_score transformation is applied if transform == True
    """
    a = numpy.stack(a)
    b = numpy.stack(b)
    #transform if requested
    if transform:
        print "transforming"
        # z_score is written to apply same scale to a and b
        a, b = z_score(a, b)
    print "calculating cosine dists"
    cos = [cosine_similarity(a[i], b[i]) for i in range(len(a))]
    return cos
Project: vec4ir    Author: lgalke    | project source | file source
def delta(u, v):
    """ cosine ° sigmoid
    >>> delta([0.2], [0.3])
    0.5
    >>> delta([0.3], [0.2])
    0.5
    >>> delta([0.1,0.9], [-0.9,0.1]) == delta([-0.9,0.1], [0.1,0.9])
    True
    """
    # TODO scale with a and c
    return expit(cosine(u, v))
Project: NeuralSum    Author: cheng6076    | project source | file source
def reduncy(sen_vec, doc_vec):
        return 1 - cosine(sen_vec, (doc_vec - sen_vec))
Project: NeuralSum    Author: cheng6076    | project source | file source
def relavence(sen_vec, doc_vec): 
        return 1 - cosine(sen_vec, doc_vec)
Project: OSDN    Author: abhijitbendale    | project source | file source
def compute_distance(query_channel, channel, mean_vec, distance_type = 'eucos'):
    """ Compute the specified distance type between chanels of mean vector and query image.
    In caffe library, FC8 layer consists of 10 channels. Here, we compute distance
    of distance of each channel (from query image) with respective channel of
    Mean Activation Vector. In the paper, we considered a hybrid distance eucos which
    combines euclidean and cosine distance for bouding open space. Alternatively,
    other distances such as euclidean or cosine can also be used. 

    Input:
    --------
    query_channel: Particular FC8 channel of query image
    channel: channel number under consideration
    mean_vec: mean activation vector

    Output:
    --------
    query_distance : Distance between respective channels

    """

    if distance_type == 'eucos':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel)/200. + spd.cosine(mean_vec[channel, :], query_channel)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel)/200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_vec[channel, :], query_channel)
    else:
        print "distance type not known: enter either of eucos, euclidean or cosine"
    return query_distance
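
As a quick, self-contained illustration of the eucos relation described in the docstring above (a sketch with made-up data, not code from the project; the array shapes are assumptions), the hybrid distance is simply the scaled euclidean term plus the cosine distance:

import numpy as np
import scipy.spatial.distance as spd

mean_vec = np.random.rand(10, 4096)   # hypothetical Mean Activation Vector: 10 FC8 channels
query_channel = np.random.rand(4096)  # hypothetical query-image channel
channel = 3
eucos = spd.euclidean(mean_vec[channel, :], query_channel) / 200. \
        + spd.cosine(mean_vec[channel, :], query_channel)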
Project: vsmlib    Author: undertherain    | project source | file source
def cmp_vectors(v1, v2):
    # c = cosine(normed(v1), normed(v2))
    # c = cosine(v1, v2)
    c = v1 @ v2
    return c
Project: BioNLP-2016    Author: cambridgeltl    | project source | file source
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib to support together with
        # approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
Project: BioNLP-2016    Author: cambridgeltl    | project source | file source
def make_dist(vectors, options):
    if options.metric != 'cosine':
        return vectors, metrics[options.metric]
    else:
        # normalize once only
        vectors = [v/numpy.linalg.norm(v) for v in vectors]
        return vectors, lambda u, v: 1 - numpy.dot(u, v)
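
The normalize-once shortcut above works because, once every vector has unit length, the cosine distance reduces to 1 minus a plain dot product; a quick check of that identity (a sketch, not project code):

import numpy
from scipy.spatial.distance import cosine

u = numpy.array([3.0, 4.0])
v = numpy.array([1.0, 2.0])
u_hat = u / numpy.linalg.norm(u)
v_hat = v / numpy.linalg.norm(v)
assert abs((1 - numpy.dot(u_hat, v_hat)) - cosine(u, v)) < 1e-12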
Project: BioNLP-2016    Author: cambridgeltl    | project source | file source
def cosine(v1, v2):
    return numpy.dot(v1/numpy.linalg.norm(v1), v2/numpy.linalg.norm(v2))
Project: NETL-Automatic-Topic-Labelling-    Author: sb1992    | project source | file source
def get_lt_ranks(lab_list,num):
    topic_ls = get_topic_lt(topic_list[num])
    val_dict = {}
    val_list =[]
    final_list=[]
    for item in lab_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)] #Letter trigram for candidate label.
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(topic_ls.keys() + label_cnt.keys()))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic),np.array(listlabel)) # Cosine Similarity
        val_list.append((item,val))
    rank_val = [i[1] for i in val_list]
    arr = np.array(rank_val)
    order = arr.argsort()
    ranks = order.argsort()
    for i,elem in enumerate(val_list):
        final_list.append((elem[0],ranks[i],int(num)))

    return final_list

# Generates letter trigram feature
Project: NETL-Automatic-Topic-Labelling-    Author: sb1992    | project source | file source
def get_lt_ranks(lab_list,num):
    topic_ls = get_topic_lt(topic_list[num]) # Will get letter trigram for topic terms.
    val_dict = {}
    val_list =[]
    final_list=[]
    for item in lab_list:
        trigrams = [item[i:i+3] for i in range(0, len(item) - 2)] # get the trigrams for label candidate.
        label_cnt = Counter(trigrams)
        total = sum(label_cnt.values(), 0.0)
        for key in label_cnt:
            label_cnt[key] /= total
        tot_keys = list(set(topic_ls.keys() + label_cnt.keys()))
        listtopic = []
        listlabel = []
        for elem in tot_keys:
            if elem in topic_ls:
                listtopic.append(topic_ls[elem])
            else:
                listtopic.append(0.0)
            if elem in label_cnt:
                listlabel.append(label_cnt[elem])
            else:
                listlabel.append(0.0)
        val = 1 - cosine(np.array(listtopic),np.array(listlabel)) # Cosine similarity.
        val_list.append((item,val))
    rank_val = [i[1] for i in val_list]
    arr = np.array(rank_val)
    order = arr.argsort()
    ranks = order.argsort()
    for i,elem in enumerate(val_list):
        final_list.append((elem[0],ranks[i],int(num)))

    return final_list

# This calls the above method to get letter trigram feature.
Project: GloVe-experiments    Author: brannondorsey    | project source | file source
def find_nearest(skip_words, vec, id_to_word, df, num_results=1, method='cosine'):

    if method == 'cosine':
        minim = [] # min, index
        for i, v in enumerate(df):
            # skip the base word, it's usually the closest
            if id_to_word[i] in skip_words:
                continue
            dist = cosine(vec, v)
            minim.append((dist, i, v))
        minim = sorted(minim, key=lambda v: v[0])
        # return list of (word, cosine distance, vector) tuples
        return [(id_to_word[minim[i][1]], minim[i][0], minim[i][2]) for i in range(num_results)]
    else:
        raise Exception('{} is not an accepted method parameter'.format(method))
Project: GloVe-experiments    Author: brannondorsey    | project source | file source
def turn(gs, word_to_id, id_to_word, df, soft_score):

    gs['turn_number'] += 1
    names = list(gs['players'].keys())
    current_player = names[(gs['turn_number'] % len(names) - 1)]
    while True:
        expr = input('{}, please enter a word expression:\n> '.format(current_player))
        try:
            vec, skip_words = eval_expression(expr, word_to_id, word_to_id, df)
        except Exception as err:
            print(err)
            continue
        break

    answers = {}
    for name in gs['players']:
        while True:
            word = input('{}, please enter your answer: '.format(name))
            if word in word_to_id:
                answers[name] = df[word_to_id[word]]
                break
            else:
                print('{} is not in the dataset, please enter another word.'.format(word))

    answer_word, answer_dist, answer_vec = find_nearest(skip_words, vec, id_to_word, df)[0]
    # transform answers from vectors to distances
    for k, v in answers.items():
        answers[k] = cosine(v, answer_vec)

    winner = min(answers, key=answers.get)

    if not soft_score:
        gs['players'][winner] += 1
    else:
        for name in answers:
            gs['players'][name] += round(answers[name], 2)

    print('Computer says {} = {}'.format(expr, colored(answer_word, 'cyan')))
    print('{} wins this round.'.format(colored(winner, 'green')))
    print_standings(gs)
Project: GloVe-experiments    Author: brannondorsey    | project source | file source
def find_nearest(words, vec, id_to_word, df, num_results, method='cosine'):

    if method == 'cosine':
        minim = [] # min, index
        for i, v in enumerate(df):
            # skip the base word, it's usually the closest
            if id_to_word[i] in words:
                continue
            dist = cosine(vec, v)
            minim.append((dist, i))
        minim = sorted(minim, key=lambda v: v[0])
        # return list of (word, cosine distance) tuples
        return [(id_to_word[minim[i][1]], minim[i][0]) for i in range(num_results)]
    else:
        raise Exception('{} is not an accepted method parameter'.format(method))
Project: workspace    Author: nojima    | project source | file source
def find_similar_words_by_vector(self, vector: np.ndarray, n: int = 10):
        vocabulary = self._vocabulary
        similar_ids = sorted(range(0, vocabulary.size),
                             key=lambda id: cosine(self._vectors[id], vector))[:n]
        return [vocabulary.to_word(id) for id in similar_ids]
Project: wi_wacv14    Author: VChristlein    | project source | file source
def computeDistance(X, Y, method):
    if method == 'cosine':
        dist = spdistance.cosine(X,Y)
    else:
        raise ValueError('unknown distance method: {}'.format(method))

    if dist < 0:
        print ('WARNING: distance between X {} and Y {} = {} < 0, method: '
                         '{}'.format(X, Y, dist, method))

    return dist
Project: wi_wacv14    Author: VChristlein    | project source | file source
def runNN(descriptors, labels, parallel, nprocs):
    """
    compute nearest neighbor from specific descriptors, given labels
    """

    distance_method = { "cosine": 'cosine' }
    ret_matrix = None
    for name, method in distance_method.iteritems():
        dist_matrix = computeDistances(descriptors, method, 
                                           parallel, nprocs)

        computeStats(name, dist_matrix, labels, parallel)
        ret_matrix = dist_matrix

    return ret_matrix
Project: aihackathon    Author: nicoheidtke    | project source | file source
def compare_tweet_with_storage(tweet, storage=None, bow=False):
    if storage is None:
        if not os.path.isfile(os.path.join(config.data_folder, config.model_file)):
            raise IOError('Model was not found!')
        else:
            storage = pickle.load(open(os.path.join(config.data_folder, config.model_file), 'rb'))
    print(tweet)
    transformed_tweet = transform_tweet(tweet, bow)
    print([x[0] for x in transformed_tweet], [np.sum(y) for y in (x[2] for x in transformed_tweet)])
    scores = {}
    for i, (entity, entity_type, vector_array) in enumerate(transformed_tweet):
        temp_score = 0.0
        for j, (tweetid, item) in enumerate(storage[storage['Entity'] == entity].iterrows()):
            if bow:
                clusterids = np.unique([vector_array.keys() + item['Vector array'].keys()])
                vector1 = np.zeros([len(clusterids)])
                vector2 = np.zeros([len(clusterids)])
                for k, cid in enumerate(clusterids):
                    vector1[k] = vector_array.get(cid, 0)
                    vector2[k] = item['Vector array'].get(cid, 0)
                temp_score = np.max([1.0 * np.sum(np.logical_and(vector1, vector2)) / np.min([np.sum(vector1), np.sum(vector2)]), temp_score])
            else:
                if SPLIT:
                    result = [1 - cosine(vector_array[x], item['Vector array'][x]) for x in range(3)]
                    isnan = np.isnan(result)
                    res = 0.0
                    for v in range(3):
                        if not isnan[v]:
                            res+=result[v]
                    res = 1.0 * res/(np.sum(isnan==False)+10**(-10))
                    temp_score = np.max([res, temp_score])
                    # print(entity, entity_type)
                else:
                    temp_score = np.max([1 - cosine(vector_array, item['Vector array']), temp_score])
                    print(1 - cosine(vector_array, item['Vector array']), entity, tweet, str(tweetid))
        scores.update({entity: temp_score})
    return combine_scores(scores)
Project: kaggle-quora-solution-8th    Author: qqgeogor    | project source | file source
def calc_glove_sim(row,embedder,idf_dict):
    '''
    Calc glove similarities and diff of centers of query\title
    '''
    a2 = [x for x in remove_punctuation(row['question1']).lower().split() if x in embedder]
    b2 = [x for x in remove_punctuation(row['question2']).lower().split() if x in embedder]

    # if len(a2)>0 and len(b2)>0:
    #     glove_sim = embedder.n_similarity(a2, b2)
    # else:
    #     return((-1, -1, np.zeros(300)))

    vectorA = np.zeros(300)
    for w in a2:
        if w in idf_dict:
            coef = idf_dict[w]
        else:
            coef = idf_dict['default_idf']
        vectorA += coef*embedder[w]
    vectorA /= len(a2)

    vectorB = np.zeros(300)
    for w in b2:
        if w in idf_dict:
            coef = idf_dict[w]
        else:
            coef = idf_dict['default_idf']
        vectorB += coef*embedder[w]
    vectorB /= len(b2)

    vector_diff = (vectorA - vectorB)
    glove_sim = cosine(vectorA,vectorB)  # note: scipy's cosine() returns a distance (1 - similarity)
    glove_vdiff_dist = np.sqrt(np.sum(vector_diff**2))
    return (glove_sim,glove_vdiff_dist, vector_diff)
Project: sentence_similarity    Author: MorinoseiMorizo    | project source | file source
def cosine_similarity(a, b):
    return dis.cosine(a, b)
Project: Multi-view-neural-acoustic-words-embeddings    Author: opheadacheh    | project source | file source
def acous_text_eval(m, sess, data, lengths, text_data, text_lengths, matches, config):
    embeddings = []
    now = 0
    while now < len(data):
        embedding = sess.run(m.final_state, {m.input_x1: data[now: now + config.eval_batch_size],
                                             m.input_x1_lengths: lengths[now: now + config.eval_batch_size]})
        embeddings.append(embedding)
        now += config.eval_batch_size
    X = np.vstack(embeddings)
    text_embeddings = []
    now = 0
    while now < len(data):
        text_embedding = sess.run(m.word_state, {m.input_c1: text_data[now: now + config.eval_batch_size],
                                                 m.input_c1_lengths: text_lengths[now: now + config.eval_batch_size]})
        text_embeddings.append(text_embedding)
        now += config.eval_batch_size
    Y = np.vstack(text_embeddings)
    distances = []
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            distances.append(cosine(X[i], Y[j]))
    distances = np.asarray(distances)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    print "Average precision:", ap
    print "Precision-recall breakeven:", prb
    return ap
Project: cluster_paraphrases    Author: acocos    | project source | file source
def sem_clust(self, w2p, simsdict):
        ''' Baseline SEMCLUST method (dynamic thresholding), based on:

        Marianna Apidianaki, Emilia Verzeni, and Diana McCarthy. Semantic
        Clustering of Pivot Paraphrases. In LREC 2014.

        Builds a graph where nodes are words, and edges connect words that
        have a connection in <w2p>. Weights edges by the values given in
        <simsdict>.
        :param w2p: word -> {paraphrase: score} dictionary, used to decide which nodes to connect with edges
        :param simsdict: word -> {paraphrase: score} OR word -> vector, used for edge weights
        :return:
        '''
        self.reset_sense_clustering()
        wordlist = self.pp_dict.keys()

        oov = [w for w in wordlist if w not in w2p or w not in simsdict]
        if len(oov) > 0:
            sys.stderr.write('WARNING: Paraphrases %s are OOV. '
                             'Removing from ppset.\n' % str(oov))
            wordlist = list(set(wordlist) - set(oov))

        if len(wordlist) == 1:
            self.add_sense_cluster([wordlist[0]])
            return

        # Using cosine similarity of word-paraphrase vectors:
        if type(simsdict.values()[0]) != dict:
            similarities = np.array([[1-cosine(simsdict[i], simsdict[j])
                                      for j in wordlist] for i in wordlist])
        else:
            similarities = np.array([[(1-dict_cosine_dist(simsdict[i], simsdict[j]))
                                      for j in wordlist] for i in wordlist])

        gr = sem_clust.toGraph(similarities, wordlist, self.target_word, w2p)

        for c in nx.connected_components(gr):
            self.add_sense_cluster(c)
Project: cluster_paraphrases    Author: acocos    | project source | file source
def dict_cosine_dist(u,v):
    features = list(set(u.keys()) | set(v.keys()))
    features.sort()
    uvec = np.array([u[f] if f in u else 0.0 for f in features])
    vvec = np.array([v[f] if f in v else 0.0 for f in features])
    return cosine(uvec,vvec)
Project: MorphForest    Author: j-luo93    | project source | file source
def get_similarity(self, w1, w2):
        if w1 not in self.wv or w2 not in self.wv: return -0.5
        sim = 1.0 - cos_dist(self.wv[w1], self.wv[w2])
        return sim
Project: narrative-prediction    Author: roemmele    | project source | file source
def predict(self, seq1, seq2, pred_method='multiply', unigram_probs=None):

        '''right now this function only handles getting prob for one sequence pair'''
        if self.flat_input:
            if self.embedded_input:
                seq1 = seq1[None]
            else:
                seq1 = get_vector_batch([seq1], vector_length=self.lexicon_size+1)
        else:
            seq1 = get_seq_batch([seq1], max_length=self.n_timesteps)

        probs = self.model.predict_on_batch(seq1)[0]

        if self.flat_output:
            if unigram_probs is not None:
                probs = probs / unigram_probs ** 0.66
                probs[numpy.isinf(probs)] = 0.0 #replace inf
            #import pdb;pdb.set_trace()
            seq2 = get_vector_batch([seq2], vector_length=self.lexicon_size+1)
            #prob = 1 - cosine(seq2, probs)
            probs = probs[seq2[0].astype('bool')]

        else:
            seq2 = get_seq_batch([seq2], padding='post', max_length=self.n_timesteps)

            probs = probs[numpy.arange(self.n_timesteps), seq2]
            probs = probs[seq2 > 0]

        if pred_method == 'multiply':
            prob = numpy.sum(numpy.log(probs))
        elif pred_method == 'mean':
            prob = numpy.mean(numpy.log(probs))
        elif pred_method == 'last':
            prob = numpy.log(probs[-1])
        elif pred_method == 'max':
            prob = numpy.log(numpy.max(probs))
        return prob
Project: narrative-prediction    Author: roemmele    | project source | file source
def predict(self, seq1, seq2):
        seq1 = seq1 + 1e-8
        seq2 = seq2 + 1e-8 #smooth to avoid NaN
        score = 1 - cosine(seq1, seq2)
        return score
Project: narrative-prediction    Author: roemmele    | project source | file source
def get_word2vec_sim(context_seq, gen_seq):
    '''return the word2vec cosine similarity between the context and each generated sequence 
    (where the word2vec representation for a sequence is just the average of its word vectors)'''
    word_pairs = get_word_pairs(context_seq, gen_seq)
    if word_pairs:
        pair_scores = [similarity.word2vec(encoder(word1),encoder(word2)) for word1,word2 in word_pairs]
    else: #no word pairs between context and generated sequences (e.g. generated sequence might be punctuation only)
        pair_scores = [0]
    # assert(len(word_pairs) == len(pair_scores))
    word2vec_sim = numpy.mean(pair_scores)
    return word2vec_sim