Python sklearn.metrics.pairwise module: cosine_similarity() code examples

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.metrics.pairwise.cosine_similarity().
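
Before the project snippets, here is a minimal standalone sketch (not taken from any project below; the array values are made up for illustration). cosine_similarity takes one or two 2-D arrays, dense or sparse, with matching feature dimensions, and returns the matrix of pairwise similarities:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

X = np.array([[1.0, 0.0, 2.0],
              [0.0, 3.0, 4.0]])   # 2 samples, 3 features
Y = np.array([[2.0, 0.0, 4.0]])   # 1 sample, 3 features

S = cosine_similarity(X, Y)       # shape (2, 1); S[0, 0] == 1.0, same direction as row 0
print(S)

# With a single argument the result is the (n_samples, n_samples)
# self-similarity matrix; sparse input is handled the same way.
print(cosine_similarity(csr_matrix(X)).shape)   # (2, 2)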

Project: contentpy    Author: Joklost    | project source | file source
def _train(self, training_frame):
        hashing_vectorizer = HashingVectorizer(analyzer="word", n_features=(2 ** 30),
                                               ngram_range=(1, 3), stop_words="english")
        training_hashing_matrix = hashing_vectorizer.fit_transform(training_frame["description"])

        self.log.info("starting kernel")
        start = time()
        cosine_similarities = cosine_similarity(training_hashing_matrix, training_hashing_matrix)
        self.log.info("finished kernel. this took {} s".format(time() - start))

        self.log.info("starting adding to redis database")
        start = time()
        i = 0
        l = len(training_frame.index)
        print_progress(i, l, prefix="Progress:", suffix="Complete", bar_length=50)
        for idx, row in training_frame.iterrows():
            similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
            similar_items = [(cosine_similarities[idx][i], training_frame['id'][i]) for i in similar_indices]

            flattened = sum(similar_items[1:], ())
            self._r.zadd(self.SIMKEY % row['id'], *flattened)
            i += 1
            print_progress(i, l, prefix="Progress:", suffix="Complete", bar_length=50)
        self.log.info("finished adding {} rows to redis database. this took {} s".format(i, time() - start))
Project: FaceRecognitionProjects    Author: ForrestPi    | project source | file source
def compar_pic(path1,path2):
    global net
    # read the first image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # feed X through the network
    out = net.forward_all(data = X)
    # take the fc7 layer output as the feature vector
    feature1 = np.float64(out['fc7'])
    feature1=np.reshape(feature1,(test_num,4096))
    # read the second image
    X=read_image(path2)
    # feed X through the network
    out = net.forward_all(data=X)
    # take the fc7 layer output as the feature vector
    feature2 = np.float64(out['fc7'])
    feature2=np.reshape(feature2,(test_num,4096))
    # cosine similarity of the two feature vectors; larger means more similar
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
Project: FaceRecognitionProjects    Author: ForrestPi    | project source | file source
def compar_pic(path1,path2):
    global net
    # read the first image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # feed X through the network
    out = net.forward_all(data = X)
    # take the fc7 layer output as the feature vector
    feature1 = np.float64(out['fc7'])
    feature1=np.reshape(feature1,(test_num,4096))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    # read the second image
    X=read_image(path2)
    # feed X through the network
    out = net.forward_all(data=X)
    # take the fc7 layer output as the feature vector
    feature2 = np.float64(out['fc7'])
    feature2=np.reshape(feature2,(test_num,4096))
    #np.savetxt('feature2.txt', feature2, delimiter=',')
    # cosine similarity of the two feature vectors; larger means more similar
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
Project: DeepLearn    Author: GauravBh1010tt    | project source | file source
def cos_sim(ind1,ind2=1999):
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    #val = []
    MAP=0
    for i,j in enumerate(view1):
        val=[]
        AP=0
        for x in view2:            
            val.append(cosine_similarity(j,x)[0].tolist())
        #val=val[0].tolist()
        #print val[0].tolist()
        val=[(q,p)for p,q in enumerate(val)]
        #print val
        val.sort()
        val.reverse()
        t = [w[1]for w in val[0:7]]
        for x,y in enumerate(t):
            if y in range(i,i+5):
                AP+=1/(x+1)
        print(t)
        print(AP)
        MAP+=AP
    print('MAP is : ',MAP/ind1)
Project: nlp-lt    Author: minven    | project source | file source
def search_query(self, query):
        """
        search for query and find most related document for query
        http://webhome.cs.uvic.ca/~thomo/svd.pdf
        """

        def topN(similarities, N=5):
            return np.argsort(similarities)[::-1][:N]

        words = query.split(" ")
        tokens_ids = []
        for word in words:
            try:
                token_id = self.tokens_mapping[word]
            except KeyError:
                print("Token not found in tokens mapping dict")
            else:
                tokens_ids.append(token_id)

        query_representation = np.mean(self.tokens_representation[tokens_ids,:], axis=0)
        similarities = cosine_similarity(query_representation, self.documents_representation)
        topN_documents =[self.documents_mapping[index] for index in topN(similarities[0])] 
        return topN_documents
Project: watlink    Author: dustalov    | project source | file source
def emit(id):
    if not id in hctx:
        return (id, {})

    hvector, candidates = v.transform(hctx[id]), Counter()

    for hypernym in hctx[id]:
        hsenses = Counter({hid: sim(v.transform(Counter(synsets[hid])), hvector).item(0) for hid in index[hypernym]})

        for hid, cosine in hsenses.most_common(1):
            if cosine > 0:
                candidates[(hypernym, hid)] = cosine

    matches = [(hypernym, hid, cosine) for (hypernym, hid), cosine in candidates.most_common(len(candidates) if args.k == 0 else args.k) if hypernym not in synsets[id]]

    return (id, matches)
Project: semihin    Author: HKUST-KnowComp    | project source | file source
def generateCosineNeighborGraph(hin,kNeighbors=10,tf_param={'word':True, 'entity':False, 'we_weight':1}):
        X, newIds, entIds = GraphGenerator.getTFVectorX(hin,param=tf_param)
        cosX = cosine_similarity(X)
        #return sparse.csc_matrix(X.dot(X.transpose())),newIds
        n = cosX.shape[0]
        graph = np.zeros((n,n))
        tic = time.time()
        for i in range(n):
            for j in np.argpartition(-cosX[i],kNeighbors)[:kNeighbors]:
                if j == i:
                    continue
                #graph[i, j] += cosX[i, j]
                #graph[j, i] += cosX[i, j]
                graph[i, j] += 1
                graph[j, i] += 1
        toc = time.time() - tic

        return sparse.csc_matrix(graph), newIds
Project: semihin    Author: HKUST-KnowComp    | project source | file source
def generateCosineNeighborGraphfromX(X, kNeighbors=10):
        cosX = cosine_similarity(X)
        # return sparse.csc_matrix(X.dot(X.transpose())),newIds
        #print cosX.shape
        n = cosX.shape[0]
        graph = np.zeros((n, n))
        tic = time.time()
        for i in range(n):
            for j in np.argpartition(-cosX[i], kNeighbors)[:kNeighbors]:
                if j == i:
                    continue
                # graph[i, j] += cosX[i, j]
                # graph[j, i] += cosX[i, j]
                graph[i, j] += 1
                graph[j, i] += 1
        toc = time.time() - tic
        #print 'graph generation done in %f seconds.' % toc
        return sparse.csc_matrix(graph)
Project: semihin    Author: HKUST-KnowComp    | project source | file source
def generate_laplacian_score_scalar(X_ent, X_word, kNeighbors):
    # Generate cosine similarity graph
    n = X_ent.shape[0]
    cosX = cosine_similarity(X_word)
    graph = np.zeros((n, n))
    for i in range(n):
        for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
            if j == i:
                continue
            graph[i, j] = cosX[i, j]
            graph[j, i] = cosX[i, j]

    D = sparse.diags([graph.sum(axis=0)], [0])
    L = D - graph
    f_tilde = X_ent - (float(X_ent.transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones((n, 1))
    score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
    laplacian_score = score
    return laplacian_score
Project: Face-recognition-test    Author: jiangwei1995910    | project source | file source
def compar_pic(path1,path2):
    global net
    # read the first image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # feed X through the network
    out = net.forward_all(blobs=['pool5'],data = X)
    # print out.keys()

    feature1 = np.float64(out["pool5"])

    feature1=np.reshape(feature1,(test_num,25088))
    # read the second image
    X=read_image(path2)
    # feed X through the network
    out = net.forward_all(blobs=['pool5'],data=X)
    feature2 = np.float64(out['pool5'])
    feature2=np.reshape(feature2,(test_num,25088))
    # cosine similarity of the two feature vectors; larger means more similar
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
Project: Face-recognition-test    Author: jiangwei1995910    | project source | file source
def compar_pic(path1,path2):
    global net
    # read the first image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # feed X through the network
    out = net.forward_all(blobs=['pool5'],data = X)
    # print out.keys()

    feature1 = np.float64(out["pool5"])

    feature1=np.reshape(feature1,(test_num,25088))
    # read the second image
    X=read_image(path2)
    # feed X through the network
    out = net.forward_all(blobs=['pool5'],data=X)
    # take the pool5 layer output as the feature vector
    feature2 = np.float64(out['pool5'])
    feature2=np.reshape(feature2,(test_num,25088))
    # cosine similarity of the two feature vectors; larger means more similar
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
Project: Face-recognition-test    Author: jiangwei1995910    | project source | file source
def readFace(feature):

    r=redis.Redis("localhost")
    keys= r.keys("*")
    for key in keys :
        db_feature =pickle.loads( r.lindex(key,0) )
        comple=pw.cosine_similarity(db_feature,feature)
        if(comple>0.46) :  # similarity threshold: treat as the same person
            return key
    for key in keys :
        if(r.llen(key))>1 :
            db_feature =pickle.loads( r.lindex(key,1) )
            comple=pw.cosine_similarity(db_feature,feature)
            if(comple>0.46) :  # similarity threshold: treat as the same person
                return key
    return 'unknow'
Project: stc_ntcir12_code    Author: luochuwei    | project source | file source
def get_ranked_response(model, test_post_seg, candidate_list, similar_post_dic):
    test_post_seg_vec = get_sentence_vec(model, test_post_seg, candidate_list, similar_post_dic)
    for c in candidate_list:
        c_p_vec = get_sentence_vec(model, c[1], candidate_list, similar_post_dic)
        c_r_vec = get_sentence_vec(model, c[4], candidate_list, similar_post_dic)
        c[2] = c_p_vec
        c[5] = c_r_vec
        s2 = float(cosine_similarity(c_p_vec, c_r_vec))
        s3 = float(cosine_similarity(test_post_seg_vec, c_r_vec))
        c[7] = s2
        c[8] = s3
        # rank_score = 1000*c[6]*c[7]*c[8]
        rank_score = c[6]*0.5+c[7]*1.5+c[8]*2
        c[9] = rank_score
    rank_candidate = sorted(candidate_list, key = lambda l: l[-1])
    return rank_candidate
Project: ActiveBoundary    Author: MiriamHu    | project source | file source
def __init__(self, dataset, save_path_queries=None, **kwargs):
        super(UncertaintyDenseSampling, self).__init__(dataset, **kwargs)

        self.model = kwargs.pop('model', None)
        if self.model is None:
            raise TypeError(
                "__init__() missing required keyword-only argument: 'model'"
            )
        self.save_path_queries = save_path_queries
        self.save_path_queries_hdf5 = os.path.join(self.save_path_queries,
                                                   os.path.normpath(self.save_path_queries) + ".hdf5")
        if os.path.isfile(self.save_path_queries_hdf5):
            print "This file already exists %s" % self.save_path_queries_hdf5
            quit(0)
        self.model.train(self.dataset, first_time=True)
        unlabeled_train = self.dataset.get_unlabeled_train_data()["features"]
        print "Computing cosine similarities of", unlabeled_train.shape, "by", unlabeled_train.shape
        self.similarity_matrix = cosine_similarity(unlabeled_train, unlabeled_train)
Project: mnogoznal    Author: nlpub    | project source | file source
def disambiguate_word(self, sentence, index):
        super().disambiguate_word(sentence, index)

        lemmas = self.lemmatize(sentence)

        if index not in lemmas:
            return

        svector = self.sensegram(lemmas.values()) # sentence vector

        if svector is None:
            return

        # map synset identifiers to the cosine similarity value
        candidates = Counter({id: sim(svector, self.dense[id]).item(0)
                              for id in self.inventory.index[lemmas[index]]
                              if self.dense[id] is not None})

        if not candidates:
            return

        for id, _ in candidates.most_common(1):
            return id
Project: VGG_Face_Caffe_Model    Author: PatienceKai    | project source | file source
def compar_pic(path1,path2):
    global net
    # read the first image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # feed X through the network
    out = net.forward_all(data = X)
    # take the fc7 layer output as the feature vector
    feature1 = np.float64(out['fc7'])
    feature1=np.reshape(feature1,(test_num,4096))
    # read the second image
    X=read_image(path2)
    # feed X through the network
    out = net.forward_all(data=X)
    # take the fc7 layer output as the feature vector
    feature2 = np.float64(out['fc7'])
    feature2=np.reshape(feature2,(test_num,4096))
    # cosine similarity of the two feature vectors; larger means more similar
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
Project: VGG_Face_Caffe_Model    Author: PatienceKai    | project source | file source
def compar_pic(path1,path2):
    global net
    # read the first image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # feed X through the network
    out = net.forward_all(data = X)
    # take the fc7 layer output as the feature vector
    feature1 = np.float64(out['fc7'])
    feature1=np.reshape(feature1,(test_num,4096))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    # read the second image
    X=read_image(path2)
    # feed X through the network
    out = net.forward_all(data=X)
    # take the fc7 layer output as the feature vector
    feature2 = np.float64(out['fc7'])
    feature2=np.reshape(feature2,(test_num,4096))
    #np.savetxt('feature2.txt', feature2, delimiter=',')
    # cosine similarity of the two feature vectors; larger means more similar
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
Project: FCM-Feature-Selection    Author: achyudhk    | project source | file source
def selecttop(CF, k):
    """
        Finds cosine similarity between SC and Wi and returns index of top features
    """
    NCF = np.zeros((CF.shape[1],CF.shape[1]))
    for i in range(CF.shape[1]):
        for j in range(CF.shape[1]):
            if (CF[i,j]+CF[j,j]-CF[i,j]) !=0:
                NCF[i,j]=CF[i,j]/(CF[i,j]+CF[j,j]-CF[i,j])
            else:
                NCF[i,j]=0

    SC = np.zeros(CF.shape[1])
    for i in range(CF.shape[1]):
        SC[i] = np.sum(NCF[i,:])

    print(np.isnan(SC).any())
    print(np.isnan(CF).any())
    cosim = cosine_similarity(SC,CF)
    return (-cosim).argsort()[0][:int(k*CF.shape[1])]

#Loading CF matrix for each cluster
Project: search_relevance    Author: rmanak    | project source | file source
def cosine_sim(x, y):
    try:
        d = cosine_similarity(x.reshape(1,-1), y.reshape(1,-1))
        d = d[0][0]
    except:
        d = 0.0
    return d
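
A note on the reshape(1, -1) pattern above: scikit-learn's pairwise functions expect 2-D inputs, so a lone feature vector must be wrapped as a one-row matrix, and the scalar similarity is then read out of the resulting 1x1 array. A minimal sketch with made-up vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

x = np.array([1.0, 2.0, 3.0])
y = np.array([3.0, 2.0, 1.0])
# wrap each 1-D vector as a (1, n_features) matrix, then unwrap the 1x1 result
sim = cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0][0]
print(sim)   # a single float in [-1, 1]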
Project: PersonalizedMultitaskLearning    Author: mitmedialab    | project source | file source
def setKernel(self, kernel_name, kernel_param):
        self.kernel_name = kernel_name
        if kernel_name == 'rbf':
            def rbf(x1,x2):
                return rbf_kernel(x1,x2, gamma=kernel_param) # from sklearn

            self.internal_kernel_func = rbf
        else:
            def dot_product(x1,x2):
                return cosine_similarity(x1,x2) # from sklearn - a normalized version of dot product #np.dot(x1,x2.T)
            self.internal_kernel_func = dot_product
Project: docnade    Author: AYLIEN    | project source | file source
def closest_docs_by_index(corpus_vectors, query_vectors, n_docs):
    docs = []
    sim = pw.cosine_similarity(corpus_vectors, query_vectors)
    order = np.argsort(sim, axis=0)[::-1]
    for i in range(len(query_vectors)):
        docs.append(order[:, i][0:n_docs])
    return np.array(docs)
Project: image-classifier    Author: gustavkkk    | project source | file source
def compare_pic(self,feature1,feature2):
    predicts=pw.pairwise_distances(feature2, feature1,'cosine')
    #predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts
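
The compare_pic method above uses pw.pairwise_distances with the 'cosine' metric instead of cosine_similarity; for this metric the two are related by distance = 1 - similarity. A standalone check on random data (my sketch, not from the project):

import numpy as np
from sklearn.metrics import pairwise as pw

rng = np.random.RandomState(0)
feature1 = rng.rand(2, 8)   # two 8-dimensional feature vectors
feature2 = rng.rand(3, 8)   # three 8-dimensional feature vectors

dist = pw.pairwise_distances(feature2, feature1, metric='cosine')   # shape (3, 2)
sim = pw.cosine_similarity(feature2, feature1)                      # shape (3, 2)
assert np.allclose(dist, 1.0 - sim)   # cosine distance = 1 - cosine similarity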
Project: RecQ    Author: Coder-Yu    | project source | file source
def cosine(x1,x2):
    #find common ratings
    #new_x1, new_x2 = common(x1,x2)
    #compute the cosine similarity between two vectors
    sum = x1.dot(x2)
    denom = sqrt(x1.dot(x1)*x2.dot(x2))
    try:
        return float(sum)/denom
    except ZeroDivisionError:
        return 0

    #return cosine_similarity(x1,x2)[0][0]
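
The hand-rolled formula above follows the definition of cosine similarity directly; as a quick standalone sanity check (made-up vectors), it agrees with scikit-learn's implementation once the 1-D vectors are reshaped to the 2-D input that current scikit-learn requires:

import numpy as np
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity

x1 = np.array([1.0, 2.0, 0.0])
x2 = np.array([2.0, 1.0, 1.0])

manual = float(x1.dot(x2)) / sqrt(x1.dot(x1) * x2.dot(x2))
via_sklearn = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0][0]
assert abs(manual - via_sklearn) < 1e-12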
Project: tokenquery    Author: ramtinms    | project source | file source
def vec_cos_sim(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print ('len of vectors does not match')
                return False
            if operation_string == "=" or operation_string == "==":
                return cosine_similarity(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_similarity(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_similarity(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_similarity(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_similarity(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_similarity(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False

    else:
        # TODO raise tokenregex error
        print ('Problem with the operation input')
Project: adversarial-document-model    Author: AYLIEN    | project source | file source
def closest_docs_by_index(corpus_vectors, query_vectors, n_docs):
    docs = []
    sim = pw.cosine_similarity(corpus_vectors, query_vectors)
    order = np.argsort(sim, axis=0)[::-1]
    for i in range(len(query_vectors)):
        docs.append(order[:, i][0:n_docs])
    return np.array(docs)
Project: ref-extract    Author: brandonrobertz    | project source | file source
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        sim = euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # get the indices of the top matches
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
Project: bot2017Fin    Author: AllanYiin    | project source | file source
def find_nearest_word(self,represent,  topk:int=10,stopwords:list=[]):
        """
        Find the words nearest to a given representation (a vocabulary word or an embedding vector).
        :param stopwords: words to exclude from the results
        :param represent: a vocabulary word, or an embedding vector
        :param topk: number of nearest words to return
        :return: a list of [word, cosine similarity] pairs
        """
        array1=np.empty(200)
        if isinstance(represent,str) and represent in self:
            array1=self[represent]
            stopwords.append(represent)
        elif isinstance(represent,np.ndarray) :
            array1=represent
        else:
            raise NotImplementedError
        result_cos=cosine_similarity(np.reshape(array1,(1,array1.shape[-1])),self._matrix)
        result_cos=np.reshape(result_cos,result_cos.shape[-1])
        result_sort=result_cos.argsort()[-1*topk:][::-1]
        # [[self.idx2word[idx],result_cos[idx]] for idx in result_sort]
        # found={}
        # for item in  result_sort:
        #   found[self.idx2word[item]]=result[item]
        # sortlist=sorted(found.items(), key=lambda d: d[1],reverse=True)
        #print(found)
        return [[self.idx2word[idx],result_cos[idx]] for idx in result_sort if self.idx2word[idx] not in stopwords and sum([ 1 if stop.startswith(self.idx2word[idx]) else 0 for stop in  stopwords])==0 ] #[item for item in sortlist if sum([len(item[0].replace(stop,''))>=2 for stop in stopwords]) ==0]
Project: AbTextSumm    Author: StevenLOL    | project source | file source
def simCalcMatrix(docs):
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)  #finds the tfidf score with normalization
    cosineSimilarities=cosine_similarity(tfidf_matrix_train, tfidf_matrix_train) 
    return cosineSimilarities
Project: AbTextSumm    Author: StevenLOL    | project source | file source
def generateSimMatrix(phraseList):
    #print 'Num elements', len(phraseList), phraseList
    all_elements=[]
    #for elementlist in phraseList:
    for element in phraseList:
        if len(element.strip())==0:
            all_elements.append(' ')
        else:
            all_elements.append(element.strip())
    tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None)
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(all_elements)  #finds the tfidf score with normalization
    cosineSimilarities=cosine_similarity(tfidf_matrix_train, tfidf_matrix_train) 
    return cosineSimilarities
Project: CCIR    Author: xiaogang00    | project source | file source
def calu_cosin_num(word_q,word_a,model):
    #print 'word_q : ' + str(word_q) + '\n'
    #print 'word_a : ' + str(word_a) + '\n'
    try:
        q_vector = model[word_q.decode('utf-8')]
        a_vector = model[word_a.decode('utf-8')]
    except KeyError:
        return 0
    cosine_similarity_num = cosine_similarity(np.array(q_vector).reshape(1,-1),np.array(a_vector).reshape(1,-1))
    return float(cosine_similarity_num)
Project: word2vec_pipeline    Author: NIHOPA    | project source | file source
def cosine_affinity(X):
    epsilon = 1e-8
    S = cosine_similarity(X)
    S[S > 1] = 1.0  # Rounding problems
    S += 1 + epsilon

    # Sanity checks
    assert(not (S < 0).any())
    assert(not np.isnan(S).any())
    assert(not np.isinf(S).any())

    return S
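
cosine_affinity shifts every similarity into a strictly positive range. One plausible consumer (an assumption on my part; the project's downstream use is not shown here) is a clustering estimator that accepts a precomputed affinity matrix:

import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
X = rng.rand(6, 4)             # made-up data
S = cosine_similarity(X)
S[S > 1] = 1.0                 # mirrors the rounding guard in cosine_affinity
S += 1 + 1e-8                  # shift into a strictly positive range

# affinity='precomputed' tells AffinityPropagation to treat S as similarities
labels = AffinityPropagation(affinity='precomputed').fit_predict(S)
print(labels)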
Project: GraphLearn    Author: smautner    | project source | file source
def decision_function(self, graphs):
        vecs = self.vectorizer.transform(graphs)
        return cosine_similarity(self.reference_vec, vecs)
Project: rekognition-image-search-engine    Author: awslabs    | project source | file source
def search():
    qry = request.args.get('query', '')
    test = np.zeros((tfidf[0].shape))

    keywords = []

    for word in qry.split(' '):
        # validate word
        if len(word) <2 or word in stop_words:
            continue 
        try:
            idx = features.index(word)
            test[0][idx] = 1
        except ValueError, e:
            pass

    cosine_similarities = cosine_similarity(test, tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-100:-1] # TOP 100 results

    MAX = 100 
    data = []
    related_docs_indices = related_docs_indices[:MAX]
    tag_map = {} # All tags and their counts

    for img in indices[related_docs_indices]:
        file_path = "/Users/smallya/workspace/Rekognition-personal-searchengine/" + img
        labels = d_index[img]
        word = qry.split(' ')[0]
        data.append(file_path)

    print related_docs_indices
    return json.dumps(data)
Project: Diggly-Back-End    Author: WikiDiggly    | project source | file source
def score_topics(source_id, topics_desc_dict):
    token_dict = {}
    indices = {}
    res_dict = {}
    index = 0

    for tid, text in topics_desc_dict.iteritems():
        lowers = text.lower()
        remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
        no_punctuation = lowers.translate(remove_punctuation_map)
        token_dict[tid] = no_punctuation

    for tok in token_dict.keys():
        indices.update({tok: index})
        index += 1

    main_index = indices[source_id]

    # this can take some time
    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    for tok, ind in indices.iteritems():
        if tok == source_id:
            continue;
        res_dict.update({tok: res[0][ind]})

    return res_dict
Project: Diggly-Back-End    Author: WikiDiggly    | project source | file source
def score_outlinks(main_text, title_list):
    main_title = "current_selected_topic"
    token_dict = {}
    len_titles = {}
    indices = {}
    res_dict = {}
    index = 0

    for title in title_list:
        lowers = title.lower().replace("_", " ").replace("-", " ")
        len_titles.update({title: len(lowers.split(" "))})
        token_dict[title] = lowers

    len_titles[main_title] = 1
    token_dict[main_title] = main_text

    for tok in token_dict.keys():
        indices.update({tok: index})
        index += 1

    main_index = indices[main_title]

    tf_idf = TfidfVectorizer(tokenizer=text_proc.tokenize, stop_words='english')
    tfidf_matrix = tf_idf.fit_transform(token_dict.values())
    res = cosine_similarity(tfidf_matrix[main_index], tfidf_matrix)

    for tok, ind in indices.iteritems():
        if tok == main_title:
            continue;
        res_dict.update({tok: (res[0][ind] * 100 / len_titles[tok]) })

    return res_dict
Project: pyts    Author: johannfaouzi    | project source | file source
def predict(self, X):
        """Predict the class labels for the provided data

        Parameters
        ----------
        X : np.ndarray, shape = [n_samples]

        Returns
        -------
        y : np.array of shape [n_samples]
            Class labels for each data sample.
        """

        if not self.fitted:
            raise NotFittedError("Estimator not fitted, call `fit` before exploiting the model.")

        n_samples = len(X)
        frequencies = np.zeros((n_samples, self.n_all_words_))
        for i in range(n_samples):
            words_unique, words_counts = np.unique(X[i], return_counts=True)
            for j, word in enumerate(self.all_words_):
                if word in words_unique:
                    frequencies[i, j] = words_counts[np.where(words_unique == word)[0]]

        self.frequencies_ = frequencies

        y_pred = cosine_similarity(frequencies, self.tf_idf_array_).argmax(axis=1)

        return y_pred
Project: semihin    Author: HKUST-KnowComp    | project source | file source
def knowsim_experiment(scope, scope_name, type_list, count, newLabels, tau=1, kNeighbors=10, label_num = 5):
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    repeats = 50
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]

    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param, t)

        # make similarity graph
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #
                graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #

        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # add meta-path-based similarity to the knowsim
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print 'running lp'
    lp_param = {'alpha':0.98, 'normalization_factor':5}

    ssl = SSLClassifier(knowsim, newLabels, scope, lp_param, repeatTimes=50, trainNumbers=label_num, classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' + str(label_num).zfill(3) + '_', newIds=newIds)
    return ssl.get_mean()
Project: semihin    Author: HKUST-KnowComp    | project source | file source
def generate_laplacian_score(X_ent, X_word, kNeighbors):
    # Generate cosine similarity graph
    n = X_ent.shape[0]
    m = X_ent.shape[1]
    cosX = cosine_similarity(X_word)
    graph = np.zeros((n, n))
    t = cosX.sum().sum() / n/n
    for i in range(n):
        for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
            if j == i:
                continue
            # diff = (X_word[i, :] - X_word[j, :]).toarray().flatten()

            # dist = np.exp(np.dot(diff, diff) / t)
            graph[i, j] = cosX[i, j] #np.exp(- (1 - cosX[i, j]) / 0.03) #
            graph[j, i] = cosX[i, j] #np.exp(- (1 - cosX[i, j]) / 0.03) #

    D = sparse.diags([graph.sum(axis=0)], [0])
    L = D - graph

    laplacian_score = np.zeros(m)
    for i in range(m):
        f_tilde = X_ent[:, i] - (float(X_ent[:, i].transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones(
            (n, 1))
        score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
        laplacian_score[i] = score


    return (laplacian_score)