Python sklearn.metrics.pairwise 模块,linear_kernel() 实例源码

我们从Python开源项目中,提取了以下14个代码示例,用于说明如何使用sklearn.metrics.pairwise.linear_kernel()。

项目:AbTextSumm    作者:StevenLOL    | 项目源码 | 文件源码
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    """Filter out generated sentences that are near-duplicates of the originals.

    :param generatedSentences: iterable of (sentence, score) pairs
    :param originalSentences: list of reference sentences to compare against
    :param stopwords: stop words passed to the TF-IDF vectorizer
    :param threshold: cosine-similarity cutoff (default 0.80); a generated
        sentence at or above it against ANY original is dropped
    :return: list of the surviving (sentence, score) pairs
    """
    docs = [sent for sent, sim in generatedSentences]
    docs.extend(originalSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    simindices = []
    # range() replaces the Python-2-only xrange(); rows [0, len(generated)) are
    # the generated sentences, the tail rows are the originals
    for i in range(len(generatedSentences)):
        # cosine similarity (linear kernel on TF-IDF) of generated sentence i
        # against every original sentence
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            simindices.append(i)

    return [sentence for k, sentence in enumerate(generatedSentences) if k not in simindices]
项目:StrepHit    作者:Wikidata    | 项目源码 | 文件源码
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """ Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.

        :param str verb_token: Surface form of a verb, e.g., *born*
        :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
         used to transform verbs into vectors
        :param tf_idf_matrix: precomputed TF/IDF matrix of the corpus, one row per document
        :return: cosine similarity score
        :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    # lazy %-style logging args: formatting is skipped when DEBUG is disabled
    logger.debug("Corpus-wide TF/IDF scores for '%s': %s", verb_token, scores)
    logger.debug("Average TF/IDF score for '%s': %f", verb_token, average(scores))
    return scores
项目:morpy    作者:iurykrieger96    | 项目源码 | 文件源码
def __init__(self):
        """Load item data, fit a TF-IDF model over item attributes, and
        precompute the pairwise cosine-similarity matrix."""
        t0 = time.time()
        self.item_service = ItemService()
        self.data = pd.DataFrame(list(self.item_service.get_rec_data()))
        self.tfidf = TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            min_df=0,
            smooth_idf=False,
            stop_words='english')
        self.tfidf_matrix = self.tfidf.fit_transform(self.data['concated_attrs'])
        # TF-IDF rows are l2-normalized, so the linear kernel equals cosine similarity
        self.cosine_similarities = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)
        info("Training data ingested in %s seconds." % (time.time() - t0))
项目:vec4ir    作者:lgalke    | 项目源码 | 文件源码
def query(self, query, k=None, indices=None):
        """Rank inferred document vectors by cosine similarity to `query`.

        :param query: raw query string, run through `self.analyzer`
        :param k: number of top results to return (None = all)
        :param indices: optional index subset restricting the candidate documents
        :return: indices of the top-k most similar documents
        """
        if indices is not None:
            dvs = self.inferred_docvecs[indices]
        else:
            dvs = self.inferred_docvecs

        analyzed_query = self.analyzer(query)
        qv = self.model.infer_vector(analyzed_query).reshape(1, -1)
        # l2-normalize the query so the linear kernel below is cosine similarity
        qv = normalize(qv, copy=False)

        dists = linear_kernel(qv, dvs)[0]

        # BUG FIX: `k` was accepted but never forwarded, so the cutoff was
        # silently ignored; sibling query() implementations pass it through.
        ind = argtopk(dists, k)

        return ind
项目:vec4ir    作者:lgalke    | 项目源码 | 文件源码
def query(self, query, k=None, indices=None):
        """Rank the fitted TF-IDF documents by cosine similarity to `query`.

        :param query: raw query string, transformed by the parent vectorizer
        :param k: number of top results to return (None = all)
        :param indices: optional index subset restricting the candidates
        :return: indices of the top-k most similar documents
        :raises NotFittedError: if the model has not been fitted yet
        """
        if self._fit_X is None:
            raise NotFittedError
        query_vec = super().transform([query])
        candidates = self._fit_X if indices is None else self._fit_X[indices]
        # both candidates and query are l2-normalized, so the linear kernel
        # is exactly cosine similarity
        similarities = linear_kernel(query_vec, candidates)
        return argtopk(similarities[0], k)
项目:vec4ir    作者:lgalke    | 项目源码 | 文件源码
def query(self, query, k=None, indices=None):
        """Rank the fitted centroids by cosine similarity to `query`.

        :param query: raw query string, transformed by `self.vect`
        :param k: number of top results to return (None = all)
        :param indices: optional index subset restricting the candidate centroids
        :return: indices of the top-k most similar centroids
        :raises NotFittedError: if centroids have not been computed yet
        """
        centroids = self.centroids
        if centroids is None:
            raise NotFittedError
        if indices is not None:
            centroids = centroids[indices]
        q = self.vect.transform([query])
        q = normalize(q, copy=False)
        D = linear_kernel(q, centroids)  # l2 normalized, so linear kernel
        # argtopk replaces the old manual argsort/reverse/slice (dead code removed)
        ret = argtopk(D[0], k=k)
        return ret
项目:EasyMKL    作者:jmikko    | 项目源码 | 文件源码
def __kernel_definition__(self):
        """Return the kernel callable selected by ``self.Kf``.

        Supported values: 'rbf', 'poly', and 'linear' (or None, treated as
        linear). Any other value falls through and returns None, matching
        the original behavior.
        """
        if self.Kf == 'rbf':
            return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
        if self.Kf == 'poly':
            return lambda X, Y: polynomial_kernel(X, Y, degree=self.poly_deg, gamma=None, coef0=self.poly_coeff)
        # `is None` instead of `== None`: identity check for the singleton (PEP 8)
        if self.Kf is None or self.Kf == 'linear':
            return lambda X, Y: linear_kernel(X, Y)
项目:textcatvis    作者:cod3licious    | 项目源码 | 文件源码
def cluster_texts(textdict, eps=0.45, min_samples=3):
    """
    cluster the given texts

    Input:
        textdict: dictionary with {docid: text}
        eps: DBSCAN neighborhood radius on the cosine-distance matrix
        min_samples: DBSCAN core-point threshold
    Returns:
        doccats: dictionary with {docid: cluster_id}
    """
    doc_ids = list(textdict.keys())
    # transform texts into length normalized kpca features
    ft = FeatureTransform(norm='max', weight=True, renorm='length', norm_num=False)
    docfeats = ft.texts2features(textdict)
    X, featurenames = features2mat(docfeats, doc_ids)
    e_lkpca = KernelPCA(n_components=250, kernel='linear')
    X = e_lkpca.fit_transform(X)
    xnorm = np.linalg.norm(X, axis=1)
    # guard against zero-norm rows: dividing by 0 would fill the row with NaNs
    xnorm[xnorm == 0] = 1.
    X = X / xnorm.reshape(X.shape[0], 1)
    # compute cosine distance; clip tiny float-error negatives so the
    # precomputed distance matrix stays non-negative for DBSCAN
    D = np.maximum(1. - linear_kernel(X), 0.)
    # and cluster with dbscan
    clst = DBSCAN(eps=eps, metric='precomputed', min_samples=min_samples)
    y_pred = clst.fit_predict(D)
    return {did: y_pred[i] for i, did in enumerate(doc_ids)}
项目:prml    作者:Yevgnen    | 项目源码 | 文件源码
def inner(self, x, y):
        """Inner product <x, y> via the linear kernel; inputs coerced to 2-D."""
        x2d, y2d = to2d(x), to2d(y)
        return linear_kernel(x2d, y2d)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_kernel_symmetry():
    # Valid kernels should be symmetric: K(X, X) == K(X, X).T
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel_fn in kernels:
        gram = kernel_fn(X, X)
        assert_array_almost_equal(gram, gram.T, 15)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_kernel_sparse():
    # Each kernel must produce the same Gram matrix for dense and sparse input
    rng = np.random.RandomState(0)
    X_dense = rng.random_sample((5, 4))
    X_sparse = csr_matrix(X_dense)
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel_fn in kernels:
        K_dense = kernel_fn(X_dense, X_dense)
        K_sparse = kernel_fn(X_sparse, X_sparse)
        assert_array_almost_equal(K_dense, K_sparse)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_linear_kernel():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    K = linear_kernel(X, X)
    # the diagonal of the 5x5 Gram matrix (flat stride 6) holds each row's
    # squared Euclidean norm
    expected = [linalg.norm(row) ** 2 for row in X]
    assert_array_almost_equal(K.flat[::6], expected)
项目:ml-email-clustering    作者:anthdm    | 项目源码 | 文件源码
def query(self, keyword, limit):
    """Return indices of the `limit` training emails most similar to `keyword`.

    :param keyword: raw query string, vectorized by `self.vec`
    :param limit: number of top results to return
    :return: array of indices into the training set, most similar first
    """
    vec_keyword = self.vec.transform([keyword])
    cosine_sim = linear_kernel(vec_keyword, self.vec_train).flatten()
    # BUG FIX: the original slice `[:-limit:-1]` stops BEFORE index -limit and
    # therefore returns only limit-1 results; reverse then take `limit`.
    related_email_indices = cosine_sim.argsort()[::-1][:limit]
    print(related_email_indices)
    return related_email_indices
项目:entity2vec    作者:D2KLab    | 项目源码 | 文件源码
def similarity_function(vec1, vec2, similarity):
    """Compute a similarity score between two numeric vectors.

    :param vec1: first vector (any sequence of numbers)
    :param vec2: second vector
    :param similarity: one of 'cosine', 'softmax', 'linear_kernel', 'euclidean'
    :return: similarity score as a float; 0 if either vector is empty
    :raises NameError: if `similarity` is not a recognized function name
    """
    v1 = np.array(vec1)
    v2 = np.array(vec2)

    if len(v1) * len(v2) == 0:  # at least one of the two vectors is empty
        # track how many comparisons were skipped; initialize the module-level
        # counter on first use instead of assuming it was defined elsewhere
        global count
        try:
            count += 1
        except NameError:
            count = 1
        return 0

    if similarity == 'cosine':
        # sklearn pairwise functions require 2-D input, hence the wrapping;
        # the result is a 1x1 array [[sim]]
        return cosine_similarity([v1], [v2])[0][0]

    elif similarity == 'softmax':
        return np.exp(np.dot(v1, v2))  # normalization is useless for relative comparisons

    elif similarity == 'linear_kernel':
        # BUG FIX: wrap the 1-D vectors as 2-D, consistent with the cosine
        # branch; sklearn's linear_kernel rejects 1-D input
        return linear_kernel([v1], [v2])[0][0]

    elif similarity == 'euclidean':
        # BUG FIX: same 2-D wrapping for euclidean_distances
        return euclidean_distances([v1], [v2])[0][0]
    else:
        raise NameError('Choose a valid similarity function')