The following 14 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.pairwise.linear_kernel().
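For reference: linear_kernel(X, Y) computes the matrix of pairwise dot products between the rows of X and Y. When the rows are l2-normalized (as TfidfVectorizer produces by default with norm='l2'), this is exactly cosine similarity, which is why most of the examples below use it for text matching. A minimal sketch of that equivalence (the toy corpus is illustrative; all snippets on this page assume from sklearn.metrics.pairwise import linear_kernel):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

docs = ["the cat sat", "the dog sat", "cats chase dogs"]   # toy corpus
tfidf = TfidfVectorizer().fit_transform(docs)              # rows l2-normalized by default

K = linear_kernel(tfidf, tfidf)        # pairwise dot products
C = cosine_similarity(tfidf, tfidf)    # pairwise cosine similarities
assert np.allclose(K, C)               # identical, since every row has unit norm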
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    # generatedSentences is a list of (sentence, score) pairs
    docs = []
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)
    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    simindices = []
    for i in range(len(generatedSentences)):
        # similarity of each generated sentence against all original sentences
        simGeneratedScores = linear_kernel(
            normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            simindices.append(i)
    # keep only the generated sentences that are not too similar to an original one
    finalGen = [sentence for k, sentence in enumerate(generatedSentences)
                if k not in simindices]
    return finalGen
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """ Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.

    :param str verb_token: Surface form of a verb, e.g., *born*
    :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer used to transform verbs into vectors
    :return: cosine similarity score
    :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
    logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
    return scores
def __init__(self):
    start = time.time()
    self.item_service = ItemService()
    self.data = pd.DataFrame(list(self.item_service.get_rec_data()))
    self.tfidf = TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 3),
        min_df=0,
        smooth_idf=False,
        stop_words='english')
    self.tfidf_matrix = self.tfidf.fit_transform(self.data['concated_attrs'])
    self.cosine_similarities = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)
    info("Training data ingested in %s seconds." % (time.time() - start))
def query(self, query, k=None, indices=None):
    if indices is not None:
        dvs = self.inferred_docvecs[indices]
    else:
        dvs = self.inferred_docvecs
    analyzed_query = self.analyzer(query)
    qv = self.model.infer_vector(analyzed_query).reshape(1, -1)
    qv = normalize(qv, copy=False)
    # doc vectors and query vector are l2-normalized, so the linear kernel is cosine similarity
    dists = linear_kernel(qv, dvs)[0]
    ind = argtopk(dists, k)  # pass k through, as in the other query() variants
    return ind
def query(self, query, k=None, indices=None):
    if self._fit_X is None:
        raise NotFittedError
    q = super().transform([query])
    if indices is not None:
        fit_X = self._fit_X[indices]
    else:
        fit_X = self._fit_X
    # both fit_X and q are l2-normalized
    D = linear_kernel(q, fit_X)
    ind = argtopk(D[0], k)
    return ind
def query(self, query, k=None, indices=None):
    centroids = self.centroids
    if centroids is None:
        raise NotFittedError
    if indices is not None:
        centroids = centroids[indices]
    q = self.vect.transform([query])
    q = normalize(q, copy=False)
    D = linear_kernel(q, centroids)  # l2-normalized, so linear kernel == cosine similarity
    ret = argtopk(D[0], k=k)  # similarity metric, so take the largest values
    return ret
def __kernel_definition__(self):
    if self.Kf == 'rbf':
        return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
    if self.Kf == 'poly':
        return lambda X, Y: polynomial_kernel(X, Y, degree=self.poly_deg,
                                              gamma=None, coef0=self.poly_coeff)
    if self.Kf is None or self.Kf == 'linear':
        return lambda X, Y: linear_kernel(X, Y)
def cluster_texts(textdict, eps=0.45, min_samples=3):
    """
    cluster the given texts

    Input:
        textdict: dictionary with {docid: text}

    Returns:
        doccats: dictionary with {docid: cluster_id}
    """
    doc_ids = list(textdict.keys())
    # transform texts into length normalized kpca features
    ft = FeatureTransform(norm='max', weight=True, renorm='length', norm_num=False)
    docfeats = ft.texts2features(textdict)
    X, featurenames = features2mat(docfeats, doc_ids)
    e_lkpca = KernelPCA(n_components=250, kernel='linear')
    X = e_lkpca.fit_transform(X)
    xnorm = np.linalg.norm(X, axis=1)
    X = X / xnorm.reshape(X.shape[0], 1)
    # cosine distance = 1 - cosine similarity (rows are l2-normalized)
    D = 1. - linear_kernel(X)
    # and cluster with dbscan
    clst = DBSCAN(eps=eps, metric='precomputed', min_samples=min_samples)
    y_pred = clst.fit_predict(D)
    return {did: y_pred[i] for i, did in enumerate(doc_ids)}
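Because the rows of X are l2-normalized just before this step, linear_kernel(X) here is a cosine-similarity matrix, and 1. - linear_kernel(X) the corresponding cosine-distance matrix that DBSCAN consumes via metric='precomputed'. A minimal sketch of that pattern (random data; eps and min_samples are illustrative, not taken from the project above):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize

X = normalize(np.random.RandomState(0).rand(20, 5))   # l2-normalize the rows
D = np.clip(1.0 - linear_kernel(X), 0.0, None)        # cosine distances; clip tiny negative rounding errors
labels = DBSCAN(eps=0.3, min_samples=2, metric='precomputed').fit_predict(D)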
def inner(self, x, y):
    return linear_kernel(to2d(x), to2d(y))
def test_kernel_symmetry():
    # Valid kernels should be symmetric
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for kernel in (linear_kernel, polynomial_kernel, rbf_kernel,
                   laplacian_kernel, sigmoid_kernel, cosine_similarity):
        K = kernel(X, X)
        assert_array_almost_equal(K, K.T, 15)
def test_kernel_sparse():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    X_sparse = csr_matrix(X)
    for kernel in (linear_kernel, polynomial_kernel, rbf_kernel,
                   laplacian_kernel, sigmoid_kernel, cosine_similarity):
        K = kernel(X, X)
        K2 = kernel(X_sparse, X_sparse)
        assert_array_almost_equal(K, K2)
def test_linear_kernel():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    K = linear_kernel(X, X)
    # the diagonal elements of a linear kernel are the squared norms of the rows;
    # K.flat[::6] walks the diagonal of the 5x5 matrix
    assert_array_almost_equal(K.flat[::6], [linalg.norm(x) ** 2 for x in X])
def query(self, keyword, limit):
    vec_keyword = self.vec.transform([keyword])
    cosine_sim = linear_kernel(vec_keyword, self.vec_train).flatten()
    # best matches first; note the reversed slice yields the top limit - 1 indices
    related_email_indices = cosine_sim.argsort()[:-limit:-1]
    print(related_email_indices)
    return related_email_indices
def similarity_function(vec1, vec2, similarity):
    # compute cosine similarity or another similarity
    v1 = np.array(vec1)
    v2 = np.array(vec2)
    if len(v1) * len(v2) == 0:  # either vector is empty
        global count
        count += 1
        return 0
    else:
        if similarity == 'cosine':
            return cosine_similarity([v1], [v2])[0][0]  # returns a 2-d array [[sim]]
        elif similarity == 'softmax':
            return np.exp(np.dot(v1, v2))  # normalization is useless for relative comparisons
        elif similarity == 'linear_kernel':
            # sklearn's pairwise functions expect 2-d inputs, so wrap the vectors
            return linear_kernel([v1], [v2])[0][0]
        elif similarity == 'euclidean':
            return euclidean_distances([v1], [v2])[0][0]
        else:
            raise NameError('Choose a valid similarity function')