The following 26 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.pairwise_distances().
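As a quick orientation before the extracted examples, the sketch below shows the basic call pattern; the arrays and metric choices are illustrative placeholders, not taken from any of the projects. pairwise_distances(X) returns an (n_samples_X, n_samples_X) distance matrix, while pairwise_distances(X, Y, metric=...) returns an (n_samples_X, n_samples_Y) matrix.

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[1.0, 0.0], [1.0, 1.0], [2.0, 2.0]])  # 3 samples, 2 features
Y = np.array([[0.0, 1.0], [1.0, 0.0]])              # 2 samples, 2 features

# All pairwise distances among rows of X: shape (3, 3), zeros on the diagonal.
D_xx = pairwise_distances(X, metric='euclidean')

# Distances between rows of X and rows of Y: shape (3, 2).
D_xy = pairwise_distances(X, Y, metric='cosine')

print(D_xx.shape, D_xy.shape)  # (3, 3) (3, 2)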
def prune(self, question, paragraphs: List[ExtractedParagraph]):
    if not self.filter_dist_one and len(paragraphs) == 1:
        return paragraphs

    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph

    if self.filter_dist_one:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select]]
def dists(self, question, paragraphs: List[ExtractedParagraph]):
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph

    if self.filter_dist_one:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select]]
def find_distance_matrix(self, vector, metric='cosine'):
    '''
    Compute the distance matrix between topics using cosine or euclidean
    distance (default=cosine distance).
    '''
    if metric == 'cosine':
        distance_matrix = pairwise_distances(vector, metric='cosine')
        # diagonals should be exactly zero, so remove rounding errors
        numpy.fill_diagonal(distance_matrix, 0)
    if metric == 'euclidean':
        distance_matrix = pairwise_distances(vector, metric='euclidean')
    return distance_matrix
def find_similar_words(wordvecs):
    """
    Use loaded word embeddings to find out the most similar words in the
    embedded vector space.
    """
    # Python 2 code: uses print statements, raw_input, and dict.iteritems.
    from sklearn.metrics import pairwise_distances
    from scipy.spatial.distance import cosine

    pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
                                              metric='cosine',
                                              # metric='euclidean',
                                              )
    id2word = {}
    for key, value in wordvecs.word_idx_map.iteritems():
        assert(value not in id2word)
        id2word[value] = key

    while True:
        word = raw_input("Enter a word ('STOP' to quit): ")
        if word == 'STOP':
            break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print '%s not in the vocabulary.' % word
            continue  # skip lookup for out-of-vocabulary words
        sim_w_id = pairwise_sim_mat[w_id-1].argsort()[-10:][::-1]
        for i in sim_w_id:
            print id2word[i+1],
        print ''
def sort_by_tfidf(question, paragraphs):
    tfidf = TfidfVectorizer(strip_accents="unicode",
                            stop_words=spacy.en.STOP_WORDS,
                            decode_error='replace')
    try:
        para_features = tfidf.fit_transform(paragraphs)
        q_features = tfidf.transform([question])
    except ValueError:
        return [(i, 0.0) for i in range(len(paragraphs))]

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort((paragraphs, dists))  # in case of ties, use the earlier paragraph

    return [(i, 1.0 - dists[i]) for i in sorted_ix]
def test_pairwise_distances(X_blobs):
    centers = X_blobs[::100].compute()
    result = dm.pairwise_distances(X_blobs, centers)
    expected = sm.pairwise_distances(X_blobs.compute(), centers)
    assert_eq(result, expected, atol=1e-4)
def pairwise_distances(X, Y, metric='euclidean', n_jobs=None, **kwargs):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y),))
    return X.map_blocks(metrics.pairwise_distances, Y,
                        dtype=float, chunks=chunks,
                        metric=metric, **kwargs)
def transform(self, X):
    """Compute the LLC representation of the provided data.

    Parameters
    ----------
    X : array_like or list
        The local features to aggregate. They must be either nd arrays or
        a list of nd arrays. In case of a list each item is aggregated
        separately.
    """
    # Get the local features and the number of local features per document
    X, lengths = self._reshape_local_features(X)

    # Preprocess the lengths list into indexes in the local feature array
    starts = np.cumsum([0] + lengths).astype(int)
    ends = np.cumsum(lengths).astype(int)

    # Calculate the nearest neighbors
    centroids = self._clusterer.cluster_centers_
    distances = pairwise_distances(X, centroids)
    K = self.neighbors
    neighbors = np.argpartition(distances, K)[:, :K]

    # Compute the llc representation
    llc = np.zeros((len(lengths), self.n_codewords))
    L2 = self.beta * np.eye(X.shape[1])
    for i, (s, e) in enumerate(zip(starts, ends)):
        for j in range(s, e):
            # a = argmin_{1^T a = 1} ||x - Ca||_2^2 + \beta ||a||_2^2
            C = centroids[neighbors[j]]
            a = C.dot(np.linalg.inv(C.T.dot(C) + L2)).dot(X[j])
            llc[i, neighbors[j]] = np.maximum(
                llc[i, neighbors[j]],
                a / a.sum()
            )

    return llc
def computePerformance(self, instances):
    X = instances.features
    labels = instances.true_labels
    # For unsupervised projection methods, the performance is always computed
    # with the labels (not the families).
    if hasattr(self.projection.conf, 'families_supervision'):
        if self.projection.conf.families_supervision:
            labels = instances.true_families
    unique_labels, label_inds = np.unique(labels, return_inverse=True)
    ratio = 0
    for li in xrange(len(unique_labels)):
        Xc = X[label_inds == li]
        Xnc = X[label_inds != li]
        ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
    self.class_separation = ratio / len(unique_labels)
def _compute_score(q, X, metric):
    """Internal method to compute the scores"""
    from .metrics import _scale_cosine_similarity

    dist = pairwise_distances(q, X, 'cosine')
    dist = dist[0]
    scores = 1 - dist
    scores = _scale_cosine_similarity(scores, metric=metric)
    return scores
def draw_features_and_similarity(mm, words_of_interest):
    rows, cols, xlabels = mm.filter_submatrix(words_of_interest, 25)
    ax = plt.subplot(1, 2, 1)
    plot_heat(ax, cols, xlabels, words_of_interest)
    # plot_heat(ax, abs(m), numbered)
    ax = plt.subplot(1, 2, 2)
    t = 1 - pairwise_distances(rows, metric="cosine")
    np.fill_diagonal(t, 0)
    plot_heat(ax, t, words_of_interest, words_of_interest)
    # plt.savefig("m1.pdf")
def score_paragraphs(self, question, paragraphs: List[ExtractedParagraphWithAnswers]):
    tfidf = self._tfidf
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    q_words = {x for x in question if x.lower() not in self._stop}
    q_words_lower = {x.lower() for x in q_words}
    word_matches_features = np.zeros((len(paragraphs), 2))
    for para_ix, para in enumerate(paragraphs):
        found = set()
        found_lower = set()
        for sent in para.text:
            for word in sent:
                if word in q_words:
                    found.add(word)
                elif word.lower() in q_words_lower:
                    found_lower.add(word.lower())
        word_matches_features[para_ix, 0] = len(found)
        word_matches_features[para_ix, 1] = len(found_lower)

    tfidf = pairwise_distances(q_features, para_features, "cosine").ravel()
    starts = np.array([p.start for p in paragraphs])
    log_word_start = np.log(starts / 400.0 + 1)
    first = starts == 0
    scores = tfidf * self.TFIDF_W + self.LOG_WORD_START_W * log_word_start + self.FIRST_W * first + \
        self.LOWER_WORD_W * word_matches_features[:, 1] + self.WORD_W * word_matches_features[:, 0]
    return scores
def rank(self, questions: List[List[str]], paragraphs: List[List[List[str]]]):
    tfidf = self._tfidf
    para_features = tfidf.fit_transform([" ".join(" ".join(s) for s in x) for x in paragraphs])
    q_features = tfidf.transform([" ".join(q) for q in questions])
    scores = pairwise_distances(q_features, para_features, "cosine")
    return scores
def centroid_pairwise_dist(X, centroids):
    return pairwise_distances(X, centroids, metric='euclidean')
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances**2
            heterogeneity += np.sum(squared_distances)

    return heterogeneity
def _compute_radii(self):
    """Generate RBF radii"""
    # use supplied radii if present
    radii = self._get_user_components('radii')

    # compute radii
    if (radii is None):
        centers = self.components_['centers']
        n_centers = centers.shape[0]
        max_dist = np.max(pairwise_distances(centers))
        radii = np.ones(n_centers) * max_dist / sqrt(2.0 * n_centers)

    self.components_['radii'] = radii
def parameter_distance(params, dist_metric='canberra', scale='minmax', return_scaled=False):
    """
    Computes distances between subjects' respective parameter estimates

    Parameters
    ----------
    params : ndarray(shape=(nsubjects, nparams))
        Array of parameter estimates
    dist_metric : str (default='canberra')
        Distance metric to be used. Can take any value acceptable by
        ``sklearn.metrics.pairwise_distances``.
    scale : {'minmax', 'standard', 'none'}
        How to scale the parameters for distance computation
    return_scaled : bool
        Whether to return scaled parameters
    """
    if scale != 'none':
        if scale == 'minmax':
            scaler = MinMaxScaler()
        if scale == 'standard':
            scaler = StandardScaler()

        nparams = np.shape(params)[1]
        for j in range(nparams):
            scaledparam = scaler.fit_transform(params[:, j].reshape(-1, 1))
            params[:, j] = scaledparam.flatten()

    if return_scaled is True:
        D = (pairwise_distances(params, metric=dist_metric), params)
    else:
        D = pairwise_distances(params, metric=dist_metric)

    return D
def construct_k_nearest_matrix(self, dt_matrix, k):
    tmp = np.array(1 - pairwise_distances(dt_matrix[dt_matrix.columns[1:]], metric="cosine"))
    similarity_matrix = pd.DataFrame(tmp,
                                     index=dt_matrix.index.tolist(),
                                     columns=dt_matrix.index.tolist())
    for i in similarity_matrix.index:
        tmp = [int(i), []]
        j = 0
        while j < k:
            max_col = similarity_matrix.loc[i].idxmax(axis=1)
            similarity_matrix.loc[i][max_col] = -1
            if max_col != i:
                tmp[1].append(int(max_col))  # max column name
            j += 1
        self.k_nearest.append(tmp)
def test_precomputed_cross_validation():
    # Ensure array is split correctly
    rng = np.random.RandomState(0)
    X = rng.rand(20, 2)
    D = pairwise_distances(X, metric='euclidean')
    y = rng.randint(3, size=20)
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        metric_score = cross_val_score(Est(), X, y)
        precomp_score = cross_val_score(Est(metric='precomputed'), D, y)
        assert_array_equal(metric_score, precomp_score)
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    dist_array = np.sort(dist_array)  # np.sort returns a copy, so keep the sorted result
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radiusneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric, mode='connectivity',
            include_self=True).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A)

    # Raise error when wrong parameters are supplied
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')

    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)

    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)

    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert(silhouette > 0)
    assert(silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)

    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, eigen_solver="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers),
                      random_state=0, eigen_solver="amg")
def test_spectral_unknown_assign_labels():
    # Test that SpectralClustering fails with an unknown assign_labels set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, assign_labels="<unknown>")
def train_wordfilter_coefficient(self, seed_words, wordfilters):
    mined_words = defaultdict(lambda: defaultdict(lambda: 0))
    filter_set = {wordfilter for (rng, wordfilter) in wordfilters}
    ranges = {rng for (rng, wordfilter) in wordfilters}

    for num_doc, doc in enumerate(Word2vecCorpus(self.corpus_file)):
        len_doc = len(doc)
        for rng in ranges:
            (fb, fe) = rng
            if len_doc < (fe - fb + 1):
                continue
            words = doc[-fb:-fe]
            contexts = []
            for i, word in enumerate(doc):
                if (i + fb < 0) or (i + fe >= len_doc):
                    continue
                contexts.append(tuple([doc[i+r] for r in range(fb, fe+1) if r != 0]))
            for i, context in enumerate(contexts):
                if context in filter_set:
                    mined_words[(rng, context)][words[i]] += 1

    result = []
    seeds_idx = sorted([self.word2index[seed] for seed in seed_words])
    seeds_vec = [self.word2vec_model.syn0[idx] for idx in seeds_idx]

    for ((rng, context), word2freq) in sorted(mined_words.items(), key=lambda x: sum(x[1].values()), reverse=True):
        word_freq = [(self.word2index[word], freq) for (word, freq) in word2freq.items()]
        word_freq = [v for v in word_freq if v[0] != -1]
        word_freq = sorted(word_freq)
        idx = [pair[0] for pair in word_freq]
        word_vec = self.word2vec_model.syn0[idx]
        sum_freq = sum([v[1] for v in word_freq])

        score = 0
        for seed_vec in seeds_vec:
            sim = 1 + -1 * pairwise_distances(word_vec, seed_vec, metric='cosine')
            score += sum([wf[1] * s for wf, s in zip(word_freq, sim)]) / sum_freq
        score /= len(seed_words)
        result.append((context, rng, score, sum_freq))

    return result
def likelihood_distance(loglik_func, data, params, diff_metric='sq',
                        dist_metric='cosine', verbose=False):
    """
    Estimates the likelihood of the data from the i'th subject using the
    parameter estimates of the j'th subject, for all i and j, then computes
    the distance between subjects' likelihood difference vectors

    Parameters
    ----------
    loglik_func : function
        The log-likelihood function to be used
    data : dict
        Data formatted for input into the log-likelihood function
    params : ndarray(shape=(nsubjects, nparams))
        Array of parameter estimates
    diff_metric : {'sq', 'diff', 'abs'}
        Which type of difference measure to compute, 'diff' is simple
        subtractive difference, whereas 'sq' and 'abs' are the squared and
        absolute differences, respectively
    dist_metric : str (default='cosine')
        The pairwise distance metric to use. Any option that can be passed
        into ``sklearn.metrics.pairwise_distances`` can work.
    verbose : bool
        Whether to print out progress

    Returns
    -------
    ndarray(shape=(nsubjects, nsubjects))
    """
    nsubjects = np.shape(params)[0]
    D = np.zeros([nsubjects, nsubjects])

    for i in range(nsubjects):
        S = data[i]['S']
        A = data[i]['A']
        R = data[i]['R']

        if verbose is True:
            print('Likelihood Differences: Subject ' + str(i))

        # Compute loglikelihood for subject i with own data
        LL0 = loglik_func(params=params[i, :], states=S, actions=A, rewards=R)

        for j in range(nsubjects):
            if i != j:
                LL1 = loglik_func(params=params[j, :], states=S, actions=A, rewards=R)
                if diff_metric == 'diff':
                    D[i, j] = LL1 - LL0
                elif diff_metric == 'sq':
                    D[i, j] = (LL1 - LL0)**2
                elif diff_metric == 'abs':
                    D[i, j] = np.abs(LL1 - LL0)

    return pairwise_distances(D, metric=dist_metric)
def test_precomputed(random_state=42):
    """Tests unsupervised NearestNeighbors with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    for method in ['kneighbors']:
        # TODO: also test radius_neighbors, but requires different assertion

        # As a feature matrix (n_samples by n_features)
        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
        nbrs_X.fit(X)
        dist_X, ind_X = getattr(nbrs_X, method)(Y)

        # As a dense distance matrix (n_samples by n_samples)
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check auto works too
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check X=None in prediction
        dist_X, ind_X = getattr(nbrs_X, method)(None)
        dist_D, ind_D = getattr(nbrs_D, method)(None)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Must raise a ValueError if the matrix is not of correct shape
        assert_raises(ValueError, getattr(nbrs_D, method), X)

    target = np.arange(X.shape[0])
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        print(Est)
        est = Est(metric='euclidean')
        est.radius = est.n_neighbors = 1
        pred_X = est.fit(X, target).predict(Y)
        est.metric = 'precomputed'
        pred_D = est.fit(DXX, target).predict(DYX)
        assert_array_almost_equal(pred_X, pred_D)