Python sklearn.metrics.pairwise 模块，pairwise_distances() 实例源码

我们从 Python 开源项目中提取了以下 50 个代码示例，用于说明如何使用 sklearn.metrics.pairwise.pairwise_distances()。

项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def decision_function(self, X):
        """Return, for every row of ``X``, the distance to its closest centroid.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Test vectors. They are validated with ``check_array`` (CSR sparse
            input is accepted).

        Returns
        -------
        C : array, shape = [n_samples]
            Row-wise minimum of the pairwise distances between ``X`` and
            ``self.centroids_``, computed with ``self.metric``.
        """
        from sklearn.metrics.pairwise import pairwise_distances
        from sklearn.utils.validation import check_array, check_is_fitted

        # Refuse to run on an estimator that has no centroids yet.
        check_is_fitted(self, 'centroids_')
        samples = check_array(X, accept_sparse='csr')

        dist_to_centroids = pairwise_distances(samples, self.centroids_,
                                               metric=self.metric)
        return dist_to_centroids.min(axis=1)
项目:sef    作者:passalis    | 项目源码 | 文件源码
def test_similarity_calculations():
    """
    Check ``fast_heat_similarity_matrix`` against a NumPy/scikit-learn reference.

    For several values of sigma a random 10 x 23 matrix is drawn, the reference
    similarity ``exp(-d**2 / sigma**2)`` is built from ``pairwise_distances``
    and the sum of squared differences to the fast implementation must stay
    below 1e-3.
    :return:
    """
    np.random.seed(1)

    # Create random data vectors
    for sigma in [0.01, 0.1, 0.5, 1]:
        A = np.random.randn(10, 23)
        sef_sim = fast_heat_similarity_matrix(A, sigma)

        assert sef_sim.shape[0] == 10
        assert sef_sim.shape[1] == 10

        sim = np.exp(-pairwise_distances(A, A)**2/sigma**2)
        # Use the squared error. The previous ``* 2`` merely doubled the signed
        # difference, so positive and negative errors cancelled (or the sum went
        # negative) and the assertion could pass for a wrong result.
        assert np.sum((sef_sim-sim)**2) < 1e-3
项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def test_cosine2jaccard():
    """Round-trip check of the cosine <-> Jaccard similarity conversions.

    The Jaccard similarity derived from the cosine similarity of two binary
    vectors must match the one computed directly with the 'jaccard' metric,
    and converting it back must recover the cosine similarity.
    """
    from sklearn.metrics.pairwise import pairwise_distances
    from freediscovery.metrics import (cosine2jaccard_similarity,
                                       jaccard2cosine_similarity)

    x = np.array([[0, 0, 1., 1.]])
    y = np.array([[0, 1., 1., 0]])

    cos_sim = 1 - pairwise_distances(x, y, metric='cosine')
    # Reference value: Jaccard computed on the boolean version of the vectors.
    jac_ref = 1 - pairwise_distances(x.astype('bool'), y.astype('bool'), metric='jaccard')

    jac_sim = cosine2jaccard_similarity(cos_sim)
    assert_allclose(jac_sim, jac_ref)

    cos_back = jaccard2cosine_similarity(jac_sim)
    assert_allclose(cos_back, cos_sim)
项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def centroid_similarity(X, internal_ids, nn_metric='cosine'):
    """ Compute a cluster centroid and the similarity of each member to it.

    Parameters
    ----------
    X : 2D indexable matrix
      the rows selected by ``internal_ids`` form the cluster
    internal_ids : list
      a list of internal ids (row indices into ``X``)
    nn_metric : str
      forwarded to ``_scale_cosine_similarity`` to rescale the cosine
      similarity if needed

    Returns
    -------
    (float, array)
      the mean rescaled similarity to the centroid, and the first column of
      the rescaled similarity matrix (one value per selected row)
    """
    from ..metrics import _scale_cosine_similarity
    from sklearn.metrics.pairwise import pairwise_distances

    members = X[internal_ids, :]
    center = members.mean(axis=0)

    # pairwise_distances needs a 2D input; promote a flat mean to one row.
    if center.ndim == 1:
        center = center[None, :]

    cos_sim = 1 - pairwise_distances(members, center, metric='cosine')
    scaled = _scale_cosine_similarity(cos_sim, metric=nn_metric)
    return float(np.mean(scaled)), scaled[:, 0]
项目:newsgraph    作者:exchez    | 项目源码 | 文件源码
def query(vec, model, k, max_search_radius):
    """Find the approximate ``k`` nearest neighbours of ``vec`` using the LSH model.

    Parameters
    ----------
    vec : query vector; must support ``.dot(random_vectors)``
    model : dict with the keys 'data', 'table' and 'random_vectors'
    k : number of neighbours to keep
    max_search_radius : bins are searched for every radius from 0 up to and
        including this value

    Returns
    -------
    (DataFrame, int)
        The ``k`` candidates with the smallest cosine distance (columns 'id'
        and 'distance', sorted by distance) and the number of candidates that
        were examined.
    """
    data = model['data']
    table = model['table']
    random_vectors = model['random_vectors']

    # Compute bin index for the query vector, in bit representation.
    bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()

    # Search nearby bins and collect candidates
    candidate_set = set()
    for search_radius in range(max_search_radius+1):
        candidate_set = search_nearby_bins(bin_index_bits, table, search_radius, initial_candidates=candidate_set)

    candidate_ids = list(candidate_set)
    if not candidate_ids:
        # Nothing hashed near the query: indexing with an empty (float) index
        # array would raise, so return an empty result instead.
        return pd.DataFrame(columns=['id', 'distance']), 0

    # Sort candidates by their true distances from the query
    nearest_neighbors = pd.DataFrame({'id': candidate_ids})
    candidates = data[np.array(candidate_ids), :]
    nearest_neighbors['distance'] = pairwise_distances(candidates, vec, metric='cosine').flatten()

    return nearest_neighbors.sort_values(by='distance').head(k), len(candidate_ids)
项目:CIKM_AnalytiCup_2017    作者:zxth93    | 项目源码 | 文件源码
def pre_train(train_df, test_df, train_add, test_add, n_neighbors=10):
    """Select the training rows that are closest to any test row.

    The feature matrices are built by horizontally stacking the main and the
    additional frames. The first column of every frame is dropped, and for the
    training frames the last column is dropped as well (presumably an id and
    the target column; confirm against the callers).

    Parameters
    ----------
    train_df, train_add : DataFrames holding the training features
    test_df, test_add : DataFrames holding the test features
    n_neighbors : int, default 10
        Number of nearest training rows collected for every test row. If fewer
        training rows exist, all of them are used.

    Returns
    -------
    list
        Unique positional indices into the training matrix.
    """
    train = np.hstack((train_df.values[:, 1:-1], train_add.values[:, 1:-1]))
    dtest = np.hstack((test_df.values[:, 1:], test_add.values[:, 1:]))

    cor_distance = pairwise.pairwise_distances(dtest, train)

    selected = set()
    for row in cor_distance:
        # Slicing (instead of indexing 0..9) cannot run past the end of the row
        # when there are fewer training rows than n_neighbors.
        selected.update(np.argsort(row)[:n_neighbors])

    return list(selected)
项目:Default-Credit-Card-Prediction    作者:AlexPnt    | 项目源码 | 文件源码
def predict(self, X):
        """
        Classify the input data assigning the label of the nearest prototype

        The distance from every sample to every prototype in ``self.M_`` is
        computed with ``self.distance_metric`` when it is one of "euclidean",
        "minkowski", "manhattan" or "mahalanobis"; any other value falls back
        to "euclidean".

        Keyword arguments:
        X -- The feature vectors

        Returns a float array with one entry per sample, taken from
        ``self.outcomes`` at the position of the closest prototype.
        """
        supported_metrics = ("euclidean", "minkowski", "manhattan", "mahalanobis")
        if self.distance_metric in supported_metrics:
            metric = self.distance_metric
        else:
            metric = "euclidean"

        # compute distances to the prototypes (template matching); done once,
        # the previous if/if/elif chain evaluated the euclidean case twice
        distances = pairwise_distances(X, self.M_, metric)

        # choose the class belonging to nearest prototype distance
        # (argmin returns the first minimum, like list.index(min(...)))
        nearest = np.argmin(distances, axis=1)
        classification = np.zeros(len(X))
        for i, prototype_idx in enumerate(nearest):
            classification[i] = self.outcomes[prototype_idx]

        return classification
项目:sef    作者:passalis    | 项目源码 | 文件源码
def test_distance_calculations():
    """
    Check ``fast_distance_matrix`` against ``pairwise_distances``.

    Two random matrices (10 x 23 and 5 x 23) are compared; the result must be
    10 x 5 and the sum of squared differences to the reference must stay
    below 1e-3.
    :return:
    """
    np.random.seed(1)

    # Create random data vectors
    A = np.random.randn(10, 23)
    B = np.random.randn(5, 23)

    sef_dists = fast_distance_matrix(A, B)

    assert sef_dists.shape[0] == 10
    assert sef_dists.shape[1] == 5

    dists = pairwise_distances(A, B)

    # Use the squared error. The previous ``* 2`` only doubled the signed
    # difference, so errors of opposite sign cancelled and the check could
    # pass for a wrong result.
    assert np.sum((sef_dists-dists)**2) < 1e-3
项目:sef    作者:passalis    | 项目源码 | 文件源码
def mean_data_distance(data):
    """
    Average of the full pairwise distance matrix of ``data``.

    The mean is taken over every entry returned by ``pairwise_distances(data)``.
    :param data: the data points, one per row
    :return: the mean distance
    """
    all_pairs = pairwise_distances(data)
    return np.mean(all_pairs)
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:image-classifier    作者:gustavkkk    | 项目源码 | 文件源码
def compare_pic(self,feature1,feature2):
    """Return the cosine distance matrix between ``feature2`` and ``feature1``.

    ``feature2`` is passed as the first argument of ``pw.pairwise_distances``
    and ``feature1`` as the second; the result is returned unchanged.
    """
    return pw.pairwise_distances(feature2, feature1, 'cosine')
项目:cbof    作者:passalis    | 项目源码 | 文件源码
def initialize_layer(self, data, n_samples=10000):
        """
        Initialize the codebook with k-means and rescale gamma by the mean
        pairwise distance between the learned codewords.

        Feature vectors are collected from ``data`` in random order until more
        than ``n_samples`` of them are available; k-means is then fitted on them.
        :param data: data; indexed along its first axis, one item at a time
        :param n_samples: collection stops once more than this many feature
            vectors have been gathered
        :return:
        """
        assert self.features_fn is not None

        order = np.arange(data.shape[0])
        np.random.shuffle(order)

        collected = []
        for sample_idx in order:
            maps = self.features_fn([data[sample_idx]])
            # Move axis 1 to the end so the last axis indexes the feature
            # dimension (presumably channels-first feature maps; confirm).
            maps = maps.transpose((0, 2, 3, 1))
            collected.extend(maps.reshape((-1, maps.shape[-1])))
            if len(collected) > n_samples:
                break
        collected = np.asarray(collected)

        km = KMeans(n_clusters=self.n_codewords, n_jobs=4, n_init=5)
        km.fit(collected)
        codebook = km.cluster_centers_.copy()

        # Initialize gamma: sum of all pairwise distances divided by the
        # number of ordered codeword pairs.
        n_pairs = self.n_codewords * (self.n_codewords - 1)
        mean_distance = np.sum(pairwise_distances(codebook)) / n_pairs
        self.gamma.set_value(self.gamma.get_value() * np.float32(mean_distance))

        # Initialize codebook, stored with two trailing singleton axes
        codebook = codebook.reshape((codebook.shape[0], codebook.shape[1], 1, 1))
        self.V.set_value(np.float32(codebook))
项目:vec4ir    作者:lgalke    | 项目源码 | 文件源码
def delta(X, Y, n_jobs=-1, a=1, c=0):
    """Pairwise delta function: sigmoid of the shifted and scaled cosine distance.

    :X: first collection of vectors
    :Y: second collection of vectors
    :n_jobs: forwarded to ``pairwise_distances``
    :a: scale factor applied after the shift (skipped when equal to 1)
    :c: offset subtracted from the distances (skipped when equal to 0)
    :returns: ``expit(a * (cosine_distance(X, Y) - c))``

    """
    dist = pairwise_distances(X, Y, metric="cosine", n_jobs=n_jobs)
    # Shift first, then scale; both are applied in place.
    if c != 0:
        dist -= c
    if a != 1:
        dist *= a
    return expit(dist)
项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def test_euclidean2cosine():
    """Check ``seuclidean_dist2cosine_sim`` on a pair of normalized vectors.

    The cosine similarity computed directly must match the value converted
    from the squared euclidean distance.
    """
    from sklearn.metrics.pairwise import pairwise_distances
    x = normalize([[0, 2, 3, 5]])
    y = normalize([[1, 3, 6, 7]])

    cosine_similarity = 1 - pairwise_distances(x, y, metric='cosine')[0, 0]
    sq_euclidean = pairwise_distances(x, y, metric='euclidean', squared=True)[0, 0]

    assert_allclose(cosine_similarity, seuclidean_dist2cosine_sim(sq_euclidean))
项目:retrieval-2016-deepvision    作者:imatge-upc    | 项目源码 | 文件源码
def get_distances(self):
        """Return the distances between the query and the database features.

        ``self.dist_type`` is passed as the metric and all CPU cores are
        requested (``n_jobs=-1``).
        """
        return pairwise_distances(self.query_feats, self.db_feats,
                                  self.dist_type, n_jobs=-1)
项目:dyfunconn    作者:makism    | 项目源码 | 文件源码
def fit(self, data):
        """
        Train the prototypes and their context vectors on ``data``.

        Each iteration draws one random sample, ranks all prototypes by a
        blend of the sample-to-prototype distance and the context distance,
        and moves every prototype and context vector towards the sample and
        the current context with a rank-dependent weight.

        :param data: 2D array, one sample per row
        :return: the instance itself
        """
        [n_samples, n_obs] = data.shape
        self.protos = data[self.rng.choice(n_samples, self.n_protos),] # w
        self.context = np.zeros(self.protos.shape)                     # c

        # Winner prototype / winner context of the previous step
        wr = np.zeros((1, n_obs))
        cr = np.zeros((1, n_obs))
        for iteration in range(self.iterations):
            sample = data[self.rng.choice(n_samples, 1),]

            ct = (1 - self.a) * wr + self.b * cr

            t = iteration / float(self.iterations)
            lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
            # NOTE(review): epsilon decays with the lrate_f / lrate_i ratio; an
            # epsilon_f / epsilon_i ratio may have been intended - confirm
            # before changing, it is left as it was.
            epsilon = self.epsilon_i * (self.lrate_f / float(self.lrate_i)) ** t

            d = (1 - self.a) * pairwise_distances(sample, self.protos) + self.a * pairwise_distances(ct, self.context)
            # Double argsort turns the distances into ranks (0 = closest)
            I = np.argsort(np.argsort(d))

            # ``I`` has shape (1, n_protos), so the winner is the *column* with
            # rank 0. Taking element [0] of np.where returned the row index,
            # which is always 0, i.e. prototype 0 was always the winner.
            min_id = np.where(I == 0)[1]

            H = np.exp(-I / epsilon).ravel()

            # Both differences are taken before any update is applied
            diff_w = sample - self.protos
            diff_c = ct - self.context
            self.protos += lrate * H[:, np.newaxis] * diff_w
            self.context += lrate * H[:, np.newaxis] * diff_c

            wr = self.protos[min_id]
            cr = self.context[min_id]

        return self
项目:dyfunconn    作者:makism    | 项目源码 | 文件源码
def encode(self, data, metric = 'euclidean'):
        """ Encode ``data`` by replacing every sample with its nearest prototype.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            Passed to ``NearestNeighbors``; see
            `http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html`.

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like
            ``self.protos`` indexed by ``ts_symbols``.
        ts_symbols : array, shape(n_samples, 1)
            Index of the nearest prototype for every sample (a discrete
            symbolic time series).
        """
        nn_index = NearestNeighbors(n_neighbors = 1, algorithm = 'auto', metric = metric)
        nn_index.fit(self.protos)

        # kneighbors returns (distances, indices); only the indices are kept.
        self.__symbols = nn_index.kneighbors(data)[1]
        self.__encoding = self.protos[self.__symbols]

        return self.__encoding, self.__symbols
项目:dyfunconn    作者:makism    | 项目源码 | 文件源码
def fit(self, data):
        """ Learn a vector codebook from ``data``.

        Prototypes start as randomly drawn samples. In every iteration one
        random sample is drawn, the prototypes are ranked by their euclidean
        distance to it, and each prototype is pulled towards the sample with a
        weight that decays exponentially with its rank. The learning rate and
        the neighbourhood width are interpolated geometrically between their
        initial and final values.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        Returns
        -------
        self : object
            The instance itself
        """
        n_samples, _ = data.shape
        self.protos = data[self.rng.choice(n_samples, self.n_protos), ]

        for step in range(self.iterations):
            x = data[self.rng.choice(n_samples, 1), ]

            progress = step / float(self.iterations)
            lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** progress
            epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** progress

            dists = pairwise_distances(x, self.protos, metric='euclidean', n_jobs=self.n_jobs)
            # Double argsort converts distances to ranks (0 = closest prototype)
            ranks = np.argsort(np.argsort(dists))

            weights = np.exp(-ranks / epsilon).ravel()

            # Row-wise update of every prototype, expressed as one broadcast
            self.protos += lrate * weights[:, np.newaxis] * (x - self.protos)

        return self
项目:dyfunconn    作者:makism    | 项目源码 | 文件源码
def encode(self, data, metric='euclidean'):
        """ Encode ``data`` with the codebook after ordering the prototypes.

        The prototypes are embedded in one dimension with MDS and sorted by
        that coordinate; the nearest-neighbor rule is then applied against the
        sorted prototypes, so the returned symbols index the sorted codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            Passed to ``NearestNeighbors``; see
            http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like
            The sorted prototypes indexed by ``ts_symbols``.
        ts_symbols : array, shape(n_samples, 1)
            Index (into the sorted prototypes) of the nearest prototype for
            every sample.
        """
        # Perform a proposed data mining procedure as described in [Laskaris2004].
        embedding = MDS(1, random_state=self.rng).fit_transform(self.protos)
        order = np.argsort(embedding.ravel())
        sprotos = self.protos[order]

        nn_index = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric)
        nn_index.fit(sprotos)

        # kneighbors returns (distances, indices); only the indices are kept.
        self.__symbols = nn_index.kneighbors(data)[1]
        self.__encoding = sprotos[self.__symbols]

        return self.__encoding, self.__symbols
项目:newsgraph    作者:exchez    | 项目源码 | 文件源码
def grab_articles(self, ids):
    """Rank stored articles by cosine distance to the query article of this task.

    The tf-idf vectors of the requested articles are loaded from the database,
    the query article (read from the task's stored data under ``task_id``) is
    appended as the last row, and every article is scored against it. Articles
    with a distance below 0.75 are kept, enriched with headline/url/date and
    sorted by date. Progress is reported by updating the 'status' field of the
    stored task data.

    Parameters
    ----------
    ids : sequence whose first element is the collection of article ids

    Returns
    -------
    (articles, headline)
        The result of ``helpers.make_article_array`` and the headline of the
        query article.
    """
    task_id = self.request.id
    ids = ids[0]
    print("Entering Grab Articles Task: ", len(ids))
    print("Task id from self: ", task_id)

    s = select([articles_db.c.id, articles_db.c.tfidf]).where(articles_db.c.id.in_(ids))
    # Read in chunks and stitch the pieces back together
    all_articles = pd.read_sql(s, con=connection, chunksize=350)
    all_articles = pd.concat(all_articles, ignore_index=True)

    stored_data = json.loads(r.get(task_id))
    stored_data['status'] = "creating article matrix"
    r.set(task_id, json.dumps(stored_data))

    tfidf_dict = stored_data['tfidf_dict']
    # DataFrame.append was removed in pandas 2.0; concat gives the same result.
    # NOTE(review): the query row uses the hard-coded id 1; if a stored article
    # also has id 1 the join on 'id' below would mix the two - confirm.
    query_row = pd.DataFrame({'id': [1], 'tfidf': [tfidf_dict]})
    all_articles = pd.concat([all_articles, query_row], ignore_index=True)
    corpus = helpers.generate_sparse_matrix(all_articles)
    # The query article was appended last, so it is the final row
    query_article_vector = corpus.getrow(-1)
    all_articles['distance'] = pairwise_distances(corpus, query_article_vector, metric='cosine').flatten()

    stored_data['status'] = "computing best matches"
    r.set(task_id, json.dumps(stored_data))

    max_distance_from_query = 0.75  # on a scale of 0 (exact match) to 1.0 (not even close)
    all_articles = all_articles[all_articles['distance'] < max_distance_from_query]
    print("Done computing matrix and distances")
    s = select([articles_db.c.id, articles_db.c.headline, articles_db.c.url, articles_db.c.date]).where(
        articles_db.c.id.in_(all_articles['id'].tolist()))
    all_articles = pd.read_sql(s, connection).set_index('id').join(all_articles.set_index('id')).sort_values(by='date')

    query_article = {'headline': stored_data['headline'], 'date': datetime.strptime(stored_data['date'], "%d-%b-%Y"),
                     'distance': 0, 'url': stored_data['url']}
    articles = helpers.make_article_array(all_articles, query_article)
    return articles, query_article['headline']
项目:semspaces    作者:pmandera    | 项目源码 | 文件源码
def pairwise_distances(self, X, Y=None, metric='cosine',
                           n_jobs=1, **kwds):
        """Pairwise distances between ``X`` and ``Y``.

        When ``self.prenorm`` is set, only the cosine metric is supported and
        the work is delegated to ``self._cosine_distances_prenorm``; any other
        metric raises ``Exception``. Otherwise the call is forwarded to
        ``smp.pairwise_distances`` with all arguments.
        """
        if not self.prenorm:
            return smp.pairwise_distances(X, Y, metric=metric,
                                          n_jobs=n_jobs, **kwds)

        if metric != 'cosine':
            raise Exception(
                'Vectors are normalized and will work only with cosine.')

        return self._cosine_distances_prenorm(X, Y)
项目:semspaces    作者:pmandera    | 项目源码 | 文件源码
def all_distances(self, l1, metric='cosine'):
        """Return a DataFrame of distances from the words in ``l1`` to all words.

        Rows are labelled with ``self.label`` of each entry of ``l1`` and
        columns with ``self.words``.
        """
        query_vecs = self.word_vectors_matrix(l1)
        row_labels = [self.label(entry) for entry in l1]

        dist = self.pairwise_distances(query_vecs, self.vectors, metric=metric)

        return pd.DataFrame(data=dist, index=row_labels, columns=self.words)
项目:semspaces    作者:pmandera    | 项目源码 | 文件源码
def pair_distance(self, w1, w2, metric='cosine'):
        """Return the distance between the vectors of the words ``w1`` and ``w2``.

        The value is element ``[0, 0]`` of the matrix returned by
        ``self.pairwise_distances``.
        """
        first = self.get_vector(w1)
        second = self.get_vector(w2)
        return self.pairwise_distances(first, second, metric=metric)[0, 0]
项目:semspaces    作者:pmandera    | 项目源码 | 文件源码
def matrix_distances(self, l1, l2=None, metric='cosine'):
        """Return distance matrix with distances between pairs of words.

        Parameters
        ----------
        l1 : words labelling the rows
        l2 : words labelling the columns; when omitted the distances are
            computed among the words of ``l1`` themselves
        metric : metric name forwarded to ``self.pairwise_distances``

        Returns
        -------
        DataFrame indexed by the labels of ``l1`` with one column per label
        of ``l2`` (or of ``l1`` when ``l2`` is None).
        """
        l1_vecs = self.word_vectors_matrix(l1)
        l1_labels = [self.label(e) for e in l1]

        if l2 is None:
            sims = self.pairwise_distances(l1_vecs, metric=metric)
            # The column labels were left undefined on this path, which made
            # the DataFrame construction below fail with a NameError.
            l2_labels = l1_labels
        else:
            l2_vecs = self.word_vectors_matrix(l2)
            l2_labels = [self.label(e) for e in l2]
            sims = self.pairwise_distances(l1_vecs, l2_vecs, metric=metric)

        return pd.DataFrame(sims, l1_labels, l2_labels)
项目:CNN_Visualization    作者:albioTQ    | 项目源码 | 文件源码
def computeProbabilities(X, perplexity=30.0, tolerance=1e-5):
    """Compute the row-normalized matrix of conditional probabilities ``P``.

    ``X`` is first reduced with PCA (at most 50 components). For every point
    ``i`` a bandwidth is found with ``binarySearch`` for the requested
    perplexity, and the squared distances to all other points are turned into
    probabilities that sum to one. The diagonal of ``P`` is zero.

    Parameters
    ----------
    X : 2D data, one sample per row
    perplexity : target perplexity handed to ``binarySearch``
    tolerance : tolerance handed to ``binarySearch``

    Returns
    -------
    P : array, shape (n_samples, n_samples)
    """
    #Perform an initial dimensionality reduction; never ask for more
    #components than the data can provide (a fixed 50 fails on small inputs)
    pca = PCA(n_components=min(50, *np.shape(X)))

    X = pca.fit_transform(X)

    numSamples = X.shape[0]

    P = np.zeros((numSamples, numSamples))

    D = pairwise_distances(X, squared=True)

    for i in range(numSamples):
        #Squared distances from point I to every other point (I itself removed)
        distancesFromI = np.delete(D[i], i)

        sigma = binarySearch(computePerplexity, distancesFromI, tolerance, perplexity)

        precision = 1.0 / sigma
        #Compute a "row" of matrix P: the probabilities wrt point I
        PwrtI = np.exp(- distancesFromI * precision)
        PwrtI /= PwrtI.sum()
        #Insert an element corresponding to I wrt I, then store the row
        P[i, :] = np.insert(PwrtI, i, 0.0)

    return P
项目:SpindleNet    作者:yokattame    | 项目源码 | 文件源码
def main(args):
  """Evaluate and print CMC top-k accuracy averaged over random gallery draws.

  Probe/gallery features and labels are loaded from ``args.result_dir`` and
  the gallery-to-probe distances are computed with the metric named by
  ``args.method``. In each of 100 trials one gallery sample is drawn at random
  for every gallery label and ``_cmc_core`` is evaluated on the reduced
  distance matrix; the averaged scores are printed for top-1/5/10/20.
  """
  PF, PL, GF, GL = _get_test_data(args.result_dir)
  D = pairwise_distances(GF, PF, metric=args.method, n_jobs=-2)

  gallery_labels_set = np.unique(GL)

  for label in PL:
    if label not in gallery_labels_set:
      print('Probe-id is out of Gallery-id sets.')

  Times = 100
  k = 20

  res = np.zeros(k)

  # Gallery row indices for every distinct label, in the order of
  # gallery_labels_set. Looking rows up by label value (instead of using the
  # label as a list position) also works for non-contiguous label ids.
  GL = np.asarray(GL)
  gallery_rows = [np.flatnonzero(GL == g) for g in gallery_labels_set]

  for __ in range(Times):
    # Randomly select one gallery sample per label selected
    newD = np.zeros((gallery_labels_set.size, PL.size))
    for i, rows in enumerate(gallery_rows):
      j = np.random.choice(rows)
      newD[i, :] = D[j, :]
    # Compute CMC
    res += _cmc_core(newD, gallery_labels_set, PL, k)
  res /= Times

  for topk in [1, 5, 10, 20]:
    print("{:8}{:8.1%}".format('top-' + str(topk), res[topk - 1]))
项目:DeepID2    作者:chenzeyuczy    | 项目源码 | 文件源码
def getDist(feat1, feat2, metric):
    """Return the distance between each row of ``feat1`` and the same row of ``feat2``.

    The full pairwise matrix is computed with the given ``metric`` and its
    diagonal is returned as a float array with ``len(feat1)`` entries.

    Raises IndexError when ``feat2`` has fewer rows than ``feat1``.
    """
    import sklearn.metrics.pairwise as pw
    pair_num = len(feat1)
    mt = pw.pairwise_distances(feat1, feat2, metric=metric)
    if mt.shape[1] < pair_num:
        raise IndexError('feat2 has fewer rows than feat1')
    # np.diagonal returns a read-only view; copy it into a fresh float array
    return np.array(np.diagonal(mt)[:pair_num], dtype=float)

# Extract feature via network.
项目:graph-based-semi-supervised-learning    作者:deerishi    | 项目源码 | 文件源码
def constructCovarianceMatrix(self):
        """Build k-NN graphs in a covariance-transformed space and evaluate them.

        The covariance of ``self.trainVectorsPCA`` is computed, all data is
        projected with the transposed Cholesky factor of that covariance, and a
        t-SNE scatter plot of the projected data is written to ``pp``. Then,
        for every ``k``, each row of the distance matrix keeps only its ``k``
        smallest entries (``self.pwdis``), ``self.D`` receives the row sums on
        its diagonal, and ``self.labelPropogation()`` is evaluated. The
        accuracies are plotted against ``k``.

        NOTE(review): the titles mention the Mahalanobis metric, but the data is
        multiplied by the Cholesky factor of the covariance itself, not of its
        inverse - confirm that this is the intended transform.
        """
        #np.cov treats rows as variables, hence the transpose
        self.covarianceMatrix=np.cov(self.trainVectorsPCA.T)
        self.inverseCovarianceMatrix=np.linalg.inv(self.covarianceMatrix)

        #compute the cholesky decomposition and then transform the data into the new space
        self.L_cov=np.linalg.cholesky(self.covarianceMatrix)
        self.allDataCov=np.dot(self.allDataPCA,self.L_cov.T)
        distances=pairwise_distances(self.allDataCov)
        self.pwdis=distances.copy()
        self.D=np.zeros(distances.shape)
        projectedDigits=TSNE(random_state=randomState).fit_transform(self.allDataCov)
        plt.figure()
        plt.scatter(projectedDigits[:,0],projectedDigits[:,1],c=self.labels)
        plt.title('Data projected by Covariance Matrix in Mahalanobis metric')
        plt.savefig(pp,format='pdf')
        plt.close()

        ks=[3,5,7,10,12,15,20,22,25,27,30,33,35,37,40,43,45,47,50,53,55,57,60,65]
        accs=[]
        for k in ks:
            #start from the untouched distances for every k; truncating one
            #shared matrix in place left later values of k with a matrix that
            #had already been zeroed out
            self.pwdis=distances.copy()
            self.D=np.zeros(distances.shape)
            for i in range(0,self.pwdis.shape[0]):
                #stable sort, same tie order as sorted()
                order=np.argsort(self.pwdis[i],kind='mergesort')
                #keep the k smallest entries of the row, zero the rest
                self.pwdis[i,order[k:]]=0
                self.D[i,i]=np.sum(self.pwdis[i]+0.01)

            print('accuracy by using Covariance Matrix for Mahalanobis Distance for k=  %s \n' % (k,))
            accs.append(self.labelPropogation())

        plt.figure()
        plt.plot(ks,accs)
        plt.title('Plot of accuracy vs k using Covariance Matrix in  Mahalanobis metric')
        plt.savefig(pp,format='pdf')
        plt.close()
项目:graph-based-semi-supervised-learning    作者:deerishi    | 项目源码 | 文件源码
def constructEucleadianGaussianKernel(self):
        """Evaluate label propagation on Gaussian-weighted k-NN graphs (PCA data).

        For every ``k`` and every ``sigma`` the euclidean distances between the
        rows of ``self.allDataPCA`` are turned into weights
        ``exp(-d / (2 * sigma**2))``, each row keeps the weights of its ``k``
        nearest points (``self.pwdis``), ``self.D`` holds the row sums on its
        diagonal, and ``self.labelPropogation()`` is evaluated. The accuracy
        averaged over the sigmas is plotted against ``k`` and saved to ``pp``.
        """
        distances=pairwise_distances(self.allDataPCA)
        #neighbour order by increasing distance; it depends neither on k nor on
        #sigma, so it is computed once (stable sort keeps ties in index order)
        order=np.argsort(distances,axis=1,kind='mergesort')

        maccs=[]
        ks=[3,5,7,10,12,15,20,22,25,27,30,33,35,37,40,43,45,47,50,53,55,57,60,65]
        sigmas=[1,1.5,2,2.5,3,3.5]
        for k in ks:
            accs=[]
            for sigma in sigmas:
                #always start from the raw distances: transforming self.pwdis
                #in place applied the kernel again and again to its own output
                self.pwdis=np.exp(-1*distances/(2*sigma*sigma))
                for i in range(0,self.pwdis.shape[0]):
                    #zero everything except the k nearest points; sorting the
                    #similarities in ascending order kept the k farthest ones
                    self.pwdis[i,order[i,k:]]=0
                self.D=np.diag(self.pwdis.sum(axis=1))

                print('accuracy for constructEucleadianGaussianKernel with k= %s  and sigma = %s  is \n' % (k,sigma))
                accs.append(self.labelPropogation())
            maccs.append(np.mean(accs))

        plt.figure()
        plt.plot(ks,maccs)
        plt.title('Accuarcy vs k for Eucledian Gaussian Kernel')
        plt.savefig(pp,format='pdf')
        plt.close()
项目:graph-based-semi-supervised-learning    作者:deerishi    | 项目源码 | 文件源码
def constructEucleadianGaussianKernelNoPca(self):
        """Evaluate label propagation on Gaussian-weighted k-NN graphs (raw data).

        For every ``k`` and every ``sigma`` the euclidean distances between the
        rows of ``self.allVectors`` are turned into weights
        ``exp(-d / (2 * sigma**2))``, each row keeps the weights of its ``k``
        nearest points (``self.pwdis``), ``self.D`` holds the row sums on its
        diagonal, and ``self.labelPropogation()`` is evaluated. The accuracy
        averaged over the sigmas is plotted against ``k`` and saved to ``pp``.
        """
        distances=pairwise_distances(self.allVectors)
        #neighbour order by increasing distance; it depends neither on k nor on
        #sigma, so it is computed once (stable sort keeps ties in index order)
        order=np.argsort(distances,axis=1,kind='mergesort')

        maccs=[]
        ks=[3,5,7,10,12,15,20,22,25,27,30,33,35,37,40,43,45,47,50,53,55,57,60,65]
        sigmas=[1,1.5,2,2.5,3,3.5]
        for k in ks:
            accs=[]
            for sigma in sigmas:
                #always start from the raw distances: transforming self.pwdis
                #in place applied the kernel again and again to its own output
                self.pwdis=np.exp(-1*distances/(2*sigma*sigma))
                for i in range(0,self.pwdis.shape[0]):
                    #zero everything except the k nearest points; sorting the
                    #similarities in ascending order kept the k farthest ones
                    self.pwdis[i,order[i,k:]]=0
                self.D=np.diag(self.pwdis.sum(axis=1))

                #the message used to name the PCA variant of this method
                print('accuracy for constructEucleadianGaussianKernelNoPca with k= %s  and sigma = %s  is \n' % (k,sigma))
                accs.append(self.labelPropogation())
            maccs.append(np.mean(accs))

        plt.figure()
        plt.plot(ks,maccs)
        plt.title('Accuarcy vs k for Eucledian Gaussian Kernel')
        plt.savefig(pp,format='pdf')
        plt.close()
项目:graph-based-semi-supervised-learning    作者:deerishi    | 项目源码 | 文件源码
def constructSimilartyMatrixCosine(self):
        """Evaluate label propagation on cosine k-NN graphs built from the raw data.

        For every ``k`` each row of the cosine distance matrix of
        ``self.allVectors`` keeps only its ``k`` smallest entries
        (``self.pwdis``), ``self.D`` holds the row sums on its diagonal, and
        ``self.labelPropogation()`` is evaluated. The accuracies are plotted
        against ``k`` and saved to ``pp``.
        """
        #the distances do not depend on k: compute them once, copy per k
        distances=pairwise_distances(self.allVectors,metric='cosine')

        ks=[3,5,7,10,12,15,20,22,25,27,30,33,35,37,40,43,45,47,50,53,55,57,60,65]
        accs=[]
        for k in ks:
            self.pwdis=distances.copy()
            self.D=np.zeros(self.pwdis.shape)
            for i in range(0,self.pwdis.shape[0]):
                #stable sort, same tie order as sorted()
                order=np.argsort(self.pwdis[i],kind='mergesort')
                #keep the k smallest entries of the row, zero the rest
                self.pwdis[i,order[k:]]=0
                self.D[i,i]=np.sum(self.pwdis[i])

            print('accuracy on non pca data using cosine and k=  %s  is  \n' % (k,))
            accs.append(self.labelPropogation())

        plt.figure()
        plt.plot(ks,accs)
        plt.title('Plot of accuracy vs k using cosine non PCA data')
        plt.savefig(pp,format='pdf')
        plt.close()
项目:graph-based-semi-supervised-learning    作者:deerishi    | 项目源码 | 文件源码
def constructSimilartyMatrixCosinePCA(self):
        """Evaluate label propagation on cosine k-NN graphs built from the PCA data.

        For every ``k`` each row of the cosine distance matrix of
        ``self.allDataPCA`` keeps only its ``k`` smallest entries
        (``self.pwdis``), ``self.D`` holds the row sums on its diagonal, and
        ``self.labelPropogation()`` is evaluated. The accuracies are plotted
        against ``k`` and saved to ``pp``.
        """
        #the distances do not depend on k: compute them once, copy per k
        distances=pairwise_distances(self.allDataPCA,metric='cosine')

        ks=[3,5,7,10,12,15,20,22,25,27,30,33,35,37,40,43,45,47,50,53,55,57,60,65]
        accs=[]
        for k in ks:
            self.pwdis=distances.copy()
            #diagonal weight matrix, which has the sum of all the weights
            self.D=np.zeros(self.pwdis.shape)
            for i in range(0,self.pwdis.shape[0]):
                #stable sort, same tie order as sorted()
                order=np.argsort(self.pwdis[i],kind='mergesort')
                #keep the k smallest entries of the row, zero the rest
                self.pwdis[i,order[k:]]=0
                self.D[i,i]=np.sum(self.pwdis[i])

            print('Now computing accuracy for cosine metric on PCA data')
            accs.append(self.labelPropogation())

        plt.figure()
        plt.plot(ks,accs)
        plt.title('Plot of accuracy vs k using cosine  PCA data')
        plt.savefig(pp,format='pdf')
        plt.close()

        #now we have the weight matrix graph based on the cosine distance
        #print 'self.D is ',self.D
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def calc_cosine_dist(text_a ,text_b):
    """Return element ``[0][0]`` of the cosine distance matrix of the two inputs."""
    dist_matrix = pairwise_distances(text_a, text_b, metric='cosine')
    return dist_matrix[0][0]
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def calc_cosine_dist(text_a ,text_b):
    """Return element ``[0][0]`` of the cosine distance matrix of the two inputs."""
    dist_matrix = pairwise_distances(text_a, text_b, metric='cosine')
    return dist_matrix[0][0]
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def calc_cosine_dist(text_a ,text_b):
    """Return element ``[0][0]`` of the cosine distance matrix of the two inputs."""
    dist_matrix = pairwise_distances(text_a, text_b, metric='cosine')
    return dist_matrix[0][0]
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def calc_cosine_dist(text_a ,text_b, metric = 'euclidean'):
    """Return the distance between two single vectors.

    Each vector is wrapped in a one-element list before being passed to
    ``pairwise_distances``. Despite the function name, the default metric is
    'euclidean'.
    """
    dist_matrix = pairwise_distances([text_a], [text_b], metric = metric)
    return dist_matrix[0][0]
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def predict_proba(self, X):
        """
        Returns a matrix of probability-like scores for each sample and class.
        The matrix has shape = [n_samples, n_classes] where n_samples is the
        size of the first dimension of the input matrix X and n_classes is
        ``self.y.shape[1]``.

        The scores are derived from the distances to ``self.centroids_``: for
        the cosine metric ``1 - distance`` is used, for any other metric the
        distances are first divided by the largest distance. Columns that are
        not listed in ``self._mem_original_mapping`` stay zero.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Prediction vector, where n_samples in the number of samples and
            n_features is the number of features.
        """
        probabilities = np.zeros((X.shape[0], self.y.shape[1]), dtype=np.float64)
        distances = pairwise_distances(X, self.centroids_, metric=self.metric)

        # in order to get probability like values, we ensure that the closer
        # the distance is to zero, the closer the probability is to 1
        if self.metric == 'cosine':
            scores = 1 - distances
        else:
            # in the case of euclidean distance metric we need to normalize by
            # the largest distance to get a value between 0 and 1
            largest = distances.max()
            if largest > 0:
                scores = 1 - (distances / largest)
            else:
                # every sample coincides with every centroid: dividing by zero
                # would yield NaN, while a zero distance means a score of 1
                scores = np.ones_like(distances, dtype=np.float64)

        # map back onto a matrix containing all labels
        probabilities[:, self._mem_original_mapping] = scores

        return probabilities
项目:ecml17    作者:gmum    | 项目源码 | 文件源码
def assign_to_closest(X, centers, metric='euclidean'):
    """Return, for every row of ``X``, the index of the closest row of ``centers``."""
    dist = pairwise_distances(X, centers, metric=metric)
    return np.argmin(dist, axis=1)
项目:eucl_dist    作者:droyed    | 项目源码 | 文件源码
def sq_cdist(A,B):
    """Return the pairwise distances between ``A`` and ``B`` for the 'sqeuclidean' metric."""
    return pairwise_distances(A, B, 'sqeuclidean')

# Sets of inputs
项目:eucl_dist    作者:droyed    | 项目源码 | 文件源码
def sq_cdist(A,B):
    """Return the pairwise distances between ``A`` and ``B`` for the 'sqeuclidean' metric."""
    return pairwise_distances(A, B, 'sqeuclidean')

# Sets of input defining sizes
项目:tensorflow-playground    作者:wangz10    | 项目源码 | 文件源码
def sort(self, word):
        '''
        Return all words ordered by increasing cosine distance to ``word``.

        ``word`` must be a key of ``self.dictionary``; its embedding is compared
        with every row of ``self.final_embeddings`` and the row indices are
        mapped back to words through ``self.reverse_dictionary``.
        '''
        assert word in self.dictionary
        row = self.dictionary[word]
        # pairwise_distances expects 2-d input, so make the query a single row
        query = self.final_embeddings[row].reshape(1, -1)
        # Calculate pairwise cosine distance and flatten to 1-d
        dists = pairwise_distances(self.final_embeddings, query, metric='cosine').ravel()
        ranking = dists.argsort()
        return [self.reverse_dictionary[j] for j in ranking]