Python sklearn.metrics.pairwise 模块,euclidean_distances() 实例源码


项目:bolero    作者:rock-learning    | 项目源码 | 文件源码
def predict(self, X):
        """Predict ranking values for new data.

        Parameters
        ----------
        X : array, shape (n_test, n_features)
            Test data

        Returns
        -------
        y : array, shape (n_test,)
            Ranking values

        Raises
        ------
        ValueError
            If the number of columns of ``X`` differs from ``self.n_features``.
        """
        n_features = X.shape[1]

        if self.n_features != n_features:
            raise ValueError("Expected %d dimensions, got %d"
                             % (self.n_features, n_features))

        # Kernel between stored samples (rows) and test samples (columns):
        # squared distances are scaled by self.denom and exponentiated in place.
        K = euclidean_distances(self.X, X, squared=True)
        K /= self.denom
        np.exp(K, K)

        # Weight the kernel differences of consecutive stored samples by alpha
        # and sum over them, leaving one value per test sample.
        return np.sum(self.alpha[:, np.newaxis] * (K[:-1] - K[1:]), axis=0)
项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def transform(self, X):
        """Transform X into subcluster centroids dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix}, shape (n_samples, n_clusters)
            Transformed data.
        """
        # Fails early if the estimator has not been fitted yet.
        check_is_fitted(self, 'subcluster_centers_')
        return euclidean_distances(X, self.subcluster_centers_)
项目:pylmnn    作者:johny-c    | 项目源码 | 文件源码
def _select_target_neighbors(self):
        """Find the target neighbors of each sample, that stay fixed during training.

        Returns
        -------
        array_like
            An array of neighbors indices for each sample with shape (n_samples, n_neighbors).

        """
        # NOTE(review): the statement that emitted the progress message
        # 'Finding target neighbors...' was truncated in this excerpt (only the
        # string literal and a closing parenthesis survived). Restore the
        # print/logging call once the intended emitter is confirmed.
        target_neighbors = np.empty((self.X_.shape[0], self.n_neighbors_), dtype=int)
        for class_ in self.classes_:
            # Indices of all samples that carry the current class label.
            class_ind, = np.where(np.equal(self.y_, class_))
            dist = euclidean_distances(self.X_[class_ind], squared=True)
            # A sample must never be selected as its own neighbor.
            np.fill_diagonal(dist, np.inf)
            neigh_ind = np.argpartition(dist, self.n_neighbors_ - 1, axis=1)
            neigh_ind = neigh_ind[:, :self.n_neighbors_]
            # argpartition doesn't guarantee sorted order, so we sort again but only the k neighbors
            row_ind = np.arange(len(class_ind))[:, None]
            neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
            # Map positions within the class back to indices into the full data set.
            target_neighbors[class_ind] = class_ind[neigh_ind]

        return target_neighbors
项目:PersonalizedMultitaskLearning    作者:mitmedialab    | 项目源码 | 文件源码
def eta_L2(self):
        """Return ``self.V`` times the sum of all pairwise squared Euclidean
        distances between the rows of ``self.eta``."""
        # Note that V should be positive
        pairwise_sq_dists = euclidean_distances(self.eta, squared=True)
        total = np.sum(pairwise_sq_dists)
        return self.V * total
项目:ref-extract    作者:brandonrobertz    | 项目源码 | 文件源码
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    """Return the labels of the ``top`` rows of ``X`` closest to ``vec``.

    Parameters
    ----------
    X : 2-D array whose rows are compared against ``vec``.
    labels : sequence indexed in parallel with the rows of ``X``.
    vec : 1-D array; reshaped to a single row before the comparison.
    dist : 'cosine' or 'euclidean'.
    ooc_only : not used by the code visible in this excerpt.
    top : number of labels to return.

    Returns
    -------
    str
        The selected labels joined by single spaces, closest first.

    Raises
    ------
    NotImplementedError
        If ``dist`` is neither 'euclidean' nor 'cosine'.
    """
    query = vec.reshape(1, -1)
    if dist == 'euclidean':
        # Distances are negated so that, as with cosine similarity, a larger
        # score means "closer"; without this the farthest rows were returned.
        sim = -euclidean_distances(X, query)
    elif dist == 'cosine':
        sim = cosine_similarity(X, query)
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # Indices of the `top` highest scores, best first.
    indices = sim.argsort(axis=0)[-top:][::-1]
    # NOTE(review): the loop body was missing from this excerpt; collecting
    # labels[i] is the assumed intent, since the result is joined into a string.
    words = [labels[i] for i in indices.ravel()]
    return " ".join(words)
项目:Yugioh-bot    作者:will7200    | 项目源码 | 文件源码
def compare_distances(self, train_img, cluster):
        """Filter ``cluster`` down to the points near the first k-means center.

        A point is kept when its Euclidean distance to
        ``self.kmeans.cluster_centers_[0]`` is at most half the diagonal of
        ``train_img``.

        Parameters
        ----------
        train_img : 2-D array; only its ``(height, width)`` shape is used.
        cluster : sequence of points to filter.

        Returns
        -------
        list
            The points of ``cluster`` that pass the distance threshold.
        """
        # sometimes the sift algorithm matches random points on screen so therefore
        # it is necessary to determine the euclidean distances between these points
        distances = euclidean_distances([self.kmeans.cluster_centers_[0]], cluster)
        height, width = train_img.shape
        # If all the points are greater than np.sqrt((width / 2) ** 2 + (height / 2) ** 2)
        # Which then we can assume that they are not correct
        # this will only work on images that fit the same dimensions against the query image
        max_distance = np.sqrt((width / 2) ** 2 + (height / 2) ** 2)  # loop-invariant
        # NOTE(review): the body of the `if` was missing from this excerpt;
        # keeping the in-range points is the assumed intent.
        new_cluster = [
            point
            for point, distance in zip(cluster, distances[0])
            if distance <= max_distance
        ]
        return new_cluster
项目:bolero    作者:rock-learning    | 项目源码 | 文件源码
def fit(self, X):
        """Fit ranking SVM.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Training data, sorted, highest rank first

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If fewer than two training samples are given.
        """
        self.n_samples, self.n_features = X.shape
        self.n_alpha = self.n_samples - 1
        self.X = X

        if self.n_samples < 2:
            raise ValueError("Expected at least 2 training samples, got %d"
                             % self.n_samples)

        random_state = check_random_state(self.random_state)
        n_iter = self.n_iter
        if n_iter < 0:
            # A negative n_iter selects a default that grows with the
            # square root of the number of features.
            n_iter = int(50000 * np.sqrt(self.n_features))

        K = euclidean_distances(self.X, squared=True)

        # Average distance between training data
        sigma = np.sqrt(K).sum() / ((self.n_samples - 1) * self.n_samples)
        sigma *= self.c_sigma
        # Negative denominator, bounded away from zero, so that the in-place
        # exp below yields exp(-d^2 / (2 sigma^2)).
        self.denom = -np.maximum(2.0 * sigma ** 2, MACHINE_EPSILON)

        K /= self.denom
        np.exp(K, K)

        # Constraint violation cost
        Ci = np.linspace(self.n_alpha, 1, self.n_alpha) ** self.c_pow
        Ci *= 10 ** self.c_base

        # Optimize alpha parameters
        self.alpha = optimize(Ci, K, 1.0, n_iter, random_state)

        return self
项目:SecuML    作者:ANSSI-FR    | 项目源码 | 文件源码
def generateClustering(self, assignment_proba, centroids, drop_annotated_instances = False,
                           cluster_labels = None):
        """Build ``self.clusters`` from the per-instance cluster assignments.

        Parameters
        ----------
        assignment_proba : 2-D array or None
            Row ``i`` is read for instance ``i`` when given; the value is not
            used further by the code visible here.
        centroids : indexable of arrays or None
            When given, each instance is stored with its Euclidean distance to
            the centroid of its assigned cluster; otherwise the distance is None.
        drop_annotated_instances : not used by the code visible here.
        cluster_labels : indexable or None
            Optional label for each cluster.
        """
        self.clusters = [Cluster() for x in range(self.num_clusters)]
        if cluster_labels is not None:
            for x in range(self.num_clusters):
                self.clusters[x].label = cluster_labels[x]
        ids = self.instances.getIds()
        for i, instance_id in enumerate(ids):
            annotated   = self.instances.isAnnotated(instance_id)
            c           = self.assigned_clusters[i]
            proba       = None
            if assignment_proba is not None:
                proba = assignment_proba[i, :]
            label  = self.instances.getLabel(instance_id)
            family = self.instances.getFamily(instance_id)
            if centroids is not None:
                # Reshape to avoid warning from euclidean_distances
                # Does not take 1D array as input
                centroid = centroids[c].reshape(1, -1)
                features = self.instances.getInstance(instance_id).reshape(1,-1)
                distance = euclidean_distances(centroid, features)[0][0]
            else:
                # Without centroids there is nothing to measure against. The
                # missing `else` previously discarded every computed distance.
                distance = None
            self.clusters[c].addInstance(instance_id, distance, label, family, annotated)
        # Each cluster's finalComputation receives and returns the running id.
        unknown_cluster_id = 0
        for c in range(self.num_clusters):
            unknown_cluster_id = self.clusters[c].finalComputation(unknown_cluster_id)
项目:picasso    作者:jungmannlab    | 项目源码 | 文件源码
def substract_picks(self, path):
        """Remove the current picks that lie close to picks stored in a YAML file.

        The file at ``path`` must provide ``'Centers'`` (a list of coordinates)
        and ``'Diameter'``. Every current pick closer than ``Diameter / 2`` to
        any listed center is dropped; the remaining picks become ``self._picks``.

        ``self._picks`` is only assigned once the filtering has succeeded, so a
        malformed file no longer leaves it replaced by the file's centers.
        """
        oldpicks = self._picks.copy()
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects from untrusted files and is rejected by
        # PyYAML >= 6; consider yaml.safe_load if pick files are plain data.
        with open(path, 'r') as f:
            regions = yaml.load(f)
        new_picks = regions['Centers']
        diameter = regions['Diameter']

        if len(oldpicks) and len(new_picks):
            # True for every old pick that has at least one new pick within
            # half a diameter.
            too_close = (euclidean_distances(oldpicks, new_picks) < diameter / 2).any(axis=1)
            filtered_list = [pick for pick, drop in zip(oldpicks, too_close) if not drop]
        else:
            # Nothing to compare against: every old pick (if any) is kept.
            filtered_list = list(oldpicks)

        output = False  # debugging switch: set to True to plot the three pick sets

        if output:
            x_cord = np.array([_[0] for _ in new_picks])
            y_cord = np.array([_[1] for _ in new_picks])
            x_cord_old = np.array([_[0] for _ in oldpicks])
            y_cord_old = np.array([_[1] for _ in oldpicks])
            x_cord_new = np.array([_[0] for _ in filtered_list])
            y_cord_new = np.array([_[1] for _ in filtered_list])
            plt.figure()
            plt.title('Old picks and new picks')
            plt.scatter(x_cord,-y_cord, c='r', label='Newpicks')
            plt.scatter(x_cord_old,-y_cord_old, c='b', label='Oldpicks')
            plt.scatter(x_cord_new,-y_cord_new, c='g', label='Picks to keep')
        self._picks = filtered_list

项目:soft-dtw    作者:mblondel    | 项目源码 | 文件源码
def compute(self):
        """Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix holding the squared Euclidean distances between
            the rows of ``self.X`` and the rows of ``self.Y``.
        """
        return euclidean_distances(self.X, self.Y, squared=True)
项目:tslearn    作者:rtavenar    | 项目源码 | 文件源码
def compute(self):
        """Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix holding the squared Euclidean distances between
            the rows of ``self.X`` and the rows of ``self.Y``.
        """
        return euclidean_distances(self.X, self.Y, squared=True)
项目:operalib    作者:operalib    | 项目源码 | 文件源码
def first_periodic_kernel(X, Y=None, gamma=None, period=None):
    # TODO: Add mathematical form of the kernel in the docstring
    """Compute the first periodic kernel between *X* and *Y*.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float, default None
        If None, default to 0.8.

    period : float, default None
        If None, default to 2 * pi.

        This parameter should not be default as
        wrong estimation lead to poor learning score.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 0.8

    if period is None:
        period = 2. * pi

    # a: decay rate of the envelope; b: angular frequency of the periodic
    # term; c: normalisation constant applied to the result.
    a = -log(gamma) / period
    b = 2 * pi / period
    c = sqrt(pi / a) * (exp(- b ** 2 / (4 * a)) + 1)
    K = euclidean_distances(X, Y, squared=True)

    # TODO: Optimize to avoid temporary?
    return exp(-a * K) * (1 + cos(b * sqrt(K))) / c
项目:semeval2017    作者:edilsonacjr    | 项目源码 | 文件源码
def fit(self, X, y):
        """Build a connected nearest-neighbour similarity graph and map each
        detected community to its most frequent class label.

        Starting from ``self.k``, each row of the similarity matrix
        ``1 / (1 + distance)`` keeps only its ``k + 1`` largest entries; ``k``
        grows until the resulting graph is connected. Communities are then
        found with ``community_multilevel`` and ``self.mapping`` stores, for
        each community, the majority label among its members.

        Parameters
        ----------
        X : array-like of samples (one per row).
        y : labels aligned with the rows of ``X``.
        """
        eucl = euclidean_distances(X)
        n_samples = eucl.shape[0]

        k = self.k
        while True:
            simi_m = 1 / (1 + eucl)
            # Never negative: a negative count would be read as a slice from
            # the end and wipe almost every similarity.
            to_remove = max(n_samples - (k + 1), 0)

            for vec in simi_m:
                vec[vec.argsort()[:to_remove]] = 0

            g = Graph.Weighted_Adjacency(simi_m.tolist(), mode=ADJ_UNDIRECTED, loops=False)

            # NOTE(review): the body of this `if` was missing from the excerpt;
            # leaving the loop is the assumed intent. The `to_remove == 0`
            # guard guarantees termination once nothing is pruned any more.
            if g.is_connected() or to_remove == 0:
                break
            k += 1

        self.k = k
        comm = g.community_multilevel()
        self.y_comm = np.array(comm.membership)
        self.y = y
        self.X = X
        self.mapping = {}
        labels = np.asarray(y)  # boolean-mask indexing below needs an array
        for c in set(comm.membership):
            com_clas = labels[self.y_comm == c]
            self.mapping[c] = Counter(com_clas).most_common(1)[0][0]
项目:semeval2017    作者:edilsonacjr    | 项目源码 | 文件源码
def predict(self, X):
        """Predict a label for each row of ``X``.

        For every sample, the communities (``self.y_comm``) of its ``self.k``
        most similar stored samples are looked up.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X``.
        """
        y_pred = []
        for x in X:
            dists = euclidean_distances([x], self.X)[0]
            simi_m = 1 / (1 + dists)
            # Communities of the k most similar stored samples.
            nearest_com = self.y_comm[simi_m.argsort()[-self.k:]]
            # NOTE(review): the statement filling y_pred was missing from this
            # excerpt. Taking the majority community and translating it
            # through self.mapping is the assumed intent.
            majority_com = Counter(nearest_com).most_common(1)[0][0]
            y_pred.append(self.mapping[majority_com])

        return np.array(y_pred)
项目:clust    作者:BaselAbujamous    | 项目源码 | 文件源码
def dist_matrices(X1, X2, criterion='euclidean'):
    """Return the pairwise distances between the rows of ``X1`` and ``X2``.

    1-D inputs are reshaped into 2-D matrices before the distance is computed:

    * both 1-D, equal length: each is treated as a single row vector;
    * both 1-D, different lengths: each is treated as a column vector;
    * ``X1`` 1-D, ``X2`` 2-D: ``X1`` becomes a row if its length matches the
      columns of ``X2``, a column if ``X2`` has one column, or a row compared
      against the transpose of ``X2`` if its length matches the rows of ``X2``;
    * ``X1`` 2-D, ``X2`` 1-D: ``X2`` becomes a row if its length matches the
      columns of ``X1``.

    Parameters
    ----------
    X1, X2 : array-like, 1-D or 2-D.
    criterion : 'euclidean' (the only implemented criterion).

    Raises
    ------
    ValueError
        If the shapes cannot be reconciled or the criterion is unknown.
    NotImplementedError
        If ``criterion`` is 'hamming'.
    """
    X1loc = np.array(X1)
    X2loc = np.array(X2)

    if len(X1loc.shape) == 1:
        if len(X2loc.shape) == 1:
            if X1loc.shape[0] == X2loc.shape[0]:
                # As row vectors
                X1loc = X1loc.reshape(1, -1)
                X2loc = X2loc.reshape(1, -1)
            else:
                # As column vectors
                X1loc = X1loc.reshape(-1, 1)
                X2loc = X2loc.reshape(-1, 1)
        else:
            if X1loc.shape[0] == X2loc.shape[1]:
                # Row vector VS. Many rows
                X1loc = X1loc.reshape(1, -1)
            elif X2loc.shape[1] == 1:
                # Column vector VS. Column vector
                X1loc = X1loc.reshape(-1, 1)
            elif X1loc.shape[0] == X2loc.shape[0]:
                # Row vector VS. transposed columns
                X1loc = X1loc.reshape(1, -1)
                X2loc = X2loc.transpose()
            else:
                raise ValueError('Invalid dimensions of X1 and X2')
    elif len(X2loc.shape) == 1:
        if X2loc.shape[0] == X1loc.shape[1]:
            # Many rows VS. row vector
            X2loc = X2loc.reshape(1, -1)
        else:
            raise ValueError('Invalid dimensions of X1 and X2')

    if criterion == 'euclidean':
        return skdists.euclidean_distances(X1loc, X2loc)
    elif criterion == 'hamming':
        raise NotImplementedError('Hamming distance between rows of matrices has not been implemented yet.')
    else:
        raise ValueError('Invalid distance criterion')
项目:ProjectOfDataMining    作者:IljaNovo    | 项目源码 | 文件源码
def process(self, rows_slice):
        """Write the negated squared Euclidean distances between the selected
        rows of ``self.array`` and all rows of ``self.array`` into the HDF5
        node ``self.path`` of ``self.hdf5_file``, at the same row positions."""
        selected_rows = self.array[rows_slice, ...]
        neg_sq_dists = - euclidean_distances(selected_rows, self.array, squared = True)

        # The file is only opened while the class-level lock is held.
        with Worker.hdf5_lock:
            with tables.open_file(self.hdf5_file, 'r+') as h5:
                target = h5.get_node(self.path)
                target[rows_slice, ...] = neg_sq_dists

        del selected_rows
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_euclidean_distances():
    """Check euclidean_distances on dense and sparse input and with
    precomputed squared norms."""
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # check that we still get the right answers with {X,Y}_norm_squared
    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    # NOTE(review): the continuation of this call was missing from the
    # excerpt; passing Y_norm_squared as well is the assumed intent.
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(D2, D1)
    assert_array_almost_equal(D3, D1)
    assert_array_almost_equal(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    # NOTE(review): the keyword arguments of this call were missing from the
    # excerpt; deliberately wrong (all-zero) norms are the assumed intent.
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - D1)), .01)

# Paired distances
项目:DEPICT    作者:herandy    | 项目源码 | 文件源码
def kmeans(encoder_val_clean, y, nClusters, y_pred_prev=None, weight_initilization='k-means++', seed=42, n_init=40,
           max_iter=300):
    """Cluster the encoded samples with k-means and report clustering metrics.

    NOTE(review): the end of the signature was missing from this excerpt.
    ``max_iter`` is used in the body, so it is restored as a parameter; its
    default of 300 is an assumption taken from the 'kmeans-pca' branch.

    Parameters
    ----------
    encoder_val_clean : array, shape (n_samples, n_features)
    y : ground-truth labels, used only for the printed metrics.
    nClusters : number of clusters.
    y_pred_prev : optional previous assignment to compare against.
    weight_initilization : 'kmeans-pca' or 'k-means++'.
    seed : random state passed to KMeans.
    n_init, max_iter : passed to KMeans in the 'k-means++' branch.

    Returns
    -------
    centroids : array, shape (n_features, nClusters)
        Cluster centers as columns, each scaled to unit L2 norm.
    inertia : float
    y_pred : array of cluster assignments.

    Raises
    ------
    ValueError
        If ``weight_initilization`` is not supported (previously a NameError).
    """
    # weight_initilization = { 'kmeans-pca', 'kmean++', 'random', None }

    start_time = timeit.default_timer()
    if weight_initilization == 'kmeans-pca':
        pca = PCA(n_components=nClusters).fit(encoder_val_clean)
        kmeans_model = KMeans(init=pca.components_, n_clusters=nClusters, n_init=1, max_iter=300, random_state=seed)
    elif weight_initilization == 'k-means++':
        # NOTE(review): the tail of this call was missing from the excerpt;
        # random_state=seed is assumed, mirroring the other branch. n_jobs is
        # kept as in the original but is not accepted by recent scikit-learn.
        kmeans_model = KMeans(init='k-means++', n_clusters=nClusters, n_init=n_init, max_iter=max_iter, n_jobs=15,
                              random_state=seed)
    else:
        raise ValueError('Unsupported weight_initilization: %r' % (weight_initilization,))

    y_pred = kmeans_model.fit_predict(encoder_val_clean)

    # Centers as columns, each divided by its L2 norm.
    centroids = kmeans_model.cluster_centers_.T
    centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))

    end_time = timeit.default_timer()

    print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred), '\t arc =', adjusted_rand_score(y, y_pred),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
          'K-means objective = {:.1f} '.format(kmeans_model.inertia_), '\t runtime =', end_time - start_time)

    if y_pred_prev is not None:
        # Count assignments that changed; the original counted equal ones
        # under the "Different Assignments" label.
        print('Different Assignments: ', np.sum(y_pred != y_pred_prev), '\tbestMap: ', bestMap(y_pred, y_pred_prev),
              '\tdatapoints-bestMap*datapoints: ',
              encoder_val_clean.shape[0] - bestMap(y_pred, y_pred_prev) * encoder_val_clean.shape[0])

    return centroids, kmeans_model.inertia_, y_pred
项目:FreeDiscovery    作者:FreeDiscovery    | 项目源码 | 文件源码
def _split_node(node, threshold, branching_factor):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.

    Returns
    -------
    (new_subcluster1, new_subcluster2)
    """
    new_subcluster1 = _CFSubcluster()
    new_subcluster2 = _CFSubcluster()
    # NOTE(review): the last argument of both _CFNode calls was missing from
    # this excerpt; n_features=node.n_features is assumed.
    new_node1 = _CFNode(
        threshold, branching_factor, is_leaf=node.is_leaf,
        n_features=node.n_features)
    new_node2 = _CFNode(
        threshold, branching_factor, is_leaf=node.is_leaf,
        n_features=node.n_features)
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    if node.is_leaf:
        # Splice the two new leaves into the doubly linked leaf list in place
        # of the node being split.
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)
    n_clusters = dist.shape[0]

    farthest_idx = np.unravel_index(
        dist.argmax(), (n_clusters, n_clusters))
    # Select the two rows of the farthest pair. A tuple index is required:
    # the list form `dist[[farthest_idx]]` is no longer read as a
    # multi-dimensional index by current NumPy.
    node1_dist, node2_dist = dist[(farthest_idx,)]

    node1_closer = node1_dist < node2_dist
    for idx, subcluster in enumerate(node.subclusters_):
        # NOTE(review): both branch bodies were missing from this excerpt;
        # assigning each subcluster to the closer of the two new nodes (step 3
        # of the docstring) is the assumed intent.
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
    return new_subcluster1, new_subcluster2
项目:pylmnn    作者:johny-c    | 项目源码 | 文件源码
def _find_impostors_batch(x1, x2, t1, t2, return_dist=False, batch_size=500):
        """Find impostor pairs in chunks to avoid large memory usage

        Parameters
        ----------
        x1 : array_like
            An array of transformed data samples with shape (n_samples, n_features).
        x2 : array_like
            An array of transformed data samples with shape (m_samples, n_features) where m_samples < n_samples.
        t1 : array_like
            An array of distances to the margins with shape (n_samples,).
        t2 : array_like
            An array of distances to the margins with shape (m_samples,).
        batch_size : int (Default value = 500)
            The size of each chunk of x1 to compute distances to.
        return_dist : bool (Default value = False)
            Whether to return the distances to the impostors.

        Returns
        -------
        tuple: (array_like, array_like, [array_like])

            imp1 : array_like
                An array of sample indices with shape (n_impostors,).
            imp2 : array_like
                An array of sample indices that violate a margin with shape (n_impostors,).
            dist : array_like, optional
                An array of pairwise distances of (imp1, imp2) with shape (n_impostors,).

        """
        n = len(t1)
        imp1, imp2, dist = [], [], []
        for chunk in gen_batches(n, batch_size):
            dist_out_in = euclidean_distances(x1[chunk], x2, squared=True)
            # Pairs that fall inside the margin of the x1 sample, then pairs
            # that fall inside the margin of the x2 sample.
            i1, j1 = np.where(dist_out_in < t1[chunk, None])
            i2, j2 = np.where(dist_out_in < t2[None, :])
            # NOTE(review): the statements filling imp2 were missing from this
            # excerpt. The column indices j1/j2 are assumed, since the
            # docstring requires imp1 and imp2 to have equal length.
            if len(i1):
                # Row indices are chunk-relative; shift them to x1 positions.
                imp1.extend(i1 + chunk.start)
                imp2.extend(j1)
                if return_dist:
                    dist.extend(dist_out_in[i1, j1])
            if len(i2):
                imp1.extend(i2 + chunk.start)
                imp2.extend(j2)
                if return_dist:
                    dist.extend(dist_out_in[i2, j2])

        if return_dist:
            return imp1, imp2, dist
        else:
            return imp1, imp2
项目:clust    作者:BaselAbujamous    | 项目源码 | 文件源码
def reorderClusters(B, X, GDM, returnOrderIndices = False):
    """Reorder the columns (clusters) of ``B`` so that each cluster is followed
    by the not-yet-placed cluster closest to it.

    Empty clusters (all-False columns) are dropped first. The distance between
    two clusters is the median, over datasets, of the Euclidean distance
    between their mean profiles. The first non-empty cluster stays first.

    Parameters
    ----------
    B : boolean membership matrix, one column per cluster.
    X : list of data arrays, one per dataset.
    GDM : 2-D boolean array; column ``l`` selects the rows of ``B`` that are
        present in dataset ``l``.
    returnOrderIndices : bool
        If True, also return the cluster order as indices into the non-empty
        clusters.

    Returns
    -------
    B_ordered, or (B_ordered, I) when ``returnOrderIndices`` is True.
    NOTE(review): when no non-empty cluster exists the bare matrix is returned
    even if ``returnOrderIndices`` is True (original behaviour, kept).
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters

    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    K = Bloc.shape[1]  # Number of clusters
    L = Xloc.shape[0]  # Number of datasets

    if K == 0:
        return Bloc

    # Find Cmeans and distances between clusters
    Cmeans = np.array([None] * L, dtype=object)
    D = np.zeros([K, K, L])  # KxKxL
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]], dtype=float)  # (K) x (X[l] samples)
        for k in range(K):
            Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
        D[:, :, l] = skdists.euclidean_distances(Cmeans[l])  # KxK
    D = np.median(D, axis=2)  # KxK

    # Set first cluster as first, then find closest by closest
    B_ordered[:, 0] = Bloc[:, 0]
    I = np.zeros(K, dtype=int)
    I[0] = 0
    clustersDone = np.zeros(K, dtype=bool)
    clustersDone[0] = True
    for k in range(1,K):
        # Distances from the previously placed cluster to the remaining ones.
        relevantD = D[I[k-1], ~clustersDone]
        clustersLeft = np.nonzero(~clustersDone)[0]
        nextCluster = np.argmin(relevantD)
        nextCluster = clustersLeft[nextCluster]
        B_ordered[:, k] = Bloc[:, nextCluster]
        I[k] = nextCluster
        clustersDone[nextCluster] = True

    if returnOrderIndices:
        return (B_ordered, I)
    else:
        # The missing `else` made this return unreachable, so the function
        # returned None whenever returnOrderIndices was False.
        return B_ordered
项目:ProjectOfDataMining    作者:IljaNovo    | 项目源码 | 文件源码
def set_preference(data, chunk_size, n_rounds=15):
    """Return a default preference parameter for Affinity Propagation
    clustering, estimated from random subsamples of 'data'.

    In each round, 'chunk_size' rows are drawn without replacement and the
    median of the negated squared Euclidean distances between those rows and
    all rows of 'data' is recorded.

    Parameters
    ----------
    data : array of shape (N_samples, N_features)
        The data-set submitted for Affinity Propagation clustering.

    chunk_size : int
        The size of random subsamples from the data-set whose similarity
        matrix is computed. The resulting median of each subsample's
        similarities is stored into a list of medians.

    n_rounds : int, default 15
        Number of rounds of random subsampling.

    Returns
    -------
    preference : float
        The median of the list of per-round medians.
    """
    N_samples, N_features = data.shape

    medians = []

    # NOTE(review): the original also built row/column index arrays
    # ('triu_indices') that were never used and whose inner loops overwrote
    # the round counter; they are omitted. The body of a trailing
    # `if i % 4 == 3:` was missing from the excerpt and is dropped too.
    for _ in range(n_rounds):
        selected_samples = np.random.choice(N_samples, size = chunk_size, replace = False)
        samples = data[selected_samples, :]

        S = - euclidean_distances(samples, data, squared = True)

        # Record this round's median; without this the final median was taken
        # over an empty list.
        medians.append(np.median(S, overwrite_input = True))

        del S

    preference = np.median(medians)

    return preference