Python scipy.spatial.distance module: squareform() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use scipy.spatial.distance.squareform().
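
Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of squareform()'s two directions: it expands the condensed distance vector returned by pdist() into a square symmetric matrix, and collapses such a matrix back into condensed form:

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(5, 3)            # 5 samples, 3 features
condensed = pdist(X)                # condensed vector, shape (5*4/2,) == (10,)
square = squareform(condensed)      # square matrix, shape (5, 5)
assert np.allclose(square, square.T) and np.all(np.diag(square) == 0)
assert np.allclose(squareform(square), condensed)   # round trip back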

Project: Stein-Variational-Gradient-Descent    Author: DartML    | Project source | File source
def svgd_kernel(self, h = -1):
        sq_dist = pdist(self.theta)
        pairwise_dists = squareform(sq_dist)**2
        if h < 0:  # if h < 0, use the median trick
            h = np.median(pairwise_dists)  
            h = np.sqrt(0.5 * h / np.log(self.theta.shape[0]+1))

        # compute the rbf kernel

        Kxy = np.exp( -pairwise_dists / h**2 / 2)

        dxkxy = -np.matmul(Kxy, self.theta)
        sumkxy = np.sum(Kxy, axis=1)
        for i in range(self.theta.shape[1]):
            dxkxy[:, i] = dxkxy[:,i] + np.multiply(self.theta[:,i],sumkxy)
        dxkxy = dxkxy / (h**2)
        return (Kxy, dxkxy)
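
To see the bandwidth selection in isolation, here is a standalone sketch of the same median-trick RBF kernel computation (an illustration on random data, not part of the project's source):

import numpy as np
from scipy.spatial.distance import pdist, squareform

theta = np.random.randn(50, 2)                  # 50 particles in 2 dimensions
pairwise_dists = squareform(pdist(theta))**2    # squared Euclidean distances
h = np.median(pairwise_dists)                   # median trick for the bandwidth
h = np.sqrt(0.5 * h / np.log(theta.shape[0] + 1))
Kxy = np.exp(-pairwise_dists / h**2 / 2)        # (50, 50) RBF kernel matrix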
Project: Stein-Variational-Gradient-Descent    Author: DartML    | Project source | File source
def svgd_kernel(self, theta, h = -1):
        sq_dist = pdist(theta)
        pairwise_dists = squareform(sq_dist)**2
        if h < 0:  # if h < 0, use the median trick
            h = np.median(pairwise_dists)  
            h = np.sqrt(0.5 * h / np.log(theta.shape[0]+1))

        # compute the rbf kernel
        Kxy = np.exp( -pairwise_dists / h**2 / 2)

        dxkxy = -np.matmul(Kxy, theta)
        sumkxy = np.sum(Kxy, axis=1)
        for i in range(theta.shape[1]):
            dxkxy[:, i] = dxkxy[:,i] + np.multiply(theta[:,i],sumkxy)
        dxkxy = dxkxy / (h**2)
        return (Kxy, dxkxy)
Project: Flavor-Network    Author: lingcheng99    | Project source | File source
def tsne_cluster_cuisine(df,sublist):
    lenlist=[0]
    df_sub = df[df['cuisine']==sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine']==cuisine]
        df_sub = pd.concat([df_sub, temp],axis=0,ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine','recipeName'],axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10,10))
    for i,cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i+1],0],\
        tsne[lenlist[i]:lenlist[i+1],1],c=palette[i],label=sublist[i])
    plt.legend()

#interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
Project: cg    Author: michaelhabeck    | Project source | File source
def hessian(self, x, d=None):
        """
        Computes Hessian matrix
        """
        d = calc_distances(x) if d is None else d
        if d.ndim == 1: d = squareform(d)

        H = np.zeros((3*len(x), 3*len(x)))
        n = self.n

        for i in range(len(x)):
            for j in range(len(x)):

                if j == i: continue

                dx = x[i]-x[j]
                r  = d[i,j]
                h  = n / r**(0.5*n+2) * ((n+2) * np.multiply.outer(dx,dx) - np.eye(3) * r)

                H[3*i:3*(i+1), 3*j:3*(j+1)]  = -h 
                H[3*i:3*(i+1), 3*i:3*(i+1)] +=  h

        return H
Project: mondrian-kernel    Author: matejbalog    | Project source | File source
def construct_data_synthetic_Laplacian(D, lifetime, noise_var, N_train, N_test):
    # pick datapoint locations uniformly at random
    N = N_train + N_test
    X = np.random.rand(N, D)

    # construct kernel matrix
    K = np.exp(- lifetime * squareform(pdist(X, 'cityblock')))

    # sample the function at picked locations x
    y = np.linalg.cholesky(K).dot(np.random.randn(N)) + np.sqrt(noise_var) * np.random.randn(N)

    # pick training indices sequentially
    indices_train = range(0, N_train)
    indices_test  = range(N_train, N)

    # split the data into train and test
    X_train = X[indices_train]
    X_test  = X[indices_test ]
    y_train = y[indices_train]
    y_test  = y[indices_test ]

    return X_train, y_train, X_test, y_test


# SAMPLING
Project: pyannote-audio    Author: pyannote    | Project source | File source
def compute_similarity_matrix(self, parent=None):

        clusters = list(self._models)
        n_clusters = len(clusters)

        X = np.vstack([self[cluster][0] for cluster in clusters])

        nX = l2_normalize(X)
        similarities = -squareform(pdist(nX, metric=self.distance))

        matrix = ValueSortedDict()
        for i, j in itertools.combinations(range(n_clusters), 2):
            matrix[clusters[i], clusters[j]] = similarities[i, j]
            matrix[clusters[j], clusters[i]] = similarities[j, i]

        return matrix
Project: adversarial-variational-bayes    Author: gdikov    | Project source | File source
def compute_dcov_dcorr_statistics(y, alpha):
    """ Compute the statistics to distance covariance/correlation.  

    Parameters
    ----------
    y : (number of samples, dimension)-ndarray
        One row of y corresponds to one sample.
    alpha : float
            0 < alpha < 2
    Returns
    -------
    c : (number of samples, dimension)-ndarray
        Computed statistics.    

    """
    d = squareform(pdist(y))**alpha
    ck = mean(d, axis=0)
    c = d - ck - ck[:, newaxis] + mean(ck)

    return c
Project: icing    Author: slipguru    | Project source | File source
def formClusters(dists, link, distance):
    """Form clusters based on hierarchical clustering of input distance matrix
    with linkage type and cutoff distance
    :param dists: numpy matrix of distances
    :param link: linkage type for hierarchical clustering
    :param distance: distance at which to cut into clusters
    :return: list of cluster assignments
    """
    # Convert the square distance matrix to the condensed form expected by linkage
    dists = squareform(dists)
    # Compute linkage
    links = linkage(dists, link)

    # import matplotlib.pyplot as plt
    # from scipy.cluster import hierarchy
    # plt.figure(figsize=(15,5))
    # p = hierarchy.dendrogram(links)

    # Break into clusters based on cutoff
    clusters = fcluster(links, distance, criterion='distance')
    return clusters
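
A usage sketch for formClusters (hypothetical input, assuming the project's imports of squareform, linkage and fcluster): two tight pairs separated by a large gap split into two clusters when cut at distance 2.0:

import numpy as np

D = np.array([[0., 1., 4., 4.],
              [1., 0., 4., 4.],
              [4., 4., 0., 1.],
              [4., 4., 1., 0.]])
print(formClusters(D, 'average', 2.0))   # e.g. [1 1 2 2]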
Project: Sisyphus    Author: davidbrandfonbrener    | Project source | File source
def plot_hamming_dist(s,W,brec):
    masks = s[:,0,:].T>0
    x_hat = np.zeros(masks.shape)

    for ii in range(masks.shape[1]):
        Weff = W*masks[:,ii]
        x_hat[:,ii] = np.linalg.inv(np.eye(100)-Weff).dot(brec)

    fig = plt.figure()
    plt.pcolormesh(squareform(pdist(np.sign(x_hat[:,:]).T,metric='hamming'))) #,vmax=.3)
    plt.colorbar()
    plt.ylim([0,x_hat.shape[1]])
    plt.xlim([0,x_hat.shape[1]])

    plt.gca().set_aspect('equal')
    plt.title('Hamming Distance Between Putative FPs')
    plt.ylabel('Time')
    plt.xlabel('Time')

    return fig
Project: kerpy    Author: oxmlcs    | Project source | File source
def kernel(self, X, Y=None):

        GenericTests.check_type(X,'X',np.ndarray,2)
        # if X=Y, use more efficient pdist call which exploits symmetry
        if Y is None:
            dists = squareform(pdist(X, 'euclidean'))
        else:
            GenericTests.check_type(Y,'Y',np.ndarray,2)
            assert(shape(X)[1]==shape(Y)[1])
            dists = cdist(X, Y, 'euclidean')
        if self.nu==0.5:
            #for nu=1/2, Matern class corresponds to Ornstein-Uhlenbeck Process
            K = (self.sigma**2.) * exp( -dists / self.width )                 
        elif self.nu==1.5:
            K = (self.sigma**2.) * (1+ sqrt(3.)*dists / self.width) * exp( -sqrt(3.)*dists / self.width )
        elif self.nu==2.5:
            K = (self.sigma**2.) * (1+ sqrt(5.)*dists / self.width + 5.0*(dists**2.) / (3.0*self.width**2.) ) * exp( -sqrt(5.)*dists / self.width )
        else:
            raise NotImplementedError()
        return K
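
The symmetry shortcut mentioned in the comment above is easy to verify; a small sketch (an illustration, not project code):

import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform

X = np.random.rand(20, 3)
# pdist computes each of the n*(n-1)/2 pairs once; cdist computes all n*n entries
assert np.allclose(squareform(pdist(X, 'euclidean')), cdist(X, X, 'euclidean'))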
Project: neural-combinatorial-optimization-rl-tensorflow    Author: MichelDeudon    | Project source | File source
def k_nearest_neighbor(self, sequence):
        # Calculate dist_matrix
        dist_array = pdist(sequence)
        dist_matrix = squareform(dist_array)
        # Construct tour
        new_sequence = [sequence[0]]
        current_city = 0
        visited_cities = [0]
        for i in range(1,len(sequence)):
            j = np.random.randint(0,min(len(sequence)-i,self.kNN))
            next_city = [index for index in dist_matrix[current_city].argsort() if index not in visited_cities][j]
            visited_cities.append(next_city)
            new_sequence.append(sequence[next_city])
            current_city = next_city
        return np.asarray(new_sequence)


    # Generate random TSP-TW instance
Project: analyse_website_dns    Author: mrcheng0910    | Project source | File source
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = dict(nx.all_pairs_shortest_path_length(G))  # dict() for compatibility with networkx >= 2
    distances=numpy.zeros((len(G),len(G)))
    for u,p in path_length.items():
        for v,d in p.items():
            distances[u][v]=d
    # Create hierarchical cluster
    Y=distance.squareform(distances)
    Z=hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership=list(hierarchy.fcluster(Z,t=1.15))
    # Create collection of lists for blockmodel
    partition=defaultdict(list)
    for n,p in zip(list(range(len(G))),membership):
        partition[p].append(n)
    return list(partition.values())
Project: Waskom_PNAS_2017    Author: WagnerLabPapers    | Project source | File source
def create_3D_distance_matrix(vox_ijk, epi_fname):
    """Compute distance between voxels in the volume.

    Parameters
    ----------
    vox_ijk : n x 3 array
        Indices of voxels included in the ROI.
    epi_fname : file path
        Path to image defining the volume space.

    Returns
    -------
    dmat : array
        Dense square distance matrix.

    """
    aff = nib.load(epi_fname).affine
    vox_ras = nib.affines.apply_affine(aff, vox_ijk)
    dmat = squareform(pdist(vox_ras))

    return dmat
Project: LearnHash    Author: galad-loth    | Project source | File source
def PQTrain(data, lenSubVec,numSubCenter):
    (dataSize, dataDim)=data.shape
    if 0!=dataDim%lenSubVec:
        print "Cannot partition the feature space with the given segment number"
        return
    numSubVec = dataDim // lenSubVec
    centers=npy.zeros((numSubVec*numSubCenter,lenSubVec),dtype=npy.float32)
    distOfCenters=npy.zeros((numSubCenter,numSubCenter,numSubVec),dtype=npy.float32)
    objKmeans = KMeans(n_clusters=numSubCenter, init='k-means++', n_init=3, max_iter=100, tol=0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing "+str(ii)+"-th sub-vector")
        objKmeans.fit(data[:,ii*lenSubVec:(ii+1)*lenSubVec]) 
        centers[ii*numSubCenter:(ii+1)*numSubCenter,:]= objKmeans.cluster_centers_
        distOfCenters[:,:,ii]=squareform(pdist(objKmeans.cluster_centers_,metric="euclidean"))
    model={"centers":centers,"distOfCenters":distOfCenters}   
    return model
Project: SVM-CNN    Author: dlmacedo    | Project source | File source
def _compute_centers(self, X, sparse, rs):
        """Generate centers, then compute tau, dF and dN vals"""

        super(GRBFRandomLayer, self)._compute_centers(X, sparse, rs)

        centers = self.components_['centers']
        sorted_distances = np.sort(squareform(pdist(centers)))
        self.dF_vals = sorted_distances[:, -1]
        self.dN_vals = sorted_distances[:, 1]/100.0
        #self.dN_vals = 0.0002 * np.ones(self.dF_vals.shape)

        tauNum = np.log(np.log(self.grbf_lambda) /
                        np.log(1.0 - self.grbf_lambda))

        tauDenom = np.log(self.dF_vals/self.dN_vals)

        self.tau_vals = tauNum/tauDenom

        self._extra_args['taus'] = self.tau_vals

    # get radii according to ref [1]
Project: fuku-ml    Author: fukuball    | Project source | File source
def kernel_matrix(svm_model, original_X):

        if (svm_model.svm_kernel == 'polynomial_kernel' or svm_model.svm_kernel == 'soft_polynomial_kernel'):
            K = (svm_model.zeta + svm_model.gamma * np.dot(original_X, original_X.T)) ** svm_model.Q
        elif (svm_model.svm_kernel == 'gaussian_kernel' or svm_model.svm_kernel == 'soft_gaussian_kernel'):
            pairwise_dists = squareform(pdist(original_X, 'euclidean'))
            K = np.exp(-svm_model.gamma * (pairwise_dists ** 2))

        '''
        K = np.zeros((svm_model.data_num, svm_model.data_num))

        for i in range(svm_model.data_num):
            for j in range(svm_model.data_num):
                if (svm_model.svm_kernel == 'polynomial_kernel' or svm_model.svm_kernel == 'soft_polynomial_kernel'):
                    K[i, j] = Kernel.polynomial_kernel(svm_model, original_X[i], original_X[j])
                elif (svm_model.svm_kernel == 'gaussian_kernel' or svm_model.svm_kernel == 'soft_gaussian_kernel'):
                    K[i, j] = Kernel.gaussian_kernel(svm_model, original_X[i], original_X[j])
        '''

        return K
Project: scanpy    Author: theislab    | Project source | File source
def comp_distance(X, metric='euclidean'):
    """Compute distance matrix for data array X

    Parameters
    ----------
    X : np.ndarray
        Data array (rows store samples, columns store variables).
    metric : string
        For example 'euclidean', 'sqeuclidean', see sp.spatial.distance.pdist.

    Returns
    -------
    D : np.ndarray
        Distance matrix.
    """
    from scipy.spatial import distance
    return distance.squareform(distance.pdist(X, metric=metric))
Project: mglex    Author: fungs    | Project source | File source
def plot_clusters_igraph(responsibilities, color_groups):
    from scipy.spatial.distance import pdist, correlation, squareform
    from igraph import Graph, plot
    data = responsibilities[:, :2]
    Y = pdist(data, hellinger_distance)
    print(Y[:30], file=stderr)
    # return
    g = Graph()
    n = data.shape[0]
    g.add_vertices(n)
    colors = ["grey"]*n
    palette = list(colors_dict.values())
    for j, group in enumerate(color_groups):
        c = palette[j]
        for i in group:
            colors[i] = c
    l = g.layout_mds(dist=squareform(Y))
    plot(g, layout=l, vertex_color=colors, bbox=(1024, 1024), vertex_size=5)


# c&p from stackexchange
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Project: geo-groupings    Author: tquach    | Project source | File source
def group_by_proximity(self, k=10):
        if len(self.points) == 0:
            return {}

        X = numpy.array([[p.lat, p.lon] for p in self.points])

        distance_matrix = distance.squareform(distance.pdist(X))
        db = KMeans(n_clusters=k).fit(distance_matrix)

        # re-attach ids
        grouped_points = {}
        for i, k in enumerate(db.labels_):
            logger.debug('idx, label [%s, %s]', i, k)
            if k not in grouped_points:
                grouped_points[k] = []
            point = self.points[i]
            grouped_points[k].append({'id': point.uid, 'lat': point.lat, 'lon': point.lon})

        logger.info('Finished grouping into %d groups.', len(grouped_points))
        return grouped_points
Project: IgDiscover    Author: NBISweden    | Project source | File source
def cluster_sequences(sequences, minsize=5):
    """
    Cluster the given sequences into groups of similar sequences.

    Return a triple that contains a pandas.DataFrame with the edit distances,
    the linkage result, and a list that maps sequence ids to their cluster id.
    If an entry is zero in that list, it means that the sequence is not part of
    a cluster.
    """
    matrix = distances(sequences)
    linkage = hierarchy.linkage(distance.squareform(matrix), method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(linkage))
    prev = linkage[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    cl = 1
    for n in inner:
        if n.dist > 0 and prev / n.dist < 0.8 \
                and n.left.count >= minsize and n.right.count >= minsize:
            for id in collect_ids(n.left):
                # Do not overwrite previously assigned ids
                if clusters[id] == 0:
                    clusters[id] = cl
            cl += 1
        prev = n.dist
    # At the end of the above loop, we have not processed the rightmost
    # subtree. In our experiments, it never contains true novel sequences,
    # so we omit it.

    return pd.DataFrame(matrix), linkage, clusters
Project: cellranger    Author: 10XGenomics    | Project source | File source
def compute_db_index(matrix, kmeans):
    '''
    Compute Davies-Bouldin index, a measure of clustering quality.
    Faster and possibly more reliable than silhouette score.
    '''
    (n, m) = matrix.shape
    k = kmeans.n_clusters

    centers = kmeans.cluster_centers_
    labels = kmeans.labels_

    centroid_dists = sp_dist.squareform(sp_dist.pdist(centers))
    # Avoid divide-by-zero
    centroid_dists[np.abs(centroid_dists) < MIN_CENTROID_DIST] = MIN_CENTROID_DIST

    wss = np.zeros(k)
    counts = np.zeros(k)

    for i in range(n):
        label = labels[i]
        # note: this is 2x faster than scipy sqeuclidean
        sqdist = np.square(matrix[i,:] - centers[label,:]).sum()
        wss[label] += sqdist
        counts[label] += 1

    # Handle empty clusters
    counts[counts == 0] = 1

    scatter = np.sqrt(wss / counts)
    mixitude = (scatter + scatter[:, np.newaxis]) / centroid_dists
    np.fill_diagonal(mixitude, 0.0)

    worst_case_mixitude = np.max(mixitude, axis=1)
    db_score = worst_case_mixitude.sum() / k

    return db_score
Project: polo    Author: adrianveres    | Project source | File source
def run_orange3(Z, D):
    import Orange.clustering.hierarchical as orange_hier

    tree = orange_hier.tree_from_linkage(Z)
    start_time = time.time()
    orange_hier.optimal_leaf_ordering(tree, squareform(D))
    end_time = time.time()
    return end_time - start_time, None
Project: kernel_goodness_of_fit    Author: karlnapf    | Project source | File source
def kernel_matrix(self, X):

        # check for stupid mistake
        assert X.shape[0] > X.shape[1]

        sq_dists = squareform(pdist(X, 'sqeuclidean'))

        K = np.exp(-sq_dists/ self.scaling)
        return K
Project: kernel_goodness_of_fit    Author: karlnapf    | Project source | File source
def k_multiple(self, X):
        """
        Efficient computation of kernel matrix without loops

        Effectively does the same as calling self.k on all pairs of the input
        """
        assert(X.ndim == 1)

        sq_dists = squareform(pdist(X.reshape(len(X), 1), 'sqeuclidean'))

        K = np.exp(-(sq_dists) / self.scaling)
        return K
Project: kernel_goodness_of_fit    Author: karlnapf    | Project source | File source
def k_multiple_dim(self, X):

        # check for stupid mistake
        assert X.shape[0] > X.shape[1]

        sq_dists = squareform(pdist(X, 'sqeuclidean'))

        K = np.exp(-(sq_dists) / self.scaling)
        return K
Project: kernel_goodness_of_fit    Author: karlnapf    | Project source | File source
def k_multiple(self, X):
        """
        Efficient computation of kernel matrix without loops

        Effectively does the same as calling self.k on all pairs of the input
        """
        assert(X.ndim == 1)

        sq_dists = squareform(pdist(X.reshape(len(X), 1), 'sqeuclidean'))

        K = np.exp(-(sq_dists) / self.scaling)
        return K
Project: kernel_goodness_of_fit    Author: karlnapf    | Project source | File source
def k_multiple_dim(self, X):

        # check for stupid mistake
        assert X.shape[0] > X.shape[1]

        sq_dists = squareform(pdist(X, 'sqeuclidean'))

        K = np.exp(-(sq_dists) / self.scaling)
        return K
Project: Flavor-Network    Author: lingcheng99    | Project source | File source
def plot_bokeh(df,sublist,filename):
    lenlist=[0]
    df_sub = df[df['cuisine']==sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine']==cuisine]
        df_sub = pd.concat([df_sub, temp],axis=0,ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine','recipeName'],axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)
    #cannot use seaborn palette for bokeh
    palette =['red','green','blue','yellow']
    colors =[]
    for i in range(len(sublist)):
        for j in range(lenlist[i+1]-lenlist[i]):
            colors.append(palette[i])
    #plot with bokeh
    output_file(filename)
    source = ColumnDataSource(
            data=dict(x=tsne[:,0],y=tsne[:,1],
                cuisine = df_sub['cuisine'],
                recipe = df_sub['recipeName']))

    hover = HoverTool(tooltips=[
                ("cuisine", "@cuisine"),
                ("recipe", "@recipe")])

    p = figure(plot_width=1000, plot_height=1000, tools=[hover],
               title="flavor clustering")

    p.circle('x', 'y', size=10, source=source,fill_color=colors)

    show(p)
Project: cg    Author: michaelhabeck    | Project source | File source
def gradient(self, x, D=None):
        """
        Return gradient
        """
        dx = np.array([np.subtract.outer(x[:,d],x[:,d]) for d in range(x.shape[1])])

        D  = calc_distances(x) if D is None else D
        D  = D**(-0.5*self.n-1)
        D  = squareform(D)

        g = - self.n * dx * D

        return g.sum(-1).T
Project: cg    Author: michaelhabeck    | Project source | File source
def gradient(self, x):

        d = self._distances
        if d is not None and np.ndim(d) == 1: d = squareform(d)

        return np.sum([self.params[k] * self.features[k].gradient(x,d)
                       for k in range(self.K)],0)
Project: OpenTDA    Author: outlace    | Project source | File source
def buildGraph(data, epsilon=1., metric='euclidean', p=2):
    D = squareform(pdist(data, metric=metric, p=p))
    D[D >= epsilon] = 0.
    G = nx.Graph(D)
    edges = list(map(set, G.edges()))
    weights = [G.get_edge_data(u, v)['weight'] for u, v in G.edges()]
    return G.nodes(), edges, weights
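
A possible usage sketch for buildGraph (illustrative only, assuming the project's imports of networkx and pdist/squareform): points closer than epsilon are connected by an edge weighted with their distance:

import numpy as np

pts = np.random.rand(30, 2)
nodes, edges, weights = buildGraph(pts, epsilon=0.25)
# nodes are 0..29; each edge {u, v} satisfies 0 < dist(u, v) < 0.25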
Project: mitre    Author: gerberlab    | Project source | File source
def cluster(target_sequence_ids, fasta_filename, method='average'):
    """ Form distance-based hierachical clustering of sequences.

    Looks up each entry in target_sequence_ids in the file 
    specified by fasta_filename to obtain an associated DNA 
    sequence. 

    In principle, we could just work with the Hamming distance, but 
    the sequences may be of different lengths (mostly small 
    differences.) So we need a more sophisticated approach: we use
    pairwise global alignment, scoring 0 for a match, -1 for mismatch,
    and -1.5 for opening or extending a gap. We then take the distance
    to be -1.0*(score). 

    UPGMA clustering is used when method='average', the default.

    Returns the distance matrix and the linkage matrix returned
    by the clustering routine.

    """
    # globalms arguments: seq1, seq2, match, mismatch, open, extend
    distance = lambda seq1, seq2: -1.0*(
        pairwise2.align.globalms(seq1,seq2,0,-1,-1.5,-1.5, score_only=True)
    )
    sequences = fasta_to_dict(fasta_filename)
    N = len(target_sequence_ids)
    distances = np.zeros((N,N))
    # fill in the upper triangle
    for i,seqid1 in enumerate(target_sequence_ids):
        seq1 = sequences[seqid1]
        for j_offset, seqid2 in enumerate(target_sequence_ids[i+1:]):
            j = j_offset + i + 1
            seq2 = sequences[seqid2]
            distances[i][j] = distance(seq1, seq2)
    # convert to the form expected by the scipy clustering routines
    y = squareform(distances,checks=False)
    return distances, hierarchy.linkage(y,method)
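
Note that the loop above fills only the upper triangle, so the matrix is not symmetric; squareform(..., checks=False) skips the symmetry check and reads just the upper triangle. A small sketch (an illustration, not project code):

import numpy as np
from scipy.spatial.distance import squareform

upper = np.array([[0., 3., 5.],
                  [0., 0., 4.],
                  [0., 0., 0.]])
print(squareform(upper, checks=False))   # [3. 5. 4.], i.e. d(0,1), d(0,2), d(1,2)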
Project: 3D_Dense_Transformer_Networks    Author: JohnYC1995    | Project source | File source
def makeT(self,cp):
        # cp: [(k*k*k) x 3] control points
        # T: [((k*k*k)+4) x ((k*k*k)+4)]
        K = cp.shape[0]
        T = np.zeros((K+4, K+4))
        T[:K, 0] = 1
        T[:K, 1:4] = cp
        T[K, 4:] = 1
        T[K+1:, 4:] = cp.T
        R = squareform(pdist(cp, metric='euclidean'))
        R = R * R
        R[R == 0] = 1  # a trick to make R*ln(R) zero where R == 0
        R = R * np.log(R)
        np.fill_diagonal(R, 0)
        T[:K, 4:] = R
        return T
Project: gpam_stats    Author: ricoms    | Project source | File source
def n1_fraction_borderline(data):

    def get_n1_for_round(sparse_matrix, y):
        Tcsr = minimum_spanning_tree(sparse_matrix)
        borders = set()
        a = Tcsr.nonzero()[0]
        b = Tcsr.nonzero()[1]

        for i in range(len(a)):
            if (y[a[i]] != y[b[i]]):
                borders.add(a[i])
                borders.add(b[i])
        n1 = len(borders)
        return n1

    features = data.columns[:-1, ]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    sparse_matrix = csr_matrix(df_dist.values)

    labels = data.columns[-1]
    y = data[labels]

    n1 = 0
    rounds = 10

    for round in range(rounds):
        n1 = n1 + get_n1_for_round(sparse_matrix, y)

    n = len(data)
    n1 = (1.0 * n1) / (rounds * n)

    return n1
Project: gpam_stats    Author: ricoms    | Project source | File source
def n2_ratio_intra_extra_class_nearest_neighbor_distance(data):

    features = data.columns[:-1,]
    labels = data.columns[-1]

    dist    = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))

    max_size = df_dist.copy( )
    max_size.iloc[:, :] = False

    classes = data.iloc[ :, -1].unique()
    n = data.shape[0]

    n2 = 0
    cl = None  # sentinel: no class seen yet, so the first row triggers the index lookup
    intra_min = 0
    inter_min = 0
    for i in range(data.shape[0]):
        ci = data.iloc[i, -1]
        if ci != cl:
            cl = ci
            intra_idx = data[data[labels] == ci].index.values.tolist()
            inter_idx = data[data[labels] != ci].index.values
        intra_idx.remove(i)
        intra_min = intra_min + df_dist.iloc[intra_idx, i].min()
        inter_min = inter_min + df_dist.iloc[inter_idx, i].min()
        intra_idx.append(i)

    # handle the case inter_min == 0 (avoid division by zero)
    if inter_min == 0:
        inter_min = 1

    n2 = (1.0 * intra_min) / (1.0 * inter_min)

    return n2
Project: genepred    Author: egorbarsukoff    | Project source | File source
def start_clustering(self):
        functions.log('Calculate {0} distances...'.format(int(len(self.orfs) * (len(self.orfs) + 1) / 2)))
        self.distances = self.create_distance_matrix()
        functions.log('Start clustering...')
        self.linkage_matrix = scipy.cluster.hierarchy.linkage(ssd.squareform(self.distances), method='complete')
        functions.log('Clustering done.')
Project: luna16    Author: gzuidhof    | Project source | File source
def merge_candidates_scan(candidates, seriesuid, distance=5.):
    distances = pdist(candidates, metric='euclidean')
    adjacency_matrix = squareform(distances)

    # Determine nodes within distance, replace by 1 (=adjacency matrix)
    adjacency_matrix = np.where(adjacency_matrix<=distance,1,0)

    # Determine all connected components in the graph
    n, labels = connected_components(adjacency_matrix)
    new_candidates = np.zeros((n,3))

    # Take the mean for these connected components
    for cluster_i in range(n):
        points = candidates[np.where(labels==cluster_i)]
        center = np.mean(points,axis=0)
        new_candidates[cluster_i,:] = center

    x = new_candidates[:,0]
    y = new_candidates[:,1]
    z = new_candidates[:,2]
    labels = [seriesuid]*len(x)
    class_name = [0]*len(x)

    data = list(zip(labels, x, y, z, class_name))  # list() so pandas accepts it under Python 3

    new_candidates = pd.DataFrame(data,columns=CANDIDATES_COLUMNS)

    return new_candidates
Project: lddmm-ot    Author: jeanfeydy    | Project source | File source
def precompute_kernels(self, q) :
        """
        Returns a tuple of kernel, kernel', kernel'' matrices at position q.
        """
        x = q.reshape((self.npoints, self.dimension))
        dists = squareform(pdist(x, 'sqeuclidean'))
        K = exp(- dists / (2* self.kernel_scale ** 2))

        return (  K, 
                - K / (2* self.kernel_scale ** 2), 
                  K / (4* self.kernel_scale ** 4))
Project: lddmm-ot    Author: jeanfeydy    | Project source | File source
def dq_Kqp_a(self,q,p,a, kernels) :
        """
        Useful for the adjoint integration scheme.
        d_q (K_q p) . a  = ...
        """
        h = 1e-8
        Q0phA = q + h*a
        Q0mhA = q - h*a
        update_emp =  (  Landmarks.K(self, Q0phA, p, Landmarks.precompute_kernels(self, Q0phA))
                      -  Landmarks.K(self, Q0mhA, p, Landmarks.precompute_kernels(self, Q0mhA))) / (2*h)
        return update_emp

        """x = q.reshape((self.npoints, self.dimension))
        p = p.reshape((self.npoints, self.dimension))
        a = a.reshape((self.npoints, self.dimension))
        dists = squareform(pdist(x, 'sqeuclidean')) # dists_ij       = |x_i-x_j|^2
        # We have :
        # [K_q p]_nd = sum_j { k(|x_n - x_j|^2) * p_j^d }
        #
        # So that :
        # grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
        grad = zeros((self.npoints, self.dimension))
        for d in range(self.dimension) :
            diffs = atleast_2d(x[:,d]).T - x[:,d]  # diffs_ij = x_i^d - x_j^d

            # K_ij = 2 * (x_i^d - x_j^d) * k'(|x_i - x_j|^2) * p_j^d
            K = 2 * dists * kernels[1] * p[:,d]
            # grad_nd =   a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
            grad[:,d] = a[:,d] * sum( K , 1 )
        return grad.reshape((self.npoints * self.dimension,))"""
Project: Learning-sentence-representation-with-guidance-of-human-attention    Author: wangshaonan    | Project source | File source
def getPairsFast(d, type):
    X = []
    T = []
    pairs = []
    for i in range(len(d)):
        (p1,p2) = d[i]
        X.append(p1.representation)
        X.append(p2.representation)
        T.append(p1)
        T.append(p2)

    arr = pdist(X,'cosine')
    arr = squareform(arr)

    for i in range(len(arr)):
        arr[i,i]=1
        if i % 2 == 0:
            arr[i,i+1] = 1
        else:
            arr[i,i-1] = 1

    arr = np.argmin(arr,axis=1)
    for i in range(len(d)):
        (t1,t2) = d[i]
        p1 = None
        p2 = None
        if type == "MAX":
            p1 = T[arr[2*i]]
            p2 = T[arr[2*i+1]]
        if type == "RAND":
            p1 = getPairRand(d,i)
            p2 = getPairRand(d,i)
        if type == "MIX":
            p1 = getPairMixScore(d,i,T[arr[2*i]])
            p2 = getPairMixScore(d,i,T[arr[2*i+1]])
        pairs.append((p1,p2))
    return pairs
Project: twitter_LDA_topic_modeling    Author: kenneth-orton    | Project source | File source
def cao_juan_2009(topic_term_dists, num_topics):
    cos_pdists = squareform(pdist(topic_term_dists, metric='cosine')) 
    return np.sum(cos_pdists) / (num_topics*(num_topics - 1)/2)
Project: twitter_LDA_topic_modeling    Author: kenneth-orton    | Project source | File source
def deveaud_2014(topic_term_dists, num_topics):
    jsd_pdists = squareform(pdist(topic_term_dists, metric=jensen_shannon)) 
    return np.sum(jsd_pdists) / (num_topics*(num_topics - 1))
Project: protein-interaction-network    Author: ericmjl    | Project source | File source
def compute_distmat(self, dataframe):
        """
        Computes the pairwise euclidean distances between every atom.

        Design choice: passed in a DataFrame to enable easier testing on
        dummy data.
        """

        self.eucl_dists = pdist(dataframe[['x', 'y', 'z']],
                                metric='euclidean')
        self.eucl_dists = pd.DataFrame(squareform(self.eucl_dists))
        self.eucl_dists.index = dataframe.index
        self.eucl_dists.columns = dataframe.index

        return self.eucl_dists
Project: eqnet    Author: mast-group    | Project source | File source
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str, print_stats: bool = False):
    """Compute the ratio of the avg distance of points within an equivalence class vs the avg distance between all points"""
    data = import_data(data_filename)
    encodings = []
    equivalence_sets = []

    for name, code in data.items():
        idx = len(encodings)
        enc = encoder.get_encoding(code['original'])
        assert not np.isnan(np.sum(enc))
        encodings.append(enc)
        for noisy_sample in code['noise']:
            enc = encoder.get_encoding(noisy_sample)
            assert not np.isnan(np.sum(enc))
            encodings.append(enc)
        equivalence_sets.append(set(range(idx, len(encodings))))

    encodings = np.array(encodings)

    all_distances = squareform(pdist(encodings, 'cosine'))  # TODO: avoid square form somehow
    assert not np.any(np.isnan(all_distances))

    # Average the lower triangle of all_distances
    avg_distance_between_all_points = np.sum(np.tril(all_distances, k=-1)) / (len(encodings) * (len(encodings) - 1) / 2)

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for equiv_class_idxs in equivalence_sets:
        num_elements_in_class = len(equiv_class_idxs)
        if num_elements_in_class < 2:
            continue
        elems_in_eq_class = np.fromiter(equiv_class_idxs, dtype=np.int32)
        sum_distance_within_eq_class += np.sum(np.tril(all_distances[elems_in_eq_class][:, elems_in_eq_class], k=-1))
        num_pairs += num_elements_in_class * (num_elements_in_class - 1) / 2

    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs
    if print_stats:
        print(
            "Within Avg Dist: %s  All Avg Dist: %s " % (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
Project: CElegansBehaviour    Author: ChristophKirst    | Project source | File source
def sort_points_to_line(vertices, start = 0):
  """Sorts points to a line by sequentiall connectoing nearest points

  Arguments:
    vertices (nx2 array): vertices of the line
    start (int): start index

  Returns:
    nx2 array: sorted points
  """

  d = squareform(pdist(vertices))

  i = start
  n = vertices.shape[0]
  uidx = np.ones(n, dtype=bool)   # True for points not yet placed on the line
  uidx[i] = False
  sidx = [i]

  while np.sum(uidx) > 0:
    i = np.argmin(d[i][uidx])     # nearest unused point to the current one
    i = np.where(uidx)[0][i]      # map masked index back to the original index
    sidx.append(i)
    uidx[i] = False

  return vertices[sidx]
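
A brief usage sketch (illustrative data, not from the project; assumes the pdist/squareform imports used above):

import numpy as np

pts = np.array([[0., 0.], [2., 0.], [1., 0.], [3., 0.]])
print(sort_points_to_line(pts))   # greedily orders the points along the line:
# [[0. 0.]
#  [1. 0.]
#  [2. 0.]
#  [3. 0.]]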
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | Project source | File source
def predict(otrain):
    binary = (otrain > 0)
    norm = NormalizePositive(axis=1)
    train = norm.fit_transform(otrain)

    dists = distance.pdist(binary, 'correlation')
    dists = distance.squareform(dists)

    neighbors = dists.argsort(axis=1)
    filled = train.copy()
    for u in range(filled.shape[0]):
        # n_u are the neighbors of user
        n_u = neighbors[u, 1:]
        for m in range(filled.shape[1]):
            # This code could be faster using numpy indexing trickery, at the
            # cost of readability (this is left as an exercise for the reader):
            revs = [train[neigh, m]
                    for neigh in n_u
                    if binary[neigh, m]]
            if len(revs):
                n = len(revs)
                n //= 2
                n += 1
                revs = revs[:n]
                filled[u,m] = np.mean(revs)

    return norm.inverse_transform(filled)
Project: kerpy    Author: oxmlcs    | Project source | File source
def kernel(self, X, Y=None):

        GenericTests.check_type(X,'X',np.ndarray,2)
        # if X=Y, use more efficient pdist call which exploits symmetry
        normX=reshape(np.linalg.norm(X,axis=1),(len(X),1))
        if Y is None:
            dists = squareform(pdist(X, 'euclidean'))
            normY=normX.T
        else:
            GenericTests.check_type(Y,'Y',np.ndarray,2)
            assert(shape(X)[1]==shape(Y)[1])
            normY=reshape(np.linalg.norm(Y,axis=1),(1,len(Y)))
            dists = cdist(X, Y, 'euclidean')
        K=0.5*(normX**self.alpha+normY**self.alpha-dists**self.alpha)
        return K
Project: kerpy    Author: oxmlcs    | Project source | File source
def kernel(self, X, Y=None):
        """
        Computes the hypercube kernel k(x,y)=tanh(gamma)^d(x,y), where d is the
        Hamming distance between x and y

        X - 2d boolean numpy array, samples on the right-hand side
        Y - 2d boolean numpy array, samples on the left-hand side.
            Can be None, in which case it is replaced by X
        """

        if not type(X) is numpy.ndarray:
            raise TypeError("X must be numpy array")

        if not len(X.shape) == 2:
            raise ValueError("X must be 2D numpy array")

        if not X.dtype == numpy.bool_:
            raise ValueError("X must be boolean numpy array")

        if not Y is None:
            if not type(Y) is numpy.ndarray:
                raise TypeError("Y must be None or numpy array")

            if not len(Y.shape) == 2:
                raise ValueError("Y must be None or 2D numpy array")

            if not Y.dtype == numpy.bool_:
                raise ValueError("Y must be boolean numpy array")

            if not X.shape[1] == Y.shape[1]:
                raise ValueError("X and Y must have same dimension if Y is not None")

        # un-normalise normalised hamming distance in both cases
        if Y is None:
            K = tanh(self.gamma) ** squareform(pdist(X, 'hamming') * X.shape[1])
        else:
            K = tanh(self.gamma) ** (cdist(X, Y, 'hamming') * X.shape[1])

        return K
Project: kerpy    Author: oxmlcs    | Project source | File source
def kernel(self, X, Y=None):
        """
        Computes the standard Gaussian kernel k(x,y)=exp(-0.5* ||x-y||**2 / sigma**2)

        X - 2d numpy.ndarray, first set of samples:
            number of rows: number of samples
            number of columns: dimensionality
        Y - 2d numpy.ndarray, second set of samples, can be None in which case its replaced by X
        """
        if self.is_sparse:
            X = X.todense()
            if Y is not None:
                Y = Y.todense()
        GenericTests.check_type(X, 'X',np.ndarray)
        assert(len(shape(X))==2)

        # if X=Y, use more efficient pdist call which exploits symmetry
        if Y is None:
            sq_dists = squareform(pdist(X, 'sqeuclidean'))
        else:
            GenericTests.check_type(Y, 'Y',np.ndarray)
            assert(len(shape(Y))==2)
            assert(shape(X)[1]==shape(Y)[1])
            sq_dists = cdist(X, Y, 'sqeuclidean')

        K = exp(-0.5 * (sq_dists) / self.width ** 2)
        return K