The following 50 code examples, extracted from open-source Python projects, illustrate how to use scipy.spatial.distance.squareform().
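As a quick reference for the examples below: squareform() converts between the condensed distance vector returned by pdist() and the redundant square distance matrix, in either direction. A minimal sketch (array shapes chosen arbitrarily for illustration):

from scipy.spatial.distance import pdist, squareform
import numpy as np

X = np.random.rand(5, 3)      # 5 samples in 3 dimensions
condensed = pdist(X)          # condensed vector of the C(5,2) = 10 pairwise distances
D = squareform(condensed)     # 5 x 5 symmetric matrix with a zero diagonal
assert D.shape == (5, 5)
assert np.allclose(squareform(D), condensed)  # square -> condensed round-trip

Most of the examples that follow use the vector-to-matrix direction to obtain a full pairwise distance matrix; a few use the matrix-to-vector direction, e.g. before calling scipy.cluster.hierarchy.linkage, which expects the condensed form.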
def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, using median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))

    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)

    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)
def svgd_kernel(self, theta, h=-1):
    sq_dist = pdist(theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, using median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(theta.shape[0] + 1))

    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)

    dxkxy = -np.matmul(Kxy, theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)
def tsne_cluster_cuisine(df, sublist):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    for i, cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i+1], 0],
                    tsne[lenlist[i]:lenlist[i+1], 1],
                    c=palette[i], label=sublist[i])
    plt.legend()


# interactive plot with bokeh; set up for four categories, with color palette;
# pass in df for either ingredient or flavor
def hessian(self, x, d=None):
    """
    Computes Hessian matrix
    """
    d = calc_distances(x) if d is None else d
    if d.ndim == 1:
        d = squareform(d)

    H = np.zeros((3*len(x), 3*len(x)))
    n = self.n

    for i in range(len(x)):
        for j in range(len(x)):
            if j == i:
                continue
            dx = x[i] - x[j]
            r = d[i, j]
            h = n / r**(0.5*n+2) * ((n+2) * np.multiply.outer(dx, dx) - np.eye(3) * r)
            H[3*i:3*(i+1), 3*j:3*(j+1)] = -h
            H[3*i:3*(i+1), 3*i:3*(i+1)] += h

    return H
def construct_data_synthetic_Laplacian(D, lifetime, noise_var, N_train, N_test):
    # pick datapoint locations uniformly at random
    N = N_train + N_test
    X = np.random.rand(N, D)

    # construct kernel matrix
    K = scipy.exp(-lifetime * squareform(pdist(X, 'cityblock')))

    # sample the function at picked locations x
    y = np.linalg.cholesky(K).dot(np.random.randn(N)) + np.sqrt(noise_var) * np.random.randn(N)

    # pick training indices sequentially
    indices_train = range(0, N_train)
    indices_test = range(N_train, N)

    # split the data into train and test
    X_train = X[indices_train]
    X_test = X[indices_test]
    y_train = y[indices_train]
    y_test = y[indices_test]

    return X_train, y_train, X_test, y_test


# SAMPLING
def compute_similarity_matrix(self, parent=None):
    clusters = list(self._models)
    n_clusters = len(clusters)

    X = np.vstack([self[cluster][0] for cluster in clusters])
    nX = l2_normalize(X)

    similarities = -squareform(pdist(nX, metric=self.distance))

    matrix = ValueSortedDict()
    for i, j in itertools.combinations(range(n_clusters), 2):
        matrix[clusters[i], clusters[j]] = similarities[i, j]
        matrix[clusters[j], clusters[i]] = similarities[j, i]

    return matrix
def compute_dcov_dcorr_statistics(y, alpha):
    """ Compute the statistics to distance covariance/correlation.

    Parameters
    ----------
    y : (number of samples, dimension)-ndarray
        One row of y corresponds to one sample.
    alpha : float
        0 < alpha < 2

    Returns
    -------
    c : (number of samples, dimension)-ndarray
        Computed statistics.
    """
    d = squareform(pdist(y))**alpha
    ck = mean(d, axis=0)
    c = d - ck - ck[:, newaxis] + mean(ck)

    return c
def formClusters(dists, link, distance):
    """Form clusters based on hierarchical clustering of input distance
    matrix with linkage type and cutoff distance

    :param dists: numpy matrix of distances
    :param link: linkage type for hierarchical clustering
    :param distance: distance at which to cut into clusters
    :return: list of cluster assignments
    """
    # Convert squareform distance matrix to condensed form
    dists = squareform(dists)
    # Compute linkage
    links = linkage(dists, link)

    # import matplotlib.pyplot as plt
    # from scipy.cluster import hierarchy
    # plt.figure(figsize=(15,5))
    # p = hierarchy.dendrogram(links)

    # Break into clusters based on cutoff
    clusters = fcluster(links, distance, criterion='distance')
    return clusters
def plot_hamming_dist(s, W, brec):
    masks = s[:, 0, :].T > 0
    x_hat = np.zeros(masks.shape)
    for ii in range(masks.shape[1]):
        Weff = W * masks[:, ii]
        x_hat[:, ii] = np.linalg.inv(np.eye(100) - Weff).dot(brec)

    fig = plt.figure()
    plt.pcolormesh(squareform(pdist(np.sign(x_hat[:, :]).T, metric='hamming')))  # ,vmax=.3)
    plt.colorbar()
    plt.ylim([0, x_hat.shape[1]])
    plt.xlim([0, x_hat.shape[1]])
    plt.axes().set_aspect('equal')
    plt.title('Hamming Distance Between Putative FPs')
    plt.ylabel('Time')
    plt.xlabel('Time')

    return fig
def kernel(self, X, Y=None):
    GenericTests.check_type(X, 'X', np.ndarray, 2)
    # if X=Y, use more efficient pdist call which exploits symmetry
    if Y is None:
        dists = squareform(pdist(X, 'euclidean'))
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray, 2)
        assert(shape(X)[1] == shape(Y)[1])
        dists = cdist(X, Y, 'euclidean')
    if self.nu == 0.5:
        # for nu=1/2, Matern class corresponds to Ornstein-Uhlenbeck Process
        K = (self.sigma**2.) * exp(-dists / self.width)
    elif self.nu == 1.5:
        K = (self.sigma**2.) * (1 + sqrt(3.) * dists / self.width) * exp(-sqrt(3.) * dists / self.width)
    elif self.nu == 2.5:
        K = (self.sigma**2.) * (1 + sqrt(5.) * dists / self.width + 5.0 * (dists**2.) / (3.0 * self.width**2.)) * exp(-sqrt(5.) * dists / self.width)
    else:
        raise NotImplementedError()
    return K
def k_nearest_neighbor(self, sequence):
    # Calculate dist_matrix
    dist_array = pdist(sequence)
    dist_matrix = squareform(dist_array)
    # Construct tour
    new_sequence = [sequence[0]]
    current_city = 0
    visited_cities = [0]
    for i in range(1, len(sequence)):
        j = np.random.randint(0, min(len(sequence) - i, self.kNN))
        next_city = [index for index in dist_matrix[current_city].argsort()
                     if index not in visited_cities][j]
        visited_cities.append(next_city)
        new_sequence.append(sequence[next_city])
        current_city = next_city
    return np.asarray(new_sequence)


# Generate random TSP-TW instance
def create_hc(G):
    """Creates hierarchical cluster of graph G from distance matrix"""
    path_length = nx.all_pairs_shortest_path_length(G)
    distances = numpy.zeros((len(G), len(G)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=1.15))
    # Create collection of lists for blockmodel
    partition = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partition[p].append(n)
    return list(partition.values())
def create_3D_distance_matrix(vox_ijk, epi_fname):
    """Compute distance between voxels in the volume.

    Parameters
    ----------
    vox_ijk : n x 3 array
        Indices of voxels included in the ROI.
    epi_fname : file path
        Path to image defining the volume space.

    Returns
    -------
    dmat : array
        Dense square distance matrix.
    """
    aff = nib.load(epi_fname).affine
    vox_ras = nib.affines.apply_affine(aff, vox_ijk)
    dmat = squareform(pdist(vox_ras))

    return dmat
def PQTrain(data, lenSubVec, numSubCenter):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
    centers = npy.zeros((numSubVec * numSubCenter, lenSubVec), dtype=npy.float32)
    distOfCenters = npy.zeros((numSubCenter, numSubCenter, numSubVec), dtype=npy.float32)
    objKmeans = KMeans(numSubCenter, 'k-means++', 3, 100, 0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing " + str(ii) + "-th sub-vector")
        objKmeans.fit(data[:, ii*lenSubVec:(ii+1)*lenSubVec])
        centers[ii*numSubCenter:(ii+1)*numSubCenter, :] = objKmeans.cluster_centers_
        distOfCenters[:, :, ii] = squareform(pdist(objKmeans.cluster_centers_, metric="euclidean"))
    model = {"centers": centers, "distOfCenters": distOfCenters}
    return model
def _compute_centers(self, X, sparse, rs):
    """Generate centers, then compute tau, dF and dN vals"""
    super(GRBFRandomLayer, self)._compute_centers(X, sparse, rs)

    centers = self.components_['centers']
    sorted_distances = np.sort(squareform(pdist(centers)))
    self.dF_vals = sorted_distances[:, -1]
    self.dN_vals = sorted_distances[:, 1] / 100.0
    #self.dN_vals = 0.0002 * np.ones(self.dF_vals.shape)

    tauNum = np.log(np.log(self.grbf_lambda) /
                    np.log(1.0 - self.grbf_lambda))

    tauDenom = np.log(self.dF_vals / self.dN_vals)

    self.tau_vals = tauNum / tauDenom

    self._extra_args['taus'] = self.tau_vals


# get radii according to ref [1]
def kernel_matrix(svm_model, original_X):
    if (svm_model.svm_kernel == 'polynomial_kernel' or svm_model.svm_kernel == 'soft_polynomial_kernel'):
        K = (svm_model.zeta + svm_model.gamma * np.dot(original_X, original_X.T)) ** svm_model.Q
    elif (svm_model.svm_kernel == 'gaussian_kernel' or svm_model.svm_kernel == 'soft_gaussian_kernel'):
        pairwise_dists = squareform(pdist(original_X, 'euclidean'))
        K = np.exp(-svm_model.gamma * (pairwise_dists ** 2))

    '''
    K = np.zeros((svm_model.data_num, svm_model.data_num))

    for i in range(svm_model.data_num):
        for j in range(svm_model.data_num):
            if (svm_model.svm_kernel == 'polynomial_kernel' or svm_model.svm_kernel == 'soft_polynomial_kernel'):
                K[i, j] = Kernel.polynomial_kernel(svm_model, original_X[i], original_X[j])
            elif (svm_model.svm_kernel == 'gaussian_kernel' or svm_model.svm_kernel == 'soft_gaussian_kernel'):
                K[i, j] = Kernel.gaussian_kernel(svm_model, original_X[i], original_X[j])
    '''

    return K
def comp_distance(X, metric='euclidean'):
    """Compute distance matrix for data array X

    Parameters
    ----------
    X : np.ndarray
        Data array (rows store samples, columns store variables).
    metric : string
        For example 'euclidean', 'sqeuclidean', see sp.spatial.distance.pdist.

    Returns
    -------
    D : np.ndarray
        Distance matrix.
    """
    from scipy.spatial import distance
    return distance.squareform(distance.pdist(X, metric=metric))
def plot_clusters_igraph(responsibilities, color_groups):
    from scipy.spatial.distance import pdist, correlation, squareform
    from igraph import Graph, plot
    data = responsibilities[:, :2]
    Y = pdist(data, hellinger_distance)
    print(Y[:30], file=stderr)
    # return
    g = Graph()
    n = data.shape[0]
    g.add_vertices(n)
    colors = ["grey"] * n
    palette = list(colors_dict.values())
    for j, group in enumerate(color_groups):
        c = palette[j]
        for i in group:
            colors[i] = c
    l = g.layout_mds(dist=squareform(Y))
    plot(g, layout=l, vertex_color=colors,
         bbox=(1024, 1024), vertex_size=5)


# c&p from stackexchange
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def group_by_proximity(self, k=10):
    if len(self.points) == 0:
        return {}

    X = numpy.array([[p.lat, p.lon] for p in self.points])
    distance_matrix = distance.squareform(distance.pdist(X))
    db = KMeans(n_clusters=k).fit(distance_matrix)

    # re-attach ids
    grouped_points = {}
    for i, k in enumerate(db.labels_):
        logger.debug('idx, label [%s, %s]', i, k)
        if k not in grouped_points:
            grouped_points[k] = []
        point = self.points[i]
        grouped_points[k].append({'id': point.uid, 'lat': point.lat, 'lon': point.lon})

    logger.info('Finished grouping into %d groups.', len(grouped_points))
    return grouped_points
def cluster_sequences(sequences, minsize=5):
    """
    Cluster the given sequences into groups of similar sequences.

    Return a triple that contains a pandas.DataFrame with the edit distances,
    the linkage result, and a list that maps sequence ids to their cluster id.
    If an entry is zero in that list, it means that the sequence is not part
    of a cluster.
    """
    matrix = distances(sequences)
    linkage = hierarchy.linkage(distance.squareform(matrix), method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(linkage))
    prev = linkage[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    cl = 1
    for n in inner:
        if n.dist > 0 and prev / n.dist < 0.8 \
                and n.left.count >= minsize and n.right.count >= minsize:
            for id in collect_ids(n.left):
                # Do not overwrite previously assigned ids
                if clusters[id] == 0:
                    clusters[id] = cl
            cl += 1
        prev = n.dist
    # At the end of the above loop, we have not processed the rightmost
    # subtree. In our experiments, it never contains true novel sequences,
    # so we omit it.
    return pd.DataFrame(matrix), linkage, clusters
def compute_db_index(matrix, kmeans):
    '''
    Compute Davies-Bouldin index, a measure of clustering quality.
    Faster and possibly more reliable than silhouette score.
    '''
    (n, m) = matrix.shape
    k = kmeans.n_clusters

    centers = kmeans.cluster_centers_
    labels = kmeans.labels_

    centroid_dists = sp_dist.squareform(sp_dist.pdist(centers))
    # Avoid divide-by-zero
    centroid_dists[np.abs(centroid_dists) < MIN_CENTROID_DIST] = MIN_CENTROID_DIST

    wss = np.zeros(k)
    counts = np.zeros(k)
    for i in range(n):
        label = labels[i]
        # note: this is 2x faster than scipy sqeuclidean
        sqdist = np.square(matrix[i, :] - centers[label, :]).sum()
        wss[label] += sqdist
        counts[label] += 1

    # Handle empty clusters
    counts[counts == 0] = 1

    scatter = np.sqrt(wss / counts)

    mixitude = (scatter + scatter[:, np.newaxis]) / centroid_dists
    np.fill_diagonal(mixitude, 0.0)

    worst_case_mixitude = np.max(mixitude, axis=1)
    db_score = worst_case_mixitude.sum() / k

    return db_score
def run_orange3(Z, D):
    import Orange.clustering.hierarchical as orange_hier
    tree = orange_hier.tree_from_linkage(Z)
    start_time = time.time()
    orange_hier.optimal_leaf_ordering(tree, squareform(D))
    end_time = time.time()
    return end_time - start_time, None
def kernel_matrix(self, X):
    # check for stupid mistake
    assert X.shape[0] > X.shape[1]

    sq_dists = squareform(pdist(X, 'sqeuclidean'))

    K = np.exp(-sq_dists / self.scaling)
    return K
def k_multiple(self, X):
    """
    Efficient computation of kernel matrix without loops

    Effectively does the same as calling self.k on all pairs of the input
    """
    assert(X.ndim == 1)

    sq_dists = squareform(pdist(X.reshape(len(X), 1), 'sqeuclidean'))

    K = np.exp(-(sq_dists) / self.scaling)
    return K
def k_multiple_dim(self, X):
    # check for stupid mistake
    assert X.shape[0] > X.shape[1]

    sq_dists = squareform(pdist(X, 'sqeuclidean'))

    K = np.exp(-(sq_dists) / self.scaling)
    return K
def plot_bokeh(df, sublist, filename):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    # cannot use seaborn palette for bokeh
    palette = ['red', 'green', 'blue', 'yellow']
    colors = []
    for i in range(len(sublist)):
        for j in range(lenlist[i+1] - lenlist[i]):
            colors.append(palette[i])

    # plot with bokeh
    output_file(filename)
    source = ColumnDataSource(
        data=dict(x=tsne[:, 0], y=tsne[:, 1],
                  cuisine=df_sub['cuisine'],
                  recipe=df_sub['recipeName']))

    hover = HoverTool(tooltips=[
        ("cuisine", "@cuisine"),
        ("recipe", "@recipe")])

    p = figure(plot_width=1000, plot_height=1000, tools=[hover],
               title="flavor clustering")
    p.circle('x', 'y', size=10, source=source, fill_color=colors)

    show(p)
def gradient(self, x, D=None):
    """
    Return gradient
    """
    dx = np.array([np.subtract.outer(x[:, d], x[:, d]) for d in range(x.shape[1])])
    D = calc_distances(x) if D is None else D
    D = D**(-0.5*self.n - 1)
    D = squareform(D)
    g = -self.n * dx * D
    return g.sum(-1).T
def gradient(self, x):
    d = self._distances
    if d is not None and np.ndim(d) == 1:
        d = squareform(d)
    return np.sum([self.params[k] * self.features[k].gradient(x, d)
                   for k in range(self.K)], 0)
def buildGraph(data, epsilon=1., metric='euclidean', p=2):
    D = squareform(pdist(data, metric=metric, p=p))
    D[D >= epsilon] = 0.
    G = nx.Graph(D)
    edges = list(map(set, G.edges()))
    weights = [G.get_edge_data(u, v)['weight'] for u, v in G.edges()]
    return G.nodes(), edges, weights
def cluster(target_sequence_ids, fasta_filename, method='average'):
    """
    Form distance-based hierarchical clustering of sequences.

    Looks up each entry in target_sequence_ids in the file specified by
    fasta_filename to obtain an associated DNA sequence.

    In principle, we could just work with the Hamming distance, but the
    sequences may be of different lengths (mostly small differences.)
    So we need a more sophisticated approach: we use pairwise global
    alignment, scoring 0 for a match, -1 for mismatch, and -1.5 for
    opening or extending a gap. We then take the distance to be
    -1.0*(score).

    UPGMA clustering is used when method='average', the default.

    Returns the distance matrix and the linkage matrix returned by the
    clustering routine.
    """
    # globalms arguments: seq1, seq2, match, mismatch, open, extend
    distance = lambda seq1, seq2: -1.0*(
        pairwise2.align.globalms(seq1, seq2, 0, -1, -1.5, -1.5,
                                 score_only=True)
    )
    sequences = fasta_to_dict(fasta_filename)
    N = len(target_sequence_ids)
    distances = np.zeros((N, N))
    # fill in the upper triangle
    for i, seqid1 in enumerate(target_sequence_ids):
        seq1 = sequences[seqid1]
        for j_offset, seqid2 in enumerate(target_sequence_ids[i+1:]):
            j = j_offset + i + 1
            seq2 = sequences[seqid2]
            distances[i][j] = distance(seq1, seq2)
    # convert to the form expected by the scipy clustering routines
    y = squareform(distances, checks=False)
    return distances, hierarchy.linkage(y, method)
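The example above fills only the upper triangle of distances, so the matrix handed to squareform() is not symmetric; passing checks=False skips squareform's symmetry and zero-diagonal validation, and only the upper triangle is read when converting to the condensed form. A minimal sketch of that pattern, with made-up values:

from scipy.spatial.distance import squareform
import numpy as np

D = np.zeros((3, 3))
D[0, 1], D[0, 2], D[1, 2] = 1.0, 2.0, 3.0  # upper triangle only, lower left as zeros
y = squareform(D, checks=False)            # condensed vector: array([1., 2., 3.])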
def makeT(self, cp):
    # cp: [(k*k*k) x 3] control points
    # T: [((k*k*k)+4) x ((k*k*k)+4)]
    K = cp.shape[0]
    T = np.zeros((K+4, K+4))
    T[:K, 0] = 1
    T[:K, 1:4] = cp
    T[K, 4:] = 1
    T[K+1:, 4:] = cp.T
    R = squareform(pdist(cp, metric='euclidean'))
    R = R * R
    R[R == 0] = 1  # a trick to make R ln(R) 0
    R = R * np.log(R)
    np.fill_diagonal(R, 0)
    T[:K, 4:] = R
    return T
def n1_fraction_borderline(data):
    def get_n1_for_round(sparse_matrix, y):
        Tcsr = minimum_spanning_tree(sparse_matrix)
        borders = set()
        a = Tcsr.nonzero()[0]
        b = Tcsr.nonzero()[1]
        for i in range(len(a)):
            if (y[a[i]] != y[b[i]]):
                borders.add(a[i])
                borders.add(b[i])
        n1 = len(borders)
        return n1

    features = data.columns[:-1, ]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    sparse_matrix = csr_matrix(df_dist.values)
    labels = data.columns[-1]
    y = data[labels]

    n1 = 0
    rounds = 10
    for round in range(rounds):
        n1 = n1 + get_n1_for_round(sparse_matrix, y)

    n = len(data)
    n1 = (1.0 * n1) / (rounds * n)
    return n1
def n2_ratio_intra_extra_class_nearest_neighbor_distance(data):
    features = data.columns[:-1, ]
    labels = data.columns[-1]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    max_size = df_dist.copy()
    max_size.iloc[:, :] = False
    classes = data.iloc[:, -1].unique()
    n = data.shape[0]
    n2 = 0
    cl = 'bla'
    intra_min = 0
    inter_min = 0
    for i in range(data.shape[0]):
        ci = data.iloc[i, -1]
        if ci != cl:
            cl = ci
            intra_idx = data[data[labels] == ci].index.values.tolist()
            inter_idx = data[data[labels] != ci].index.values
        intra_idx.remove(i)
        intra_min = intra_min + df_dist.iloc[intra_idx, i].min()
        inter_min = inter_min + df_dist.iloc[inter_idx, i].min()
        intra_idx.append(i)

    # handle the case inter_min == 0
    if inter_min == 0:
        inter_min = 1

    n2 = (1.0 * intra_min) / (1.0 * inter_min)
    return n2
def start_clustering(self):
    functions.log('Calculate {0} distances...'.format(int(len(self.orfs) * (len(self.orfs) + 1) / 2)))
    self.distances = self.create_distance_matrix()
    functions.log('Start clustering...')
    self.linkage_matrix = scipy.cluster.hierarchy.linkage(ssd.squareform(self.distances), method='complete')
    functions.log('Clustering done.')
def merge_candidates_scan(candidates, seriesuid, distance=5.):
    distances = pdist(candidates, metric='euclidean')
    adjacency_matrix = squareform(distances)

    # Determine nodes within distance, replace by 1 (=adjacency matrix)
    adjacency_matrix = np.where(adjacency_matrix <= distance, 1, 0)

    # Determine all connected components in the graph
    n, labels = connected_components(adjacency_matrix)
    new_candidates = np.zeros((n, 3))

    # Take the mean for these connected components
    for cluster_i in range(n):
        points = candidates[np.where(labels == cluster_i)]
        center = np.mean(points, axis=0)
        new_candidates[cluster_i, :] = center

    x = new_candidates[:, 0]
    y = new_candidates[:, 1]
    z = new_candidates[:, 2]
    labels = [seriesuid] * len(x)
    class_name = [0] * len(x)

    data = zip(labels, x, y, z, class_name)

    new_candidates = pd.DataFrame(data, columns=CANDIDATES_COLUMNS)

    return new_candidates
def precompute_kernels(self, q):
    """
    Returns a tuple of kernel, kernel', kernel'' matrices at position q.
    """
    x = q.reshape((self.npoints, self.dimension))
    dists = squareform(pdist(x, 'sqeuclidean'))
    K = exp(-dists / (2 * self.kernel_scale ** 2))
    return (K,
            -K / (2 * self.kernel_scale ** 2),
            K / (4 * self.kernel_scale ** 4))
def dq_Kqp_a(self, q, p, a, kernels):
    """
    Useful for the adjoint integration scheme.
    d_q (K_q p) . a = ...
    """
    h = 1e-8
    Q0phA = q + h*a
    Q0mhA = q - h*a
    update_emp = (Landmarks.K(self, Q0phA, p, Landmarks.precompute_kernels(self, Q0phA))
                  - Landmarks.K(self, Q0mhA, p, Landmarks.precompute_kernels(self, Q0mhA))) / (2*h)
    return update_emp
    # Unused analytic version kept from the original source:
    """x = q.reshape((self.npoints, self.dimension))
    p = p.reshape((self.npoints, self.dimension))
    a = a.reshape((self.npoints, self.dimension))
    dists = squareform(pdist(x, 'sqeuclidean'))  # dists_ij = |x_i-x_j|^2
    # We have :
    # [K_q p]_nd = sum_j { k(|x_n - x_j|^2) * p_j^d }
    #
    # So that :
    # grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
    grad = zeros((self.npoints, self.dimension))
    for d in range(self.dimension) :
        diffs = atleast_2d(x[:,d]).T - x[:,d]  # diffs_ij = x_i^d - x_j^d
        # K_ij = 2 * (x_i^d - x_j^d) * k'(|x_i - x_j|^2) * p_j^d
        K = 2 * dists * kernels[1] * p[:,d]
        # grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
        grad[:,d] = a[:,d] * sum( K , 1 )
    return grad.reshape((self.npoints * self.dimension,))"""
def getPairsFast(d, type):
    X = []
    T = []
    pairs = []
    for i in range(len(d)):
        (p1, p2) = d[i]
        X.append(p1.representation)
        X.append(p2.representation)
        T.append(p1)
        T.append(p2)

    arr = pdist(X, 'cosine')
    arr = squareform(arr)
    for i in range(len(arr)):
        arr[i, i] = 1
        if i % 2 == 0:
            arr[i, i+1] = 1
        else:
            arr[i, i-1] = 1
    arr = np.argmin(arr, axis=1)

    for i in range(len(d)):
        (t1, t2) = d[i]
        p1 = None
        p2 = None
        if type == "MAX":
            p1 = T[arr[2*i]]
            p2 = T[arr[2*i+1]]
        if type == "RAND":
            p1 = getPairRand(d, i)
            p2 = getPairRand(d, i)
        if type == "MIX":
            p1 = getPairMixScore(d, i, T[arr[2*i]])
            p2 = getPairMixScore(d, i, T[arr[2*i+1]])
        pairs.append((p1, p2))
    return pairs
def cao_juan_2009(topic_term_dists, num_topics):
    cos_pdists = squareform(pdist(topic_term_dists, metric='cosine'))
    return np.sum(cos_pdists) / (num_topics * (num_topics - 1) / 2)
def deveaud_2014(topic_term_dists, num_topics):
    jsd_pdists = squareform(pdist(topic_term_dists, metric=jensen_shannon))
    return np.sum(jsd_pdists) / (num_topics * (num_topics - 1))
def compute_distmat(self, dataframe):
    """
    Computes the pairwise euclidean distances between every atom.

    Design choice: passed in a DataFrame to enable easier testing on
    dummy data.
    """
    self.eucl_dists = pdist(dataframe[['x', 'y', 'z']], metric='euclidean')
    self.eucl_dists = pd.DataFrame(squareform(self.eucl_dists))
    self.eucl_dists.index = dataframe.index
    self.eucl_dists.columns = dataframe.index

    return self.eucl_dists
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str,
                                      print_stats: bool = False):
    """Compute the ratio of the avg distance of points within an equivalence class
    vs the avg distance between all points"""
    data = import_data(data_filename)

    encodings = []
    equivalence_sets = []
    for name, code in data.items():
        idx = len(encodings)
        enc = encoder.get_encoding(code['original'])
        assert not np.isnan(np.sum(enc))
        encodings.append(enc)
        for noisy_sample in code['noise']:
            enc = encoder.get_encoding(noisy_sample)
            assert not np.isnan(np.sum(enc))
            encodings.append(enc)
        equivalence_sets.append(set(range(idx, len(encodings))))

    encodings = np.array(encodings)
    all_distances = squareform(pdist(encodings, 'cosine'))  # TODO: avoid square form somehow
    assert not np.any(np.isnan(all_distances))

    # Average the lower triangle of all_distances
    avg_distance_between_all_points = np.sum(np.tril(all_distances, k=-1)) / (
        len(encodings) * (len(encodings) - 1) / 2)

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for equiv_class_idxs in equivalence_sets:
        num_elements_in_class = len(equiv_class_idxs)
        if num_elements_in_class < 2:
            continue
        elems_in_eq_class = np.fromiter(equiv_class_idxs, dtype=np.int32)
        sum_distance_within_eq_class += np.sum(
            np.tril(all_distances[elems_in_eq_class][:, elems_in_eq_class], k=-1))
        num_pairs += num_elements_in_class * (num_elements_in_class - 1) / 2

    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs

    if print_stats:
        print("Within Avg Dist: %s All Avg Dist: %s " %
              (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
def sort_points_to_line(vertices, start=0):
    """Sorts points to a line by sequentially connecting nearest points

    Arguments:
        vertices (nx2 array): vertices of the line
        start (int): start index

    Returns:
        nx2 array: sorted points
    """
    d = squareform(pdist(vertices))
    i = start
    n = vertices.shape[0]
    uidx = np.ones(n, dtype=bool)
    uidx[i] = False
    sidx = [i]
    while np.sum(uidx) > 0:
        i = np.argmin(d[i][uidx])
        i = np.where(uidx)[0][i]
        sidx.append(i)
        uidx[i] = False
    return vertices[sidx]
def predict(otrain):
    binary = (otrain > 0)
    norm = NormalizePositive(axis=1)
    train = norm.fit_transform(otrain)

    dists = distance.pdist(binary, 'correlation')
    dists = distance.squareform(dists)
    neighbors = dists.argsort(axis=1)
    filled = train.copy()
    for u in range(filled.shape[0]):
        # n_u are the neighbors of user
        n_u = neighbors[u, 1:]
        for m in range(filled.shape[1]):
            # This code could be faster using numpy indexing trickery at the
            # cost of readability (this is left as an exercise to the reader):
            revs = [train[neigh, m]
                    for neigh in n_u
                    if binary[neigh, m]]
            if len(revs):
                n = len(revs)
                n //= 2
                n += 1
                revs = revs[:n]
                filled[u, m] = np.mean(revs)

    return norm.inverse_transform(filled)
def kernel(self, X, Y=None):
    GenericTests.check_type(X, 'X', np.ndarray, 2)
    # if X=Y, use more efficient pdist call which exploits symmetry
    normX = reshape(np.linalg.norm(X, axis=1), (len(X), 1))
    if Y is None:
        dists = squareform(pdist(X, 'euclidean'))
        normY = normX.T
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray, 2)
        assert(shape(X)[1] == shape(Y)[1])
        normY = reshape(np.linalg.norm(Y, axis=1), (1, len(Y)))
        dists = cdist(X, Y, 'euclidean')
    K = 0.5 * (normX**self.alpha + normY**self.alpha - dists**self.alpha)
    return K
def kernel(self, X, Y=None):
    """
    Computes the hypercube kernel k(x,y)=tanh(gamma)^d(x,y), where d is the
    Hamming distance between x and y

    X - 2d numpy.bool8 array, samples on right hand side
    Y - 2d numpy.bool8 array, samples on left hand side.
        Can be None in which case its replaced by X
    """
    if not type(X) is numpy.ndarray:
        raise TypeError("X must be numpy array")

    if not len(X.shape) == 2:
        raise ValueError("X must be 2D numpy array")

    if not X.dtype == numpy.bool8:
        raise ValueError("X must be boolean numpy array")

    if not Y is None:
        if not type(Y) is numpy.ndarray:
            raise TypeError("Y must be None or numpy array")

        if not len(Y.shape) == 2:
            raise ValueError("Y must be None or 2D numpy array")

        if not Y.dtype == numpy.bool8:
            raise ValueError("Y must be boolean numpy array")

        if not X.shape[1] == Y.shape[1]:
            raise ValueError("X and Y must have same dimension if Y is not None")

    # un-normalise normalised hamming distance in both cases
    if Y is None:
        K = tanh(self.gamma) ** squareform(pdist(X, 'hamming') * X.shape[1])
    else:
        K = tanh(self.gamma) ** (cdist(X, Y, 'hamming') * X.shape[1])

    return K
def kernel(self, X, Y=None):
    """
    Computes the standard Gaussian kernel k(x,y)=exp(-0.5* ||x-y||**2 / sigma**2)

    X - 2d numpy.ndarray, first set of samples:
        number of rows: number of samples
        number of columns: dimensionality
    Y - 2d numpy.ndarray, second set of samples, can be None in which case
        its replaced by X
    """
    if self.is_sparse:
        X = X.todense()
        if Y is not None:  # guard: Y may be None, in which case X is reused below
            Y = Y.todense()
    GenericTests.check_type(X, 'X', np.ndarray)
    assert(len(shape(X)) == 2)

    # if X=Y, use more efficient pdist call which exploits symmetry
    if Y is None:
        sq_dists = squareform(pdist(X, 'sqeuclidean'))
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray)
        assert(len(shape(Y)) == 2)
        assert(shape(X)[1] == shape(Y)[1])
        sq_dists = cdist(X, Y, 'sqeuclidean')

    K = exp(-0.5 * (sq_dists) / self.width ** 2)
    return K