The following 50 code examples, extracted from open source Python projects, illustrate how to use scipy.spatial.distance.pdist().
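Before the examples: pdist() computes distances over all unordered pairs of rows and returns them as a condensed vector; squareform() converts between that vector and the dense square matrix. A minimal sketch with arbitrary illustration data:

import numpy as np
from scipy.spatial.distance import pdist, squareform

# Three points in the plane (arbitrary illustration data).
X = np.array([[0.0, 0.0],
              [3.0, 4.0],
              [6.0, 8.0]])

# pdist returns the condensed distance vector: one entry per unordered
# pair (i, j) with i < j, of length n*(n-1)/2.
condensed = pdist(X, metric='euclidean')   # -> [ 5. 10.  5.]

# squareform expands it into the dense, symmetric n x n matrix with a
# zero diagonal; many of the examples below rely on this round trip.
dense = squareform(condensed)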
def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist) ** 2
    if h < 0:  # if h < 0, use the median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))

    # compute the RBF kernel
    Kxy = np.exp(-pairwise_dists / h ** 2 / 2)

    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h ** 2)
    return (Kxy, dxkxy)
def test_cuttreeHybrid():
    from dynamicTreeCut import cutreeHybrid
    d = np.transpose(np.arange(1, 10001).reshape(100, 100))
    distances = pdist(d, "euclidean")
    link = linkage(distances, "average")
    test = cutreeHybrid(link, distances)
    true = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1]
    assert (test['labels'] == true).all()
def svgd_kernel(self, theta, h=-1):
    sq_dist = pdist(theta)
    pairwise_dists = squareform(sq_dist) ** 2
    if h < 0:  # if h < 0, use the median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(theta.shape[0] + 1))

    # compute the RBF kernel
    Kxy = np.exp(-pairwise_dists / h ** 2 / 2)

    dxkxy = -np.matmul(Kxy, theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(theta[:, i], sumkxy)
    dxkxy = dxkxy / (h ** 2)
    return (Kxy, dxkxy)
def tsne_cluster_cuisine(df, sublist):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    for i, cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i + 1], 0],
                    tsne[lenlist[i]:lenlist[i + 1], 1],
                    c=palette[i], label=sublist[i])
    plt.legend()


# interactive plot with bokeh; set up for four categories, with color palette;
# pass in df for either ingredient or flavor
def get_close_markers(markers, centroids=None, min_distance=20):
    if centroids is None:
        centroids = [m['centroid'] for m in markers]
    centroids = np.array(centroids)

    ti = np.triu_indices(centroids.shape[0], 1)

    def full_idx(i):
        # get the pair from the condensed-matrix index
        # (defined inline because ti changes every time)
        return np.array([ti[0][i], ti[1][i]])

    # calculate pairwise distances; pdist returns the condensed
    # distance matrix (upper triangle)
    distances = pdist(centroids, 'euclidean')
    close_pairs = np.where(distances < min_distance)
    return full_idx(close_pairs)
def _compute_dispersion_matrix(X, labels):
    n = len(np.unique(labels))
    dist = np.zeros((n, n))
    ITR = list(itertools.combinations_with_replacement(range(n), 2))
    for i, j in tqdm(ITR):
        if i == j:
            d = pdist(X[labels == i], metric='cosine')
        else:
            d = cdist(X[labels == i], X[labels == j], metric='cosine')
            # Only take the upper triangle (+ diagonal elements)
            d = d[np.triu_indices(n=d.shape[0], m=d.shape[1], k=0)]
        dist[i, j] = dist[j, i] = d.mean()
    return dist
def construct_data_synthetic_Laplacian(D, lifetime, noise_var, N_train, N_test):
    # pick datapoint locations uniformly at random
    N = N_train + N_test
    X = np.random.rand(N, D)

    # construct kernel matrix
    K = np.exp(-lifetime * squareform(pdist(X, 'cityblock')))

    # sample the function at the picked locations X
    y = np.linalg.cholesky(K).dot(np.random.randn(N)) \
        + np.sqrt(noise_var) * np.random.randn(N)

    # pick training indices sequentially
    indices_train = range(0, N_train)
    indices_test = range(N_train, N)

    # split the data into train and test
    X_train = X[indices_train]
    X_test = X[indices_test]
    y_train = y[indices_train]
    y_test = y[indices_test]

    return X_train, y_train, X_test, y_test


# SAMPLING
def calculate_position_error_at_z(self, z=0):
    '''
    Returns the standard deviation in x and y, and the mean euclidean
    distance between pairs of coordinates.
    '''
    xy_at_given_z = []
    for ax in self.axes:
        x, y = ax.getXY(z=z)
        xy_at_given_z.append((x, y))

    X = [xy[0] for xy in xy_at_given_z]
    Y = [xy[1] for xy in xy_at_given_z]

    pairs = []
    for x, y in zip(X, Y):
        pairs.append((x, y))

    distances = distance.pdist(pairs)
    return ((np.std(X), np.std(Y)), np.mean(distances))
def distance(self, x, y):
    """
    Computes squared euclidean distance between vectors x and y.
    Returns float.
    """
    d = x - y
    # dist = numpy.ma.inner(d, d)
    dist = numpy.sum(d ** 2)
    # dist = pdist([x, y], 'sqeuclidean')
    # n = len(x)
    # code = \
    #     """
    #     int i;
    #     double sum = 0.0, delta = 0.0f;
    #     for (i = 0; i < n; i++) {
    #         delta = (x[i]-y[i]);
    #         sum += delta*delta;
    #     }
    #     return_val = sum;
    #     """
    # dist = weave.inline(code, ['x', 'y', 'n'])
    return dist
def compute_dcov_dcorr_statistics(y, alpha):
    """ Compute the statistics to distance covariance/correlation.

    Parameters
    ----------
    y : (number of samples, dimension)-ndarray
        One row of y corresponds to one sample.
    alpha : float
        0 < alpha < 2

    Returns
    -------
    c : (number of samples, number of samples)-ndarray
        Computed statistics (doubly centered distance matrix).
    """
    d = squareform(pdist(y)) ** alpha
    ck = mean(d, axis=0)
    c = d - ck - ck[:, newaxis] + mean(ck)
    return c
def plot_hamming_dist(s, W, brec):
    masks = s[:, 0, :].T > 0
    x_hat = np.zeros(masks.shape)
    for ii in range(masks.shape[1]):
        Weff = W * masks[:, ii]
        x_hat[:, ii] = np.linalg.inv(np.eye(100) - Weff).dot(brec)

    fig = plt.figure()
    plt.pcolormesh(squareform(pdist(np.sign(x_hat[:, :]).T,
                                    metric='hamming')))  # , vmax=.3)
    plt.colorbar()
    plt.ylim([0, x_hat.shape[1]])
    plt.xlim([0, x_hat.shape[1]])
    plt.axes().set_aspect('equal')
    plt.title('Hamming Distance Between Putative FPs')
    plt.ylabel('Time')
    plt.xlabel('Time')
    return fig
def test_mean_of_distances(self):
    """Test the mean of distances calculation (and the sum)."""
    X = np.array([[0.3, 0.4],
                  [0.1, 4.0],
                  [2.0, 1.0],
                  [0.0, 0.5]])
    counts = np.array([3, 2, 1, 2])
    scipy_X = []
    for c, count in enumerate(counts):
        for i in range(count):
            scipy_X.append(X[c])

    # SciPy:
    Y = pdist(scipy_X, metric='euclidean')
    scipy_N = np.sum(counts)
    N_unique_pairs = scipy_N * (scipy_N - 1.0) / 2.0
    scipy_mean = Y.mean()
    self.assertTrue(Y.shape[0] == N_unique_pairs)
    self.assertTrue(scipy_mean == (np.sum(Y) / N_unique_pairs))

    # C & Cython:
    c_mean = c_mean_dist(X, counts)
    self.assertTrue(np.isclose(c_mean, scipy_mean))
def kernel(self, X, Y=None):
    GenericTests.check_type(X, 'X', np.ndarray, 2)
    # if X == Y, use the more efficient pdist call which exploits symmetry
    if Y is None:
        dists = squareform(pdist(X, 'euclidean'))
    else:
        GenericTests.check_type(Y, 'Y', np.ndarray, 2)
        assert(shape(X)[1] == shape(Y)[1])
        dists = cdist(X, Y, 'euclidean')
    if self.nu == 0.5:
        # for nu = 1/2, the Matern class corresponds to the
        # Ornstein-Uhlenbeck process
        K = (self.sigma ** 2.) * exp(-dists / self.width)
    elif self.nu == 1.5:
        K = (self.sigma ** 2.) * (1 + sqrt(3.) * dists / self.width) \
            * exp(-sqrt(3.) * dists / self.width)
    elif self.nu == 2.5:
        K = (self.sigma ** 2.) * (1 + sqrt(5.) * dists / self.width
                                  + 5.0 * (dists ** 2.) / (3.0 * self.width ** 2.)) \
            * exp(-sqrt(5.) * dists / self.width)
    else:
        raise NotImplementedError()
    return K
def _compute_J(x, window_starts, L):
    """Compute the cost, which is proportional to the
    difference between pairs of windows"""

    # Get all windows and z-score them
    N_windows = len(window_starts)
    windows = np.zeros((N_windows, L))
    for w in range(N_windows):
        temp = x[window_starts[w]:window_starts[w] + L]
        windows[w] = (temp - np.mean(temp)) / np.std(temp)

    # Calculate distances for all pairs of windows
    dist = pdist(np.vstack(windows),
                 lambda u, v: np.sum((u - v) ** 2))
    J = np.sum(dist) / float(L * (N_windows - 1))

    return J
def k_nearest_neighbor(self, sequence):
    # Calculate dist_matrix
    dist_array = pdist(sequence)
    dist_matrix = squareform(dist_array)
    # Construct tour
    new_sequence = [sequence[0]]
    current_city = 0
    visited_cities = [0]
    for i in range(1, len(sequence)):
        j = np.random.randint(0, min(len(sequence) - i, self.kNN))
        next_city = [index for index in dist_matrix[current_city].argsort()
                     if index not in visited_cities][j]
        visited_cities.append(next_city)
        new_sequence.append(sequence[next_city])
        current_city = next_city
    return np.asarray(new_sequence)


# Generate random TSP-TW instance
def kmeans_classify(A, means, metric):
    # set up the lists to return
    data_classes = []
    data_metrics = []

    # set up the distance to be the max number possible
    dist = sys.maxsize
    for v in A:  # for every data vector
        index = 0
        for i in range(len(means.tolist())):
            m = means.tolist()[i]
            norm_matrix = np.vstack((v, m))
            d = norms.pdist(norm_matrix, metric)[0]
            if d < dist:
                dist = d
                index = i
        data_classes.append([index])
        data_metrics.append([dist])
        dist = sys.maxsize

    return np.matrix(data_classes), np.matrix(data_metrics)
def create_3D_distance_matrix(vox_ijk, epi_fname):
    """Compute distance between voxels in the volume.

    Parameters
    ----------
    vox_ijk : n x 3 array
        Indices of voxels included in the ROI.
    epi_fname : file path
        Path to image defining the volume space.

    Returns
    -------
    dmat : array
        Dense square distance matrix.
    """
    aff = nib.load(epi_fname).affine
    vox_ras = nib.affines.apply_affine(aff, vox_ijk)
    dmat = squareform(pdist(vox_ras))
    return dmat
def PQTrain(data, lenSubVec, numSubCenter):
    (dataSize, dataDim) = data.shape
    if 0 != dataDim % lenSubVec:
        print("Cannot partition the feature space with the given segment number")
        return
    numSubVec = dataDim // lenSubVec
    centers = npy.zeros((numSubVec * numSubCenter, lenSubVec),
                        dtype=npy.float32)
    distOfCenters = npy.zeros((numSubCenter, numSubCenter, numSubVec),
                              dtype=npy.float32)
    objKmeans = KMeans(numSubCenter, 'k-means++', 3, 100, 0.001)
    for ii in range(numSubVec):
        print("PQ training. Processing " + str(ii) + "-th sub-vector")
        objKmeans.fit(data[:, ii * lenSubVec:(ii + 1) * lenSubVec])
        centers[ii * numSubCenter:(ii + 1) * numSubCenter, :] = \
            objKmeans.cluster_centers_
        distOfCenters[:, :, ii] = squareform(
            pdist(objKmeans.cluster_centers_, metric="euclidean"))
    model = {"centers": centers, "distOfCenters": distOfCenters}
    return model
def _compute_centers(self, X, sparse, rs):
    """Generate centers, then compute tau, dF and dN vals"""
    super(GRBFRandomLayer, self)._compute_centers(X, sparse, rs)

    centers = self.components_['centers']
    sorted_distances = np.sort(squareform(pdist(centers)))
    self.dF_vals = sorted_distances[:, -1]
    self.dN_vals = sorted_distances[:, 1] / 100.0
    # self.dN_vals = 0.0002 * np.ones(self.dF_vals.shape)

    tauNum = np.log(np.log(self.grbf_lambda) /
                    np.log(1.0 - self.grbf_lambda))
    tauDenom = np.log(self.dF_vals / self.dN_vals)
    self.tau_vals = tauNum / tauDenom

    self._extra_args['taus'] = self.tau_vals


# get radii according to ref [1]
def kernel_matrix(svm_model, original_X):
    if (svm_model.svm_kernel == 'polynomial_kernel'
            or svm_model.svm_kernel == 'soft_polynomial_kernel'):
        K = (svm_model.zeta
             + svm_model.gamma * np.dot(original_X, original_X.T)) ** svm_model.Q
    elif (svm_model.svm_kernel == 'gaussian_kernel'
            or svm_model.svm_kernel == 'soft_gaussian_kernel'):
        pairwise_dists = squareform(pdist(original_X, 'euclidean'))
        K = np.exp(-svm_model.gamma * (pairwise_dists ** 2))

    '''
    K = np.zeros((svm_model.data_num, svm_model.data_num))
    for i in range(svm_model.data_num):
        for j in range(svm_model.data_num):
            if (svm_model.svm_kernel == 'polynomial_kernel'
                    or svm_model.svm_kernel == 'soft_polynomial_kernel'):
                K[i, j] = Kernel.polynomial_kernel(svm_model, original_X[i], original_X[j])
            elif (svm_model.svm_kernel == 'gaussian_kernel'
                    or svm_model.svm_kernel == 'soft_gaussian_kernel'):
                K[i, j] = Kernel.gaussian_kernel(svm_model, original_X[i], original_X[j])
    '''

    return K
def generate_matches_array(labels):
    """
    Return an array of bool in the same order as the distances from
    `scipy.spatial.distance.pdist` indicating whether a distance is for
    matching or non-matching labels.
    """
    N = len(labels)
    matches = np.zeros(N * (N - 1) // 2, dtype=bool)

    # For every distance, mark whether it is a true match or not
    cur_matches_i = 0
    for n in range(N):
        cur_label = labels[n]
        matches[cur_matches_i:cur_matches_i + (N - n) - 1] = \
            np.asarray(labels[n + 1:]) == cur_label
        cur_matches_i += N - n - 1

    return matches
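As a quick hedged check of the ordering the docstring above describes: pdist enumerates pairs as (0,1), (0,2), ..., (0,N-1), (1,2), and so on, so for the toy labels below (chosen purely for illustration) only the (0,2) pair should be marked as a match.

labels = ['a', 'b', 'a']
# pdist pair order for N=3 is (0,1), (0,2), (1,2),
# and only the middle entry pairs two 'a' labels.
print(generate_matches_array(labels))  # -> [False  True False]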
def check_argv():
    """Check the command line arguments."""
    parser = argparse.ArgumentParser(
        description=__doc__.strip().split("\n")[0], add_help=False
        )
    parser.add_argument("labels_fn", help="file of labels")
    parser.add_argument(
        "distances_fn",
        help="file providing the distances between each pair of labels in "
        "the same order as `scipy.spatial.distance.pdist`"
        )
    parser.add_argument(
        "--binary_dists", dest="binary_dists", action="store_true",
        help="distances are given in float32 binary format "
        "(default is to assume distances are given in text format)"
        )
    parser.set_defaults(binary_dists=False)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()


# ---------------------------------------------------------------------------#
#                               MAIN FUNCTION                                 #
# ---------------------------------------------------------------------------#
def plot_clusters_igraph(responsibilities, color_groups):
    from scipy.spatial.distance import pdist, correlation, squareform
    from igraph import Graph, plot
    data = responsibilities[:, :2]
    Y = pdist(data, hellinger_distance)
    print(Y[:30], file=stderr)
    # return
    g = Graph()
    n = data.shape[0]
    g.add_vertices(n)
    colors = ["grey"] * n
    palette = list(colors_dict.values())
    for j, group in enumerate(color_groups):
        c = palette[j]
        for i in group:
            colors[i] = c
    l = g.layout_mds(dist=squareform(Y))
    plot(g, layout=l, vertex_color=colors,
         bbox=(1024, 1024), vertex_size=5)


# c&p from stackexchange
def get_cell_data(n=50, seed=0):
    np.random.seed(seed)
    cells_data = np.load('./data/cells_data.npy')
    sample_cells = np.random.choice(cells_data.shape[0], n, replace=False)
    D = pdist(cells_data[sample_cells, :], 'euclidean')
    Z = linkage(D, 'ward')
    return cells_data, Z, D
def get_random_data(n=50, seed=0):
    np.random.seed(seed)
    data = np.random.choice(10000, (n, 1), replace=False)
    D = pdist(data, 'euclidean')
    Z = linkage(D, 'ward')
    return data, Z, D
def kernel_matrix(self, X):
    # check for stupid mistake
    assert X.shape[0] > X.shape[1]

    sq_dists = squareform(pdist(X, 'sqeuclidean'))
    K = np.exp(-sq_dists / self.scaling)
    return K
def k_multiple(self, X):
    """
    Efficient computation of kernel matrix without loops

    Effectively does the same as calling self.k on all pairs of the input
    """
    assert(X.ndim == 1)

    sq_dists = squareform(pdist(X.reshape(len(X), 1), 'sqeuclidean'))
    K = np.exp(-(sq_dists) / self.scaling)
    return K
def k_multiple_dim(self, X):
    # check for stupid mistake
    assert X.shape[0] > X.shape[1]

    sq_dists = squareform(pdist(X, 'sqeuclidean'))
    K = np.exp(-(sq_dists) / self.scaling)
    return K
def plot_bokeh(df, sublist, filename):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    # cannot use a seaborn palette for bokeh
    palette = ['red', 'green', 'blue', 'yellow']
    colors = []
    for i in range(len(sublist)):
        for j in range(lenlist[i + 1] - lenlist[i]):
            colors.append(palette[i])

    # plot with bokeh
    output_file(filename)
    source = ColumnDataSource(
        data=dict(x=tsne[:, 0], y=tsne[:, 1],
                  cuisine=df_sub['cuisine'],
                  recipe=df_sub['recipeName']))

    hover = HoverTool(tooltips=[
        ("cuisine", "@cuisine"),
        ("recipe", "@recipe")])

    p = figure(plot_width=1000, plot_height=1000, tools=[hover],
               title="flavor clustering")
    p.circle('x', 'y', size=10, source=source, fill_color=colors)
    show(p)
def buildGraph(data, epsilon=1., metric='euclidean', p=2):
    D = squareform(pdist(data, metric=metric, p=p))
    D[D >= epsilon] = 0.
    G = nx.Graph(D)
    edges = list(map(set, G.edges()))
    weights = [G.get_edge_data(u, v)['weight'] for u, v in G.edges()]
    return G.nodes(), edges, weights
def is_satisfied(self, gcell):
    scale = np.array([[2, 0, 0],
                      [0, 2, 0],
                      [0, 0, 2]])
    super_gcell = gcell.supercell(scale)
    target_cart = super_gcell.get_cartesian(ele=self.target_ele)
    # target_cart is a np array of the target element's
    # cartesian coordinates
    mindist = np.min(pdist(target_cart))
    is_ok = mindist > self.target_dist
    # import pdb
    # pdb.set_trace()
    return is_ok
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    row_linkmat, col_linkmat = None, None
    if row:
        distmat = dist.pdist(df, metric)
        row_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]
    if column:
        df = df.T
        distmat = dist.pdist(df, metric)
        col_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(col_linkmat), :].T
    return df, row_linkmat, col_linkmat
def makeT(self, cp):
    # cp: [(k*k*k) x 3] control points
    # T: [((k*k*k)+4) x ((k*k*k)+4)]
    K = cp.shape[0]
    T = np.zeros((K + 4, K + 4))
    T[:K, 0] = 1
    T[:K, 1:4] = cp
    T[K, 4:] = 1
    T[K + 1:, 4:] = cp.T
    R = squareform(pdist(cp, metric='euclidean'))
    R = R * R
    R[R == 0] = 1  # a trick to make R * log(R) zero at R == 0
    R = R * np.log(R)
    np.fill_diagonal(R, 0)
    T[:K, 4:] = R
    return T
def coherence(U, m):
    Phi = random_phi(m, U.shape[0])
    PU = Phi.dot(U)
    d = distance.pdist(PU.T, 'cosine')
    return abs(1 - d)
def compare_distances(A, B, random_samples=[], s=200, pvalues=False):
    if len(random_samples) == 0:
        random_samples = np.zeros(A.shape[1], dtype=bool)
        random_samples[:min(s, A.shape[1])] = True
        np.random.shuffle(random_samples)
    dist_x = distance.pdist(A[:, random_samples].T, 'euclidean')
    dist_y = distance.pdist(B[:, random_samples].T, 'euclidean')
    pear = pearsonr(dist_x, dist_y)
    spear = spearmanr(dist_x, dist_y)
    if pvalues:
        return pear, spear
    else:
        return pear[0], spear[0]
def n1_fraction_borderline(data):
    def get_n1_for_round(sparse_matrix, y):
        Tcsr = minimum_spanning_tree(sparse_matrix)
        borders = set()
        a = Tcsr.nonzero()[0]
        b = Tcsr.nonzero()[1]
        for i in range(len(a)):
            if y[a[i]] != y[b[i]]:
                borders.add(a[i])
                borders.add(b[i])
        n1 = len(borders)
        return n1

    features = data.columns[:-1]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    sparse_matrix = csr_matrix(df_dist.values)
    labels = data.columns[-1]
    y = data[labels]

    n1 = 0
    rounds = 10
    for round in range(rounds):
        n1 = n1 + get_n1_for_round(sparse_matrix, y)

    n = len(data)
    n1 = (1.0 * n1) / (rounds * n)
    return n1
def n2_ratio_intra_extra_class_nearest_neighbor_distance(data):
    features = data.columns[:-1]
    labels = data.columns[-1]
    dist = pdist(data[features], 'euclidean')
    df_dist = pd.DataFrame(squareform(dist))
    max_size = df_dist.copy()
    max_size.iloc[:, :] = False
    classes = data.iloc[:, -1].unique()
    n = data.shape[0]

    n2 = 0
    cl = 'bla'
    intra_min = 0
    inter_min = 0
    for i in range(data.shape[0]):
        ci = data.iloc[i, -1]
        if ci != cl:
            cl = ci
            intra_idx = data[data[labels] == ci].index.values.tolist()
            inter_idx = data[data[labels] != ci].index.values
        intra_idx.remove(i)
        intra_min = intra_min + df_dist.iloc[intra_idx, i].min()
        inter_min = inter_min + df_dist.iloc[inter_idx, i].min()
        intra_idx.append(i)

    # handle the case of inter_min == 0
    if inter_min == 0:
        inter_min = 1
    n2 = (1.0 * intra_min) / (1.0 * inter_min)
    return n2
def is_behavior_learning_done(self):
    """Check if the optimization is finished.

    Returns
    -------
    finished : bool
        Is the learning of a behavior finished?
    """
    if self.it <= self.n_samples_per_update:
        return False

    if not np.all(np.isfinite(self.fitness)):
        return True

    # Check for invalid values
    if not (np.all(np.isfinite(self.invsqrtC)) and
            np.all(np.isfinite(self.cov)) and
            np.all(np.isfinite(self.mean)) and
            np.isfinite(self.var)):
        self.logger.info("Stopping: infs or nans")
        return True

    if (self.min_variance is not None and
            np.max(np.diag(self.cov)) * self.var <= self.min_variance):
        self.logger.info("Stopping: %g < min_variance" % self.var)
        return True

    max_dist = np.max(pdist(self.fitness[:, np.newaxis]))
    if max_dist < self.min_fitness_dist:
        self.logger.info("Stopping: %g < min_fitness_dist" % max_dist)
        return True

    cov_diag = np.diag(self.cov)
    if (self.max_condition is not None and
            np.max(cov_diag) > self.max_condition * np.min(cov_diag)):
        self.logger.info("Stopping: %g / %g > max_condition"
                         % (np.max(cov_diag), np.min(cov_diag)))
        return True

    return False
def __call__(self):
    if len(self.words) == 0 or len(self.vectors) == 0:
        return []
    distance_matrix = scidist.pdist(np.array(self.vectors), self.metric)
    linkage_matrix = hier.linkage(distance_matrix, self.linkage)
    dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix,
                                                    self.words, self.vectors)
    clusterings = self._create_clusterings(dendrogram)
    return [[(node.label, node.vector) for node in _get_cluster_nodes(cluster)]
            for cluster in self._find_optimal_clustering(clusterings)]
def calculate_fitness(feature_vectors):
    pairwise_euclidean_distances = distance.pdist(feature_vectors, 'euclidean')
    fitness = pairwise_euclidean_distances.mean() + \
        pairwise_euclidean_distances.min()
    return fitness
def merge_candidates_scan(candidates, seriesuid, distance=5.):
    distances = pdist(candidates, metric='euclidean')
    adjacency_matrix = squareform(distances)

    # Determine nodes within distance, replace by 1 (= adjacency matrix)
    adjacency_matrix = np.where(adjacency_matrix <= distance, 1, 0)

    # Determine all connected components in the graph
    n, labels = connected_components(adjacency_matrix)
    new_candidates = np.zeros((n, 3))

    # Take the mean for these connected components
    for cluster_i in range(n):
        points = candidates[np.where(labels == cluster_i)]
        center = np.mean(points, axis=0)
        new_candidates[cluster_i, :] = center

    x = new_candidates[:, 0]
    y = new_candidates[:, 1]
    z = new_candidates[:, 2]
    labels = [seriesuid] * len(x)
    class_name = [0] * len(x)

    data = list(zip(labels, x, y, z, class_name))
    new_candidates = pd.DataFrame(data, columns=CANDIDATES_COLUMNS)

    return new_candidates
def precompute_kernels(self, q):
    """
    Returns a tuple of kernel, kernel', kernel'' matrices at position q.
    """
    x = q.reshape((self.npoints, self.dimension))
    dists = squareform(pdist(x, 'sqeuclidean'))
    K = exp(-dists / (2 * self.kernel_scale ** 2))
    return (K,
            -K / (2 * self.kernel_scale ** 2),
            K / (4 * self.kernel_scale ** 4))
def dq_Kqp_a(self, q, p, a, kernels):
    """
    Useful for the adjoint integration scheme.
    d_q (K_q p) . a = ...
    """
    h = 1e-8
    Q0phA = q + h * a
    Q0mhA = q - h * a
    update_emp = (Landmarks.K(self, Q0phA, p,
                              Landmarks.precompute_kernels(self, Q0phA))
                  - Landmarks.K(self, Q0mhA, p,
                                Landmarks.precompute_kernels(self, Q0mhA))) / (2 * h)
    return update_emp
    # Retained analytic alternative from the original source:
    """x = q.reshape((self.npoints, self.dimension))
    p = p.reshape((self.npoints, self.dimension))
    a = a.reshape((self.npoints, self.dimension))
    dists = squareform(pdist(x, 'sqeuclidean'))  # dists_ij = |x_i - x_j|^2
    # We have:
    # [K_q p]_nd = sum_j { k(|x_n - x_j|^2) * p_j^d }
    #
    # So that:
    # grad_nd = a_nd * sum_j { 2 * (x_n^d - x_j^d) * k'(|x_n - x_j|^2) * p_j^d }
    grad = zeros((self.npoints, self.dimension))
    for d in range(self.dimension):
        diffs = atleast_2d(x[:, d]).T - x[:, d]  # diffs_ij = x_i^d - x_j^d
        # K_ij = 2 * (x_i^d - x_j^d) * k'(|x_i - x_j|^2) * p_j^d
        K = 2 * dists * kernels[1] * p[:, d]
        grad[:, d] = a[:, d] * sum(K, 1)
    return grad.reshape((self.npoints * self.dimension,))"""
def getPairsFast(d, type):
    X = []
    T = []
    pairs = []
    for i in range(len(d)):
        (p1, p2) = d[i]
        X.append(p1.representation)
        X.append(p2.representation)
        T.append(p1)
        T.append(p2)

    arr = pdist(X, 'cosine')
    arr = squareform(arr)
    for i in range(len(arr)):
        arr[i, i] = 1
        if i % 2 == 0:
            arr[i, i + 1] = 1
        else:
            arr[i, i - 1] = 1
    arr = np.argmin(arr, axis=1)

    for i in range(len(d)):
        (t1, t2) = d[i]
        p1 = None
        p2 = None
        if type == "MAX":
            p1 = T[arr[2 * i]]
            p2 = T[arr[2 * i + 1]]
        if type == "RAND":
            p1 = getPairRand(d, i)
            p2 = getPairRand(d, i)
        if type == "MIX":
            p1 = getPairMixScore(d, i, T[arr[2 * i]])
            p2 = getPairMixScore(d, i, T[arr[2 * i + 1]])
        pairs.append((p1, p2))
    return pairs
def cao_juan_2009(topic_term_dists, num_topics):
    cos_pdists = squareform(pdist(topic_term_dists, metric='cosine'))
    return np.sum(cos_pdists) / (num_topics * (num_topics - 1) / 2)
def deveaud_2014(topic_term_dists, num_topics):
    jsd_pdists = squareform(pdist(topic_term_dists, metric=jensen_shannon))
    return np.sum(jsd_pdists) / (num_topics * (num_topics - 1))
def check_embed_match(X_embed1, X_embed2):
    """
    Check whether the two embeddings are almost the same by computing their
    normalized euclidean distances in the embedding space and checking the
    correlation.

    Inputs:
        - X_embed1, X_embed2: two Nxd matrices with coordinates in the
          embedding space

    Returns:
        - r: Pearson correlation coefficient between the normalized
          distances of the points
    """
    D_emb1 = pdist(X_embed1, 'euclidean')
    D_emb2 = pdist(X_embed2, 'euclidean')
    D_emb1 /= D_emb1.max()
    D_emb2 /= D_emb2.max()
    return np.corrcoef(D_emb1, D_emb2)[0, 1]
def median_heuristic(y):
    """ Estimate RBF bandwidth using the median heuristic.

    Parameters
    ----------
    y : (number of samples, dimension)-ndarray
        One row of y corresponds to one sample.

    Returns
    -------
    bandwidth : float
        Estimated RBF bandwidth.
    """
    num_of_samples = y.shape[0]  # number of samples
    # if y contains more samples, it is subsampled to this cardinality
    num_of_samples_used = 100

    # subsample y (if necessary; select 100 random rows of y):
    if num_of_samples > num_of_samples_used:
        idx = choice(num_of_samples, num_of_samples_used, replace=False)
        y = y[idx]  # broadcasting

    dist_vector = pdist(y)  # pairwise Euclidean distances
    bandwidth = median(dist_vector) / sqrt(2)
    return bandwidth
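A short usage sketch for the median heuristic above; the imports mirror the names the function body uses (numpy's choice, median, sqrt), and the final RBF kernel line is an assumed downstream use added for illustration, not part of the original:

import numpy as np
from numpy import median, sqrt
from numpy.random import choice
from scipy.spatial.distance import pdist, squareform

y = np.random.randn(500, 3)   # 500 samples, 3 dimensions
sigma = median_heuristic(y)   # estimated RBF bandwidth

# Example downstream use: plug the bandwidth into an RBF kernel matrix.
K = np.exp(-squareform(pdist(y, 'sqeuclidean')) / (2 * sigma ** 2))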