The following 21 code examples, extracted from open-source Python projects, illustrate how to use the scipy.spatial.distance module.
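Before the extracted snippets, here is a minimal self-contained sketch (illustrative data and variable names only, not taken from any of the projects below) of the calls that recur throughout: pdist, squareform, cdist, and the single-pair metrics.

import numpy as np
from scipy.spatial import distance

X = np.random.rand(5, 3)   # 5 points in 3-D (toy data)
Y = np.random.rand(4, 3)   # 4 more points

d_condensed = distance.pdist(X, 'euclidean')   # condensed vector of the 5*4/2 = 10 pairs i < j
D = distance.squareform(d_condensed)           # symmetric (5, 5) matrix with a zero diagonal
D_xy = distance.cdist(X, Y, 'euclidean')       # rectangular (5, 4) cross-distance matrix
d_cos = distance.cosine(X[0], X[1])            # single-pair metric: 1 - cosine similarity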
def computeValDataDistanceMatrix(self):
    # TODO: use self.computeDescriptors(self.valdataDB) ?
    batchSize = self.cfgParams.batch_size
    nSamp = self.n_val_batches * batchSize
    descr = numpy.zeros((nSamp, self.descrNet.cfgParams.outputDim[1]))
    for i in range(self.n_val_batches):
        # if self.macroBatchSize > 0:
        #     self.setMacroBatchData(self.valdataDB, numpy.floor(i / self.macroBatchSize).astype(numpy.int))
        #     miniBatchIdx = numpy.mod(i, self.macroBatchSize)
        # else:
        #     miniBatchIdx = i
        miniBatchIdx = self.dataManager.makeMinibatchAvailable(self.valdataDB, i)
        d = self.tfComputeDescr(miniBatchIdx)
        descr[i*batchSize:(i+1)*batchSize] = d
    dst = scipy.spatial.distance.pdist(descr, 'euclidean')
    dst = scipy.spatial.distance.squareform(dst)
    return dst
def computeDistanceMatrix(self, test_set):
    batch_size = self.cfgParams.batch_size
    nSamp = test_set.numSamples
    descrLen = self.descrNet.cfgParams.outputDim[1]
    descr = numpy.zeros((nSamp, descrLen))
    n_test_batches = nSamp // batch_size  # integer division so range() gets an int
    for i in range(n_test_batches):
        # if self.macroBatchSize > 0:
        #     self.setMacroBatchData(test_set, numpy.floor(i / self.macroBatchSize).astype(numpy.int))
        #     miniBatchIdx = numpy.mod(i, self.macroBatchSize)
        # else:
        #     miniBatchIdx = i
        miniBatchIdx = self.dataManager.makeMinibatchAvailable(test_set, i)
        d = self.tfComputeDescr(miniBatchIdx)
        descr[i*batch_size:(i+1)*batch_size] = d
    print("distances done")
    dst = scipy.spatial.distance.pdist(descr, 'euclidean')
    dst = scipy.spatial.distance.squareform(dst)
    return dst
def fit(self, feat):
    # Compute affinity matrix using RBF kernel on pair-wise distances
    affinity = scipy.spatial.distance.pdist(np.array([f for id, f in feat]))
    sigma = -2 * np.var(affinity)
    affinity = np.exp(scipy.spatial.distance.squareform(affinity) / sigma)

    # Recursive clustering
    self.tree = {'depth': 0, 'height': 0, 'size': 0, 'leafs': 1, 'children': [],
                 'parent': None, 'items': feat, 'affinity': affinity}
    queue = []
    heapq.heappush(queue, (-1 * len(self.tree['items']), np.random.rand(), self.tree))
    while (self.tree['leafs'] < self.max_clusters) and (len(queue) > 0):
        if len(queue[0][2]['items']) <= self.min_cluster_size:
            break
        left, right, ncut_value = self.split(heapq.heappop(queue)[2])
        if ncut_value > self.T:
            break
        if (left is not None) and (right is not None):
            heapq.heappush(queue, (-1 * len(left['items']), np.random.rand(), left))
            heapq.heappush(queue, (-1 * len(right['items']), np.random.rand(), right))
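The only scipy.spatial.distance-specific part of fit() is the affinity construction; a minimal sketch of that step on toy data (array names here are illustrative, not from the project) shows the shapes involved:

import numpy as np
import scipy.spatial.distance

feat = np.random.rand(6, 10)                    # 6 items with 10-D features (toy data)
d = scipy.spatial.distance.pdist(feat)          # condensed Euclidean distances, shape (15,)
sigma = -2 * np.var(d)
affinity = np.exp(scipy.spatial.distance.squareform(d) / sigma)   # (6, 6) matrix, ones on the diagonal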
def get_score_funcs():
    """Helper to get the score functions"""
    from scipy import stats
    from scipy.spatial import distance
    score_funcs = Bunch()
    xy_arg_dist_funcs = [(n, f) for n, f in vars(distance).items()
                         if isfunction(f) and not n.startswith('_')]
    xy_arg_stats_funcs = [(n, f) for n, f in vars(stats).items()
                          if isfunction(f) and not n.startswith('_')]
    score_funcs.update(dict((n, _make_xy_sfunc(f))
                            for n, f in xy_arg_dist_funcs
                            if _get_args(f) == ['u', 'v']))
    score_funcs.update(dict((n, _make_xy_sfunc(f, ndim_output=True))
                            for n, f in xy_arg_stats_funcs
                            if _get_args(f) == ['x', 'y']))
    return score_funcs
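get_score_funcs() relies on the single-pair metrics in scipy.spatial.distance all taking (u, v) as their leading arguments. A self-contained sketch of roughly the same filter, using inspect.signature instead of the module-private _get_args helper used above:

import inspect
from scipy.spatial import distance

# Public functions in scipy.spatial.distance whose first two parameters are (u, v),
# i.e. the single-pair metrics the code above wraps into score functions.
uv_funcs = sorted(name for name, f in vars(distance).items()
                  if inspect.isfunction(f) and not name.startswith('_')
                  and list(inspect.signature(f).parameters)[:2] == ['u', 'v'])
print(uv_funcs)   # e.g. includes 'braycurtis', 'canberra', 'cityblock', 'cosine', 'euclidean', ...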
def thrEstimation(self):
    x = 0.00
    dx = 0.05
    countsList = []
    x_list = []
    while x < 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        countsList.append(counter[Best])
        x += dx
        x_list.append(x)
    dy = np.diff(countsList)
    for a, b in zip(x_list, dy):
        if b == max(dy):
            return a
def distancematrix(data, ydata=None):
    if ydata is None:
        distances = scipy.spatial.distance.pdist(data, 'euclidean')
    else:
        distances = scipy.spatial.distance.cdist(data, ydata, 'euclidean')
    return distances.astype(numpy.float32)
def batch_pdist(data_slice):
    # Each data_slice has tuples consisting of two points that we need to
    # find the great circle distance between and their weight:
    partial_sum = 0
    for X, Y, weights in data_slice:
        dist = np.array([])
        zipped = zip(X, Y)
        for x, y in zipped:
            dist = np.append(dist, great_circle(x, y).km)
        partial_sum += np.sum(weights * dist)
    return partial_sum
    # return 10
def mean_pairwise_distance(X, weights=None, n_jobs=None, axis=0):
    """Function that returns the sum and mean of the pairwise distances of a
    2D array X.

    Required arguments:
    X       -- 2D array of points.

    Optional arguments:
    weights -- 1D array of counts or weights per point in X (default: 1s).
    n_jobs  -- Number of cores to use for calculation (default: all).
    axis    -- The axis of X corresponding to data elements (default: 0).
    """
    N = X.shape[axis]
    if weights is None:
        weights = np.ones((N,))
    if n_jobs is None:
        n_jobs = min(mp.cpu_count(), N)
    # Get the pairs and their weights to calculate the distances without
    # needing the whole of X, split it into roughly equal sub-arrays per cpu:
    pairs_split = np.array_split([(X[i:], X[:N - i], weights[i:] * weights[:N - i])
                                  for i in range(1, N)], n_jobs, axis=axis)
    # Create a pool for each cpu to send the batch_pdist function to each split.
    # Then, close the pool and wait for jobs to complete before continuing:
    pool = mp.Pool(processes=n_jobs)
    queue_sum = sum(pool.map(batch_pdist, pairs_split, chunksize=N // n_jobs))
    pool.close()
    pool.join()
    N = weights.sum()
    # Compute the number of combinations, add to the number of unique pairs
    # and use that as the denominator to calculate the mean pairwise distance:
    mean = queue_sum / (N * (N - 1.0) / 2.0)
    # If you do not want to include distance from an item to itself use:
    # mean = queue_sum / (((N - 1)**2 + (N + 1)) / 2.0)
    return queue_sum, mean
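For small inputs the parallel reduction can be sanity-checked against a direct computation. The following sketch uses plain Euclidean distances via scipy.spatial.distance.pdist on toy data (the code above uses great-circle distances from geopy, so the values differ, but the weighted-pair arithmetic is the same):

import numpy as np
from scipy.spatial import distance

X = np.random.rand(6, 2)           # toy points
w = np.random.randint(1, 4, 6)     # toy integer weights

i, j = np.triu_indices(len(X), 1)  # all pairs i < j, in the same order as pdist
pair_d = distance.pdist(X)
queue_sum = np.sum(w[i] * w[j] * pair_d)
W = w.sum()
mean = queue_sum / (W * (W - 1.0) / 2.0)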
def distance(a, b):
    """Slow version of ``add`` to simulate work"""
    return np.sum(np.sqrt(np.sum((a - b)**2, axis=1)))


# Parallel:
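For reference, the expression above is just the sum of paired Euclidean distances between corresponding rows; a quick equivalence check on toy arrays (np.linalg.norm is used here only for comparison):

import numpy as np

a = np.random.rand(6, 3)
b = np.random.rand(6, 3)
assert np.isclose(np.sum(np.sqrt(np.sum((a - b)**2, axis=1))),
                  np.linalg.norm(a - b, axis=1).sum())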
def checkFiltersDist(descrNet):
    wvals = descrNet.layer0.W.get_value()
    wvals = wvals.reshape((wvals.shape[0], numpy.prod(wvals.shape[1:])))
    dst = scipy.spatial.distance.pdist(wvals, 'cosine')
    dst = scipy.spatial.distance.squareform(dst)
    showDistanceMatrix(dst)
def split(self, node):
    # Perform normalized cut
    try:
        ind = SpectralClustering(2, affinity='precomputed',
                                 assign_labels='discretize').fit_predict(node['affinity'])
    except KeyboardInterrupt:
        raise
    except:
        return None, None, 0

    # Create left and right node
    mask1, mask2 = (ind == 0), (ind == 1)
    if not (np.any(mask1) and np.any(mask2)):
        return None, None, 0
    left = {'depth': node['depth'] + 1, 'height': 0, 'size': 0, 'leafs': 1,
            'children': [], 'parent': node,
            'items': [f for i, f in enumerate(node['items']) if ind[i] == 0],
            'affinity': node['affinity'][np.ix_(mask1, mask1)]}
    right = {'depth': node['depth'] + 1, 'height': 0, 'size': 0, 'leafs': 1,
             'children': [], 'parent': node,
             'items': [f for i, f in enumerate(node['items']) if ind[i] == 1],
             'affinity': node['affinity'][np.ix_(mask2, mask2)]}

    # Force the node with the lower minimum distance to the query to be the left node
    if ind[0] == 1:
        # items are already sorted when passed to fit(), so we just need to look
        # at the first item instead of re-computing all distances
        left, right = right, left

    # Modify parent
    node['children'] = [left, right]

    # Modify parent chain
    parent = node
    while parent is not None:
        parent['height'] += 1
        parent['size'] += 2
        parent['leafs'] += 1
        parent = parent['parent']

    return left, right, self.ncut_value(node['affinity'], ind)
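split() hands a precomputed affinity matrix (built in fit() above from scipy.spatial.distance.pdist) to scikit-learn. A self-contained sketch of that cut on toy data, with an illustrative affinity rather than the one from fit():

import numpy as np
import scipy.spatial.distance
from sklearn.cluster import SpectralClustering

# Two well-separated toy blobs.
X = np.vstack([np.random.rand(5, 2), np.random.rand(5, 2) + 5])
d = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X))
affinity = np.exp(-d / d.mean())

ind = SpectralClustering(2, affinity='precomputed',
                         assign_labels='discretize').fit_predict(affinity)
print(ind)   # e.g. [0 0 0 0 0 1 1 1 1 1] (cluster labels may be swapped)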
def cosine_similarity(repr1, repr2):
    """Calculates cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity)."""
    if repr1 is None or repr2 is None:
        return 0
    assert not (np.isnan(repr2).any() or np.isinf(repr2).any())
    assert not (np.isnan(repr1).any() or np.isinf(repr1).any())
    sim = 1 - scipy.spatial.distance.cosine(repr1, repr2)
    if np.isnan(sim):
        # the similarity is nan if no term in the document is in the vocabulary
        return 0
    return sim
def euclidean_distance(repr1, repr2):
    """Calculates Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance)."""
    sim = np.sqrt(np.sum([np.power(p - q, 2) for (p, q) in zip(repr1, repr2)]))
    return sim
def variational_distance(repr1, repr2):
    """Also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry)."""
    sim = np.sum([np.abs(p - q) for (p, q) in zip(repr1, repr2)])
    return sim
def bhattacharyya_distance(repr1, repr2):
    """Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance)."""
    sim = -np.log(np.sum([np.sqrt(p * q) for (p, q) in zip(repr1, repr2)]))
    assert not np.isnan(sim), 'Error: Similarity is nan.'
    if np.isinf(sim):
        # the similarity is inf if no term in the review is in the vocabulary
        return 0
    return sim
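Assuming 1-D numpy inputs, the hand-rolled measures above overlap with ready-made metrics: euclidean_distance and variational_distance match scipy.spatial.distance.euclidean and scipy.spatial.distance.cityblock, and cosine_similarity is 1 - scipy.spatial.distance.cosine. A short check on toy vectors:

import numpy as np
import scipy.spatial.distance

p = np.array([0.1, 0.4, 0.5])
q = np.array([0.3, 0.3, 0.4])

print(scipy.spatial.distance.euclidean(p, q))     # same value as euclidean_distance(p, q)
print(scipy.spatial.distance.cityblock(p, q))     # same value as variational_distance(p, q)
print(1 - scipy.spatial.distance.cosine(p, q))    # same value as cosine_similarity(p, q)
# bhattacharyya_distance has no direct counterpart in scipy.spatial.distance.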
def createLabels(self):
    self.labelList = []
    with open(self.ccFile) as f:
        for line in f:
            if line.strip() == 'Labels':
                break
        for line in f:
            if line.strip() == 'Correlation coefficients':
                break
            goodLine = line.split()
            self.labelList.append("%s" % (goodLine[2].strip('\n')))
    return self.labelList

# changed, now the distance is defined directly by ccCalc
def checkMultiplicity(self, thr):
    FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
    counter = collections.Counter(FlatC)
    Best = max(counter.items(), key=operator.itemgetter(1))[0]
    print('You are clustering with a threshold of %s' % (thr))
    print('The biggest cluster contains %s datasets from a total of %s' % (counter[Best], len(self.labelList)))
def completenessEstimation(self):
    x = 0.00
    dx = 0.05
    while x > 1:
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
def minimalForCompleteness(self):
    print("Running estimator for minimal threshold for completeness")
    labels = self.createLabels()
    x = 0.00
    dx = 0.05
    countsList = {}
    x_list = []
    while x < 1:
        Arrays = {}
        FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
        counter = collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        toProcess = [Best]
        y = 0
        for cluster, filename in zip(FlatC, labels):
            if cluster in toProcess:
                hklFile = any_reflection_file(filename)
                b = hklFile.as_miller_arrays()
                for column in b:
                    if column.is_xray_intensity_array():
                        Arrays[y] = column
                        break
                y += 1
        try:
            Arr = Arrays[0]
        except:
            countsList[x] = 0
        for label in range(1, y):
            try:
                Arr = Arr.concatenate(Arrays[label])
            except:
                pass
        countsList[x] = (Arr.completeness())
        x += dx

    # return minimal threshold that reaches (near-)full completeness
    L = []
    for key in countsList:
        if countsList[key] > 0.98:
            L.append(key)
    L.sort()
    return L[0]
def createDendrogram(self):
    X = hierarchy.dendrogram(self.Tree, color_threshold=self.threshold)
    # self.textOutput.append('Plotted Dendrogram. Colored at a %s threshold for distance'%(threshold))
    self.TreeCanvas.draw()
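The clustering helpers above all assume self.Tree is a SciPy linkage matrix. A self-contained sketch of that pipeline, with toy data standing in for the correlation coefficients read from ccFile:

import collections
import operator
import numpy as np
import scipy.spatial.distance
from scipy.cluster import hierarchy

X = np.random.rand(8, 4)                                      # toy data
dst = scipy.spatial.distance.pdist(X, 'euclidean')            # condensed distance matrix
Tree = hierarchy.linkage(dst, method='average')               # what the class stores as self.Tree

FlatC = hierarchy.fcluster(Tree, 0.5, criterion='distance')   # flat clusters at threshold 0.5
counter = collections.Counter(FlatC)
Best = max(counter.items(), key=operator.itemgetter(1))[0]    # label of the biggest cluster

hierarchy.dendrogram(Tree, color_threshold=0.5)               # plotting requires matplotlib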
def mineHardNegativeTrainingPairsWithinMiniBatches(self):
    dnParams = self.descrNet.cfgParams
    batch_size = self.cfgParams.batch_size
    pairIdx = self.tvPairIdx
    # pairLabels = self.tvPairLabels
    y = self.tvY
    margin = self.pair_neg_margin

    diff = self.descrNet.output[pairIdx[:, 0]] - self.descrNet.output[pairIdx[:, 1]]
    dst = T.sum(diff**2, axis=1) / dnParams.outputDim[1]  # divide by number of outputs, such that the max distance is 1
    pairLabels = T.eq(y[pairIdx[:, 0]], y[pairIdx[:, 1]])  # same class / different class ?
    pair_cost = pairLabels*dst + (1-pairLabels)*T.sqr(T.maximum(0, margin - T.sqrt(dst)))

    # indices for all pairs of vectors in the minibatch
    pidx1, pidx2 = numpy.triu_indices(batch_size, 1)  # numpy.mask_indices(batch_size, numpy.triu, 1)
    pidx1 = pidx1.reshape((len(pidx1), 1))
    pidx2 = pidx2.reshape((len(pidx2), 1))
    comb_pairIdx = numpy.concatenate((pidx1, pidx2), axis=1).astype(numpy.int32)

    dm = self.dataManager
    if isinstance(self.tvX, list):
        givens = {tv: data[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size]
                  for (tv, data) in zip(self.tvX, dm.tvsData_x)}
    else:
        givens = {self.tvX: dm.tvsData_x[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size]}
    givens[self.y] = dm.tvsData_y[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size]
    givens[pairIdx] = comb_pairIdx
    tf = theano.function(inputs=[self.tvIndex], outputs=[pair_cost], givens=givens)

    # for every sample get the index of the other sample with which together
    # it forms the most expensive (highest cost) pair
    nSamp = self.n_train_batches * batch_size
    idx = numpy.zeros(nSamp, dtype=numpy.int32)
    labels = numpy.zeros(nSamp, dtype=numpy.int32)
    for i in range(self.n_train_batches):
        # if self.macroBatchSize > 0:
        #     self.setMacroBatchData(self.traindataDB, numpy.floor(i / self.macroBatchSize).astype(numpy.int))
        #     miniBatchIdx = numpy.mod(i, self.macroBatchSize)
        # else:
        #     miniBatchIdx = i
        miniBatchIdx = self.dataManager.makeMinibatchAvailable(self.traindataDB, i)
        c = tf(miniBatchIdx)
        c = scipy.spatial.distance.squareform(c[0])
        # find the max for each
        offset = i * batch_size
        maxIdx = numpy.argmax(c, axis=0) + offset
        idx[i*batch_size:(i+1)*batch_size] = maxIdx
        labels[i*batch_size:(i+1)*batch_size] = self.traindataDB.y[maxIdx] == self.traindataDB.y[i*batch_size:(i+1)*batch_size]
        # print(c)

    idx = numpy.concatenate((numpy.arange(nSamp, dtype=numpy.int32).reshape(nSamp, 1),
                             idx.reshape(nSamp, 1)), axis=1)
    return idx, labels
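The scipy.spatial.distance piece of the miner above is the squareform call: it accepts any condensed vector of per-pair values ordered like numpy.triu_indices(batch_size, 1), not only distances, and expands it into a symmetric matrix so the most expensive partner of each sample can be read off per column. A minimal sketch with toy costs:

import numpy as np
import scipy.spatial.distance

batch_size = 4
# Toy costs for all pairs i < j within one minibatch, in triu_indices order.
pair_cost = np.random.rand(batch_size * (batch_size - 1) // 2)

c = scipy.spatial.distance.squareform(pair_cost)   # (4, 4) symmetric matrix, zero diagonal
hardest = np.argmax(c, axis=0)                     # for each sample, its most expensive partner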