The following 22 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.pairwise.euclidean_distances().
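Before the project examples, a minimal self-contained sketch of the function itself (the toy arrays are made up for illustration): euclidean_distances(X) returns the pairwise distances between the rows of X, euclidean_distances(X, Y) the distances between rows of X and rows of Y, and squared=True skips the final square root.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Toy data: three points and two query points in 2-D.
X = np.array([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
Y = np.array([[1.0, 0.0], [0.0, 1.0]])

print(euclidean_distances(X))                   # (3, 3) pairwise distances within X
print(euclidean_distances(X, Y))                # (3, 2) distances between X and Y
print(euclidean_distances(X, Y, squared=True))  # squared distances, cheaper to compute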
def predict(self, X): """Predict ranking values for new data. Parameters ---------- X : array, shape (n_test, n_features) Test data Returns ------- y : array, shape (n_test,) Ranking values """ n_features = X.shape[1] if self.n_features != n_features: raise ValueError("Expected %d dimensions, got %d" % (self.n_features, n_features)) K = euclidean_distances(self.X, X, squared=True) K /= self.denom np.exp(K, K) return np.sum(self.alpha[:, np.newaxis] * (K[:-1] - K[1:]), axis=0)
def transform(self, X): """ Transform X into subcluster centroids dimension. Each dimension represents the distance from the sample point to each cluster centroid. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data. Returns ------- X_trans : {array-like, sparse matrix}, shape (n_samples, n_clusters) Transformed data. """ check_is_fitted(self, 'subcluster_centers_') return euclidean_distances(X, self.subcluster_centers_)
def _select_target_neighbors(self):
    """Find the target neighbors of each sample, which stay fixed during training.

    Returns
    -------
    array_like
        An array of neighbors indices for each sample with shape
        (n_samples, n_neighbors).
    """
    self.logger.info('Finding target neighbors...')
    target_neighbors = np.empty((self.X_.shape[0], self.n_neighbors_), dtype=int)
    for class_ in self.classes_:
        class_ind, = np.where(np.equal(self.y_, class_))
        dist = euclidean_distances(self.X_[class_ind], squared=True)
        np.fill_diagonal(dist, np.inf)
        neigh_ind = np.argpartition(dist, self.n_neighbors_ - 1, axis=1)
        neigh_ind = neigh_ind[:, :self.n_neighbors_]
        # argpartition doesn't guarantee sorted order, so we sort again
        # but only the k neighbors
        row_ind = np.arange(len(class_ind))[:, None]
        neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
        target_neighbors[class_ind] = class_ind[neigh_ind]

    return target_neighbors
def eta_L2(self):
    # Note that V should be positive
    return self.V * np.sum(euclidean_distances(self.eta, squared=True))
def closest_label(X, labels, vec, dist='cosine', ooc_only=False, top=10):
    if dist == 'euclidean':
        # Negate the distances so that, as with cosine similarity,
        # larger values mean closer points.
        sim = -euclidean_distances(X, vec.reshape(1, -1))
    elif dist == 'cosine':
        sim = cosine_similarity(X, vec.reshape(1, -1))
    else:
        raise NotImplementedError('dist must be euclidean or cosine')
    # get the indices of the `top` most similar rows, most similar first
    indices = sim.argsort(axis=0)[-top:][::-1]
    words = []
    for i in indices:
        words.append(labels[i[0]])
    return " ".join(words)
def compare_distances(self, train_img, cluster):
    # Sometimes the SIFT algorithm matches random points on screen, so it
    # is necessary to determine the euclidean distances between these points.
    distances = euclidean_distances([self.kmeans.cluster_centers_[0]], cluster)
    height, width = train_img.shape
    new_cluster = []

    # A point farther than np.sqrt((width / 2) ** 2 + (height / 2) ** 2)
    # from the cluster center is assumed to be an incorrect match.
    # This only works on images that fit the same dimensions as the
    # query image.
    for index, distance in enumerate(distances[0]):
        if distance <= np.sqrt((width / 2) ** 2 + (height / 2) ** 2):
            new_cluster.append(cluster[index])

    return new_cluster
def fit(self, X): """Fit ranking SVM. Parameters ---------- X : array, shape (n_samples, n_features) Training data, sorted, highest rank first """ self.n_samples, self.n_features = X.shape self.n_alpha = self.n_samples - 1 self.X = X if self.n_samples < 2: raise ValueError("Expected at least 2 training samples, got %d" % self.n_samples) random_state = check_random_state(self.random_state) n_iter = self.n_iter if n_iter < 0: n_iter = int(50000 * np.sqrt(self.n_features)) K = euclidean_distances(self.X, squared=True) # Average distance between training data sigma = np.sqrt(K).sum() / ((self.n_samples - 1) * self.n_samples) sigma *= self.c_sigma self.denom = -np.maximum(2.0 * sigma ** 2, MACHINE_EPSILON) K /= self.denom np.exp(K, K) # Constraint violation cost Ci = np.linspace(self.n_alpha, 1, self.n_alpha) ** self.c_pow Ci *= 10 ** self.c_base # Optimize alpha parameters self.alpha = optimize(Ci, K, 1.0, n_iter, random_state) return self
def generateClustering(self, assignment_proba, centroids,
                       drop_annotated_instances=False, cluster_labels=None):
    self.clusters = [Cluster() for x in range(self.num_clusters)]
    if cluster_labels is not None:
        for x in range(self.num_clusters):
            self.clusters[x].label = cluster_labels[x]
    ids = self.instances.getIds()
    for i in range(len(ids)):
        instance_id = ids[i]
        annotated = self.instances.isAnnotated(instance_id)
        c = self.assigned_clusters[i]
        proba = None
        if assignment_proba is not None:
            proba = assignment_proba[i, :]
        label = self.instances.getLabel(instance_id)
        family = self.instances.getFamily(instance_id)
        if centroids is not None:
            # Reshape to avoid a warning from euclidean_distances,
            # which does not take 1D arrays as input.
            centroid = centroids[c].reshape(1, -1)
            features = self.instances.getInstance(instance_id).reshape(1, -1)
            distance = euclidean_distances(centroid, features)[0][0]
        else:
            distance = None
        self.clusters[c].addInstance(instance_id, distance, label, family, annotated)
    unknown_cluster_id = 0
    for c in range(self.num_clusters):
        unknown_cluster_id = self.clusters[c].finalComputation(unknown_cluster_id)
def substract_picks(self, path):
    oldpicks = self._picks.copy()
    with open(path, 'r') as f:
        regions = yaml.load(f)
        self._picks = regions['Centers']
        diameter = regions['Diameter']

    x_cord = np.array([_[0] for _ in self._picks])
    y_cord = np.array([_[1] for _ in self._picks])
    x_cord_old = np.array([_[0] for _ in oldpicks])
    y_cord_old = np.array([_[1] for _ in oldpicks])

    # Mark an old pick for removal if any new pick lies within half a
    # diameter of it.
    distances = np.sum(
        (euclidean_distances(oldpicks, self._picks) < diameter / 2) * 1,
        axis=1) >= 1
    filtered_list = [i for (i, v) in zip(oldpicks, distances) if not v]

    x_cord_new = np.array([_[0] for _ in filtered_list])
    y_cord_new = np.array([_[1] for _ in filtered_list])

    output = False
    if output:
        fig1 = plt.figure()
        plt.title('Old picks and new picks')
        plt.scatter(x_cord, -y_cord, c='r', label='Newpicks')
        plt.scatter(x_cord_old, -y_cord_old, c='b', label='Oldpicks')
        plt.scatter(x_cord_new, -y_cord_new, c='g', label='Picks to keep')
        fig1.show()

    self._picks = filtered_list

    self.update_pick_info_short()
    self.window.tools_settings_dialog.pick_diameter.setValue(regions['Diameter'])
    self.update_scene(picks_only=True)
def compute(self):
    """
    Compute distance matrix.

    Returns
    -------
    D: array, shape = [m, n]
        Squared Euclidean distance matrix.
    """
    return euclidean_distances(self.X, self.Y, squared=True)
def first_periodic_kernel(X, Y=None, gamma=None, period=None):
    # TODO: Add mathematical form of the kernel in the docstring
    """Compute the first periodic kernel between *X* and *Y*.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float, default None
        If None, default to 0.8.

    period : float, default None
        If None, default to 2 * pi. This parameter should not be left at
        its default, as a wrong estimate leads to a poor learning score.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 0.8

    if period is None:
        period = 2. * pi

    a = -log(gamma) / period
    b = 2 * pi / period
    c = sqrt(pi / a) * (exp(- b ** 2 / (4 * a)) + 1)

    K = euclidean_distances(X, Y, squared=True)
    # TODO: Optimize to avoid temporary?
    return exp(-a * K) * (1 + cos(b * sqrt(K))) / c
def fit(self, X, y):
    eucl = euclidean_distances(X)
    k = self.k
    # Increase k until the k-nearest-neighbor similarity graph is connected.
    while True:
        simi_m = 1 / (1 + eucl)
        to_remove = simi_m.shape[0] - (k + 1)
        for vec in simi_m:
            vec[vec.argsort()[:to_remove]] = 0
        g = Graph.Weighted_Adjacency(simi_m.tolist(), mode=ADJ_UNDIRECTED,
                                     loops=False)
        if g.is_connected():
            break
        k += 1
    self.k = k

    comm = g.community_multilevel()
    self.y_comm = np.array(comm.membership)
    self.y = y
    self.X = X

    # Map each community to the most common class label among its members.
    self.mapping = {}
    for c in list(set(comm.membership)):
        com_clas = self.y[self.y_comm == c]
        self.mapping[c] = Counter(com_clas).most_common(1)[0][0]
def predict(self, X):
    y_pred = []
    for x in X:
        dists = euclidean_distances([x], self.X)[0]
        simi_m = 1 / (1 + dists)
        nearest_com = self.y_comm[simi_m.argsort()[-self.k:]]
        y_pred.append(self.mapping[Counter(nearest_com).most_common(1)[0][0]])
    return np.array(y_pred)
def dist_matrices(X1, X2, criterion='euclidean'):
    X1loc = np.array(X1)
    X2loc = np.array(X2)

    if len(X1loc.shape) == 1:
        if len(X2loc.shape) == 1:
            if X1loc.shape[0] == X2loc.shape[0]:
                # As row vectors
                X1loc = X1loc.reshape(1, -1)
                X2loc = X2loc.reshape(1, -1)
            else:
                # As column vectors
                X1loc = X1loc.reshape(-1, 1)
                X2loc = X2loc.reshape(-1, 1)
        else:
            if X1loc.shape[0] == X2loc.shape[1]:
                # Row vector VS. many rows
                X1loc = X1loc.reshape(1, -1)
            elif X2loc.shape[1] == 1:
                # Column vector VS. column vector
                X1loc = X1loc.reshape(-1, 1)
            elif X1loc.shape[0] == X2loc.shape[0]:
                # Row vector VS. transposed columns
                X1loc = X1loc.reshape(1, -1)
                X2loc = X2loc.transpose()
            else:
                raise ValueError('Invalid dimensions of X1 and X2')
    elif len(X2loc.shape) == 1:
        if X2loc.shape[0] == X1loc.shape[1]:
            # Many rows VS. row vector
            X2loc = X2loc.reshape(1, -1)
        else:
            raise ValueError('Invalid dimensions of X1 and X2')

    if criterion == 'euclidean':
        return skdists.euclidean_distances(X1loc, X2loc)
    elif criterion == 'hamming':
        raise NotImplementedError('Hamming distance between rows of matrices '
                                  'has not been implemented yet.')
    else:
        raise ValueError('Invalid distance criterion')
def process(self, rows_slice):
    tmp = self.array[rows_slice, ...]
    result = -euclidean_distances(tmp, self.array, squared=True)

    with Worker.hdf5_lock:
        with tables.open_file(self.hdf5_file, 'r+') as fileh:
            hdf5_array = fileh.get_node(self.path)
            hdf5_array[rows_slice, ...] = result

    del tmp
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # check that we still get the right answers with {X,Y}_norm_squared
    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(D2, D1)
    assert_array_almost_equal(D3, D1)
    assert_array_almost_equal(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - D1)), .01)
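The {X,Y}_norm_squared arguments exercised by this test exist because euclidean_distances expands the squared distance as ||x||^2 + ||y||^2 - 2*x.y, so row norms computed once can be reused across repeated calls. A small sketch (arrays arbitrary) verifying the expansion against the library:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.random_sample((5, 3))
Y = rng.random_sample((7, 3))

# ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, the same expansion
# euclidean_distances uses internally from the passed norms.
D_manual = ((X ** 2).sum(axis=1)[:, None]
            + (Y ** 2).sum(axis=1)[None, :]
            - 2 * X @ Y.T)
D_sklearn = euclidean_distances(X, Y, squared=True)
np.testing.assert_allclose(D_manual, D_sklearn)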
def kmeans(encoder_val_clean, y, nClusters, y_pred_prev=None,
           weight_initilization='k-means++', seed=42, n_init=40, max_iter=300):
    # weight_initilization = { 'kmeans-pca', 'k-means++', 'random', None }

    if weight_initilization == 'kmeans-pca':
        start_time = timeit.default_timer()
        pca = PCA(n_components=nClusters).fit(encoder_val_clean)
        kmeans_model = KMeans(init=pca.components_, n_clusters=nClusters,
                              n_init=1, max_iter=300, random_state=seed)
        y_pred = kmeans_model.fit_predict(encoder_val_clean)

        centroids = kmeans_model.cluster_centers_.T
        centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))

        end_time = timeit.default_timer()

    elif weight_initilization == 'k-means++':
        start_time = timeit.default_timer()
        kmeans_model = KMeans(init='k-means++', n_clusters=nClusters,
                              n_init=n_init, max_iter=max_iter, n_jobs=15,
                              random_state=seed)
        y_pred = kmeans_model.fit_predict(encoder_val_clean)

        # Soft assignments from inverse squared distances to the centers,
        # as in fuzzy c-means with fuzzifier m = 2.
        D = 1.0 / euclidean_distances(encoder_val_clean,
                                      kmeans_model.cluster_centers_,
                                      squared=True)
        D **= 2.0 / (2 - 1)
        D /= np.sum(D, axis=1)[:, np.newaxis]

        centroids = kmeans_model.cluster_centers_.T
        centroids = centroids / np.sqrt(np.diag(np.matmul(centroids.T, centroids)))

        end_time = timeit.default_timer()

    print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred),
          '\t arc =', adjusted_rand_score(y, y_pred),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
          'K-means objective = {:.1f} '.format(kmeans_model.inertia_),
          '\t runtime =', end_time - start_time)

    if y_pred_prev is not None:
        print('Different Assignments: ', sum(y_pred == y_pred_prev),
              '\tbestMap: ', bestMap(y_pred, y_pred_prev),
              '\tdatapoints-bestMap*datapoints: ',
              encoder_val_clean.shape[0] - bestMap(y_pred, y_pred_prev) * encoder_val_clean.shape[0])

    return centroids, kmeans_model.inertia_, y_pred
def _split_node(node, threshold, branching_factor):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.
    """
    new_subcluster1 = _CFSubcluster()
    new_subcluster2 = _CFSubcluster()
    new_node1 = _CFNode(
        threshold, branching_factor, is_leaf=node.is_leaf,
        n_features=node.n_features)
    new_node2 = _CFNode(
        threshold, branching_factor, is_leaf=node.is_leaf,
        n_features=node.n_features)
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    if node.is_leaf:
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)
    n_clusters = dist.shape[0]

    farthest_idx = np.unravel_index(
        dist.argmax(), (n_clusters, n_clusters))
    node1_dist, node2_dist = dist[[farthest_idx]]

    node1_closer = node1_dist < node2_dist
    for idx, subcluster in enumerate(node.subclusters_):
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
    return new_subcluster1, new_subcluster2
def _find_impostors_batch(x1, x2, t1, t2, return_dist=False, batch_size=500):
    """Find impostor pairs in chunks to avoid large memory usage.

    Parameters
    ----------
    x1 : array_like
        An array of transformed data samples with shape
        (n_samples, n_features).
    x2 : array_like
        An array of transformed data samples with shape
        (m_samples, n_features) where m_samples < n_samples.
    t1 : array_like
        An array of distances to the margins with shape (n_samples,).
    t2 : array_like
        An array of distances to the margins with shape (m_samples,).
    batch_size : int (Default value = 500)
        The size of each chunk of x1 to compute distances to.
    return_dist : bool (Default value = False)
        Whether to return the distances to the impostors.

    Returns
    -------
    tuple: (array_like, array_like, [array_like])
        imp1 : array_like
            An array of sample indices with shape (n_impostors,).
        imp2 : array_like
            An array of sample indices that violate a margin with shape
            (n_impostors,).
        dist : array_like, optional
            An array of pairwise distances of (imp1, imp2) with shape
            (n_impostors,).
    """
    n, m = len(t1), len(t2)
    imp1, imp2, dist = [], [], []
    for chunk in gen_batches(n, batch_size):
        dist_out_in = euclidean_distances(x1[chunk], x2, squared=True)
        i1, j1 = np.where(dist_out_in < t1[chunk, None])
        i2, j2 = np.where(dist_out_in < t2[None, :])
        if len(i1):
            imp1.extend(i1 + chunk.start)
            imp2.extend(j1)
            if return_dist:
                dist.extend(dist_out_in[i1, j1])
        if len(i2):
            imp1.extend(i2 + chunk.start)
            imp2.extend(j2)
            if return_dist:
                dist.extend(dist_out_in[i2, j2])

    if return_dist:
        return imp1, imp2, dist
    else:
        return imp1, imp2
def reorderClusters(B, X, GDM, returnOrderIndices=False):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters
    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    K = Bloc.shape[1]  # Number of clusters
    L = Xloc.shape[0]  # Number of datasets
    if K == 0:
        return Bloc

    # Find Cmeans and the distances between clusters
    Cmeans = np.array([None] * L, dtype=object)
    D = np.zeros([K, K, L])  # KxKxL
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]], dtype=float)  # (K) x (X[l] samples)
        for k in range(K):
            Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
        D[:, :, l] = skdists.euclidean_distances(Cmeans[l])  # KxK
    D = np.median(D, axis=2)  # KxK

    # Keep the first cluster first, then repeatedly append the closest
    # remaining cluster to the one just placed.
    B_ordered[:, 0] = Bloc[:, 0]
    I = np.zeros(K, dtype=int)
    I[0] = 0
    clustersDone = np.zeros(K, dtype=bool)
    clustersDone[0] = True
    for k in range(1, K):
        relevantD = D[I[k - 1], ~clustersDone]
        clustersLeft = np.nonzero(~clustersDone)[0]
        nextCluster = np.argmin(relevantD)
        nextCluster = clustersLeft[nextCluster]
        B_ordered[:, k] = Bloc[:, nextCluster]
        I[k] = nextCluster
        clustersDone[nextCluster] = True

    if returnOrderIndices:
        return (B_ordered, I)
    else:
        return B_ordered
def set_preference(data, chunk_size):
    """Return the median of the distribution of pairwise L2 Euclidean
    distances between samples (the rows of 'data') as the default
    preference parameter for Affinity Propagation clustering.

    Parameters
    ----------
    data : array of shape (N_samples, N_features)
        The data-set submitted for Affinity Propagation clustering.

    chunk_size : int
        The size of random subsamples from the data-set whose similarity
        matrix is computed. The resulting median of the distribution of
        pairwise distances between the data-points selected as part of a
        given subsample is stored into a list of medians.

    Returns
    -------
    preference : float
        The preference parameter for Affinity Propagation clustering is
        computed as the median of the list of median pairwise distances
        between the data-points selected as part of each of 15 rounds of
        random subsampling.
    """
    N_samples, N_features = data.shape

    rng = np.arange(0, N_samples, dtype=int)
    medians = []

    for i in range(15):
        selected_samples = np.random.choice(N_samples, size=chunk_size,
                                            replace=False)
        samples = data[selected_samples, :]

        S = -euclidean_distances(samples, data, squared=True)

        n = chunk_size * N_samples - (chunk_size * (chunk_size + 1) / 2)
        rows = np.zeros(0, dtype=int)
        # Use a loop variable distinct from the round counter `i`, so the
        # periodic gc.collect() below fires every fourth round as intended.
        for j in range(chunk_size):
            rows = np.append(rows, np.full(N_samples - j, j, dtype=int))
        cols = np.zeros(0, dtype=int)
        for j in range(chunk_size):
            cols = np.append(cols, np.delete(rng, selected_samples[:j + 1]))
        triu_indices = tuple((rows, cols))

        preference = np.median(S, overwrite_input=True)
        medians.append(preference)
        del S

        if i % 4 == 3:
            gc.collect()

    preference = np.median(medians)

    return preference
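A hedged sketch of how such a preference could be plugged into scikit-learn's AffinityPropagation (the toy data and chunk_size are made up; the integration with set_preference is an assumption, not shown in the original project):

import numpy as np
from sklearn.cluster import AffinityPropagation

# Arbitrary toy data; chunk_size=50 is an illustrative choice.
rng = np.random.RandomState(0)
data = rng.random_sample((500, 4))

preference = set_preference(data, chunk_size=50)  # function defined above

# AffinityPropagation's default affinity is likewise the negative squared
# Euclidean distance, so a median similarity is a natural preference value.
ap = AffinityPropagation(preference=preference).fit(data)
print(len(ap.cluster_centers_indices_), 'clusters')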