The following 47 code examples, extracted from open-source Python projects, illustrate how to use numpy.bincount().
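Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what np.bincount does, including the optional weights and minlength arguments that appear throughout the examples:

import numpy as np

x = np.array([0, 1, 1, 3, 2, 1, 7])

# Basic usage: counts[i] is the number of times the value i appears in x.
counts = np.bincount(x)
print(counts)                          # [1 3 1 1 0 0 0 1]

# minlength pads the result with zeros up to a fixed length.
print(np.bincount(x, minlength=10))    # [1 3 1 1 0 0 0 1 0 0]

# weights: instead of adding 1 per occurrence, sum the given weights,
# so result[i] = sum of w[j] over all j where x[j] == i.
w = np.array([0.5, 1.0, 1.0, 2.0, 0.25, 1.0, 3.0])
print(np.bincount(x, weights=w))       # [0.5  3.   0.25 2.   0.   0.   0.   3.  ]
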
def hog(img):
    h, w = img.shape
    gx = cv2.Sobel(img, cv2.CV_32F, 1, 0)
    gy = cv2.Sobel(img, cv2.CV_32F, 0, 1)
    mag, ang = cv2.cartToPolar(gx, gy)
    bins = np.int32(bin_n*ang/(2*np.pi))    # quantizing binvalues in (0...16)
    bin_cells = ()
    mag_cells = ()
    for i in range(wc):
        for j in range(hc):
            bin_cells += (bins[j*h//hc:(j+1)*h//hc, i*w//wc:(i+1)*w//wc],)
            mag_cells += (mag[j*h//hc:(j+1)*h//hc, i*w//wc:(i+1)*w//wc],)
    # np.bincount() returns the number of occurrences of each value
    hists = [np.bincount(b.ravel(), m.ravel(), bin_n) for b, m in zip(bin_cells, mag_cells)]
    hist = np.hstack(hists)     # hist is a 16*wc*hc vector
    return hist

def log_likelihood(self, data):
    nks = np.bincount(self.labels_, minlength=self.n_clusters)  # number of points in each cluster
    n, d = data.shape
    log_likelihood = 0
    covar_matrices = self.covariances(self.labels_, cluster_centers=self.cluster_centers_, data=data)
    covar_matrix_det_v = np.linalg.det(covar_matrices)
    self._inv_covar_matrices = self._matrix_inverses(covar_matrices)
    for k, nk in enumerate(nks):
        if self.verbose == 1:
            print('log_likelihood: covar_matrix_det = {}'.format(covar_matrix_det_v[k]))
        term_1 = nk * (np.log(float(nk)/n) - 0.5 * d * np.log(2*np.pi)
                       - 0.5 * np.log(abs(covar_matrix_det_v[k])))
        cdist_result = cdist(data[self.labels_ == k], np.array([self.cluster_centers_[k]]),
                             metric='mahalanobis', VI=self._inv_covar_matrices[k])
        cdist_no_nan = cdist_result[~np.isnan(cdist_result)]  # to deal with nans returned by cdist
        term_2 = -0.5 * (np.sum(cdist_no_nan))
        k_sum = term_1 + term_2
        log_likelihood += k_sum
    if np.isnan(log_likelihood) or log_likelihood == float('inf'):
        raise Exception('ll is nan or inf')
    return log_likelihood

def generate_environment_assignments(n, num_sources):
    '''Randomly assign `n` counts to one of `num_sources` environments.

    Parameters
    ----------
    n : int
        Number of environment assignments to generate.
    num_sources : int
        Number of possible environment states (this includes the 'Unknown').

    Returns
    -------
    seq_env_assignments : np.array
        1D vector of length `n`. The ith entry is the environment assignment
        of the ith feature.
    envcounts : np.array
        1D vector of length `num_sources`. The ith entry is the total number
        of entries in `seq_env_assignments` which are equal to i.
    '''
    seq_env_assignments = np.random.choice(np.arange(num_sources), size=n, replace=True)
    envcounts = np.bincount(seq_env_assignments, minlength=num_sources)
    return seq_env_assignments, envcounts

def getPixelIoU(gtImg, submImg):
    # TODO TEST THOROUGHLY
    def compress(img):
        intImg = np.empty(img.shape[:2], dtype='int32')
        if len(img.shape) == 3:
            intImg[:, :] = img[:, :, 0]
            intImg[:, :] += (256 * img[:, :, 1])
            intImg[:, :] += ((256**2) * img[:, :, 2])
        else:
            intImg[:, :] = img[:, :]
        un = np.unique(intImg)
        idx = np.zeros(un.max() + 1, dtype='int32')
        idx[un] = np.arange(un.shape[0], dtype='int32')
        return idx[intImg], un.max() + 1
    if gtImg.shape[:2] != submImg.shape[:2]:
        raise Exception("gtImg and submImg must have the same size")
    gt, maxGt = compress(gtImg)
    subm, maxSubm = compress(submImg)
    comb = gt * maxSubm + subm
    intMatrix = np.bincount(comb.reshape(-1), minlength=maxGt * maxSubm).reshape([maxGt, maxSubm])
    uMatrix = np.zeros(intMatrix.shape)
    uMatrix[:, :] += intMatrix.sum(axis=0)[None, :]
    uMatrix[:, :] += intMatrix.sum(axis=1)[:, None]
    uMatrix -= intMatrix
    return intMatrix / uMatrix.astype('float64'), intMatrix, uMatrix

def test_with_incorrect_minlength(self):
    x = np.array([], dtype=int)
    assert_raises_regex(TypeError, "an integer is required",
                        lambda: np.bincount(x, minlength="foobar"))
    assert_raises_regex(ValueError, "must be positive",
                        lambda: np.bincount(x, minlength=-1))
    assert_raises_regex(ValueError, "must be positive",
                        lambda: np.bincount(x, minlength=0))

    x = np.arange(5)
    assert_raises_regex(TypeError, "an integer is required",
                        lambda: np.bincount(x, minlength="foobar"))
    assert_raises_regex(ValueError, "minlength must be positive",
                        lambda: np.bincount(x, minlength=-1))
    assert_raises_regex(ValueError, "minlength must be positive",
                        lambda: np.bincount(x, minlength=0))

def add(self, arr):
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr)
    arr = arr.flatten()
    self.min = min(self.min, arr.min())
    self.max = max(self.max, arr.max())
    self.sum += arr.sum()
    self.num += len(arr)
    self.sum_squares += (arr ** 2).sum()
    indices = np.searchsorted(self.bucket_limits, arr, side='right')
    new_counts = np.bincount(indices, minlength=self.buckets.shape[0])
    if new_counts.shape[0] > self.buckets.shape[0]:
        # This should only happen with nans and extremely large values
        assert new_counts.shape[0] == self.buckets.shape[0] + 1, new_counts.shape
        new_counts = new_counts[:self.buckets.shape[0]]
    self.buckets += new_counts

def hog(img):
    h, w = img.shape
    gx = cv2.Sobel(img, cv2.CV_32F, 1, 0)
    gy = cv2.Sobel(img, cv2.CV_32F, 0, 1)
    mag, ang = cv2.cartToPolar(gx, gy)
    bins = np.int32(bin_n*ang/(2*np.pi))    # quantizing binvalues in (0...bin_n)
    bin_cells = ()
    mag_cells = ()
    for i in range(wc):
        for j in range(hc):
            bin_cells += (bins[j*h//hc:(j+1)*h//hc, i*w//wc:(i+1)*w//wc],)
            mag_cells += (mag[j*h//hc:(j+1)*h//hc, i*w//wc:(i+1)*w//wc],)
    hists = [np.bincount(b.ravel(), m.ravel(), bin_n) for b, m in zip(bin_cells, mag_cells)]
    hist = np.hstack(hists)     # hist is a bin_n*wc*hc vector
    return hist

def selected_features(self):
    """Get the number of times a feature was selected"""
    if len(self.best_estimator_):
        # Get selected features from the best estimator:
        iterator = product(range(self._rep), range(self._nfolds))
        fselected = []
        featrange = np.arange(self._nfeat)[np.newaxis, ...]
        for k, i in iterator:
            estimator = self.best_estimator_[k][i].get_params()['features']
            fselected.extend(list(estimator.transform(featrange).ravel().astype(int)))
        # Get the count for each feature:
        bins = np.bincount(np.array(fselected))
        selectedBins = np.zeros((self._nfeat,), dtype=int)
        selectedBins[np.arange(len(bins))] = bins
        # Put everything in a Dataframe:
        resum = pd.DataFrame({'Name': self._name, 'Count': selectedBins,
                              'Percent': 100*selectedBins/selectedBins.sum()},
                             columns=['Name', 'Count', 'Percent'])
        return resum
    else:
        print('You must run the fit() method before')

def get_confusion_matrix(self, gt_label, pred_label, class_num):
    """
    Calculate the confusion matrix from the given label and prediction
    :param gt_label: the ground truth label
    :param pred_label: the predicted label
    :param class_num: the number of classes
    :return: the confusion matrix
    """
    index = (gt_label * class_num + pred_label).astype('int32')
    label_count = np.bincount(index)
    confusion_matrix = np.zeros((class_num, class_num))
    for i_label in range(class_num):
        for i_pred_label in range(class_num):
            cur_index = i_label * class_num + i_pred_label
            if cur_index < len(label_count):
                confusion_matrix[i_label, i_pred_label] = label_count[cur_index]
    return confusion_matrix

def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of a sparse matrix X by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)
    N = float(X.shape[0])
    idf = log(N / (1 + bincount(X.col)))

    # calculate length_norm per document (artist)
    row_sums = numpy.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X

def predict(self, X):
    y_preds = np.empty((X.shape[0], len(self.trees)))
    # Let each tree make a prediction on the data
    for i, tree in enumerate(self.trees):
        # Indices of the features that the tree has trained on
        idx = tree.feature_indices
        # Make a prediction based on those features
        prediction = tree.predict(X[:, idx])
        y_preds[:, i] = prediction
    y_pred = []
    # For each sample
    for sample_predictions in y_preds:
        # Select the most common class prediction
        y_pred.append(np.bincount(sample_predictions.astype('int')).argmax())
    return y_pred

def _class_frequencies(X, y):
    """Count the number of non-zero values for each class y in sparse X."""
    labels = np.unique(y)
    if len(labels) > 2:
        raise ValueError("Delta works only with binary classification problems")

    # Indices for each type of labels in y
    N1 = np.where(y == labels[0])[0]
    N2 = np.where(y == labels[1])[0]

    # Number of positive documents that each term appears on
    df1 = np.bincount(X[N1].nonzero()[1], minlength=X.shape[1])
    # Number of negative documents that each term appears on
    df2 = np.bincount(X[N2].nonzero()[1], minlength=X.shape[1])

    return N1.shape[0], df1, N2.shape[0], df2

def get_his_std_qi(data_pixel_qi, max_cts=None):
    '''
    YG. Dev 16, 2016
    Calculate the photon histogram for one q by giving
    Parameters:
        data_pixel_qi: one-D array, for the photon counts
        max_cts: for bin max, bin will be [0,1,2,..., max_cts]
    Return:
        bins
        his
        std
    '''
    if max_cts is None:
        max_cts = np.max(data_pixel_qi) + 1
    bins = np.arange(max_cts)
    dqn, dqm = data_pixel_qi.shape
    # get histogram here
    H = np.apply_along_axis(np.bincount, 1, np.int_(data_pixel_qi), minlength=max_cts) / dqm
    # do average for different frame
    his = np.average(H, axis=0)
    std = np.std(H, axis=0)
    # cal average photon counts
    kmean = np.average(data_pixel_qi)
    return bins, his, std, kmean

def test_particle_octree_counts():
    np.random.seed(int(0x4d3d3d3))
    # Eight times as many!
    data = {}
    bbox = []
    for i, ax in enumerate('xyz'):
        DW = DRE[i] - DLE[i]
        LE = DLE[i]
        data["particle_position_%s" % ax] = \
            np.random.normal(0.5, scale=0.05, size=(NPART*8)) * DW + LE
        bbox.append([DLE[i], DRE[i]])
    bbox = np.array(bbox)
    for n_ref in [16, 32, 64, 512, 1024]:
        ds = load_particles(data, 1.0, bbox=bbox, n_ref=n_ref)
        dd = ds.all_data()
        bi = dd["io", "mesh_id"]
        v = np.bincount(bi.astype("intp"))
        assert_equal(v.max() <= n_ref, True)
        bi2 = dd["all", "mesh_id"]
        assert_equal(bi, bi2)

def _parse_output(self):
    unique_ids = np.unique(self.tags)
    counts = np.bincount(self.tags + 1)
    sort_indices = np.argsort(self.tags)
    grab_indices = np.indices(self.tags.shape).ravel()[sort_indices]
    dens = self.densities[sort_indices]
    cp = 0
    for i in unique_ids:
        cp_c = cp + counts[i + 1]
        if i == -1:
            cp += counts[i + 1]
            continue
        group_indices = grab_indices[cp:cp_c]
        self._groups.append(self._halo_class(self, i, group_indices, ptype=self.ptype))
        md_i = np.argmax(dens[cp:cp_c])
        px, py, pz = \
            [self.particle_fields['particle_position_%s' % ax][group_indices]
             for ax in 'xyz']
        self._max_dens[i] = (dens[cp:cp_c][md_i], px[md_i], py[md_i], pz[md_i])
        cp += counts[i + 1]

def _setup_particles(self, x, y, z, setup_fields=None):
    """
    Assigns grids to particles and sets up particle positions.
    *setup_fields* is a dict of fields other than the particle positions
    to set up.
    """
    particle_grids, particle_grid_inds = self.ds.index._find_points(x, y, z)
    idxs = np.argsort(particle_grid_inds)
    self.particles[:, self.posx_index] = x[idxs]
    self.particles[:, self.posy_index] = y[idxs]
    self.particles[:, self.posz_index] = z[idxs]
    self.NumberOfParticles = np.bincount(particle_grid_inds.astype("intp"),
                                         minlength=self.num_grids)
    if self.num_grids > 1:
        np.add.accumulate(self.NumberOfParticles.squeeze(),
                          out=self.ParticleGridIndices[1:])
    else:
        self.ParticleGridIndices[1] = self.NumberOfParticles.squeeze()
    if setup_fields is not None:
        for key, value in setup_fields.items():
            field = (self.ptype, key) if isinstance(key, string_types) else key
            if field not in self.default_fields:
                self.particles[:, self.field_list.index(field)] = value[idxs]

def train_test_split_per_class(X, y, train_size=None, test_size=None):
    sh = np.array(X.shape)
    num_classes = len(np.bincount(y))
    sh[0] = 0
    X_train_arr = np.zeros(sh, dtype=X.dtype)
    X_test_arr = np.zeros(sh, dtype=X.dtype)
    y_train_arr = np.zeros((0), dtype=y.dtype)
    y_test_arr = np.zeros((0), dtype=y.dtype)
    for i in range(num_classes):
        X_train, X_test, y_train, y_test = train_test_split(
            X[y == i], y[y == i], train_size=train_size, test_size=test_size)
        X_train_arr = np.append(X_train_arr, X_train, axis=0)
        X_test_arr = np.append(X_test_arr, X_test, axis=0)
        y_train_arr = np.append(y_train_arr, y_train)
        y_test_arr = np.append(y_test_arr, y_test)
    return X_train_arr, X_test_arr, y_train_arr, y_test_arr

def check_generate_valid_indexes(self, num_examples, batch_size):
    T = 90
    scheme = EpochwiseShuffledInfiniteScheme(num_examples, batch_size)
    uniquenesses = []
    all_indexes = []
    for i in range(T):
        indexes = next(scheme)
        is_unique = len(indexes) == len(np.unique(indexes))
        uniquenesses.append(is_unique)
        all_indexes.append(indexes)
    assert np.all(uniquenesses)
    counts = np.bincount(np.concatenate(all_indexes).ravel())
    expected_counts = [batch_size * T // num_examples] * num_examples
    assert np.array_equal(counts, expected_counts)

def entropy_score(labels):
    """ entropy = sum(p*log(1/p)) """
    n_labels = labels.shape[0]
    if n_labels <= 1:
        return 0.0
    counts = np.bincount(labels)
    probs = counts / float(n_labels)
    n_classes = np.count_nonzero(probs)
    if n_classes <= 1:
        return 0.0
    entropy = 0.0
    for p in probs:
        if p > 0:  # skip empty bins to avoid log(0)
            entropy -= p * np.log(p)
    return entropy

def split(self, X, y, groups=None):
    splits = super(BalancedKFold, self).split(X, y, groups)
    y = np.array(y)
    for train_index, test_index in splits:
        split_y = y[test_index]
        classes_y, y_inversed = np.unique(split_y, return_inverse=True)
        min_y = min(np.bincount(y_inversed))
        new_index = np.zeros(min_y * len(classes_y), dtype=int)
        for cls in classes_y:
            cls_index = test_index[split_y == cls]
            if len(cls_index) > min_y:
                cls_index = np.random.choice(cls_index, size=min_y, replace=False)
            new_index[cls * min_y:(cls + 1) * min_y] = cls_index
        yield train_index, new_index

def test_univariate_categorical():
    # This test generates univariate data from a nominal variable with 6 levels
    # and probability vector p_theory, and performs a chi-square test on
    # posterior samples from MvKde.
    rng = gu.gen_rng(2)
    N_SAMPLES = 1000
    p_theory = [.3, .1, .2, .15, .15, .1]
    samples_test = rng.choice(range(6), p=p_theory, size=N_SAMPLES)
    kde = MultivariateKde(
        [7], None, distargs={O: {ST: [C], SA: [{'k': 6}]}}, rng=rng)
    # Incorporate observations.
    for rowid, x in enumerate(samples_test):
        kde.incorporate(rowid, {7: x})
    kde.transition()
    # Posterior samples.
    samples_gen = kde.simulate(-1, [7], N=N_SAMPLES)
    f_obs = np.bincount([s[7] for s in samples_gen])
    f_exp = np.bincount(samples_test)
    _, pval = chisquare(f_obs, f_exp)
    assert 0.05 < pval
    # Get some coverage on logpdf_score.
    assert kde.logpdf_score() < 0

def test_crp_decrement(N, alpha, seed):
    A = gu.simulate_crp(N, alpha, rng=gu.gen_rng(seed))
    Nk = list(np.bincount(A))
    # Decrement all counts by 1.
    Nk = [n-1 if n > 1 else n for n in Nk]
    # Decrement rowids.
    crp = simulate_crp_gpm(N, alpha, rng=gu.gen_rng(seed))
    targets = [c for c in crp.counts if crp.counts[c] > 1]
    seen = set([])
    for r, c in crp.data.items():
        if c in targets and c not in seen:
            seen.add(c)
            crp.unincorporate(r)
        if len(seen) == len(targets):
            break
    assert_crp_equality(alpha, Nk, crp)

def test_conditional_real(state):
    # Simulate from the conditional Z|X.
    fig, axes = plt.subplots(2, 3)
    fig.suptitle('Conditional Simulation Of Indicator Z Given Data X')
    # Compute representative data sample for each indicator.
    means = [np.mean(DATA[DATA[:, 1] == t], axis=0)[0] for t in INDICATORS]
    for mean, indicator, ax in zip(means, INDICATORS, axes.ravel('F')):
        samples_subpop = [s[1] for s in
                          state.simulate(-1, [1], {0: mean}, None, N_SAMPLES)]
        ax.hist(samples_subpop, color='g', alpha=.4)
        ax.set_title('True Indicator %d' % indicator)
        ax.set_xlabel('Simulated Indicator')
        ax.set_xticks(INDICATORS)
        ax.set_ylabel('Frequency')
        ax.set_ylim([0, ax.get_ylim()[1]+10])
        ax.grid()
        # Check that the simulated indicator agrees with true indicator.
        true_ind_a = indicator
        true_ind_b = indicator-1 if indicator % 2 else indicator+1
        counts = np.bincount(samples_subpop)
        frac = sum(counts[[true_ind_a, true_ind_b]])/float(sum(counts))
        assert .8 < frac

def plot_dist_discrete(X, output, clusters, ax=None, Y=None, hist=True):
    # Create a new axis?
    if ax is None:
        _, ax = plt.subplots()
    # Set up x axis.
    X = np.asarray(X, dtype=int)
    x_max = max(X)
    Y = range(int(x_max)+1)
    X_hist = np.bincount(X) / float(len(X))
    ax.bar(Y, X_hist, color='gray', edgecolor='none')
    # Compute weighted pdfs.
    pdf = np.zeros((len(clusters), len(Y)))
    W = [log(clusters[k].N) - log(float(len(X))) for k in clusters]
    for i, k in enumerate(clusters):
        pdf[i, :] = np.exp(
            [W[i] + clusters[k].logpdf(None, {output: y}) for y in Y])
        color, alpha = gu.curve_color(i)
        ax.bar(Y, pdf[i, :], color=color, edgecolor='none', alpha=alpha)
    # Plot the sum of pdfs.
    ax.bar(
        Y, np.sum(pdf, axis=0), color='none', edgecolor='black', linewidth=3)
    ax.set_xlim([0, x_max+1])
    # Title.
    ax.set_title(clusters.values()[0].name())
    return ax

def onehot(self, data, min_length=None):
    if min_length is None:
        min_length = self.vocab_size
    return np.bincount(data, minlength=min_length)

def test_sample_from_probs2_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    counts = np.zeros(size, dtype=np.int32)
    num_samples = 2000 * size
    probs2 = np.tile(probs, (num_samples, 1))
    samples = sample_from_probs2(probs2)
    probs /= probs.sum()  # Normalize afterwards.
    counts = np.bincount(samples, minlength=size)
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof

def count_pairs(assignments, v1, v2, M):
    """Construct sufficient statistics for (v1, v2) pairs.

    Args:
        assignments: An _ x V assignment matrix with values in range(M).
        v1, v2: Column ids of the assignments matrix.
        M: The number of possible assignment bins.

    Returns:
        An M x M array of counts.
    """
    assert v1 != v2
    pairs = assignments[:, v1].astype(np.int32) * M + assignments[:, v2]
    return np.bincount(pairs, minlength=M * M).reshape((M, M))

def _fast_hist(self, label_true, label_pred, n_class):
    mask = (label_true >= 0) & (label_true < n_class)
    hist = np.bincount(
        n_class * label_true[mask].astype(int) + label_pred[mask],
        minlength=n_class**2).reshape(n_class, n_class)
    return hist

def relabel_by_size(labels):
    """Relabel clusters so they are sorted by number of members, descending.

    Args:
        labels (np.array(int)): 1-based cluster labels
    """
    order = np.argsort(np.argsort(-np.bincount(labels)))
    return 1 + order[labels]

def get_cluster_sizes(clustering):
    """ Returns a numpy array containing cell-counts for each cluster """
    return np.bincount(clustering.clusters)[1:]

def add_many(self, elems):
    self.active = True
    elems = np.copy(elems).astype(np.int_)
    elems[elems > self.max_value] = 1 + self.max_value
    self.counts += np.bincount(elems, minlength=len(self.counts))

def get_cdna_mol_counts_per_gene(self, gene_index, remove_none_gene=True):
    mol_genes = self.get_column('gene')
    num_genes = len(gene_index.get_genes())
    gene_counts = np.bincount(mol_genes, minlength=num_genes + 1)
    if remove_none_gene:
        gene_counts = gene_counts[:num_genes]
    return gene_counts

def get_leaf(labels):
    # Obtain the leaf as the majority of the labels
    return np.bincount(labels).argmax()

def compute_class_frequencies(segment, num_classes):
    if isinstance(segment, list):
        segment = np.asarray(segment)
    f = 1.0 * np.bincount(segment.reshape(-1,).astype(int),
                          minlength=num_classes) / np.prod(segment.shape)
    return f

def compute_centralvoxel_frequencies(segment, minlength):
    if isinstance(segment, list):
        segment = np.asarray(segment)
    shape = segment.shape[-3:]
    middle_coordinate = np.zeros(3, int)
    for it_coordinate, coordinate in enumerate(shape):
        if coordinate % 2 == 0:
            middle_coordinate[it_coordinate] = coordinate / 2 - 1
        else:
            middle_coordinate[it_coordinate] = coordinate / 2
    segment = segment.reshape((-1,) + shape)
    f = 1.0 * np.bincount(
        segment[:, middle_coordinate[0], middle_coordinate[1], middle_coordinate[2]]
        .reshape(-1,).astype(int),
        minlength=minlength) / np.prod(segment.shape[:-3])
    return f

def get_class_distribution(self, subject_list):
    class_frequencies = np.zeros(self.n_classes)
    for subj in subject_list:
        labels = subj.load_labels()
        mask = subj.load_ROI_mask()
        class_frequencies += np.bincount(labels.flatten().astype('int'),
                                         weights=mask.flatten(),
                                         minlength=self.n_classes)
    return class_frequencies

def get_class_weights(self, subject_list, mask_bool=True):
    class_frequencies = np.zeros(self.n_classes)
    for subj in subject_list:
        labels = subj.load_labels()
        if mask_bool == 'ROI':
            mask = subj.load_ROI_mask()
            class_frequencies += np.bincount(labels.flatten().astype('int'),
                                             weights=mask.flatten().astype('int'),
                                             minlength=self.n_classes)
        elif mask_bool == 'labels':
            mask = np.zeros_like(labels)
            mask[labels > 0] = 1
            # print(np.bincount(labels.flatten().astype('int'), weights=mask.flatten().astype('int'),
            #                   minlength=self.n_classes))
            class_frequencies += np.bincount(labels.flatten().astype('int'),
                                             weights=mask.flatten().astype('int'),
                                             minlength=self.n_classes+1)[1:]
        else:
            class_frequencies += np.bincount(labels.flatten().astype('int'),
                                             minlength=self.n_classes)
    class_frequencies = class_frequencies / np.sum(class_frequencies)
    class_weight = np.sort(class_frequencies)[int(np.ceil(1.0*self.n_classes/2))] / class_frequencies
    class_weight[np.where(class_frequencies == 0)[0]] = 0  # avoid infinite weight
    return class_weight

def epoch_voting(Y, chunk_size):
    Y_new = Y.copy()
    for i in range(1 + len(Y_new) // chunk_size):  # integer division so range() gets an int
        epoch = Y_new[i*chunk_size:(i+1)*chunk_size]
        if len(epoch) != 0:
            winner = np.bincount(epoch).argmax()
            Y_new[i*chunk_size:(i+1)*chunk_size] = winner
    return Y_new

def est_pmf(self, samples, normalize=True, eps=1e-10):
    """Estimate probability mass function from samples

    :param np.ndarray samples: `(n_samples, len(self.nsoutdims))` array of samples
    :param bool normalize: True: Return normalized probability estimates (default).
        False: Return integer outcome counts.
    :returns: Estimated probabilities as ndarray `est_pmf` with shape `self.nsoutdims`

    `n_samples * est_pmf[i1, ..., ik]` provides the number of occurrences of
    outcome `(i1, ..., ik)` in `samples`.
    """
    n_samples = samples.shape[0]
    n_out = np.prod(self.nsoutdims)
    if samples.ndim > 1:
        samples = self.pack_samples(samples)
    counts = np.bincount(samples, minlength=n_out)
    assert counts.shape == (n_out,)
    counts = counts.reshape(self.nsoutdims)
    assert counts.sum() == n_samples
    if normalize:
        return counts / n_samples
    else:
        return counts

def fit(self, data):
    """
    Run K-Means on data n_init times.

    Parameters
    ----------
    data: numpy array

    Returns
    -------
    No value is returned.
    Function sets the following two object params:
        self.labels_
        self.cluster_centers_
    """
    data = np.array(data)
    labels, cluster_centers = [], []
    for i in range(self.n_init):
        if not self.warm_start:
            self.cluster_centers_ = None
            self._global_covar_matrices = None
            self._inv_covar_matrices = None
        self._fit(data)
        labels += [self.labels_]
        cluster_centers += [self.cluster_centers_]
        self.inertias_ += [self._inertia(data)]
        self.log_likelihoods_ += [self.log_likelihood(data)]
    best_idx = np.argmin(self.inertias_)
    self.labels_ = labels[best_idx]
    self.all_labels_ = labels
    self.best_log_likelihood_ = self.log_likelihoods_[best_idx]
    self.best_inertia_ = self.inertias_[best_idx]
    self.cluster_centers_ = cluster_centers[best_idx]
    if self.verbose == 1:
        print('fit: n_clusters: {}, label bin count: {}'.format(
            self.n_clusters, np.bincount(self.labels_, minlength=self.n_clusters)))

def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
        return np.bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(sp.csc_matrix(X, copy=False).indptr)