The following code examples, extracted from open-source Python projects, illustrate how to use numpy.array_split().
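Before the project code, a minimal sketch of our own (not taken from any of the projects below) showing the two calling conventions the examples rely on: unlike numpy.split, array_split accepts a section count that does not evenly divide the axis, and it also accepts explicit split indices.

    import numpy as np

    a = np.arange(10)

    # An integer that does not evenly divide the axis is allowed; the leading
    # chunks absorb the extra elements, so 10 elements over 3 sections gives
    # sizes 4, 3 and 3.
    print([c.tolist() for c in np.array_split(a, 3)])
    # -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

    # A sequence of indices splits before each listed position instead.
    print([c.tolist() for c in np.array_split(a, [2, 5])])
    # -> [[0, 1], [2, 3, 4], [5, 6, 7, 8, 9]]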
def create_agents(self, generator):
    """
    Given information on a set of countries and a generator function,
    generate the agents and assign the results to ``self.agents``.

    :type generator: DataFrame, str, int
    :param generator: A function which generates the agents.
    """
    self.generator = generator
    country_array = pd.concat([pd.Series([c] * k["Population"])
                               for c, k in self.df.iterrows()])
    country_array.index = range(len(country_array))

    # Garbage collect before creating new processes.
    gc.collect()
    self.agents = pd.concat(
        self.pool.imap(self._gen_agents,
                       np.array_split(country_array,
                                      self.processes * self.splits))
    )
    self.agents.index = range(len(self.agents))
def test_latlon2pix_internals(pix_size_single, origin_point, is_flipped,
                              num_chunks, chunk_position):
    img = make_image(pix_size_single, origin_point, is_flipped,
                     num_chunks, chunk_position)
    chunk_idx = img.chunk_idx
    res_x = img._full_res[0]
    res_y = img._full_res[1]
    pix_size = (img.pixsize_x, img.pixsize_y)
    origin = (img._start_lon, img._start_lat)

    # +0.5 for centre of pixels
    lons = (np.arange(res_x) + 0.5) * pix_size[0] + origin[0]
    all_lats = (np.arange(res_y) + 0.5) * pix_size[1] + origin[1]
    lats = np.array_split(all_lats, num_chunks)[chunk_idx]

    pix_x = np.arange(res_x)
    pix_y = np.arange(lats.shape[0])

    d = np.array([[a, b] for a in lons for b in lats])
    xy = img.lonlat2pix(d)

    true_xy = np.array([[a, b] for a in pix_x for b in pix_y])

    assert np.all(xy == true_xy)
def test_pix2latlong(pix_size_single, origin_point, is_flipped,
                     num_chunks, chunk_position):
    img = make_image(pix_size_single, origin_point, is_flipped,
                     num_chunks, chunk_position)
    chunk_idx = img.chunk_idx
    res_x = img._full_res[0]
    res_y = img._full_res[1]
    pix_size = (img.pixsize_x, img.pixsize_y)
    origin = (img._start_lon, img._start_lat)

    true_lons = np.arange(res_x) * pix_size[0] + origin[0]
    all_lats = np.arange(res_y) * pix_size[1] + origin[1]
    true_lats = np.array_split(all_lats, num_chunks)[chunk_idx]
    true_d = np.array([[a, b] for a in true_lons for b in true_lats])

    pix_x = np.arange(res_x)
    pix_y = np.arange(img.resolution[1])  # chunk resolution

    xy = np.array([[a, b] for a in pix_x for b in pix_y])

    lonlats = img.pix2lonlat(xy)
    assert np.all(lonlats == true_d)
def transform(self, X):
    if self.tagger is None:
        raise ValueError("Must find_motifs before you can tag anything")

    logging.info("Tagging %s data with motifs using %d workers..." % (
        str(X.shape), self.n_jobs))

    if self.n_jobs > 1:
        pool = mp.ProcessingPool(self.n_jobs)
        splits = np.array_split(X, self.n_jobs)
        tag_lists = pool.map(self._tag_motifs, splits)
        tags = list(itertools.chain.from_iterable(tag_lists))
    else:
        tags = self._tag_motifs(X)

    logging.info("All motifs have been tagged")
    return self._sparsify_tags(tags)
def subset_iterator(X, m, repeats=1):
    '''
    Iterates over array X in chunks of m, repeats number of times.
    Each time the order of the repeat is randomly generated.
    '''
    N, dim = X.shape
    progress = tqdm(total=repeats * int(N / m))

    for i in range(repeats):
        indices = np.random.permutation(N)

        for idx in np.array_split(indices, N // m):
            yield X[idx][:]
            progress.update()

    progress.close()
def _split_into_groups(y, num_groups):
    groups = [[] for _ in range(num_groups)]
    group_index = 0

    for cls in set(y):
        this_cls_indices = np.where(y == cls)[0]
        num_cls_samples = len(this_cls_indices)
        num_cls_split_groups = ceil(num_cls_samples / 500)
        split = np.array_split(this_cls_indices, num_cls_split_groups)

        for cls_group in split:
            groups[group_index] = np.hstack((groups[group_index], cls_group))
            group_index = (group_index + 1) % num_groups

    return groups
def get_embedding_X(img):
    '''
    Args    : Numpy Images vector
    Returns : Embedded Matrix of length Samples, 4096
    '''
    img = img.reshape((img.shape[0], img.shape[1], img.shape[2], 1))
    sess = tf.Session()
    imgs = tf.placeholder(tf.float32, [None, None, None, None])
    vgg = vgg16(imgs, '/tmp/vgg16_weights.npz', sess)
    embs = []
    cnt = 0
    for img_batch in np.array_split(img, img.shape[0] / 1000):
        emb = sess.run(vgg.emb, feed_dict={vgg.imgs: img_batch})
        embs.extend(emb)
        cnt += 1
        progress = round(100 * (cnt * 1000 / img.shape[0]), 2)
        if (progress % 10 == 0):
            print progress
    embs = np.array(embs)
    print embs.shape
    embs = np.reshape(embs, (embs.shape[0],
                             embs.shape[1] * embs.shape[2] * embs.shape[3]))
    return embs
def __init__(self, pobj, just_list=False, attr='_grids', round_robin=False):
    ObjectIterator.__init__(self, pobj, just_list, attr=attr)
    # pobj has to be a ParallelAnalysisInterface, so it must have a .comm
    # object.
    self._offset = pobj.comm.rank
    self._skip = pobj.comm.size
    # Note that we're doing this in advance, and with a simple means
    # of choosing them; more advanced methods will be explored later.
    if self._use_all:
        self.my_obj_ids = np.arange(len(self._objs))
    else:
        if not round_robin:
            self.my_obj_ids = np.array_split(
                np.arange(len(self._objs)), self._skip)[self._offset]
        else:
            self.my_obj_ids = np.arange(len(self._objs))[self._offset::self._skip]
def iter_combinatorial_pairs(queue, num_examples, batch_size, interval,
                             num_classes, augment_positive=False):
    num_examples_per_class = num_examples // num_classes
    pairs = np.array(list(itertools.combinations(range(num_examples), 2)))

    if augment_positive:
        additional_positive_pairs = make_positive_pairs(
            num_classes, num_examples_per_class, num_classes - 1)
        pairs = np.concatenate((pairs, additional_positive_pairs))

    num_pairs = len(pairs)
    num_batches = num_pairs // batch_size
    perm = np.random.permutation(num_pairs)
    for i, batch_indexes in enumerate(np.array_split(perm, num_batches)):
        if i % interval == 0:
            x, c = queue.get()
            x = x.astype(np.float32) / 255.0
            c = c.ravel()
        indexes0, indexes1 = pairs[batch_indexes].T
        x0, x1, c0, c1 = x[indexes0], x[indexes1], c[indexes0], c[indexes1]
        t = np.int32(c0 == c1)  # 1 if x0 and x1 are same class, 0 otherwise
        yield x0, x1, t
def get_epoch_indexes(self):
    B = self.batch_size
    K = self.num_classes
    M = self.num_per_class
    N = K * M  # number of total examples
    num_batches = M * int(K // B)  # number of batches per epoch

    indexes = np.arange(N, dtype=np.int32).reshape(K, M)
    epoch_indexes = []
    for m in range(M):
        perm = np.random.permutation(K)
        c_batches = np.array_split(perm, num_batches // M)
        for c_batch in c_batches:
            b = len(c_batch)  # actual number of examples of this batch
            indexes_anchor = M * c_batch + m

            positive_candidates = np.delete(indexes[c_batch], m, axis=1)
            indexes_positive = positive_candidates[
                range(b), np.random.choice(M - 1, size=b)]

            epoch_indexes.append((indexes_anchor, indexes_positive))

    return epoch_indexes
def pre_processing(self):
    """Provide the same API as Model; we split the data into K folds here.
    """
    if self.random:
        mask = np.random.permutation(self.train_x.shape[0])
        train_x = self.train_x[mask]
        train_y = self.train_y[mask]
    else:
        train_x = self.train_x[:]
        train_y = self.train_y[:]

    if self.select_train_method == 'step':
        self.x_folds = [train_x[i::self.k_folds] for i in range(0, self.k_folds)]
        self.y_folds = [train_y[i::self.k_folds] for i in range(0, self.k_folds)]
    else:
        self.x_folds = np.array_split(train_x, self.k_folds)
        self.y_folds = np.array_split(train_y, self.k_folds)

    # for i in range(self.k_folds):
    #     self.x_folds[i] = self.train_x[0] + self.x_folds[i] + self.train_x[-1]
    #     self.y_folds[i] = self.train_y[0] + self.y_folds[i] + self.train_y[-1]
def Train(self, C, A, Y, SF):
    '''
    Train the classifier using the sample matrix A and target matrix Y
    '''
    C.fit(A, Y)
    YH = np.zeros(Y.shape, dtype=np.object)
    # Split up verification into chunks to prevent out of memory
    for i in np.array_split(np.arange(A.shape[0]), 32):
        YH[i] = C.predict(A[i])
    s1 = SF(Y, YH)
    print('All:{:8.6f}'.format(s1))
    '''
    ss = ShuffleSplit(random_state=1151)  # Use fixed state so training can be repeated later
    trn, tst = next(ss.split(A, Y))       # Make train/test split
    mi = [8] * 1                          # Maximum number of iterations at each iter
    YH = np.zeros((A.shape[0]), dtype=np.object)
    for mic in mi:                        # Chunk size to split dataset for CV results
        #C.SetMaxIter(mic)                # Set the maximum number of iterations to run
        #C.fit(A[trn], Y[trn])            # Perform training iterations
    '''
def add_point(self, t, alt, az):
    self.window.append((t, alt, az))
    if self._current_window_size() < self.window_duration:
        return

    points = np.array(self.window)
    steady, current = np.array_split(points, 2)
    _, steady_cube = self.create_cube(steady)
    timestamps, current_cube = self.create_cube(current)
    t = self.denoise_and_compare_cubes(steady_cube, current_cube)
    self.trigger_criterion.append(list(t))
    self.trigger_criterion_timestamps.append(list(timestamps))
    has_triggered = self.check_trigger(t)
    new_duration = self.window_duration - self.step
    self._reduce_to_duration(new_duration)
def predict(self):
    if os.path.exists(DATA_QUERIES_VECTOR_NPZ) and not FORCE_LOAD:
        print('{}: loading precomputed data'.format(self.__class__.__name__))
        self.load_precomputed_data()
    else:
        self.precomputed_similarity()

    batch_size = 100
    batch_elements = math.ceil(self.queries_vector.shape[0] / batch_size)
    batch_queue = np.array_split(self.queries_vector.A, batch_elements)
    print("starting batch computation of Similarity and KNN calculation")

    # multiple versions of calculating the prediction, some faster, some use more mem
    # prediction = self.multiprocessor_batch_calc(batch_queue)
    prediction = self.batch_calculation(batch_queue)
    # prediction = self.individual_calculation()
    # prediction = self.cosine_knn_calc()
    # prediction = self.custom_knn_calculation(prediction)

    train_avg_salary = sum(self.y_train) / len(self.y_train)
    cleaned_predictions = [x if str(x) != 'nan' else train_avg_salary
                           for x in prediction]
    return self.y_train, cleaned_predictions
def load_test_data(self):
    # Remove non-npz files, and perform ascending sort
    allfiles = os.listdir(self.data_dir)
    npzfiles = []
    for idx, f in enumerate(allfiles):
        if ".npz" in f:
            npzfiles.append(os.path.join(self.data_dir, f))
    npzfiles.sort()

    # Files for validation sets
    val_files = np.array_split(npzfiles, self.n_folds)
    val_files = val_files[self.fold_idx]

    print "\n========== [Fold-{}] ==========\n".format(self.fold_idx)

    print "Load validation set:"
    data_val, label_val = self._load_npz_list_files(val_files)

    return data_val, label_val
def __init__(self, X, kern, Xm):
    super(PITC, self).__init__("PITC")
    M = np.shape(Xm)[0]
    self.M = M
    start = time.time()
    X_split = np.array_split(X, M)
    self.kern = kern
    kern_blocks = np.zeros((M), dtype=object)

    for t in xrange(M):
        nyst = Nystrom(X_split[t], kern, Xm, False)
        size = np.shape(X_split[t])[0]
        kern_blocks[t] = kern.K(X_split[t], X_split[t]) - nyst.precon \
            + (kern.noise) * np.identity(size)

    self.blocks = kern_blocks
    blocked = block_diag(*kern_blocks)
    self.nyst = Nystrom(X, kern, Xm, False)
    self.precon = self.nyst.precon + blocked
    self.duration = time.time() - start
def _read_image_as_array(path, dtype, load_size, crop_size, flip):
    f = Image.open(path)
    A, B = numpy.array_split(numpy.asarray(f), 2, axis=1)
    if hasattr(f, 'close'):
        f.close()

    A = _resize(A, load_size, Image.BILINEAR, dtype)
    B = _resize(B, load_size, Image.NEAREST, dtype)

    sx, sy = numpy.random.randint(0, load_size - crop_size, 2)
    A = _crop(A, sx, sy, crop_size)
    B = _crop(B, sx, sy, crop_size)

    if flip and numpy.random.rand() > 0.5:
        A = numpy.fliplr(A)
        B = numpy.fliplr(B)

    return A.transpose(2, 0, 1), B.transpose(2, 0, 1)
def setup_figure():
    f = plt.figure(figsize=(7, 5))

    mat_grid = plt.GridSpec(2, 6, .07, .52, .98, .95, .15, .20)
    mat_axes = [f.add_subplot(spec) for spec in mat_grid]
    sticks_axes, rest_axes = np.array_split(mat_axes, 2)

    scatter_grid = plt.GridSpec(1, 6, .07, .30, .98, .49, .15, .05)
    scatter_axes = [f.add_subplot(spec) for spec in scatter_grid]

    kde_grid = plt.GridSpec(1, 6, .07, .07, .98, .21, .15, .05)
    kde_axes = [f.add_subplot(spec) for spec in kde_grid]

    cbar_ax = f.add_axes([.04, .62, .015, .26])

    return f, sticks_axes, rest_axes, scatter_axes, kde_axes, cbar_ax
def partitions(min_val, max_val, n):
    """
    Get start/stop boundaries for N partitions.

    Args:
        min_val (int): The starting value.
        max_val (int): The last value.
        n (int): The number of partitions.
    """
    pts = np.array_split(np.arange(min_val, max_val + 1), n)

    bounds = []
    for pt in pts:
        bounds.append((int(pt[0]), int(pt[-1])))

    return bounds
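For example (a hypothetical call, not part of the source project), three partitions over the inclusive range 0..9 come out as contiguous, near-equal bounds, with the leading partition taking the extra element:

    # np.arange(0, 10) split 3 ways gives chunks [0..3], [4..6], [7..9],
    # so the start/stop bounds are:
    print(partitions(0, 9, 3))  # -> [(0, 3), (4, 6), (7, 9)]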
def fit(self, X, y):
    """Fit a series of independent estimators to the dataset.

    Parameters
    ----------
    X : array, shape (n_samples, n_features, n_estimators)
        The training input samples. For each data slice, a clone estimator
        is fitted independently.
    y : array, shape (n_samples,)
        The target values.

    Returns
    -------
    self : object
        Return self.
    """
    self._check_Xy(X, y)
    self.estimators_ = list()
    # For fitting, the parallelization is across estimators.
    parallel, p_func, n_jobs = parallel_func(_sl_fit, self.n_jobs)
    estimators = parallel(
        p_func(self.base_estimator, split, y)
        for split in np.array_split(X, n_jobs, axis=-1))
    self.estimators_ = np.concatenate(estimators, 0)
    return self
def _transform(self, X, method):
    """Aux. function to make parallel predictions/transformation."""
    self._check_Xy(X)
    method = _check_method(self.base_estimator, method)
    if X.shape[-1] != len(self.estimators_):
        raise ValueError('The number of estimators does not match '
                         'X.shape[2]')
    # For predictions/transforms the parallelization is across the data and
    # not across the estimators to avoid memory load.
    parallel, p_func, n_jobs = parallel_func(_sl_transform, self.n_jobs)
    X_splits = np.array_split(X, n_jobs, axis=-1)
    est_splits = np.array_split(self.estimators_, n_jobs)
    y_pred = parallel(p_func(est, x, method)
                      for (est, x) in zip(est_splits, X_splits))

    if n_jobs > 1:
        y_pred = np.concatenate(y_pred, axis=1)
    else:
        y_pred = y_pred[0]
    return y_pred
def _yield_minibatches_idx(self, n_batches, data_ary, shuffle=True):
    indices = np.arange(data_ary.shape[0])

    if shuffle:
        indices = np.random.permutation(indices)

    if n_batches > 1:
        remainder = data_ary.shape[0] % n_batches

        if remainder:
            minis = np.array_split(indices[:-remainder], n_batches)
            minis[-1] = np.concatenate((minis[-1], indices[-remainder:]),
                                       axis=0)
        else:
            minis = np.array_split(indices, n_batches)
    else:
        minis = (indices,)

    for idx_batch in minis:
        yield idx_batch
def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
def binned_batch_stream(target_statistics, batch_size, n_batches, n_bins=64):
    hist, bins = np.histogram(target_statistics, bins=n_bins)
    indx = np.argsort(target_statistics)
    indicies_categories = np.array_split(indx, np.cumsum(hist)[:-1])

    per_category = batch_size / n_bins

    weight_correction = (np.float64(hist) / per_category).astype('float32')
    wc = np.repeat(weight_correction, per_category)

    for i in xrange(n_batches):
        sample = [
            np.random.choice(ind, size=per_category, replace=True)
            for ind in indicies_categories
        ]
        yield np.hstack(sample), wc
def binned_batch_stream(target_statistics, batch_size, n_batches, n_bins=64):
    hist, bins = np.histogram(target_statistics, bins=n_bins)
    indx = np.argsort(target_statistics)
    indicies_categories = np.array_split(indx, np.cumsum(hist)[:-1])
    n_samples = target_statistics.shape[0]

    per_category = batch_size / n_bins

    weight_correction = (n_bins * np.float64(hist) / n_samples).astype('float32')
    wc = np.repeat(weight_correction, per_category)

    for i in xrange(n_batches):
        sample = [
            np.random.choice(ind, size=per_category, replace=True)
            for ind in indicies_categories
        ]
        yield np.hstack(sample), wc
def test_shape_factors(self):
    """
    Tests for :func:`array_split.split.shape_factors`.
    """
    f = shape_factors(4, 2)
    self.assertTrue(_np.all(f == 2))

    f = shape_factors(4, 1)
    self.assertTrue(_np.all(f == 4))

    f = shape_factors(5, 2)
    self.assertTrue(_np.all(f == [1, 5]))

    f = shape_factors(6, 2)
    self.assertTrue(_np.all(f == [2, 3]))

    f = shape_factors(6, 3)
    self.assertTrue(_np.all(f == [1, 2, 3]))
def scale(boxlist, y_scale, x_scale):
    """Scale box coordinates in x and y dimensions.

    Args:
        boxlist: BoxList holding N boxes
        y_scale: float
        x_scale: float

    Returns:
        boxlist: BoxList holding N boxes
    """
    y_min, x_min, y_max, x_max = np.array_split(boxlist.get(), 4, axis=1)
    y_min = y_scale * y_min
    y_max = y_scale * y_max
    x_min = x_scale * x_min
    x_max = x_scale * x_max
    scaled_boxlist = np_box_list.BoxList(np.hstack([y_min, x_min, y_max, x_max]))

    fields = boxlist.get_extra_fields()
    for field in fields:
        extra_field_data = boxlist.get_field(field)
        scaled_boxlist.add_field(field, extra_field_data)

    return scaled_boxlist
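The example above leans on a handy pattern: array_split with axis=1 unpacks an (N, 4) coordinate array into four (N, 1) columns that can be rescaled and reassembled with np.hstack. A self-contained sketch of just that pattern (made-up box values, no BoxList involved):

    import numpy as np

    boxes = np.array([[10., 20., 30., 40.],
                      [15., 25., 35., 45.]])  # columns: y_min, x_min, y_max, x_max

    # axis=1 splits the 4 columns into four (N, 1) arrays; keeping the trailing
    # dimension means np.hstack can stitch them straight back together.
    y_min, x_min, y_max, x_max = np.array_split(boxes, 4, axis=1)
    scaled = np.hstack([2.0 * y_min, 0.5 * x_min, 2.0 * y_max, 0.5 * x_max])
    print(scaled)
    # -> [[20.  10.  60.  20. ]
    #     [30.  12.5 70.  22.5]]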
def iterbatches(arrays, num_batches=None, batch_size=None, shuffle=True,
                include_final_partial_batch=True):
    assert (num_batches is None) != (batch_size is None), \
        'Provide num_batches or batch_size, but not both'
    arrays = tuple(map(np.asarray, arrays))
    n = arrays[0].shape[0]
    assert all(a.shape[0] == n for a in arrays[1:])
    inds = np.arange(n)
    if shuffle:
        np.random.shuffle(inds)
    sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
    for batch_inds in np.array_split(inds, sections):
        if include_final_partial_batch or len(batch_inds) == batch_size:
            yield tuple(a[batch_inds] for a in arrays)
def _gen_init_n_blocks(na, nb, ka, kb):
    num_nodes_a = np.arange(na)
    n_blocks_a = map(len, np.array_split(num_nodes_a, ka))
    num_nodes_b = np.arange(nb)
    n_blocks_b = map(len, np.array_split(num_nodes_b, kb))

    n_blocks_ = " ".join(map(str, n_blocks_a)) + " " + \
        " ".join(map(str, n_blocks_b))

    return n_blocks_
def gen_equal_partition(n, total):
    all_nodes = np.arange(total)
    n_blocks = list(map(len, np.array_split(all_nodes, n)))

    return n_blocks
def run_par(self, function, **kwargs):
    """
    Run a function on the agents in parallel.
    """
    columns = kwargs["columns"] if "columns" in kwargs else self.agents.columns

    # Garbage collect before creating new processes.
    gc.collect()
    return pd.concat(self.pool.imap(partial(function, **kwargs),
                                    np.array_split(self.agents[columns],
                                                   self.processes * self.splits)))
def split_in_chunks(minibatch, num_splits, flatten_keys=['labels']):
    '''Return the splits per device

    Return a list of dictionaries, one per device. Each dictionary
    contains, for each key, the values that should be allocated on
    its device.
    '''
    # Split the value of each key into chunks
    for k, v in minibatch.iteritems():
        minibatch[k] = np.array_split(v, num_splits)
        if any(k == v for v in flatten_keys):
            minibatch[k] = [el.flatten() for el in minibatch[k]]
    return map(dict, zip(*[[(k, v) for v in value]
                           for k, value in minibatch.items()]))
def chunk_iterator(dataset, chunk_size=1000):
    chunk_indices = np.array_split(np.arange(len(dataset)),
                                   len(dataset)/chunk_size)

    for chunk_ixs in chunk_indices:
        chunk = dataset[chunk_ixs]
        yield (chunk_ixs, chunk)

    raise StopIteration
def array_split(ary, indices_or_sections, axis=0):
    """Splits an array into multiple sub arrays along a given axis.

    This function is almost equivalent to :func:`cupy.split`. The only
    difference is that this function allows an integer number of sections
    that does not evenly divide the axis.

    .. seealso:: :func:`cupy.split` for more detail, :func:`numpy.array_split`

    """
    return core.array_split(ary, indices_or_sections, axis)
def split(ary, indices_or_sections, axis=0):
    """Splits an array into multiple sub arrays along a given axis.

    Args:
        ary (cupy.ndarray): Array to split.
        indices_or_sections (int or sequence of ints): A value indicating how
            to divide the axis. If it is an integer, it is treated as the
            number of sections, and the axis is evenly divided. Otherwise,
            the integers indicate indices to split at. Note that a sequence
            on the device memory is not allowed.
        axis (int): Axis along which the array is split.

    Returns:
        A list of sub arrays. Each array is a view of the corresponding
        input array.

    .. seealso:: :func:`numpy.split`

    """
    if ary.ndim <= axis:
        raise IndexError('Axis exceeds ndim')
    size = ary.shape[axis]

    if numpy.isscalar(indices_or_sections):
        if size % indices_or_sections != 0:
            raise ValueError(
                'indices_or_sections must divide the size along the axes.\n'
                'If you want to split the array into non-equally-sized '
                'arrays, use array_split instead.')
    return array_split(ary, indices_or_sections, axis)
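The guard above mirrors NumPy's own behavior, which cupy imitates here: np.split refuses an integer section count that does not divide the axis evenly, while np.array_split accepts it. A quick sketch using NumPy:

    import numpy as np

    a = np.arange(10)
    try:
        np.split(a, 3)  # 3 does not divide 10 evenly, so split raises
    except ValueError as e:
        print(e)

    # array_split handles the same request by making the leading chunks larger.
    print([c.size for c in np.array_split(a, 3)])  # -> [4, 3, 3]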
def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True,
                include_final_partial_batch=True):
    assert (num_batches is None) != (batch_size is None), \
        'Provide num_batches or batch_size, but not both'
    arrays = tuple(map(np.asarray, arrays))
    n = arrays[0].shape[0]
    assert all(a.shape[0] == n for a in arrays[1:])
    inds = np.arange(n)
    if shuffle:
        np.random.shuffle(inds)
    sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
    for batch_inds in np.array_split(inds, sections):
        if include_final_partial_batch or len(batch_inds) == batch_size:
            yield tuple(a[batch_inds] for a in arrays)
def trim_data(data, resolution):
    r = []
    for i in numpy.array_split(data, resolution):
        if len(i) > 0:
            r.append(numpy.average(i))
    return r
def test_latlon2pix_edges(pix_size_single, origin_point, is_flipped,
                          num_chunks, chunk_position):
    img = make_image(pix_size_single, origin_point, is_flipped,
                     num_chunks, chunk_position)
    chunk_idx = img.chunk_idx
    res_x = img._full_res[0]
    res_y = img._full_res[1]
    pix_size = (img.pixsize_x, img.pixsize_y)
    origin = (img._start_lon, img._start_lat)

    # compute chunks
    lons = np.arange(res_x + 1) * pix_size[0] + origin[0]  # right edge +1
    all_lats = np.arange(res_y) * pix_size[1] + origin[1]
    lats_chunks = np.array_split(all_lats, num_chunks)[chunk_idx]

    pix_x = np.concatenate((np.arange(res_x), [res_x - 1]))
    pix_y_chunks = range(lats_chunks.shape[0])

    if chunk_position == 'end':
        pix_y = np.concatenate((pix_y_chunks, [pix_y_chunks[-1]]))
        lats = np.concatenate((lats_chunks,
                               [res_y * pix_size[1] + origin[1]]))
    else:
        pix_y = pix_y_chunks
        lats = lats_chunks

    d = np.array([[a, b] for a in lons for b in lats])
    xy = img.lonlat2pix(d)

    true_xy = np.array([[a, b] for a in pix_x for b in pix_y])

    assert np.all(xy == true_xy)
def split_cfold(nsamples, k=5, seed=None):
    """
    Function that returns indices for splitting data into random folds.

    Parameters
    ----------
    nsamples: int
        the number of samples in the dataset
    k: int, optional
        the number of folds
    seed: int, optional
        random seed to provide to numpy

    Returns
    -------
    cvinds: list
        list of arrays of length k, each with approximate shape
        (nsamples / k,) of indices. These indices are randomly permuted
        (without replacement) assignments to each fold.
    cvassigns: ndarray
        array of shape (nsamples,) with each element in [0, k), that can
        be used to assign data to a fold. This corresponds to the indices
        of cvinds.
    """
    np.random.seed(seed)
    pindeces = np.random.permutation(nsamples)
    cvinds = np.array_split(pindeces, k)

    cvassigns = np.zeros(nsamples, dtype=int)
    for n, inds in enumerate(cvinds):
        cvassigns[inds] = n

    return cvinds, cvassigns
def fit(self, x, y, *args, **kwargs):
    # set a different random seed for each thread
    np.random.seed(self.random_state + mpiops.chunk_index)

    if self.parallel:
        process_rfs = np.array_split(range(self.forests),
                                     mpiops.chunks)[mpiops.chunk_index]
    else:
        process_rfs = range(self.forests)

    for t in process_rfs:
        print('training forest {} using '
              'process {}'.format(t, mpiops.chunk_index))

        # change random state in each forest
        self.kwargs['random_state'] = np.random.randint(0, 10000)
        rf = RandomForestTransformed(
            target_transform=self.target_transform,
            n_estimators=self.n_estimators,
            **self.kwargs
        )
        rf.fit(x, y)
        if self.parallel:  # used in training
            pk_f = join(self.temp_dir, 'rf_model_{}.pk'.format(t))
        else:  # used when parallel is false, i.e., during x-val
            pk_f = join(self.temp_dir,
                        'rf_model_{}_{}.pk'.format(t, mpiops.chunk_index))
        with open(pk_f, 'wb') as fp:
            pickle.dump(rf, fp)

    if self.parallel:
        mpiops.comm.barrier()

    # Mark that we are now trained
    self._trained = True
def kmean_distance2(x, C):
    """Compute squared euclidean distance to the nearest cluster centre

    Parameters
    ----------
    x : ndarray
        (n, d) array of n d-dimensional points
    C : ndarray
        (k, d) array of k cluster centres

    Returns
    -------
    d2_x : ndarray
        (n,) length array of distances from each x to the nearest centre
    """
    # To save memory we partition the computation
    nsplits = max(1, int(x.shape[0] / distance_partition_size))
    splits = np.array_split(x, nsplits)
    d2_x = np.empty(x.shape[0])
    idx = 0
    for x_i in splits:
        n_i = x_i.shape[0]
        D2_x = scipy.spatial.distance.cdist(x_i, C, metric='sqeuclidean')
        d2_x[idx:idx + n_i] = np.amin(D2_x, axis=1)
        idx += n_i
    return d2_x
def compute_weights(x, C):
    """Number of points in x assigned to each centre c in C

    Parameters
    ----------
    x : ndarray
        (n, d) array of n d-dimensional points
    C : ndarray
        (k, d) array of k cluster centres

    Returns
    -------
    weights : ndarray
        (k,) length array giving number of x closest to each c in C
    """
    # To save memory we partition the computation
    nsplits = max(1, int(x.shape[0] / distance_partition_size))
    splits = np.array_split(x, nsplits)
    closests = np.empty(x.shape[0], dtype=int)
    idx = 0
    for x_i in splits:
        n_i = x_i.shape[0]
        D2_x = scipy.spatial.distance.cdist(x_i, C, metric='sqeuclidean')
        closests[idx: idx + n_i] = np.argmin(D2_x, axis=1)
        idx += n_i
    weights = np.bincount(closests, minlength=C.shape[0])
    return weights
def reseed_point(X, C, index):
    """Re-initialise the centre of a class if it loses all its members

    This should almost never happen. If it does, find the point furthest
    from all the other cluster centres and use that. Maybe a bad idea but
    a decent first pass.

    Parameters
    ----------
    X : ndarray
        (n, d) array of points
    C : ndarray
        (k, d) array of cluster centres
    index : int >= 0
        index between 0..k-1 of the cluster that has lost its points

    Returns
    -------
    new_point : ndarray
        d-dimensional point for replacing the empty cluster centre.
    """
    log.info("Reseeding class with no members")
    nsplits = max(1, int(X.shape[0] / distance_partition_size))
    splits = np.array_split(X, nsplits)
    empty_index = np.ones(C.shape[0], dtype=bool)
    empty_index[index] = False
    local_candidate = None
    local_cost = 1e23
    for x_i in splits:
        D2_x = scipy.spatial.distance.cdist(x_i, C, metric='sqeuclidean')
        costs = np.sum(D2_x[:, empty_index], axis=1)
        potential_idx = np.argmax(costs)
        potential_cost = costs[potential_idx]
        if potential_cost < local_cost:
            local_candidate = x_i[potential_idx]
            local_cost = potential_cost
    best_pernode = mpiops.comm.allgather(local_cost)
    best_node = np.argmax(best_pernode)
    new_point = mpiops.comm.bcast(local_candidate, root=best_node)
    return new_point
def __init__(self, shape, bbox, crs, name, n_subchunks, outputdir,
             band_tags=None):
    # affine
    self.A, _, _ = image.bbox2affine(bbox[1, 0], bbox[0, 0],
                                     bbox[0, 1], bbox[1, 1],
                                     shape[0], shape[1])
    self.shape = shape
    self.outbands = len(band_tags)
    self.bbox = bbox
    self.name = name
    self.outputdir = outputdir
    self.n_subchunks = n_subchunks
    self.sub_starts = [k[0] for k in np.array_split(
        np.arange(self.shape[1]),
        mpiops.chunks * self.n_subchunks)]

    # file tags don't have spaces
    if band_tags:
        file_tags = ["_".join(k.lower().split()) for k in band_tags]
    else:
        file_tags = [str(k) for k in range(self.outbands)]
        band_tags = file_tags

    if mpiops.chunk_index == 0:
        # create a file for each band
        self.files = []
        for band in range(self.outbands):
            output_filename = os.path.join(
                outputdir, name + "_" + file_tags[band] + ".tif")
            f = rasterio.open(output_filename, 'w', driver='GTiff',
                              width=self.shape[0], height=self.shape[1],
                              dtype=np.float32, count=1,
                              crs=crs,
                              transform=self.A,
                              nodata=self.nodata_value)
            f.update_tags(1, image_type=band_tags[band])
            self.files.append(f)
def gdalaverage(input_dir, out_dir, size):
    """
    Average data using gdal's averaging method.

    Parameters
    ----------
    input_dir: str
        input dir name of the tifs that need to be averaged
    out_dir: str
        output dir name
    size: int, optional
        size of kernel
    """
    input_dir = abspath(input_dir)
    log.info('Reading tifs from {}'.format(input_dir))
    tifs = glob.glob(join(input_dir, '*.tif'))

    process_tifs = np.array_split(tifs, mpiops.chunks)[mpiops.chunk_index]

    for tif in process_tifs:
        data_set = gdal.Open(tif, gdal.GA_ReadOnly)
        # band = data_set.GetRasterBand(1)
        # data_type = gdal.GetDataTypeName(band.DataType)
        # data = band.ReadAsArray()
        # no_data_val = band.GetNoDataValue()
        # averaged_data = filter_data(data, size, no_data_val)
        log.info('Calculated average for {}'.format(basename(tif)))

        output_file = join(out_dir, 'average_' + basename(tif))
        src_gt = data_set.GetGeoTransform()
        tmp_file = '/tmp/tmp_{}.tif'.format(mpiops.chunk_index)
        resample_cmd = [TRANSLATE] + [tif, tmp_file] + \
            ['-tr', str(src_gt[1] * size), str(src_gt[1] * size)] + \
            ['-r', 'bilinear']
        check_call(resample_cmd)
        rollback_cmd = [TRANSLATE] + [tmp_file, output_file] + \
            ['-tr', str(src_gt[1]), str(src_gt[1])]
        check_call(rollback_cmd)
        log.info('Finished converting {}'.format(basename(tif)))
def mean(input_dir, out_dir, size, func, partitions, mask):
    input_dir = abspath(input_dir)
    if isdir(input_dir):
        log.info('Reading tifs from {}'.format(input_dir))
        tifs = glob.glob(join(input_dir, '*.tif'))
    else:
        assert isfile(input_dir)
        tifs = [input_dir]

    process_tifs = np.array_split(tifs, mpiops.chunks)[mpiops.chunk_index]

    for tif in process_tifs:
        log.info('Starting to average {}'.format(basename(tif)))
        treat_file(tif, out_dir, size, func, partitions, mask)
        log.info('Finished averaging {}'.format(basename(tif)))
def inspect(input_dir, report_file, partitions, extension):
    input_dir = abspath(input_dir)
    if isdir(input_dir):
        log.info('Reading tifs from {}'.format(input_dir))
        tifs = glob.glob(join(input_dir, '*.' + extension))
    else:
        log.info('Reporting geoinfo for {}'.format(input_dir))
        tifs = [input_dir]

    with open(report_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(['FineName', 'band', 'NoDataValue', 'rows', 'cols',
                         'Min', 'Max', 'Mean', 'Std',
                         'DataType', 'Categories', 'NanCount'])
        process_tifs = np.array_split(tifs, mpiops.chunks)[mpiops.chunk_index]
        stats = []  # process geotiff stats including multibanded geotif
        for t in process_tifs:
            stats.append(get_stats(t, partitions))
        # gather all process geotif stats in stats dict
        stats = _join_dicts(stats)
        # global gather in root
        stats = _join_dicts(mpiops.comm.gather(stats, root=0))
        if mpiops.chunk_index == 0:
            for k, v in stats.items():
                write_rows(v, writer)