We extracted the following 16 code examples from open-source Python projects to illustrate how to use sklearn.utils.gen_batches().
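For quick orientation before the project examples: gen_batches(n, batch_size) yields successive slice objects that together cover range(n), with the last slice possibly shorter. A minimal sketch of typical usage (the toy array and sizes below are ours, for illustration only):

    import numpy as np
    from sklearn.utils import gen_batches

    X = np.arange(10).reshape(5, 2)      # 5 samples, 2 features (toy data)
    for batch in gen_batches(5, 2):      # yields slice(0, 2), slice(2, 4), slice(4, 5)
        print(batch, X[batch].shape)     # process each mini-batch of rows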
def partial_fit(self, X, sample_indices=None):
    """Update the factorization using rows from X

    Parameters
    ----------
    X: ndarray, shape (n_samples, n_features)
        Input data

    sample_indices:
        Indices for each row of X. If None, consider that row i index is i
        (useful when providing the whole data to the function)

    Returns
    -------
    self
    """
    X = check_array(X, dtype=[np.float32, np.float64], order='C')

    n_samples, n_features = X.shape
    batches = gen_batches(n_samples, self.batch_size)
    for batch in batches:
        this_X = X[batch]
        these_sample_indices = get_sub_slice(sample_indices, batch)
        self._single_batch_fit(this_X, these_sample_indices)
    return self
def test_partial_fit_classification():
    """Test partial_fit for classification.

    It should output the same results as 'fit' for binary and
    multi-class classification.
    """
    for X, y in classification_datasets.values():
        batch_size = 100
        n_samples = X.shape[0]

        elm_fit = ELMClassifier(random_state=random_state,
                                batch_size=batch_size)
        elm_partial_fit = ELMClassifier(random_state=random_state)

        elm_fit.fit(X, y)
        for batch_slice in gen_batches(n_samples, batch_size):
            elm_partial_fit.partial_fit(X[batch_slice], y[batch_slice],
                                        classes=np.unique(y))

        pred1 = elm_fit.predict(X)
        pred2 = elm_partial_fit.predict(X)

        assert_array_equal(pred1, pred2)
        assert_greater(elm_fit.score(X, y), 0.95)
        assert_greater(elm_partial_fit.score(X, y), 0.95)
def test_standard_scaler_trasform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.nextafter(0, 1)
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
        assert_array_less(zero, scaler_incr.scale_ + epsilon)
        # (i+1) because the Scaler has been already fitted
        assert_equal((i + 1), scaler_incr.n_samples_seen_)
def _fit(self, X, y):
    num_input, self.num_input_nodes = X.shape
    num_res, self.num_res_nodes = y.shape
    if num_input != num_res:
        raise ValueError('Data set error!')

    nodes_list = [self.num_input_nodes] + \
        self.num_hidden_nodes + [self.num_res_nodes]
    self.init_param(nodes_list)

    if self.batch_size == 'auto':
        batch_size = min(200, num_input)
    else:
        batch_size = self.batch_size

    activations = [X]
    activations.extend(
        [np.empty((batch_size, n_out_node))
         for n_out_node in nodes_list[1:]])
    activations = self._forward_pass(activations)
    delta = [np.empty_like(a_layer) for a_layer in activations]
    dww = [np.empty_like(w) for w in self.ww]
    dth = [np.empty_like(th) for th in self.th]

    for it in xrange(self.max_iter):
        if self.shuffle:
            index = np.random.permutation(num_input)
            X = X[index]
            y = y[index]
        for batch in gen_batches(num_input, batch_size):
            activations[0] = X[batch]
            dww, dth = self._backprog(
                X[batch], y[batch], dww, dth, delta, activations)
            for layer in xrange(self.n_layers_ - 1):
                self.ww[layer] -= self.learning_rate_init * dww[layer]
                self.th[layer] -= self.learning_rate_init * dth[layer]
    return activations[-1]
def test_partial_fit_regression():
    """Test partial_fit for regression.

    It should output the same results as 'fit' for regression on
    different activation functions.
    """
    X = Xboston
    y = yboston
    batch_size = 100
    n_samples = X.shape[0]

    for activation in ACTIVATION_TYPES:
        elm_fit = ELMRegressor(random_state=random_state, C=100,
                               activation=activation,
                               batch_size=batch_size)
        elm_partial_fit = ELMRegressor(activation=activation, C=100,
                                       random_state=random_state,
                                       batch_size=batch_size)

        elm_fit.fit(X, y)
        for batch_slice in gen_batches(n_samples, batch_size):
            elm_partial_fit.partial_fit(X[batch_slice], y[batch_slice])

        pred1 = elm_fit.predict(X)
        pred2 = elm_partial_fit.predict(X)

        assert_almost_equal(pred1, pred2, decimal=2)
        assert_greater(elm_fit.score(X, y), 0.85)
        assert_greater(elm_partial_fit.score(X, y), 0.85)
def test_sparse_matrices():
    """Test that sparse and dense input matrices yield equal output."""
    X = Xdigits_binary[:50]
    y = ydigits_binary[:50]
    X = csr_matrix(X)
    n_hidden = 15
    batch_size = 10

    # Standard ELM
    elm = ELMClassifier(random_state=1, n_hidden=n_hidden)

    # Batch based
    elm_batch_based = ELMClassifier(random_state=1, n_hidden=n_hidden,
                                    batch_size=10)

    # ELM for partial fitting
    elm_parital = ELMClassifier(random_state=1, n_hidden=n_hidden)

    # Train classifiers
    elm.fit(X, y)
    elm_batch_based.fit(X, y)
    for batch_slice in gen_batches(X.shape[0], batch_size):
        elm_parital.partial_fit(X[batch_slice], y[batch_slice])

    # Get decision scores
    y_pred = elm.decision_function(X)
    y_pred_batch_based = elm_batch_based.decision_function(X)
    y_pred_partial = elm_parital.decision_function(X)

    # The prediction values should be the same
    assert_almost_equal(y_pred, y_pred_batch_based)
    assert_almost_equal(y_pred_batch_based, y_pred_partial)
def _decision_scores(self, X):
    """Predict using the ELM model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.

    Returns
    -------
    y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

    if self.batch_size is None:
        hidden_activations = self._compute_hidden_activations(X)
        y_pred = safe_sparse_dot(hidden_activations, self.coef_output_)
    else:
        n_samples = X.shape[0]
        batches = gen_batches(n_samples, self.batch_size)

        y_pred = np.zeros((n_samples, self.n_outputs_))
        for batch in batches:
            h_batch = self._compute_hidden_activations(X[batch])
            y_pred[batch] = safe_sparse_dot(h_batch, self.coef_output_)

    return y_pred
def fit(self, X, y=None):
    """Fit the model with X, using minibatches of size batch_size.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    y: Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self: object
        Returns the instance itself.
    """
    if isinstance(X, Data):
        X = X[:]
    X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape

    if self.batch_size is None:
        batch_size = 12 * n_features
    else:
        batch_size = self.batch_size

    for batch in gen_batches(n_samples, batch_size):
        x = X[batch]
        self.partial_fit(x, check_input=False)
    return self
def pairs_distances_batch(X, ind_a, ind_b, batch_size=500):
    """Equivalent to np.sum(np.square(x[ind_a] - x[ind_b]), axis=1)

    Parameters
    ----------
    X : array_like
        An array of data samples with shape (n_samples, n_features_in).
    ind_a : array_like
        An array of samples indices with shape (m,).
    ind_b : array_like
        Another array of samples indices with shape (m,).
    batch_size :
        Size of each chunk of X to compute distances for (default: 500)

    Returns
    -------
    array-like
        An array of pairwise distances with shape (m,).
    """
    n = len(ind_a)
    res = np.zeros(n)
    for chunk in gen_batches(n, batch_size):
        res[chunk] = np.sum(np.square(X[ind_a[chunk]] - X[ind_b[chunk]]),
                            axis=1)
    return res
def scalable_frobenius_norm_discrepancy(X, U, s, V):
    # if the input is not too big, just call scipy
    if X.shape[0] * X.shape[1] < MAX_MEMORY:
        A = X - U.dot(np.diag(s).dot(V))
        return norm_diff(A, norm='fro')

    print("... computing fro norm by batches...")
    batch_size = 1000
    Vhat = np.diag(s).dot(V)
    cum_norm = .0
    for batch in gen_batches(X.shape[0], batch_size):
        M = X[batch, :] - U[batch, :].dot(Vhat)
        cum_norm += norm_diff(M, norm='fro', msg=False)
    return np.sqrt(cum_norm)
def fit(self, X, y=None):
    """Learns a dictionary from sparse matrix X

    Parameters
    ----------
    X: csr-matrix (n_samples, n_features)
        Dataset to learn the dictionary from
    """
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    X = check_array(X, accept_sparse='csr',
                    dtype=[np.float32, np.float64], copy=True)
    dtype = X.dtype
    n_samples, n_features = X.shape

    self.random_state = check_random_state(self.random_state)

    if self.detrend:
        self.row_mean_, self.col_mean_ = compute_biases(X, beta=self.beta,
                                                        inplace=False)
        for i in range(X.shape[0]):
            X.data[X.indptr[i]:X.indptr[i + 1]] -= self.row_mean_[i]
        X.data -= self.col_mean_.take(X.indices, mode='clip')

    self.components_ = self.random_state.randn(self.n_components,
                                               n_features).astype(dtype)
    S = np.sqrt(np.sum(self.components_ ** 2, axis=1))
    self.components_ /= S[:, np.newaxis]
    self.code_ = np.zeros((n_samples, self.n_components), dtype=dtype)

    self._refit(X)

    self.feature_freq_ = np.bincount(X.indices) / n_samples
    self.feature_n_iter_ = np.zeros(n_features, dtype=int)

    sparsity = X.nnz / n_samples / n_features
    if self.batch_size is None:
        batch_size = int(ceil(1. / sparsity))
    else:
        batch_size = self.batch_size

    self.comp_norm_ = np.zeros(self.n_components, dtype=dtype)
    self.C_ = np.zeros((self.n_components, self.n_components), dtype=dtype)
    self.B_ = np.zeros((self.n_components, n_features), dtype=dtype)

    self.n_iter_ = 0

    if self.verbose:
        log_lim = log(n_samples * self.n_epochs / batch_size, 10)
        self.verbose_iter_ = (np.logspace(0, log_lim, self.verbose,
                                          base=10) - 1) * batch_size
        self.verbose_iter_ = self.verbose_iter_.tolist()

    for i in range(self.n_epochs):
        permutation = self.random_state.permutation(n_samples)
        batches = gen_batches(n_samples, batch_size)
        for batch in batches:
            self._single_batch_fit(X, permutation[batch])
    self._refit(X)
    return self
def transform(self, X):
    """Compute the codes associated to input matrix X, decomposing it
    onto the dictionary

    Parameters
    ----------
    X: ndarray, shape = (n_samples, n_features)

    Returns
    -------
    code: ndarray, shape = (n_samples, n_components)
    """
    check_is_fitted(self, 'components_')

    dtype = self.components_.dtype
    X = check_array(X, order='C', dtype=dtype.type)
    if X.flags['WRITEABLE'] is False:
        X = X.copy()
    n_samples, n_features = X.shape
    if not hasattr(self, 'G_agg') or self.G_agg != 'full':
        G = self.components_.dot(self.components_.T)
    else:
        G = self.G_
    Dx = X.dot(self.components_.T)
    code = np.ones((n_samples, self.n_components), dtype=dtype)
    sample_indices = np.arange(n_samples)
    size_job = ceil(n_samples / self.n_threads)
    batches = list(gen_batches(n_samples, size_job))

    par_func = lambda batch: _enet_regression_single_gram(
        G, Dx[batch], X[batch], code,
        get_sub_slice(sample_indices, batch),
        self.code_l1_ratio, self.code_alpha, self.code_pos,
        self.tol, self.max_iter)
    if self.n_threads > 1:
        res = self._pool.map(par_func, batches)
        _ = list(res)
    else:
        _enet_regression_single_gram(
            G, Dx, X, code, sample_indices,
            self.code_l1_ratio, self.code_alpha, self.code_pos,
            self.tol, self.max_iter)
    return code
def _find_impostors_batch(x1, x2, t1, t2, return_dist=False,
                          batch_size=500):
    """Find impostor pairs in chunks to avoid large memory usage

    Parameters
    ----------
    x1 : array_like
        An array of transformed data samples with shape
        (n_samples, n_features).
    x2 : array_like
        An array of transformed data samples with shape
        (m_samples, n_features) where m_samples < n_samples.
    t1 : array_like
        An array of distances to the margins with shape (n_samples,).
    t2 : array_like
        An array of distances to the margins with shape (m_samples,).
    batch_size : int (Default value = 500)
        The size of each chunk of x1 to compute distances to.
    return_dist : bool (Default value = False)
        Whether to return the distances to the impostors.

    Returns
    -------
    tuple: (array_like, array_like, [array_like])
        imp1 : array_like
            An array of sample indices with shape (n_impostors,).
        imp2 : array_like
            An array of sample indices that violate a margin with shape
            (n_impostors,).
        dist : array_like, optional
            An array of pairwise distances of (imp1, imp2) with shape
            (n_impostors,).
    """
    n, m = len(t1), len(t2)
    imp1, imp2, dist = [], [], []
    for chunk in gen_batches(n, batch_size):
        dist_out_in = euclidean_distances(x1[chunk], x2, squared=True)
        i1, j1 = np.where(dist_out_in < t1[chunk, None])
        i2, j2 = np.where(dist_out_in < t2[None, :])

        if len(i1):
            imp1.extend(i1 + chunk.start)
            imp2.extend(j1)
            if return_dist:
                dist.extend(dist_out_in[i1, j1])
        if len(i2):
            imp1.extend(i2 + chunk.start)
            imp2.extend(j2)
            if return_dist:
                dist.extend(dist_out_in[i2, j2])

    if return_dist:
        return imp1, imp2, dist
    else:
        return imp1, imp2
def test_minmax_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std until the end of partial fits, and
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
def test_standard_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # Nones
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constants

        # Test std until the end of partial fits, and
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
def test_maxabs_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_,
                                  scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_,
                                  scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std until the end of partial fits, and
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)