We extracted the following 16 code examples from open-source Python projects to illustrate how to use sklearn.utils.gen_batches().
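For quick orientation before the project examples: gen_batches(n, batch_size) yields successive slice objects that together cover range(n), with the last slice possibly shorter. A minimal sketch of typical usage (the toy array and sizes below are ours, for illustration only):

    import numpy as np
    from sklearn.utils import gen_batches

    X = np.arange(10).reshape(5, 2)      # 5 samples, 2 features (toy data)
    for batch in gen_batches(5, 2):      # yields slice(0, 2), slice(2, 4), slice(4, 5)
        print(batch, X[batch].shape)     # process each mini-batch of rows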
def partial_fit(self, X, sample_indices=None):
    """Update the factorization using rows from X

    Parameters
    ----------
    X: ndarray, shape (n_samples, n_features)
        Input data

    sample_indices:
        Indices for each row of X. If None, consider that row i index is i
        (useful when providing the whole data to the function)

    Returns
    -------
    self
    """
    X = check_array(X, dtype=[np.float32, np.float64], order='C')

    n_samples, n_features = X.shape
    batches = gen_batches(n_samples, self.batch_size)
    for batch in batches:
        this_X = X[batch]
        these_sample_indices = get_sub_slice(sample_indices, batch)
        self._single_batch_fit(this_X, these_sample_indices)
    return self
def test_partial_fit_classification():
    """Test partial_fit for classification.

    It should output the same results as 'fit' for binary and
    multi-class classification.
    """
    for X, y in classification_datasets.values():
        batch_size = 100
        n_samples = X.shape[0]

        elm_fit = ELMClassifier(random_state=random_state,
                                batch_size=batch_size)
        elm_partial_fit = ELMClassifier(random_state=random_state)

        elm_fit.fit(X, y)
        for batch_slice in gen_batches(n_samples, batch_size):
            elm_partial_fit.partial_fit(X[batch_slice], y[batch_slice],
                                        classes=np.unique(y))

        pred1 = elm_fit.predict(X)
        pred2 = elm_partial_fit.predict(X)

        assert_array_equal(pred1, pred2)
        assert_greater(elm_fit.score(X, y), 0.95)
        assert_greater(elm_partial_fit.score(X, y), 0.95)
def test_standard_scaler_trasform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.nextafter(0, 1)
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
        assert_array_less(zero, scaler_incr.scale_ + epsilon)
        # (i+1) because the Scaler has been already fitted
        assert_equal((i + 1), scaler_incr.n_samples_seen_)
def _fit(self, X, y):
    num_input, self.num_input_nodes = X.shape
    num_res, self.num_res_nodes = y.shape
    if num_input != num_res:
        raise ValueError('Data set error!')

    nodes_list = [self.num_input_nodes] + \
        self.num_hidden_nodes + [self.num_res_nodes]
    self.init_param(nodes_list)

    if self.batch_size == 'auto':
        batch_size = min(200, num_input)
    else:
        batch_size = self.batch_size

    activations = [X]
    activations.extend(
        [np.empty((batch_size, n_out_node))
         for n_out_node in nodes_list[1:]])
    activations = self._forward_pass(activations)
    delta = [np.empty_like(a_layer) for a_layer in activations]
    dww = [np.empty_like(w) for w in self.ww]
    dth = [np.empty_like(th) for th in self.th]

    for it in xrange(self.max_iter):
        if self.shuffle:
            index = np.random.permutation(num_input)
            X = X[index]
            y = y[index]
        for batch in gen_batches(num_input, batch_size):
            activations[0] = X[batch]
            dww, dth = self._backprog(
                X[batch], y[batch], dww, dth, delta, activations)
            for layer in xrange(self.n_layers_ - 1):
                self.ww[layer] -= self.learning_rate_init * dww[layer]
                self.th[layer] -= self.learning_rate_init * dth[layer]
    return activations[-1]
def test_partial_fit_regression():
    """Test partial_fit for regression.

    It should output the same results as 'fit' for regression on
    different activation functions.
    """
    X = Xboston
    y = yboston
    batch_size = 100
    n_samples = X.shape[0]

    for activation in ACTIVATION_TYPES:
        elm_fit = ELMRegressor(random_state=random_state, C=100,
                               activation=activation,
                               batch_size=batch_size)
        elm_partial_fit = ELMRegressor(activation=activation, C=100,
                                       random_state=random_state,
                                       batch_size=batch_size)

        elm_fit.fit(X, y)
        for batch_slice in gen_batches(n_samples, batch_size):
            elm_partial_fit.partial_fit(X[batch_slice], y[batch_slice])

        pred1 = elm_fit.predict(X)
        pred2 = elm_partial_fit.predict(X)

        assert_almost_equal(pred1, pred2, decimal=2)
        assert_greater(elm_fit.score(X, y), 0.85)
        assert_greater(elm_partial_fit.score(X, y), 0.85)
def test_sparse_matrices():
    """Test that sparse and dense input matrices yield equal output."""
    X = Xdigits_binary[:50]
    y = ydigits_binary[:50]
    X = csr_matrix(X)
    n_hidden = 15
    batch_size = 10

    # Standard ELM
    elm = ELMClassifier(random_state=1, n_hidden=n_hidden)

    # Batch based
    elm_batch_based = ELMClassifier(random_state=1, n_hidden=n_hidden,
                                    batch_size=10)

    # ELM for partial fitting
    elm_parital = ELMClassifier(random_state=1, n_hidden=n_hidden)

    # Train classifiers
    elm.fit(X, y)
    elm_batch_based.fit(X, y)
    for batch_slice in gen_batches(X.shape[0], batch_size):
        elm_parital.partial_fit(X[batch_slice], y[batch_slice])

    # Get decision scores
    y_pred = elm.decision_function(X)
    y_pred_batch_based = elm_batch_based.decision_function(X)
    y_pred_partial = elm_parital.decision_function(X)

    # The prediction values should be the same
    assert_almost_equal(y_pred, y_pred_batch_based)
    assert_almost_equal(y_pred_batch_based, y_pred_partial)
def _decision_scores(self, X):
    """Predict using the ELM model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.

    Returns
    -------
    y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

    if self.batch_size is None:
        hidden_activations = self._compute_hidden_activations(X)
        y_pred = safe_sparse_dot(hidden_activations, self.coef_output_)
    else:
        n_samples = X.shape[0]
        batches = gen_batches(n_samples, self.batch_size)

        y_pred = np.zeros((n_samples, self.n_outputs_))
        for batch in batches:
            h_batch = self._compute_hidden_activations(X[batch])
            y_pred[batch] = safe_sparse_dot(h_batch, self.coef_output_)

    return y_pred
def fit(self, X, y=None):
    """Fit the model with X, using minibatches of size batch_size.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    y: Passthrough for ``Pipeline`` compatibility.

    Returns
    -------
    self: object
        Returns the instance itself.
    """
    if isinstance(X, Data):
        X = X[:]
    X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape

    if self.batch_size is None:
        batch_size = 12 * n_features
    else:
        batch_size = self.batch_size

    for batch in gen_batches(n_samples, batch_size):
        x = X[batch]
        self.partial_fit(x, check_input=False)
    return self
def pairs_distances_batch(X, ind_a, ind_b, batch_size=500):
    """Equivalent to np.sum(np.square(x[ind_a] - x[ind_b]), axis=1)

    Parameters
    ----------
    X : array_like
        An array of data samples with shape (n_samples, n_features_in).
    ind_a : array_like
        An array of samples indices with shape (m,).
    ind_b : array_like
        Another array of samples indices with shape (m,).
    batch_size :
        Size of each chunk of X to compute distances for (default: 500)

    Returns
    -------
    array-like
        An array of pairwise distances with shape (m,).
    """
    n = len(ind_a)
    res = np.zeros(n)
    for chunk in gen_batches(n, batch_size):
        res[chunk] = np.sum(np.square(X[ind_a[chunk]] - X[ind_b[chunk]]),
                            axis=1)
    return res
def scalable_frobenius_norm_discrepancy(X, U, s, V):
    # if the input is not too big, just call scipy
    if X.shape[0] * X.shape[1] < MAX_MEMORY:
        A = X - U.dot(np.diag(s).dot(V))
        return norm_diff(A, norm='fro')

    print("... computing fro norm by batches...")
    batch_size = 1000
    Vhat = np.diag(s).dot(V)
    cum_norm = .0
    for batch in gen_batches(X.shape[0], batch_size):
        M = X[batch, :] - U[batch, :].dot(Vhat)
        cum_norm += norm_diff(M, norm='fro', msg=False)
    return np.sqrt(cum_norm)
def fit(self, X, y=None):
    """Learns a dictionary from sparse matrix X

    Parameters
    ----------
    X: csr-matrix (n_samples, n_features)
        Dataset to learn the dictionary from
    """
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    X = check_array(X, accept_sparse='csr',
                    dtype=[np.float32, np.float64], copy=True)
    dtype = X.dtype
    n_samples, n_features = X.shape

    self.random_state = check_random_state(self.random_state)

    if self.detrend:
        self.row_mean_, self.col_mean_ = compute_biases(X, beta=self.beta,
                                                        inplace=False)
        for i in range(X.shape[0]):
            X.data[X.indptr[i]:X.indptr[i + 1]] -= self.row_mean_[i]
        X.data -= self.col_mean_.take(X.indices, mode='clip')

    self.components_ = self.random_state.randn(self.n_components,
                                               n_features).astype(dtype)
    S = np.sqrt(np.sum(self.components_ ** 2, axis=1))
    self.components_ /= S[:, np.newaxis]
    self.code_ = np.zeros((n_samples, self.n_components), dtype=dtype)

    self._refit(X)

    self.feature_freq_ = np.bincount(X.indices) / n_samples
    self.feature_n_iter_ = np.zeros(n_features, dtype=int)

    sparsity = X.nnz / n_samples / n_features
    if self.batch_size is None:
        batch_size = int(ceil(1. / sparsity))
    else:
        batch_size = self.batch_size

    self.comp_norm_ = np.zeros(self.n_components, dtype=dtype)
    self.C_ = np.zeros((self.n_components, self.n_components), dtype=dtype)
    self.B_ = np.zeros((self.n_components, n_features), dtype=dtype)

    self.n_iter_ = 0

    if self.verbose:
        log_lim = log(n_samples * self.n_epochs / batch_size, 10)
        self.verbose_iter_ = (np.logspace(0, log_lim, self.verbose,
                                          base=10) - 1) * batch_size
        self.verbose_iter_ = self.verbose_iter_.tolist()

    for i in range(self.n_epochs):
        permutation = self.random_state.permutation(n_samples)
        batches = gen_batches(n_samples, batch_size)
        for batch in batches:
            self._single_batch_fit(X, permutation[batch])
    self._refit(X)
    return self
def transform(self, X):
    """Compute the codes associated to input matrix X, decomposing it
    onto the dictionary

    Parameters
    ----------
    X: ndarray, shape = (n_samples, n_features)

    Returns
    -------
    code: ndarray, shape = (n_samples, n_components)
    """
    check_is_fitted(self, 'components_')

    dtype = self.components_.dtype
    X = check_array(X, order='C', dtype=dtype.type)
    if X.flags['WRITEABLE'] is False:
        X = X.copy()
    n_samples, n_features = X.shape
    if not hasattr(self, 'G_agg') or self.G_agg != 'full':
        G = self.components_.dot(self.components_.T)
    else:
        G = self.G_
    Dx = X.dot(self.components_.T)
    code = np.ones((n_samples, self.n_components), dtype=dtype)
    sample_indices = np.arange(n_samples)
    size_job = ceil(n_samples / self.n_threads)
    batches = list(gen_batches(n_samples, size_job))

    par_func = lambda batch: _enet_regression_single_gram(
        G, Dx[batch], X[batch], code,
        get_sub_slice(sample_indices, batch),
        self.code_l1_ratio, self.code_alpha, self.code_pos,
        self.tol, self.max_iter)
    if self.n_threads > 1:
        res = self._pool.map(par_func, batches)
        _ = list(res)
    else:
        _enet_regression_single_gram(
            G, Dx, X, code, sample_indices,
            self.code_l1_ratio, self.code_alpha, self.code_pos,
            self.tol, self.max_iter)
    return code
def _find_impostors_batch(x1, x2, t1, t2, return_dist=False,
                          batch_size=500):
    """Find impostor pairs in chunks to avoid large memory usage

    Parameters
    ----------
    x1 : array_like
        An array of transformed data samples with shape
        (n_samples, n_features).
    x2 : array_like
        An array of transformed data samples with shape
        (m_samples, n_features) where m_samples < n_samples.
    t1 : array_like
        An array of distances to the margins with shape (n_samples,).
    t2 : array_like
        An array of distances to the margins with shape (m_samples,).
    batch_size : int (Default value = 500)
        The size of each chunk of x1 to compute distances to.
    return_dist : bool (Default value = False)
        Whether to return the distances to the impostors.

    Returns
    -------
    tuple: (array_like, array_like, [array_like])
        imp1 : array_like
            An array of sample indices with shape (n_impostors,).
        imp2 : array_like
            An array of sample indices that violate a margin with shape
            (n_impostors,).
        dist : array_like, optional
            An array of pairwise distances of (imp1, imp2) with shape
            (n_impostors,).
    """
    n, m = len(t1), len(t2)
    imp1, imp2, dist = [], [], []
    for chunk in gen_batches(n, batch_size):
        dist_out_in = euclidean_distances(x1[chunk], x2, squared=True)
        i1, j1 = np.where(dist_out_in < t1[chunk, None])
        i2, j2 = np.where(dist_out_in < t2[None, :])

        if len(i1):
            imp1.extend(i1 + chunk.start)
            imp2.extend(j1)
            if return_dist:
                dist.extend(dist_out_in[i1, j1])
        if len(i2):
            imp1.extend(i2 + chunk.start)
            imp2.extend(j2)
            if return_dist:
                dist.extend(dist_out_in[i2, j2])

    if return_dist:
        return imp1, imp2, dist
    else:
        return imp1, imp2
def test_minmax_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std until the end of partial fits, and
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
def test_standard_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # Nones
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constants

        # Test std until the end of partial fits, and
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
def test_maxabs_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_,
                                  scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_,
                                  scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std until the end of partial fits, and
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)