Python sklearn.utils.validation 模块，check_X_y() 实例源码

我们从Python开源项目中，提取了以下22个代码示例，用于说明如何使用sklearn.utils.validation.check_X_y()。

项目：project-template 作者：scikit-learn-contrib | 项目源码 | 文件源码

def fit(self, X, y):
        """Validate the training data and return the estimator unchanged.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training samples; checked together with ``y`` by ``check_X_y``.
        y : array-like
            Target values paired with ``X``.

        Returns
        -------
        self : object
            This estimator instance.
        """
        # Validation is the only work performed; the validated arrays are
        # not stored on the estimator.
        check_X_y(X, y)
        return self

项目：project-template 作者：scikit-learn-contrib | 项目源码 | 文件源码

def fit(self, X, y):
        """Fit the reference classifier by storing the validated training set.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training samples.
        y : array-like, shape = [n_samples]
            Class labels for the training samples.

        Returns
        -------
        self : object
            This classifier, with ``classes_``, ``X_`` and ``y_`` set.
        """
        # check_X_y validates both arrays together.
        checked_X, checked_y = check_X_y(X, y)

        # Remember which labels were present at fit time.
        self.classes_ = unique_labels(checked_y)

        # Keep the validated training data on the instance.
        self.X_, self.y_ = checked_X, checked_y
        return self

项目：polylearn 作者：scikit-learn-contrib | 项目源码 | 文件源码

def _check_X_y(self, X, y):
        """Validate a binary classification problem and encode y as -1 / +1.

        Parameters
        ----------
        X : array-like or sparse matrix
            Training samples; validated as double precision, with CSC
            accepted as the sparse format.
        y : array-like
            Targets; must be of ``'binary'`` type per ``type_of_target``.

        Returns
        -------
        X : validated samples as returned by ``check_X_y``.
        y : 1-d ``np.double`` array produced by the fitted label binarizer.

        Raises
        ------
        TypeError
            If ``y`` has two or more columns or is not a binary target.
        """
        # Reject multi-column targets up front so the user gets the helpful
        # message below rather than a generic validation error.
        has_multiple_columns = False
        if hasattr(y, 'shape') and len(y.shape) > 1:
            has_multiple_columns = y.shape[1] >= 2

        if has_multiple_columns or type_of_target(y) != 'binary':
            raise TypeError("Only binary targets supported. For training "
                            "multiclass or multilabel models, you may use the "
                            "OneVsRest or OneVsAll metaestimators in "
                            "scikit-learn.")

        X, checked_y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                                 multi_output=False)

        # The binarizer is stored on the instance before it is fitted.
        binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        self.label_binarizer_ = binarizer
        encoded = binarizer.fit_transform(checked_y)
        return X, encoded.ravel().astype(np.double)

项目：sparsereg 作者：Ohjeah | 项目源码 | 文件源码

def fit(self, x_, y, sample_weight=None):
        """Fit the sparse linear model.

        Parameters
        ----------
        x_ : array-like, shape = [n_samples, n_features]
            Training samples. Sparse input is rejected (``accept_sparse=[]``).
        y : array-like, shape = [n_samples]
            Numeric, single-output targets.
        sample_weight : array-like or None
            Optional per-sample weights, applied through ``_rescale_data``.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        X, y = check_X_y(x_, y, accept_sparse=[], y_numeric=True, multi_output=False)

        # Feed the *validated* X downstream. The raw ``x_`` may be a list or
        # another array-like that the slicing below (``x[:, self.ind_]``)
        # cannot handle, and using it would discard the validation result.
        # NOTE(review): sample_weight is deliberately left as None here, as in
        # the original; confirm whether centering should be weighted.
        x, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=None)

        if sample_weight is not None:
            x, y = _rescale_data(x, y, sample_weight)

        self.iters = 0
        self.ind_ = np.ones(x.shape[1], dtype=bool)  # initial guess: keep every feature
        if self.threshold > 0:
            self._reduce(x, y)
        else:
            self.coef_ = self._regress(x[:, self.ind_], y, self.alpha)

        if self.unbias and self.alpha >= 0:
            self._unbias(x, y)

        self._set_intercept(X_offset, y_offset, X_scale)
        return self

项目：sparsereg 作者：Ohjeah | 项目源码 | 文件源码

def fit(self, x_, y, sample_weight=None):
        """Fit the linear model using ``fit_with_noise``.

        Parameters
        ----------
        x_ : array-like, shape = [n_samples, n_features]
            Training samples. Sparse input is rejected (``accept_sparse=[]``).
        y : array-like, shape = [n_samples]
            Numeric, single-output targets.
        sample_weight : array-like or None
            Optional per-sample weights, applied through ``_rescale_data``.

        Returns
        -------
        self : object
            The fitted estimator, with ``coef_`` and ``intercept_`` set.
        """
        # The former ``n_samples, n_features = x_.shape`` was unused and made
        # plain-list input fail before validation could convert it.
        X, y = check_X_y(x_, y, accept_sparse=[], y_numeric=True, multi_output=False)

        # Pass the validated X on, not the raw ``x_``, so validation is not
        # silently discarded.
        # NOTE(review): sample_weight is deliberately left as None here, as in
        # the original; confirm whether centering should be weighted.
        x, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=None)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            x, y = _rescale_data(x, y, sample_weight)

        coefs, intercept = fit_with_noise(x, y, self.sigma, self.alpha, self.n)
        self.intercept_ = intercept
        self.coef_ = coefs
        self._set_intercept(X_offset, y_offset, X_scale)
        return self

项目：xam 作者：MaxHalford | 项目源码 | 文件源码

def fit(self, X, y=None, **fit_params):
        """Record, for every class label, the columns with the largest sums.

        For each distinct label in ``y`` the rows of ``X`` carrying that label
        are summed column-wise, and the indices of the ``n_terms`` largest
        sums (capped at the number of columns) are stored as a set in
        ``top_terms_per_class_``.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        # scikit-learn validation of both arrays
        X, y = check_X_y(X, y)

        # Never ask for more terms than there are columns.
        n_top = min(self.n_terms, X.shape[1])

        top_terms = {}
        for label in np.unique(y):
            column_totals = np.sum(X[y == label], axis=0)
            # argpartition places the n_top largest entries in the last slots.
            partitioned = np.argpartition(column_totals, -n_top)
            top_terms[label] = set(partitioned[-n_top:])
        self.top_terms_per_class_ = top_terms

        return self

项目：ottertune 作者：cmu-db | 项目源码 | 文件源码

def fit(self, X, y):
        """Run the grid search sequentially over ``self.parameter_grid``.

        Each parameter set is evaluated with ``mp_grid_search`` and its result
        is appended to ``self.grid_scores``. When ``self.checkpoint_path`` is
        set, earlier checkpoint files are removed and the scores collected so
        far are pickled after every task.

        Parameters
        ----------
        X : array-like
            Training samples (n-dimensional input is allowed).
        y : array-like
            Numeric targets (multi-output is allowed).

        Raises
        ------
        AssertionError
            If ``self.njobs > 1``; the parallel path is not implemented.
        """
        from fabric.api import local

        X, y = check_X_y(X, y, allow_nd=True, multi_output=True,
                         y_numeric=True, estimator="GridSearch")
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("njobs = {}".format(self.njobs))
        if self.njobs > 1:
            # The multiprocessing variant was never finished. Raise explicitly
            # rather than ``assert False``, which is stripped under ``-O`` and
            # would then silently do nothing.
            raise AssertionError(
                "parallel grid search (njobs > 1) is not implemented")

        self.grid_scores = []
        estimator = self.estimator_cls()
        num_tasks = len(self.parameter_grid)
        for i, params in enumerate(self.parameter_grid):
            print("Starting task {}/{}...".format(i + 1, num_tasks))
            with stopwatch("Done. Elapsed time"):
                task = (i, params, estimator, self.kf, X, y,
                        self.score_fns, num_tasks)
                self.grid_scores.append(mp_grid_search(task))

            if self.checkpoint_path is not None:
                # Drop earlier checkpoints, then save the scores so far.
                # NOTE(review): checkpoint_path is interpolated into a shell
                # command unquoted; confirm it never contains shell
                # metacharacters.
                local("rm -f {}*.p".format(self.checkpoint_path))
                savepath = self.checkpoint_path + "_{}.p".format(i)
                # Pickle data is binary, so the file must be opened in 'wb'.
                with open(savepath, 'wb') as f:
                    pickle.dump(self.grid_scores, f)

项目：ottertune 作者：cmu-db | 项目源码 | 文件源码

def check_X_y(self, X, y):
        """Validate training data, enforcing the ``GPR.MAX_TRAIN_SIZE`` cap.

        Raises ``Exception`` when ``X`` has more rows than
        ``GPR.MAX_TRAIN_SIZE``; otherwise returns the ``(X, y)`` pair produced
        by scikit-learn's ``check_X_y`` (multi-output, n-d, numeric ``y``).
        """
        # Imported locally under an alias because this method shares its name
        # with the scikit-learn helper.
        from sklearn.utils.validation import check_X_y as sk_check_X_y

        n_rows = X.shape[0]
        max_rows = GPR.MAX_TRAIN_SIZE
        if n_rows > max_rows:
            message = "X_train size cannot exceed {} ({})".format(max_rows, n_rows)
            raise Exception(message)

        return sk_check_X_y(X, y, multi_output=True, allow_nd=True,
                            y_numeric=True, estimator="GPR")

项目：ottertune 作者：cmu-db | 项目源码 | 文件源码

def fit(self, X_train, y_train, ridge=1.0):
        """Fit the model by running the pre-built TensorFlow ops on the data.

        Parameters
        ----------
        X_train, y_train : array-like
            Training data; validated by ``self.check_X_y`` and stored on the
            instance as float32 arrays.
        ridge : scalar or array with ``ndim == 1``
            A scalar is expanded to one value per training sample.

        Returns
        -------
        self : object
            The instance, with ``K``, ``K_inv`` and ``xy_`` set.
        """
        self._reset()
        X_train, y_train = self.check_X_y(X_train, y_train)
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]

        # Broadcast a scalar ridge to one entry per training sample.
        if np.isscalar(ridge):
            ridge = np.ones(sample_size) * ridge
        # NOTE(review): a non-scalar ridge must expose ``.ndim`` (e.g. an
        # ndarray); a plain list would fail here with AttributeError.
        assert ridge.ndim == 1

        X_dists = np.zeros((sample_size, sample_size), dtype=np.float32)
        with tf.Session(graph=self.graph, config=tf.ConfigProto(
                intra_op_parallelism_threads=self.NUM_THREADS)) as sess:
            dist_op = self.ops['dist_op']
            v1, v2 = self.vars['v1_h'], self.vars['v2_h']
            # Fill X_dists one row at a time: row i is the result of dist_op
            # for sample i fed against the whole training set.
            for i in range(sample_size):
                X_dists[i] = sess.run(dist_op, feed_dict={v1:self.X_train[i], v2:self.X_train})

            K_ridge_op = self.ops['K_ridge_op']
            X_dists_ph = self.vars['X_dists_h']
            ridge_ph = self.vars['ridge_h']

            # presumably the kernel matrix with the ridge term applied --
            # TODO confirm against the graph definition of K_ridge_op
            self.K = sess.run(K_ridge_op, feed_dict={X_dists_ph:X_dists, ridge_ph:ridge})

            K_ph = self.vars['K_h']

            # presumably the inverse of K, judging by the op name -- TODO confirm
            K_inv_op = self.ops['K_inv_op']
            self.K_inv = sess.run(K_inv_op, feed_dict={K_ph:self.K})

            # xy_ is computed from K_inv and the training targets.
            xy_op = self.ops['xy_op']
            K_inv_ph = self.vars['K_inv_h']
            yt_ph = self.vars['yt_h']
            self.xy_ = sess.run(xy_op, feed_dict={K_inv_ph:self.K_inv,
                                                  yt_ph:self.y_train})

        return self

项目：Optimus 作者：Yatoom | 项目源码 | 文件源码

def fit(self, X, y):
        """
        Fit the wrapped model on (X, y).
        :param X: array-like, shape (n_samples, n_features). Input data.
        :param y: array-like. Class labels for the rows of X.
        :return: Returns self
        """

        # Convert to numpy first, then validate both arrays together.
        X, y = check_X_y(np.array(X), np.array(y))

        # Labels present at fit time.
        self.classes_ = unique_labels(y)

        # Remember the training data.
        self.X_, self.y_ = X, y

        # Input width is the column count; output width is the class count.
        n_inputs = X.shape[1]
        n_outputs = len(self.classes_)

        # Build a model whenever the dimensions do not match self.io.
        dimensions_changed = (n_inputs, n_outputs) != self.io
        if dimensions_changed:
            self.model = self._build(n_inputs, n_outputs)

        self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=self.verbose,
        )

        return self

项目：polylearn 作者：scikit-learn-contrib | 项目源码 | 文件源码

def _check_X_y(self, X, y):
        """Validate a regression problem; return X and a flat double-precision y.

        ``X`` is validated as ``np.double`` with CSC accepted as the sparse
        format; ``y`` must be numeric and single-output.
        """
        checked_X, checked_y = check_X_y(
            X, y,
            dtype=np.double,
            y_numeric=True,
            accept_sparse='csc',
            multi_output=False,
        )
        flat_y = checked_y.astype(np.double).ravel()
        return checked_X, flat_y

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def fit(self, X, y):
        """Validate the training set and delegate fitting to the parent class.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Validated with ``dtype=np.float32``.
        y : array-like, shape = [n_samples]
            The target values. Multi-output targets are rejected
            (``multi_output=False``).

        Returns
        -------
        self : object
            Whatever the parent class's ``fit`` returns.
        """
        features, targets = check_X_y(X, y, dtype=np.float32,
                                      multi_output=False)
        parent = super(MondrianForestRegressor, self)
        return parent.fit(features, targets)

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def fit(self, X, y):
        """Validate the training set and delegate fitting to the parent class.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Validated with ``dtype=np.float32``.
        y : array-like, shape = [n_samples]
            The class labels. Multi-output targets are rejected
            (``multi_output=False``).

        Returns
        -------
        self : object
            Whatever the parent class's ``fit`` returns.
        """
        features, targets = check_X_y(X, y, dtype=np.float32,
                                      multi_output=False)
        parent = super(MondrianForestClassifier, self)
        return parent.fit(features, targets)

项目：PredictiveServer 作者：KeyboardNerd | 项目源码 | 文件源码

def fit(self, X, y):
        """Fit by feeding every sample into the ``DBayesMode`` of its label.

        After validation, the first column of ``X`` (as extracted by
        ``_first_col``) is stored as ``X_``. Each entry then updates the mode
        object registered for its label, creating that object on first sight
        of the label, and the priors are refreshed after every sample.

        Returns
        -------
        self : object
            The fitted classifier.
        """
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        self.X_ = DynamicBayesianClassifier._first_col(X)
        self.y_ = y
        self.size_ = self.X_.size

        for index in range(self.X_.size):
            label = y[index]
            # Lazily create the per-label mode the first time a label appears.
            if label not in self.dbayesmode_major_:
                self.dbayesmode_major_[label] = scalgoutil.DBayesMode(label)
            self.dbayesmode_major_[label].update(self.X_[index])
            # Priors are recomputed after each individual sample.
            self.update_priors()
        return self

项目：sparsereg 作者：Ohjeah | 项目源码 | 文件源码

def fit(self, x, y, **kwargs):
        """Fit the parent estimator on transformed features, then arrange coefficients.

        ``x`` is first passed through ``self._transform(x, y)``; extra keyword
        arguments are forwarded to the parent ``fit``. No ``check_X_y``
        validation is performed here.
        """
        transformed = self._transform(x, y)
        super().fit(transformed, y, **kwargs)
        self._arrange_coef()
        return self

项目：sparsereg 作者：Ohjeah | 项目源码 | 文件源码

def fit(self, x, y=None):
        """Run FFX on a train/test split of the data and build the final model.

        The validated data is split with ``train_test_split`` (seeded by
        ``self.random_state``), the result of ``run_ffx`` is stored in
        ``self.front``, and ``make_model`` is called on the held-out part.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        x, y = check_X_y(x, y)
        split = train_test_split(x, y, random_state=self.random_state)
        x_train, x_test, y_train, y_test = split

        self.front = run_ffx(
            x_train, x_test, y_train, y_test,
            self.exponents, self.operators,
            num_alphas=self.num_alphas,
            l1_ratios=self.l1_ratios,
            target_score=self.target_score,
            n_tail=self.n_tail,
            random_state=self.random_state,
            strategies=self.strategies,
            n_jobs=self.n_jobs,
            max_complexity=self.max_complexity,
            rational=self.rational,
            eps=self.eps,
            **self.kw)

        self.make_model(x_test, y_test)
        return self

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def fit(self, X, y):
        """Validate ``X`` and ``y`` with ``check_X_y``; nothing is learned."""
        check_X_y(X, y)
        return self

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def fit(self, X, y):
        """Validate the data (CSR/CSC allowed) and reject sparse ``X``.

        Raises ``ValueError`` when the validated ``X`` is sparse; otherwise
        returns ``self`` without learning anything.
        """
        checked_X, _ = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        if not sp.issparse(checked_X):
            return self
        raise ValueError("Nonsensical Error")

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def fit(self, X, y):
        """Validate the data and set ``coef_`` to ones, one per column of ``X``."""
        checked_X, _ = check_X_y(X, y)
        n_features = checked_X.shape[1]
        self.coef_ = np.ones(n_features)
        return self

项目：pyAFM 作者：cmaclell | 项目源码 | 文件源码

def fit(self, X, y):
        """
        Train the Logistic model, X and y are numpy arrays.

        The weights are found with ``minimize`` applied to ``_ll`` /
        ``_ll_grad``. When ``fit_intercept`` is set, a column of ones is
        prepended to ``X`` and its weight is stored as ``intercept_``; the
        intercept is never regularized.

        ``bounds`` may be ``None``, a single 2-tuple applied to every
        coefficient, or a per-coefficient sequence (with or without an entry
        for the intercept). ``l2`` may be a number or a per-coefficient
        sequence (with or without an entry for the intercept).

        Raises
        ------
        ValueError
            If the resolved bounds or L2 penalties do not have one entry per
            coefficient.
        """
        import copy

        # Sparse input is not handled here.
        X, y = check_X_y(X, y)
        self.classes_, y = np.unique(y, return_inverse=True)

        if self.fit_intercept:
            X = np.insert(X, 0, 1, axis=1)

        w0 = np.zeros(X.shape[1])
        n_coef = len(w0)

        if self.bounds is None:
            self.bounds_ = [(None, None)] * n_coef
        elif isinstance(self.bounds, tuple) and len(self.bounds) == 2:
            self.bounds_ = [self.bounds] * n_coef
        elif self.fit_intercept and len(self.bounds) == n_coef - 1:
            # Bounds were given without the intercept: leave it unbounded.
            self.bounds_ = np.concatenate(([(None, None)], self.bounds))
        else:
            self.bounds_ = self.bounds
        if len(self.bounds_) != n_coef:
            raise ValueError("Bounds must be the same length as the coef")

        if isinstance(self.l2, Number):
            self.l2_ = [self.l2] * n_coef
        elif self.fit_intercept and len(self.l2) == n_coef - 1:
            # Penalties were given without the intercept: prepend a zero.
            self.l2_ = np.insert(self.l2, 0, 0)
        else:
            # Copy so that zeroing the intercept penalty below does not
            # modify the user-supplied ``l2`` hyperparameter in place.
            self.l2_ = copy.copy(self.l2)
        if len(self.l2_) != n_coef:
            raise ValueError("L2 penalty must be the same length as the coef, be sure the intercept is accounted for.")

        # the intercept should never be regularized.
        if self.fit_intercept:
            self.l2_[0] = 0.0

        result = minimize(_ll, w0, args=(X, y, self.l2_),
                          jac=_ll_grad,
                          method=self.method, bounds=self.bounds_,
                          options={'maxiter': self.max_iter})
        w = result['x']

        if self.fit_intercept:
            self.intercept_ = w[0:1]
            self.coef_ = w[1:]
        else:
            self.intercept_ = np.array([])
            self.coef_ = w
        return self

项目：xam 作者：MaxHalford | 项目源码 | 文件源码

def fit(self, X, y=None, **fit_params):
        """Fit the stacked ensemble: base models, meta-features, meta-model.

        Out-of-fold predictions of every base model are collected into
        ``meta_features_`` (one column per model, or ``n_probas_`` columns per
        model when ``use_proba`` is set). The meta-model is fitted on those
        features, optionally with the original features appended, and each
        base model is finally refitted on all the data.

        ``fit_params`` maps a model name to the keyword arguments passed to
        that model's ``fit`` during cross-validation.

        Returns
        -------
        self : object
            The fitted ensemble.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # One row per sample. One column per model, or (n_classes - 1)
        # columns per model when probabilities are stored.
        n_models = len(self.models)
        if self.use_proba:
            self.n_probas_ = len(np.unique(y)) - 1
            n_columns = n_models * (self.n_probas_)
        else:
            n_columns = n_models
        self.meta_features_ = np.empty((len(X), n_columns))

        for train_index, test_index in self.cv.split(X, y):
            for i, (name, model) in enumerate(self.models.items()):
                # Train on this fold with the model-specific fit params.
                model.fit(X[train_index], y[train_index],
                          **fit_params.get(name, {}))

                if not self.use_proba:
                    self.meta_features_[test_index, i] = model.predict(X[test_index])
                    continue

                # Store the first n_probas_ probability columns in the block
                # of columns reserved for model i.
                probabilities = model.predict_proba(X[test_index])
                first_column = self.n_probas_ * i
                for j in range(self.n_probas_):
                    self.meta_features_[test_index, first_column + j] = probabilities[:, j]

        # Combine the predictions with the original features
        if self.use_base_features:
            self.meta_features_ = np.hstack((self.meta_features_, X))

        self.meta_model.fit(self.meta_features_, y)

        # Refit every base model on the full data for later predictions.
        # NOTE(review): fit_params are not forwarded in this final refit,
        # unlike in the cross-validation loop -- confirm this is intended.
        for model in self.models.values():
            model.fit(X, y)

        return self

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def fit(self, X, y):
        """Fit Gaussian process classification model

        A binary Laplace-approximation estimator is built from this object's
        settings. When more than two classes are present it is wrapped in a
        one-vs-rest or one-vs-one meta-classifier according to
        ``multi_class``.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size

        # A single class cannot be classified.
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])

        if self.n_classes_ > 2:
            # Pick the multi-class wrapper, then wrap the binary estimator.
            if self.multi_class == "one_vs_rest":
                wrapper = OneVsRestClassifier
            elif self.multi_class == "one_vs_one":
                wrapper = OneVsOneClassifier
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)
            self.base_estimator_ = wrapper(self.base_estimator_,
                                           n_jobs=self.n_jobs)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            # Average the log marginal likelihood over the binary estimators.
            per_estimator = [
                estimator.log_marginal_likelihood()
                for estimator in self.base_estimator_.estimators_
            ]
            self.log_marginal_likelihood_value_ = np.mean(per_estimator)
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self