Python sklearn.utils.validation 模块,check_X_y() 实例源码


项目:project-template    作者:scikit-learn-contrib    | 项目源码 | 文件源码
def fit(self, X, y):
        """A reference implementation of a fitting function.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----
        ``check_X_y`` is called with its default arguments, which reject
        sparse ``X`` and 2-D (multi-output) ``y``.
        """
        # Validate the inputs; nothing is learned or stored by this
        # reference implementation.
        X, y = check_X_y(X, y)
        # Return the estimator
        return self
项目:project-template    作者:scikit-learn-contrib    | 项目源码 | 文件源码
def fit(self, X, y):
        """A reference implementation of a fitting function for a classifier.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values. An array of int.

        Returns
        -------
        self : object
            Returns self.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        # Keep the validated training data on the estimator.
        self.X_ = X
        self.y_ = y
        # Return the classifier
        return self
项目:polylearn    作者:scikit-learn-contrib    | 项目源码 | 文件源码
def _check_X_y(self, X, y):
        """Validate binary-classification inputs and encode y as -1/+1.

        Parameters
        ----------
        X : array-like or sparse matrix
            Training samples; validated as ``np.double``, sparse input is
            accepted in CSC format.
        y : array-like
            Binary targets.

        Returns
        -------
        X : validated samples.
        y : 1-D ``np.double`` array with labels mapped to -1 and +1.

        Raises
        ------
        TypeError
            If ``y`` has two or more columns or is not a binary target.

        Side effects
        ------------
        Stores the fitted ``LabelBinarizer`` as ``self.label_binarizer_``.
        """

        # helpful error message for older scikit-learn releases
        # (the original comment said "< 1.17"; presumably 0.17 was meant)
        is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2

        if is_2d or type_of_target(y) != 'binary':
            # NOTE(review): the tail of this message was cut off in this copy;
            # "scikit-learn." is the presumed ending -- confirm upstream.
            raise TypeError("Only binary targets supported. For training "
                            "multiclass or multilabel models, you may use the "
                            "OneVsRest or OneVsAll metaestimators in "
                            "scikit-learn.")

        # NOTE(review): the second line of this call was lost in this copy;
        # multi_output=False matches the sibling regressor check -- confirm.
        X, Y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                         multi_output=False)

        self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
        y = self.label_binarizer_.fit_transform(Y).ravel().astype(np.double)
        return X, y
项目:sparsereg    作者:Ohjeah    | 项目源码 | 文件源码
def fit(self, x_, y, sample_weight=None):
        """Fit the thresholded regression model.

        Parameters
        ----------
        x_ : array-like, shape = [n_samples, n_features]
            Training samples (dense only, ``accept_sparse=[]``).
        y : array-like, shape = [n_samples]
            Numeric targets.
        sample_weight : array-like or None
            When given, the centred data is rescaled with ``_rescale_data``.

        Returns
        -------
        self
        """
        # Only the validated y is reused; the raw x_ is what gets preprocessed.
        _, y = check_X_y(x_, y, accept_sparse=[], y_numeric=True, multi_output=False)

        preprocessed = self._preprocess_data(
            x_, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=None)
        x, y, X_offset, y_offset, X_scale = preprocessed

        if sample_weight is not None:
            x, y = _rescale_data(x, y, sample_weight)

        self.iters = 0
        n_features = x.shape[1]
        self.ind_ = np.ones(n_features, dtype=bool)  # start with every feature active
        # NOTE(review): in this copy the plain regression sits inside the
        # threshold branch; confirm upstream whether an `else:` was lost.
        if self.threshold > 0:
            self._reduce(x, y)
            active = x[:, self.ind_]
            self.coef_ = self._regress(active, y, self.alpha)

        if self.unbias and self.alpha >= 0:
            self._unbias(x, y)

        self._set_intercept(X_offset, y_offset, X_scale)
        return self
项目:sparsereg    作者:Ohjeah    | 项目源码 | 文件源码
def fit(self, x_, y, sample_weight=None):
        """Fit the model with ``fit_with_noise`` on centred/scaled data.

        Parameters
        ----------
        x_ : array-like, shape = [n_samples, n_features]
            Training samples (dense only, ``accept_sparse=[]``).
        y : array-like, shape = [n_samples]
            Numeric targets.
        sample_weight : array-like or None
            When given, the preprocessed data is rescaled with
            ``_rescale_data``.

        Returns
        -------
        self
        """
        # The former `n_samples, n_features = x_.shape` was unused and raised
        # AttributeError on list input before validation could run.
        X, y = check_X_y(x_, y, accept_sparse=[], y_numeric=True, multi_output=False)

        x, y, X_offset, y_offset, X_scale = self._preprocess_data(
            x_, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=None)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            x, y = _rescale_data(x, y, sample_weight)

        coefs, intercept = fit_with_noise(x, y, self.sigma, self.alpha, self.n)
        self.intercept_ = intercept
        self.coef_ = coefs
        # Called after intercept_ is assigned above, so it may replace that value.
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
项目:xam    作者:MaxHalford    | 项目源码 | 文件源码
def fit(self, X, y=None, **fit_params):
        """Record, for every class label, the column indices of its top terms.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Term matrix.
        y : array-like, shape = [n_samples]
            Class labels. Validated by ``check_X_y``, so despite the ``None``
            default a value is required in practice.
        **fit_params
            Accepted but not used by this method.

        Returns
        -------
        self
        """

        # scikit-learn checks
        X, y = check_X_y(X, y)

        # Never ask for more terms than there are columns.
        n_terms = min(self.n_terms, X.shape[1])

        # Get a list of unique labels from y
        labels = np.unique(y)

        # Determine the n top terms per class: sum each column over the rows
        # of that class and keep the indices of the n_terms largest sums.
        self.top_terms_per_class_ = {
            c: set(np.argpartition(np.sum(X[y == c], axis=0), -n_terms)[-n_terms:])
            for c in labels
        }

        # Return the classifier
        return self
项目:ottertune    作者:cmu-db    | 项目源码 | 文件源码
def fit(self, X, y):
        # Walk self.parameter_grid one entry at a time, printing progress and,
        # when self.checkpoint_path is set, pickling self.grid_scores after
        # each entry.
        #
        # NOTE(review): this snippet is Python 2 (print statements) and is
        # incomplete in this copy -- see the notes below before reusing it.
        #import traceback
        from fabric.api import local

        # Validate inputs; n-dimensional X and multi-output numeric y allowed.
        X, y = check_X_y(X, y, allow_nd=True, multi_output=True,
                         y_numeric=True, estimator="GridSearch")
        print "njobs = {}".format(self.njobs)
        if self.njobs > 1:
            # Parallel execution is disabled; the former implementation is
            # kept below as commented-out code.
            assert False
#             iterable = [(i, pg, self.estimator_cls, self.kf, X, y, \
#                          self.score_fns, len(self.parameter_grid)) \
#                          for i,pg in enumerate(self.parameter_grid)]
#             try:
#                 p = multiprocessing.Pool(self.njobs)
#                 res =, iterable)
#                 print res
#             except:
#                 traceback.print_exc()
            # NOTE(review): the serial path below is indented under the
            # `if self.njobs > 1:` branch; presumably an `else:` line was
            # lost here -- confirm against upstream.
            self.grid_scores = []
            estimator = self.estimator_cls()
            num_tasks = len(self.parameter_grid)
            for i,params in enumerate(self.parameter_grid):
                print "Starting task {}/{}...".format(i+1, num_tasks)
                # NOTE(review): the body of this `with` block (the per-task
                # evaluation) is missing in this copy.
                with stopwatch("Done. Elapsed time"):

                if self.checkpoint_path is not None:
                    # Remove earlier checkpoint files via a shell command,
                    # then write the scores collected so far.
                    local("rm -f {}*.p".format(self.checkpoint_path))
                    savepath = self.checkpoint_path + "_{}.p".format(i)
                    with open(savepath, 'w') as f:
                        pickle.dump(self.grid_scores, f)
项目:ottertune    作者:cmu-db    | 项目源码 | 文件源码
def check_X_y(self, X, y):
        """Validate training data and enforce the training-set size limit.

        Parameters
        ----------
        X : array with a ``shape`` attribute
            Training samples; may be n-dimensional (``allow_nd=True``).
        y : array-like
            Numeric targets; multi-output is allowed.

        Returns
        -------
        (X, y) as returned by ``sklearn.utils.validation.check_X_y``.

        Raises
        ------
        Exception
            If ``X`` has more than ``GPR.MAX_TRAIN_SIZE`` rows.
        """
        # Imported locally because this method shares the helper's name.
        from sklearn.utils.validation import check_X_y

        if X.shape[0] > GPR.MAX_TRAIN_SIZE:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(GPR.MAX_TRAIN_SIZE, X.shape[0]))
        # NOTE(review): this call was cut off after `y_numeric=True,` in this
        # copy; any further keyword arguments (e.g. `estimator=`) are unknown.
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True)
项目:ottertune    作者:cmu-db    | 项目源码 | 文件源码
def fit(self, X_train, y_train, ridge=1.0):
        """Fit the model by evaluating the pre-built TensorFlow ops.

        Parameters
        ----------
        X_train, y_train : array-like
            Training data, validated by ``self.check_X_y`` and stored as
            float32 in ``self.X_train`` / ``self.y_train``.
        ridge : scalar or 1-D array
            A scalar is expanded to one value per training sample.

        Returns
        -------
        self

        Side effects
        ------------
        Sets ``self.K``, ``self.K_inv`` and ``self.xy_`` from the results of
        the ``K_ridge_op``, ``K_inv_op`` and ``xy_op`` graph ops.

        NOTE(review): every ``sess.run(<op>, ...)`` call below and the
        ``yt_ph`` entry of the last feed dict were lost in this copy. They
        were rebuilt from the op and placeholder variables that are assigned
        but otherwise unused -- confirm against upstream.
        """
        X_train, y_train = self.check_X_y(X_train, y_train)
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)
        sample_size = self.X_train.shape[0]

        if np.isscalar(ridge):
            ridge = np.ones(sample_size) * ridge
        assert ridge.ndim == 1

        X_dists = np.zeros((sample_size, sample_size), dtype=np.float32)
        with tf.Session(graph=self.graph, config=tf.ConfigProto(
                intra_op_parallelism_threads=self.NUM_THREADS)) as sess:
            dist_op = self.ops['dist_op']
            v1, v2 = self.vars['v1_h'], self.vars['v2_h']
            # One fetch per training row: feed the row and the full matrix.
            for i in range(sample_size):
                X_dists[i] = sess.run(dist_op, feed_dict={v1: self.X_train[i],
                                                          v2: self.X_train})

            K_ridge_op = self.ops['K_ridge_op']
            X_dists_ph = self.vars['X_dists_h']
            ridge_ph = self.vars['ridge_h']

            self.K = sess.run(K_ridge_op, feed_dict={X_dists_ph: X_dists,
                                                     ridge_ph: ridge})

            K_ph = self.vars['K_h']

            K_inv_op = self.ops['K_inv_op']
            self.K_inv = sess.run(K_inv_op, feed_dict={K_ph: self.K})

            xy_op = self.ops['xy_op']
            K_inv_ph = self.vars['K_inv_h']
            yt_ph = self.vars['yt_h']
            self.xy_ = sess.run(xy_op, feed_dict={K_inv_ph: self.K_inv,
                                                  yt_ph: self.y_train})

        return self
项目:Optimus    作者:Yatoom    | 项目源码 | 文件源码
def fit(self, X, y):
        # NOTE(review): the four lines below are the method's docstring text,
        # but its triple-quote delimiters were lost in this copy; they need
        # to be restored before this code can be parsed.
        Fit on X.
        :param X: {array-like, sparse matrix}, shape (n_samples, n_features). Input data, where `n_samples` is the 
        number of samples and `n_features` is the number of features.
        :return: Returns self

        # Numpy
        X = np.array(X)
        y = np.array(y)

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        # Store so that we know what we fitted on
        self.X_ = X
        self.y_ = y

        # Get dimensions
        input_dim = X.shape[1]
        output_dim = len(self.classes_)

        # Create a model if needed
        # NOTE(review): the right-hand side of this comparison is missing,
        # and the next line appears to be two statements fused together
        # (building the model, then a fit call whose receiver and first
        # argument were lost). Recover both from upstream.
        if (input_dim, output_dim) !=
            self.model = self._build(input_dim, output_dim), y, batch_size=self.batch_size, epochs=self.epochs, verbose=self.verbose)

        # Return the classifier
        return self
项目:polylearn    作者:scikit-learn-contrib    | 项目源码 | 文件源码
def _check_X_y(self, X, y):
        """Validate regression inputs and return ``(X, y)``.

        ``X`` is checked as ``np.double`` (sparse accepted in CSC format);
        ``y`` must be numeric and single-output, and is returned as a
        flattened ``np.double`` array.
        """
        validated = check_X_y(X, y, accept_sparse='csc', multi_output=False,
                              dtype=np.double, y_numeric=True)
        X, y = validated
        flat_y = y.astype(np.double).ravel()
        return X, flat_y
项目:scikit-garden    作者:scikit-garden    | 项目源码 | 文件源码
def fit(self, X, y):
        """Builds a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Its dtype is converted to
            ``np.float32`` during validation.
        y : array-like, shape = [n_samples]
            The target values (real numbers in regression).

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----
        Validation uses ``check_X_y`` with ``multi_output=False`` and the
        default ``accept_sparse``, so sparse ``X`` and 2-D ``y`` are
        rejected. This method takes no ``sample_weight`` argument.
        """
        X, y = check_X_y(X, y, dtype=np.float32, multi_output=False)
        # Hand the validated arrays to the parent class's fit.
        return super(MondrianForestRegressor, self).fit(X, y)
项目:scikit-garden    作者:scikit-garden    | 项目源码 | 文件源码
def fit(self, X, y):
        """Builds a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Its dtype is converted to
            ``np.float32`` during validation.
        y : array-like, shape = [n_samples]
            The target values (class labels in classification).

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----
        Validation uses ``check_X_y`` with ``multi_output=False`` and the
        default ``accept_sparse``, so sparse ``X`` and 2-D ``y`` are
        rejected. This method takes no ``sample_weight`` argument.
        """
        X, y = check_X_y(X, y, dtype=np.float32, multi_output=False)
        # Hand the validated arrays to the parent class's fit.
        return super(MondrianForestClassifier, self).fit(X, y)
项目:PredictiveServer    作者:KeyboardNerd    | 项目源码 | 文件源码
def fit(self, X, y):
        """Store the training data and register a mode object per new label.

        Sets ``classes_``, ``X_`` (the result of ``_first_col`` on the
        validated X), ``y_`` and ``size_``. For each of the first
        ``X_.size`` labels not yet present in ``self.dbayesmode_major_``, a
        ``scalgoutil.DBayesMode`` is created and stored under that label.

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        self.X_ = DynamicBayesianClassifier._first_col(X)
        self.y_ = y
        self.size_ = self.X_.size

        n_entries = self.X_.size
        for idx in range(n_entries):
            label = y[idx]
            if label in self.dbayesmode_major_.keys():
                continue
            self.dbayesmode_major_[label] = scalgoutil.DBayesMode(label)
        return self
项目:sparsereg    作者:Ohjeah    | 项目源码 | 文件源码
def fit(self, x, y, **kwargs):
        """Fit the parent estimator on the transformed features.

        ``x`` is passed through ``self._transform(x, y)`` and the result,
        together with ``y`` and any extra keyword arguments, is forwarded to
        the parent class's ``fit``. Returns ``self``.
        """
        # Input validation via check_X_y is disabled here (it was commented
        # out), so x and y reach _transform unvalidated.
        transformed = self._transform(x, y)
        super().fit(transformed, y, **kwargs)
        return self
项目:sparsereg    作者:Ohjeah    | 项目源码 | 文件源码
def fit(self, x, y=None):
        # Validate the data, split it into train/test parts, run `run_ffx` to
        # produce `self.front`, then call `self.make_model` on the test part.
        x, y = check_X_y(x, y)
        # The split is seeded with self.random_state.
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=self.random_state)
        # NOTE(review): the call below is cut off after `**` in this copy;
        # the mapping being unpacked and the closing parenthesis are missing.
        # Recover them from upstream.
        self.front = run_ffx(x_train, x_test, y_train, y_test,
                             self.exponents, self.operators, num_alphas=self.num_alphas, l1_ratios=self.l1_ratios,
                             target_score=self.target_score, n_tail=self.n_tail, random_state=self.random_state,
                             strategies=self.strategies, n_jobs=self.n_jobs, max_complexity=self.max_complexity,
                             rational=self.rational, eps=self.eps, **
        self.make_model(x_test, y_test)
        return self
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def fit(self, X, y):
        """Validate ``X`` and ``y`` and return ``self``; nothing is stored."""
        # Only the validation side effect (raising on bad input) matters.
        check_X_y(X, y)
        return self
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def fit(self, X, y):
        """Validate the inputs, then reject sparse ``X``.

        CSR/CSC input passes validation but then raises ``ValueError`` with
        the message "Nonsensical Error". Returns ``self`` for dense input.
        """
        checked_X, _ = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        if not sp.issparse(checked_X):
            return self
        raise ValueError("Nonsensical Error")
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def fit(self, X, y):
        """Validate the inputs and set ``coef_`` to one 1.0 per feature.

        Returns ``self``.
        """
        checked_X, _ = check_X_y(X, y)
        n_features = checked_X.shape[1]
        self.coef_ = np.ones(n_features)
        return self
项目:pyAFM    作者:cmaclell    | 项目源码 | 文件源码
def fit(self, X, y):
        """Train the Logistic model, X and y are numpy arrays.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training samples (dense only).
        y : array-like, shape = [n_samples]
            Class labels; re-encoded as indices into ``self.classes_``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the resolved bounds or L2 penalties do not have one entry per
            fitted weight (including the intercept when ``fit_intercept``).

        NOTE(review): three ``else:`` lines and the end of the ``minimize``
        call were lost in this copy and are restored here -- confirm against
        upstream.
        """
        X, y = check_X_y(X, y)
        #, accept_sparse=['csr', 'csc']) # not sure how to handle sparse
        self.classes_, y = np.unique(y, return_inverse=True)

        if self.fit_intercept:
            # Prepend a constant column of ones for the intercept weight.
            X = np.insert(X, 0, 1, axis=1)

        w0 = np.zeros(X.shape[1])

        # Resolve per-weight bounds from the shapes self.bounds may take.
        if self.bounds is None:
            self.bounds_ = [(None, None) for v in w0]
        elif isinstance(self.bounds, tuple) and len(self.bounds) == 2:
            self.bounds_ = [self.bounds for v in w0]
        elif self.fit_intercept and len(self.bounds) == len(w0) - 1:
            # One bound per feature given: leave the intercept unbounded.
            self.bounds_ = np.concatenate(([(None, None)], self.bounds))
        else:
            self.bounds_ = self.bounds
        if len(self.bounds_) != len(w0):
            raise ValueError("Bounds must be the same length as the coef")

        # Resolve per-weight L2 penalties the same way.
        if isinstance(self.l2, Number):
            self.l2_ = [self.l2 for v in w0]
        elif self.fit_intercept and len(self.l2) == len(w0) - 1:
            self.l2_ = np.insert(self.l2, 0, 0)
        else:
            # NOTE: this aliases self.l2, so the intercept reset below also
            # writes into the caller-supplied sequence.
            self.l2_ = self.l2
        if len(self.l2_) != len(w0):
            raise ValueError("L2 penalty must be the same length as the coef, be sure the intercept is accounted for.")

        # the intercept should never be regularized.
        if self.fit_intercept:
            self.l2_[0] = 0.0

        # minimize returns an OptimizeResult; `.x` is the solution vector.
        w = minimize(_ll, w0, args=(X, y, self.l2_),
                     method=self.method, bounds=self.bounds_,
                     options={'maxiter': self.max_iter,
                              #'disp': True
                              }).x

        if self.fit_intercept:
            self.intercept_ = w[0:1]
            self.coef_ = w[1:]
        else:
            self.intercept_ = np.array([])
            self.coef_ = w
        return self
项目:xam    作者:MaxHalford    | 项目源码 | 文件源码
def fit(self, X, y=None, **fit_params):
        # Build out-of-fold predictions ("meta features") from each model in
        # self.models, then refit every model on the full data.
        # `fit_params` maps a model name to the keyword arguments for that
        # model's fit call.
        #
        # NOTE(review): several lines are damaged in this copy (missing
        # `else:` lines and call prefixes); each spot is flagged below and
        # must be recovered from upstream before the code can run.

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # meta_features_ have as many rows as there are in X and as many
        # columns as there are models. However, if use_proba is True then
        # ((n_classes - 1) * n_models) columns have to be stored
        # NOTE(review): the second np.empty assignment below presumably
        # belonged to a lost `else:` branch; as written it always overwrites
        # the first.
        if self.use_proba:
            self.n_probas_ = len(np.unique(y)) - 1
            self.meta_features_ = np.empty((len(X), len(self.models) * (self.n_probas_)))
            self.meta_features_ = np.empty((len(X), len(self.models)))

        # Generate CV folds
        # NOTE(review): the call producing the folds lost its prefix (the
        # object, method name and first argument).
        folds =, y)

        for train_index, test_index in folds:
            for i, (name, model) in enumerate(self.models.items()):
                # Extract fit params for the model
                model_fit_params = fit_params.get(name, {})
                # Train the model on the training set
                # NOTE(review): the start of this fit call is missing.
      [train_index], y[train_index], **model_fit_params)
                # If use_proba is True then the probabilities of each class for
                # each model have to be predicted and then stored into
                # meta_features
                # NOTE(review): the `predict` assignment below presumably sat
                # in a lost `else:` branch.
                if self.use_proba:
                    probabilities = model.predict_proba(X[test_index])
                    for j, k in enumerate(range(self.n_probas_ * i, self.n_probas_ * (i + 1))):
                        self.meta_features_[test_index, k] = probabilities[:, j]
                    self.meta_features_[test_index, i] = model.predict(X[test_index])

        # Combine the predictions with the original features
        # NOTE(review): the trailing `, y)` is the remnant of a separate
        # call (apparently fitting something on meta_features_ and y) whose
        # beginning was lost.
        if self.use_base_features:
            self.meta_features_ = np.hstack((self.meta_features_, X)), y)

        # Each model has to be fit on all the data for further predictions
        # NOTE(review): the loop body lost the start of its fit call.
        for model in self.models.values():
  , y)

        return self
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values. More than two classes are handled by wrapping the
            binary estimator according to ``self.multi_class``.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If only one class is present in ``y``, or if ``self.multi_class``
            is neither "one_vs_rest" nor "one_vs_one" when more than two
            classes are present.

        NOTE(review): several lines were lost in this copy (closing
        docstring quotes, the ``random_state`` argument, the multi-class
        wrappers, ``else:`` lines and the base estimator fit call). They are
        restored to match the upstream scikit-learn implementation --
        confirm against the exact version in use.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            # Local import keeps this block self-contained.
            from sklearn.multiclass import (OneVsOneClassifier,
                                            OneVsRestClassifier)
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            # Average the log marginal likelihood over the binary estimators.
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self