The following 42 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.LabelBinarizer().
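Before the project examples, here is a minimal sketch of the API they all build on (the labels are invented for illustration): `fit_transform` learns the class set and one-hot encodes in a single step, `classes_` records the learned column order, and `inverse_transform` maps indicator rows back to the original labels.

```python
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
y = ['cat', 'dog', 'bird', 'dog']
Y = lb.fit_transform(y)         # one column per class when there are >2 classes
print(lb.classes_)              # ['bird' 'cat' 'dog']
print(Y)                        # each row is a one-hot indicator vector
print(lb.inverse_transform(Y))  # recovers the original string labels
```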
```python
def _get_child_predict(self, clf, X, index=None):
    if self.stack_by_proba and hasattr(clf, 'predict_proba'):
        if self.save_stage0 and index is not None:
            proba = util.saving_predict_proba(clf, X, index)
        else:
            proba = clf.predict_proba(X)
        return proba[:, 1:]
    elif hasattr(clf, 'predict'):
        predict_result = clf.predict(X)
        if isinstance(clf, ClassifierMixin):
            # fit_transform both fits the binarizer and encodes the labels
            lb = LabelBinarizer()
            return lb.fit_transform(predict_result)
        else:
            return predict_result.reshape((predict_result.size, 1))
    else:
        return clf.fit_transform(X)
```
```python
def fit(self, X, y, check_input=True):
    self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    Y = self._label_binarizer.fit_transform(y)
    if self._label_binarizer.y_type_.startswith('multilabel'):
        # we don't (yet) support multi-label classification in ENet
        raise ValueError(
            "%s doesn't support multi-label classification" % (
                self.__class__.__name__))

    # Y = column_or_1d(Y, warn=True)
    super(ElasticNetClassifier, self).fit(X, Y)
    if self.classes_.shape[0] > 2:
        ndim = self.classes_.shape[0]
    else:
        ndim = 1
    self.coef_ = self.coef_.reshape(ndim, -1)
    return self
```
```python
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulates all pieces of data to run an experiment. This is basically
    a bag of items that makes it easy to serialize and deserialize everything
    as a unit.

    Args:
        inputs: The raw model inputs. This can be set to None if you don't
            want to serialize this value when you save the dataset.
        labels: The raw output labels.
        test_indices: The optional test indices to use. Ideally, this should
            be generated one time and reused across experiments to make
            results comparable. `generate_test_indices` can be used to
            generate the indices the first time.
        **kwargs: Additional key-value items to store.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)
    for key, value in kwargs.items():
        setattr(self, key, value)

    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    self.label_encoder = MultiLabelBinarizer() if self.is_multi_label else LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
```
```python
def preprocess(image_shape, image_paths, labels=[]):
    features = []
    for image_path in tqdm(image_paths):
        image_data = list(Image.open(image_path).resize(image_shape[:2]).getdata())
        image_data = np.asarray(image_data).reshape(image_shape)
        features.append(image_data)

    # Normalize pixel values to [0, 1]
    features = np.asarray(features)
    features = features / 255.0

    if labels:
        # One-hot encode the labels
        label_binarizer = LabelBinarizer()
        labels = label_binarizer.fit_transform(labels)
        # Shuffle features and labels together
        features, labels = shuffle(features, labels)

    return features, labels
```
```python
def get_one_hot(in_matrix):
    """
    Reformat truth matrix to same size as the output of the dense network.

    Args:
        in_matrix: the categorized 1D matrix

    Returns:
        a one-hot matrix representing the categorized matrix
    """
    if in_matrix.dtype.name == 'category':
        custom_array = in_matrix.cat.codes
    elif isinstance(in_matrix, np.ndarray):
        custom_array = in_matrix
    else:
        raise ValueError("Input matrix cannot be converted.")

    lb = LabelBinarizer()
    return np.array(lb.fit_transform(custom_array), dtype='float32')
```
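One caveat that can bite functions like `get_one_hot` above: with exactly two classes, LabelBinarizer returns a single 0/1 column rather than a two-column one-hot matrix, so a dense output layer with two units will not match it. A minimal sketch of the issue and a common workaround (the data is made up):

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer

Y = LabelBinarizer().fit_transform([0, 1, 1, 0])
print(Y.shape)                  # (4, 1), not (4, 2)
Y_full = np.hstack([1 - Y, Y])  # expand to an explicit two-column one-hot
print(Y_full.shape)             # (4, 2)
```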
```python
def extract_data(path):
    global CLASSES
    images, labels = traverse_dir(path)
    images = np.array(images)

    # Convert labels to one-hot vectors
    one_hot = preprocessing.LabelBinarizer()
    one_hot.fit(labels)
    nb_classes = len(one_hot.classes_)

    # os.path.join avoids the fragile '\labels.txt' backslash literal
    with open(os.path.join(path, 'labels.txt'), 'w') as f:
        for label in one_hot.classes_:
            f.write(label + '\n')

    # transform() encodes the whole label list in one call
    one_hots = one_hot.transform(labels)
    one_hots = np.reshape(one_hots, (images.shape[0], nb_classes))
    return images, one_hots, nb_classes
```
```python
def encode_bond_features(self, bond_set):
    """
    We break out this function for encoding bond types because it is reused
    and occupies several lines.

    Parameters:
    ===========
    - bond_set: (set or list) of bonds.
    """
    bond_lb = LabelBinarizer()
    bond_lb.fit(BOND_TYPES)

    bonds = np.zeros(len(BOND_TYPES))
    if len(bond_set) > 0:
        bond_array = bond_lb.transform([i for i in bond_set])
        for b in bond_array:
            bonds = bonds + b
    return bonds
```
```python
def load_data_labels(data_file, labels_file):
    """
    Loads MR polarity data from files, splits the data into words and
    generates labels. Returns split sentences and labels.
    """
    data = []
    labels = []
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
    data = [clean_str(s) for s in data]
    with open(labels_file, 'r') as f:
        labels.extend([s.strip() for s in f.readlines()])
    labels = [label.split(',')[1].strip() for label in labels]

    lb = LabelBinarizer()
    y = lb.fit_transform(labels)

    # max_document_length = max([len(x.split(" ")) for x in data])
    # print(max_document_length)
    vocab_processor = learn.preprocessing.VocabularyProcessor(1000)
    x = np.array(list(vocab_processor.fit_transform(data)))
    return x, y, vocab_processor
```
```python
def _check_X_y(self, X, y):
    # helpful error message for sklearn < 0.17
    is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2
    if is_2d or type_of_target(y) != 'binary':
        raise TypeError("Only binary targets supported. For training "
                        "multiclass or multilabel models, you may use the "
                        "OneVsRest or OneVsAll metaestimators in "
                        "scikit-learn.")
    X, Y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                     multi_output=False)

    self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
    y = self.label_binarizer_.fit_transform(Y).ravel().astype(np.double)
    return X, y
```
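The `pos_label=1, neg_label=-1` pair seen here (and in the ElasticNetClassifier example above) maps a binary target onto {-1, +1}, the sign convention margin-based solvers expect. A quick illustration with made-up labels:

```python
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(pos_label=1, neg_label=-1)
print(lb.fit_transform(['spam', 'ham', 'spam']).ravel())  # [ 1 -1  1]
```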
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._class_log_prob()
    self._update_complement_features(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._features_in_class(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._update_complement_features(X, y_one_hot)
    self._update_features(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    # self._class_log_prob()
    self._update_complement_features(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_counts_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._class_log_prob()
    self._features_in_class(X, y_one_hot)
    self.is_fitted = True
```
```python
def data2Vector(self):
    vec = DictVectorizer()
    dummy_x = vec.fit_transform(self.feature_list).toarray()
    lb = LabelBinarizer()
    dummy_y = lb.fit_transform(self.label_list)
    return dummy_x, dummy_y

# The decision tree here uses the ID3 algorithm, which selects
# features by information gain.
```
```python
def to_one_hot(y):
    """Transform multi-class labels to binary labels.

    The output of to_one_hot is sometimes referred to by some authors as the
    1-of-K coding scheme.

    Parameters
    ----------
    y : numpy array or sparse matrix of shape (n_samples,) or
        (n_samples, n_classes)
        Target values. The 2-d matrix should only contain 0 and 1,
        representing multilabel classification. Sparse matrix can be
        CSR, CSC, COO, DOK, or LIL.

    Returns
    -------
    Y : numpy array or CSR matrix of shape [n_samples, n_classes]
        Shape will be [n_samples, 1] for binary problems.

    classes_ : class vector extracted from y.
    """
    lb = LabelBinarizer()
    lb.fit(y)
    Y = lb.transform(y)
    return (Y.base, lb.classes_)
```
```python
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)

    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled to between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
```
```python
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)

    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
```
```python
def load_csv(training_fn, features_fn):
    '''Transform a tabular data set into NumPy arrays.'''
    df = pd.read_csv(training_fn, sep='\t')
    features = json.load(open(features_fn))['features']
    data = df[features].as_matrix()
    print('Data:', data.shape)

    labels = df[['label']].as_matrix().reshape(-1)
    lb = preprocessing.LabelBinarizer()
    lb.fit(labels)
    print('Labels:', labels.shape)

    return features, data, labels
```
```python
def classify(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
```
```python
def bio_classification_report(y_gold, y_pred):
    # y_gold and y_pred are lists of tag sequences, e.g. [[...], [...], [...]]
    lb = LabelBinarizer()
    y_gold_combined = lb.fit_transform(list(chain.from_iterable(y_gold)))
    # Use transform (not fit_transform) so the predictions are encoded
    # against the class set learned from the gold labels.
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_gold_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    )
```
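Why the fix above matters: refitting on the predictions re-learns the class set from y_pred, so any tag missing from the predictions silently changes the number and order of columns, misaligning them with the gold encoding. A toy illustration with invented BIO tags:

```python
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
gold = ['B-PER', 'I-PER', 'O']
pred = ['O', 'B-PER', 'O']

lb.fit(gold)
print(lb.transform(pred).shape)                    # (3, 3): columns follow gold classes
print(LabelBinarizer().fit_transform(pred).shape)  # (3, 1): refit sees only two classes
```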
```python
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
             algorithm='l-bfgs', alpha=0.00001, batch_size=200,
             learning_rate="constant", learning_rate_init=0.5, power_t=0.5,
             max_iter=200, shuffle=False, random_state=None, tol=1e-5,
             verbose=False, warm_start=False):
    sup = super(MultilayerPerceptronClassifier, self)
    sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                 activation=activation, algorithm=algorithm, alpha=alpha,
                 batch_size=batch_size, learning_rate=learning_rate,
                 learning_rate_init=learning_rate_init, power_t=power_t,
                 max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                 random_state=random_state, tol=tol, beta=0,
                 sparsity_param=0, verbose=verbose, warm_start=warm_start)

    self.label_binarizer_ = LabelBinarizer()
```
```python
def fit(self, y):
    """Fit the simplex coding.

    Parameters
    ----------
    y : array, shape = [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d array represents the simplex coding for
        multilabel classification.

    Returns
    -------
    self : returns an instance of self.
    """
    if self.binarizer is None:
        self.binarizer_ = LabelBinarizer(neg_label=0, pos_label=1,
                                         sparse_output=True)
    self.binarizer_.fit(y)
    dimension = self.binarizer_.classes_.size
    if dimension > 2:
        self.simplex_operator_ = SimplexCoding.code(dimension)
    else:
        self.simplex_operator_ = ones((1, 1))
    return self
```
```python
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
```
```python
def __init__(self, n_hidden=20, alpha=0.5, rbf_width=1.0,
             activation_func='tanh', activation_args=None,
             user_components=None, regressor=None,
             binarizer=LabelBinarizer(neg_label=-1, pos_label=1),
             random_state=None):
    super(ELMClassifier, self).__init__(n_hidden=n_hidden,
                                        alpha=alpha,
                                        random_state=random_state,
                                        activation_func=activation_func,
                                        activation_args=activation_args,
                                        user_components=user_components,
                                        rbf_width=rbf_width,
                                        regressor=regressor)

    self.classes_ = None
    self.binarizer = binarizer
```
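A side note on this signature, not specific to ELMClassifier: a default argument like `binarizer=LabelBinarizer(neg_label=-1, pos_label=1)` is evaluated once at definition time, so every instance constructed without an explicit binarizer shares the same LabelBinarizer object. If that sharing is unwanted, the usual pattern is a None default; the class below is hypothetical, just to show the shape of the fix:

```python
from sklearn.preprocessing import LabelBinarizer

class Example:
    # hypothetical sketch of the None-default pattern, not the real ELMClassifier
    def __init__(self, binarizer=None):
        # a fresh LabelBinarizer per instance instead of one shared default
        self.binarizer = binarizer if binarizer is not None \
            else LabelBinarizer(neg_label=-1, pos_label=1)
```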
```python
def fit(self, X, y):
    """
    :param X: shape = [n_samples, n_features]
    :param y: shape = [n_samples]
    :return: self
    """
    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes = labelbin.classes_

    self.class_count = np.zeros(Y.shape[1], dtype=np.float64)
    self.feature_count = np.zeros((Y.shape[1], X.shape[1]), dtype=np.float64)
    self.feature_count += Y.T @ X
    self.class_count += Y.sum(axis=0)

    smoothed_fc = self.feature_count + self.alpha
    smoothed_cc = smoothed_fc.sum(axis=1)
    self.feature_log_prob = (np.log(smoothed_fc) -
                             np.log(smoothed_cc.reshape(-1, 1)))
    return self
```
```python
def feature_mapping_to_numerical_values(self, df):
    TwoSigmaFinModTools._is_one_hot_encoder = 0
    mask = ~df.isnull()
    # Assume the training set has all possible feature_var_names. In practice
    # a training set could hold a feature_var_name the test set lacks, but
    # such features cannot be part of the trained learning algo, so missing
    # feature_var_names are added to the test set as all-zero columns.
    if not any(tuple(df.columns == 'y')):
        # All one-hot encoded feature var names occurring in the test data
        # are assigned to the public variable df_test_all_feature_var_names.
        self.df_test_all_feature_var_names = df.columns

    _feature_names_num = np.zeros(
        (TwoSigmaFinModTools._non_numerical_feature_names.shape[0],),
        dtype=object)
    ith = 0
    for feature_name in TwoSigmaFinModTools._non_numerical_feature_names:
        # Create a feature_nameNum list
        feature_name_num = ''.join([feature_name, 'Num'])
        _feature_names_num[ith] = feature_name_num
        ith += 1
        TwoSigmaFinModTools.encode_labels_in_numeric_format(df, feature_name)

        if TwoSigmaFinModTools._is_one_hot_encoder:
            is_with_label_binarizer = 0
            if is_with_label_binarizer:
                mapper_df = DataFrameMapper([(feature_name, LabelBinarizer())],
                                            df_out=True)
                feature_var_values = mapper_df.fit_transform(df.copy())
                print(df[feature_name].isnull().sum().sum())
                print(df[feature_name][mask[feature_name]].isnull().sum().sum())
                for ite in feature_var_values.columns:
                    df[ite] = feature_var_values[ite]
            else:
                TwoSigmaFinModTools.one_hot_encoder(df, feature_name)

    TwoSigmaFinModTools._feature_names_num = pd.Series(data=_feature_names_num,
                                                       dtype=object)
```
```python
def display_image_predictions(features, labels, predictions):
    n_classes = 10
    label_names = _load_label_names()
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(range(n_classes))
    label_ids = label_binarizer.inverse_transform(np.array(labels))

    fig, axes = plt.subplots(nrows=4, ncols=2)
    fig.tight_layout()
    fig.suptitle('Softmax Predictions', fontsize=20, y=1.1)

    n_predictions = 3
    margin = 0.05
    ind = np.arange(n_predictions)
    width = (1. - 2. * margin) / n_predictions

    for image_i, (feature, label_id, pred_indices, pred_values) in \
            enumerate(zip(features, label_ids, predictions.indices, predictions.values)):
        pred_names = [label_names[pred_i] for pred_i in pred_indices]
        correct_name = label_names[label_id]

        axes[image_i][0].imshow(feature * 255)
        axes[image_i][0].set_title(correct_name)
        axes[image_i][0].set_axis_off()

        axes[image_i][1].barh(ind + margin, pred_values[::-1], width)
        axes[image_i][1].set_yticks(ind + margin)
        axes[image_i][1].set_yticklabels(pred_names[::-1])
        axes[image_i][1].set_xticks([0, 0.5, 1.0])
```
```python
def main():
    digits = load_digits()
    x_train, x_test, y_train_, y_test_ = cross_validation.train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=0)

    lb = preprocessing.LabelBinarizer()
    lb.fit(digits.target)
    y_train = lb.transform(y_train_)
    y_test = lb.transform(y_test_)

    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32, shape=[None, 64])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])

    w_1 = weight_variable([64, 32])
    b_1 = bias_variable([32])
    h_1 = tf.nn.relu(tf.matmul(x, w_1) + b_1)

    w_2 = weight_variable([32, 10])
    b_2 = bias_variable([10])
    y = tf.nn.softmax(tf.matmul(h_1, w_2) + b_2)

    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())
    for i in range(1000):
        train_step.run(feed_dict={x: x_train, y_: y_train})

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(accuracy.eval(feed_dict={x: x_test, y_: y_test}))
```
```python
def main():
    digits = load_digits()
    x_train, x_test, y_train_, y_test_ = cross_validation.train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=0)

    lb = preprocessing.LabelBinarizer()
    lb.fit(digits.target)
    y_train = lb.transform(y_train_)
    y_test = lb.transform(y_test_)

    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32, shape=[None, 64])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    phase_train = tf.placeholder(tf.bool, name='phase_train')

    w_1 = weight_variable([64, 32])
    b_1 = bias_variable([32])
    t_1 = tf.matmul(x, w_1) + b_1
    bn = batch_norm(t_1, 1, phase_train)
    h_1 = binarized_ops.binarized(bn)

    w_2 = weight_variable([32, 10])
    b_2 = bias_variable([10])
    y = tf.nn.softmax(tf.matmul(h_1, w_2) + b_2)

    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())
    for i in range(1000):
        train_step.run(feed_dict={x: x_train, y_: y_train, phase_train: True})

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(accuracy.eval(feed_dict={x: x_test, y_: y_test, phase_train: False}))
```