Python sklearn.preprocessing 模块,MultiLabelBinarizer() 实例源码

我们从 Python 开源项目中提取了以下 16 个代码示例,用于说明如何使用 sklearn.preprocessing.MultiLabelBinarizer()。

项目:ml-projects    作者:saopayne    | 项目源码 | 文件源码
def represent(documents):
    """Build TF-IDF features and binarized label matrices for Reuters documents.

    Document ids starting with "train" form the training split and ids
    starting with "test" form the test split. Both the TF-IDF vectorizer and
    the label binarizer are fitted on the training split only and then applied
    to the test split.

    Args:
        documents: Iterable of Reuters document id strings.

    Returns:
        A 4-tuple ``(train_features, train_labels, test_features, test_labels)``.
    """
    train_ids = [doc for doc in documents if doc.startswith("train")]
    test_ids = [doc for doc in documents if doc.startswith("test")]

    # Raw text of every document, in split order.
    train_texts = list(map(reuters.raw, train_ids))
    test_texts = list(map(reuters.raw, test_ids))

    # The vocabulary and IDF weights come from the training texts only.
    tfidf = TfidfVectorizer(tokenizer=tokenize)
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    # The label set is learned from the training categories only.
    binarizer = MultiLabelBinarizer()
    train_labels = binarizer.fit_transform(list(map(reuters.categories, train_ids)))
    test_labels = binarizer.transform(list(map(reuters.categories, test_ids)))

    return train_features, train_labels, test_features, test_labels
项目:keras-text    作者:raghakot    | 项目源码 | 文件源码
def __init__(self, inputs, labels, test_indices=None, **kwargs):
        """Bundle everything needed to run an experiment into one object.

        The instance is a simple bag of items, which makes it easy to
        serialize and deserialize all of them as a unit.

        Args:
            inputs: The raw model inputs. May be None if this value should not
                be serialized when the dataset is saved.
            labels: The raw output labels. The first element decides whether
                the data is treated as multi-label (a set, list or tuple) or
                single-label (anything else).
            test_indices: Optional test indices to use. Ideally these are
                generated once and reused across experiments so results stay
                comparable. `generate_test_indices` can be used to generate
                them the first time.
            **kwargs: Additional key/value items, each stored as an attribute.
        """
        self.X = np.array(inputs)
        self.y = np.array(labels)
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

        # The private slots are initialised before the public attribute is assigned.
        self._test_indices = None
        self._train_indices = None
        self.test_indices = test_indices

        self.is_multi_label = isinstance(labels[0], (set, list, tuple))
        if self.is_multi_label:
            self.label_encoder = MultiLabelBinarizer()
        else:
            self.label_encoder = LabelBinarizer()

        # The encoded label matrix is stored flattened to one dimension.
        encoded = self.label_encoder.fit_transform(self.y)
        self.y = encoded.flatten()
项目:EUSIPCO2017    作者:Veleslavia    | 项目源码 | 文件源码
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
        """Load the IRMAS test metadata and prepare the evaluation buffers.

        Test metadata format
        ---------------------
        filename : string
        class_ids: string of ints with space as a delimiter

        Args:
            model_module: Module describing the model; its ``BASE_NAME`` is
                used to locate the feature directory and the mean file.
            weights_path: Stored as-is on the instance.
            evaluation_strategy: Stored as-is on the instance (default "s2").
        """
        metadata = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
        self.X = list(metadata.filename)

        # Each row's class ids are a whitespace-separated string of ints.
        targets = [list(map(int, row.split())) for row in metadata.class_ids]
        binarizer = MultiLabelBinarizer()
        self.ml_binarizer = binarizer.fit(targets)
        self.y_true = self.ml_binarizer.transform(targets)

        # Three independent zero buffers shaped like the ground truth.
        truth_shape = self.y_true.shape
        self.y_pred = np.zeros(shape=truth_shape)
        self.y_pred_raw = np.zeros(shape=truth_shape)
        self.y_pred_raw_average = np.zeros(shape=truth_shape)

        self.model_module = model_module
        self.weights_path = weights_path

        feature_dir = os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME)
        self.feature_filenames = os.listdir(feature_dir)
        mean_path = os.path.join(MODEL_MEANS_BASEPATH, "{}_mean.npy".format(model_module.BASE_NAME))
        self.dataset_mean = np.load(mean_path)

        self.evaluation_strategy = evaluation_strategy
        self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
        self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
项目:pumpp    作者:bmcfee    | 项目源码 | 文件源码
def __init__(self, name='chord', sr=22050, hop_length=512, sparse=False):
        '''Initialize a chord task transformer.

        Parameters
        ----------
        name : str
            Name passed to the parent transformer.
        sr : number
            Passed to the parent transformer.
        hop_length : int
            Passed to the parent transformer.
        sparse : bool
            If True, the 'root' and 'bass' fields are registered as
            integer fields of shape [None, 1]; otherwise they are
            registered as boolean fields of shape [None, 13].
        '''

        super(ChordTransformer, self).__init__(name=name,
                                               namespace='chord',
                                               sr=sr, hop_length=hop_length)

        # The encoder is fitted on the twelve integers 0..11.
        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([list(range(12))])
        self._classes = set(self.encoder.classes_)
        self.sparse = sparse

        # The builtin `bool`/`int` replace the `np.bool`/`np.int` aliases,
        # which were deprecated in NumPy 1.20 and removed in 1.24; the
        # aliases referred to exactly these builtins.
        self.register('pitch', [None, 12], bool)
        if self.sparse:
            self.register('root', [None, 1], int)
            self.register('bass', [None, 1], int)
        else:
            self.register('root', [None, 13], bool)
            self.register('bass', [None, 13], bool)
项目:pumpp    作者:bmcfee    | 项目源码 | 文件源码
def __init__(self, name, namespace, labels=None):
        '''Initialize a static label transformer.

        Parameters
        ----------
        name : str
            Name passed to the parent transformer.
        namespace : str
            Namespace passed to the parent transformer; also used to look
            up the label vocabulary when `labels` is None.
        labels : list or None
            The label vocabulary. If None, it is taken from
            `jams.schema.values(namespace)`.
        '''
        super(StaticLabelTransformer, self).__init__(name=name,
                                                     namespace=namespace,
                                                     sr=1, hop_length=1)

        if labels is None:
            labels = jams.schema.values(namespace)

        # Fitting on a single "sample" holding every label fixes the
        # encoder's class set to the full vocabulary.
        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)

        # The builtin `bool` replaces the `np.bool` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        self.register('tags', [len(self._classes)], bool)
项目:ltls    作者:kjasinska    | 项目源码 | 文件源码
def __init__(self, multilabel=False):
        """Pick the label encoder matching the task type.

        Args:
            multilabel: When truthy, a sparse-output MultiLabelBinarizer is
                used; otherwise a LabelEncoder.
        """
        self.multilabel = multilabel
        self.le = MultiLabelBinarizer(sparse_output=True) if self.multilabel else LabelEncoder()
        self.from_classes = False
项目:pumpp    作者:bmcfee    | 项目源码 | 文件源码
def __init__(self, name, namespace, labels=None, sr=22050, hop_length=512):
        '''Initialize a dynamic (time-varying) label transformer.

        Parameters
        ----------
        name : str
            Name passed to the parent transformer.
        namespace : str
            Namespace passed to the parent transformer; also used to look
            up the label vocabulary when `labels` is None.
        labels : list or None
            The label vocabulary. If None, it is taken from
            `jams.schema.values(namespace)`.
        sr : number
            Passed to the parent transformer.
        hop_length : int
            Passed to the parent transformer.
        '''
        super(DynamicLabelTransformer, self).__init__(name=name,
                                                      namespace=namespace,
                                                      sr=sr,
                                                      hop_length=hop_length)

        if labels is None:
            labels = jams.schema.values(namespace)

        # Fitting on a single "sample" holding every label fixes the
        # encoder's class set to the full vocabulary.
        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)

        # The builtin `bool` replaces the `np.bool` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        self.register('tags', [None, len(self._classes)], bool)
项目:pandas-pipelines-custom-transformers    作者:jem1031    | 项目源码 | 文件源码
def fit(self, X, y=None):
        """Fit one MultiLabelBinarizer per column of ``X``.

        Every cell is split on ``self.sep`` first, so each cell contributes a
        list of tokens to its column's binarizer.

        Args:
            X: DataFrame whose cells are strings.
            y: Unused.

        Returns:
            self, with the fitted binarizers stored in ``self.mlbs`` in
            column order.
        """
        tokenized = X.applymap(lambda cell: cell.split(self.sep))
        fitted = []
        for column in X.columns:
            fitted.append(MultiLabelBinarizer().fit(tokenized[column]))
        self.mlbs = fitted
        return self
项目:ai-chatbot-framework    作者:alfredfrancis    | 项目源码 | 文件源码
def train(X, y, outpath=None, verbose=True):
    """Fit a multi-label text classification pipeline and optionally pickle it.

    The targets are binarized with a MultiLabelBinarizer, which is attached
    to the fitted pipeline as ``labels_``.

    Args:
        X: Training inputs handed to the pipeline.
        y: Iterable of label collections, one per training input.
        outpath: If truthy, path the fitted pipeline is pickled to.
        verbose: If truthy, print a confirmation after writing the pickle.

    Returns:
        The fitted pipeline.
    """
    # Label encode the targets
    label_binarizer = preprocessing.MultiLabelBinarizer()
    encoded_targets = label_binarizer.fit_transform(y)

    # Preprocess -> TF-IDF over pre-tokenized input -> one-vs-rest linear SVM.
    steps = [
        ('preprocessor', NLTKPreprocessor()),
        ('vectorizer', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False)),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X, encoded_targets)
    pipeline.labels_ = label_binarizer

    if not outpath:
        return pipeline

    with open(outpath, 'wb') as sink:
        pickle.dump(pipeline, sink)

        if verbose:
            print("Model written out to {}".format(outpath))

    return pipeline
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def test_BRKnna_no_labels_take_closest(self):
        """Mode 'a' with n_neighbors=2 and threshold=0.6 on sparse labels.

        Predicts for the point [0, 1] and expects only the first label column
        to be set. The test name suggests this covers the fallback used when
        no label passes the threshold -- confirm against the classifier.
        """
        features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
        label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
        targets = MultiLabelBinarizer(sparse_output=True).fit_transform(label_sets)

        classifier = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
        classifier.fit(features, targets)

        query = csr.csr_matrix([[0, 1]])
        predicted = classifier.predict(query).todense()
        print(predicted)
        np.testing.assert_array_equal([[1, 0, 0, 0, 0]], predicted)
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def test_BRKnna_predict(self):
        """Mode 'a' with n_neighbors=3 and threshold=0.5 on sparse labels.

        Predicts for the point [1.1, 1.1] and expects exactly the fourth and
        fifth label columns to be set.
        """
        features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        targets = MultiLabelBinarizer(sparse_output=True).fit_transform(label_sets)

        classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        classifier.fit(features, targets)

        query = csr.csr_matrix([[1.1, 1.1]])
        predicted = classifier.predict(query).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], predicted)
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def test_BRKnna_predict_dense(self):
        """Mode 'a' with n_neighbors=3 and threshold=0.5 on dense labels.

        Same data and expectation as the sparse-label variant, but the label
        matrix comes from a MultiLabelBinarizer with default (dense) output.
        """
        features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        targets = MultiLabelBinarizer().fit_transform(label_sets)

        classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        classifier.fit(features, targets)

        query = csr.csr_matrix([[1.1, 1.1]])
        predicted = classifier.predict(query).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], predicted)
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def test_BRKnnb_predict(self):
        """Mode 'b' with n_neighbors=3 on sparse labels.

        Predicts for the point [0, 1] and expects exactly the first two label
        columns to be set.
        """
        features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
        label_sets = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        targets = MultiLabelBinarizer(sparse_output=True).fit_transform(label_sets)

        classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        classifier.fit(features, targets)

        query = csr.csr_matrix([[0, 1]])
        predicted = classifier.predict(query).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0, 0]], predicted)
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def test_BRKnnb_predict_dense(self):
        """Mode 'b' with n_neighbors=3 on dense labels.

        Same data and expectation as the sparse-label variant, but the label
        matrix is produced with ``sparse_output=False``.
        """
        features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
        label_sets = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        targets = MultiLabelBinarizer(sparse_output=False).fit_transform(label_sets)

        classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        classifier.fit(features, targets)

        query = csr.csr_matrix([[0, 1]])
        predicted = classifier.predict(query).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0, 0]], predicted)
项目:Quadflor    作者:quadflor    | 项目源码 | 文件源码
def test_BRKnnb_auto_optimize_k(self):
        """Mode 'b' with automatic selection of k from the candidates [1, 3].

        The classifier's ``_get_split`` is replaced by a fixed split (rows
        1-3 first, row 0 second) so the outcome is deterministic. The test
        expects ``n_neighbors`` to end up as 3 and both query points to be
        assigned the first two label columns.
        """
        from unittest import mock

        data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid2', 'lid3'], ['lid0', 'lid1']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3], auto_optimize_k=True)

        # noinspection PyUnusedLocal
        def fun(s, X, y_):
            return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

        # Patch the class only for the duration of fit() so the fixed split
        # does not leak into other tests; the original attribute is restored
        # on exit even if fit() raises.
        with mock.patch.object(BRKNeighborsClassifier, '_get_split', fun, create=True):
            knn.fit(data, y)

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(3, knn.n_neighbors)
        pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)

        # Disabled timing benchmark, kept for reference:
        # def test_time_brknnb(self):
        #     times = []
        #     X = sp.rand(10000, 5000, density=0.005, format='csr')
        #     y = sp.rand(10000, 3000, density=0.005, format='csr')
        #     knn = BRKNeighborsClassifier(n_neighbors=100)
        #     knn.fit(X,y)
        #     X_test = sp.rand(1000, 5000, density=0.005, format ='csr')
        #     for _ in range(5):
        #         start = default_timer()
        #         knn.predict(X_test)
        #         times.append(default_timer() - start)
        #     print(np.mean(times))
项目:ycml    作者:skylander86    | 项目源码 | 文件源码
def _fit(self, X, Y_labels, **kwargs):
        """Filter the labels, fit the label binarizer, then delegate to the parent.

        Args:
            X: Training inputs, forwarded unchanged to the parent ``_fit``.
            Y_labels: Per-instance labels; filtered through ``filter_labels``
                with ``self.include`` / ``self.exclude`` before use.
            **kwargs: Forwarded unchanged to the parent ``_fit``.

        Returns:
            Whatever the parent class's ``_fit`` returns.

        Raises:
            ValueError: If ``self.classes_`` is empty after fitting the
                binarizer (presumably ``classes_`` is derived from
                ``label_binarizer_`` -- confirm in the class definition).
        """
        kept_labels = filter_labels(Y_labels, include=self.include, exclude=self.exclude)

        binarizer = MultiLabelBinarizer(sparse_output=False)
        self.label_binarizer_ = binarizer.fit(kept_labels)
        logger.info('{} labels found in training instances.'.format(len(self.classes_)))

        if len(self.classes_) == 0:
            raise ValueError('There are no labels available for fitting model.')

        return super(MultiLabelsClassifier, self)._fit(X, kept_labels, **kwargs)
    #end def