我们从Python开源项目中,提取了以下16个代码示例,用于说明如何使用sklearn.preprocessing.MultiLabelBinarizer()。
def represent(documents):
    """Vectorize the Reuters corpus with TF-IDF and binarize its topic labels.

    Args:
        documents: Iterable of Reuters document ids; ids prefixed with
            "train" / "test" select the respective split.

    Returns:
        Tuple of (train_features, train_labels, test_features, test_labels).
    """
    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    # Tokenization
    vectorizer = TfidfVectorizer(tokenizer=tokenize)

    # Learn the vocabulary on the training split only, then reuse it.
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # Transform multilabel topic lists into indicator matrices.
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])

    return (vectorised_train_documents, train_labels,
            vectorised_test_documents, test_labels)
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulates all pieces of data to run an experiment.

    This is basically a bag of items that makes it easy to serialize and
    deserialize everything as a unit.

    Args:
        inputs: The raw model inputs. This can be set to None if you dont
            want to serialize this value when you save the dataset.
        labels: The raw output labels.
        test_indices: The optional test indices to use. Ideally, this should
            be generated one time and reused across experiments to make
            results comparable. `generate_test_indices` can be used generate
            first time indices.
        **kwargs: Additional key value items to store.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)

    # Stash any extra experiment metadata directly on the instance.
    for attr_name, attr_value in kwargs.items():
        setattr(self, attr_name, attr_value)

    # Initialize the caches before assigning test_indices, since that
    # assignment goes through a property setter that uses them.
    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    # Multi-label targets arrive as collections; single labels as scalars.
    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    if self.is_multi_label:
        self.label_encoder = MultiLabelBinarizer()
    else:
        self.label_encoder = LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
    """
    Test metadata format
    ---------------------
    filename : string
    class_ids: string of ints with space as a delimiter
    """
    meta = pd.read_csv(IRMAS_TESTING_META_PATH,
                       names=["filename", "class_ids"])
    self.X = list(meta.filename)

    # Each row's class_ids is a space-delimited string of integer ids.
    targets = [[int(class_id) for class_id in row.split()]
               for row in meta.class_ids]
    self.ml_binarizer = MultiLabelBinarizer().fit(targets)
    self.y_true = self.ml_binarizer.transform(targets)

    # Separate float accumulators, all shaped like the ground truth.
    self.y_pred = np.zeros(shape=self.y_true.shape)
    self.y_pred_raw = np.zeros(shape=self.y_true.shape)
    self.y_pred_raw_average = np.zeros(shape=self.y_true.shape)

    self.model_module = model_module
    self.weights_path = weights_path
    self.feature_filenames = os.listdir(
        os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME))
    self.dataset_mean = np.load(
        os.path.join(MODEL_MEANS_BASEPATH,
                     "{}_mean.npy".format(model_module.BASE_NAME)))
    self.evaluation_strategy = evaluation_strategy

    # Candidate decision thresholds for the two evaluation strategies.
    self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
    self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
def __init__(self, name='chord', sr=22050, hop_length=512, sparse=False):
    '''Initialize a chord task transformer.

    Parameters
    ----------
    name : str
        Name of this transformer.
    sr : number
        Audio sampling rate.
    hop_length : int
        Hop length (in samples) between analysis frames.
    sparse : bool
        If True, encode root/bass as single integer indices; otherwise
        as one-hot vectors over 13 classes (12 pitch classes + "no chord").
    '''
    super(ChordTransformer, self).__init__(name=name,
                                           namespace='chord',
                                           sr=sr, hop_length=hop_length)
    # One indicator column per pitch class (0..11).
    self.encoder = MultiLabelBinarizer()
    self.encoder.fit([list(range(12))])
    self._classes = set(self.encoder.classes_)
    self.sparse = sparse

    # FIX: np.bool / np.int were deprecated aliases of the Python builtins
    # (NumPy 1.20) and removed in NumPy >= 1.24; the builtins map to the
    # same dtypes, so behavior is unchanged on all NumPy versions.
    self.register('pitch', [None, 12], bool)
    if self.sparse:
        self.register('root', [None, 1], int)
        self.register('bass', [None, 1], int)
    else:
        self.register('root', [None, 13], bool)
        self.register('bass', [None, 13], bool)
def __init__(self, name, namespace, labels=None):
    '''Initialize a static (track-level) label transformer.

    Parameters
    ----------
    name : str
        Name of this transformer.
    namespace : str
        JAMS namespace to pull annotations from.
    labels : list or None
        Label vocabulary; if None, it is derived from the JAMS schema
        for `namespace`.
    '''
    super(StaticLabelTransformer, self).__init__(name=name,
                                                 namespace=namespace,
                                                 sr=1, hop_length=1)
    if labels is None:
        labels = jams.schema.values(namespace)

    self.encoder = MultiLabelBinarizer()
    self.encoder.fit([labels])
    self._classes = set(self.encoder.classes_)

    # FIX: np.bool was a deprecated alias of the builtin (NumPy 1.20) and
    # is removed in NumPy >= 1.24; builtin bool yields the same dtype.
    self.register('tags', [len(self._classes)], bool)
def __init__(self, multilabel=False):
    """Wrap a label encoder: sparse multi-label binarizer when `multilabel`
    is True, otherwise a plain single-label LabelEncoder."""
    self.multilabel = multilabel
    self.le = (MultiLabelBinarizer(sparse_output=True)
               if multilabel else LabelEncoder())
    # Set to True once classes are loaded from a precomputed list.
    self.from_classes = False
def __init__(self, name, namespace, labels=None, sr=22050, hop_length=512):
    '''Initialize a dynamic (frame-level) label transformer.

    Parameters
    ----------
    name : str
        Name of this transformer.
    namespace : str
        JAMS namespace to pull annotations from.
    labels : list or None
        Label vocabulary; if None, it is derived from the JAMS schema
        for `namespace`.
    sr : number
        Audio sampling rate.
    hop_length : int
        Hop length (in samples) between analysis frames.
    '''
    super(DynamicLabelTransformer, self).__init__(name=name,
                                                  namespace=namespace,
                                                  sr=sr,
                                                  hop_length=hop_length)
    if labels is None:
        labels = jams.schema.values(namespace)

    self.encoder = MultiLabelBinarizer()
    self.encoder.fit([labels])
    self._classes = set(self.encoder.classes_)

    # FIX: np.bool was a deprecated alias of the builtin (NumPy 1.20) and
    # is removed in NumPy >= 1.24; builtin bool yields the same dtype.
    self.register('tags', [None, len(self._classes)], bool)
def fit(self, X, y=None):
    """Fit one MultiLabelBinarizer per column of `X`.

    Each cell is a `self.sep`-delimited string of labels; `y` is ignored
    (present for scikit-learn API compatibility). Returns self.
    """
    tokenized = X.applymap(lambda cell: cell.split(self.sep))
    self.mlbs = []
    for column in X.columns:
        self.mlbs.append(MultiLabelBinarizer().fit(tokenized[column]))
    return self
def train(X, y, outpath=None, verbose=True):
    """Train a one-vs-rest linear SVM over TF-IDF features.

    Args:
        X: Raw training documents.
        y: Iterable of label collections (multi-label targets).
        outpath: Optional path; when given, the fitted model is pickled there.
        verbose: When True, report where the model was written.

    Returns:
        The fitted Pipeline, with the label binarizer attached as `labels_`.
    """
    def build(features, targets=None):
        """ Inner build function that builds a single model. """
        pipeline = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False)),
            ('clf', OneVsRestClassifier(LinearSVC())),
        ])
        pipeline.fit(features, targets)
        return pipeline

    # Label encode the targets
    binarizer = preprocessing.MultiLabelBinarizer()
    y = binarizer.fit_transform(y)

    model = build(X, y)
    # Keep the binarizer with the model so predictions can be decoded.
    model.labels_ = binarizer

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        if verbose:
            print("Model written out to {}".format(outpath))

    return model
def test_BRKnna_no_labels_take_closest(self):
    """BRKnn-a: when no label clears the threshold, fall back to the
    closest neighbour's labels."""
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'],
                 ['lid2', 'lid3'], ['lid0', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)

    knn = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
    # FIX: removed a stray debug print(pred) left over from development;
    # it only added noise to the test output.
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
def test_BRKnna_predict(self):
    """BRKnn-a thresholded prediction with a sparse label matrix."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'],
                  ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3,
                                        mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(
        csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
def test_BRKnna_predict_dense(self):
    """BRKnn-a thresholded prediction with a dense label matrix."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'],
                  ['lid4', 'lid3'], ['lid4', 'lid5']]
    # Default MultiLabelBinarizer output is dense, unlike the sparse test.
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3,
                                        mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(
        csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
def test_BRKnnb_predict(self):
    """BRKnn-b majority-style prediction with a sparse label matrix."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    label_sets = [['lid0', 'lid1'], ['lid0', 'lid1'],
                  ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], prediction)
def test_BRKnnb_predict_dense(self):
    """BRKnn-b majority-style prediction with a dense label matrix."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    label_sets = [['lid0', 'lid1'], ['lid0', 'lid1'],
                  ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=False)
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], prediction)
def test_BRKnnb_auto_optimize_k(self):
    """BRKnn-b should select the best k from the candidate list using its
    internal train/validation split."""
    data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'],
                 ['lid2', 'lid3'], ['lid0', 'lid1']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3],
                                 auto_optimize_k=True)

    # Force a deterministic train/validation split for the k search.
    # NOTE(review): this patches the CLASS attribute and never restores it,
    # so it leaks into any test that runs afterwards — consider mock.patch.
    # noinspection PyUnusedLocal
    def fun(s, X, y_):
        return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

    BRKNeighborsClassifier._get_split = fun
    knn.fit(data, y)
    # FIX: assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(3, knn.n_neighbors)
    pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)

# def test_time_brknnb(self):
#     times = []
#     X = sp.rand(10000, 5000, density=0.005, format='csr')
#     y = sp.rand(10000, 3000, density=0.005, format='csr')
#     knn = BRKNeighborsClassifier(n_neighbors=100)
#     knn.fit(X, y)
#     X_test = sp.rand(1000, 5000, density=0.005, format='csr')
#     for _ in range(5):
#         start = default_timer()
#         knn.predict(X_test)
#         times.append(default_timer() - start)
#     print(np.mean(times))
def _fit(self, X, Y_labels, **kwargs):
    """Fit the label binarizer on the filtered label sets, then delegate
    the actual model fitting to the parent class."""
    filtered = filter_labels(Y_labels, include=self.include,
                             exclude=self.exclude)
    self.label_binarizer_ = MultiLabelBinarizer(
        sparse_output=False).fit(filtered)

    n_labels = len(self.classes_)
    logger.info('{} labels found in training instances.'.format(n_labels))
    # Without at least one label there is nothing to train on.
    if not n_labels:
        raise ValueError('There are no labels available for fitting model.')

    return super(MultiLabelsClassifier, self)._fit(X, filtered, **kwargs)
#end def