The following 42 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.LabelBinarizer().
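Before the project examples, here is a minimal sketch of the API they all build on (the labels are invented for illustration): `fit_transform` learns the class set and one-hot encodes in a single step, `classes_` records the learned column order, and `inverse_transform` maps indicator rows back to the original labels.

```python
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
y = ['cat', 'dog', 'bird', 'dog']
Y = lb.fit_transform(y)         # one column per class when there are >2 classes
print(lb.classes_)              # ['bird' 'cat' 'dog']
print(Y)                        # each row is a one-hot indicator vector
print(lb.inverse_transform(Y))  # recovers the original string labels
```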
```python
def _get_child_predict(self, clf, X, index=None):
    if self.stack_by_proba and hasattr(clf, 'predict_proba'):
        if self.save_stage0 and index is not None:
            proba = util.saving_predict_proba(clf, X, index)
        else:
            proba = clf.predict_proba(X)
        return proba[:, 1:]
    elif hasattr(clf, 'predict'):
        predict_result = clf.predict(X)
        if isinstance(clf, ClassifierMixin):
            # fit_transform both fits the binarizer and encodes the labels
            lb = LabelBinarizer()
            return lb.fit_transform(predict_result)
        else:
            return predict_result.reshape((predict_result.size, 1))
    else:
        return clf.fit_transform(X)
```
```python
def fit(self, X, y, check_input=True):
    self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    Y = self._label_binarizer.fit_transform(y)
    if self._label_binarizer.y_type_.startswith('multilabel'):
        # we don't (yet) support multi-label classification in ENet
        raise ValueError(
            "%s doesn't support multi-label classification" % (
                self.__class__.__name__))

    # Y = column_or_1d(Y, warn=True)
    super(ElasticNetClassifier, self).fit(X, Y)
    if self.classes_.shape[0] > 2:
        ndim = self.classes_.shape[0]
    else:
        ndim = 1
    self.coef_ = self.coef_.reshape(ndim, -1)
    return self
```
```python
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulates all pieces of data to run an experiment. This is basically
    a bag of items that makes it easy to serialize and deserialize everything
    as a unit.

    Args:
        inputs: The raw model inputs. This can be set to None if you don't
            want to serialize this value when you save the dataset.
        labels: The raw output labels.
        test_indices: The optional test indices to use. Ideally, this should
            be generated one time and reused across experiments to make
            results comparable. `generate_test_indices` can be used to
            generate the indices the first time.
        **kwargs: Additional key-value items to store.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)
    for key, value in kwargs.items():
        setattr(self, key, value)

    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    self.label_encoder = MultiLabelBinarizer() if self.is_multi_label else LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
```
```python
def preprocess(image_shape, image_paths, labels=[]):
    features = []
    for image_path in tqdm(image_paths):
        image_data = list(Image.open(image_path).resize(image_shape[:2]).getdata())
        image_data = np.asarray(image_data).reshape(image_shape)
        features.append(image_data)

    # Normalize pixel values to [0, 1]
    features = np.asarray(features)
    features = features / 255.0

    if labels:
        # One-hot encode the labels
        label_binarizer = LabelBinarizer()
        labels = label_binarizer.fit_transform(labels)
        # Shuffle features and labels together
        features, labels = shuffle(features, labels)

    return features, labels
```
```python
def get_one_hot(in_matrix):
    """
    Reformat truth matrix to same size as the output of the dense network.

    Args:
        in_matrix: the categorized 1D matrix

    Returns:
        a one-hot matrix representing the categorized matrix
    """
    if in_matrix.dtype.name == 'category':
        custom_array = in_matrix.cat.codes
    elif isinstance(in_matrix, np.ndarray):
        custom_array = in_matrix
    else:
        raise ValueError("Input matrix cannot be converted.")

    lb = LabelBinarizer()
    return np.array(lb.fit_transform(custom_array), dtype='float32')
```
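One caveat that can bite functions like `get_one_hot` above: with exactly two classes, LabelBinarizer returns a single 0/1 column rather than a two-column one-hot matrix, so a dense output layer with two units will not match it. A minimal sketch of the issue and a common workaround (the data is made up):

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer

Y = LabelBinarizer().fit_transform([0, 1, 1, 0])
print(Y.shape)                  # (4, 1), not (4, 2)
Y_full = np.hstack([1 - Y, Y])  # expand to an explicit two-column one-hot
print(Y_full.shape)             # (4, 2)
```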
```python
def extract_data(path):
    global CLASSES
    images, labels = traverse_dir(path)
    images = np.array(images)

    # Convert labels to one-hot vectors
    one_hot = preprocessing.LabelBinarizer()
    one_hot.fit(labels)
    nb_classes = len(one_hot.classes_)

    # os.path.join avoids the fragile '\labels.txt' backslash literal
    with open(os.path.join(path, 'labels.txt'), 'w') as f:
        for label in one_hot.classes_:
            f.write(label + '\n')

    # transform() encodes the whole label list in one call
    one_hots = one_hot.transform(labels)
    one_hots = np.reshape(one_hots, (images.shape[0], nb_classes))
    return images, one_hots, nb_classes
```
```python
def encode_bond_features(self, bond_set):
    """
    We break out this function for encoding bond types because it is reused
    and occupies several lines.

    Parameters:
    ===========
    - bond_set: (set or list) of bonds.
    """
    bond_lb = LabelBinarizer()
    bond_lb.fit(BOND_TYPES)

    bonds = np.zeros(len(BOND_TYPES))
    if len(bond_set) > 0:
        bond_array = bond_lb.transform([i for i in bond_set])
        for b in bond_array:
            bonds = bonds + b
    return bonds
```
```python
def load_data_labels(data_file, labels_file):
    """
    Loads MR polarity data from files, splits the data into words and
    generates labels. Returns split sentences and labels.
    """
    data = []
    labels = []
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
    data = [clean_str(s) for s in data]
    with open(labels_file, 'r') as f:
        labels.extend([s.strip() for s in f.readlines()])
    labels = [label.split(',')[1].strip() for label in labels]

    lb = LabelBinarizer()
    y = lb.fit_transform(labels)

    # max_document_length = max([len(x.split(" ")) for x in data])
    # print(max_document_length)
    vocab_processor = learn.preprocessing.VocabularyProcessor(1000)
    x = np.array(list(vocab_processor.fit_transform(data)))
    return x, y, vocab_processor
```
```python
def _check_X_y(self, X, y):
    # helpful error message for sklearn < 0.17
    is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2
    if is_2d or type_of_target(y) != 'binary':
        raise TypeError("Only binary targets supported. For training "
                        "multiclass or multilabel models, you may use the "
                        "OneVsRest or OneVsAll metaestimators in "
                        "scikit-learn.")
    X, Y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                     multi_output=False)

    self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
    y = self.label_binarizer_.fit_transform(Y).ravel().astype(np.double)
    return X, y
```
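The `pos_label=1, neg_label=-1` pair seen here (and in the ElasticNetClassifier example above) maps a binary target onto {-1, +1}, the sign convention margin-based solvers expect. A quick illustration with made-up labels:

```python
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(pos_label=1, neg_label=-1)
print(lb.fit_transform(['spam', 'ham', 'spam']).ravel())  # [ 1 -1  1]
```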
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._class_log_prob()
    self._update_complement_features(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._features_in_class(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._update_complement_features(X, y_one_hot)
    self._update_features(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_count_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    # self._class_log_prob()
    self._update_complement_features(X, y_one_hot)
    self.is_fitted = True
```
```python
def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
    if first_partial_fit and not classes:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    if not self.is_fitted:
        self.alpha_sum_ = X.shape[1] * self.alpha

    if classes:
        self.classes_ = classes

    lb = LabelBinarizer()
    y_one_hot = lb.fit_transform(y)
    self.class_counts_ = np.sum(y_one_hot, axis=0)

    if not self.classes_:
        self.classes_ = lb.classes_

    self._class_log_prob()
    self._features_in_class(X, y_one_hot)
    self.is_fitted = True
```
```python
def data2Vector(self):
    vec = DictVectorizer()
    dummy_x = vec.fit_transform(self.feature_list).toarray()
    lb = LabelBinarizer()
    dummy_y = lb.fit_transform(self.label_list)
    return dummy_x, dummy_y

# The decision tree here uses the ID3 algorithm, which selects
# features by information gain.
```
```python
def to_one_hot(y):
    """Transform multi-class labels to binary labels.

    The output of to_one_hot is sometimes referred to by some authors as the
    1-of-K coding scheme.

    Parameters
    ----------
    y : numpy array or sparse matrix of shape (n_samples,) or
        (n_samples, n_classes)
        Target values. The 2-d matrix should only contain 0 and 1,
        representing multilabel classification. Sparse matrix can be
        CSR, CSC, COO, DOK, or LIL.

    Returns
    -------
    Y : numpy array or CSR matrix of shape [n_samples, n_classes]
        Shape will be [n_samples, 1] for binary problems.

    classes_ : class vector extracted from y.
    """
    lb = LabelBinarizer()
    lb.fit(y)
    Y = lb.transform(y)
    return (Y.base, lb.classes_)
```
```python
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)

    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled to between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
```
```python
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)

    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
```
```python
def load_csv(training_fn, features_fn):
    '''Transform a tabular data set into NumPy arrays.'''
    df = pd.read_csv(training_fn, sep='\t')
    features = json.load(open(features_fn))['features']
    data = df[features].as_matrix()
    print('Data:', data.shape)

    labels = df[['label']].as_matrix().reshape(-1)
    lb = preprocessing.LabelBinarizer()
    lb.fit(labels)
    print('Labels:', labels.shape)

    return features, data, labels
```
```python
def classify(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
```
```python
def bio_classification_report(y_gold, y_pred):
    # y_gold and y_pred are lists of tag sequences, e.g. [[...], [...], [...]]
    lb = LabelBinarizer()
    y_gold_combined = lb.fit_transform(list(chain.from_iterable(y_gold)))
    # Use transform (not fit_transform) so the predictions are encoded
    # against the class set learned from the gold labels.
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_gold_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    )
```
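Why the fix above matters: refitting on the predictions re-learns the class set from y_pred, so any tag missing from the predictions silently changes the number and order of columns, misaligning them with the gold encoding. A toy illustration with invented BIO tags:

```python
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
gold = ['B-PER', 'I-PER', 'O']
pred = ['O', 'B-PER', 'O']

lb.fit(gold)
print(lb.transform(pred).shape)                    # (3, 3): columns follow gold classes
print(LabelBinarizer().fit_transform(pred).shape)  # (3, 1): refit sees only two classes
```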
```python
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
             algorithm='l-bfgs', alpha=0.00001, batch_size=200,
             learning_rate="constant", learning_rate_init=0.5, power_t=0.5,
             max_iter=200, shuffle=False, random_state=None, tol=1e-5,
             verbose=False, warm_start=False):
    sup = super(MultilayerPerceptronClassifier, self)
    sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                 activation=activation, algorithm=algorithm, alpha=alpha,
                 batch_size=batch_size, learning_rate=learning_rate,
                 learning_rate_init=learning_rate_init, power_t=power_t,
                 max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                 random_state=random_state, tol=tol, beta=0,
                 sparsity_param=0, verbose=verbose, warm_start=warm_start)

    self.label_binarizer_ = LabelBinarizer()
```
```python
def fit(self, y):
    """Fit the simplex coding.

    Parameters
    ----------
    y : array, shape = [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d array represents the simplex coding for
        multilabel classification.

    Returns
    -------
    self : returns an instance of self.
    """
    if self.binarizer is None:
        self.binarizer_ = LabelBinarizer(neg_label=0, pos_label=1,
                                         sparse_output=True)
    self.binarizer_.fit(y)
    dimension = self.binarizer_.classes_.size
    if dimension > 2:
        self.simplex_operator_ = SimplexCoding.code(dimension)
    else:
        self.simplex_operator_ = ones((1, 1))
    return self
```
```python
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
```
```python
def __init__(self, n_hidden=20, alpha=0.5, rbf_width=1.0,
             activation_func='tanh', activation_args=None,
             user_components=None, regressor=None,
             binarizer=LabelBinarizer(neg_label=-1, pos_label=1),
             random_state=None):
    super(ELMClassifier, self).__init__(n_hidden=n_hidden,
                                        alpha=alpha,
                                        random_state=random_state,
                                        activation_func=activation_func,
                                        activation_args=activation_args,
                                        user_components=user_components,
                                        rbf_width=rbf_width,
                                        regressor=regressor)

    self.classes_ = None
    self.binarizer = binarizer
```
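A side note on this signature, not specific to ELMClassifier: a default argument like `binarizer=LabelBinarizer(neg_label=-1, pos_label=1)` is evaluated once at definition time, so every instance constructed without an explicit binarizer shares the same LabelBinarizer object. If that sharing is unwanted, the usual pattern is a None default; the class below is hypothetical, just to show the shape of the fix:

```python
from sklearn.preprocessing import LabelBinarizer

class Example:
    # hypothetical sketch of the None-default pattern, not the real ELMClassifier
    def __init__(self, binarizer=None):
        # a fresh LabelBinarizer per instance instead of one shared default
        self.binarizer = binarizer if binarizer is not None \
            else LabelBinarizer(neg_label=-1, pos_label=1)
```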
```python
def fit(self, X, y):
    """
    :param X: shape = [n_samples, n_features]
    :param y: shape = [n_samples]
    :return: self
    """
    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes = labelbin.classes_

    self.class_count = np.zeros(Y.shape[1], dtype=np.float64)
    self.feature_count = np.zeros((Y.shape[1], X.shape[1]), dtype=np.float64)
    self.feature_count += Y.T @ X
    self.class_count += Y.sum(axis=0)

    smoothed_fc = self.feature_count + self.alpha
    smoothed_cc = smoothed_fc.sum(axis=1)
    self.feature_log_prob = (np.log(smoothed_fc) -
                             np.log(smoothed_cc.reshape(-1, 1)))
    return self
```
```python
def feature_mapping_to_numerical_values(self, df):
    TwoSigmaFinModTools._is_one_hot_encoder = 0
    mask = ~df.isnull()
    # Assume the training set has all possible feature_var_names. In practice
    # a training set could hold a feature_var_name the test set lacks, but
    # such features cannot be part of the trained learning algo, so missing
    # feature_var_names are added to the test set as all-zero columns.
    if not any(tuple(df.columns == 'y')):
        # All one-hot encoded feature var names occurring in the test data
        # are assigned to the public variable df_test_all_feature_var_names.
        self.df_test_all_feature_var_names = df.columns

    _feature_names_num = np.zeros(
        (TwoSigmaFinModTools._non_numerical_feature_names.shape[0],),
        dtype=object)
    ith = 0
    for feature_name in TwoSigmaFinModTools._non_numerical_feature_names:
        # Create a feature_nameNum list
        feature_name_num = ''.join([feature_name, 'Num'])
        _feature_names_num[ith] = feature_name_num
        ith += 1
        TwoSigmaFinModTools.encode_labels_in_numeric_format(df, feature_name)

        if TwoSigmaFinModTools._is_one_hot_encoder:
            is_with_label_binarizer = 0
            if is_with_label_binarizer:
                mapper_df = DataFrameMapper([(feature_name, LabelBinarizer())],
                                            df_out=True)
                feature_var_values = mapper_df.fit_transform(df.copy())
                print(df[feature_name].isnull().sum().sum())
                print(df[feature_name][mask[feature_name]].isnull().sum().sum())
                for ite in feature_var_values.columns:
                    df[ite] = feature_var_values[ite]
            else:
                TwoSigmaFinModTools.one_hot_encoder(df, feature_name)

    TwoSigmaFinModTools._feature_names_num = pd.Series(data=_feature_names_num,
                                                       dtype=object)
```
```python
def display_image_predictions(features, labels, predictions):
    n_classes = 10
    label_names = _load_label_names()
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(range(n_classes))
    label_ids = label_binarizer.inverse_transform(np.array(labels))

    fig, axes = plt.subplots(nrows=4, ncols=2)
    fig.tight_layout()
    fig.suptitle('Softmax Predictions', fontsize=20, y=1.1)

    n_predictions = 3
    margin = 0.05
    ind = np.arange(n_predictions)
    width = (1. - 2. * margin) / n_predictions

    for image_i, (feature, label_id, pred_indices, pred_values) in \
            enumerate(zip(features, label_ids, predictions.indices, predictions.values)):
        pred_names = [label_names[pred_i] for pred_i in pred_indices]
        correct_name = label_names[label_id]

        axes[image_i][0].imshow(feature * 255)
        axes[image_i][0].set_title(correct_name)
        axes[image_i][0].set_axis_off()

        axes[image_i][1].barh(ind + margin, pred_values[::-1], width)
        axes[image_i][1].set_yticks(ind + margin)
        axes[image_i][1].set_yticklabels(pred_names[::-1])
        axes[image_i][1].set_xticks([0, 0.5, 1.0])
```
```python
def main():
    digits = load_digits()
    x_train, x_test, y_train_, y_test_ = cross_validation.train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=0)

    lb = preprocessing.LabelBinarizer()
    lb.fit(digits.target)
    y_train = lb.transform(y_train_)
    y_test = lb.transform(y_test_)

    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32, shape=[None, 64])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])

    w_1 = weight_variable([64, 32])
    b_1 = bias_variable([32])
    h_1 = tf.nn.relu(tf.matmul(x, w_1) + b_1)

    w_2 = weight_variable([32, 10])
    b_2 = bias_variable([10])
    y = tf.nn.softmax(tf.matmul(h_1, w_2) + b_2)

    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())
    for i in range(1000):
        train_step.run(feed_dict={x: x_train, y_: y_train})

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(accuracy.eval(feed_dict={x: x_test, y_: y_test}))
```
```python
def main():
    digits = load_digits()
    x_train, x_test, y_train_, y_test_ = cross_validation.train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=0)

    lb = preprocessing.LabelBinarizer()
    lb.fit(digits.target)
    y_train = lb.transform(y_train_)
    y_test = lb.transform(y_test_)

    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32, shape=[None, 64])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    phase_train = tf.placeholder(tf.bool, name='phase_train')

    w_1 = weight_variable([64, 32])
    b_1 = bias_variable([32])
    t_1 = tf.matmul(x, w_1) + b_1
    bn = batch_norm(t_1, 1, phase_train)
    h_1 = binarized_ops.binarized(bn)

    w_2 = weight_variable([32, 10])
    b_2 = bias_variable([10])
    y = tf.nn.softmax(tf.matmul(h_1, w_2) + b_2)

    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())
    for i in range(1000):
        train_step.run(feed_dict={x: x_train, y_: y_train, phase_train: True})

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(accuracy.eval(feed_dict={x: x_test, y_: y_test, phase_train: False}))
```