The following 23 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.label_binarize().
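Before the project examples, here is a minimal, self-contained sketch of what label_binarize does: it turns a vector of class labels into a one-vs-rest indicator matrix with one column per entry in classes (the toy label vectors below are illustrative and not taken from any of the projects).

import numpy as np
from sklearn.preprocessing import label_binarize

# Three classes -> an (n_samples, 3) indicator matrix, one column per class.
Y = label_binarize([1, 2, 0, 2, 1], classes=[0, 1, 2])
print(Y)
# [[0 1 0]
#  [0 0 1]
#  [1 0 0]
#  [0 0 1]
#  [0 1 0]]

# With exactly two classes the result collapses to a single 0/1 column.
print(label_binarize([0, 1, 1, 0], classes=[0, 1]))
# [[0]
#  [1]
#  [1]
#  [0]]

The examples below use this binarized representation mainly to feed per-class ROC and precision-recall computations.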
def _check_binary_probabilistic_predictions(y_true, y_prob):
    """Check that y_true is binary and y_prob contains valid probabilities"""
    check_consistent_length(y_true, y_prob)

    labels = np.unique(y_true)

    if len(labels) != 2:
        raise ValueError("Only binary classification is supported. "
                         "Provided labels %s." % labels)

    if y_prob.max() > 1:
        raise ValueError("y_prob contains values greater than 1.")

    if y_prob.min() < 0:
        raise ValueError("y_prob contains values less than 0.")

    return label_binarize(y_true, labels)[:, 0]
def test_precision_recall_f_ignored_labels():
    # Test a subset of labels may be requested for PRF
    y_true = [1, 1, 2, 3]
    y_pred = [1, 3, 3, 3]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
    data = [(y_true, y_pred),
            (y_true_bin, y_pred_bin)]

    for i, (y_true, y_pred) in enumerate(data):
        recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3])
        recall_all = partial(recall_score, y_true, y_pred, labels=None)

        assert_array_almost_equal([.5, 1.], recall_13(average=None))
        assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro'))
        assert_almost_equal((.5 * 2 + 1. * 1) / 3,
                            recall_13(average='weighted'))
        assert_almost_equal(2. / 3, recall_13(average='micro'))

        # ensure the above were meaningful tests:
        for average in ['macro', 'weighted', 'micro']:
            assert_not_equal(recall_13(average=average),
                             recall_all(average=average))
def multilabel_precision_recall(y_score, y_test, clf_target_ids, clf_target_names):
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import average_precision_score
    from sklearn.preprocessing import label_binarize

    # Compute Precision-Recall and plot curve
    precision = dict()
    recall = dict()
    average_precision = dict()

    # Find indices that have non-zero detections
    clf_target_map = {k: v for k, v in zip(clf_target_ids, clf_target_names)}
    id2ind = {tid: idx for (idx, tid) in enumerate(clf_target_ids)}

    # Only handle the targets encountered
    unique = np.unique(y_test)
    nzinds = np.int64([id2ind[target] for target in unique])

    # Binarize and create precision-recall curves
    y_test_multi = label_binarize(y_test, classes=unique)
    for i, target in enumerate(unique):
        index = id2ind[target]
        name = clf_target_map[target]
        precision[name], recall[name], _ = precision_recall_curve(y_test_multi[:, i],
                                                                   y_score[:, index])
        average_precision[name] = average_precision_score(y_test_multi[:, i],
                                                          y_score[:, index])

    # Compute micro-average ROC curve and ROC area
    precision["average"], recall["average"], _ = precision_recall_curve(y_test_multi.ravel(),
                                                                        y_score[:, nzinds].ravel())
    average_precision["micro"] = average_precision_score(y_test_multi, y_score[:, nzinds],
                                                         average="micro")
    average_precision["macro"] = average_precision_score(y_test_multi, y_score[:, nzinds],
                                                         average="macro")
    return precision, recall, average_precision
def binarize_labels(actual):
    return label_binarize(actual, list(set(actual)))
def roc_auc(actual, predictions, average='weighted'):
    class_names = list(set(actual))
    # use binarized values for AUC score calculation
    return roc_auc_score(label_binarize(actual, class_names),
                         label_binarize(predictions, class_names),
                         average=average)
def generate_prec_recall_points(clf, test_examples, test_labels, pk_file):
    # Generate precision-recall points and store in a pickle file.
    precision = dict()
    recall = dict()
    average_precision = dict()
    thresholds = dict()
    n_classes = len(clf.model.classes_)
    y_test = label_binarize(test_labels, clf.model.classes_)
    y_score = clf.predict_raw_prob(test_examples)
    # It only outputs 1 column of positive probability.
    y_score = y_score[:, 1:]

    for i in range(n_classes - 1):
        precision[i], recall[i], thresholds[i] = precision_recall_curve(
            y_test[:, i], y_score[:, i])
        average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])

    # Compute micro-average ROC curve and ROC area
    precision["micro"], recall["micro"], thresholds['micro'] = \
        precision_recall_curve(y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score,
                                                         average="micro")
    if pk_file is not None:
        with open(pk_file, 'wb') as f:
            pickle.dump((precision, recall, average_precision, thresholds), f)
def roc(y_true, y_pred, classes=[0, 1, 2, 3, 4]):
    y_true = label_binarize(y_true, classes=classes)
    y_pred = label_binarize(y_pred, classes=classes)
    n_classes = len(classes)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    return roc_auc
def _marg_rounded(self, x, y):
    y_node = y.nodes
    y_link = y.links
    Y_node = label_binarize(y_node, self.prop_encoder_.classes_)
    Y_link = label_binarize(y_link, self.link_encoder_.classes_)

    # XXX can this be avoided?
    Y_node, Y_link = map(_binary_2d, (Y_node, Y_link))

    src_type = Y_node[x.link_to_prop[:, 0]]
    trg_type = Y_node[x.link_to_prop[:, 1]]

    if self.compat_features:
        pw = np.einsum('...j,...k,...l->...jkl',
                       src_type, trg_type, Y_link)
        compat = np.tensordot(x.X_compat.T, pw, axes=[1, 0])
    else:
        # equivalent to compat_features == np.ones(n_links)
        compat = np.einsum('ij,ik,il->jkl', src_type, trg_type, Y_link)

    second_order = []

    if self.coparents_ or self.grandparents_ or self.siblings_:
        link = {(a, b): k for k, (a, b) in enumerate(x.link_to_prop)}
        if self.coparents_:
            second_order.extend(y_link[link[a, b]] & y_link[link[c, b]]
                                for a, b, c in x.second_order)
        if self.grandparents_:
            second_order.extend(y_link[link[a, b]] & y_link[link[b, c]]
                                for a, b, c in x.second_order)
        if self.siblings_:
            second_order.extend(y_link[link[b, a]] & y_link[link[b, c]]
                                for a, b, c in x.second_order)
    second_order = np.array(second_order)

    return Y_node, Y_link, compat, second_order
def compute_roc(y_test, y_test_proba, nb_classes):
    y_test = label_binarize(y_test, classes=range(0, nb_classes))

    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(nb_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_test_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_test_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    return roc_auc, fpr, tpr
def load_abalone_data(proportion=1044./4177):
    from sklearn import datasets
    from sklearn import preprocessing
    from sklearn import cross_validation

    abalone = datasets.fetch_mldata('regression-datasets abalone')
    X_cate = np.array([abalone.target[i].tolist()
                       for i in range(abalone.target.shape[0])])
    X_cate = preprocessing.label_binarize(X_cate, np.unique(X_cate))
    X = np.hstack((X_cate, abalone.data))
    y = abalone.int1[0].T.astype(np.float64)
    y = y[:, None]
    X = X.astype(np.float64)
    X_train, X_test, y_train, y_test = \
        cross_validation.train_test_split(X, y, test_size=proportion)

    return X_train, y_train, X_test, y_test
def _score_micro_average(self, y, y_pred, classes, n_classes):
    """
    Compute the micro average scores for the ROCAUC curves.
    """
    # Convert y to binarized array for micro and macro scores
    y = label_binarize(y, classes=classes)
    if n_classes == 2:
        y = np.hstack((1 - y, y))

    # Compute micro-average
    self.fpr[MICRO], self.tpr[MICRO], _ = roc_curve(y.ravel(), y_pred.ravel())
    self.roc_auc[MICRO] = auc(self.fpr[MICRO], self.tpr[MICRO])
def test_precision_recall_curve():
    iris = load_iris()
    X = iris.data
    y = iris.target
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    np.random.seed(0)
    n_samples, n_features = X.shape
    X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                        random_state=0)
    clf = OneVsRestClassifier(SVC(kernel='linear', probability=True,
                                  random_state=0))
    clf.fit(X_train, y_train)
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    precision = dict()
    recall = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
                                                            y_score[:, i])
        ax.plot(recall[i], precision[i], label="target=%s" % i)
    ax.set_xlabel("Recall Score")
    ax.set_ylabel("Precision Score")
    ax.set_title("P-R")
    ax.legend(loc='best')
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()
def test_roc_auc_score():
    iris = load_iris()
    X = iris.data
    y = iris.target
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    np.random.seed(0)
    n_samples, n_features = X.shape
    X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                        random_state=0)
    clf = OneVsRestClassifier(SVC(kernel='linear', probability=True,
                                  random_state=0))
    clf.fit(X_train, y_train)
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        # roc_auc_score expects (y_true, y_score), not (fpr, tpr)
        roc_auc[i] = roc_auc_score(y_test[:, i], y_score[:, i])
        ax.plot(fpr[i], tpr[i], label="target=%s,auc=%s" % (i, roc_auc[i]))
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title("ROC")
    ax.legend(loc="best")
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()
def ensemble_classify():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,)

    ## do transformation into vector
    vectoriser.fit(tweet_list)
    vectorised_tweet_list = vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                            label_list,
                                                                            test_size=0.8,
                                                                            random_state=42)

    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')

    # evaluation
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
def lin_svc():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,)

    ## do transformation into vector
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                            label_list,
                                                                            test_size=0.8,
                                                                            random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')
    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def test_precision_recall_f_extra_labels():
    # Test handling of explicit additional (not in input) labels to PRF
    y_true = [1, 3, 3, 2]
    y_pred = [1, 1, 3, 2]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
    data = [(y_true, y_pred),
            (y_true_bin, y_pred_bin)]

    for i, (y_true, y_pred) in enumerate(data):
        # No average: zeros in array
        actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4],
                              average=None)
        assert_array_almost_equal([0., 1., 1., .5, 0.], actual)

        # Macro average is changed
        actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4],
                              average='macro')
        assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual)

        # No effect otherwise
        for average in ['micro', 'weighted', 'samples']:
            if average == 'samples' and i == 0:
                continue
            assert_almost_equal(recall_score(y_true, y_pred,
                                             labels=[0, 1, 2, 3, 4],
                                             average=average),
                                recall_score(y_true, y_pred, labels=None,
                                             average=average))

    # Error when introducing invalid label in multilabel case
    # (although it would only affect performance if average='macro'/None)
    for average in [None, 'macro', 'micro', 'samples']:
        assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
                      labels=np.arange(6), average=average)
        assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
                      labels=np.arange(-1, 4), average=average)
def test_matthews_corrcoef():
    rng = np.random.RandomState(0)
    y_true = ["a" if i == 0 else "b" for i in rng.randint(0, 2, size=20)]

    # corrcoef of same vectors must be 1
    assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0)

    # corrcoef, when the two vectors are opposites of each other, should be -1
    y_true_inv = ["b" if i == "a" else "a" for i in y_true]
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1)

    y_true_inv2 = label_binarize(y_true, ["a", "b"]) * -1
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1)

    # For the zero vector case, the corrcoef cannot be calculated and should
    # result in a RuntimeWarning
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, [0, 0, 0, 0], [0, 0, 0, 0])

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # And also for any other vector with 0 variance
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, y_true,
                               rng.randint(-100, 100) * np.ones(20, dtype=int))

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # These two vectors have 0 correlation and hence mcc should be 0
    y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
    y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
    assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.)

    # Check that sample weight is able to selectively exclude
    mask = [1] * 10 + [0] * 10
    # Now the first half of the vector elements are alone given a weight of 1
    # and hence the mcc will not be a perfect 0 as in the previous case
    assert_raises(AssertionError, assert_almost_equal,
                  matthews_corrcoef(y_1, y_2, sample_weight=mask), 0.)
def plot_roc(y_score, y_test, target_map, title='ROC curve'):
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc, precision_recall_curve
    from sklearn.preprocessing import label_binarize

    # Compute Precision-Recall and plot curve
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    target_ids = list(target_map.keys())
    target_names = list(target_map.values())
    print(target_names)

    y_test_multi = label_binarize(y_test, classes=target_ids)
    N, n_classes = y_score.shape[:2]
    for i, name in enumerate(target_names):
        fpr[name], tpr[name], _ = roc_curve(y_test_multi[:, i], y_score[:, i])
        roc_auc[name] = auc(fpr[name], tpr[name])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_multi.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot Precision-Recall curve for each class
    plt.clf()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr["micro"], tpr["micro"],
             label='ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]), linewidth=3)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.legend(loc="lower right")
    plt.show()

    for i, name in enumerate(target_names):
        plt.plot(fpr[name], tpr[name],
                 label='{0}'.format(name.title().replace('_', ' ')))
        # label='{0} (area = {1:0.2f})'
        #       ''.format(name.title().replace('_', ' '), roc_auc[name]))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show(block=False)
def _as_dmatrix(self):
    kwargs = dict(label=self.records['label'])
    kwargs['feature_names'] = self.feature_names
    featdat = self.records[self.basic_feat_cols]
    featdat = featdat.view(fields.dtype).reshape(len(featdat), -1)

    if self.hps.embedding_tag:
        embs = cache_embeddings.load_embeddings(self.hps.embedding_tag)
        npids, embsize = embs.shape
        assert embsize == self.hps.embedding_dimension
        logging.info('Loaded {}-d embeddings from rnn model {}'.format(
            embsize, self.hps.embedding_tag))
        pids = self.records['pid']
        # NB: pids are 1-indexed
        pidxs = (pids - 1).astype(np.int32)
        lookuped = embs[pidxs]
        orig_shape = featdat.shape
        featdat = np.hstack((featdat, lookuped))
        logging.info('Shape went from {} to {} after adding pid embeddings'.format(
            orig_shape, featdat.shape))

    onehot_matrices = []
    for onehot_var in self.onehot_vars:
        onehot = label_binarize(self.records[onehot_var],
                                classes=range(1, self.FIELD_TO_NVALUES[onehot_var] + 1),
                                sparse_output=True).astype(fields.dtype)
        onehot_matrices.append(onehot)
    if onehot_matrices:
        # TODO: There are some perf issues with this. Look into this workaround:
        # https://stackoverflow.com/questions/6844998/is-there-an-efficient-way-of-concatenating-scipy-sparse-matrices/33259578#33259578
        featdat = scipy.sparse.hstack([featdat, ] + onehot_matrices, format='csr')

    logging.info('Made dmatrix with feature data having shape {}'.format(featdat.shape))

    # https://github.com/dmlc/xgboost/issues/2554
    if not kwargs['label'].flags.c_contiguous:
        logging.info('Contiguizing labels')
        kwargs['label'] = np.ascontiguousarray(kwargs['label'])
        logging.info('Contiguized')
    if isinstance(featdat, np.ndarray) and not featdat.flags.c_contiguous:
        logging.info('Contiguizing feature data')
        featdat = np.ascontiguousarray(featdat)

    if FTYPES:
        kwargs['feature_types'] = self.feature_types
    return xgb.DMatrix(featdat, **kwargs)
def main():
    plt.figure()
    for j in range(1, 6):
        random_state = np.random.RandomState(0)
        X, y = load_file(file_name, j)
        k = 2
        # y = label_binarize(y, classes=[0, 1, 2])
        # n_classes = y.shape[1]
        # print n_classes
        n_classes = 2
        ylabel, ave = transformtolabel(y, k)
        ylabel = np.array(ylabel)
        # ylabel = np.transpose(ylabel)

        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, ylabel, test_size=.5,
                                                            random_state=0)

        # Learn to predict each class against the other
        classifier = OneVsRestClassifier(svm.SVC(kernel='rbf', probability=True,
                                                 random_state=random_state))
        y_score = classifier.fit(X_train, y_train).decision_function(X_test)

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            # print y_test[i]
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # print fpr[1]

        ##############################################################################
        # Plot of a ROC curve for a specific class
        # plt.figure()
        # plt.plot(fpr[0], tpr[0], label='CO below %0.2f' % ave + ' (area = %0.2f)' % roc_auc[0])
        plt.plot(fpr[1], tpr[1],
                 label='O3 prediction (area = %0.2f)' % roc_auc[1] + '(%0.0f' % j + ' features)')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic for SVM')
        plt.legend(loc="lower right")
    plt.show()
def gensim_classifier():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = []
    for tweet in tweet_list:
        temp_doc = tweet.split()
        sentences.append(temp_doc)

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences, workers=num_workers,
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling, seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector,
                                                       label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)
    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')