Python sklearn.preprocessing 模块,label_binarize() 实例源码

我们从Python开源项目中,提取了以下23个代码示例,用于说明如何使用sklearn.preprocessing.label_binarize()

项目:MHG-scansion    作者:henchc    | 项目源码 | 文件源码
def _check_binary_probabilistic_predictions(y_true, y_prob):
    """Validate binary targets and probabilities, then binarize the targets.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Exactly two distinct values must be present.
    y_prob : ndarray
        Predicted probabilities. Must expose ``.max()`` / ``.min()`` and
        every value must lie in ``[0, 1]``.

    Returns
    -------
    ndarray
        First column of ``label_binarize(y_true, classes=labels)`` where
        ``labels`` is ``np.unique(y_true)``.

    Raises
    ------
    ValueError
        If ``y_true`` does not contain exactly two labels, or ``y_prob``
        holds a value above 1 or below 0.  ``check_consistent_length`` may
        raise as well when the two inputs differ in length.
    """
    check_consistent_length(y_true, y_prob)

    labels = np.unique(y_true)

    if len(labels) != 2:
        raise ValueError("Only binary classification is supported. "
                         "Provided labels %s." % labels)

    if y_prob.max() > 1:
        raise ValueError("y_prob contains values greater than 1.")

    if y_prob.min() < 0:
        raise ValueError("y_prob contains values less than 0.")

    # ``classes`` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases, and the keyword form works on old ones too.
    return label_binarize(y_true, classes=labels)[:, 0]
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_precision_recall_f_ignored_labels():
    """Recall restricted to ``labels=[1, 3]`` must match hand-computed values.

    The same checks run on the plain multiclass lists and on their
    indicator-matrix (binarized) counterparts.
    """
    y_true = [1, 1, 2, 3]
    y_pred = [1, 3, 3, 3]
    cases = [
        (y_true, y_pred),
        (label_binarize(y_true, classes=np.arange(5)),
         label_binarize(y_pred, classes=np.arange(5))),
    ]

    for truth, prediction in cases:
        recall_13 = partial(recall_score, truth, prediction, labels=[1, 3])
        recall_all = partial(recall_score, truth, prediction, labels=None)

        # Per-class recall for labels 1 and 3 respectively.
        assert_array_almost_equal([.5, 1.], recall_13(average=None))

        # Averaged variants, in the order macro / weighted / micro.
        expected_by_average = [
            ('macro', (.5 + 1.) / 2),
            ('weighted', (.5 * 2 + 1. * 1) / 3),
            ('micro', 2. / 3),
        ]
        for average, expected in expected_by_average:
            assert_almost_equal(expected, recall_13(average=average))

        # Guard against a vacuous test: restricting the labels has to
        # change every averaged score.
        for average, _ in expected_by_average:
            assert_not_equal(recall_13(average=average),
                             recall_all(average=average))
项目:pybot    作者:spillai    | 项目源码 | 文件源码
def multilabel_precision_recall(y_score, y_test, clf_target_ids, clf_target_names):
    """Compute per-class and averaged precision-recall statistics.

    Only the classes that actually occur in ``y_test`` are evaluated.

    Parameters
    ----------
    y_score : 2-D array
        Classifier scores, indexed as ``y_score[:, k]`` where column ``k``
        belongs to ``clf_target_ids[k]``.
    y_test : array-like
        True target ids; each one must appear in ``clf_target_ids``.
    clf_target_ids : sequence
        Target ids in the column order of ``y_score``.
    clf_target_names : sequence
        Display names, parallel to ``clf_target_ids``.

    Returns
    -------
    precision, recall : dict
        Curves keyed by class name, plus the key ``"average"`` holding the
        curve computed over all (sample, class) pairs flattened together.
    average_precision : dict
        Average precision keyed by class name, plus ``"micro"`` and
        ``"macro"``.

    Raises
    ------
    KeyError
        If ``y_test`` contains an id that is not in ``clf_target_ids``.
    """
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import average_precision_score
    from sklearn.preprocessing import label_binarize

    precision = dict()
    recall = dict()
    average_precision = dict()

    # Lookup tables: id -> display name, id -> column index in y_score
    clf_target_map = {k: v for k, v in zip(clf_target_ids, clf_target_names)}
    id2ind = {tid: idx for (idx, tid) in enumerate(clf_target_ids)}

    # Only handle the targets encountered
    unique = np.unique(y_test)
    nzinds = np.int64([id2ind[target] for target in unique])

    # Binarize the ground truth: one indicator column per encountered class
    y_test_multi = label_binarize(y_test, classes=unique)
    if len(unique) == 2:
        # For exactly two classes label_binarize returns a single column
        # (the indicator of the second class).  Expand it to one column per
        # class so that the per-class indexing and the (n, 2) score slices
        # below line up instead of failing.
        y_test_multi = np.hstack((1 - y_test_multi, y_test_multi))

    for i, target in enumerate(unique):
        index = id2ind[target]
        name = clf_target_map[target]
        precision[name], recall[name], _ = precision_recall_curve(y_test_multi[:, i],
                                                                  y_score[:, index])
        average_precision[name] = average_precision_score(y_test_multi[:, i], y_score[:, index])

    # Micro-averaged precision-recall curve over the encountered classes
    precision["average"], recall["average"], _ = precision_recall_curve(y_test_multi.ravel(),
                                                                        y_score[:, nzinds].ravel())
    average_precision["micro"] = average_precision_score(y_test_multi, y_score[:, nzinds],
                                                         average="micro")
    average_precision["macro"] = average_precision_score(y_test_multi, y_score[:, nzinds],
                                                         average="macro")
    return precision, recall, average_precision
项目:pyML    作者:tekrei    | 项目源码 | 文件源码
def binarize_labels(actual):
    """Binarize ``actual`` against the distinct labels it contains.

    The class list is ``list(set(actual))``, so it follows set iteration
    order rather than sorted order.

    Parameters
    ----------
    actual : iterable of hashable labels

    Returns
    -------
    The result of ``label_binarize`` for ``actual`` with those classes.
    """
    # ``classes`` is keyword-only in current scikit-learn releases.
    return label_binarize(actual, classes=list(set(actual)))
项目:pyML    作者:tekrei    | 项目源码 | 文件源码
def roc_auc(actual, predictions, average='weighted'):
    """Return the ROC AUC of hard predictions against the true labels.

    Both label sequences are binarized against the set of labels found in
    ``actual`` before being handed to ``roc_auc_score``.

    Parameters
    ----------
    actual : iterable of hashable labels
        Ground-truth labels; also defines the class list.
    predictions : iterable
        Predicted labels.
    average : str, optional
        Forwarded to ``roc_auc_score`` (default ``'weighted'``).
    """
    class_names = list(set(actual))
    # Use binarized values for the AUC score calculation.  ``classes`` must
    # be given by keyword: it is keyword-only in current scikit-learn.
    actual_bin = label_binarize(actual, classes=class_names)
    predictions_bin = label_binarize(predictions, classes=class_names)
    return roc_auc_score(actual_bin, predictions_bin, average=average)
项目:bionlp17    作者:leebird    | 项目源码 | 文件源码
def generate_prec_recall_points(clf, test_examples, test_labels, pk_file):
    """Generate precision-recall points and optionally pickle them.

    Parameters
    ----------
    clf : object
        Must expose ``model.classes_`` and ``predict_raw_prob(examples)``.
    test_examples
        Examples passed to ``clf.predict_raw_prob``.
    test_labels : array-like
        True labels, binarized against ``clf.model.classes_``.
    pk_file : str or None
        When not ``None``, the tuple ``(precision, recall,
        average_precision, thresholds)`` is pickled to this path.

    Returns
    -------
    None
        The results are only written to ``pk_file``.
    """
    precision = dict()
    recall = dict()
    average_precision = dict()
    thresholds = dict()

    n_classes = len(clf.model.classes_)
    # ``classes`` is keyword-only in current scikit-learn releases.
    y_test = label_binarize(test_labels, classes=clf.model.classes_)

    y_score = clf.predict_raw_prob(test_examples)
    # Drop the first score column; per the original note this leaves only
    # the positive-class probability.
    # NOTE(review): this looks written for the binary case -- with more
    # classes y_test and y_score would have different widths; confirm.
    y_score = y_score[:, 1:]

    for i in range(n_classes - 1):
        precision[i], recall[i], thresholds[i] = precision_recall_curve(
            y_test[:, i],
            y_score[:, i])
        average_precision[i] = average_precision_score(y_test[:, i],
                                                       y_score[:, i])
    # Micro-averaged precision-recall curve and average precision
    precision["micro"], recall["micro"], thresholds['micro'] = \
        precision_recall_curve(y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score,
                                                         average="micro")

    if pk_file is not None:
        with open(pk_file, 'wb') as f:
            pickle.dump((precision, recall, average_precision, thresholds), f)
项目:tefla    作者:litan    | 项目源码 | 文件源码
def roc(y_true, y_pred, classes=[0, 1, 2, 3, 4]):
    """Return ROC AUC per class index plus a ``"micro"`` average.

    Both ``y_true`` and ``y_pred`` are label sequences; they are binarized
    against ``classes`` and compared column by column.

    Returns
    -------
    dict
        Maps each class position ``0 .. len(classes) - 1`` and the key
        ``"micro"`` to an AUC value.
    """
    true_bin = label_binarize(y_true, classes=classes)
    pred_bin = label_binarize(y_pred, classes=classes)

    fpr, tpr, roc_auc = {}, {}, {}

    def _record(key, truth, scores):
        # Store the curve and its area under the given key.
        fpr[key], tpr[key], _ = roc_curve(truth, scores)
        roc_auc[key] = auc(fpr[key], tpr[key])

    for k in range(len(classes)):
        _record(k, true_bin[:, k], pred_bin[:, k])

    # Micro average: every (sample, class) cell treated as one decision.
    _record("micro", true_bin.ravel(), pred_bin.ravel())
    return roc_auc
项目:marseille    作者:vene    | 项目源码 | 文件源码
def _marg_rounded(self, x, y):
        """Build 0/1 indicator marginals from the labelled structure ``y``.

        Parameters
        ----------
        x : object
            Reads ``x.link_to_prop`` (indexed as ``[:, 0]`` / ``[:, 1]`` and
            iterated as ``(a, b)`` pairs), ``x.second_order`` (iterated as
            ``(a, b, c)`` triples) and, when ``self.compat_features`` is
            truthy, ``x.X_compat``.
        y : object
            Reads ``y.nodes`` and ``y.links``.

        Returns
        -------
        Y_node, Y_link
            Binarized node / link labels after ``_binary_2d``.
        compat
            Combination of source-node, target-node and link indicators,
            weighted by ``x.X_compat`` when ``self.compat_features`` is set
            and summed over links otherwise.
        second_order : ndarray
            Conjunctions of link values for the enabled second-order
            factor types (coparents, grandparents, siblings), in that order.
        """
        y_node = y.nodes
        y_link = y.links
        # ``classes`` is keyword-only in current scikit-learn releases.
        Y_node = label_binarize(y_node, classes=self.prop_encoder_.classes_)
        Y_link = label_binarize(y_link, classes=self.link_encoder_.classes_)

        # XXX can this be avoided?
        # NOTE(review): _binary_2d is defined elsewhere; presumably it widens
        # the single-column binary output to two columns -- confirm.
        Y_node, Y_link = map(_binary_2d, (Y_node, Y_link))

        # Indicator rows of the node at each end of every link.
        src_type = Y_node[x.link_to_prop[:, 0]]
        trg_type = Y_node[x.link_to_prop[:, 1]]

        if self.compat_features:
            # Per-link outer product of (source, target, link) indicators,
            # then contracted against the compatibility features.
            pw = np.einsum('...j,...k,...l->...jkl',
                           src_type, trg_type, Y_link)
            compat = np.tensordot(x.X_compat.T, pw, axes=[1, 0])
        else:
            # equivalent to compat_features == np.ones(n_links)
            compat = np.einsum('ij,ik,il->jkl', src_type, trg_type, Y_link)

        second_order = []

        if self.coparents_ or self.grandparents_ or self.siblings_:
            # Map each (a, b) pair to its row position in x.link_to_prop.
            link = {(a, b): k for k, (a, b) in enumerate(x.link_to_prop)}
            if self.coparents_:
                second_order.extend(y_link[link[a, b]] & y_link[link[c, b]]
                                    for a, b, c in x.second_order)
            if self.grandparents_:
                second_order.extend(y_link[link[a, b]] & y_link[link[b, c]]
                                    for a, b, c in x.second_order)
            if self.siblings_:
                second_order.extend(y_link[link[b, a]] & y_link[link[b, c]]
                                    for a, b, c in x.second_order)
        second_order = np.array(second_order)

        return Y_node, Y_link, compat, second_order
项目:tefla    作者:openAGI    | 项目源码 | 文件源码
def roc(y_true, y_pred, classes=[0, 1, 2, 3, 4]):
    """Compute one-vs-rest ROC AUC for each class and the micro average.

    ``y_true`` and ``y_pred`` are both binarized against ``classes``; the
    returned dict is keyed by class position and by ``"micro"``.
    """
    indicator_true = label_binarize(y_true, classes=classes)
    indicator_pred = label_binarize(y_pred, classes=classes)

    roc_auc = {}

    # One curve per class column.
    for idx in range(len(classes)):
        class_fpr, class_tpr, _ = roc_curve(indicator_true[:, idx],
                                            indicator_pred[:, idx])
        roc_auc[idx] = auc(class_fpr, class_tpr)

    # Micro-average ROC curve and ROC area over the flattened matrices.
    micro_fpr, micro_tpr, _ = roc_curve(indicator_true.ravel(),
                                        indicator_pred.ravel())
    roc_auc["micro"] = auc(micro_fpr, micro_tpr)

    return roc_auc
项目:RIDDLE    作者:jisungk    | 项目源码 | 文件源码
def compute_roc(y_test, y_test_proba, nb_classes):
    """Compute per-class and micro-averaged ROC curves and AUC values.

    Parameters
    ----------
    y_test : array-like
        True class labels drawn from ``0 .. nb_classes - 1``.
    y_test_proba : 2-D array
        Scores indexed as ``y_test_proba[:, i]`` for class ``i``.
    nb_classes : int
        Number of classes.

    Returns
    -------
    roc_auc, fpr, tpr : dict
        Each keyed by class index ``0 .. nb_classes - 1`` and by
        ``"micro"``.
    """
    # Function-scope import so the two-class expansion does not depend on
    # the surrounding module importing numpy.
    import numpy as np

    y_test = label_binarize(y_test, classes=range(0, nb_classes))
    if nb_classes == 2:
        # For two classes label_binarize returns a single column (the
        # indicator of class 1).  Expand to one column per class so the
        # per-class loop and the flattened micro average match the two
        # score columns instead of raising IndexError.
        y_test = np.hstack((1 - y_test, y_test))

    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(nb_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_test_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                              y_test_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    return roc_auc, fpr, tpr
项目:SCFGP    作者:MaxInGaussian    | 项目源码 | 文件源码
def load_abalone_data(proportion=1044./4177):
    """Load the abalone regression data and split it into train/test sets.

    The categorical values held in ``abalone.target`` are one-hot encoded
    and stacked in front of the columns of ``abalone.data``; the regression
    target is taken from ``abalone.int1[0]``.

    Parameters
    ----------
    proportion : float, optional
        Passed as ``test_size`` to ``train_test_split``.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Note the order: both training arrays come first.
    """
    from sklearn import datasets
    from sklearn import preprocessing
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Very old scikit-learn releases only ship the (since removed)
        # cross_validation module.
        from sklearn.cross_validation import train_test_split
    # NOTE(review): fetch_mldata has been removed from recent scikit-learn
    # releases; this call needs a release that still provides it.
    abalone = datasets.fetch_mldata('regression-datasets abalone')
    X_cate = np.array([abalone.target[i].tolist()
        for i in range(abalone.target.shape[0])])
    # ``classes`` is keyword-only in current scikit-learn releases.
    X_cate = preprocessing.label_binarize(X_cate, classes=np.unique(X_cate))
    X = np.hstack((X_cate, abalone.data))
    y = abalone.int1[0].T.astype(np.float64)
    # Make y a column vector.
    y = y[:, None]
    X = X.astype(np.float64)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=proportion)
    return X_train, y_train, X_test, y_test
项目:SCFGP    作者:MaxInGaussian    | 项目源码 | 文件源码
def load_abalone_data(proportion=1044./4177):
    """Fetch the abalone data set and return a random train/test split.

    ``abalone.target`` carries the categorical values, which are one-hot
    encoded and prepended to ``abalone.data``.  ``abalone.int1[0]`` is used
    as the regression target and returned as a column vector.

    Parameters
    ----------
    proportion : float, optional
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Training arrays first, then the test arrays.
    """
    from sklearn import datasets
    from sklearn import preprocessing
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Fallback for old releases that predate model_selection.
        from sklearn.cross_validation import train_test_split
    # NOTE(review): fetch_mldata is no longer available in recent
    # scikit-learn releases; an older release is required for this call.
    abalone = datasets.fetch_mldata('regression-datasets abalone')
    X_cate = np.array([abalone.target[i].tolist()
        for i in range(abalone.target.shape[0])])
    # Pass ``classes`` by keyword; it is keyword-only in current releases.
    X_cate = preprocessing.label_binarize(X_cate, classes=np.unique(X_cate))
    X = np.hstack((X_cate, abalone.data))
    y = abalone.int1[0].T.astype(np.float64)
    y = y[:, None]
    X = X.astype(np.float64)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=proportion)
    return X_train, y_train, X_test, y_test
项目:yellowbrick    作者:DistrictDataLabs    | 项目源码 | 文件源码
def _score_micro_average(self, y, y_pred, classes, n_classes):
        """
        Store the micro-averaged ROC curve and its AUC under the MICRO key
        of ``self.fpr``, ``self.tpr`` and ``self.roc_auc``.
        """
        # Indicator matrix of the true labels
        indicator = label_binarize(y, classes=classes)
        if n_classes == 2:
            # Two classes: pair the single column with its complement
            indicator = np.hstack((1 - indicator, indicator))

        # Flatten so every (sample, class) cell counts as one decision
        micro_fpr, micro_tpr, _ = roc_curve(indicator.ravel(), y_pred.ravel())
        self.fpr[MICRO] = micro_fpr
        self.tpr[MICRO] = micro_tpr
        self.roc_auc[MICRO] = auc(self.fpr[MICRO], self.tpr[MICRO])
项目:ML-note    作者:JasonK93    | 项目源码 | 文件源码
def test_precision_recall_curve():
    """Plot one-vs-rest precision-recall curves on noise-padded iris data.

    The iris features are extended with Gaussian noise columns, a linear
    SVC is trained one-vs-rest on half of the samples, and one
    precision-recall curve per class is drawn from its decision scores.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target

    # One indicator column per class
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    # Append 200 * n_features noise columns (fixed seed)
    np.random.seed(0)
    n_samples, n_features = X.shape
    X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    clf = OneVsRestClassifier(
        SVC(kernel='linear', probability=True, random_state=0))
    # A single fit is enough: fitting the same seeded estimator twice on
    # the same data only doubles the training time.
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    precision = dict()
    recall = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
                                                            y_score[:, i])
        ax.plot(recall[i], precision[i], label="target=%s" % i)
    ax.set_xlabel("Recall Score")
    ax.set_ylabel("Precision Score")
    ax.set_title("P-R")
    ax.legend(loc='best')
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()
项目:ML-note    作者:JasonK93    | 项目源码 | 文件源码
def test_roc_auc_score():
    """Plot one-vs-rest ROC curves, with AUC, on noise-padded iris data.

    The iris features are extended with Gaussian noise columns, a linear
    SVC is trained one-vs-rest on half of the samples, and one ROC curve
    per class is drawn from its decision scores.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target

    # One indicator column per class
    y = label_binarize(y, classes=[0, 1, 2])
    n_classes = y.shape[1]

    # Append 200 * n_features noise columns (fixed seed)
    np.random.seed(0)
    n_samples, n_features = X.shape
    X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    clf = OneVsRestClassifier(
        SVC(kernel='linear', probability=True, random_state=0))
    # A single fit is enough; a second identical fit only costs time.
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        # roc_auc_score expects (y_true, y_score), not the curve's
        # (fpr, tpr) coordinates.
        roc_auc[i] = roc_auc_score(y_test[:, i], y_score[:, i])
        ax.plot(fpr[i], tpr[i], label="target=%s,auc=%s" % (i, roc_auc[i]))
    # Chance diagonal for reference
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title("ROC")
    ax.legend(loc="best")
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()
项目:sport-news-retrieval    作者:Andyccs    | 项目源码 | 文件源码
def ensemble_classify():
  """Train AdaBoost on TF-IDF tweet features, save outputs and evaluate.

  Writes the predictions to ``data/tfidf_ada.csv``, stores the fitted
  classifier under the name ``tfidf_ada`` and passes the binarized
  predictions and test labels to ``generate_eval_metrics``.
  """
  labels = get_labels()
  tweets = get_labelled_tweets()

  # TF-IDF over word unigrams and bigrams
  tfidf = TfidfVectorizer(
      min_df=3,
      max_features=None,
      strip_accents='unicode',
      analyzer='word',
      token_pattern=r'\w{1,}',
      ngram_range=(1, 2),
      use_idf=1,
      smooth_idf=1,
      sublinear_tf=1,
  )

  # Learn the vocabulary, then turn every tweet into a vector
  tweet_vectors = tfidf.fit(tweets).transform(tweets)
  split = train_test_split(tweet_vectors, labels, test_size=0.8, random_state=42)
  train_vector, test_vector, train_labels, test_labels = split

  # 10 weak learners
  ada_classifier = AdaBoostClassifier(n_estimators=10).fit(train_vector, train_labels)
  predicted = ada_classifier.predict(test_vector)

  # Persist the predictions and the model
  create_directory('data')
  predicted.tofile("data/tfidf_ada.csv", sep=',')
  save_model(ada_classifier, 'tfidf_ada')

  # Evaluate on indicator matrices
  predicted_bin = label_binarize(predicted, classes=class_list)
  expected_bin = label_binarize(test_labels, classes=class_list)
  generate_eval_metrics(predicted_bin, 'tfidf_ada', expected_bin)
项目:sport-news-retrieval    作者:Andyccs    | 项目源码 | 文件源码
def lin_svc():
  """Train a one-vs-rest LinearSVC on TF-IDF tweet features and evaluate.

  Writes the test labels and predictions to CSV files under ``data/``,
  stores the fitted classifier and vectoriser, and passes the binarized
  predictions, binarized labels and decision scores to ``evaluate``.
  """
  labels = get_labels()
  tweets = get_labelled_tweets()

  # TF-IDF over word unigrams and bigrams
  tfidf = TfidfVectorizer(
      min_df=3,
      max_features=None,
      strip_accents='unicode',
      analyzer='word',
      token_pattern=r'\w{1,}',
      ngram_range=(1, 2),
      use_idf=1,
      smooth_idf=1,
      sublinear_tf=1,
  )

  # Learn the vocabulary, then turn every tweet into a vector
  fitted_vectoriser = tfidf.fit(tweets)
  tweet_vectors = fitted_vectoriser.transform(tweets)
  split = train_test_split(tweet_vectors, labels, test_size=0.8, random_state=42)
  train_vector, test_vector, train_labels, test_labels = split

  # Fit the classifier and predict the held-out part
  ovr_classifier = OneVsRestClassifier(LinearSVC()).fit(train_vector, train_labels)
  predicted = ovr_classifier.predict(test_vector)

  # Persist labels, predictions, model and vectoriser
  create_directory('data')
  save_to_csv("data/testset_labels.csv", test_labels)
  predicted.tofile("data/tfidf_linsvc.csv", sep=',')
  save_model(ovr_classifier, 'tfidf_linsvc')
  save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

  # Evaluate on indicator matrices plus raw decision scores
  decision_scores = ovr_classifier.decision_function(test_vector)
  predicted_bin = label_binarize(predicted, classes=class_list)
  expected_bin = label_binarize(test_labels, classes=class_list)
  evaluate(predicted_bin, expected_bin, decision_scores, 'tfidf_linsvc')
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_precision_recall_f_extra_labels():
    """Explicit labels absent from the input must be handled by recall.

    Runs on the multiclass lists (case 0) and on their indicator-matrix
    form (case 1).
    """
    y_true = [1, 3, 3, 2]
    y_pred = [1, 1, 3, 2]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))

    all_labels = [0, 1, 2, 3, 4]
    per_class_recall = [0., 1., 1., .5, 0.]

    cases = [(y_true, y_pred), (y_true_bin, y_pred_bin)]
    for case_no, (truth, prediction) in enumerate(cases):
        # Without averaging, the unseen labels show up as zeros
        unaveraged = recall_score(truth, prediction, labels=all_labels,
                                  average=None)
        assert_array_almost_equal(per_class_recall, unaveraged)

        # The macro average includes those zeros
        macro = recall_score(truth, prediction, labels=all_labels,
                             average='macro')
        assert_array_almost_equal(np.mean(per_class_recall), macro)

        # No effect otherwise
        for average in ['micro', 'weighted', 'samples']:
            # 'samples' is skipped for the multiclass case
            if case_no == 0 and average == 'samples':
                continue
            with_extra = recall_score(truth, prediction, labels=all_labels,
                                      average=average)
            without_extra = recall_score(truth, prediction, labels=None,
                                         average=average)
            assert_almost_equal(with_extra, without_extra)

    # Error when introducing invalid label in multilabel case
    # (although it would only affect performance if average='macro'/None)
    for average in [None, 'macro', 'micro', 'samples']:
        for invalid_labels in (np.arange(6), np.arange(-1, 4)):
            assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
                          labels=invalid_labels, average=average)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_matthews_corrcoef():
    """Check matthews_corrcoef on identical, inverted, constant and
    uncorrelated label vectors, and with a sample-weight mask."""
    rng = np.random.RandomState(0)
    y_true = ["a" if i == 0 else "b" for i in rng.randint(0, 2, size=20)]

    # corrcoef of same vectors must be 1
    assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0)

    # corrcoef, when the two vectors are opposites of each other, should be -1
    y_true_inv = ["b" if i == "a" else "a" for i in y_true]

    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1)
    # ``classes`` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    y_true_inv2 = label_binarize(y_true, classes=["a", "b"]) * -1
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1)

    # For the zero vector case, the corrcoef cannot be calculated and should
    # result in a RuntimeWarning
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, [0, 0, 0, 0], [0, 0, 0, 0])

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # And also for any other vector with 0 variance
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, y_true,
                               rng.randint(-100, 100) * np.ones(20, dtype=int))

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # These two vectors have 0 correlation and hence mcc should be 0
    y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
    y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
    assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.)

    # Check that sample weight is able to selectively exclude
    mask = [1] * 10 + [0] * 10
    # Now the first half of the vector elements are alone given a weight of 1
    # and hence the mcc will not be a perfect 0 as in the previous case
    assert_raises(AssertionError, assert_almost_equal,
                  matthews_corrcoef(y_1, y_2, sample_weight=mask), 0.)
项目:pybot    作者:spillai    | 项目源码 | 文件源码
def plot_roc(y_score, y_test, target_map, title='ROC curve'):
    """Plot the micro-averaged ROC curve, then one ROC curve per target.

    Parameters
    ----------
    y_score : 2-D array
        Scores indexed as ``y_score[:, i]`` for the i-th entry of
        ``target_map``.
    y_test : array-like
        True target ids, binarized against the keys of ``target_map``.
    target_map : dict
        Maps target id to a display name (a string; ``.title()`` is called
        on it for the legend).
    title : str, optional
        Title of the per-target figure.

    Shows two figures: the first call to ``plt.show()`` uses the default
    blocking behaviour, the second is shown with ``block=False``.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc
    from sklearn.preprocessing import label_binarize

    # Per-target ROC curves and areas, keyed by display name
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    # Materialise as lists: on Python 3 dict views are not sequences and
    # cannot be used as the ``classes`` array.
    target_ids = list(target_map.keys())
    target_names = list(target_map.values())
    # Function-call form prints the same text on Python 2 and Python 3.
    print(target_names)

    y_test_multi = label_binarize(y_test, classes=target_ids)
    for i, name in enumerate(target_names):
        fpr[name], tpr[name], _ = roc_curve(y_test_multi[:, i], y_score[:, i])
        roc_auc[name] = auc(fpr[name], tpr[name])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_multi.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot the micro-averaged ROC curve against the chance diagonal
    plt.clf()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr["micro"], tpr["micro"],
             label='ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]), linewidth=3)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.legend(loc="lower right")
    plt.show()

    # Plot one ROC curve per target
    for i, name in enumerate(target_names):
        plt.plot(fpr[name], tpr[name],
                 label='{0}'.format(name.title().replace('_', ' ')))

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show(block=False)
项目:instacart-basket-prediction    作者:colinmorris    | 项目源码 | 文件源码
def _as_dmatrix(self):
    """Assemble this dataset's features and labels into an ``xgb.DMatrix``.

    The feature matrix is built from the basic feature columns of
    ``self.records``, optionally followed by embedding columns looked up by
    product id (when ``self.hps.embedding_tag`` is set) and by sparse
    one-hot encodings of each variable in ``self.onehot_vars``.

    Returns the result of ``xgb.DMatrix(featdat, **kwargs)`` where kwargs
    carries ``label``, ``feature_names`` and, if ``FTYPES`` is truthy,
    ``feature_types``.
    """
    kwargs = dict(label=self.records['label'])
    kwargs['feature_names'] = self.feature_names

    # Select the basic feature fields, then reinterpret them as a plain
    # 2-D array with one row per record.
    # NOTE(review): presumably self.records is a NumPy structured array
    # whose selected fields all share fields.dtype -- confirm.
    featdat = self.records[self.basic_feat_cols]
    featdat = featdat.view(fields.dtype).reshape(len(featdat), -1)

    if self.hps.embedding_tag:
      # Append one embedding row per record, looked up by product id.
      embs = cache_embeddings.load_embeddings(self.hps.embedding_tag)
      npids, embsize = embs.shape
      assert embsize == self.hps.embedding_dimension
      logging.info('Loaded {}-d embeddings from rnn model {}'.format(
        embsize, self.hps.embedding_tag))
      pids = self.records['pid']
      # NB: pids are 1-indexed
      pidxs = (pids-1).astype(np.int32)
      lookuped = embs[pidxs]
      orig_shape = featdat.shape
      featdat = np.hstack((featdat, lookuped))
      logging.info('Shape went from {} to {} after adding pid embeddings'.format(
        orig_shape, featdat.shape))

    # One sparse indicator matrix per one-hot variable; the classes are
    # 1 .. FIELD_TO_NVALUES[var] inclusive.
    onehot_matrices = []
    for onehot_var in self.onehot_vars:
      onehot = label_binarize(self.records[onehot_var], 
          classes=range(1, self.FIELD_TO_NVALUES[onehot_var]+1),
          sparse_output=True).astype(fields.dtype)
      onehot_matrices.append(onehot)
    if onehot_matrices:
      # TODO: There are some perf issues with this. Look into this workaround:
      # https://stackoverflow.com/questions/6844998/is-there-an-efficient-way-of-concatenating-scipy-sparse-matrices/33259578#33259578
      # From here on featdat is a sparse CSR matrix rather than an ndarray.
      featdat = scipy.sparse.hstack([featdat,]+onehot_matrices, format='csr')

    logging.info('Made dmatrix with feature data having shape {}'.format(featdat.shape))

    # Make labels (and dense feature data) C-contiguous before handing them
    # to xgboost; see the linked issue.
    # https://github.com/dmlc/xgboost/issues/2554
    if not kwargs['label'].flags.c_contiguous:
      logging.info('Contiguizing labels')
      kwargs['label'] = np.ascontiguousarray(kwargs['label'])
      logging.info('Contiguized')
    if isinstance(featdat, np.ndarray) and not featdat.flags.c_contiguous:
      logging.info('Contiguizing feature data')
      featdat = np.ascontiguousarray(featdat)

    # Feature types are only passed when the module-level FTYPES flag is on.
    if FTYPES:
      kwargs['feature_types'] = self.feature_types

    return xgb.DMatrix(featdat, **kwargs)
项目:aq_weather    作者:eliucidate    | 项目源码 | 文件源码
def main():
  """Draw ROC curves for an RBF SVM, one curve per value of j in 1..5.

  For each j the data is loaded with ``load_file(file_name, j)``, turned
  into labels by ``transformtolabel`` and split 50/50; the curve of
  column 1 is added to a single shared figure.
  """
  plt.figure()
  for j in range(1, 6):
    rng = np.random.RandomState(0)
    X, y = load_file(file_name, j)
    label_rows, ave = transformtolabel(y, 2)
    targets = np.array(label_rows)

    # shuffle and split training and test sets
    split = train_test_split(X, targets, test_size=.5, random_state=0)
    X_train, X_test, y_train, y_test = split

    # Learn to predict each class against the other
    svc = svm.SVC(kernel='rbf', probability=True, random_state=rng)
    y_score = OneVsRestClassifier(svc).fit(X_train, y_train).decision_function(X_test)

    # ROC curve and area for each of the two label columns
    fpr, tpr, roc_auc = {}, {}, {}
    for col in range(2):
      fpr[col], tpr[col], _ = roc_curve(y_test[:, col], y_score[:, col])
      roc_auc[col] = auc(fpr[col], tpr[col])

    # Micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Only the curve of column 1 is drawn
    curve_label = ('O3 prediction (area = %0.2f)' % roc_auc[1]) + ('(%0.0f' % j) + ' features)'
    plt.plot(fpr[1], tpr[1], label=curve_label)

  # Chance diagonal and axis decoration, once for the whole figure
  plt.plot([0, 1], [0, 1], 'k--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.0])
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Receiver operating characteristic for SVM')
  plt.legend(loc="lower right")
  plt.show()
项目:sport-news-retrieval    作者:Andyccs    | 项目源码 | 文件源码
def gensim_classifier():
  """Train a one-vs-rest LinearSVC on averaged Word2Vec tweet vectors.

  Writes the predictions to ``data/w2v_linsvc.csv``, stores the fitted
  classifier in ``model/w2v_linsvc.pkl`` and passes the binarized
  predictions, binarized labels and decision scores to ``evaluate``.
  """
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = [tweet.split() for tweet in tweet_list]

  # parameters for model
  num_features = 100
  min_word_count = 1
  num_workers = 4
  context = 2
  downsampling = 1e-3

  # Initialize and train the model
  w2v_model = Word2Vec(sentences, workers=num_workers,
                       size=num_features, min_count=min_word_count,
                       window=context, sample=downsampling, seed=1)

  # NOTE(review): this train_test_split takes (ratio, data) and returns the
  # split index as well, so it is a project helper rather than
  # scikit-learn's function of the same name -- confirm.
  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
  # NOTE(review): the test vectors are imputed with statistics fitted on
  # the test set itself rather than on the training set; left unchanged.
  train_vector = Imputer().fit_transform(train_vector)
  test_vector = Imputer().fit_transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the model to mmap-able files
  create_directory('model')
  # Persist the fitted one-vs-rest classifier.  ``model`` is only the
  # template: OneVsRestClassifier fits clones of it, so ``model`` itself
  # stays unfitted and dumping it would save an untrained estimator.
  joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)

  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')