The following 18 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.matthews_corrcoef().
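Before the project-specific examples, here is a minimal, self-contained sketch of the basic call (the label arrays below are made up purely for illustration): matthews_corrcoef(y_true, y_pred) compares true and predicted labels and returns a single coefficient between -1 and 1, where 1 means perfect prediction, 0 means no better than random, and -1 means total disagreement.

import numpy as np
from sklearn.metrics import matthews_corrcoef

# Illustrative labels only; any pair of equal-length label arrays works.
y_true = np.array([1, 1, 0, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])

mcc = matthews_corrcoef(y_true, y_pred)  # single float in [-1, 1]
print("MCC: %.3f" % mcc)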
def learn_decision_tree(data):
    DT = tree.DecisionTreeClassifier(max_depth=7)
    scorer = make_scorer(matthews_corrcoef)
    for i in range(5):
        scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
        print("iteration", i, "dt mean:", scores.mean())
    scores = list(scores)
    print("Decision Tree train scores:\n", scores)
    return DT
    # DT = DT.fit(train_data[:, :-1], train_data[:, -1])
    # predictionsDT = DT.predict(validation_data[:, :-1])
    # validating predictions
    # dtError = 0
    # for i in range(0, len(validation_data)):
    #     if(validation_data[i][20] != predictionsDT[i]):
    #         dtError = dtError + 1
    # print("DT Error : ", float(dtError)/len(validation_data)*100.0)
def analyzeResult_temp(data, model, DataVecs):
    predict = model.predict(DataVecs)
    data['predict'] = predict
    print("Accuracy: %f %%" % (100. * sum(data["label"] == data["predict"]) / len(data["label"])))
    answer1 = data[data["label"] == 1]
    answer2 = data[data["label"] == 0]
    print("Positive Accuracy: %f %%" % (100. * sum(answer1["label"] == answer1["predict"]) / len(answer1["label"])))
    print("Negative Accuracy: %f %%" % (100. * sum(answer2["label"] == answer2["predict"]) / len(answer2["label"])))
    try:
        result_auc = model.predict_proba(DataVecs)
        print("Roc:%f\nAUPR:%f\n" % (roc_auc_score(data["label"], result_auc[:, 1]),
                                     average_precision_score(data["label"], result_auc[:, 1])))
        print("Precision:%f\nRecall:%f\nF1score:%f\nMCC:%f\n" % (precision_score(data["label"], data["predict"]),
                                                                 recall_score(data["label"], data["predict"]),
                                                                 f1_score(data["label"], data["predict"]),
                                                                 matthews_corrcoef(data["label"], data["predict"])))
    except Exception:
        print("ROC unavailable")

# Performance evaluation and result analysis using adjusted thresholds
def analyzeResult(data, model, DataVecs, threshold):
    predict = model.predict_proba(DataVecs)[:, 1]
    # Binarize the predicted probabilities with the supplied threshold (stored as 0/1).
    data['predict'] = (predict > threshold).astype(int)
    print("Accuracy: %f %%" % (100. * sum(data["label"] == data["predict"]) / len(data["label"])))
    answer1 = data[data["label"] == 1]
    answer2 = data[data["label"] == 0]
    print("Positive Accuracy: %f %%" % (100. * sum(answer1["label"] == answer1["predict"]) / len(answer1["label"])))
    print("Negative Accuracy: %f %%" % (100. * sum(answer2["label"] == answer2["predict"]) / len(answer2["label"])))
    try:
        result_auc = model.predict_proba(DataVecs)
        print("Roc:%f\nAUPR:%f\n" % (roc_auc_score(data["label"], result_auc[:, 1]),
                                     average_precision_score(data["label"], result_auc[:, 1])))
        print("Precision:%f\nRecall:%f\nF1score:%f\nMCC:%f\n" % (precision_score(data["label"], data["predict"]),
                                                                 recall_score(data["label"], data["predict"]),
                                                                 f1_score(data["label"], data["predict"]),
                                                                 matthews_corrcoef(data["label"], data["predict"])))
    except Exception:
        print("ROC unavailable")

# Performance evaluation
def __init__(self, name, classifier=None, number_gen=20, verbose=0, repeat=1,
             parallel=False, make_logbook=False, random_state=None,
             cv_metric_function=make_scorer(matthews_corrcoef),
             features_metric_function=None):
    self._name = name
    self.estimator = SVC(kernel='linear', max_iter=10000) if classifier is None else clone(classifier)
    self.number_gen = number_gen
    self.verbose = verbose
    self.repeat = repeat
    self.parallel = parallel
    self.make_logbook = make_logbook
    self.random_state = random_state
    self.cv_metric_function = cv_metric_function
    self.features_metric_function = features_metric_function
    self._random_object = check_random_state(self.random_state)
    random.seed(self.random_state)
def test_confusion_matrix_binary():
    # Test confusion matrix - binary classification case
    y_true, y_pred, _ = make_prediction(binary=True)

    def test(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        assert_array_equal(cm, [[22, 3], [8, 17]])

        tp, fp, fn, tn = cm.flatten()
        num = (tp * tn - fp * fn)
        den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        true_mcc = 0 if den == 0 else num / den
        mcc = matthews_corrcoef(y_true, y_pred)
        assert_array_almost_equal(mcc, true_mcc, decimal=2)
        assert_array_almost_equal(mcc, 0.57, decimal=2)

    test(y_true, y_pred)
    test([str(y) for y in y_true],
         [str(y) for y in y_pred])
def leave_one_out_report(combined_results):
    """ Evaluate leave-one-out CV results from different methods.

    Arguments:
    combined_results: list of tuples of the form
        (method_name, true_y_vector, predicted_probabilities_vector)

    Note the vectors really do need to be numpy arrays.

    Returns: formatted report as string
    """
    ###
    # Unfortunate code duplication with tabulate_metrics here,
    # to be resolved later
    probability_metrics = [
        ('AUC', roc_auc_score),
        ('AP', metrics.average_precision_score)
    ]
    binary_metrics = [
        ('F1', metrics.f1_score),
        ('MCC', metrics.matthews_corrcoef),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score)
    ]
    metric_results = {label: [] for label, _ in probability_metrics + binary_metrics}
    metric_results.update({'tn': [], 'fp': [], 'fn': [], 'tp': []})
    for label, metric in probability_metrics:
        for fold, y_true, y_pred in combined_results:
            metric_results[label].append(metric(y_true, y_pred))
    for method, y_true, probabilities in combined_results:
        y_pred = probabilities > 0.5
        for label, metric in binary_metrics:
            metric_results[label].append(metric(y_true, y_pred))
        conf = zip(
            ('tn', 'fp', 'fn', 'tp'),
            metrics.confusion_matrix(y_true, y_pred).flat
        )
        for label, n in conf:
            metric_results[label].append(n)
    index = [t[0] for t in combined_results]
    table = pd.DataFrame(data=metric_results, index=index)
    report = table.to_string(float_format=lambda x: '%.3g' % x)
    return report
def score_func(estimator, X, Y):
    global accuracy, precision, recall, f1, mcc, auc, aupr, resultpredict, resultproba, resultlabel
    predict_proba = estimator.predict_proba(X)[:, 1]
    # Binarize the probabilities at a fixed 0.50 threshold (stored as 0/1).
    predict = (predict_proba > 0.50).astype(int)
    resultlabel = np.hstack((resultlabel, Y))
    resultpredict = np.hstack((resultpredict, predict))
    resultproba = np.hstack((resultproba, predict_proba))
    precision += precision_score(Y, predict)
    recall += recall_score(Y, predict)
    f1 += f1_score(Y, predict)
    accuracy += accuracy_score(Y, predict)
    mcc += matthews_corrcoef(Y, predict)
    auc += roc_auc_score(Y, predict_proba)
    aupr += average_precision_score(Y, predict_proba)
    print("finish one")
    return matthews_corrcoef(Y, predict)

# Performance evaluation
def score_function(y_test, yfit):
    precision = precision_score(y_test, yfit)
    recall = recall_score(y_test, yfit)
    f1 = f1_score(y_test, yfit)
    mcc = matthews_corrcoef(y_test, yfit)
    return precision, recall, f1, mcc

# Randomly sample balanced sizes of positive and negative samples, used only when data balance is required
def mcc(y, z, round=True):
    """Compute the Matthews correlation coefficient."""
    if round:
        y = np.round(y)
        z = np.round(z)
    return skm.matthews_corrcoef(y, z)
def test_matthews_corrcoef_nan():
    assert_equal(matthews_corrcoef([0], [1]), 0.0)
    assert_equal(matthews_corrcoef([0, 0], [0, 1]), 0.0)
def test_matthews_corrcoef_against_numpy_corrcoef():
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=20)
    y_pred = rng.randint(0, 2, size=20)

    assert_almost_equal(matthews_corrcoef(y_true, y_pred),
                        np.corrcoef(y_true, y_pred)[0, 1], 10)
def test_matthews_corrcoef():
    rng = np.random.RandomState(0)
    y_true = ["a" if i == 0 else "b" for i in rng.randint(0, 2, size=20)]

    # corrcoef of same vectors must be 1
    assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0)

    # corrcoef, when the two vectors are opposites of each other, should be -1
    y_true_inv = ["b" if i == "a" else "a" for i in y_true]
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1)

    y_true_inv2 = label_binarize(y_true, ["a", "b"]) * -1
    assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1)

    # For the zero vector case, the corrcoef cannot be calculated and should
    # result in a RuntimeWarning
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, [0, 0, 0, 0], [0, 0, 0, 0])

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # And also for any other vector with 0 variance
    mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered',
                               matthews_corrcoef, y_true,
                               rng.randint(-100, 100) * np.ones(20, dtype=int))

    # But will output 0
    assert_almost_equal(mcc, 0.)

    # These two vectors have 0 correlation and hence mcc should be 0
    y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
    y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1]
    assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.)

    # Check that sample weight is able to selectively exclude
    mask = [1] * 10 + [0] * 10
    # Now the first half of the vector elements are alone given a weight of 1
    # and hence the mcc will not be a perfect 0 as in the previous case
    assert_raises(AssertionError, assert_almost_equal,
                  matthews_corrcoef(y_1, y_2, sample_weight=mask), 0.)
def clf_metrics(p_train, p_test, y_train, y_test):
    """ Compute metrics on classifier predictions

    Parameters
    ----------
    p_train : np.array [n_samples]
        predicted probabilities for training set
    p_test : np.array [n_samples]
        predicted probabilities for testing set
    y_train : np.array [n_samples]
        Training labels.
    y_test : np.array [n_samples]
        Testing labels.

    Returns
    -------
    clf_scores : dict
        classifier scores for training set
    """
    y_pred_train = 1*(p_train >= 0.5)
    y_pred_test = 1*(p_test >= 0.5)

    train_scores = {}
    test_scores = {}

    train_scores['accuracy'] = metrics.accuracy_score(y_train, y_pred_train)
    test_scores['accuracy'] = metrics.accuracy_score(y_test, y_pred_test)

    train_scores['mcc'] = metrics.matthews_corrcoef(y_train, y_pred_train)
    test_scores['mcc'] = metrics.matthews_corrcoef(y_test, y_pred_test)

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_train, y_pred_train)
    train_scores['precision'] = p
    train_scores['recall'] = r
    train_scores['f1'] = f
    train_scores['support'] = s

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_test, y_pred_test)
    test_scores['precision'] = p
    test_scores['recall'] = r
    test_scores['f1'] = f
    test_scores['support'] = s

    train_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_train, y_pred_train, labels=[0, 1])
    test_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_test, y_pred_test, labels=[0, 1])

    train_scores['auc score'] = \
        metrics.roc_auc_score(y_train, p_train + 1, average='weighted')
    test_scores['auc score'] = \
        metrics.roc_auc_score(y_test, p_test + 1, average='weighted')

    clf_scores = {'train': train_scores, 'test': test_scores}

    return clf_scores
def melodiness_metrics(m_train, m_test, y_train, y_test):
    """ Compute metrics on melodiness score

    Parameters
    ----------
    m_train : np.array [n_samples]
        melodiness scores for training set
    m_test : np.array [n_samples]
        melodiness scores for testing set
    y_train : np.array [n_samples]
        Training labels.
    y_test : np.array [n_samples]
        Testing labels.

    Returns
    -------
    melodiness_scores : dict
        melodiness scores for training set
    """
    m_bin_train = 1*(m_train >= 1)
    m_bin_test = 1*(m_test >= 1)

    train_scores = {}
    test_scores = {}

    train_scores['accuracy'] = metrics.accuracy_score(y_train, m_bin_train)
    test_scores['accuracy'] = metrics.accuracy_score(y_test, m_bin_test)

    train_scores['mcc'] = metrics.matthews_corrcoef(y_train, m_bin_train)
    test_scores['mcc'] = metrics.matthews_corrcoef(y_test, m_bin_test)

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_train, m_bin_train)
    train_scores['precision'] = p
    train_scores['recall'] = r
    train_scores['f1'] = f
    train_scores['support'] = s

    (p, r, f, s) = metrics.precision_recall_fscore_support(y_test, m_bin_test)
    test_scores['precision'] = p
    test_scores['recall'] = r
    test_scores['f1'] = f
    test_scores['support'] = s

    train_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_train, m_bin_train, labels=[0, 1])
    test_scores['confusion matrix'] = \
        metrics.confusion_matrix(y_test, m_bin_test, labels=[0, 1])

    train_scores['auc score'] = \
        metrics.roc_auc_score(y_train, m_train + 1, average='weighted')
    test_scores['auc score'] = \
        metrics.roc_auc_score(y_test, m_test + 1, average='weighted')

    melodiness_scores = {'train': train_scores, 'test': test_scores}

    return melodiness_scores
def tabulate_metrics(cv_results, name):
    """
    Calculate accuracy metrics from probabilities, format them.

    Given a list of tuples, each of the form (index,
    vector_of_true_outcomes, vector_of_predicted_probabilities),
    for each index (representing one fold of CV) assess multiple
    accuracy metrics (eg ROC AUC, F1 score, positive predictive value)
    for the predicted probabilities WRT the true outcomes (for that
    fold's test set.) Also take the median across all folds.

    Then format these nicely into a table (labeled with the given name)
    and return that, as a string.

    For metrics which require a binary prediction, a threshold of 0.5
    is used.
    """
    # Each of the metric functions should take two non-optional
    # arguments, y_true and y_pred.

    # These accept predicted probabilities.
    probability_metrics = [
        ('AUC', roc_auc_score),
        ('AP', metrics.average_precision_score)
    ]
    # These need binary predictions
    binary_metrics = [
        ('F1', metrics.f1_score),
        ('MCC', metrics.matthews_corrcoef),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score)
    ]
    # Mutual information? Odds ratios?
    results = {label: [] for label, _ in probability_metrics + binary_metrics}
    results.update({'tn': [], 'fp': [], 'fn': [], 'tp': []})
    for label, metric in probability_metrics:
        for fold, y_true, y_pred in cv_results:
            results[label].append(metric(y_true, y_pred))
    for fold, y_true, probabilities in cv_results:
        y_pred = probabilities > 0.5
        for label, metric in binary_metrics:
            results[label].append(metric(y_true, y_pred))
        conf = zip(
            ('tn', 'fp', 'fn', 'tp'),
            metrics.confusion_matrix(y_true, y_pred).flat
        )
        for label, n in conf:
            results[label].append(n)
    index = ['fold_%d' % i for i, _, _ in cv_results]
    table = pd.DataFrame(data=results, index=index)
    table.loc['median/sum'] = 0.
    for k, _ in probability_metrics + binary_metrics:
        table.loc['median/sum', k] = np.median(results[k])
    for k in ('tn', 'fp', 'fn', 'tp'):
        table.loc['median/sum', k] = np.sum(results[k])
    report = table.to_string(float_format=lambda x: '%.3g' % x)
    report = ('%s: \n' % name) + report
    return report
def score(self, y_predicted, y_target, y_prob=None):
    """ Compute metrics on classifier predictions

    Parameters
    ----------
    y_predicted : np.array [n_samples]
        Predicted class labels
    y_target : np.array [n_samples]
        Target class labels
    y_prob : np.array [n_samples] or None, default=None
        Predicted probabilities. If None, auc is not computed

    Returns
    -------
    scores : dict
        dictionary of scores for the following metrics:
        accuracy, matthews correlation coefficient, precision,
        recall, f1, support, confusion matrix, auc score
    """
    labels = set(y_target)
    labels.update(y_predicted)
    is_binary = len(labels) <= 2

    scores = {}
    scores['accuracy'] = metrics.accuracy_score(y_target, y_predicted)
    if is_binary:
        scores['mcc'] = metrics.matthews_corrcoef(y_target, y_predicted)
    else:
        scores['mcc'] = None
    (scores['precision'],
     scores['recall'],
     scores['f1'],
     scores['support']) = metrics.precision_recall_fscore_support(
         y_target, y_predicted
     )
    scores['confusion matrix'] = metrics.confusion_matrix(
        y_target, y_predicted, labels=list(labels)
    )
    if y_prob is not None:
        scores['auc score'] = metrics.roc_auc_score(
            y_target, y_prob + 1, average='weighted'
        )
    else:
        scores['auc score'] = None

    return scores

###############################################################################
def error_rate(output_file):
    count = 0
    for line in open(output_file):
        count += 1
        print(line)
    print(count)

    cnt = 0
    error = 0
    y = []
    y_predict = []
    for line in open(output_file):
        cnt += 1
        print(float(line.split()[4]))
        y_predict.append(float(line.split()[4]))
        if cnt <= 48:
            y.append(1)
        else:
            y.append(0)

    y_predict = np.array(y_predict)
    y = np.array(y)
    print("y_predict")
    print(y_predict)
    print("y")
    print(y)

    false_positive_rate, true_positive_rate, thresholds = roc_curve(y, y_predict)
    print("AUC is :")
    print(auc(false_positive_rate, true_positive_rate))
    print(roc_auc_score(y, y_predict))
    print("--------------------------------------------")
    print("MCC is :")
    print(matthews_corrcoef(y, [round(x) for x in y_predict]))
    print("--------------------------------------------")
    print("Accuracy is :")
    print(accuracy_score(y, [round(x) for x in y_predict]))

    testRounded = [round(x) for x in y_predict]
    # print(testRounded)
    # print(count)
    cnt = 0
    for i in range(count):
        cnt += abs(testRounded[i] - y[i])
    error_rate = (cnt / count) * 100
    print("Test error rate is: %.4f%%" % error_rate)
    print("Accuracy is: %.4f%%" % (100 - error_rate))