def report_metrics(y_dset, y_pred, batch_size, dset='Val'):
    """Log AP, ROC-AUC and recall within the top-k% ranked predictions.

    Only the first ``(n_samples // batch_size) * batch_size`` rows of
    ``y_dset`` are scored, i.e. rows beyond the last full batch are dropped.
    NOTE(review): this presumably mirrors a prediction loop that only runs
    full batches, so ``y_pred`` should hold exactly that many rows -- confirm
    against the caller.

    Args:
        y_dset: 2-D array of ground-truth labels (sliced as ``[rows, :]``).
        y_pred: array of predicted scores; flattened before scoring.
        batch_size: batch size used to truncate ``y_dset``.
        dset: prefix used in the log messages.

    Returns:
        Tuple ``(average_precision, recall_in_top_50_percent)``.
    """
    # Print additional metrics involving predictions
    # Floor division keeps n_rows an integer so it is usable as a slice bound
    # (true division would produce a float on Python 3).
    n_rows = (y_dset.shape[0] // batch_size) * batch_size
    y_true = y_dset[0:n_rows, :].flatten()
    y_pred = y_pred.flatten()

    val_ap = average_precision_score(y_true, y_pred)
    val_roc = roc_auc_score(y_true, y_pred)

    n = y_true.size
    n_pos = y_true.sum()
    # Indices of the predictions, highest score first.
    idx_sorted = np.argsort(-y_pred)
    val_rec = []

    logging.info(dset + "-AP {:.6f}".format(val_ap))
    logging.info(dset + "-ROC {:.6f}".format(val_roc))
    for i, v in enumerate([10, 25, 50, 75, 100]):
        # Positives found among the top v% of the ranking.
        tp = y_true[idx_sorted[:int(v * n / 100)]].sum()
        val_rec.append(tp * 1.0 / n_pos)
        logging.info(dset + "-R{} {:.6f}".format(v, val_rec[i]))
    return val_ap, val_rec[2]


# ############################## Main program #################################
def analyzeResult_temp(data, model, DataVecs):
    """Print classification metrics for ``model`` evaluated on ``DataVecs``.

    The hard predictions are stored in ``data['predict']`` (the caller's
    ``data`` is modified in place) and compared against ``data['label']``.

    Args:
        data: table-like object with a ``"label"`` column holding 1 / 0.
        model: fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        DataVecs: feature matrix passed to the model.
    """
    data['predict'] = model.predict(DataVecs)
    print("Accuracy: %f %%" % (100. * sum(data["label"] == data["predict"]) / len(data["label"])))

    answer1 = data[data["label"] == 1]
    answer2 = data[data["label"] == 0]
    print("Positive Accuracy: %f %%" % (100. * sum(answer1["label"] == answer1["predict"]) / len(answer1["label"])))
    print("Negative Accuracy: %f %%" % (100. * sum(answer2["label"] == answer2["predict"]) / len(answer2["label"])))

    try:
        result_auc = model.predict_proba(DataVecs)
        print("Roc:%f\nAUPR:%f\n" % (roc_auc_score(data["label"], result_auc[:, 1]),
                                     average_precision_score(data["label"], result_auc[:, 1])))
        print("Precision:%f\nRecall:%f\nF1score:%f\nMCC:%f\n" % (precision_score(data["label"], data["predict"]),
                                                                 recall_score(data["label"], data["predict"]),
                                                                 f1_score(data["label"], data["predict"]),
                                                                 matthews_corrcoef(data["label"], data["predict"])))
    except Exception:
        # Best effort: the score-based metrics are skipped when they cannot
        # be computed. ``Exception`` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("ROC unavailable")


# Performance evaluation and result analysis using adjusted thresholds
def analyzeResult(data, model, DataVecs, threshold):
    """Print classification metrics using a custom decision threshold.

    The probability in column 1 of ``model.predict_proba(DataVecs)`` is
    compared against ``threshold``; the resulting boolean predictions are
    stored in ``data['predict']`` (the caller's ``data`` is modified in
    place) and compared against ``data['label']``.

    Args:
        data: table-like object with a ``"label"`` column holding 1 / 0.
        model: fitted estimator exposing ``predict_proba``.
        DataVecs: feature matrix passed to the model.
        threshold: scores strictly above this value are predicted positive.
    """
    # Scores are computed once and reused for both thresholding and ranking
    # metrics.
    scores = model.predict_proba(DataVecs)[:, 1]
    data['predict'] = (scores > threshold)
    print("Accuracy: %f %%" % (100. * sum(data["label"] == data["predict"]) / len(data["label"])))

    answer1 = data[data["label"] == 1]
    answer2 = data[data["label"] == 0]
    print("Positive Accuracy: %f %%" % (100. * sum(answer1["label"] == answer1["predict"]) / len(answer1["label"])))
    print("Negative Accuracy: %f %%" % (100. * sum(answer2["label"] == answer2["predict"]) / len(answer2["label"])))

    try:
        print("Roc:%f\nAUPR:%f\n" % (roc_auc_score(data["label"], scores),
                                     average_precision_score(data["label"], scores)))
        print("Precision:%f\nRecall:%f\nF1score:%f\nMCC:%f\n" % (precision_score(data["label"], data["predict"]),
                                                                 recall_score(data["label"], data["predict"]),
                                                                 f1_score(data["label"], data["predict"]),
                                                                 matthews_corrcoef(data["label"], data["predict"])))
    except Exception:
        # Best effort: skip these metrics when they cannot be computed.
        # ``Exception`` lets KeyboardInterrupt / SystemExit propagate.
        print("ROC unavailable")


# Performance evaluation
def get_scores(clf, X_t_train, y_train, X_t_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_t_train: training features.
        y_train: training targets.
        X_t_test: test features.
        y_test: test targets.

    Returns:
        dict with the keys ``'F2 Score'`` (sample-averaged F-beta, beta=2),
        ``'ROC_AUC'``, ``'P_AUPR'``, ``'Precision'`` and ``'Recall'``
        (the last two micro-averaged).
    """
    clf.fit(X_t_train, y_train)
    # Predict once and reuse the result for every metric.
    y_pred = clf.predict(X_t_test)
    y_proba = clf.predict_proba(X_t_test)

    app = dict()
    app['F2 Score'] = fbeta_score(y_test, y_pred, beta=2, average='samples')
    app['ROC_AUC'] = roc_auc_score(y_test, y_proba)
    # NOTE(review): average precision is computed from the hard predictions,
    # not from ``y_proba`` -- confirm that this is intended.
    app['P_AUPR'] = average_precision_score(y_test, y_pred)
    app['Precision'] = precision_score(y_test, y_pred, average='micro')
    app['Recall'] = recall_score(y_test, y_pred, average='micro')
    return app
def _average_precision(y_true, y_score):
    """Alternative implementation to check for correctness of
    `average_precision_score`.

    Items are ranked by decreasing ``y_score``; the result is the mean, over
    the positive items, of the precision measured at each positive's rank.

    Args:
        y_true: 1-D numpy array of labels. The positive label is the second
            entry of ``np.unique(y_true)``, so at least two distinct labels
            are required (an IndexError is raised otherwise).
        y_score: 1-D numpy array of scores, same length as ``y_true``.

    Returns:
        The average precision as a float.
    """
    pos_label = np.unique(y_true)[1]
    n_pos = np.sum(y_true == pos_label)
    order = np.argsort(y_score)[::-1]
    y_true = y_true[order]

    score = 0
    n_hits = 0.0
    for rank, label in enumerate(y_true, start=1):
        if label == pos_label:
            # Precision up to this document, i.e. the fraction of relevant
            # documents among the first ``rank`` ones. A running hit count
            # avoids rescanning the prefix for every positive.
            n_hits += 1.0
            score += n_hits / rank
    return score / n_pos
def _test_precision_recall_curve(y_true, probas_pred):
    """Check `precision_recall_curve` and the derived PR AUC on one problem."""
    # Test Precision-Recall and area under PR curve
    precision, recall, cutoffs = precision_recall_curve(y_true, probas_pred)
    pr_auc = auc(recall, precision)
    assert_array_almost_equal(pr_auc, 0.85, 2)
    assert_array_almost_equal(pr_auc,
                              average_precision_score(y_true, probas_pred))
    assert_almost_equal(_average_precision(y_true, probas_pred), pr_auc, 1)
    # The curve carries one more point than there are thresholds.
    assert_equal(precision.size, recall.size)
    assert_equal(precision.size, cutoffs.size + 1)

    # Smoke test in the case of proba having only one value
    constant_scores = np.zeros_like(probas_pred)
    precision, recall, cutoffs = precision_recall_curve(y_true, constant_scores)
    pr_auc = auc(recall, precision)
    assert_array_almost_equal(pr_auc, 0.75, 3)
    assert_equal(precision.size, recall.size)
    assert_equal(precision.size, cutoffs.size + 1)
def multilabel_precision_recall(y_score, y_test, clf_target_ids, clf_target_names):
    """Compute per-target and averaged precision-recall statistics.

    Only the target ids that actually occur in ``y_test`` are evaluated.

    Args:
        y_score: 2-D score array whose column ``j`` belongs to
            ``clf_target_ids[j]``.
        y_test: true target id of every sample.
        clf_target_ids: target ids, in score-column order.
        clf_target_names: display names, parallel to ``clf_target_ids``.

    Returns:
        Tuple ``(precision, recall, average_precision)`` of dicts. The first
        two are keyed by target name plus ``"average"`` (curve over the
        flattened labels / scores); the third is keyed by target name plus
        ``"micro"`` and ``"macro"``.
    """
    from sklearn.metrics import average_precision_score
    from sklearn.metrics import precision_recall_curve
    from sklearn.preprocessing import label_binarize

    precision, recall, average_precision = {}, {}, {}

    id_to_name = dict(zip(clf_target_ids, clf_target_names))
    id_to_column = {tid: col for col, tid in enumerate(clf_target_ids)}

    # Restrict the evaluation to targets present in the ground truth.
    present_ids = np.unique(y_test)
    present_columns = np.int64([id_to_column[tid] for tid in present_ids])

    # One indicator column per present target.
    # NOTE(review): with exactly two present targets label_binarize is
    # expected to return a single column -- confirm this case is handled
    # upstream.
    y_test_multi = label_binarize(y_test, classes=present_ids)

    for pos, tid in enumerate(present_ids):
        column = id_to_column[tid]
        name = id_to_name[tid]
        truth = y_test_multi[:, pos]
        scores = y_score[:, column]
        precision[name], recall[name], _ = precision_recall_curve(truth, scores)
        average_precision[name] = average_precision_score(truth, scores)

    # Averaged precision-recall curve and average-precision summaries over
    # the present targets.
    present_scores = y_score[:, present_columns]
    precision["average"], recall["average"], _ = precision_recall_curve(
        y_test_multi.ravel(), present_scores.ravel())
    for averaging in ("micro", "macro"):
        average_precision[averaging] = average_precision_score(
            y_test_multi, present_scores, average=averaging)

    return precision, recall, average_precision
def plot_precision_recall(indir, gts_file, outdir):
    """Plot one precision-recall curve per prediction file found in ``indir``.

    Every ``*.csv`` file in ``indir`` is read with ``read_preds``; items that
    also appear in the ground truth are scored, with the tag ``"s"`` treated
    as the positive class. All curves are drawn on one figure, saved as
    ``outdir + "precision_recall.png"``.

    Args:
        indir: directory containing the prediction files.
        gts_file: ground-truth file, read with ``read_item_tag``.
        outdir: output location; it is concatenated directly with the file
            name, so it has to end with a path separator.
    """
    groundtruths = read_item_tag(gts_file)
    plt.figure(1)
    indir = utils.abs_path_dir(indir)
    for item in os.listdir(indir):
        # Require a real ".csv" suffix: the legend label below strips the
        # last four characters of the file name.
        if not item.endswith(".csv"):
            continue
        isrcs = read_preds(os.path.join(indir, item))
        test_groundtruths = []
        predictions = []
        for isrc in isrcs:
            if isrc in groundtruths:
                test_groundtruths.append(groundtruths[isrc])
                predictions.append(isrcs[isrc])
        test_groundtruths = [tag == "s" for tag in test_groundtruths]
        precision, recall, _ = precision_recall_curve(test_groundtruths, predictions)
        ap = round(average_precision_score(test_groundtruths, predictions), 3)
        plt.plot(recall, precision, label=item[:-4] + " (" + str(ap) + ")")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.title('Precision-Recall curve for Algo (AUC)')
    plt.legend(loc='best')
    plt.savefig(outdir + "precision_recall.png", dpi=200, bbox_inches="tight")
    # plt.show()
    plt.close()
    utils.print_success("Precision-Recall curve created in " + outdir)
def plot_pr(gold, predicted_prob, lb):
    """Add a labelled precision-recall curve to the current matplotlib axes.

    Args:
        gold: true binary labels.
        predicted_prob: 2-D array; column 1 is used as the positive score.
        lb: legend prefix; the average precision is appended to it.
    """
    positive_scores = predicted_prob[:, 1]  # prob for class 1
    precision, recall, _ = precision_recall_curve(gold, positive_scores)
    ap = average_precision_score(gold, positive_scores)
    curve_label = lb + ' (area = {0:0.2f})'.format(ap)

    plt.plot(recall, precision, label=curve_label)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision and Recall')
    plt.legend(loc="upper right")
    # plt.show()
def eval_clf(gold, clf, mat, start=0):
    """Return the average precision of ``clf`` on rows ``start:`` of ``mat``.

    Args:
        gold: true labels, aligned with the rows of ``mat``.
        clf: fitted classifier exposing ``predict_proba``.
        mat: feature matrix.
        start: index of the first row to evaluate.
    """
    # Column 1 of the probability matrix is used as the positive score.
    positive_scores = clf.predict_proba(mat[start:, :])[:, 1]
    return average_precision_score(gold[start:], positive_scores)
def video_mean_ap(score_dict, video_list):
    """Compute the macro-averaged AP over the videos that have scores.

    Args:
        score_dict: mapping from video id to a per-class score vector.
        video_list: video records exposing ``id`` and ``instances`` (each
            instance exposing ``num_label``). Videos whose id is missing
            from ``score_dict`` are ignored.

    Returns:
        The macro-averaged average precision.
    """
    scored_videos = [v for v in video_list if v.id in score_dict]
    label_sets = [{inst.num_label for inst in v.instances} for v in scored_videos]
    pred_array = np.array([score_dict[v.id] for v in scored_videos])

    # Multi-hot ground truth with the same shape as the predictions.
    gt_array = np.zeros(pred_array.shape)
    for row, labels in enumerate(label_sets):
        gt_array[row, list(labels)] = 1

    return average_precision_score(gt_array, pred_array, average='macro')
def get_roc_score(edges_pos, edges_neg, emb=None):
    """Score link prediction on positive and negative edge samples.

    Each edge ``(i, j)`` is scored with ``sigmoid(emb[i] . emb[j])``.

    Args:
        edges_pos: iterable of index pairs for existing edges.
        edges_neg: iterable of index pairs for non-edges.
        emb: node embedding matrix. When ``None`` it is obtained by running
            ``model.z_mean`` in the module-level session ``sess`` with
            dropout set to 0 (this updates the module-level ``feed_dict``).

    Returns:
        Tuple ``(roc_auc, average_precision)``.
    """
    if emb is None:
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Predict on test set of edges
    adj_rec = np.dot(emb, emb.T)
    preds = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_pos]
    preds_neg = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_neg]

    preds_all = np.hstack([preds, preds_neg])
    # One label per prediction: the negative block must be sized by the
    # number of negative edges, which may differ from the positive count.
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])

    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score
def leave_one_out_report(combined_results):
    """
    Evaluate leave-one-out CV results from different methods.

    Arguments:
      combined_results: list of tuples of the form
        (method_name, true_y_vector, predicted_probabilities_vector)

    Note the vectors really do need to be numpy arrays.

    Returns: formatted report as string, one row per method. Metrics that
    need binary predictions threshold the probabilities at 0.5.
    """
    ###
    # Unfortunate code duplication with tabulate_metrics here,
    # to be resolved later
    probability_metrics = [
        ('AUC', roc_auc_score),
        ('AP', metrics.average_precision_score),
    ]
    binary_metrics = [
        ('F1', metrics.f1_score),
        ('MCC', metrics.matthews_corrcoef),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
    ]
    confusion_labels = ('tn', 'fp', 'fn', 'tp')

    # One column per metric, filled row by row below.
    metric_results = {}
    for label, _ in probability_metrics + binary_metrics:
        metric_results[label] = []
    for label in confusion_labels:
        metric_results[label] = []

    for label, metric in probability_metrics:
        column = metric_results[label]
        for _, y_true, probabilities in combined_results:
            column.append(metric(y_true, probabilities))

    for _, y_true, probabilities in combined_results:
        y_pred = probabilities > 0.5
        for label, metric in binary_metrics:
            metric_results[label].append(metric(y_true, y_pred))
        counts = metrics.confusion_matrix(y_true, y_pred).flat
        for label, n in zip(confusion_labels, counts):
            metric_results[label].append(n)

    method_names = [entry[0] for entry in combined_results]
    table = pd.DataFrame(data=metric_results, index=method_names)
    return table.to_string(float_format=lambda x: '%.3g' % x)
def avg_precision(predictions_proba, _, labels, parameters):
    """Return the average precision of ``predictions_proba`` against ``labels``.

    The second positional argument and ``parameters`` are accepted but not
    used.
    """
    ap = metrics.average_precision_score(labels, predictions_proba)
    return ap
def mean_ap(distmat, query_ids=None, gallery_ids=None,
            query_cams=None, gallery_cams=None):
    """Compute the mean average precision of a query-by-gallery distance matrix.

    Args:
        distmat: distance matrix with one row per query and one column per
            gallery item (converted with ``to_numpy``).
        query_ids: identity of each query; defaults to ``0..m-1``.
        gallery_ids: identity of each gallery item; defaults to ``0..n-1``.
        query_cams: camera of each query; defaults to all zeros.
        gallery_cams: camera of each gallery item; defaults to all ones.

    Returns:
        Mean of the per-query average precisions; queries without any valid
        match are skipped.

    Raises:
        RuntimeError: if no query has a valid match.
    """
    distmat = to_numpy(distmat)
    num_query, num_gallery = distmat.shape

    # Fill up default values
    if query_ids is None:
        query_ids = np.arange(num_query)
    if gallery_ids is None:
        gallery_ids = np.arange(num_gallery)
    if query_cams is None:
        query_cams = np.zeros(num_query).astype(np.int32)
    if gallery_cams is None:
        gallery_cams = np.ones(num_gallery).astype(np.int32)

    # Ensure numpy array
    query_ids = np.asarray(query_ids)
    gallery_ids = np.asarray(gallery_ids)
    query_cams = np.asarray(query_cams)
    gallery_cams = np.asarray(gallery_cams)

    # Sort and find correct matches
    ranking = np.argsort(distmat, axis=1)
    matches = (gallery_ids[ranking] == query_ids[:, np.newaxis])

    # Compute AP for each query
    aps = []
    for q in range(num_query):
        order = ranking[q]
        # Filter out the same id and same camera
        different_id = gallery_ids[order] != query_ids[q]
        different_cam = gallery_cams[order] != query_cams[q]
        valid = different_id | different_cam
        y_true = matches[q, valid]
        if not np.any(y_true):
            continue
        # Smaller distance means better match, hence the negation.
        y_score = -distmat[q][order][valid]
        aps.append(average_precision_score(y_true, y_score))

    if len(aps) == 0:
        raise RuntimeError("No valid query")
    return np.mean(aps)
def on_epoch_end(self, epoch, logs=None):
    """Compute the micro-averaged AP on the validation data and log it.

    The score is stored under ``logs['val_map']`` and printed.

    Args:
        epoch: index of the finished epoch (unused).
        logs: metrics dict to update; a fresh dict is used when omitted, so
            no state is shared between calls through a mutable default.
    """
    if logs is None:
        logs = {}
    X_validation = self.model.validation_data[0]
    y_validation = self.model.validation_data[1]
    y_result = self.model.predict(X_validation)
    # The validation targets expose their backing array plus a
    # [start, end) window into it.
    y_true = y_validation.data[y_validation.start: y_validation.end]
    val_map = average_precision_score(y_true, y_result, average='micro')
    logs['val_map'] = val_map
    print("val_MAP: {}\n".format(val_map))
def generate_prec_recall_points(clf, test_examples, test_labels, pk_file):
    """Generate precision-recall points and store them in a pickle file.

    Args:
        clf: wrapper exposing ``model.classes_`` and ``predict_raw_prob``.
        test_examples: examples passed to ``clf.predict_raw_prob``.
        test_labels: true labels of the examples.
        pk_file: destination path; nothing is written when ``None``.

    The pickled object is the tuple
    ``(precision, recall, average_precision, thresholds)`` of dicts keyed by
    column index plus ``"micro"``.

    NOTE(review): the first score column is dropped while the binarized
    labels are not shifted, so columns only line up when ``label_binarize``
    yields a single column (two classes) -- confirm multi-class use is not
    expected.
    """
    precision = dict()
    recall = dict()
    average_precision = dict()
    thresholds = dict()

    n_classes = len(clf.model.classes_)
    # ``classes`` is passed by keyword: current scikit-learn releases make
    # it keyword-only.
    y_test = label_binarize(test_labels, classes=clf.model.classes_)

    y_score = clf.predict_raw_prob(test_examples)
    # It only outputs 1 column of positive probability.
    y_score = y_score[:, 1:]
    for i in range(n_classes - 1):
        precision[i], recall[i], thresholds[i] = precision_recall_curve(
            y_test[:, i], y_score[:, i])
        average_precision[i] = average_precision_score(y_test[:, i],
                                                       y_score[:, i])

    # Compute the micro-averaged precision-recall curve and its average
    # precision.
    precision["micro"], recall["micro"], thresholds['micro'] = \
        precision_recall_curve(y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score,
                                                         average="micro")

    if pk_file is not None:
        with open(pk_file, 'wb') as f:
            pickle.dump((precision, recall, average_precision, thresholds), f)
def run_auc(job, context, name, compare_id):
    """
    AUC of roc plot.

    ROC plot is defined with mismapped reads being negatives, correctly-mapped
    reads being positives, and AUC expressing how good of a classifier of
    correctly-mapped-ness the MAPQ score is. It says nothing about how well the
    reads are actually mapped.

    Returns ``(auc, aupr)``; both are 0 when the comparison file cannot be
    scored (e.g. it is empty). When sklearn is not installed a two-element
    list of the string ``"sklearn_not_installed"`` is returned instead.
    """
    if not have_sklearn:
        return ["sklearn_not_installed"] * 2

    work_dir = job.fileStore.getLocalTempDir()

    compare_file = os.path.join(work_dir, '{}.compare.positions'.format(name))
    job.fileStore.readGlobalFile(compare_id, compare_file)

    try:
        # The builtin ``int`` replaces the ``np.int`` alias, which no longer
        # exists in current NumPy; with the alias every call would fail here
        # and silently fall through to the (0, 0) fallback below.
        data = np.loadtxt(compare_file, dtype=int, delimiter=', ', usecols=(1, 2)).T
        auc = roc_auc_score(data[0], data[1])
        aupr = average_precision_score(data[0], data[1])
    except Exception:
        # will happen if file is empty
        auc, aupr = 0, 0

    return auc, aupr
def save_prcurve(prob, answer, model_name, save_fn, use_neg=True):
    """Plot the precision-recall curve and save it to ``save_fn``.

    Args:
        prob: predicted scores, one sequence per example.
        answer: true labels, laid out like ``prob``.
        model_name: legend label of the curve.
        save_fn: output path of the figure.
        use_neg: when False the first entry of every row of ``prob`` and
            ``answer`` is dropped before flattening.
    """
    if use_neg:
        prob = np.reshape(prob, (-1))
        ans = np.reshape(answer, (-1))
    else:
        # Drop the leading column of every row.
        trimmed_prob = [row[1:] for row in prob]
        trimmed_ans = [row[1:] for row in answer]
        prob = np.reshape(np.array(trimmed_prob), (-1))
        ans = np.reshape(np.array(trimmed_ans), (-1))

    precision, recall, _ = precision_recall_curve(ans, prob)
    average_precision = average_precision_score(ans, prob)

    plt.clf()
    plt.plot(recall[:], precision[:], lw=2, color='navy', label=model_name)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # plt.ylim([0.3, 1.0])
    # plt.xlim([0.0, 0.4])
    plt.title('Precision-Recall Area={0:0.2f}'.format(average_precision))
    plt.legend(loc="upper right")
    plt.grid(True)
    plt.savefig(save_fn)
def score_func(estimator, X, Y):
    """Score one fold and accumulate the results in module-level globals.

    Adds this fold's accuracy, precision, recall, F1, MCC, ROC-AUC and
    average precision to the corresponding global accumulators and appends
    the fold's labels, predictions and probabilities to ``resultlabel``,
    ``resultpredict`` and ``resultproba``.

    Args:
        estimator: fitted classifier exposing ``predict_proba``.
        X: features of the fold.
        Y: true labels of the fold.

    Returns:
        The Matthews correlation coefficient of this fold.
    """
    global accuracy, precision, recall, f1, mcc, auc, aupr, resultpredict, resultproba, resultlabel
    predict_proba = estimator.predict_proba(X)[:, 1]
    # Positive when the probability in column 1 exceeds 0.5.
    predict = (predict_proba > 0.50)

    resultlabel = np.hstack((resultlabel, Y))
    resultpredict = np.hstack((resultpredict, predict))
    resultproba = np.hstack((resultproba, predict_proba))

    fold_mcc = matthews_corrcoef(Y, predict)
    precision += precision_score(Y, predict)
    recall += recall_score(Y, predict)
    f1 += f1_score(Y, predict)
    accuracy += accuracy_score(Y, predict)
    mcc += fold_mcc
    auc += roc_auc_score(Y, predict_proba)
    aupr += average_precision_score(Y, predict_proba)
    print("finish one")
    return fold_mcc


# Performance evaluation
def main():
    """
    Calculate the Average Precision (AP) at k.

    Reads the result file named on the command line, ranks its rows by
    decreasing score and prints the average precision of every prefix of the
    ranking up to the cutoff, followed by the AP of the full ranking.
    """

    # Get the arguments
    args = docopt("""Calculate the Average Precision (AP) at k.

    Usage:
        ap.py <test_results_file> <k>

        <test_results_file> = the test set result file
        <k> = the cutoff; if it is equal to zero, all the rank is considered.
    """)

    test_results_file = args['<test_results_file>']
    cutoff = int(args['<k>'])

    # Sort the lines in the file in descending order according to the score
    dataset = load_dataset(test_results_file)
    dataset = sorted(dataset, key=lambda line: line[-1], reverse=True)

    gold = np.array([1 if label == 'True' else 0 for (x, y, label, score) in dataset])
    scores = np.array([score for (x, y, label, score) in dataset])

    for i in range(1, min(cutoff + 1, len(dataset))):
        try:
            score = average_precision_score(gold[:i], scores[:i])
        except Exception:
            # Best effort: a prefix that cannot be scored is reported as 0.
            score = 0
        print('Average Precision at %d is %.3f' % (i, 0 if score == -1 else score))

    print('FINAL: Average Precision at %d is %.3f' % (len(dataset), average_precision_score(gold, scores)))
def compute_pr(y_test, probability_predictions):
    """
    Compute Precision-Recall, thresholds and PR AUC.

    Args:
        y_test (list) : true label values corresponding to the predictions. Also length n.
        probability_predictions (list) : predictions coming from an ML algorithm of length n.

    Returns:
        dict: with the keys 'pr_auc', 'best_pr_cutoff', 'best_precision',
        'best_recall', 'precisions', 'recalls' and 'pr_thresholds'. The
        "best" entries describe the curve point closest to precision = 1,
        recall = 1.
    """
    _validate_predictions_and_labels_are_equal_length(probability_predictions, y_test)

    # Calculate PR
    precisions, recalls, pr_thresholds = skmetrics.precision_recall_curve(
        y_test, probability_predictions)
    pr_auc = skmetrics.average_precision_score(y_test, probability_predictions)

    # Squared distance of every curve point to the ideal corner (1, 1).
    distance_to_ideal = np.square(precisions - 1) + np.square(recalls - 1)

    # Several points may share the minimum distance; keep only the first one.
    closest = np.flatnonzero(distance_to_ideal == distance_to_ideal.min())[0]

    return {
        'pr_auc': pr_auc,
        'best_pr_cutoff': pr_thresholds[closest],
        'best_precision': precisions[closest],
        'best_recall': recalls[closest],
        'precisions': precisions,
        'recalls': recalls,
        'pr_thresholds': pr_thresholds,
    }
def evalData(z, test_set_y):
    """Print ranking / classification metrics for predictions ``z``.

    Args:
        z: 2-D array of predictions, one row per query.
        test_set_y: 2-D array of ground-truth labels, same shape as ``z``.

    Prints the ROC-AUC, mean squared error, log loss and the mean
    precision-at-k for k in (10, 20, 30, 50, 37).

    Returns:
        The mean precision at k=37.
    """
    diff = z - test_set_y

    fpr, tpr, _ = metrics.roc_curve(test_set_y.ravel(), z.ravel(), pos_label=1)
    auc = metrics.auc(fpr, tpr)

    Q = test_set_y.shape[0]
    cutoffs = (10, 20, 30, 37, 50)
    totals = dict.fromkeys(cutoffs, 0)
    for i in range(Q):
        for k in cutoffs:
            # Each accumulator uses its own cutoff (the k=50 total used to
            # be computed with k=30).
            totals[k] += ranking_precision_score(test_set_y[i], z[i], k=k)

    Pk10 = totals[10] / Q
    Pk20 = totals[20] / Q
    Pk30 = totals[30] / Q
    Pk50 = totals[50] / Q
    Pk37 = totals[37] / Q

    cross = metrics.log_loss(test_set_y, z)

    # Items are joined with single spaces, matching what a comma-separated
    # print statement emits.
    print('\n')
    print(' '.join(str(item) for item in (
        'AUC', auc, 'MSE', np.mean((diff) ** 2), 'Cross-entropy:', cross)))
    print(' '.join(str(item) for item in (
        'Precision at k=10: ', Pk10, ' k=20: ', Pk20, ' k=30: ', Pk30,
        ' k=50: ', Pk50, ' k=37: ', Pk37)))

    return Pk37
def compute_ap(class_score_matrix, labels):
    """Return the average precision of every class as a 1-D array.

    Args:
        class_score_matrix: 2-D array of scores, one column per class.
        labels: dense class labels, converted with ``dense_to_one_hot``.

    Returns:
        numpy array with one average-precision value per class. No value is
        filtered out, so a NaN score (if the scorer returns one) is kept.
    """
    num_classes = class_score_matrix.shape[1]
    one_hot_labels = dense_to_one_hot(labels, num_classes)
    return np.array([
        average_precision_score(one_hot_labels[:, i], class_score_matrix[:, i])
        for i in range(num_classes)
    ])
def evaluate(classes, y_gt, y_pred, threshold_value=0.5):
    """
    Print multi-label metrics: macro / micro F1, Hamming loss and mAP.

    Arguments:
        classes: class names (only used by the disabled per-class report)
        y_gt (num_bag x L): ground truth
        y_pred (num_bag x L): prediction
        threshold_value: scores at or above it count as positive
    """
    print("thresh = {:.6f}".format(threshold_value))
    binarized = y_pred >= threshold_value

    macro_f1 = f1_score(y_gt, binarized, average="macro")
    print("Macro f1_socre = {:.6f}".format(macro_f1))

    micro_f1 = f1_score(y_gt, binarized, average="micro")
    print("Micro f1_socre = {:.6f}".format(micro_f1))

    # hamming loss
    hamming = hamming_loss(y_gt, binarized)
    print("Hamming Loss = {:.6f}".format(hamming))

    # mAP is computed on the raw scores, not on the thresholded predictions.
    mean_ap_value = average_precision_score(y_gt, y_pred)
    print("mAP = {:.2f}%".format(mean_ap_value * 100))

    # ap_classes = []
    # for i, cls in enumerate(classes):
    #     ap_cls = average_precision_score(y_gt[:, i], y_pred[:, i])
    #     ap_classes.append(ap_cls)
    #     print("AP({}) = {:.2f}%".format(cls, ap_cls * 100))
    # print("mAP = {:.2f}%".format(np.mean(ap_classes) * 100))
def computeAveragePrecisionMetrics(truthValues, testValues):
    """
    Compute average precision.

    Returns a one-element list holding a dict with the keys 'name'
    ('average_precision') and 'value' (the score).
    """
    score = average_precision_score(y_true=truthValues, y_score=testValues)
    return [{'name': 'average_precision', 'value': score}]
def compute_metrics(sess, logits_op, placeholders, data_file, exporter=None):
    """Compute metrics MAP and MRR over a dataset.

    Scores are computed one question batch at a time; every batch's scores
    are also sent to ``exporter``.

    :param sess: TensorFlow session
    :param logits_op: an operation that returns the scores for a given set of
                      sentences
    :param placeholders: placeholders defined for `logits_op`, as the triple
                         (questions, sentences, keep_prob)
    :param data_file: a HDF5 file object holding the dataset
    :param exporter: coroutine that receives the scores of every batch;
                     defaults to ``dataio.no_op()``
    :returns: the values of MAP and MRR as a tuple: (MAP, MRR)
    """
    questions_ph, sentences_ph, keep_prob_ph = placeholders

    if exporter is None:
        exporter = dataio.no_op()
    # NOTE(review): the original indentation was lost; priming is assumed to
    # apply to caller-supplied exporters as well -- confirm.
    next(exporter) # priming the coroutine

    total_avep = 0.0
    total_mrr = 0.0
    n_questions = 0
    for batch in dataio.question_batches(data_file):
        # keep_prob of 1.0 is fed for evaluation.
        feed_dict = { questions_ph: batch.questions, sentences_ph: batch.sentences, keep_prob_ph: 1.0 }
        scores = logits_op.eval(session=sess, feed_dict=feed_dict)
        exporter.send(scores)

        n_questions += 1
        avep = average_precision(batch.labels, scores)
        total_avep += avep
        mrr = mean_reciprocal_rank(batch.labels, scores)
        total_mrr += mrr
    exporter.close()

    # NOTE(review): raises ZeroDivisionError when the dataset yields no
    # batches.
    mean_avep = total_avep / n_questions
    mean_mrr = total_mrr / n_questions
    return mean_avep, mean_mrr
def get_scores(clf, X_t_train, y_train, X_t_test, y_test):
    """Fit ``clf`` and return overall plus class-wise multi-label metrics.

    Class names are read from the module-level binarizer ``mlb``.

    Args:
        clf: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_t_train: training features.
        y_train: training targets.
        X_t_test: test features.
        y_test: 2-D indicator matrix of test targets (one column per class).

    Returns:
        dict with the keys ``'F2 Score'``, ``'ROC_AUC'``,
        ``'Classwise F2 Scores'`` (``(class, score)`` pairs, best first),
        ``'P_AUPR'``, ``'Precision'``, ``'Recall'`` and
        ``'ROC_AUC_samples'`` (dict mapping class name to its ROC AUC).
    """
    clf.fit(X_t_train, y_train)
    # Predict once and reuse the result for every metric.
    y_pred = clf.predict(X_t_test)
    y_score = clf.predict_proba(X_t_test)

    score = fbeta_score(y_test, y_pred, beta=2, average=None)

    # Per-class ROC AUC.
    roc_auc = dict()
    for i, class_name in enumerate(mlb.classes_):
        fpr, tpr, _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[class_name] = auc(fpr, tpr)

    app = dict()
    app['F2 Score'] = fbeta_score(y_test, y_pred, beta=2, average='samples')
    app['ROC_AUC'] = roc_auc_score(y_test, y_score)
    app['Classwise F2 Scores'] = [(mlb.classes_[l], score[l]) for l in score.argsort()[::-1]]
    # NOTE(review): average precision is computed from the hard predictions,
    # not from ``y_score`` -- confirm that this is intended.
    app['P_AUPR'] = average_precision_score(y_test, y_pred)
    app['Precision'] = precision_score(y_test, y_pred, average='micro')
    app['Recall'] = recall_score(y_test, y_pred, average='micro')
    app['ROC_AUC_samples'] = roc_auc
    return app
def get_scores(clf, X_t_train, y_train, X_t_test, y_test):
    """Fit ``clf`` and return F2, ROC-AUC and average-precision scores.

    Class names are read from the module-level binarizer ``mlb``.

    Args:
        clf: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_t_train: training features.
        y_train: training targets.
        X_t_test: test features.
        y_test: test targets.

    Returns:
        dict with the keys ``'Classwise Scores'`` (``(class, F2)`` pairs,
        best first), ``'F2 Score'``, ``'ROC_AUC'`` and
        ``'Precision Score Avg (PR Curve)'``.
    """
    clf.fit(X_t_train, y_train)
    # Predict once and reuse the result for every metric.
    y_pred = clf.predict(X_t_test)
    y_proba = clf.predict_proba(X_t_test)

    score = fbeta_score(y_test, y_pred, beta=2, average=None)

    app = dict()
    app['Classwise Scores'] = [(mlb.classes_[l], score[l]) for l in score.argsort()[::-1]]
    app['F2 Score'] = fbeta_score(y_test, y_pred, beta=2, average='samples')
    app['ROC_AUC'] = roc_auc_score(y_test, y_proba)
    # NOTE(review): average precision is computed from the hard predictions,
    # not from ``y_proba`` -- confirm that this is intended.
    app['Precision Score Avg (PR Curve)'] = average_precision_score(y_test, y_pred)
    return app
def MB_test(preds, ytest):
    """Compute a bundle of classification metrics for ``preds`` vs ``ytest``.

    Returns:
        Tuple ``(f1, precision, recall, average_precision, preds, fpr, tpr,
        thresholds, classification_report)``. The first three are per-class
        float arrays, ``preds`` is handed back unchanged, and
        ``fpr`` / ``tpr`` / ``thresholds`` come from ``roc_curve``.
    """
    per_class_f1 = f1_score(ytest, preds, average=None)
    per_class_precision = precision_score(ytest, preds, average=None)
    per_class_recall = recall_score(ytest, preds, average=None)
    overall_ap = average_precision_score(ytest, preds)
    fpr, tpr, thresholds = roc_curve(ytest, preds)
    report = classification_report(ytest, preds)

    return (
        per_class_f1.astype(float),
        per_class_precision.astype(float),
        per_class_recall.astype(float),
        overall_ap,
        preds,
        fpr,
        tpr,
        thresholds,
        report,
    )
def mean_average_precision(y_trues, y_scores):
    """
    Average the per-row average precision.

    y_trues  : [nb_samples, nb_classes]
    y_scores : [nb_samples, nb_classes]
    map      : float (MAP)

    One average-precision value is computed for every paired row of
    ``y_trues`` / ``y_scores``; their mean is returned.
    """
    per_row = [
        metrics.average_precision_score(truth, scores)
        for truth, scores in zip(y_trues, y_scores)
    ]
    return np.mean(np.array(per_row))
def PRC_AUC(Y_hats, Y_test):
    """Return the PR curve, its average precision and padded thresholds.

    Both inputs are flattened before scoring.

    Returns:
        Tuple ``(prc, auc, thresholds)`` where ``prc`` is a two-column array
        of (recall, precision) points and ``thresholds`` has its last value
        repeated once so that it has one entry per curve point.
    """
    truth = Y_test.flatten()
    scores = Y_hats.flatten()

    precision, recall, thresholds = precision_recall_curve(truth, scores)
    # Pad with the final threshold to match the number of curve points.
    thresholds = np.hstack([thresholds, thresholds[-1]])
    prc = np.vstack([recall, precision]).T
    auc = average_precision_score(truth, scores, average='micro')
    return prc, auc, thresholds
def evaluate(binarise_result, y_test, y_score, file_name):
    """
    computes the accuracy, precision and recall.
    plots the precision and recall curve. saves the plots to the figure folder.
    :param binarise_result: list of binarised result after prediction from classifier
    :type binarise_result: list[list[int]]
    :param y_test: list of binarised labels from the test set
    :type y_test: list[list[int]]
    :param y_score: distance of each sample from the decision boundary for each class
    :type y_score:list
    :param file_name: directory name for saving all figures from the plots
    :type file_name: str
    :return:
    :rtype:
    """
    num_class = y_test.shape[1]

    # Per-class precision-recall curves and average precision.
    precision, recall, average_precision = {}, {}, {}
    for cls in range(num_class):
        truth = y_test[:, cls]
        scores = y_score[:, cls]
        precision[cls], recall[cls], _ = precision_recall_curve(truth, scores)
        average_precision[cls] = average_precision_score(truth, scores)

    # Micro-averaged precision-recall curve and average precision.
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(
        y_test, y_score, average="micro")

    # create directory
    for directory in ('figure', 'figure/' + file_name):
        create_directory(directory)

    # plots
    plot_precision_recall_curve(average_precision, precision, recall, file_name)

    # Plot Precision-Recall curve for each class
    plot_precision_recall_curve_all_classes(average_precision, precision, recall,
                                            file_name, num_class)

    generate_eval_metrics(binarise_result, file_name, y_test)
def test_average_precision_score_score_non_binary_class():
    """average_precision_score must reject a multiclass target."""
    # Test that average_precision_score function returns an error when trying
    # to compute average_precision_score for multiclass task.
    random_state = check_random_state(404)
    scores = random_state.rand(10)

    # the target contains three different class values
    labels = random_state.randint(0, 3, size=10)
    expected_message = "multiclass format is not supported"
    assert_raise_message(ValueError, expected_message,
                         average_precision_score, labels, scores)
def test_average_precision_score_duplicate_values():
    """Duplicated scores that still separate the classes must give AP = 1."""
    # Duplicate values with precision-recall require a different
    # processing than when computing the AUC of a ROC, because the
    # precision-recall curve is a decreasing curve
    # The following situation corresponds to a perfect
    # test statistic, the average_precision_score should be 1
    labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
    scores = [0, .1, .1, .4, .5, .6, .6, .9, .9, 1, 1]
    result = average_precision_score(labels, scores)
    assert_equal(result, 1)
def test_average_precision_score_tied_values():
    """A tie between a negative and a positive must lower AP below 1."""
    # Here if we go from left to right in y_true, the 0 values are
    # separated from the 1 values, so it appears that we've
    # correctly sorted our classifications. But in fact the first two
    # values have the same score (0.5) and so the first two values
    # could be swapped around, creating an imperfect sorting. This
    # imperfection should come through in the end score, making it less
    # than one.
    labels = [0, 1, 1]
    scores = [.5, .5, .6]
    result = average_precision_score(labels, scores)
    assert_not_equal(result, 1.)
def get_average_precision(y_gold_standard, y_predicted):
    """
    Computes the average precision score.
    Also known as the area under the precision-recall curve.

    Keyword arguments:
    y_gold_standard -- Expected labels
    y_predicted -- Predicted labels
    """
    score = average_precision_score(y_gold_standard, y_predicted)
    return score
def test_result(inferences, labels):
    """Score ``inferences`` against ``labels``.

    Returns:
        Tuple ``(average_precision, r2, roc_auc)``.
    """
    scorers = (
        metrics.average_precision_score,
        metrics.r2_score,
        metrics.roc_auc_score,
    )
    ap, r2, roc_auc = [scorer(labels, inferences) for scorer in scorers]
    return ap, r2, roc_auc
def tabulate_metrics(cv_results, name):
    """
    Calculate accuracy metrics from probabilities, format them.

    Given a list of tuples, each of the form (index,
    vector_of_true_outcomes, vector_of_predicted_probabilities), for
    each index (representing one fold of CV) assess multiple accuracy
    metrics (eg ROC AUC, F1 score, positive predictive value) for the
    predicted probabilities WRT the true outcomes (for that fold's
    test set.) Also take the median across all folds (and the sum of the
    confusion-matrix counts). Then format these nicely into a table
    (labeled with the given name) and return that, as a string.

    For metrics which require a binary prediction, a threshold of 0.5
    is used.
    """
    # Each of the metric functions should take two non-optional
    # arguments, y_true and y_pred.

    # These accept predicted probabilities.
    probability_metrics = [
        ('AUC', roc_auc_score),
        ('AP', metrics.average_precision_score),
    ]
    # These need binary predictions
    binary_metrics = [
        ('F1', metrics.f1_score),
        ('MCC', metrics.matthews_corrcoef),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
    ]
    # Mutual information? Odds ratios?
    confusion_labels = ('tn', 'fp', 'fn', 'tp')
    score_labels = [label for label, _ in probability_metrics + binary_metrics]

    # One column per metric, filled fold by fold below.
    results = {}
    for label in score_labels:
        results[label] = []
    for label in confusion_labels:
        results[label] = []

    for label, metric in probability_metrics:
        column = results[label]
        for _, y_true, probabilities in cv_results:
            column.append(metric(y_true, probabilities))

    for _, y_true, probabilities in cv_results:
        y_pred = probabilities > 0.5
        for label, metric in binary_metrics:
            results[label].append(metric(y_true, y_pred))
        counts = metrics.confusion_matrix(y_true, y_pred).flat
        for label, n in zip(confusion_labels, counts):
            results[label].append(n)

    fold_names = ['fold_%d' % fold for fold, _, _ in cv_results]
    table = pd.DataFrame(data=results, index=fold_names)

    # Summary row: median of the scores, sum of the confusion counts.
    summary_row = 'median/sum'
    table.loc[summary_row] = 0.
    for label in score_labels:
        table.loc[summary_row, label] = np.median(results[label])
    for label in confusion_labels:
        table.loc[summary_row, label] = np.sum(results[label])

    body = table.to_string(float_format=lambda x: '%.3g' % x)
    return ('%s: \n' % name) + body
def main():
    """
    Report the Average Precision (AP) at k of hypernymy against every other
    single relation in the test set.

    For each non-hypernymy relation, the scored test results are restricted
    to the pairs labelled with that relation or with hypernymy, ranked by
    decreasing score, and the AP of every ranking prefix up to the cutoff is
    printed, followed by the AP of the full ranking.
    """

    # Get the arguments
    args = docopt("""Calculate the Average Precision (AP) at k for every hyper-other relation in the dataset.

    Usage:
        ap_on_each_relation.py <test_results_file> <test_set_file> <k>

        <test_results_file> = the test set result file.
        <test_set_file> = the test set containing the original relations.
        <k> = the cutoff; if it is equal to zero, all the rank is considered.
    """)

    test_set_file = args['<test_set_file>']
    test_results_file = args['<test_results_file>']
    cutoff = int(args['<k>'])

    # Load the test set
    print('Loading the dataset...')
    test_set, relations = load_dataset(test_set_file + '.test')

    # The results file does not depend on the relation, so read it once.
    with codecs.open(test_results_file, 'r', 'utf-8') as f_in:
        result_rows = [tuple(line.strip().split('\t')) for line in f_in]

    hyper_relation = 'hyper'

    for other_relation in [relation for relation in relations if relation != hyper_relation]:

        curr_relations = [other_relation, hyper_relation]
        print('==================================================')
        print('Testing %s vs. %s ...' % (hyper_relation, other_relation))

        # Filter out the dataset to contain only these two relations
        curr_test_set = {(x, y): relation for (x, y), relation in test_set.items()
                         if relation in curr_relations}

        # Sort the lines in the file in descending order according to the score
        dataset = [(x, y, label, float(score)) for (x, y, label, score) in result_rows
                   if (x, y) in curr_test_set]
        dataset = sorted(dataset, key=lambda line: line[-1], reverse=True)

        # relevance: rel(i) is an indicator function equaling 1 if the item at rank i is a hypernym
        gold = np.array([1 if label == 'True' else 0 for (x, y, label, score) in dataset])
        scores = np.array([score for (x, y, label, score) in dataset])

        for i in range(1, min(cutoff + 1, len(dataset))):
            score = average_precision_score(gold[:i], scores[:i])
            print('Average Precision at %d is %.3f' % (i, 0 if score == -1 else score))

        print('FINAL: Average Precision at %d is %.3f' % (len(dataset), average_precision_score(gold, scores)))
def get_results(clf, X_test, y_test, typename):
    """Evaluate ``clf`` on the test data and write plots and reports to disk.

    Output goes to the directory ``str(clf.class_prior) + typename`` inside
    the module-level ``currentrun`` directory (created if missing): a
    precision-recall plot, a ROC plot, the classification report and the
    accuracy. The process working directory is changed while the files are
    written and restored afterwards, even when an error occurs.

    Args:
        clf: fitted classifier exposing ``predict`` and ``class_prior``.
        X_test: test features.
        y_test: true test labels.
        typename: suffix of the output directory name.
    """
    oldcwd = os.getcwd()
    os.chdir(currentrun)
    try:
        dire = str(clf.class_prior) + typename
        if not os.path.exists(dire):
            os.makedirs(dire)
        os.chdir(dire)

        preds = clf.predict(np.array(X_test))
        print('Getting ' + typename + ' results...')
        # Only the float-cast predictions and the classification report are
        # needed from MB_test; the curves are recomputed below.
        _, _, _, _, preds, _, _, _, cr = MB_test(
            np.array(preds).astype(float), np.array(y_test).astype(float))

        fpr, tpr, _ = roc_curve(y_test, preds)
        roc_auc = auc(fpr, tpr)
        pr, rc, _ = precision_recall_curve(y_test, preds)
        average_precision = average_precision_score(y_test, preds, average="micro")

        plt.figure()
        plt.clf()
        # Recall goes on the x axis and precision on the y axis so that the
        # curve matches the axis labels.
        plt.plot(rc, pr, label='Precision-Recall curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Precision-Recall Curve: AUC={0:0.2f}'.format(average_precision))
        plt.legend(loc="lower left")
        plt.savefig('Precision-Recall-Curve' + str(clf.class_prior) + '.png')
        # Release the figure once it is saved.
        plt.close()

        plt.figure()
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr, lw=1,
                 label='ROC CrowdNB %s (area = %0.2f)' % (str(clf.class_prior), roc_auc))
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        plt.savefig('ROC' + str(clf.class_prior) + '.png')
        plt.close()

        plot_classification_report(cr)
        with open('classificationreport.txt', 'w') as f:
            f.write(str(cr))
        with open('accuracy.txt', 'w') as f:
            f.write(str(accuracy_score(y_test, preds)))
    finally:
        os.chdir(oldcwd)