Python sklearn module: metrics() example source code

The following 42 code examples, extracted from open-source Python projects, illustrate how to use sklearn.metrics.
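Before the per-project snippets, here is a minimal, self-contained sketch (toy values, not taken from any of the projects below) of the kinds of sklearn.metrics calls that recur throughout them:

import sklearn.metrics

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]             # hard class predictions
y_score = [0.1, 0.9, 0.4, 0.2, 0.8]  # predicted probabilities for class 1

# classification metrics computed from hard predictions
print(sklearn.metrics.accuracy_score(y_true, y_pred))
print(sklearn.metrics.confusion_matrix(y_true, y_pred))

# ranking metrics computed from scores / probabilities
print(sklearn.metrics.roc_auc_score(y_true, y_score))

# pairwise distances between the rows of a feature matrix
X = [[0.0, 1.0], [1.0, 0.0], [0.5, 0.5]]
print(sklearn.metrics.pairwise_distances(X, metric='euclidean'))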

Project: sakmapper    Author: szairis
def apply_lens(df, lens='pca', dist='euclidean', n_dim=2, **kwargs):
    """
    input: N x F dataframe of observations
    output: N x n_dim image of input data under lens function
    """
    if n_dim != 2:
        raise ValueError('image of data set must be two-dimensional')
    if dist not in ['euclidean', 'correlation']:
        raise ValueError('only euclidean and correlation distance metrics are supported')
    if lens == 'pca' and dist != 'euclidean':
        raise ValueError('PCA requires the use of euclidean distance metric')

    if lens == 'pca':
        df_lens = pd.DataFrame(decomposition.PCA(n_components=n_dim, **kwargs).fit_transform(df), df.index)
    elif lens == 'mds':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        df_lens = pd.DataFrame(manifold.MDS(n_components=n_dim, **kwargs).fit_transform(D), df.index)
    elif lens == 'neighbor':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        df_lens = pd.DataFrame(manifold.SpectralEmbedding(n_components=n_dim, **kwargs).fit_transform(D), df.index)
    else:
        raise ValueError('only PCA, MDS, and neighborhood lenses are supported')

    return df_lens
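A hypothetical call to apply_lens on random data (the names df_random and lens_image are illustrative only), assuming the module-level imports the snippet relies on: pandas as pd plus decomposition, manifold and metrics from sklearn:

import numpy as np
import pandas as pd
from sklearn import decomposition, manifold, metrics  # imports assumed by the snippet above

df_random = pd.DataFrame(np.random.rand(50, 6))   # 50 observations, 6 features
lens_image = apply_lens(df_random, lens='pca')    # 50 x 2 DataFrame indexed like df_random
print(lens_image.shape)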
Project: healthcareai-py    Author: HealthCatalyst
def calculate_regression_metrics(trained_sklearn_estimator, x_test, y_test):
    """
    Given a trained estimator, calculate metrics.

    Args:
        trained_sklearn_estimator (sklearn.base.BaseEstimator): a scikit-learn estimator that has been `.fit()`
        x_test (numpy.ndarray): A 2d numpy array of the x_test set (features)
        y_test (numpy.ndarray): A 1d numpy array of the y_test set (true target values)

    Returns:
        dict: A dictionary of metrics objects
    """
    # Get predictions
    predictions = trained_sklearn_estimator.predict(x_test)

    # Calculate individual metrics
    mean_squared_error = skmetrics.mean_squared_error(y_test, predictions)
    mean_absolute_error = skmetrics.mean_absolute_error(y_test, predictions)

    result = {'mean_squared_error': mean_squared_error, 'mean_absolute_error': mean_absolute_error}

    return result
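A hedged usage sketch (synthetic data, hypothetical variable names), assuming the snippet's module-level import of sklearn.metrics as skmetrics:

import numpy as np
import sklearn.metrics as skmetrics  # alias assumed by the snippet above
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
x = rng.rand(100, 3)
y = x @ np.array([1.0, 2.0, 3.0]) + 0.1 * rng.randn(100)
x_train, x_test, y_train, y_test = x[:80], x[80:], y[:80], y[80:]

model = LinearRegression().fit(x_train, y_train)
print(calculate_regression_metrics(model, x_test, y_test))
# -> {'mean_squared_error': ..., 'mean_absolute_error': ...}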
Project: q2-diversity    Author: qiime2
def beta(table: biom.Table, metric: str, n_jobs: int=1)-> skbio.DistanceMatrix:
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=sample_ids,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs
    )
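The interesting detail here is that skbio.diversity.beta_diversity delegates the pairwise computation to sklearn.metrics.pairwise_distances. A self-contained sketch of that delegation target on a toy counts matrix (values invented for illustration):

import numpy as np
import sklearn.metrics

counts = np.array([[10, 0, 3],
                   [ 2, 5, 1],
                   [ 0, 7, 4]])
# Bray-Curtis is a typical non-phylogenetic choice; sklearn hands such
# string metrics through to scipy.spatial.distance.
dm = sklearn.metrics.pairwise_distances(counts, metric='braycurtis')
print(dm.round(3))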
Project: sptgraph    Author: epfl-lts2
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    # note: sklearn.cross_validation was removed in scikit-learn 0.20; newer versions use sklearn.model_selection
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)

    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    best_score = 0
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids
Project: FeatureHub    Author: HDI-Project
def compute_metrics_cv(self, X, Y):
        """Compute cross-validated metrics.

        Trains this model on data X with labels Y.

        Returns a MetricList with the name, scoring type, and value for each
        Metric. Note that these values may be numpy floating points, and should
        be converted prior to insertion in a database.

        Parameters
        ----------
        X : numpy array-like or pd.DataFrame
            data
        Y : numpy array-like or pd.DataFrame or pd.Series
            labels
        """

        scorings, scorings_ = self._get_scorings()

        # compute scores
        scores = self.cv_score_mean(X, Y, scorings_)

        # unpack into MetricList
        metric_list = self.scores_to_metriclist(scorings, scores)
        return metric_list
Project: FeatureHub    Author: HDI-Project
def compute_metrics_train_test(self, X, Y, n):
        """Compute metrics on test set.
        """

        X, Y = self._format_matrices(X, Y)

        X_train, Y_train = X[:n], Y[:n]
        X_test, Y_test = X[n:], Y[n:]

        scorings, scorings_ = self._get_scorings()

        # Determine binary/multiclass classification
        classes = np.unique(Y)
        params = self._get_params(classes)

        # fit model on entire training set
        self.model.fit(X_train, Y_train)

        scores = {}
        for scoring in scorings_:
            scores[scoring] = self._do_scoring(scoring, params, self.model,
                    X_test, Y_test)

        metric_list = self.scores_to_metriclist(scorings, scores)
        return metric_list
Project: deeppavlov    Author: deepmipt
def observe(self, observation):
        """Process observation for metrics."""
        if self.lastY is not None:
            self.metrics.update(observation, self.lastY)
            if 'text' in observation.keys():
                self.labels += self._text2predictions(self.lastY)
                self.observations += [observation['score']]
            self.lastY = None
        return observation
Project: deeppavlov    Author: deepmipt
def reset_metrics(self):
        """Reset metrics, observations and labels."""
        super().reset_metrics()
        del self.observations[:]
        del self.labels[:]
Project: deeppavlov    Author: deepmipt
def report(self):
        """Return report with metrics on the whole data."""
        loss = sklearn.metrics.log_loss(self.labels, self.observations)
        acc = sklearn.metrics.accuracy_score(self.labels,
                                             self._text2predictions(self._predictions2text(self.observations)))
        try:
            auc = sklearn.metrics.roc_auc_score(self.labels, self.observations)
        except ValueError:
            auc = 0
        report = dict()
        report['comments'] = len(self.observations)
        report['loss'] = loss
        report['accuracy'] = acc
        report['auc'] = auc
        return report
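report() reads from instance state; a self-contained sketch of the same three metric calls, assuming (as the observe() snippet above suggests) that self.labels holds binary labels and self.observations holds predicted scores:

import sklearn.metrics

labels = [0, 1, 1, 0, 1]            # stand-in for self.labels
scores = [0.2, 0.8, 0.6, 0.3, 0.9]  # stand-in for self.observations
preds = [1 if s > 0.5 else 0 for s in scores]

print(sklearn.metrics.log_loss(labels, scores))
print(sklearn.metrics.accuracy_score(labels, preds))
print(sklearn.metrics.roc_auc_score(labels, scores))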
Project: q2-diversity    Author: qiime2
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int=1)-> skbio.DistanceMatrix:
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if n_jobs != 1 and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    counts = table.matrix_data.toarray().astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        results = skbio.diversity.beta_diversity(
            metric=metric,
            counts=counts,
            ids=sample_ids,
            otu_ids=feature_ids,
            tree=phylogeny,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs
        )
    except skbio.tree.MissingNodeError as e:
        message = str(e).replace('otu_ids', 'feature_ids')
        message = message.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(message)

    return results
Project: q2-diversity    Author: qiime2
def beta_phylogenetic_alt(table: BIOMV210Format, phylogeny: NewickFormat,
                          metric: str, n_jobs: int=1,
                          variance_adjusted: bool=False,
                          alpha: float=None,
                          bypass_tips: bool=False) -> skbio.DistanceMatrix:

    metrics = phylogenetic_metrics_alt_dict()
    generalized_unifrac = 'generalized_unifrac'

    if metric not in metrics:
        raise ValueError("Unknown metric: %s" % metric)

    if alpha is not None and metric != generalized_unifrac:
        raise ValueError('The alpha parameter is only allowed when the choice'
                         ' of metric is generalized_unifrac')

    # this behaviour is undefined, so let's avoid a seg fault
    cpus = psutil.cpu_count(logical=False)
    if n_jobs > cpus:
        raise ValueError('The value of n_jobs cannot exceed the number of '
                         'processors (%d) available in this system.' % cpus)

    if metric == generalized_unifrac:
        alpha = 1.0 if alpha is None else alpha
        f = partial(metrics[metric], alpha=alpha)
    else:
        f = metrics[metric]

    # the unifrac implementation expects the table and tree to be passed as filenames
    return f(str(table), str(phylogeny), threads=n_jobs,
             variance_adjusted=variance_adjusted, bypass_tips=bypass_tips)
Project: model_sweeper    Author: akimovmike
def compute_metrics(metric, learn_data, model_data):
    target_label_ground_truth = load_df_from_sample_notation(model_data['Feature Sample Location'])[model_data['Target Variable']]
    prediction = learn_data['Prediction']
    if metric == 'AUC':
        return 0.9  # placeholder for sk.metrics.auc_mathafaka(target_label_ground_truth, prediction)
Project: model_sweeper    Author: akimovmike
def sw_evalute_model(learn_data, overwrite_existing, worker_id=None):
#     learn_data = db['learns'].find_one(learn_id)
    model_data = db[learn_data['Model'][-1]].find_one(learn_data['Model'][0])
    if learn_data['Status']['Prediction Computed']:
        for metric in learn_data['Evaluation Results'].keys():
            if learn_data['Evaluation Results'][metric] is None or overwrite_existing:
                learn_data['Evaluation Results'][metric] = compute_metrics(metric, learn_data, model_data)

        learn_data['Status']['Model Evaluated'] = True
        db['learns'].update(learn_data['_id'], learn_data)
Project: brainiak    Author: brainiak
def score(self, X, y, sample_weight=None):
        """Returns the mean accuracy on the given test data and labels.

        NOTE: In the condition of sklearn.svm.SVC with precomputed kernel
        when the kernel matrix is computed portion by portion, the function
        will ignore the first input argument X.

        Parameters
        ----------
        X: list of tuple (data1, data2)
            data1 and data2 are numpy array in shape [num_TRs, num_voxels]
            to be computed for correlation.
            They are test samples.
            They contain the activity data filtered by ROIs
            and prepared for correlation computation.
            Within list, all data1s must have the same num_voxels value,
            all data2s must have the same num_voxels value.
            len(X) is the number of test samples.

        y: 1D numpy array
            labels, len(X) equals len(y), which is num_samples
        sample_weight: 1D array in shape [num_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        from sklearn.metrics import accuracy_score
        if isinstance(self.clf, sklearn.svm.SVC) \
                and self.clf.kernel == 'precomputed' \
                and self.training_data_ is None:
            result = accuracy_score(y, self.predict(),
                                    sample_weight=sample_weight)
        else:
            result = accuracy_score(y, self.predict(X),
                                    sample_weight=sample_weight)
        return result
Project: kaggle-seizure-prediction    Author: sics-lm
def get_report(clf, test_data_x, test_data_y):
    """
    Returns a string with a report of how the classifier *clf* does on the test data.

    :param clf: The classifier to use for calculating the scores.
    :param test_data_x: The test data observations to use for predictions.
    :param test_data_y: The test data class label to use.
    :return: A string containing a report on the performance of the classifier comparing the predicted class labels
             versus the true.
    """
    test_data_y_pred = predict(clf, test_data_x, probabilities=False)

    report_lines = [
        "Classification report:",
        "Best parameters set found on development set:",
        "",
        str(clf.best_estimator_),
        "",
        grid_scores(clf),
        "Detailed classification report:",
        "",
        "The model is trained on the full development set.",
        "The scores are computed on the full evaluation set.",
        "",
        sklearn.metrics.classification_report(test_data_y, test_data_y_pred),
        "",
        cm_report(sklearn.metrics.confusion_matrix(test_data_y, test_data_y_pred),
                  labels=['Interictal', 'Preictal']),
        "",
    ]
    report = '\n'.join(report_lines)
    return report
Project: vwoptimize    Author: denik
def root_mean_squared_error(*args, **kwargs):
    import sklearn.metrics
    return math.sqrt(sklearn.metrics.mean_squared_error(*args, **kwargs))
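A quick sanity check with made-up values (assuming the module-level import math the snippet relies on); the result is simply the square root of sklearn's mean_squared_error:

import math

y_true = [3.0, -0.5, 2.0, 7.0]
y_pred = [2.5,  0.0, 2.0, 8.0]
print(root_mean_squared_error(y_true, y_pred))  # ~0.612 = sqrt(0.375)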
Project: vwoptimize    Author: denik
def extract_score(metric, outputs):
    if not outputs:
        raise ValueError('error: No output captured from vw')

    orig_outputs = outputs

    stage, metric = _parse_vw_metric(metric)
    outputs = (outputs or {}).get(stage)

    if not outputs:
        raise ValueError('error: No output for stage %r. Available: %r' % (stage, ', '.join(orig_outputs.keys())))

    values = [x.get(metric) for x in outputs]

    for item in values:
        if item is None:
            raise ValueError('Metric (%s)%s not found. Available metrics: %s' % (stage, metric, outputs[0].keys()))

    try:
        values = [float(x) for x in values]
    except Exception:
        if values[0].endswith(' h'):
            return values
        return None

    return values
Project: vwoptimize    Author: denik
def recall_at_precision(*args, **kwargs):
    from sklearn.metrics import precision_recall_curve
    metric_param = kwargs.pop('metric_param')
    required_precision = _parse_number_or_fraction(metric_param)
    precision, recall, thresholds = precision_recall_curve(*args, **kwargs)

    for pr, r in izip(precision, recall):
        if pr >= required_precision:
            return r
Project: vwoptimize    Author: denik
def log_report_one(prefix, metrics, y_true, y_pred, sample_weight, config, classification_report, outputs=None, mask=None):

    if mask is not None:
        y_true = np.ma.MaskedArray(y_true, mask=mask).compressed()
        y_pred = np.ma.MaskedArray(y_pred, mask=mask).compressed()
        sample_weight = np.ma.MaskedArray(sample_weight, mask=mask).compressed() if sample_weight is not None else None
        assert y_true.shape == y_pred.shape, (y_true.shape, y_pred.shape)

    for metric in metrics:
        log_always('%s%s = %s', prefix, metric, _frmt_score(calculate_or_extract_score(metric, y_true, y_pred, config, outputs=outputs, sample_weight=sample_weight)))

    if classification_report:
        assert y_true is not None
        assert y_pred is not None
        log_classification_report(prefix, y_true, y_pred, labels=config.get('named_labels'), threshold=config.get('threshold'))  # XXX sample_weight
Project: complex    Author: ttrouill
def __init__(self, preds, true_vals, ranks, raw_ranks):
        self.preds = preds
        self.ranks = ranks
        self.true_vals = true_vals
        self.raw_ranks = raw_ranks

        # Test whether all predictions are identical, which sometimes happens with overfitting
        # and leads scikit-learn to output an incorrect average precision (i.e. ap=1)
        if not (preds == preds[0]).all():
            # Due to the use of np.isclose in sklearn.metrics.ranking._binary_clf_curve (called by the metric functions below),
            # the predictions have to be rescaled if they are too small:
            preds_rescaled = preds

            diffs = np.diff(np.sort(preds))
            min_diff = min(abs(diffs[np.nonzero(diffs)]))
            if min_diff < 1e-8 : #Default value of absolute tolerance of np.isclose
                preds_rescaled = (preds * ( 1e-7 / min_diff )).astype('d')

            self.ap = sklearn.metrics.average_precision_score(true_vals,preds_rescaled)
            self.precision, self.recall, self.thresholds = sklearn.metrics.precision_recall_curve(true_vals,preds_rescaled) 
        else:
            logger.warning("All prediction scores are equal, probable overfitting, replacing scores by random scores")
            self.ap = (true_vals == 1).sum() / float(len(true_vals))
            self.thresholds = preds[0]
            self.precision = (true_vals == 1).sum() / float(len(true_vals))
            self.recall = 0.5


        self.mrr = -1
        self.raw_mrr = -1

        if ranks is not None:
            self.mrr = np.mean(1.0 / ranks)
            self.raw_mrr = np.mean(1.0 / raw_ranks)
Project: FeatureSqueezing    Author: QData
def train_detector(x_train, y_train, x_val, y_val):
    fpr, tpr, thresholds = roc_curve(y_train, x_train)
    accuracy = [ sklearn.metrics.accuracy_score(y_train, x_train>threshold, normalize=True, sample_weight=None) for threshold in thresholds ]
    roc_auc = auc(fpr, tpr)

    idx_best = np.argmax(accuracy)
    print("Best training accuracy: %.4f, TPR(Recall): %.4f, FPR: %.4f @%.4f" % (accuracy[idx_best], tpr[idx_best], fpr[idx_best], thresholds[idx_best]))
    print("ROC_AUC: %.4f" % roc_auc)

    accuracy_val = [ sklearn.metrics.accuracy_score(y_val, x_val>threshold, normalize=True, sample_weight=None) for threshold in thresholds ]
    tpr_val, fpr_val = zip(*[ get_tpr_fpr(y_val, x_val, threshold)  for threshold in thresholds  ])
    # roc_auc_val = auc(fpr_val, tpr_val)
    print("Validation accuracy: %.4f, TPR(Recall): %.4f, FPR: %.4f @%.4f" % (accuracy_val[idx_best], tpr_val[idx_best], fpr_val[idx_best], thresholds[idx_best]))

    # return the threshold that gave the best training accuracy
    return thresholds[idx_best], accuracy_val, fpr_val, tpr_val
Project: amle    Author: elibol
def roc_auc_score(y_truth, y_pred, num_classes=None):
    return sklearn.metrics.roc_auc_score(*map(partial(to_matrix, num_classes=num_classes), [y_truth, y_pred]))


#########################
# AUTOSKLEARN UTILS
#########################
Project: Aion    Author: aleisalem
def calculateMetrics(truth, predicted):
    """
    Calculates and returns a set of metrics from ground truth and predicted vectors
    :param truth: A list of ground truth labels
    :type truth: list
    :param predicted: A list of predicted labels
    :type predicted: list
    :return: A dict of metrics including accuracy, recall, specificity, precision, and F1-score
    """
    try:
        # Sanity check
        if not len(truth) == len(predicted):
            prettyPrint("The two vectors have different dimensionality", "warning")
            return {}

        metrics = {}
        # Calculate the different metrics
        metrics["accuracy"] = accuracy_score(truth, predicted)
        metrics["recall"] = recall_score(truth, predicted)
        metrics["specificity"] = specificity_score(truth, predicted) # From Aion.utils.misc
        metrics["precision"] = precision_score(truth, predicted)
        metrics["f1score"] = f1_score(truth, predicted)

    except Exception as e:
        prettyPrintError(e)
        return {}

    return metrics
Project: healthcareai-py    Author: HealthCatalyst
def calculate_binary_classification_metrics(trained_sklearn_estimator, x_test, y_test):
    """
    Given a trained estimator, calculate metrics.

    Args:
        trained_sklearn_estimator (sklearn.base.BaseEstimator): a scikit-learn estimator that has been `.fit()`
        x_test (numpy.ndarray): A 2d numpy array of the x_test set (features)
        y_test (numpy.ndarray): A 1d numpy array of the y_test set (true labels)

    Returns:
        dict: A dictionary of metrics objects
    """
    # Squeeze down y_test to 1D
    y_test = np.squeeze(y_test)

    _validate_predictions_and_labels_are_equal_length(x_test, y_test)

    # Get binary and probability classification predictions
    binary_predictions = np.squeeze(trained_sklearn_estimator.predict(x_test))
    probability_predictions = np.squeeze(trained_sklearn_estimator.predict_proba(x_test)[:, 1])

    # Calculate accuracy
    accuracy = skmetrics.accuracy_score(y_test, binary_predictions)
    roc = compute_roc(y_test, probability_predictions)
    pr = compute_pr(y_test, probability_predictions)

    # Unpack the roc and pr dictionaries so the metric lookup is easier for plot and ensemble methods
    return {'accuracy': accuracy, **roc, **pr}
Project: FeatureHub    Author: HDI-Project
def _get_scorings(self):
        """Get scorings for this problem type.

        Returns
        -------
        scorings : list of dict
            Information on metric name and associated "scoring" as defined in
            sklearn.metrics
        scorings_ : list
            List of "scoring" as defined in sklearn.metrics. This is a "utility
            variable" that can be used where we just need the names of the
            scoring functions and not the more complete information.
        """
        # scoring_types maps user-readable name to `scoring`, as argument to
        # cross_val_score
        # See also http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        if self._is_classification():
            scorings = Model.CLASSIFICATION_SCORING
            scorings_ = [s["scoring"] for s in scorings]
        elif self._is_regression():
            scorings = Model.REGRESSION_SCORING
            scorings_ = [s["scoring"] for s in scorings]
        else:
            raise NotImplementedError

        return scorings, scorings_
Project: EvadeML-Zoo    Author: mzweilin
def evalulate_detection_test(Y_detect_test, Y_detect_pred):
    accuracy = sklearn.metrics.accuracy_score(Y_detect_test, Y_detect_pred, normalize=True, sample_weight=None)
    tpr, fpr, tp, ap = get_tpr_fpr(Y_detect_test, Y_detect_pred)
    return accuracy, tpr, fpr, tp, ap
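get_tpr_fpr is a project helper that is not shown here; a hypothetical stand-in (the name tpr_fpr_from_predictions is mine) illustrating how TPR and FPR can be derived from sklearn.metrics.confusion_matrix, which is presumably what that helper does:

import numpy as np
import sklearn.metrics

def tpr_fpr_from_predictions(y_true, y_pred):
    # labels=[0, 1] fixes the tn, fp, fn, tp layout of the flattened matrix
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    tpr = tp / float(tp + fn) if (tp + fn) else 0.0
    fpr = fp / float(fp + tn) if (fp + tn) else 0.0
    return tpr, fpr

y_true = np.array([0, 0, 1, 1, 1])
y_pred = np.array([0, 1, 1, 1, 0])
print(tpr_fpr_from_predictions(y_true, y_pred))  # (0.666..., 0.5)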
Project: tdlstm    Author: bluemonk482
def eval(self, session, feed, saver, early_stopping_rounds, early_stopping_metric_list, early_stopping_metric_minimize=False, metrics='accuracy'):
        test_loss_value, acc_test, pred = session.run(self.test_loss, feed)
        f1_3class, f1_2class = fscores(self.data.dev_y, pred)
        if not self.tuning:
            print("*** Validation Loss = {:.6f}; Validation Accuracy = {:.5f}; 3-class F1 = {:.5f}; 2-class F1 = {:.5f}"
                        .format(test_loss_value, acc_test, f1_3class, f1_2class))
            print()
        early_stop = False
        early_stopping_score = -1
        if metrics == 'accuracy':
            early_stopping_score = acc_test
            early_stopping_metric_list.append(acc_test)
        elif metrics == '3classf1':
            early_stopping_score = f1_3class
            early_stopping_metric_list.append(f1_3class)
        elif metrics == '2classf1':
            early_stopping_score = f1_2class
            early_stopping_metric_list.append(f1_2class)
        assert early_stopping_score > 0

        if (not self.FLAGS.restore) and (early_stopping_metric_minimize): # For minimising the eval score
            if all(early_stopping_score <= i for i in early_stopping_metric_list):
                saver.save(session, self.FLAGS.checkpoint_file)
                # best_eval_score = (acc_test, f1_3class, f1_2class)
            if early_stopping_metric_list[::-1].index(min(early_stopping_metric_list)) > early_stopping_rounds:
                early_stop = True
            return (test_loss_value, (acc_test, f1_3class, f1_2class), early_stop)
        elif not (self.FLAGS.restore and early_stopping_metric_minimize):  # For maximising the eval score
            if all(early_stopping_score >= i for i in early_stopping_metric_list):
                saver.save(session, self.FLAGS.checkpoint_file)
                # best_eval_score = (acc_test, f1_3class, f1_2class)
            if early_stopping_metric_list[::-1].index(max(early_stopping_metric_list)) > early_stopping_rounds:
                early_stop = True
            return (test_loss_value, (acc_test, f1_3class, f1_2class), early_stop)
Project: tdlstm    Author: bluemonk482
def eval(self, session, feed, saver, early_stopping_rounds, early_stopping_metric_list, early_stopping_metric_minimize=False, metrics='accuracy'):
        test_loss_value, acc_test, pred = session.run(self.test_loss, feed)
        f1_3class, f1_2class = fscores(self.data.dev_y, pred)
        if not self.tuning:
            print("*** Validation Loss = {:.6f}; Validation Accuracy = {:.5f}; 3-class F1 = {:.5f}; 2-class F1 = {:.5f}"
                        .format(test_loss_value, acc_test, f1_3class, f1_2class))
            print()
        early_stop = False
        early_stopping_score = -1
        if metrics == 'accuracy':
            early_stopping_score = acc_test
            early_stopping_metric_list.append(acc_test)
        elif metrics == '3classf1':
            early_stopping_score = f1_3class
            early_stopping_metric_list.append(f1_3class)
        elif metrics == '2classf1':
            early_stopping_score = f1_2class
            early_stopping_metric_list.append(f1_2class)
        assert early_stopping_score > 0

        if (not self.FLAGS.restore) and (early_stopping_metric_minimize): # For minimising the eval score
            if all(early_stopping_score <= i for i in early_stopping_metric_list):
                saver.save(session, self.FLAGS.checkpoint_file)
                best_eval_score = (acc_test, f1_3class, f1_2class)
            if early_stopping_metric_list[::-1].index(min(early_stopping_metric_list)) > early_stopping_rounds:
                early_stop = True
            return (test_loss_value, (acc_test, f1_3class, f1_2class), early_stop)
        elif not (self.FLAGS.restore and early_stopping_metric_minimize):  # For maximising the eval score
            if all(early_stopping_score >= i for i in early_stopping_metric_list):
                saver.save(session, self.FLAGS.checkpoint_file)
                best_eval_score = (acc_test, f1_3class, f1_2class)
            if early_stopping_metric_list[::-1].index(max(early_stopping_metric_list)) > early_stopping_rounds:
                early_stop = True
            return (test_loss_value, (acc_test, f1_3class, f1_2class), early_stop)
Project: tdlstm    Author: bluemonk482
def eval(self, session, feed, saver, early_stopping_rounds, early_stopping_metric_list, early_stopping_metric_minimize=False, metrics='accuracy'):
        test_loss_value, acc_test, pred, eval_summary = session.run(self.test_loss, feed)
        f1_3class, f1_2class = fscores(self.data.dev_y, pred)
        if not self.tuning:
            print("*** Validation Loss = {:.6f}; Validation Accuracy = {:.5f}; 3-class F1 = {:.5f}; 2-class F1 = {:.5f}"
                        .format(test_loss_value, acc_test, f1_3class, f1_2class))
            print()
        early_stop = False
        early_stopping_score = -1
        if metrics == 'accuracy':
            early_stopping_score = acc_test
            early_stopping_metric_list.append(acc_test)
        elif metrics == '3classf1':
            early_stopping_score = f1_3class
            early_stopping_metric_list.append(f1_3class)
        elif metrics == '2classf1':
            early_stopping_score = f1_2class
            early_stopping_metric_list.append(f1_2class)
        assert early_stopping_score > 0

        if (not self.FLAGS.restore) and (early_stopping_metric_minimize): # For minimising the eval score
            if all(early_stopping_score <= i for i in early_stopping_metric_list):
                saver.save(session, self.FLAGS.checkpoint_file)
                best_eval_score = (acc_test, f1_3class, f1_2class)
            if early_stopping_metric_list[::-1].index(min(early_stopping_metric_list)) > early_stopping_rounds:
                early_stop = True
            return (test_loss_value, (acc_test, f1_3class, f1_2class), early_stop)
        elif not (self.FLAGS.restore and early_stopping_metric_minimize):  # For maximising the eval score
            if all(early_stopping_score >= i for i in early_stopping_metric_list):
                saver.save(session, self.FLAGS.checkpoint_file)
                best_eval_score = (acc_test, f1_3class, f1_2class)
            if early_stopping_metric_list[::-1].index(max(early_stopping_metric_list)) > early_stopping_rounds:
                early_stop = True
            return (test_loss_value, (acc_test, f1_3class, f1_2class), early_stop, eval_summary)
Project: audit-log-detection    Author: twosixlabs
def compile(self):

        self.model_.compile(optimizer=self.optimizer, loss=self.loss, metrics=None)
Project: FeatureSqueezing    Author: uvasrg
def train_detector(x_train, y_train, x_val, y_val):
    fpr, tpr, thresholds = roc_curve(y_train, x_train)
    accuracy = [ sklearn.metrics.accuracy_score(y_train, x_train>threshold, normalize=True, sample_weight=None) for threshold in thresholds ]
    roc_auc = auc(fpr, tpr)

    idx_best = np.argmax(accuracy)
    print("Best training accuracy: %.4f, TPR(Recall): %.4f, FPR: %.4f @%.4f" % (accuracy[idx_best], tpr[idx_best], fpr[idx_best], thresholds[idx_best]))
    print("ROC_AUC: %.4f" % roc_auc)

    accuracy_val = [ sklearn.metrics.accuracy_score(y_val, x_val>threshold, normalize=True, sample_weight=None) for threshold in thresholds ]
    tpr_val, fpr_val = zip(*[ get_tpr_fpr(y_val, x_val, threshold)  for threshold in thresholds  ])
    # roc_auc_val = auc(fpr_val, tpr_val)
    print("Validation accuracy: %.4f, TPR(Recall): %.4f, FPR: %.4f @%.4f" % (accuracy_val[idx_best], tpr_val[idx_best], fpr_val[idx_best], thresholds[idx_best]))

    # return the threshold that gave the best training accuracy
    return thresholds[idx_best], accuracy_val, fpr_val, tpr_val
Project: semeval2017-scienceie    Author: UKPLab
def build_lstm(output_dim, embeddings):

    loss_function = "categorical_crossentropy"

    # this is the placeholder tensor for the input sequences
    sequence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")

    # this embedding layer will transform the sequences of integers
    embedded = Embedding(embeddings.shape[0], embeddings.shape[1], input_length=MAX_SEQUENCE_LENGTH, weights=[embeddings], trainable=True)(sequence)

    # 4 convolution layers (each 1000 filters)
    cnn_layers = [Convolution1D(filter_length=filters, nb_filter=1000, border_mode="same") for filters in [2, 3, 5, 7]]
    # concatenate
    merged_cnn = merge([layer(embedded) for layer in cnn_layers], mode="concat")
    # create attention vector from max-pooled convoluted
    maxpool = Lambda(lambda x: keras_backend.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
    attention_vector = maxpool(merged_cnn)

    forwards = AttentionLSTM(64, attention_vector)(embedded)
    backwards = AttentionLSTM(64, attention_vector, go_backwards=True)(embedded)

    # concatenate the outputs of the 2 LSTM layers
    bi_lstm = merge([forwards, backwards], mode="concat", concat_axis=-1)

    after_dropout = Dropout(0.5)(bi_lstm)

    # softmax output layer
    output = Dense(output_dim=output_dim, activation="softmax")(after_dropout)

    # the complete model
    model = Model(input=sequence, output=output)

    # try using different optimizers and different optimizer configs
    model.compile("adagrad", loss_function, metrics=["accuracy"])

    return model
Project: python-alp    Author: tboquet
def to_dict_w_opt(model, metrics=None):
    """Serializes a sklearn model. Saves the parameters,
        not the attributes.

    Args:
        model(sklearn.BaseEstimator): the model to serialize,
            must be in SUPPORTED
        metrics(list, optional): a list of metrics to monitor

    Returns:
        a dictionary of the serialized model
    """

    config = dict()
    typestring = str(type(model))[8:][:-2]
    config['config'] = typestring

    attr = model.__dict__

    for k, v in attr.items():
        # check if parameter or attribute
        if k[-1:] == '_':
            # do not store attributes
            pass
        else:
            config[k] = typeconversion(v)

    # to be discussed:
    # we add the metrics to the config even if it doesn't
    # make sense for a sklearn model;
    # the metrics are then caught in model_from_dict_w_opt
    if metrics is not None:
        config['metrics'] = []
        for m in metrics:
            config['metrics'].append(m)

    return config
Project: python-alp    Author: tboquet
def model_from_dict_w_opt(model_dict, custom_objects=None):
    """Builds a sklearn model from a serialized model using `to_dict_w_opt`

    Args:
        model_dict(dict): a serialized sklearn model
        custom_objects(dict, optional): a dictionary mapping custom object
            names to custom objects (callables, etc.)

    Returns:
        A new sklearn.BaseEstimator (in SUPPORTED) instance. The attributes
        are not loaded.

    """
    if custom_objects is None:
        custom_objects = dict()

    # custom_objects = {k: deserialize(k, custom_objects[k])
    #                   for k in custom_objects}

    # safety check
    if model_dict['config'] not in keyval:
        raise NotImplementedError("sklearn model not supported.")

    # load the metrics
    if 'metrics' in model_dict:
        metrics = model_dict.pop('metrics')
    else:
        metrics = None

    # create a new instance of the appropriate model type
    model = copy.deepcopy(keyval[model_dict['config']])

    # load the parameters
    for k, v in model_dict.items():
        if isinstance(v, list):  # pragma: no cover
            setattr(model, k, np.array(v))
        else:
            setattr(model, k, v)

    return model, metrics
Project: kaggle-seizure-prediction    Author: sics-lm
def select_model(training_data, method='logistic',
                 do_segment_split=True,
                 processes=1,
                 cv_verbosity=2,
                 model_params=None,
                 random_state=None):
    """
    Fits a model given by *method* to the training data.
    :param training_data: The training data to fit the model with
    :param method: A string which specifies the model to use.
    :param do_segment_split: If True, the training data will be split by segment.
    :param processes: The number of processes to use for the grid search.
    :param cv_verbosity: The verbosity level of the grid search. 0 is silent, 2 is maximum verbosity.
    :param model_params: An optional dictionary with keyword arguments to tune the grid search.
    :param random_state: A constant which will seed the random number generator if given.
    :return: The fitted grid search object.
    """

    logging.info("Training a {} model".format(method))

    training_data_x = training_data.drop('Preictal', axis=1)
    training_data_y = training_data['Preictal']

    cv = get_cv_generator(training_data, do_segment_split=do_segment_split, random_state=random_state)

    scorer = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score, average='weighted')
    model_dict = get_model(method,
                           training_data_x,
                           training_data_y,
                           model_params=model_params,
                           random_state=random_state)
    common_cv_kwargs = dict(cv=cv,
                            scoring=scorer,
                            n_jobs=processes,
                            pre_dispatch='2*n_jobs',
                            refit=True,
                            verbose=cv_verbosity,
                            iid=False)

    cv_kwargs = dict(common_cv_kwargs)
    cv_kwargs.update(model_dict)

    logging.info("Running grid search using the parameters: {}".format(model_dict))
    clf = GridSearchCV(**cv_kwargs)
    clf.fit(training_data_x, training_data_y)

    return clf
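A self-contained sketch of the scorer construction used above, wired into a small GridSearchCV run on synthetic data (the estimator and parameter grid are illustrative, not the project's):

import sklearn.metrics
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)

# same construction as in select_model: AUC, weighted across classes,
# computed here on the estimator's hard predictions
scorer = sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score, average='weighted')

grid = GridSearchCV(LogisticRegression(solver='liblinear'),
                    param_grid={'C': [0.1, 1.0, 10.0]},
                    scoring=scorer, cv=3, refit=True)
grid.fit(X, y)
print(grid.best_params_, round(grid.best_score_, 4))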
Project: vwoptimize    Author: denik
def classification_report(y_true, y_pred, labels=None, sample_weight=None, digits=4, threshold=None):
    # this function is copied from https://github.com/scikit-learn/scikit-learn/blob/412996f/sklearn/metrics/classification.py#L1341 (c) respective authors
    # I pulled it here to fix a formatting bug.
    from sklearn.metrics import precision_recall_fscore_support, accuracy_score

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if labels is None:
        from sklearn.utils.multiclass import unique_labels

        if threshold is not None:
            y_true = y_true > threshold
            y_pred = y_pred > threshold

        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)

    last_line_heading = 'avg / total'
    target_names = ['%s' % l for l in labels]

    results = [["", "precision", "recall", "f1-score", "support", "accuracy"]]

    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  labels=labels,
                                                  average=None,
                                                  sample_weight=sample_weight)

    for i, label in enumerate(labels):
        values = [target_names[i]]
        for v in (p[i], r[i], f1[i]):
            values += ["{0:0.{1}f}".format(v, digits)]
        values += ["{0}".format(s[i])]
        accuracy = accuracy_score(y_true == label, y_pred == label, sample_weight=sample_weight)
        values += ["{0:0.{1}f}".format(accuracy, digits)]
        results.append(values)

    values = [last_line_heading]
    for v in (np.average(p, weights=s),
              np.average(r, weights=s),
              np.average(f1, weights=s)):
        values += ["{0:0.{1}f}".format(v, digits)]
    values += ['{0}'.format(np.sum(s))]
    accuracy = accuracy_score(y_true, y_pred, sample_weight=sample_weight)
    values += ["{0:0.{1}f}".format(accuracy, digits)]
    results.append(values)

    return results
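Unlike sklearn's built-in version, this variant returns a list of rows instead of a formatted string. A hedged usage sketch with toy labels (assumes the snippet's module-level import numpy as np):

import numpy as np  # required by classification_report above

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]
rows = classification_report(y_true, y_pred, digits=4)

# the caller controls the layout, e.g. right-align every cell
for row in rows:
    print('  '.join(str(cell).rjust(12) for cell in row))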
Project: healthcareai-py    Author: HealthCatalyst
def roc_plot_from_thresholds(roc_thresholds_by_model, save=False, debug=False):
    """
    From a given dictionary of thresholds by model, create a ROC curve for each model.

    Args:
        roc_thresholds_by_model (dict): A dictionary of ROC thresholds by model name.
        save (bool): False to display the image (default) or True to save it (but not display it)
        debug (bool): verbose output.
    """
    # TODO consolidate this and PR plotter into 1 function
    # TODO make the colors randomly generated from rgb values
    # Cycle through the colors list
    color_iterator = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
    # Initialize plot
    plt.figure()
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.plot([0, 1], [0, 1], linestyle=DIAGONAL_LINE_STYLE, color=DIAGONAL_LINE_COLOR)

    # Calculate and plot for each model
    for color, (model_name, metrics) in zip(color_iterator, roc_thresholds_by_model.items()):
        # Extract model name and metrics from dictionary
        roc_auc = metrics['roc_auc']
        tpr = metrics['true_positive_rates']
        fpr = metrics['false_positive_rates']
        best_true_positive_rate = metrics['best_true_positive_rate']
        best_false_positive_rate = metrics['best_false_positive_rate']

        if debug:
            print('{} model:'.format(model_name))
            print(pd.DataFrame({'FPR': fpr, 'TPR': tpr}))

        # plot the line
        label = '{} (ROC AUC = {})'.format(model_name, round(roc_auc, 2))
        plt.plot(fpr, tpr, color=color, label=label)
        plt.plot([best_false_positive_rate], [best_true_positive_rate], marker='*', markersize=10, color=color)

    plt.legend(loc="lower right")

    if save:
        plt.savefig('ROC.png')
        source_path = os.path.dirname(os.path.abspath(__file__))
        print('\nROC plot saved in: {}'.format(source_path))

    plt.show()
Project: healthcareai-py    Author: HealthCatalyst
def pr_plot_from_thresholds(pr_thresholds_by_model, save=False, debug=False):
    """
    From a given dictionary of thresholds by model, create a PR curve for each model.

    Args:
        pr_thresholds_by_model (dict): A dictionary of PR thresholds by model name.
        save (bool): False to display the image (default) or True to save it (but not display it)
        debug (bool): verbose output.
    """
    # TODO consolidate this and PR plotter into 1 function
    # TODO make the colors randomly generated from rgb values
    # Cycle through the colors list
    color_iterator = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
    # Initialize plot
    plt.figure()
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall (PR)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.plot([0, 1], [1, 0], linestyle=DIAGONAL_LINE_STYLE, color=DIAGONAL_LINE_COLOR)

    # Calculate and plot for each model
    for color, (model_name, metrics) in zip(color_iterator, pr_thresholds_by_model.items()):
        # Extract model name and metrics from dictionary
        pr_auc = metrics['pr_auc']
        precision = metrics['precisions']
        recall = metrics['recalls']
        best_recall = metrics['best_recall']
        best_precision = metrics['best_precision']

        if debug:
            print('{} model:'.format(model_name))
            print(pd.DataFrame({'Recall': recall, 'Precision': precision}))

        # plot the line
        label = '{} (PR AUC = {})'.format(model_name, round(pr_auc, 2))
        plt.plot(recall, precision, color=color, label=label)
        plt.plot([best_recall], [best_precision], marker='*', markersize=10, color=color)

    plt.legend(loc="lower left")

    if save:
        plt.savefig('PR.png')
        source_path = os.path.dirname(os.path.abspath(__file__))
        print('\nPR plot saved in: {}'.format(source_path))

    plt.show()
Project: -Python-Analysis_of_wine_quality    Author: ekolik
def decis_tree(wine_set):
    # remember whether the wine_set is red or white
    w = wine_set

    # subset data for better tree visibility
    # wine_set = wine_set[:100]

    # recode quality (response variable) into 2 groups: 0:{3,4,5}, 1:{6,7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # round explanatory data for easier tree
    # wine_set["residual_sugar"] = wine_set["residual_sugar"].round()
    # wine_set["alcohol"] = wine_set["alcohol"].round()

    # split into training and testing sets
    predictors = wine_set[["residual_sugar", 'alcohol']]
    targets = wine_set.quality_c

    pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)

    # build model on training data
    classifier = DecisionTreeClassifier()
    classifier = classifier.fit(pred_train, tar_train)

    predictions = classifier.predict(pred_test)

    # print the confusion matrix and accuracy of the model
    print(sklearn.metrics.confusion_matrix(tar_test, predictions))
    print(sklearn.metrics.accuracy_score(tar_test, predictions))

    # export the tree for viewing
    if w.equals(red):
        export_graphviz(classifier, out_file="red_decision_tree.dot")
    else:
        export_graphviz(classifier, out_file="white_decision_tree.dot")
    # to view the decision tree create a .pdf file from the created .dot file
    # by typing in the terminal from this directory: dot -Tpdf decision_tree.dot -o decision_tree.pdf
# print('----------------Decision Tree------------------------')
# call(decis_tree)


# ____________________________________Random Forests________________
Project: -Python-Analysis_of_wine_quality    Author: ekolik
def random_forests(wine_set):
    # recode quality (response variable) into 2 groups: 0:{3,4,5}, 1:{6,7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # split into training and testing sets
    predictors = wine_set[["density", 'alcohol', 'sulphates', 'pH', 'volatile_acidity', 'chlorides', 'fixed_acidity',
                           'citric_acid', 'residual_sugar', 'free_sulfur_dioxide', 'total_sulfur_dioxide']]

    targets = wine_set.quality_c

    pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)

    # build model on training data#
    classifier = RandomForestClassifier(n_estimators=25)
    classifier = classifier.fit(pred_train, tar_train)

    predictions = classifier.predict(pred_test)
    # print the confusion matrix and accuracy of the model
    print('confusion matrix:\n', sklearn.metrics.confusion_matrix(tar_test, predictions))
    print('\naccuracy:', sklearn.metrics.accuracy_score(tar_test, predictions))

    # to display the relative importance of each predictive variable
    model = ExtraTreesClassifier()
    model.fit(pred_train, tar_train)

    print('importance of predictors:')
    dct = dict()
    for c in range(len(predictors.columns)):
        dct[predictors.columns[c]] = model.feature_importances_[c]
    print(sorted(dct.items(), key=operator.itemgetter(1), reverse=True))

    # run different numbers of trees to see the effect of the number on the accuracy of the prediction
    n = 100
    accuracy = [0]*n

    for i in range(n):
        classifier = RandomForestClassifier(n_estimators=i+1)
        classifier = classifier.fit(pred_train, tar_train)
        predictions = classifier.predict(pred_test)
        accuracy[i] = sklearn.metrics.accuracy_score(tar_test, predictions)

    plt.plot(range(1, n+1), accuracy)
    plt.xlabel("Number of trees")
    plt.ylabel("Accuracy of prediction")
    plt.title("Effect of the number of trees on the prediction accuracy")
    plt.show()

    print(accuracy)

# print('----------------Random Forests------------------------')
# call(random_forests)


# ________________________________Lasso Regression__________________________________
Project: dcase2016_task4    Author: pafoster
def do_system_evaluation(dataset, dataset_evaluation_mode, result_path):

    # Set warnings off, sklearn metrics will trigger warning for classes without
    # predicted samples in F1-scoring. This is just to keep printing clean.
    #warnings.simplefilter("ignore")

    fold_wise_class_eer = numpy.zeros((len(dataset.folds(mode=dataset_evaluation_mode)), dataset.audio_tag_count))

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        class_wise_eer       = numpy.zeros((dataset.audio_tag_count))
        results = []
        result_filename = get_result_filename(fold=fold, path=result_path)
        if os.path.isfile(result_filename):
            with open(result_filename, 'rt') as f:
                for row in csv.reader(f, delimiter=','):
                    results.append(row)
        else:
            raise IOError("Result file not found [%s]" % result_filename)

        for tag_id,tag in enumerate(dataset.audio_tags):

            y_true_binary = []
            y_true_file = []
            y_score = []
            for result in results:
                if tag == result[1]:
                    relative_path = dataset.package_list[0]['local_audio_path'].replace(dataset.local_path,'')[1:] + os.path.sep + result[0]
                    y_true_file.append(result[0])
                    if tag in dataset.file_meta(relative_path)[0]['tags']:
                        y_true_binary.append(1)
                    else:
                        y_true_binary.append(0)

                    y_score.append(float(result[2]))

            if numpy.any(y_true_binary):
                class_wise_eer[tag_id] = compute_eer(result_filename, tag, dict(zip(y_true_file, y_true_binary)))
            else:
                class_wise_eer[tag_id] = None

        fold_wise_class_eer[fold - 1 if fold > 0 else fold, :] = class_wise_eer

    print("  File-wise evaluation, over %d folds" % dataset.fold_count)

    print("     {:20s} | {:8s}".format('Tag', 'EER'))
    print("     ===============================================")
    labels = numpy.array([dataset.tagcode_to_taglabel(t) for t in dataset.audio_tags])
    for i in numpy.argsort(labels):
        print("     {:20s} | {:3.3f} ".format(labels[i],
                                              numpy.nanmean(fold_wise_class_eer[:, i])))
    print("     ===============================================")
    print("     {:20s} | {:3.3f} ".format('Mean error',
                                          numpy.mean(numpy.nanmean(fold_wise_class_eer))))
    # Restore warnings to default settings
    warnings.simplefilter("default")
Project: SinaWeiboSpider    Author: SuperSaiyanSSS
def rand_forest_train(self):
        # load the labelled user data
        users = pd.read_csv('names.csv')
        # use similarity, platform, reputation and entropy as the feature columns
        X = users[['similarity', 'platform', 'reputation', 'entropy']]
        y = users['human_or_machine']

        # split the data, holding out 25% as the test set
        from sklearn.cross_validation import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

        # vectorize the feature dicts with DictVectorizer
        from sklearn.feature_extraction import DictVectorizer
        vec = DictVectorizer(sparse=False)
        X_train = vec.fit_transform(X_train.to_dict(orient='records'))
        X_test = vec.transform(X_test.to_dict(orient='records'))

        # train a single decision tree classifier and predict on the test set
        from sklearn.tree import DecisionTreeClassifier
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        dtc_y_pred = dtc.predict(X_test)

        # train a random forest classifier and predict on the test set
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        rfc.fit(X_train, y_train)
        rfc_y_pred = rfc.predict(X_test)

        # train a gradient boosting classifier and predict on the test set
        from sklearn.ensemble import GradientBoostingClassifier
        gbc = GradientBoostingClassifier()
        gbc.fit(X_train, y_train)
        gbc_y_pred = gbc.predict(X_test)

        from sklearn.metrics import classification_report
        # report accuracy and the precision / recall / F1 breakdown for each classifier
        print("Decision tree accuracy:", dtc.score(X_test, y_test))
        print(classification_report(y_test, dtc_y_pred))

        # random forest: accuracy and precision / recall / F1
        print("Random forest accuracy:", rfc.score(X_test, y_test))
        print(classification_report(y_test, rfc_y_pred))

        # gradient boosting: accuracy and precision / recall / F1
        print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
        print(classification_report(y_test, gbc_y_pred))


        users = pd.read_csv('values.csv')

        # vectorize the unlabelled user data and predict with the random forest
        X = users[['similarity', 'platform', 'reputation', 'entropy']]
        X = vec.transform(X.to_dict(orient='records'))
        print(rfc.predict(X))

        self.dtc = dtc
        self.rfc = rfc
        self.gbc = gbc