Python scipy.spatial.distance 模块,canberra() 实例源码

我们从Python开源项目中,提取了以下5个代码示例,用于说明如何使用scipy.spatial.distance.canberra()

项目:python_mozetl    作者:mozilla    | 项目源码 | 文件源码
def similarity_function(x, y):
    """ Similarity function for comparing user features.

    This actually really should be implemented in taar.similarity_recommender
    and then imported here for consistency.
    """

    def safe_get(field, row, default_value):
        # Safely get a value from the Row. If the value is None, get the
        # default value.
        return row[field] if row[field] is not None else default_value

    # Extract the values for the categorical and continuous features for both
    # the x and y samples. Use an empty string as the default value for missing
    # categorical fields and 0 for the continuous ones.
    x_categorical_features = [safe_get(k, x, "") for k in CATEGORICAL_FEATURES]
    x_continuous_features = [safe_get(k, x, 0) for k in CONTINUOUS_FEATURES]
    y_categorical_features = [safe_get(k, y, "") for k in CATEGORICAL_FEATURES]
    y_continuous_features = [safe_get(k, y, 0) for k in CONTINUOUS_FEATURES]

    # Here a larger distance indicates a poorer match between categorical variables.
    j_d = (distance.hamming(x_categorical_features, y_categorical_features))
    j_c = (distance.canberra(x_continuous_features, y_continuous_features))

    # Take the product of similarities to attain a univariate similarity score.
    # Add a minimal constant to prevent zero values from categorical features.
    # Note: since both the distance function return a Numpy type, we need to
    # call the |item| function to get the underlying Python type. If we don't
    # do that this job will fail when performing KDE due to SPARK-20803 on
    # Spark 2.2.0.
    return abs((j_c + 0.001) * j_d).item()
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def features(self, q1, q2):
        q1 = str(q1).lower().split()
        q2 = str(q2).lower().split()
        q1 = [w for w in q1 if w not in stopwords]
        q2 = [w for w in q2 if w not in stopwords]

        wmd = min(self.model.wmdistance(q1, q2), 10)

        q1vec = self.sent2vec(q1)
        q2vec = self.sent2vec(q2)

        if q1vec is not None and q2vec is not None:
            cos = cosine(q1vec, q2vec)
            city = cityblock(q1vec, q2vec)
            jacc = jaccard(q1vec, q2vec)
            canb = canberra(q1vec, q2vec)
            eucl = euclidean(q1vec, q2vec)
            mink = minkowski(q1vec, q2vec, 3)
            bray = braycurtis(q1vec, q2vec)

            q1_skew = skew(q1vec)
            q2_skew = skew(q2vec)
            q1_kurt = kurtosis(q1vec)
            q2_kurt = kurtosis(q2vec)

        else:
            cos = -1
            city = -1
            jacc = -1
            canb = -1
            eucl = -1
            mink = -1
            bray = -1

            q1_skew = 0
            q2_skew = 0
            q1_kurt = 0
            q2_kurt = 0

        return wmd, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def features(self, q1, q2):
        q1 = str(q1).lower().split()
        q2 = str(q2).lower().split()
        q1 = [w for w in q1 if w not in stopwords]
        q2 = [w for w in q2 if w not in stopwords]

        wmd = min(self.model.wmdistance(q1, q2), 10)
        wmd_norm = min(self.model_norm.wmdistance(q1, q2), 10)

        q1vec = self.sent2vec(q1)
        q2vec = self.sent2vec(q2)

        if q1vec is not None and q2vec is not None:
            cos = cosine(q1vec, q2vec)
            city = cityblock(q1vec, q2vec)
            jacc = jaccard(q1vec, q2vec)
            canb = canberra(q1vec, q2vec)
            eucl = euclidean(q1vec, q2vec)
            mink = minkowski(q1vec, q2vec, 3)
            bray = braycurtis(q1vec, q2vec)

            q1_skew = skew(q1vec)
            q2_skew = skew(q2vec)
            q1_kurt = kurtosis(q1vec)
            q2_kurt = kurtosis(q2vec)

        else:
            cos = -1
            city = -1
            jacc = -1
            canb = -1
            eucl = -1
            mink = -1
            bray = -1

            q1_skew = 0
            q2_skew = 0
            q1_kurt = 0
            q2_kurt = 0

        return wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
项目:teneto    作者:wiheto    | 项目源码 | 文件源码
def getDistanceFunction(requested_metric):
    """

    This function returns a specified distance function.


    **PARAMETERS**

    :'requested_metric': can be 'hamming', 'eculidean' or any of the functions in https://docs.scipy.org/doc/scipy/reference/spatial.distance.html which only require u and v as input. 

    **OUTPUT**

    returns distance function (as function)

    **HISTORY**

    :Created: Dec 2016, WHT
    :Updated (v0.2.1): Aug 2017, WHT. Changed from distance functions being in misc to using scipy. 

    """

    distance_options = {
        'braycurtis': distance.braycurtis,
        'canberra': distance.canberra,
        'chebyshev': distance.chebyshev,
        'cityblock': distance.cityblock,
        'correlation': distance.correlation,
        'cosine': distance.cosine,
        'euclidean': distance.euclidean,
        'sqeuclidean': distance.sqeuclidean,
        'dice': distance.dice,
        'hamming': distance.hamming,
        'jaccard': distance.jaccard,
        'kulsinski': distance.kulsinski,
        'matching': distance.matching,
        'rogerstanimoto': distance.rogerstanimoto,
        'russellrao': distance.russellrao,
        'sokalmichener': distance.sokalmichener,
        'sokalsneath': distance.sokalsneath,
        'yule': distance.yule,
    }

    if requested_metric in distance_options: 
        return distance_options[requested_metric]
    else:
        raise ValueError('Distance function cannot be found.')
项目:zhihu-machine-learning-challenge-2017    作者:HouJP    | 项目源码 | 文件源码
def generate(config, argv):
    # load valid dataset index
    valid_index_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                              config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    valid_index = [num - 1 for num in valid_index]

    # load topic btm vec
    topic_btm_vec = load_topic_btm_vec(config)

    # offline / online
    data_name = argv[0]

    dis_func_names = ["cosine",
                      "cityblock",
                      "jaccard",
                      "canberra",
                      "euclidean",
                      "minkowski",
                      "braycurtis"]

    btm_dis_feature_fn = ['vote_fs_btm_dis_%s' % dis_func_name for dis_func_name in dis_func_names]
    btm_dis_feature_f = [open('%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'),
                                                fn,
                                                data_name), 'w') for fn in btm_dis_feature_fn]

    if 'offline' == data_name:
        btm_tw_cw_features = load_features_from_file(config, 'fs_btm_tw_cw', data_name, valid_index)
        LogUtil.log('INFO', 'load_features_from_file, len=%d' % len(btm_tw_cw_features))
        for line_id in range(len(btm_tw_cw_features)):
            doc_vec = btm_tw_cw_features[line_id]
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))
    else:
        btm_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
        btm_vec_f = open(btm_vec_fp, 'r')
        for line in btm_vec_f:
            doc_vec = np.nan_to_num(parse_feature_vec(line))
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))

    for f in btm_dis_feature_f:
        f.close()