Python fuzzywuzzy.fuzz 模块,token_sort_ratio() 实例源码

我们从Python开源项目中,提取了以下19个代码示例,用于说明如何使用fuzzywuzzy.fuzz.token_sort_ratio()

项目:tipi-engine    作者:CIECODE-Madrid    | 项目源码 | 文件源码
def autorsAmendment(self, autors):
    """Resolve the first author of an amendment to a canonical DB name.

    :param autors: list of raw author strings; only the first entry is used.
    :return: the best-matching ``'nombre'`` from ``self.members`` when the
             author is not a parliamentary group; otherwise the stripped
             raw name itself.
    """
    # Normalise the raw author string so it can be looked up in the DB.
    name = autors[0].strip()
    typeaut = self.typeAutor(name=name)
    # BUG FIX: the original used ``is not 'grupo'`` — identity comparison
    # against a string literal is implementation-dependent; use ``!=``.
    if typeaut != 'grupo':
        best_ratio = 0
        member = None
        for memb in self.members:
            ratio = fuzz.token_sort_ratio(name, memb['nombre'])
            if ratio > best_ratio:
                member = memb
                best_ratio = ratio
        # Guard: the original crashed with a TypeError (None['nombre'])
        # when self.members was empty; fall back to the raw name instead.
        return member['nombre'] if member is not None else name
    return name
项目:kaggle-quora-dup    作者:aerdem4    | 项目源码 | 文件源码
def extract_features(df):
    """Populate *df* in place with token-based and fuzzy similarity features.

    Returns the same dataframe for convenience.
    """
    for col in ("question1", "question2"):
        df[col] = df[col].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1)
    # get_token_features yields a fixed-length tuple; unpack it column-wise.
    token_columns = ("cwc_min", "cwc_max", "csc_min", "csc_max",
                     "ctc_min", "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len")
    for idx, col in enumerate(token_columns):
        df[col] = [feats[idx] for feats in token_features]

    print("fuzzy features..")
    pair_scorers = (("token_set_ratio", fuzz.token_set_ratio),
                    ("token_sort_ratio", fuzz.token_sort_ratio),
                    ("fuzz_ratio", fuzz.QRatio),
                    ("fuzz_partial_ratio", fuzz.partial_ratio),
                    ("longest_substr_ratio", get_longest_substr_ratio))
    for col, scorer in pair_scorers:
        # Default-arg binding avoids the late-binding closure pitfall.
        df[col] = df.apply(
            lambda row, fn=scorer: fn(row["question1"], row["question2"]),
            axis=1)
    return df
项目:PaStA    作者:lfd    | 项目源码 | 文件源码
def evaluate_patch_pair(thresholds, lhs, rhs):
    """Rate the similarity of two (message, diff) patch pairs as a SimRating."""
    l_message, l_diff = lhs
    r_message, r_diff = rhs

    # Ratio of the smaller diff's line count to the larger one's, in (0, 1].
    lines_l = l_diff.lines
    lines_r = r_diff.lines
    diff_lines_ratio = min(lines_l, lines_r) / max(lines_l, lines_r)

    # Commit-message similarity, normalised from [0, 100] to [0, 1].
    msg_rating = fuzz.token_sort_ratio(l_message, r_message) / 100

    # Diffs whose sizes differ by a factor of 100+ aren't worth comparing.
    if diff_lines_ratio < 0.01:
        return SimRating(msg_rating, 0, diff_lines_ratio)

    diff_rating = rate_diffs(thresholds, l_diff, r_diff)
    return SimRating(msg_rating, diff_rating, diff_lines_ratio)
项目:openrefine-wikidata    作者:wetneb    | 项目源码 | 文件源码
def fuzzy_match_strings(ref, val):
    """
    Returns the matching score of two values.

    Wikidata Q-ids compare exactly (100 or 0); plain text is compared with
    an accent-stripped, lower-cased token-sort ratio in both directions.
    """
    if not ref or not val:
        return 0
    ref_q = to_q(ref)
    val_q = to_q(val)
    # If either side resolves to a Q-id, only an exact id match counts.
    if ref_q or val_q:
        return 100 if ref_q == val_q else 0
    simplified_val = unidecode(val).lower()
    simplified_ref = unidecode(ref).lower()

    # BUG FIX: the original overwrote r2 with r1 right after computing it,
    # so the intended symmetric average degenerated to r1 alone. Keep both
    # directions and average them.
    r1 = fuzz.token_sort_ratio(simplified_val, simplified_ref)
    r2 = fuzz.token_sort_ratio(simplified_ref, simplified_val)
    return int(0.5 * (r1 + r2))
项目:tipi-engine    作者:CIECODE-Madrid    | 项目源码 | 文件源码
def matchautorgroup(self, lists):
    """For each name in *lists*, append the closest member/group record.

    Appends None for a name when no candidate scores above zero.
    """
    candidates = self.members + self.groups
    matched = []
    for name in lists:
        best = None
        best_ratio = 0
        for candidate in candidates:
            score = fuzz.token_sort_ratio(name, candidate['nombre'])
            if score > best_ratio:
                best = candidate
                best_ratio = score
        matched.append(best)
    return matched
项目:the-magical-csv-merge-machine    作者:entrepreneur-interet-general    | 项目源码 | 文件源码
def score_tokens(src, ref, translate_tokens):
    """Score two strings in [0, 100] through staged fuzzy-token filters.

    Any stage falling below its threshold rejects the pair with score 0.
    """
    if translate_tokens:
        # Translate once, then score without further translation.
        return score_tokens(translate(src), translate(ref), False)

    src_tokens = validateTokens(src)
    ref_tokens = validateTokens(ref)
    src_joined = ' '.join(src_tokens)
    ref_joined = ' '.join(ref_tokens)

    sort_ratio = fuzz.token_sort_ratio(src_joined, ref_joined)
    if sort_ratio < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0
    set_ratio = fuzz.token_set_ratio(src_joined, ref_joined)
    if set_ratio < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0

    if REQUIRES_SHARED_PROPER_NOUN:
        src_proper = ' '.join(filterProperNouns(src_tokens))
        ref_proper = ' '.join(filterProperNouns(ref_tokens))
        # Only enforce proper-noun agreement when at least one side has one.
        if len(src_proper) > 0 or len(ref_proper) > 0:
            if fuzz.token_sort_ratio(src_proper, ref_proper) < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            if fuzz.token_set_ratio(src_proper, ref_proper) < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0

    # Combined score stays within [0, 100].
    return sort_ratio * set_ratio / 100
项目:PaStA    作者:lfd    | 项目源码 | 文件源码
def best_string_mapping(threshold, left_list, right_list):
    """
    Find the closest mapping with the best weight between two lists of
    strings, keeping only pairs whose similarity is at least *threshold*
    (a ratio in [0, 1]).

    Example:

      List A        List B

    0:  'abc'         'abc'
    1:  'cde'         'cde'
    2:  'fgh'         'fgh'
    3:                'fgj'

    Each element of List A is mapped to an element of List B with respect
    to the given threshold. As a[{0,1,2}] == b[{0,1,2}], those values are
    mapped automatically. Additionally, a[2] will also be mapped to b[3]
    if the threshold is low enough (cf. 0.5).

    The result is the union of both mapping directions, as a set of
    (left, right) tuples.
    """
    def injective_map(ll, rl, inverse_result=False):
        # Maps each entry of ll to its best-scoring entry of rl:
        # l_entry -> (r_entry, similarity).
        ret = dict()
        for l_entry in ll:
            for r_entry in rl:
                # Exact matches short-circuit to a perfect score.
                if l_entry == r_entry:
                    sim = 1
                else:
                    sim = fuzz.token_sort_ratio(l_entry, r_entry) / 100

                # Discard pairs below the acceptance threshold.
                if sim < threshold:
                    continue

                # Keep the best candidate seen so far; on an exact tie the
                # later candidate wins (only strictly-worse ones are skipped).
                if l_entry in ret:
                    _, old_sim = ret[l_entry]
                    if sim < old_sim:
                        continue

                ret[l_entry] = r_entry, sim
        # Normalise to (left, right) orientation, dropping the similarity.
        return {(r, l) if inverse_result else (l, r) for l, (r, _) in ret.items()}

    # Union of both directions makes the mapping symmetric overall.
    return injective_map(left_list, right_list) | injective_map(right_list, left_list, True)
项目:PaStA    作者:lfd    | 项目源码 | 文件源码
def rate_diffs(thresholds, l_diff, r_diff):
    """Return a diff-similarity rating in [0, 1] for two patch diffs.

    Filenames and hunk headings are paired via best_string_mapping; the
    rating is the mean of per-file mean hunk similarities.
    """
    def hunk_similarity(a, b):
        # Identical hunks happen e.g. when both only remove empty newlines.
        if a == b:
            return 100
        return fuzz.token_sort_ratio(a, b)

    file_ratings = []
    filename_pairs = best_string_mapping(thresholds.filename,
                                         l_diff.patches.keys(),
                                         r_diff.patches.keys())

    for l_name, r_name in filename_pairs:
        l_hunks = l_diff.patches[l_name]
        r_hunks = r_diff.patches[r_name]

        hunk_ratings = []
        heading_pairs = best_string_mapping(thresholds.heading,
                                            l_hunks.keys(), r_hunks.keys())

        for l_heading, r_heading in heading_pairs:
            lh = l_hunks[l_heading]
            rh = r_hunks[r_heading]

            # Compare deletions with deletions and insertions with
            # insertions, but only when both sides actually have them.
            if lh.deletions and rh.deletions:
                hunk_ratings.append(hunk_similarity(lh.deletions,
                                                    rh.deletions))
            if lh.insertions and rh.insertions:
                hunk_ratings.append(hunk_similarity(lh.insertions,
                                                    rh.insertions))

        if hunk_ratings:
            file_ratings.append(mean(hunk_ratings))

    # No comparable content at all rates as zero similarity.
    if not file_ratings:
        return 0.0

    return mean(file_ratings) / 100
项目:PaStA    作者:lfd    | 项目源码 | 文件源码
def preevaluate_filenames(thresholds, right_files, left_file):
    """Select candidate filenames from *right_files* that may match *left_file*.

    Returns (left_file, candidates).
    """
    limit = thresholds.filename
    if limit >= 1.0:
        # A threshold of 1.0 (or more) demands exact filename equality.
        candidates = [name for name in right_files if name == left_file]
    else:
        candidates = [name for name in right_files
                      if fuzz.token_sort_ratio(left_file, name) / 100 >= limit]
    return left_file, candidates
项目:ModTools    作者:MattBSG    | 项目源码 | 文件源码
def strict_compare_strings(string_one, string_two):
    """Return the best score among four fuzzywuzzy comparison strategies."""
    scorers = (fuzz.ratio,
               fuzz.partial_ratio,
               fuzz.token_sort_ratio,
               fuzz.token_set_ratio)
    # fuzzywuzzy ratios are never negative, so a plain max suffices.
    return max(scorer(string_one, string_two) for scorer in scorers)
项目:ModTools    作者:MattBSG    | 项目源码 | 文件源码
def compare_strings(string_one, string_two):
    """Return the best score among three fuzzywuzzy comparison strategies."""
    scorers = (fuzz.ratio,
               fuzz.token_sort_ratio,
               fuzz.token_set_ratio)
    # fuzzywuzzy ratios are never negative, so a plain max suffices.
    return max(scorer(string_one, string_two) for scorer in scorers)
项目:GetUrRecon    作者:cmh2166    | 项目源码 | 文件源码
def get_CUL_score(record_elems, resp_elems):
    """Best token_sort_ratio between record and response element(s).

    Either argument may be a single string or a sequence of strings; the
    highest pairwise score is returned as a string (keeping the original
    interface). Returns None when an argument is missing or no pairs exist.
    """
    if record_elems is None or resp_elems is None:
        return None

    # Normalise both sides to lists so one pairwise loop covers all four
    # str/sequence combinations of the original.
    records = [record_elems] if isinstance(record_elems, str) else record_elems
    resps = [resp_elems] if isinstance(resp_elems, str) else resp_elems

    # BUG FIX: the original stringified every score *before* calling max(),
    # so max() compared lexicographically ("9" > "85"). Compare as ints and
    # stringify only the final result. Also avoids the original's
    # ValueError from max([]) when one side is an empty sequence.
    scores = [fuzz.token_sort_ratio(rec, resp)
              for rec in records for resp in resps]
    if not scores:
        return None
    return str(max(scores))
项目:Snakepit    作者:K4lium    | 项目源码 | 文件源码
def computeSimilarity(s1, s2):
    """Return a dissimilarity in [0.0, 1.0]: 0 for identical strings."""
    best_ratio = max(fuzz.ratio(s1, s2),
                     fuzz.token_sort_ratio(s1, s2),
                     fuzz.token_set_ratio(s1, s2))
    # Ratios are on a 0-100 scale; invert into a distance.
    return 1.0 - 0.01 * best_ratio
项目:NSIT-Bot    作者:gabru-md    | 项目源码 | 文件源码
def sim(str1, str2):
    """Token-sort similarity rescaled from [0, 100] to [0.0, 1.0]."""
    return 0.01 * fuzz.token_sort_ratio(str1, str2)
项目:python-zentropi    作者:zentropi    | 项目源码 | 文件源码
def match_fuzzy(self, frame):
    """Fuzzy-match *frame*'s name against the index.

    Returns (frame, handler set); the set is empty when nothing matches
    above MATCH_FUZZY_THRESHOLD.
    """
    best = process.extractOne(frame.name, self._index_fuzzy,
                              scorer=fuzz.token_sort_ratio)
    # extractOne yields (pattern, score) or None when the index is empty.
    if best and best[1] >= MATCH_FUZZY_THRESHOLD:
        return frame, self._handlers[best[0]]
    return frame, set()
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def main(conf):
    """Compute fuzzy features and dump train/test CSVs into the dump dir."""
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    # Both dumps share the same feature columns; only the id columns differ.
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]

    logging.info('Writing train dataset to disk')
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_df[train_columns].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_df[test_columns].to_csv(join_path(dump_dir, 'test.csv'), index=False)
项目:stocks-list    作者:akashgiri    | 项目源码 | 文件源码
def is_fuzzy_matching_valid(self, stock_name, current_stock):
    """True when the two stock names are near-identical.

    Uses fuzzywuzzy's token sort ratio with a strict cut-off of 95.
    """
    return fuzz.token_sort_ratio(stock_name, current_stock) > 95
项目:matchtools    作者:matchtools    | 项目源码 | 文件源码
def compare_strings(cls, string1, string2, *, tolerance=None,
                    method='uwratio'):
    """
    Check if the strings provided have a similarity ratio within the
    specified tolerance.

    Return True if yes, otherwise return False.

    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number
    :param method: str, one of: 'uwratio', 'partial_ratio',
                                'token_sort_ratio', 'token_set_ratio',
                                'ratio'
    :rtype: bool

    :Example:

    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True

    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    # A digit anywhere in either string selects the stricter default
    # tolerance and disables the abbreviation shortcut.
    contains_digit = any(
        char.isdigit() for text in (string1, string2) for char in text)

    if tolerance is None:
        tolerance = (cls.str_number_tolerance if contains_digit
                     else cls.string_tolerance)

    # Pure-text abbreviations short-circuit to a match (checked before
    # method validation, matching the original control flow).
    if not contains_digit and cls.is_abbreviation(string1, string2):
        return True

    scorers = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}

    if method not in scorers:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(scorers))))

    return scorers[method](string1, string2) >= 100 - tolerance
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def compute_features(train_df, test_df):
    """Add fuzzywuzzy similarity features to both dataframes in place.

    For each scorer, the feature column is computed for the train and test
    frames and a quality estimate is derived from the training data.

    :param train_df: training dataframe with question1/question2 columns.
    :param test_df: test dataframe with question1/question2 columns.
    :returns: dict mapping 'quality_<feature>' to its quality estimate.
    """
    # The original repeated the same three-statement stanza seven times;
    # drive it from one (quality key, feature column, scorer) table so the
    # scorers stay in sync and the dict keys match the original exactly.
    scorers = [
        ('quality_qratio', Fields.qratio, fuzz.QRatio),
        ('quality_wratio', Fields.wratio, fuzz.WRatio),
        ('quality_partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('quality_partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('quality_partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('quality_token_set_ratio', Fields.token_set_ratio,
         fuzz.token_set_ratio),
        ('quality_token_sort_ratio', Fields.token_sort_ratio,
         fuzz.token_sort_ratio),
    ]

    quality = {}
    for quality_key, field, scorer in scorers:
        # Default-arg binding pins the scorer for each lambda.
        train_df[field] = train_df.apply(
            lambda row, fn=scorer: fn(str(row[FieldsTrain.question1]),
                                      str(row[FieldsTrain.question2])),
            axis=1)
        test_df[field] = test_df.apply(
            lambda row, fn=scorer: fn(str(row[FieldsTest.question1]),
                                      str(row[FieldsTest.question2])),
            axis=1)
        quality[quality_key] = compute_quality(train_df, field)

    return quality