Python fuzzywuzzy.fuzz 模块,token_set_ratio() 实例源码

我们从Python开源项目中,提取了以下15个代码示例,用于说明如何使用fuzzywuzzy.fuzz.token_set_ratio()

项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def _compute_author_similarity(self, paired_authors):
        """Append fuzzy similarity columns to a frame of author pairs.

        For every row the following values are computed and joined back
        onto ``paired_authors``:

        * ``same_email`` - ``author_email == author_email_other``
        * ``name_similarity`` - token-set ratio of the two author names
        * ``email_name_similarity`` - plain ratio of ``email_name`` and
          ``email_name_other``
        * ``name_to_email_similarity`` - token-set ratio of ``author_name``
          and ``name_from_email_other``

        :param paired_authors: DataFrame whose rows expose the attributes
            read above
        :return: ``paired_authors`` joined with the four new columns
        """
        similarity_columns = ['same_email', 'name_similarity',
                              'email_name_similarity',
                              'name_to_email_similarity']

        def score_pair(pair):
            emails_match = pair.author_email == pair.author_email_other
            names_score = fuzz.token_set_ratio(pair.author_name,
                                               pair.author_name_other)
            email_names_score = fuzz.ratio(pair.email_name,
                                           pair.email_name_other)
            name_vs_email_score = fuzz.token_set_ratio(
                pair.author_name, pair.name_from_email_other)
            return pd.Series([emails_match, names_score, email_names_score,
                              name_vs_email_score])

        scores = paired_authors.apply(score_pair, axis=1)
        # The applied Series carry positional labels; rename them before
        # joining so the new columns get meaningful names.
        scores.columns = similarity_columns
        return paired_authors.join(scores)
项目:kaggle-quora-dup    作者:aerdem4    | 项目源码 | 文件源码
def extract_features(df):
    """Add token-based and fuzzy-matching feature columns to ``df`` in place.

    ``question1`` and ``question2`` are first normalised (missing values
    become ``""`` and ``preprocess`` is applied).  Then, per row:

    * ``get_token_features`` is called once and its ten positional values
      are spread over ten named columns;
    * four fuzzywuzzy scorers and ``get_longest_substr_ratio`` are applied
      to the question pair, one column each.

    :param df: DataFrame with ``question1`` and ``question2`` columns
    :return: the same DataFrame object, with the extra columns added
    """
    for question_column in ("question1", "question2"):
        df[question_column] = df[question_column].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1)
    # Column names in the positional order of get_token_features' result.
    token_columns = (
        "cwc_min", "cwc_max",
        "csc_min", "csc_max",
        "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq",
        "abs_len_diff", "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        # Bind the scorer as a default so each lambda keeps its own function.
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["question1"],
                                              row["question2"]),
            axis=1)
    return df
项目:vityBot    作者:GDGVIT    | 项目源码 | 文件源码
def find_match(query_string):
    """
    Pick the faculty entry whose name best matches the query string.

    Every entry of the module-level ``data`` is scored with
    ``fuzz.token_set_ratio`` against ``query_string``; on ties the first
    entry with the highest score is kept.

    :param query_string: text to match against each entry's ``'name'``
    :return: the best-scoring faculty dictionary, or None when the best
        score is not above 40
    """
    global data

    best_score = 0
    best_faculty = None

    for candidate in data:
        score = fuzz.token_set_ratio(candidate['name'], query_string)
        if score <= best_score:
            continue
        best_score = score
        best_faculty = candidate

    if best_score > 40:
        return best_faculty
    return None
项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def compare_names(name1: ParsedName, name2: ParsedName):
        """Score the similarity of two parsed names.

        When ``proper`` holds for both names the token-set ratio is used;
        otherwise the plain ratio is used.

        :param name1: first parsed name (its ``name`` attribute is compared)
        :param name2: second parsed name
        :return: the score produced by the selected fuzzywuzzy scorer
        """
        both_proper = proper(name1) and proper(name2)
        scorer = fuzz.token_set_ratio if both_proper else fuzz.ratio
        return scorer(name1.name, name2.name)
项目:the-magical-csv-merge-machine    作者:entrepreneur-interet-general    | 项目源码 | 文件源码
def score_tokens(src, ref, translate_tokens):
    """Score how well ``src`` matches ``ref`` on a [0, 100] scale.

    Both inputs are tokenised with ``validateTokens`` and re-joined with
    single spaces.  A candidate is rejected (score 0, with a debug log
    line) as soon as one of these gates fails:

    * token-sort ratio below 40
    * token-set ratio below 50
    * when ``REQUIRES_SHARED_PROPER_NOUN`` is set and at least one side has
      proper nouns: proper-noun token-sort ratio below 80, or proper-noun
      token-set ratio below 60

    :param src: source text
    :param ref: reference text
    :param translate_tokens: when truthy, both texts are passed through
        ``translate`` and scored once more without translation
    :return: 0 on rejection, otherwise ``token_sort * token_set / 100``
    """
    if translate_tokens:
        return score_tokens(translate(src), translate(ref), False)

    src_tokens = validateTokens(src)
    ref_tokens = validateTokens(ref)
    src_joined = ' '.join(src_tokens)
    ref_joined = ' '.join(ref_tokens)

    sort_score = fuzz.token_sort_ratio(src_joined, ref_joined)
    if sort_score < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0

    set_score = fuzz.token_set_ratio(src_joined, ref_joined)
    if set_score < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0

    if REQUIRES_SHARED_PROPER_NOUN:
        src_proper = ' '.join(filterProperNouns(src_tokens))
        ref_proper = ' '.join(filterProperNouns(ref_tokens))
        # Only gate on proper nouns when at least one side has any.
        if src_proper or ref_proper:
            proper_sort_score = fuzz.token_sort_ratio(src_proper, ref_proper)
            if proper_sort_score < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            proper_set_score = fuzz.token_set_ratio(src_proper, ref_proper)
            if proper_set_score < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0

    return sort_score * set_score / 100
项目:vityBot    作者:GDGVIT    | 项目源码 | 文件源码
def find_match(query, intent):
    """
    Return the answer of the stored document that best matches the query.

    Every document of the module-level collection ``col`` with the given
    ``intent`` is scored by ``fuzz.token_set_ratio`` of its ``'text'``
    against ``query``; on ties the first document with the highest score
    wins.

    :param query: text of the user's query
    :param intent: value matched against each document's ``intent`` field
    :return: the ``'answer'`` of the best-scoring document, or None when no
        document scores above 0 (including when none has this intent)
    """
    global col
    best_doc = None
    best_ratio = 0

    for candidate in col.find({"intent": intent}):
        ratio = fuzz.token_set_ratio(candidate['text'], query)

        if ratio > best_ratio:
            best_ratio = ratio
            best_doc = candidate

    # Previously a missing match fell through to ``del doc['_id']`` on None
    # and raised TypeError; report "no match" explicitly instead.
    if best_doc is None:
        return None

    # Only the answer is returned, so the '_id' key need not be stripped.
    return best_doc['answer']
项目:vityBot    作者:GDGVIT    | 项目源码 | 文件源码
def find_match(course_list, query_string):
    """
    Find the course whose names best match the user's query.

    Courses of type 'Embedded Lab' are skipped unless the query contains
    'lab' (case-insensitive); when it does, 'Embedded Theory' courses are
    skipped instead.  A course's score is the highest
    ``fuzz.token_set_ratio`` over all of its names.

    :param course_list: list of courses
    :param query_string: query of the user
    :return: the best-scoring course, or None when the best score is not
        above 50
    """
    best_score = 0  # highest score seen across all courses
    best_course = None

    for course in course_list:
        wants_lab = 'lab' in query_string.lower()
        excluded_type = 'Embedded Theory' if wants_lab else 'Embedded Lab'
        if course.subject_type == excluded_type:
            continue

        # Highest score among this course's names; 0 when it has none.
        course_score = max(
            [0] + [fuzz.token_set_ratio(name, query_string)
                   for name in course.names])

        if course_score > best_score:
            best_score, best_course = course_score, course

    if best_score > 50:
        return best_course
    return None
项目:ModTools    作者:MattBSG    | 项目源码 | 文件源码
def strict_compare_strings(string_one, string_two):
    """Return the best fuzzywuzzy score for two strings.

    The result is the maximum of ``ratio``, ``partial_ratio``,
    ``token_sort_ratio`` and ``token_set_ratio`` (never below 0).  Each
    scorer is evaluated exactly once.

    :param string_one: first string
    :param string_two: second string
    :return: the highest of the four scores
    """
    return max(
        0,
        fuzz.ratio(string_one, string_two),
        fuzz.partial_ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
项目:ModTools    作者:MattBSG    | 项目源码 | 文件源码
def compare_strings(string_one, string_two):
    """Return the best fuzzywuzzy score for two strings.

    The result is the maximum of ``ratio``, ``token_sort_ratio`` and
    ``token_set_ratio`` (never below 0).  Each scorer is evaluated exactly
    once.

    :param string_one: first string
    :param string_two: second string
    :return: the highest of the three scores
    """
    return max(
        0,
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
项目:Question-Answering-System    作者:AdityaAS    | 项目源码 | 文件源码
def enter(MSG):
    """
    Answer a question by fuzzy-matching it against the module's data tables.

    The message is lower-cased, one trailing '?' is dropped, it is tokenised
    with ``nltk.word_tokenize`` and every token found in ``words`` is
    removed.  The first remaining token ("who", "what" or "how") selects the
    table to search; the other tokens form the text that is matched against
    the first item of each table entry.  The second item of the first entry
    whose token-set ratio is at least 60 is printed; otherwise
    "Question Not found" is printed.

    An empty message, or one with no tokens left after filtering, is
    reported as "Question Not found" instead of raising IndexError.

    :param MSG: the question text
    :return: None; the outcome is printed
    """
    msg = MSG.lower()
    if msg.endswith('?'):
        msg = msg[:-1]

    # Build the lookup once so filtering is a single pass over the tokens.
    ignored = set(words)
    tokens = [token for token in nltk.word_tokenize(msg)
              if token not in ignored]

    question_word = tokens[0] if tokens else None
    if question_word == "who":
        candidates = data_who
    elif question_word == "what":
        candidates = data_what
    elif question_word == 'how':
        candidates = data_how
    else:
        candidates = []

    msg = ' '.join(tokens[1:])
    for entry in candidates:
        if fuzz.token_set_ratio(entry[0], msg) >= 60:
            # Single-argument parenthesised print behaves the same on
            # Python 2 and Python 3.
            print(entry[1])
            return
    print("Question Not found")
项目:Snakepit    作者:K4lium    | 项目源码 | 文件源码
def computeSimilarity(s1, s2):
    """Return ``1.0 - best / 100`` for the best fuzzy score of two strings.

    ``best`` is the maximum of ``ratio``, ``token_sort_ratio`` and
    ``token_set_ratio``.  Note that, despite the function's name, a higher
    fuzzy score yields a lower return value.

    :param s1: first string
    :param s2: second string
    :return: float derived from the best of the three scores
    """
    scores = (
        fuzz.ratio(s1, s2),
        fuzz.token_sort_ratio(s1, s2),
        fuzz.token_set_ratio(s1, s2),
    )
    best_score = max(scores)
    return 1.0 - (0.01 * best_score)
项目:watcher    作者:nosmokingbandit    | 项目源码 | 文件源码
def _match_torrent_name(self, movie_title, movie_year, torrent_title):
        ''' Decides whether torrent_title is a good match for movie_title
        movie_title: str title of movie
        movie_year: str year of movie release
        torrent_title: str title of torrent

        Helper function for rss_sync.

        Torrent indexers don't supply an imdbid the way NewzNab does, so the
            titles themselves are compared to find a match.

        A torrent whose title does not contain movie_year is rejected
            outright. Otherwise spaces and colons in both titles are turned
            into dots, both are lower-cased, and the token set ratio of the
            two must be strictly greater than 80.

        Returns bool on match success
        '''

        if movie_year not in torrent_title:
            return False

        def dotted(text):
            # Normalise separators so both titles use the same layout.
            return text.replace(' ', '.').replace(':', '.').lower()

        score = fuzz.token_set_ratio(dotted(movie_title),
                                     dotted(torrent_title))
        return score > 80
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def main(conf):
    """Compute fuzzy features for the train and test sets and dump them as CSV.

    Reads the dump directory from ``conf['fuzzy.dump.dir']`` (created via
    ``makedirs``) and loads both datasets from ``conf['fuzzy.dataset']``.
    After ``compute_features`` has added the feature columns, the
    identifying columns plus the seven feature columns are written to
    ``train.csv`` and ``test.csv`` in the dump directory, without the index.

    :param conf: mapping providing 'fuzzy.dump.dir' and 'fuzzy.dataset'
    """
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    # Feature columns shared by both output files, in output order.
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]

    logging.info('Writing train dataset to disk')
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_path = join_path(dump_dir, 'train.csv')
    train_df[train_columns].to_csv(train_path, index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_path = join_path(dump_dir, 'test.csv')
    test_df[test_columns].to_csv(test_path, index=False)
项目:matchtools    作者:matchtools    | 项目源码 | 文件源码
def compare_strings(cls, string1, string2, *, tolerance=None,
                        method='uwratio'):
        """
        Tell whether two strings are similar enough under a tolerance.

        The strings match when the chosen fuzzywuzzy score is at least
        ``100 - tolerance``.  When no tolerance is given it defaults to
        ``cls.str_number_tolerance`` if either string contains a digit and
        to ``cls.string_tolerance`` otherwise.  Digit-free strings that
        ``cls.is_abbreviation`` accepts match immediately, before the
        method name is validated.

        Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

        :param string1: str
        :param string2: str
        :param tolerance: number
        :param method: str, one of: 'uwratio', 'partial_ratio',
                                    'token_sort_ratio', 'token_set_ratio',
                                    'ratio'
        :rtype: bool
        :raises ValueError: if ``method`` is not one of the names above

        :Example:

        >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
        True

        >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
        False
        """

        has_digit = (any(char.isdigit() for char in string1)
                     or any(char.isdigit() for char in string2))

        if tolerance is None:
            tolerance = (cls.str_number_tolerance if has_digit
                         else cls.string_tolerance)

        if not has_digit and cls.is_abbreviation(string1, string2):
            return True

        scorers = {'uwratio': fuzz.UWRatio,
                   'partial_ratio': fuzz.partial_ratio,
                   'token_sort_ratio': fuzz.token_sort_ratio,
                   'token_set_ratio': fuzz.token_set_ratio,
                   'ratio': fuzz.ratio}

        scorer = scorers.get(method)
        if scorer is None:
            msg = 'wrong method, use available: {}'
            raise ValueError(msg.format(', '.join(sorted(scorers))))

        threshold = 100 - tolerance
        return scorer(string1, string2) >= threshold
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def compute_features(train_df, test_df):
    """Add seven fuzzywuzzy feature columns to both frames and rate each one.

    For every scorer, in turn: the question pair of each train row and each
    test row is converted to ``str`` and scored, the result is stored under
    the matching ``Fields`` column, and ``compute_quality`` is evaluated on
    the train frame for that column.

    :param train_df: train DataFrame (question columns named by FieldsTrain)
    :param test_df: test DataFrame (question columns named by FieldsTest)
    :return: dict mapping ``quality_<feature>`` to the value returned by
        ``compute_quality`` for that feature
    """
    # (result key, output column, scorer) in processing order.
    features = (
        ('quality_qratio', Fields.qratio, fuzz.QRatio),
        ('quality_wratio', Fields.wratio, fuzz.WRatio),
        ('quality_partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('quality_partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('quality_partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('quality_token_set_ratio', Fields.token_set_ratio,
         fuzz.token_set_ratio),
        ('quality_token_sort_ratio', Fields.token_sort_ratio,
         fuzz.token_sort_ratio),
    )

    def row_scorer(scorer, fields):
        # Build the per-row callable for one scorer and one frame layout.
        return lambda row: scorer(str(row[fields.question1]),
                                  str(row[fields.question2]))

    quality = {}
    for quality_key, column, scorer in features:
        train_df[column] = train_df.apply(row_scorer(scorer, FieldsTrain),
                                          axis=1)
        test_df[column] = test_df.apply(row_scorer(scorer, FieldsTest),
                                        axis=1)
        quality[quality_key] = compute_quality(train_df, column)

    return quality