The following 15 code examples, extracted from open-source Python projects, illustrate how to use fuzzywuzzy.fuzz.token_set_ratio().
def _compute_author_similarity(self, paired_authors):
    """Append four similarity columns to a dataframe of paired authors.

    For each row, computes an exact email-equality flag plus three fuzzy
    similarity scores between the paired names/emails, and returns the
    input frame joined with those new columns.
    """
    def row_similarity(row):
        # Exact match on email, fuzzy scores for the name fields.
        same_email = row.author_email == row.author_email_other
        name_similarity = fuzz.token_set_ratio(
            row.author_name, row.author_name_other)
        email_name_similarity = fuzz.ratio(
            row.email_name, row.email_name_other)
        name_to_email_similarity = fuzz.token_set_ratio(
            row.author_name, row.name_from_email_other)
        return pd.Series([same_email,
                          name_similarity,
                          email_name_similarity,
                          name_to_email_similarity])

    similarity_cols = paired_authors.apply(row_similarity, axis=1)
    similarity_cols.columns = ['same_email', 'name_similarity',
                               'email_name_similarity',
                               'name_to_email_similarity']
    return paired_authors.join(similarity_cols)
def extract_features(df):
    """Add token-based and fuzzy-matching similarity features to *df*.

    Preprocesses both question columns in place, then derives ten token
    features plus five fuzzy/substring ratio columns. Returns *df*.
    """
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    # One output column per position in the get_token_features tuple.
    token_columns = ["cwc_min", "cwc_max", "csc_min", "csc_max",
                     "ctc_min", "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len"]
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    # Column name -> scoring function, applied pairwise to the questions.
    fuzzy_scorers = {
        "token_set_ratio": fuzz.token_set_ratio,
        "token_sort_ratio": fuzz.token_sort_ratio,
        "fuzz_ratio": fuzz.QRatio,
        "fuzz_partial_ratio": fuzz.partial_ratio,
        "longest_substr_ratio": get_longest_substr_ratio,
    }
    for column, scorer in fuzzy_scorers.items():
        df[column] = df.apply(
            lambda x, score=scorer: score(x["question1"], x["question2"]),
            axis=1)
    return df
def find_match(query_string):
    """
    find the matching faculty object from the query string
    :param query_string:
    :return: faculty dictionary object, or None when the best score is <= 40
    """
    global data
    best_score = 0
    best_faculty = None
    for faculty in data:
        score = fuzz.token_set_ratio(faculty['name'], query_string)
        if score > best_score:
            best_score, best_faculty = score, faculty
    # Reject weak matches rather than returning an arbitrary faculty.
    return best_faculty if best_score > 40 else None
def compare_names(name1: ParsedName, name2: ParsedName):
    """Return a fuzzy similarity score between two parsed names.

    Uses token_set_ratio when both names are proper; plain ratio otherwise.
    """
    both_proper = proper(name1) and proper(name2)
    scorer = fuzz.token_set_ratio if both_proper else fuzz.ratio
    return scorer(name1.name, name2.name)
def score_tokens(src, ref, translate_tokens):
    """Score the similarity of two strings; returns a value in [0, 100].

    Applies a cascade of fuzzy-ratio gates; returns 0 as soon as any gate
    fails. The final score is the product of the sort and set ratios,
    rescaled back into [0, 100].
    """
    if translate_tokens:
        # Translate both sides once up front, then score without re-translating.
        return score_tokens(translate(src), translate(ref), False)
    aTokens = validateTokens(src)
    bTokens = validateTokens(ref)
    a2 = ' '.join(aTokens)
    b2 = ' '.join(bTokens)
    # Gate 1: token-sort ratio must reach 40.
    tokenSortRatio = fuzz.token_sort_ratio(a2, b2)
    if tokenSortRatio < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0
    # Gate 2: token-set ratio must reach 50.
    tokenSetRatio = fuzz.token_set_ratio(a2, b2)
    if tokenSetRatio < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0
    if REQUIRES_SHARED_PROPER_NOUN:
        aProper = ' '.join(filterProperNouns(aTokens))
        bProper = ' '.join(filterProperNouns(bTokens))
        # Only gate on proper nouns when at least one side has any.
        if len(aProper) > 0 or len(bProper) > 0:
            properNounSortRatio = fuzz.token_sort_ratio(aProper, bProper)
            if properNounSortRatio < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            properNounSetRatio = fuzz.token_set_ratio(aProper, bProper)
            if properNounSetRatio < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0
    # Combined score; both factors are in [0, 100] so the product/100 is too.
    return tokenSortRatio * tokenSetRatio / 100
def find_match(query, intent):
    """Return the stored answer whose text best matches *query*.

    Scans all documents with the given intent and picks the one whose
    'text' field has the highest token_set_ratio against the query.

    :param query: user query string
    :param intent: intent label used to filter documents
    :return: the best document's 'answer' value, or None when no document
             exists for the intent
    """
    global col
    best_doc = None
    max_ratio = 0
    for d in col.find({"intent": intent}):
        ratio = fuzz.token_set_ratio(d['text'], query)
        if ratio > max_ratio:
            max_ratio = ratio
            best_doc = d
    # Bug fix: the original unconditionally did `del doc['_id']`, raising
    # TypeError when the cursor was empty and doc stayed None.
    if best_doc is None:
        return None
    del best_doc['_id']
    return best_doc['answer']
def find_match(course_list, query_string):
    """
    find the most matching course for a given name and return the course
    :param course_list: list of courses
    :param query_string: query of the user
    :return: course object, or None when the best score is <= 50
    """
    wants_lab = 'lab' in query_string.lower()
    best_ratio = 0  # the max ratio among the courses
    best_course = None
    for course in course_list:
        # Keep only the course flavour (lab vs. theory) the query asks for.
        if not wants_lab and course.subject_type == 'Embedded Lab':
            continue
        if wants_lab and course.subject_type == 'Embedded Theory':
            continue
        # Best score among this course's alternative names.
        course_ratio = max(
            (fuzz.token_set_ratio(name, query_string) for name in course.names),
            default=0)
        if course_ratio > best_ratio:
            best_ratio = course_ratio
            best_course = course
    return best_course if best_ratio > 50 else None
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzzy similarity scores for two strings.

    Considers ratio, partial_ratio, token_sort_ratio and token_set_ratio.

    Fix: the original computed every scorer twice (once in the comparison
    and again in the assignment); compute each exactly once via max().
    """
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.partial_ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def compare_strings(string_one, string_two):
    """Return the highest of three fuzzy similarity scores for two strings.

    Considers ratio, token_sort_ratio and token_set_ratio.

    Fix: the original computed every scorer twice (once in the comparison
    and again in the assignment); compute each exactly once via max().
    """
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def enter(MSG):
    """
    This function takes a string (MSG) and tries to answer the query by
    looking through the dictionaries in the program (after some
    preprocessing). It tries to mine out the correct response by performing
    pattern matching through the structured data.

    Fixes vs. the original:
    - Python 2 `print` statements converted to Python 3 `print()` calls
      (the rest of the file is Python 3).
    - Guards against empty input, which previously raised IndexError on
      `msg[-1]` / `tokens[0]`.
    """
    msg = MSG.lower()
    if msg and msg[-1] == '?':
        msg = msg[:-1]
    tokens = nltk.word_tokenize(msg)
    # Strip every stop word, including repeated occurrences.
    for stopword in words:
        while stopword in tokens:
            tokens.remove(stopword)
    if not tokens:
        print("Question Not found")
        return
    lst = []
    if tokens[0] == "who":
        lst = data_who
    elif tokens[0] == "what":
        lst = data_what
    elif tokens[0] == 'how':
        lst = data_how
    msg = ' '.join(tokens[1:])
    for entry in lst:
        # entry is a (question, answer) pair; 60 is the match threshold.
        if fuzz.token_set_ratio(entry[0], msg) >= 60:
            print(entry[1])
            return
    print("Question Not found")
def computeSimilarity(s1, s2):
    """Return a dissimilarity value in [0.0, 1.0] for two strings.

    Takes the best of three fuzzy similarity scores (each in [0, 100])
    and converts it into a distance: 0.0 means identical.
    """
    best_score = max(fuzz.ratio(s1, s2),
                     fuzz.token_sort_ratio(s1, s2),
                     fuzz.token_set_ratio(s1, s2))
    return 1.0 - 0.01 * best_score
def _match_torrent_name(self, movie_title, movie_year, torrent_title):
    ''' Checks if movie_title and torrent_title are a good match
    movie_title: str title of movie
    movie_year: str year of movie release
    torrent_title: str title of torrent

    Helper function for rss_sync.

    Torrent indexers don't supply an imdbid the way NewzNab does, so the
    titles are compared directly. The year must appear in the torrent
    title; then both titles are normalized (colons and spaces become dots,
    lowercase) and fuzzily compared, requiring a token set ratio above 80.

    Returns bool on match success
    '''
    # Wrong or missing year -> not the same release.
    if movie_year not in torrent_title:
        return False

    normalized_title = movie_title.replace(':', '.').replace(' ', '.').lower()
    normalized_torrent = torrent_title.replace(' ', '.').replace(':', '.').lower()
    return fuzz.token_set_ratio(normalized_title, normalized_torrent) > 80
def main(conf):
    """Compute fuzzy-matching features and dump train/test CSVs to disk.

    :param conf: mapping providing 'fuzzy.dump.dir' (output directory)
                 and 'fuzzy.dataset' (dataset location).
    """
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    # Adds the fuzzy feature columns to both dataframes in place.
    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def compare_strings(cls, string1, string2, *, tolerance=None, method='uwratio'):
    """
    Check if the strings provided have a similarity ratio within the
    specified tolerance. Return True if yes, otherwise return False.
    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number
    :param method: str, one of: 'uwratio', 'partial_ratio',
                   'token_sort_ratio', 'token_set_ratio', 'ratio'
    :rtype: bool

    :Example:
    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    # Does either string contain a digit? That picks the tolerance default.
    contains_digit = any(char.isdigit()
                         for string in (string1, string2)
                         for char in string)

    if tolerance is None:
        tolerance = (cls.str_number_tolerance if contains_digit
                     else cls.string_tolerance)

    # Pure-text abbreviations short-circuit the fuzzy comparison.
    if not contains_digit and cls.is_abbreviation(string1, string2):
        return True

    scorers = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}

    if method not in scorers:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(scorers))))

    return scorers[method](string1, string2) >= 100 - tolerance
def compute_features(train_df, test_df):
    """Add seven fuzzywuzzy feature columns to both dataframes in place.

    For each scorer, applies it row-wise to question1/question2 on both
    the train and test frames, then computes a quality metric on the
    train frame.

    Fix: the original repeated the same three-statement stanza seven
    times (one per scorer); drive it from a single table instead. Column
    creation order and the returned quality keys are unchanged.

    :param train_df: train dataframe with FieldsTrain.question1/question2
    :param test_df: test dataframe with FieldsTest.question1/question2
    :return: dict mapping 'quality_<feature>' to its quality metric
    """
    # (output field, scorer, quality-dict key) — order matters: it fixes
    # both the column insertion order and the order of the returned dict.
    scorers = [
        (Fields.qratio, fuzz.QRatio, 'quality_qratio'),
        (Fields.wratio, fuzz.WRatio, 'quality_wratio'),
        (Fields.partial_ratio, fuzz.partial_ratio, 'quality_partial_ratio'),
        (Fields.partial_token_set_ratio, fuzz.partial_token_set_ratio,
         'quality_partial_token_set_ratio'),
        (Fields.partial_token_sort_ratio, fuzz.partial_token_sort_ratio,
         'quality_partial_token_sort_ratio'),
        (Fields.token_set_ratio, fuzz.token_set_ratio,
         'quality_token_set_ratio'),
        (Fields.token_sort_ratio, fuzz.token_sort_ratio,
         'quality_token_sort_ratio'),
    ]

    quality = {}
    for field, scorer, quality_key in scorers:
        # Bind scorer as a default arg so each lambda keeps its own scorer.
        train_df[field] = train_df.apply(
            lambda row, score=scorer: score(str(row[FieldsTrain.question1]),
                                            str(row[FieldsTrain.question2])),
            axis=1)
        test_df[field] = test_df.apply(
            lambda row, score=scorer: score(str(row[FieldsTest.question1]),
                                            str(row[FieldsTest.question2])),
            axis=1)
        quality[quality_key] = compute_quality(train_df, field)

    return quality