The following 19 code examples, extracted from open-source Python projects, illustrate how to use fuzzywuzzy.fuzz.token_sort_ratio(). All of the examples assume that fuzz has been imported via "from fuzzywuzzy import fuzz".
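As a baseline for reading the examples, here is a minimal sketch of the call itself: token_sort_ratio() tokenizes both strings, sorts the tokens alphabetically, and compares the rejoined results, so word order is ignored.

from fuzzywuzzy import fuzz

# Word order does not matter: the sorted token lists are identical.
print(fuzz.token_sort_ratio("new york mets", "mets new york"))  # 100

# Differing tokens still lower the score.
print(fuzz.token_sort_ratio("new york mets", "new york yankees"))  # below 100
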
def autorsAmendment(self, autors):
    # Formats a deputy's name so that it can be looked up in the database.
    strip = autors[0].strip()
    typeaut = self.typeAutor(name=strip)
    if typeaut != 'grupo':
        # Keep the member whose 'nombre' is closest to the stripped author string.
        max = 0
        member = None
        for memb in self.members:
            ratio = fuzz.token_sort_ratio(strip, memb['nombre'])
            if ratio > max:
                member = memb
                max = ratio
        return member['nombre']
    else:
        return strip

def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"] = list(map(lambda x: x[0], token_features))
    df["cwc_max"] = list(map(lambda x: x[1], token_features))
    df["csc_min"] = list(map(lambda x: x[2], token_features))
    df["csc_max"] = list(map(lambda x: x[3], token_features))
    df["ctc_min"] = list(map(lambda x: x[4], token_features))
    df["ctc_max"] = list(map(lambda x: x[5], token_features))
    df["last_word_eq"] = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"] = list(map(lambda x: x[8], token_features))
    df["mean_len"] = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"] = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df

def evaluate_patch_pair(thresholds, lhs, rhs):
    left_message, left_diff = lhs
    right_message, right_diff = rhs

    left_diff_lines = left_diff.lines
    right_diff_lines = right_diff.lines
    diff_lines_ratio = min(left_diff_lines, right_diff_lines) / max(left_diff_lines, right_diff_lines)

    # get rating of message
    msg_rating = fuzz.token_sort_ratio(left_message, right_message) / 100

    # Skip on diff_lines_ratio less than 1%
    if diff_lines_ratio < 0.01:
        return SimRating(msg_rating, 0, diff_lines_ratio)

    # get rating of diff
    diff_rating = rate_diffs(thresholds, left_diff, right_diff)

    return SimRating(msg_rating, diff_rating, diff_lines_ratio)

def fuzzy_match_strings(ref, val):
    """
    Returns the matching score of two values.
    """
    if not ref or not val:
        return 0
    ref_q = to_q(ref)
    val_q = to_q(val)
    if ref_q or val_q:
        return 100 if ref_q == val_q else 0
    simplified_val = unidecode(val).lower()
    simplified_ref = unidecode(ref).lower()

    # Return symmetric score
    r1 = fuzz.token_sort_ratio(simplified_val, simplified_ref)
    r2 = fuzz.token_sort_ratio(simplified_ref, simplified_val)
    return int(0.5 * (r1 + r2))

def matchautorgroup(self, lists):
    all = self.members + self.groups
    res = []
    for element in lists:
        member = None
        max = 0
        for memb in all:
            ratio = fuzz.token_sort_ratio(element, memb['nombre'])
            if ratio > max:
                member = memb
                max = ratio
        res.append(member)
    return res

def score_tokens(src, ref, translate_tokens):
    if translate_tokens:
        return score_tokens(translate(src), translate(ref), False)

    # Returns a score in [0, 100]
    aTokens = validateTokens(src)
    bTokens = validateTokens(ref)

    a2 = ' '.join(aTokens)
    b2 = ' '.join(bTokens)
    tokenSortRatio = fuzz.token_sort_ratio(a2, b2)
    if tokenSortRatio < 40:
        logging.debug('Rejected for TOKEN_SORT : {} / {}'.format(src, ref))
        return 0

    tokenSetRatio = fuzz.token_set_ratio(a2, b2)
    if tokenSetRatio < 50:
        logging.debug('Rejected for TOKEN_SET : {} / {}'.format(src, ref))
        return 0

    if REQUIRES_SHARED_PROPER_NOUN:
        aProper = ' '.join(filterProperNouns(aTokens))
        bProper = ' '.join(filterProperNouns(bTokens))
        # if len(aProper) > 3 and len(bProper) > 3:
        if len(aProper) > 0 or len(bProper) > 0:
            properNounSortRatio = fuzz.token_sort_ratio(aProper, bProper)
            if properNounSortRatio < 80:
                logging.debug('Rejected for PROPER_NOUN_SORT : {} / {}'.format(src, ref))
                return 0
            properNounSetRatio = fuzz.token_set_ratio(aProper, bProper)
            if properNounSetRatio < 60:
                logging.debug('Rejected for PROPER_NOUN_SET : {} / {}'.format(src, ref))
                return 0

    return tokenSortRatio * tokenSetRatio / 100

def best_string_mapping(threshold, left_list, right_list):
    """
    This function tries to find the closest mapping with the best weight of two
    lists of strings.

    Example:
        List A    List B
        0: 'abc'  'abc'
        1: 'cde'  'cde'
        2: 'fgh'  'fgh'
        3:        'fgj'

    map_lists will try to map each element of List A to an element of List B,
    with respect to the given threshold. As a[{0,1,2}] == b[{0,1,2}], those
    values will automatically be mapped. Additionally, a[2] will also be mapped
    to b[3], if the threshold is low enough (cf. 0.5).
    """
    def injective_map(ll, rl, inverse_result=False):
        ret = dict()
        for l_entry in ll:
            for r_entry in rl:
                if l_entry == r_entry:
                    sim = 1
                else:
                    sim = fuzz.token_sort_ratio(l_entry, r_entry) / 100
                if sim < threshold:
                    continue
                if l_entry in ret:
                    _, old_sim = ret[l_entry]
                    if sim < old_sim:
                        continue
                ret[l_entry] = r_entry, sim
        return {(r, l) if inverse_result else (l, r) for l, (r, _) in ret.items()}

    return injective_map(left_list, right_list) | injective_map(right_list, left_list, True)

def rate_diffs(thresholds, l_diff, r_diff):
    filename_compare = best_string_mapping(thresholds.filename, l_diff.patches.keys(), r_diff.patches.keys())
    levenshteins = []

    def compare_hunks(left, right):
        # This case happens, for example, if both hunks remove empty newlines
        if left == right:
            return 100
        return fuzz.token_sort_ratio(left, right)

    for l_filename, r_filename in filename_compare:
        l_hunks = l_diff.patches[l_filename]
        r_hunks = r_diff.patches[r_filename]

        levenshtein = []
        hunk_compare = best_string_mapping(thresholds.heading, l_hunks.keys(), r_hunks.keys())

        for l_hunk_heading, r_hunk_heading in hunk_compare:
            lhunk = l_hunks[l_hunk_heading]
            rhunk = r_hunks[r_hunk_heading]

            if lhunk.deletions and rhunk.deletions:
                levenshtein.append(compare_hunks(lhunk.deletions, rhunk.deletions))
            if lhunk.insertions and rhunk.insertions:
                levenshtein.append(compare_hunks(lhunk.insertions, rhunk.insertions))

        if levenshtein:
            levenshteins.append(mean(levenshtein))

    if not levenshteins:
        levenshteins = [0]

    diff_rating = mean(levenshteins) / 100
    return diff_rating

def preevaluate_filenames(thresholds, right_files, left_file):
    candidates = []
    for right_file in right_files:
        if thresholds.filename >= 1.0:
            if left_file != right_file:
                continue
        else:
            sim = fuzz.token_sort_ratio(left_file, right_file) / 100
            if sim < thresholds.filename:
                continue
        candidates.append(right_file)
    return left_file, candidates

def strict_compare_strings(string_one, string_two):
    highest_ratio = 0
    if fuzz.ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.ratio(string_one, string_two)
    if fuzz.partial_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.partial_ratio(string_one, string_two)
    if fuzz.token_sort_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.token_sort_ratio(string_one, string_two)
    if fuzz.token_set_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.token_set_ratio(string_one, string_two)
    return highest_ratio

def compare_strings(string_one, string_two):
    highest_ratio = 0
    if fuzz.ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.ratio(string_one, string_two)
    if fuzz.token_sort_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.token_sort_ratio(string_one, string_two)
    if fuzz.token_set_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.token_set_ratio(string_one, string_two)
    return highest_ratio

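The two helpers above differ only in whether fuzz.partial_ratio participates; both simply take the maximum over a set of scorers. The same pattern can be written more compactly with max() (a sketch, not taken from the original project):

def compare_strings_compact(string_one, string_two, include_partial=True):
    # Best score across several fuzzywuzzy scorers, mirroring the helpers above.
    scorers = [fuzz.ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio]
    if include_partial:
        scorers.append(fuzz.partial_ratio)
    return max(scorer(string_one, string_two) for scorer in scorers)
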
def get_CUL_score(record_elems, resp_elems):
    if record_elems is None or resp_elems is None:
        return None
    elif isinstance(record_elems, str) and isinstance(resp_elems, str):
        score = str(fuzz.token_sort_ratio(record_elems, resp_elems))
        return score
    elif isinstance(record_elems, str) and not isinstance(resp_elems, str):
        # Keep the scores numeric so that max() compares values rather than strings.
        scores = []
        for n in range(len(resp_elems)):
            scores.append(fuzz.token_sort_ratio(record_elems, resp_elems[n]))
        return str(max(scores))
    elif not isinstance(record_elems, str) and isinstance(resp_elems, str):
        scores = []
        for n in range(len(record_elems)):
            scores.append(fuzz.token_sort_ratio(record_elems[n], resp_elems))
        return str(max(scores))
    else:
        scores = []
        for n in range(len(record_elems)):
            for m in range(len(resp_elems)):
                scores.append(fuzz.token_sort_ratio(record_elems[n], resp_elems[m]))
        if scores:
            return str(max(scores))
        return None

def computeSimilarity(s1, s2):
    return 1.0 - (0.01 * max(
        fuzz.ratio(s1, s2),
        fuzz.token_sort_ratio(s1, s2),
        fuzz.token_set_ratio(s1, s2)))

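Unlike most of the examples here, computeSimilarity() returns a distance rather than a similarity: the best of the three scores is scaled into [0, 1] and inverted, so 0.0 means a perfect match. For instance:

# Same tokens in a different order: token_sort_ratio is 100, so the distance is 0.0.
print(computeSimilarity("new york mets", "mets new york"))  # 0.0

# Unrelated strings score low under all three scorers, so the distance is large.
print(computeSimilarity("hello world", "fuzzy wuzzy"))
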
def sim(str1, str2):
    return fuzz.token_sort_ratio(str1, str2) * 0.01

def match_fuzzy(self, frame):
    pattern = process.extractOne(
        frame.name,
        self._index_fuzzy,
        scorer=fuzz.token_sort_ratio)
    if not pattern or pattern[1] < MATCH_FUZZY_THRESHOLD:
        return frame, set()
    return frame, self._handlers[pattern[0]]

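This example shows a common pattern: passing fuzz.token_sort_ratio as the scorer argument of fuzzywuzzy's process.extractOne(), which returns the best-scoring (choice, score) pair from a list of candidates, or None if nothing clears the optional score_cutoff. A standalone sketch of the same pattern, with hypothetical handler names:

from fuzzywuzzy import fuzz, process

choices = ["read_file", "write_file", "delete_file"]  # hypothetical handler names
best = process.extractOne("file read", choices, scorer=fuzz.token_sort_ratio)
print(best)  # ('read_file', 100): underscores are stripped and tokens sorted, so order is ignored
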
def main(conf):
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)

def is_fuzzy_matching_valid(self, stock_name, current_stock):
    # Get the token sort ratio from fuzzywuzzy
    ratio = fuzz.token_sort_ratio(stock_name, current_stock)
    return ratio > 95

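With a cutoff as strict as 95, only near-identical names pass: reordering words still matches thanks to the token sort, but a genuinely different name does not (a sketch with made-up stock names):

# Same words in a different order score 100 and are accepted.
fuzz.token_sort_ratio("Acme Holdings Inc", "Inc Acme Holdings")

# A different company name scores well below 95 and is rejected.
fuzz.token_sort_ratio("Acme Holdings Inc", "Globex Corporation")
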
def compare_strings(cls, string1, string2, *, tolerance=None, method='uwratio'):
    """
    Check if the strings provided have a similarity ratio within the
    specified tolerance. Return True if yes, otherwise return False.

    Use fuzzywuzzy (https://pypi.python.org/pypi/fuzzywuzzy).

    :param string1: str
    :param string2: str
    :param tolerance: number
    :param method: str, one of: 'uwratio', 'partial_ratio',
        'token_sort_ratio', 'token_set_ratio', 'ratio'
    :rtype: bool

    :Example:
    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    str_number = any(
        char.isdigit() for string in (string1, string2) for char in string)

    if tolerance is None:
        if str_number:
            tolerance = cls.str_number_tolerance
        else:
            tolerance = cls.string_tolerance

    if not str_number:
        if cls.is_abbreviation(string1, string2):
            return True

    methods = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}

    if method not in methods:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(methods))))

    return methods[method](string1, string2) >= 100 - tolerance

def compute_features(train_df, test_df):
    train_df[Fields.qratio] = train_df.apply(
        lambda row: fuzz.QRatio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.qratio] = test_df.apply(
        lambda row: fuzz.QRatio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_qratio = compute_quality(train_df, Fields.qratio)

    train_df[Fields.wratio] = train_df.apply(
        lambda row: fuzz.WRatio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.wratio] = test_df.apply(
        lambda row: fuzz.WRatio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_wratio = compute_quality(train_df, Fields.wratio)

    train_df[Fields.partial_ratio] = train_df.apply(
        lambda row: fuzz.partial_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_ratio] = test_df.apply(
        lambda row: fuzz.partial_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_ratio = compute_quality(train_df, Fields.partial_ratio)

    train_df[Fields.partial_token_set_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_token_set_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_token_set_ratio = compute_quality(train_df, Fields.partial_token_set_ratio)

    train_df[Fields.partial_token_sort_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_token_sort_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_token_sort_ratio = compute_quality(train_df, Fields.partial_token_sort_ratio)

    train_df[Fields.token_set_ratio] = train_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.token_set_ratio] = test_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_token_set_ratio = compute_quality(train_df, Fields.token_set_ratio)

    train_df[Fields.token_sort_ratio] = train_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.token_sort_ratio] = test_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_token_sort_ratio = compute_quality(train_df, Fields.token_sort_ratio)

    quality = dict(
        quality_qratio=quality_qratio,
        quality_wratio=quality_wratio,
        quality_partial_ratio=quality_partial_ratio,
        quality_partial_token_set_ratio=quality_partial_token_set_ratio,
        quality_partial_token_sort_ratio=quality_partial_token_sort_ratio,
        quality_token_set_ratio=quality_token_set_ratio,
        quality_token_sort_ratio=quality_token_sort_ratio
    )
    return quality