我们从 Python 开源项目中，提取了以下 19 个代码示例，用于说明如何使用 fuzzywuzzy.fuzz.partial_ratio()。
def extract_features(df):
    """Add token-based and fuzzy-matching feature columns to ``df`` in place.

    Both question columns have missing values replaced by ``""`` and are
    passed through ``preprocess`` first. Ten columns are then filled from
    the positional output of ``get_token_features``, followed by four
    ``fuzz`` scores and the ``get_longest_substr_ratio`` value.

    :param df: frame with ``question1`` and ``question2`` columns
    :return: the same ``df`` object, with the feature columns added
    """
    for question in ("question1", "question2"):
        df[question] = df[question].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1,
    )
    # The position in this tuple is the index read from each per-row result.
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        df[column] = df.apply(
            lambda row, fn=scorer: fn(row["question1"], row["question2"]),
            axis=1,
        )
    return df
def get_combined_fuzz_score(a, b, **kwargs):
    """Combine the weighted ``fuzz.ratio`` and ``fuzz.partial_ratio`` of two names.

    Both inputs go through ``clean_name`` before scoring. Each score is
    multiplied by its weight and the product of the two weighted scores is
    divided by 10000.

    :param a: first name
    :param b: second name
    :param kwargs: optional ``simple`` and ``partial`` weights (default 1)
    :return: the combined score as a float
    """
    left = clean_name(a)
    right = clean_name(b)

    simple_weight = float(kwargs.get('simple', 1))
    partial_weight = float(kwargs.get('partial', 1))

    weighted_simple = fuzz.ratio(left, right) * simple_weight
    weighted_partial = fuzz.partial_ratio(left, right) * partial_weight

    return float(weighted_simple) * float(weighted_partial) / float(10000)
def search(self, targets, partial=True, fuzzy=False):
    """Return the instances whose name or aliases match any of ``targets``.

    Every candidate name (``instance.name`` plus ``instance.aliases`` when
    it is not None) is compared case-insensitively with every target:

    * an exact match is recorded with score 100;
    * otherwise, when ``partial`` is true, a substring match is recorded
      with score 99;
    * when ``fuzzy`` is true, the ``fuzz.partial_ratio`` score is also
      recorded if it exceeds 85 or the target is a substring of the name.

    :param targets: iterable of host strings to look for
    :param partial: accept substring matches
    :param fuzzy: additionally accept fuzzy matches
    :return: list of unique instances sorted by ascending score; an
        instance matched with several scores keeps the position of its
        lowest score
    """
    all_instances = self.instances()
    # Used as an insertion-ordered set of (score, instance) pairs so that
    # equal scores keep a deterministic (first-seen) order.
    matched = collections.OrderedDict()

    for host in targets:
        wanted = host.lower()
        for instance in all_instances:
            names = [instance.name]
            if instance.aliases is not None:
                names += list(instance.aliases)

            for name in names:
                candidate = name.lower()
                contained = wanted in candidate
                if wanted == candidate:
                    matched[(100, instance)] = None
                elif partial and contained:
                    matched[(99, instance)] = None

                if fuzzy:
                    score = fuzz.partial_ratio(wanted, candidate)
                    if score > 85 or contained:
                        matched[(score, instance)] = None

    # Sort on the score alone: comparing whole tuples would fall back to
    # comparing the instances on ties, which raises TypeError for objects
    # that do not define an ordering.
    # NOTE(review): the order is ascending, so the strongest match is last;
    # confirm that callers expect this orientation.
    ranked = sorted(matched, key=lambda pair: pair[0])

    # The same instance may be matched more than once; it must appear only
    # once in the returned list.
    unique = collections.OrderedDict((instance, None) for _, instance in ranked)
    return list(unique.keys())
def score_chars(src, ref):
    """Score the character-level similarity of ``src`` and ``ref``.

    The strings are first converted with ``toASCII`` and reduced with
    ``acronymizePhrase``; if one string's acronym equals the other string
    upper-cased, the pair is accepted with the maximum score. Otherwise the
    ``justCase`` forms are compared with ``fuzz.ratio`` and
    ``fuzz.partial_ratio``; pairs scoring below 20 or 30 respectively are
    rejected.

    :param src: source string
    :param ref: reference string
    :return: 100 for an acronym match, 0 for a rejected pair, otherwise
        ``ratio * partial_ratio / 100`` (within [0, 100] as long as both
        ratios are)
    """
    a0 = toASCII(src)
    b0 = toASCII(ref)
    a1 = acronymizePhrase(a0)
    b1 = acronymizePhrase(b0)
    if len(a1) > 0 and len(b1) > 0 and (a1 == b0.upper() or a0.upper() == b1):
        # Log the ASCII forms: the case-normalised ``a``/``b`` are not
        # computed yet at this point, so referencing them here would raise
        # UnboundLocalError.
        logging.debug('Accepted for ACRO : {} / {}'.format(a0, b0))
        return 100

    a = justCase(src)
    b = justCase(ref)

    absCharRatio = fuzz.ratio(a, b)
    if absCharRatio < 20:
        logging.debug('Rejected for ABS : {} / {}'.format(a, b))
        return 0

    partialCharRatio = fuzz.partial_ratio(a, b)
    if partialCharRatio < 30:
        logging.debug('Rejected for PARTIAL : {} / {}'.format(a, b))
        return 0

    return absCharRatio * partialCharRatio / 100
def fuzzy_feats(train_in, test_in, qcolumns = ['question1', 'question2'], append=''):
    """Compute four fuzzywuzzy scores for the question pairs of two frames.

    Each input is copied and restricted to ``qcolumns``; the copies then
    receive the columns ``fuzz_r``, ``fuzz_pr``, ``fuzz_tsr`` and
    ``fuzz_tsor`` (each suffixed with ``append``), holding ``ratio``,
    ``partial_ratio``, ``partial_token_set_ratio`` and
    ``partial_token_sort_ratio`` of the two question columns.

    :param train_in: training frame
    :param test_in: test frame
    :param qcolumns: names of the two columns to compare
    :param append: suffix added to every generated column name
    :return: ``(train, test)`` tuple of the augmented copies
    """
    from fuzzywuzzy import fuzz
    import pandas as pd

    scorers = (
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    )

    train = train_in.copy().loc[:,qcolumns]
    test = test_in.copy().loc[:,qcolumns]

    for frame in (train, test):
        for prefix, scorer in scorers:
            frame[prefix+append] = frame.apply(
                lambda row, fn=scorer: fn(row[qcolumns[0]], row[qcolumns[1]]),
                axis = 1,
            )

    return (train, test)
def filterModule(self, module):
    """Decide whether ``module`` passes the filters in ``self.modfilter``.

    A ``"type"`` filter requires a pin with the wanted ``pintype`` among
    the module's input or output definitions, depending on the filter's
    ``"dir"``. A ``"text"`` filter accepts the module when the text is
    contained in its name, is empty, or fuzzily matches the name or
    description with a score above 40.

    :param module: object exposing ``inputDefs``, ``outputDefs``, ``name``
        and ``desc``
    :return: True when the module should be kept, False otherwise
    """
    criteria = self.modfilter

    if "type" in criteria:
        direction = criteria["type"]["dir"]
        if direction == "input":
            pins = module.inputDefs
        elif direction == "output":
            pins = module.outputDefs
        else:
            # Any other direction cannot produce a compatible pin.
            pins = ()
        if not any(pin.pintype == criteria["type"]["type"] for pin in pins):
            return False

    if "text" not in criteria:
        # Don't filter by text? Return all remaining
        return True

    # Filter by text input
    query = criteria["text"]
    if query in module.name or not query:
        return True

    score = max(
        fuzz.ratio(query, module.name),
        fuzz.partial_ratio(query, module.desc),
    )
    return score > 40
def similarity(n1, n2):
    """
    Return the mean of the collected ``fuzz.partial_ratio`` scores.

    Only one score is collected at present (``n1`` against ``n2``), so the
    result is that score converted to a float.
    """
    field_scores = [fuzz.partial_ratio(n1, n2)]
    total = float(sum(score for score in field_scores))
    count = float(len(field_scores))
    return total / count
def address_filter_score(src, ref):
    """Return ``partial_ratio + ratio`` for the ``case_phrase`` forms of two strings.

    :param src: source string
    :param ref: reference string
    :return: the sum of the two ``fuzz`` scores
    """
    normalized_src = case_phrase(src)
    normalized_ref = case_phrase(ref)
    partial_score = fuzz.partial_ratio(normalized_src, normalized_ref)
    full_score = fuzz.ratio(normalized_src, normalized_ref)
    return partial_score + full_score


# Acronym handling
def fuzzy_matches_in_sentence(self, skill, sentence):
    """Yield a ``CandidateSkill`` for each n-gram of ``sentence`` close to ``skill``.

    The n-gram size equals the number of whitespace-separated words in
    ``skill``. Each n-gram from ``self.ngrams`` is joined with a bytes
    space and compared with ``fuzz.partial_ratio``; scores above 88 are
    reported, with the score as the confidence.

    :param skill: skill name to look for
    :param sentence: sentence to scan; it is decoded as UTF-8 for the
        reported context
    """
    word_count = len(skill.split())
    phrases = [b" ".join(gram) for gram in self.ngrams(sentence, word_count)]

    for phrase in phrases:
        score = fuzz.partial_ratio(skill, phrase)
        if score <= 88:
            continue
        yield CandidateSkill(
            skill_name=skill,
            matched_skill=phrase,
            confidence=score,
            context=sentence.decode('utf-8')
        )
def candidate_skills(self, job_posting):
    """Yield ``CandidateSkill`` objects for skills found in a job posting.

    The posting text is split into sentences by ``self.ie_preprocess`` and
    every skill in ``self.lookup`` is checked against every sentence:

    * single-word skills are matched as whole words, case-insensitively,
      and reported with confidence 100;
    * other skills are screened with ``fuzz.partial_ratio`` (threshold 88)
      and the matches are produced by ``self.fuzzy_matches_in_sentence``.

    :param job_posting: object exposing the posting body as ``text``
    """
    document = job_posting.text
    sentences = self.ie_preprocess(document)

    for skill in self.lookup:
        len_skill = len(skill.split())

        # Build the whole-word pattern once per skill. The skill is escaped
        # so that names containing regex metacharacters ("c(", ".net") are
        # matched literally instead of raising re.error or matching
        # unrelated text.
        exact_pattern = None
        if len_skill == 1:
            exact_pattern = re.compile(
                r'\b' + re.escape(skill) + r'\b', re.IGNORECASE)

        for sent in sentences:
            sent = sent.encode('utf-8')

            # Exact matching
            if exact_pattern is not None:
                sent = sent.decode('utf-8')
                if exact_pattern.search(sent):
                    yield CandidateSkill(
                        skill_name=skill,
                        matched_skill=skill,
                        confidence=100,
                        context=sent
                    )
            # Fuzzy matching
            else:
                ratio = fuzz.partial_ratio(skill, sent)
                # You can adjust the partial of matching here:
                # 100 => exact matching 0 => no matching
                if ratio > 88:
                    for match in self.fuzzy_matches_in_sentence(skill, sent):
                        yield match
def is_aligned_arg(x, y):
    """
    Return whether two arguments are aligned.

    They are aligned when one fully matches inside the other (both padded
    with spaces) according to ``fuzz.partial_ratio``, or when enough of
    their non-stop words share a WordNet lemma.

    :param x: the first argument
    :param y: the second argument
    :return: Whether they are aligned
    """
    global nlp

    def content_words(text):
        # Whitespace tokens that are not stop words.
        return [w for w in text.split() if not nlp.is_stop(w)]

    def synonym_sets(words):
        # One set of lower-cased WordNet lemma names per word.
        return [set([lemma.lower().replace('_', ' ')
                     for synset in wn.synsets(w)
                     for lemma in synset.lemma_names()])
                for w in words]

    def shared_count(first, second):
        # Number of non-stop lemmas present in both sets.
        return len([w for w in first.intersection(second) if not nlp.is_stop(w)])

    # Allow partial matching
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True

    x_words = content_words(x)
    y_words = content_words(y)
    if len(x_words) == 0 or len(y_words) == 0:
        return False

    x_synonyms = synonym_sets(x_words)
    y_synonyms = synonym_sets(y_words)

    # One word - check whether there is intersection between synsets
    if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
            shared_count(x_synonyms[0], y_synonyms[0]) > 0:
        return True

    # More than one word - align words from x with words from y
    overlaps = [shared_count(s1, s2) for s1 in x_synonyms for s2 in y_synonyms]
    aligned_pairs = sum(1 for overlap in overlaps if overlap > 0)
    required = 0.75 * max(len(x_synonyms), len(y_synonyms))
    return aligned_pairs >= required
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzzywuzzy scores for the two strings.

    The scorers are ``ratio``, ``partial_ratio``, ``token_sort_ratio`` and
    ``token_set_ratio``. Each is evaluated exactly once; the result is
    never below 0.

    :param string_one: first string
    :param string_two: second string
    :return: the maximum score
    """
    return max(
        0,
        fuzz.ratio(string_one, string_two),
        fuzz.partial_ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def fuzzy_title(self, titles):
    ''' Score and remove results based on title match
    titles: list of titles to match against

    If titles is None or empty, every result is treated as a perfect match
        and scores +20.

    Otherwise iterates through self.results:
        results of type 'import' are kept and score +20 without comparison;
        any other result is kept only if its best fuzz.partial_ratio
        against 'titles' is > 70, and gets best_match / 5 added to
        ['score'].

    Replaces self.results with the kept results.

    Does not return
    '''

    logging.info(u'Checking title match.')

    kept = []
    if not titles:
        # Covers both None and an empty list, as the docstring promises.
        for result in self.results:
            result['score'] += 20
            kept.append(result)
    else:
        for result in self.results:
            if result['type'] == 'import' and result not in kept:
                result['score'] += 20
                kept.append(result)
                continue

            test = Url.encode(result['title'])
            matches = [fuzz.partial_ratio(Url.encode(title), test) for title in titles]
            best = max(matches)
            if best > 70:
                result['score'] += (best / 5)
                kept.append(result)
            else:
                logging.debug(u'{} best title match was {}%, removing search result.'.format(test, best))

    self.results = kept
    logging.info(u'Keeping {} results.'.format(len(self.results)))
def fuzzy_match(self, items, test):
    ''' Fuzzy matches title with predb rss titles
    :param items: list of titles in predb rss
    :param test: str to match to rss titles

    Returns True as soon as one of 'items' has a fuzz.partial_ratio
        above 50 against 'test', otherwise False
    '''
    return any(fuzz.partial_ratio(candidate, test) > 50 for candidate in items)
def reverseLinking(sent, text_candidate):
    """Find the span of ``sent`` that mentions one of ``text_candidate``.

    Candidates are tried longest first as whole-phrase exact matches. If
    none is found (or ``get_indices`` returns None), the best candidate
    according to ``process.extractOne`` with ``fuzz.partial_ratio`` is
    used to pick the closest n-gram from ``get_ngram``.

    :param sent: whitespace-tokenised sentence
    :param text_candidate: collection of candidate entity strings
    :return: ``(entity_text, label, exact_match)``. On success ``label`` is
        a space-joined string of ``"I"``/``"O"`` tags, one per token, and
        ``entity_text`` the space-joined tagged tokens. When there are no
        candidates or fuzzy extraction fails, ``entity_text`` is
        ``'<UNK>'`` and ``label`` is returned as the unjoined list of
        ``"O"`` tags.
    """
    tokens = sent.split()
    label = ["O"] * len(tokens)
    text_attention_indices = None
    exact_match = False

    if text_candidate is None or len(text_candidate) == 0:
        return '<UNK>', label, exact_match

    # sorted by length
    for text in sorted(text_candidate, key=len, reverse=True):
        pattern = r'(^|\s)(%s)($|\s)' % (re.escape(text))
        if re.search(pattern, sent):
            text_attention_indices = get_indices(tokens, text.split())
            break

    if text_attention_indices is not None:
        exact_match = True
        for i in text_attention_indices:
            label[i] = 'I'
    else:
        try:
            v, _score = process.extractOne(sent, text_candidate, scorer=fuzz.partial_ratio)
        except Exception:
            # Best effort: any extraction failure yields the <UNK> result,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            print("Extraction Error with FuzzyWuzzy : {} || {}".format(sent, text_candidate))
            return '<UNK>', label, exact_match

        v = v.split()
        n_gram_candidate = get_ngram(tokens)
        n_gram_candidate = sorted(n_gram_candidate, key=lambda x: fuzz.ratio(x[0], v), reverse=True)
        # Each n-gram entry is read as (content, start, end).
        top = n_gram_candidate[0]
        for i in range(top[1], top[2]):
            label[i] = 'I'

    entity_text = " ".join(t for l, t in zip(label, tokens) if l == 'I')
    label = " ".join(label)
    return entity_text, label, exact_match
def main(conf):
    """Compute the fuzzy features for both datasets and dump them as CSV.

    The train and test frames are loaded from ``conf['fuzzy.dataset']``,
    enriched by ``compute_features`` and written to ``train.csv`` and
    ``test.csv`` inside ``conf['fuzzy.dump.dir']`` without the index.

    :param conf: mapping providing ``fuzzy.dump.dir`` and ``fuzzy.dataset``
    """
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    # Feature columns exported for both datasets, in output order.
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]

    logging.info('Writing train dataset to disk')
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_df[train_columns].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_df[test_columns].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def compare_strings(cls, string1, string2, *, tolerance=None, method='uwratio'):
    """
    Tell whether two strings are similar within the given tolerance.

    The chosen fuzzywuzzy scorer (https://pypi.python.org/pypi/fuzzywuzzy)
    must return at least ``100 - tolerance``. When ``tolerance`` is not
    given, ``cls.str_number_tolerance`` is used if either string contains
    a digit and ``cls.string_tolerance`` otherwise. Digit-free strings
    accepted by ``cls.is_abbreviation`` match immediately.

    :param string1: str
    :param string2: str
    :param tolerance: number
    :param method: str, one of: 'uwratio', 'partial_ratio',
                                'token_sort_ratio', 'token_set_ratio',
                                'ratio'
    :rtype: bool
    :raises ValueError: if ``method`` is not one of the names above

    :Example:

    >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)
    True
    >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')
    False
    """
    contains_digit = any(
        character.isdigit()
        for text in (string1, string2)
        for character in text)

    if tolerance is None:
        tolerance = (cls.str_number_tolerance if contains_digit
                     else cls.string_tolerance)

    if not contains_digit and cls.is_abbreviation(string1, string2):
        return True

    scorers = {'uwratio': fuzz.UWRatio,
               'partial_ratio': fuzz.partial_ratio,
               'token_sort_ratio': fuzz.token_sort_ratio,
               'token_set_ratio': fuzz.token_set_ratio,
               'ratio': fuzz.ratio}

    scorer = scorers.get(method)
    if scorer is None:
        msg = 'wrong method, use available: {}'
        raise ValueError(msg.format(', '.join(sorted(scorers))))

    threshold = 100 - tolerance
    return scorer(string1, string2) >= threshold
def partial_match(x, y):
    """
    Return whether two mentions have a partial match.

    They match when one is fully contained in the other (both padded with
    spaces) according to ``fuzz.partial_ratio``, when their single content
    words share a WordNet lemma, or when the optimal word alignment has an
    average lemma overlap of at least 0.75.

    :param x: the first mention
    :param y: the second mention
    :return: Whether they are aligned
    """

    def content_words(text):
        # Whitespace tokens that are not stop words.
        return [w for w in text.split() if not is_stop(w)]

    def synonym_sets(words):
        # One set of lower-cased WordNet lemma names per word.
        return [set([lemma.lower().replace('_', ' ')
                     for synset in wn.synsets(w)
                     for lemma in synset.lemma_names()])
                for w in words]

    def shared_count(first, second):
        # Number of non-stop lemmas present in both sets.
        return len([w for w in first.intersection(second) if not is_stop(w)])

    # Allow partial matching
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True

    x_words = content_words(x)
    y_words = content_words(y)
    if len(x_words) == 0 or len(y_words) == 0:
        return False

    x_synonyms = synonym_sets(x_words)
    y_synonyms = synonym_sets(y_words)

    # One word - check whether there is intersection between synsets
    if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
            shared_count(x_synonyms[0], y_synonyms[0]) > 0:
        return True

    # More than one word - align words from x with words from y.
    # One row per word of y, one column per word of x; overlaps are negated
    # before being handed to Munkres as a cost matrix.
    overlap_rows = [np.array([shared_count(s1, s2) for s1 in x_synonyms])
                    for s2 in y_synonyms]
    cost = -np.vstack(overlap_rows)

    m = Munkres()
    cost = pad_to_square(cost)
    indices = m.compute(cost)

    # Compute the average score of the alignment
    average_score = np.mean([-cost[row, col] for row, col in indices])
    return bool(average_score >= 0.75)
def compute_features(train_df, test_df):
    """Add seven fuzzywuzzy score columns to both frames and rate each one.

    For every scorer the column is computed on the train frame, then on
    the test frame (both question values are passed through ``str``
    first), and ``compute_quality`` is called on the train frame for that
    column. Both frames are modified in place.

    :param train_df: training frame with the ``FieldsTrain`` question columns
    :param test_df: test frame with the ``FieldsTest`` question columns
    :return: dict mapping ``quality_<feature>`` to the ``compute_quality``
        result of that feature
    """

    def add_feature(column, scorer):
        # Fill ``column`` in both frames, then rate it on the train frame.
        train_df[column] = train_df.apply(
            lambda row: scorer(str(row[FieldsTrain.question1]),
                               str(row[FieldsTrain.question2])),
            axis=1)
        test_df[column] = test_df.apply(
            lambda row: scorer(str(row[FieldsTest.question1]),
                               str(row[FieldsTest.question2])),
            axis=1)
        return compute_quality(train_df, column)

    # (result key, column name, scorer), in the order they are computed.
    feature_plan = (
        ('quality_qratio', Fields.qratio, fuzz.QRatio),
        ('quality_wratio', Fields.wratio, fuzz.WRatio),
        ('quality_partial_ratio', Fields.partial_ratio, fuzz.partial_ratio),
        ('quality_partial_token_set_ratio', Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ('quality_partial_token_sort_ratio', Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ('quality_token_set_ratio', Fields.token_set_ratio, fuzz.token_set_ratio),
        ('quality_token_sort_ratio', Fields.token_sort_ratio, fuzz.token_sort_ratio),
    )

    quality = {}
    for key, column, scorer in feature_plan:
        quality[key] = add_feature(column, scorer)
    return quality