我们从Python开源项目中，提取了以下49个代码示例，用于说明如何使用fuzzywuzzy.fuzz.ratio()。
def _compute_author_similarity(self, paired_authors):
    """Append pairwise author-similarity features to ``paired_authors``.

    Each row is compared with its ``*_other`` counterpart and four columns
    are joined onto the frame: ``same_email``, ``name_similarity``,
    ``email_name_similarity`` and ``name_to_email_similarity``.

    :param paired_authors: DataFrame whose rows expose ``author_email``,
        ``author_name``, ``email_name`` and the corresponding ``*_other`` /
        ``name_from_email_other`` fields.
    :return: ``paired_authors`` joined with the four feature columns.
    """
    feature_names = ['same_email', 'name_similarity',
                     'email_name_similarity', 'name_to_email_similarity']

    def row_similarity(row):
        # One feature per entry, in the same order as ``feature_names``.
        features = [
            row.author_email == row.author_email_other,
            fuzz.token_set_ratio(row.author_name, row.author_name_other),
            fuzz.ratio(row.email_name, row.email_name_other),
            fuzz.token_set_ratio(row.author_name, row.name_from_email_other),
        ]
        return pd.Series(features)

    similarity = paired_authors.apply(row_similarity, axis=1)
    similarity.columns = feature_names
    return paired_authors.join(similarity)
def get_combined_fuzz_score(a, b, **kwargs):
    """Combine the simple and partial fuzzy ratios of two names.

    Both names are passed through ``clean_name`` first. The result is the
    product of the two weighted ratios divided by 10000.

    :param a: first name
    :param b: second name
    :param kwargs: optional ``simple`` and ``partial`` weights (default 1)
    :return: combined score as a float
    """
    left = clean_name(a)
    right = clean_name(b)

    simple_weight = float(kwargs.get('simple', 1))
    partial_weight = float(kwargs.get('partial', 1))

    weighted_simple = fuzz.ratio(left, right) * simple_weight
    weighted_partial = fuzz.partial_ratio(left, right) * partial_weight
    return float(weighted_simple) * float(weighted_partial) / float(10000)
def findItemName(self, itemDictionary, messageItem):
    """Return the item name whose label is most similar to ``messageItem``.

    :param itemDictionary: mapping of item name -> item label
    :param messageItem: text to compare against every label
    :return: the name with the highest ``fuzz.ratio`` score, or ``None`` when
        no label scores above 0 (the first of equal scores wins)
    """
    winner = None
    top_score = 0
    try:
        for name, label in itemDictionary.items():
            similarity = fuzz.ratio(messageItem, label)
            if similarity > top_score:
                top_score, winner = similarity, name
    except KeyError:
        # A KeyError raised while scanning ends the search quietly; whatever
        # was found so far is returned.
        pass
    return winner
def tieBreak(self, query, i, j):
    """Pick whichever of two raw titles is closer to ``query``.

    Args:
        - query: the query string
        - i: index of the first title in ``self.titles``
        - j: index of the second title in ``self.titles``

    Return: (target, index)
        - target: the raw title that scored higher
        - index : the index of that title

    When both titles score the same, the second one (``j``) is returned.
    """
    first, second = self.titles[i], self.titles[j]
    first_wins = fuzz.ratio(query, first) > fuzz.ratio(query, second)
    return (first, i) if first_wins else (second, j)
def score_chars(src, ref):
    """Score the character-level similarity of two phrases.

    Returns a score in [0, 100]:

    * 100 when one phrase is the acronym of the other;
    * 0 when the absolute ratio is below 20 or the partial ratio is below 30;
    * otherwise the product of both ratios scaled back by 100.

    :param src: source phrase
    :param ref: reference phrase
    :return: the similarity score
    """
    a0 = toASCII(src)
    b0 = toASCII(ref)
    a1 = acronymizePhrase(a0)
    b1 = acronymizePhrase(b0)
    if a1 and b1 and (a1 == b0.upper() or a0.upper() == b1):
        # Log the ASCII forms: ``a``/``b`` are only assigned further down, so
        # referencing them here raised UnboundLocalError on every acronym hit.
        logging.debug('Accepted for ACRO : {} / {}'.format(a0, b0))
        return 100

    a = justCase(src)
    b = justCase(ref)

    absCharRatio = fuzz.ratio(a, b)
    if absCharRatio < 20:
        logging.debug('Rejected for ABS : {} / {}'.format(a, b))
        return 0

    partialCharRatio = fuzz.partial_ratio(a, b)
    if partialCharRatio < 30:
        logging.debug('Rejected for PARTIAL : {} / {}'.format(a, b))
        return 0

    return absCharRatio * partialCharRatio / 100
def fuzzy_feats(train_in, test_in, qcolumns=None, append=''):
    """Compute fuzzy-matching features for a pair of text columns.

    For both frames, the two columns named in ``qcolumns`` are kept and four
    feature columns are added (each suffixed with ``append``):

    * ``fuzz_r``    - ``fuzz.ratio``
    * ``fuzz_pr``   - ``fuzz.partial_ratio``
    * ``fuzz_tsr``  - ``fuzz.partial_token_set_ratio``
    * ``fuzz_tsor`` - ``fuzz.partial_token_sort_ratio``

    :param train_in: training DataFrame (not modified)
    :param test_in: test DataFrame (not modified)
    :param qcolumns: the two column names to compare; defaults to
        ``['question1', 'question2']``
    :param append: suffix appended to every generated column name
    :return: ``(train, test)`` tuple of new DataFrames
    """
    from fuzzywuzzy import fuzz

    # ``None`` sentinel instead of a shared mutable list default.
    if qcolumns is None:
        qcolumns = ['question1', 'question2']

    scorers = (
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    )

    def with_features(frame_in):
        # Copy the compared columns, then add one feature column per scorer.
        frame = frame_in.copy().loc[:, qcolumns]
        for prefix, scorer in scorers:
            frame[prefix + append] = frame.apply(
                # Bind ``scorer`` as a default to avoid late-binding closures.
                lambda row, scorer=scorer: scorer(row[qcolumns[0]],
                                                  row[qcolumns[1]]),
                axis=1)
        return frame

    return (with_features(train_in), with_features(test_in))
def best_scoring_value(self, groups):
    '''
    Finds best fuzzy match

    Compares each elem of the group with each keyphrase/word in loc_map.
    Returns the location with best matching ('' when nothing scores above 0).
    '''
    best_match = ''
    best_score = 0
    candidates = list(groups)
    # Append the whole of the group to the things to be checked
    # For instance, for the group ('a', 'b'), 'a b' will also be matched
    candidates.append(' '.join(candidates))
    for candidate in candidates:
        for key in self.loc_map:
            # Score each pair once instead of recomputing it on every hit.
            score = fuzz.ratio(key, candidate)
            if score > best_score:
                best_score = score
                best_match = self.loc_map[key]
    return best_match
def is_eq_arg(x, y):
    """
    Return whether these two words are equal, with fuzzy string matching.
    :param x: the first argument
    :param y: the second argument
    :return: Whether they are equal
    """
    if fuzz.ratio(x, y) >= 90:
        return True

    def spell_out_numbers(text):
        # Convert numbers to words, e.g. '2 girls' -> 'two girls'
        tokens = []
        for token in text.split():
            if token.isdigit():
                token = num2words(int(token)).replace('-', ' ')
            tokens.append(token)
        return ' '.join(tokens)

    x_spelled = spell_out_numbers(x)
    y_spelled = spell_out_numbers(y)

    # Partial entailment with equivalence, e.g. 'two girls' -> 'two kids':
    return fuzz.ratio(x_spelled, y_spelled) >= 85
def is_eq_preds(p1, p2):
    """
    Return whether these two predicates are equal, with fuzzy string matching.
    :param p1: the first predicate
    :param p2: the second predicate
    :return: Whether they are equal
    """
    # Levenshtein distance mostly
    if fuzz.ratio(p1, p2) >= 90:
        return True

    # Same verb: one predicate equals the other once 'be' or 'have' is
    # inserted after the first argument slot.
    for source, target in ((p1, p2), (p2, p1)):
        for with_verb in ('{a0} be ', '{a0} have '):
            if source.replace('{a0} ', with_verb) == target:
                return True
    return False
def test_fuzzy_korean_ratio():
    """Test Korean-specific fuzzy search."""
    # Each pair below contrasts plain ``fuzz.ratio`` with
    # ``fuzzy_korean_ratio`` on the same two strings.
    # NOTE(review): the string literals appear to have been mis-encoded
    # (every character is '?'); presumably they were Korean text originally.
    # As written, identical '?' strings will not score 0 - restore the
    # original literals before relying on this test.
    assert fuzz.ratio('?', '?') == 0
    assert fuzzy_korean_ratio('?', '?') == 67
    assert fuzz.ratio('??', '??') == 0
    assert fuzzy_korean_ratio('??', '??') == 67
    assert fuzz.ratio('??', '??') == 0
    assert fuzzy_korean_ratio('??', '??') == 57
    assert fuzz.ratio('??', '??') == 0
    assert fuzzy_korean_ratio('??', '??') == 57
    assert fuzz.ratio('??', '?????') == 0
    assert fuzzy_korean_ratio('??', '?????') == 80
def parseArgs():
    """Parse the command line and configure logging verbosity.

    ``--debug`` selects DEBUG logging, ``--verbose`` selects INFO logging;
    otherwise logging is configured with the default level.

    :return: the parsed ``argparse`` namespace
    """
    argparser = argparse.ArgumentParser(description='This is uploafer. Obviously. If you don\'t know what WM2 is, better not to know what uploafer is.')

    # Options that are currently disabled:
    #argparser.add_argument('-u', '--username', help='Your PTH username', required=True)
    #argparser.add_argument('-p', '--password', help='Your PTH password', required=True)
    #argparser.add_argument('-i', '--wm2media', help='The directory containing your WM2 downloads. Each subdirectory should contain a "ReleaseInfo2.txt" file.', default='.', required=True)
    #argparser.add_argument('-w', '--wm2root', help='This directory should contain "manage.py". Leave this blank to disable auto-import. Warning: auto-import will MOVE your torrent data!')
    #argparser.add_argument('-o', '--output', help='This is the output directory for torrents and media you wish to upload. This option is overridden if wm2root is specified.')
    #argparser.add_argument('-z', '--fuzzratio', help='Minimum likeness ratio required to consider a match. Anything which scores higher than this will not be eligible for uploading. Default is 90', type=int, default=90)

    # All active options are simple on/off switches.
    switches = (
        ('-vv', '--debug', 'Highest level of verbosity for debugging'),
        ('-v', '--verbose', 'High level of verbosity for detailed info'),
        ('-r', '--resume', "Resume where uploafer left off within the WM2 media directory."),
        ('-a', '--auto', 'Don\'t use this.'),
    )
    for short_flag, long_flag, help_text in switches:
        argparser.add_argument(short_flag, long_flag, help=help_text, action="store_true")

    args = argparser.parse_args()

    log_format = "%(levelname)s: %(message)s"
    if args.debug:
        log.basicConfig(format=log_format, level=log.DEBUG)
        log.info("Debug output.")
    elif args.verbose:
        log.basicConfig(format=log_format, level=log.INFO)
        log.info("Verbose output.")
    else:
        log.basicConfig(format=log_format)

    return args
def findBestGroup(ri, artist):
    """Find the artist's torrent group that best matches ``ri.group``.

    A catalogue-number match wins outright (``match`` = 101). Otherwise each
    group is scored by name similarity and stored in ``group.match``; the
    search stops early on a perfect score of 100.

    :param ri: release info whose ``group`` is being matched (its ``match``
        attribute is reset to -1 as a placeholder)
    :param artist: object exposing the candidate groups in ``torrentgroup``
    :return: the best group, or ``ri.group`` itself when there are no
        candidates
    """
    #TODO: Check catalogue numbers!
    best = ri.group  #placeholder
    best.match = -1
    for candidate in artist.torrentgroup:
        has_catalogue = ri.group.catalogueNumber != ''
        if has_catalogue and ri.group.catalogueNumber == candidate.groupCatalogueNumber:
            best = candidate
            best.match = 101
            break

        candidate.match = fuzz.ratio(ri.group.name, candidate.groupName)
        if candidate.match > best.match:
            best = candidate
        if best.match == 100:
            break
    return best
def match_phrase(self, lineinput, phrases):
    """Return the phrase owning the part most similar to ``lineinput``.

    :param lineinput: text to match
    :param phrases: iterable of dicts with an ``'id'`` and a list of
        ``'parts'``
    :return: the phrase dict whose part scored highest (first one on ties)
    :raises ValueError: when no part is available to score
    """
    phrases_by_id = {}
    scored_parts = []
    for phrase in phrases:
        phrases_by_id[phrase['id']] = phrase
        for part in phrase['parts']:
            scored_parts.append({
                'part': part,
                'id': phrase['id'],
                'score': fuzz.ratio(part, lineinput),
            })

    best = max(scored_parts, key=lambda entry: entry['score'])
    return phrases_by_id[best['id']]
def get_fixture_channels(self, events, fixture):
    """Return the channel ids of the event that best matches ``fixture``.

    Every event is scored by averaging the similarity of its competition,
    home team and away team names with the fixture's. The channels of the
    best event are looked up only when that average exceeds 70.

    :param events: iterable of dicts with ``'event'`` and ``'channels'``
    :param fixture: fixture exposing ``competition``, ``home_team`` and
        ``away_team`` objects with a ``name``
    :return: list of channel ids, empty when nothing matches well enough
    """
    scored = []
    for entry in events:
        event = entry['event']
        comp = fuzz.ratio(fixture.competition.name, event['competition'])
        home = fuzz.ratio(fixture.home_team.name, event['home'])
        away = fuzz.ratio(fixture.away_team.name, event['away'])
        scored.append({
            'ratio': (comp + home + away) / 3,
            'channels': entry['channels'],
        })

    if not scored:
        return []

    # The first entry with the highest average wins.
    best = max(scored, key=itemgetter('ratio'))
    if best['ratio'] <= 70:
        return []

    channels = self.data.get_multiple('channel', 'name', best['channels'])
    return [channel.id for channel in channels]
def build_similarity(self, actor, other_actor):
    """Build an ``ActorSimilarity`` record comparing two actors.

    The record captures identity, whether names/e-mail names are "proper",
    exact equality of names and e-mails, and fuzzy ratios between the names,
    the e-mail domains and the names parsed from the e-mail addresses.

    :param actor: first actor
    :param other_actor: second actor
    :return: the populated ``ActorSimilarity``
    """
    similarity = ActorSimilarity(**su.empty_dict(ACTOR_SIMILARITY_FIELDS))

    name1, name2 = actor.parsed_name, other_actor.parsed_name
    email1, email2 = actor.parsed_email, other_actor.parsed_email
    email_name1, email_name2 = email1.parsed_name, email2.parsed_name

    # run comparisons for similarity
    similarity.identical = (actor.actor_id == other_actor.actor_id)

    similarity.proper_name1 = proper(name1)
    similarity.proper_name2 = proper(name2)
    similarity.proper_email_name1 = proper(email_name1)
    similarity.proper_email_name2 = proper(email_name2)

    similarity.same_name = (name1.name == name2.name)
    similarity.name_ratio = self.compare_names(name1, name2)

    similarity.same_email = (email1.email == email2.email)
    similarity.email_domain_ratio = fuzz.ratio(email1.domain, email2.domain)

    similarity.same_email_name = (email_name1.name == email_name2.name)
    similarity.email_name_ratio = self.compare_names(email_name1, email_name2)

    # Cross comparisons between a real name and the other e-mail's name.
    similarity.name1_email_ratio = self.compare_names(name1, email_name2)
    similarity.name2_email_ratio = self.compare_names(email_name1, name2)
    return similarity
def compare_names(name1: ParsedName, name2: ParsedName):
    """Return the fuzzy ratio between two parsed names.

    ``fuzz.token_set_ratio`` is used when both names are "proper";
    otherwise the plain ``fuzz.ratio`` is used.
    """
    both_proper = proper(name1) and proper(name2)
    scorer = fuzz.token_set_ratio if both_proper else fuzz.ratio
    return scorer(name1.name, name2.name)
def fuzzy_distance(word, words):
    """Rank ``words`` by similarity to ``word``.

    :return: list of ``(candidate, ratio)`` tuples, best match first;
        equally scored candidates keep their input order
    """
    ranked = [(candidate, fuzz.ratio(word, candidate)) for candidate in words]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
def parse_line(frequency_dict, word_index_dict, nynorsk_line, bokmaal_line):
    """Count aligned token pairs from a Nynorsk line and its Bokmaal line.

    Both lines are tokenized with ``\\w+``; tokens at the same position are
    paired and ``frequency_dict`` is incremented in place, keyed by the pair
    of indices returned by ``get_index_key``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the layout below assumes the frequency update runs for
    every token that does not end the loop - confirm against the original.
    """
    nn_tokenized = re.findall(r'\w+', nynorsk_line, re.MULTILINE | re.UNICODE)
    nb_tokenized = re.findall(r'\w+', bokmaal_line, re.MULTILINE | re.UNICODE)

    if (len(nn_tokenized) != len(nb_tokenized)):
        # Drop the whole sentence if it doesn't have the same number of tokens.
        return

    consecutive_skips = 0
    for i in range(len(nb_tokenized)):
        # If translation fails, the word is prefixed with '*'
        # NOTE(review): tokens come from ``\w+``, which cannot match '*', so
        # this check looks like it can never be true - verify.
        if '*' in nb_tokenized[i] or '*' in nn_tokenized[i]:
            continue

        # If the edit distance ratio is lower than 40 % for three consecutive words,
        # we conclude that we have gone astray, and drop the rest of the sentence.
        if (fuzz.ratio(nn_tokenized[i], nb_tokenized[i]) < 40):
            consecutive_skips += 1
            if (consecutive_skips == 3):
                break
        else:
            consecutive_skips = 0

        nn_token_idx = get_index_key(word_index_dict, nn_tokenized[i])
        nb_token_idx = get_index_key(word_index_dict, nb_tokenized[i])

        # Count this (nynorsk, bokmaal) index pair.
        if (nn_token_idx, nb_token_idx) in frequency_dict:
            frequency_dict[(nn_token_idx, nb_token_idx)] += 1
        else:
            frequency_dict[(nn_token_idx, nb_token_idx)] = 1
def filterModule(self, module):
    """Decide whether ``module`` passes the active module filter.

    * With a ``"type"`` filter, the module must have an input or output pin
      (depending on the filter's ``"dir"``) of the requested pin type.
    * With a ``"text"`` filter, the module passes when the text is empty, is
      a substring of the module name, or fuzzily matches the name or
      description with a ratio above 40.
    * Without a ``"text"`` filter every remaining module passes.

    :param module: module exposing ``inputDefs``, ``outputDefs``, ``name``
        and ``desc``
    :return: ``True`` when the module should be shown
    """
    criteria = self.modfilter

    if "type" in criteria:
        type_filter = criteria["type"]
        direction = type_filter["dir"]
        if direction == "input":
            pins = module.inputDefs
        elif direction == "output":
            pins = module.outputDefs
        else:
            pins = ()
        if not any(pin.pintype == type_filter["type"] for pin in pins):
            return False

    if "text" not in criteria:
        return True  # Don't filter by text? Return all remaining

    # Filter by text input
    text = criteria["text"]
    if text in module.name:
        return True
    if not text:  # Text entry is empty
        return True

    name_ratio = fuzz.ratio(text, module.name)
    desc_ratio = fuzz.partial_ratio(text, module.desc)
    return max(name_ratio, desc_ratio) > 40
def get_unknown(topic):
    """Build the "unknown topic" message with up to three suggestions.

    Suggestions are drawn only from topics of the same kind as ``topic``:
    those starting with ':' when it does, the rest otherwise.

    :param topic: the topic that could not be found
    :return: the message text listing the closest known topics
    """
    all_topics = get_topics_list()
    wants_special = topic.startswith(':')
    same_kind = [name for name in all_topics
                 if name.startswith(':') == wants_special]

    suggestions = process.extract(topic, same_kind, scorer=fuzz.ratio)[:3]
    suggestion_lines = [(" * %s %s" % suggestion) for suggestion in suggestions]
    return """ Unknown topic. Do you mean one of these topics may be? %s """ % "\n".join(suggestion_lines)
def find_entity(self, entity, types):
    """Find the state whose friendly name best matches ``entity``.

    Fetches ``<url>/api/states`` and scores every state whose entity-id
    domain (the part before the first '.') is listed in ``types``.

    :param entity: name to look for
    :param types: allowed entity-id domains
    :return: dict with ``id``, ``dev_name`` and ``state`` of the best match;
        ``None`` when nothing scores above 0 or the request did not
        return 200
    """
    request_options = {"headers": self.headers}
    if self.ssl:
        request_options["verify"] = self.verify
    req = get("%s/api/states" % self.url, **request_options)

    if req.status_code != 200:
        return None

    top_score = 0
    match = None
    for state in req.json():
        try:
            domain = state['entity_id'].split(".")[0]
            if domain not in types:
                continue
            score = fuzz.ratio(
                entity, state['attributes']['friendly_name'].lower())
            if score > top_score:
                top_score = score
                match = {
                    "id": state['entity_id'],
                    "dev_name": state['attributes']['friendly_name'],
                    "state": state['state'],
                }
        except KeyError:
            # States lacking one of the expected keys are skipped.
            pass
    return match

#
# checking the entity attributes to be used in the response dialog.
#
def match(self, query):
    """Find the title most similar to ``query``.

    When ``self.cleanStopWords`` is set, the query is segmented, stop words
    are dropped, and the result is compared against ``self.segTitles``;
    ties are then resolved through ``self.tieBreak``. Otherwise the raw
    query is compared against ``self.titles``.

    The winning ratio is stored in ``self.similarity``.

    Args:
        - query: the text to match

    Return: (target, index) - the matched title and its index
        (``""`` and ``-1`` when there are no titles)
    """
    if self.cleanStopWords:
        kept_words = [word for word in self.wordSegmentation(query)
                      if word not in self.stopwords]
        needle = "".join(kept_words)
        candidates = self.segTitles
    else:
        candidates = self.titles
        needle = query

    best_ratio, best_title, best_index = -1, "", -1
    for position, candidate in enumerate(candidates):
        score = fuzz.ratio(needle, candidate)
        if score > best_ratio:
            best_ratio, best_title, best_index = score, candidate, position
        elif self.cleanStopWords and score == best_ratio:
            best_title, best_index = self.tieBreak(query, best_index, position)

    self.similarity = best_ratio
    return best_title, best_index
def get(cls, name):
    """Look up a monster by name, falling back to the closest fuzzy match.

    The name is stripped and lower-cased once and that normalised form is
    used both for the exact lookup in ``MONSTER_D`` and for the fuzzy
    comparison, so case and surrounding whitespace no longer lower the
    fuzzy score.

    :param name: monster name to look up
    :return: the exact match if there is one, otherwise the monster in
        ``MONSTERS`` with the highest ``fuzz.ratio`` (first one on ties)
    :raises IndexError: when there is no exact match and ``MONSTERS`` is
        empty
    """
    wanted = name.strip().lower()

    exact = cls.MONSTER_D.get(wanted)
    if exact:
        return exact

    scored = [(fuzz.ratio(mon.name.lower().strip(), wanted), mon)
              for mon in cls.MONSTERS]
    if not scored:
        # Same exception type the previous ``sorted(...)[0]`` produced.
        raise IndexError("no monsters available to match against")
    return max(scored, key=lambda pair: pair[0])[1]
def match_contractors(contractors_file, match_file, match_col, match_threshold):
    """Fuzzy-match names from one CSV against contractor names in another.

    Both files are read as CSV with a header row that is skipped. Names are
    compared lower-cased with punctuation removed. For every data row of
    ``match_file`` the best contractor scoring strictly above
    ``match_threshold`` is selected.

    :param contractors_file: CSV whose column 0 is the contractor name and
        column 4 is the amount
    :param match_file: CSV holding the names to match
    :param match_col: index of the name column in ``match_file``
    :param match_threshold: minimum score (exclusive) to accept a match
    :return: list of rows from ``match_file``, each extended with the
        matched contractor name and amount (``''`` and ``-1`` if none)
    """
    # Built once: the table and the normalised names do not change per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalize(name):
        return name.translate(strip_punctuation).lower()

    results = []
    with open(match_file, 'r') as f, open(contractors_file, 'r') as g:
        contribs_reader = csv.reader(f)
        contracts_reader = csv.reader(g)

        next(contracts_reader)  # skip the contractors header
        contracts = [(normalize(contract[0]), contract)
                     for contract in contracts_reader]

        next(contribs_reader)  # skip the header of the file being matched
        for row in contribs_reader:
            match_name = normalize(row[match_col])
            best_match = ''
            best_match_amount = -1
            best_score = 0
            for contractor_name, contract in contracts:
                score = fuzz.ratio(match_name, contractor_name)
                if score > best_score and score > match_threshold:
                    best_match = contract[0]
                    best_score = score
                    best_match_amount = contract[4]
            results.append(row + [best_match, best_match_amount])
    return results
def __process_loc_results__(self, results, label):
    """Pick the best Library of Congress match for ``label`` from the JSON
    results of a LOC ID call.

    Every ``atom:entry`` row is weighted by the fuzzy ratio between
    ``label`` and the entry title; the highest-weighted entry whose weight
    reaches ``self.cutoff`` is returned.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below assumes the weight is recorded only for
    rows carrying an ``atom:link`` - confirm against the original.

    Args:
        results(list): List of JSON rows from LOC ID call
        label(str): Original Label

    Returns:
        ``(rdflib.URIRef, rdflib.Literal)`` for the match, or
        ``(None, None)`` when no entry reaches the cutoff.
    """
    title, loc_uri, term_weights = None, None, dict()
    for row in results:
        # Only list rows tagged as atom entries are of interest.
        if isinstance(row, dict) or not row[0].startswith('atom:entry'):
            continue
        if row[2][0].startswith("atom:title"):
            title = row[2][-1]
        if row[3][0].startswith("atom:link"):
            loc_url = row[3][-1].get('href')
            # The URL path segment selects the BIBFRAME class.
            if "subjects/" in loc_url:
                bf_class = BF.Topic
            elif "organizations/" in loc_url:
                bf_class = BF.Organization
            else:
                bf_class = BF.Agent
            loc_uri = rdflib.URIRef(loc_url)
            term_weights[str(loc_uri)] = {
                "weight": fuzz.ratio(label, title),
                "class": bf_class,
                "title": title}
    # Highest weight first.
    results = sorted(term_weights.items(), key=lambda x: x[1]['weight'])
    results.reverse()
    for row in results:
        loc_url = row[0]
        weight = row[1].get('weight')
        title = row[1].get('title')
        if weight >= self.cutoff:
            return rdflib.URIRef(loc_url), rdflib.Literal(title)
    return None, None
def address_filter_score(src, ref):
    """Return the sum of the partial and full fuzzy ratios of two addresses.

    Both inputs are passed through ``case_phrase`` before being compared.
    """
    cased_src = case_phrase(src)
    cased_ref = case_phrase(ref)
    partial_score = fuzz.partial_ratio(cased_src, cased_ref)
    full_score = fuzz.ratio(cased_src, cased_ref)
    return partial_score + full_score


# Acronym handling
def check_answer(self, message, match):
    """Check a participant's answer against the active question.

    The answer is compared case-insensitively with the expected one; it is
    accepted when the fuzzy ratio reaches ``self.fuzziness_ratio``. A
    correct answer clears the active question.

    :param message: incoming message (provides participant and sender)
    :param match: regex match exposing the ``answer`` group
    :return: a ``TextMessageProtocolEntity`` saying "Correct" or
        "Incorrect", or ``None`` when no question is active
    """
    answer = match.group("answer")
    # print() with one argument behaves the same on Python 2 and 3; the
    # bare ``print answer`` statement is a syntax error on Python 3.
    print(answer)

    if not self.active_question_bool:
        return None

    expected = self.data[self.active_index]['answer']
    is_correct = fuzz.ratio(answer.lower(), expected.lower()) >= self.fuzziness_ratio
    if is_correct:
        # The question has been answered: reset the active-question state.
        self.active_question_bool = False
        self.active_index = 0
        self.active_question = ""

    verdict = "Correct " if is_correct else "Incorrect "
    name = self.nombre(message.getParticipant())
    return TextMessageProtocolEntity(verdict + name + "!", to=message.getFrom())
def percentDiff(old, new):
    """Return ``fuzz.ratio(old, new)``.

    Despite the name, the value is the similarity ratio of the two inputs,
    not their difference.
    """
    return fuzz.ratio(old, new)
def get_type_from_title(title):
    """Guess the document type code from a Romanian title.

    The title is anglicised and lower-cased; only the part before the first
    'pentru' / 'privind' is compared with the keyword of each known type.

    :param title: document title
    :return: one of 'HG', 'OM', 'LEGE', 'OG', 'OUG' - the type whose keyword
        scores highest - or ``None`` when every score is 0
    """
    engrol = RomanianHelper.englishize_romanian(title).lower()

    keyword_hit = re.search(r'(pentru|privind)', engrol)
    stop_pos = len(title) if keyword_hit is None else keyword_hit.start()
    search_space = engrol[:stop_pos]

    type_to_keywords = {
        'HG': 'hotarare',
        'OM': 'ordin',
        'LEGE': 'lege',
        'OG': 'ordonanta',
        'OUG': 'ordonanta de urgenta'
    }

    best_type, best_ratio = None, 0
    for doc_type, keyword in type_to_keywords.items():
        ratio = fuzz.ratio(keyword, search_space)
        if ratio > best_ratio:
            best_type, best_ratio = doc_type, ratio
    return best_type
def extractMentorsMentees(data):
    """Split ``data`` into mentor rows and mentee rows.

    Rows are selected by exact equality of the column named ``cmap[4]``
    with "Mentor" / "Mentee". A fuzzy alternative
    (``fuzz.ratio(..., "Mentor") > 90``) was tried previously and is
    disabled. Each result gets an ``'xx'`` column numbering its rows from 0.

    :param data: DataFrame containing the ``cmap[4]`` column
    :return: ``(mentors, mentees)`` - two independent DataFrames
    """
    role_column = cmap[4]
    # Copy the filtered rows explicitly: adding a column to the result of a
    # boolean selection triggers pandas' SettingWithCopyWarning.
    mentors = data[data[role_column] == "Mentor"].copy()
    mentees = data[data[role_column] == "Mentee"].copy()
    mentors['xx'] = list(range(len(mentors)))
    mentees['xx'] = list(range(len(mentees)))
    return mentors, mentees
def scoreTheMatch(peer1,peer2,field_name):
    """Return the fuzzy ratio between both peers' ``field_name`` values."""
    first_value = peer1[field_name]
    second_value = peer2[field_name]
    return fuzz.ratio(first_value, second_value)
def asking_team(self, msg):
    """Handle the user's reply when asked for their team.

    If ``msg`` is similar enough (ratio above 49) to any known popular team
    name, the user's team slug, popular name and id are derived from
    ``msg``. When an id is found the state moves to ``CONFIRMING_TEAM`` and
    a confirmation question is returned; otherwise an "invalid team"
    response is returned.
    """
    equipes = utils.get_list_of_equipes_popular_names()  # String: 'Flamengo'

    if any(fuzz.ratio(equipe, msg) > 49 for equipe in equipes):
        user = self.user
        user.team_slug = msg.lower().replace(" ", "-")
        user.team_popular_name = utils.get_popular_name_by_slug(user.team_slug)
        user.team_id = utils.get_equipe_id_by_slug(user.team_slug)
        if user.team_id is not None:
            self.state = State.CONFIRMING_TEAM
            return TextResponse("Irado! ?? Seu time é o {}, né?".format(user.team_popular_name))

    return TextResponse('Você entrou com um time inválido! Por favor, tente novamente.')
def lookup(self, query):
    """Fetch the database record whose indexed name best matches ``query``.

    A query ending in '+' only accepts the first candidate that itself
    contains '+'; any other query takes the top-ranked candidate.

    :param query: non-empty search text
    :return: the ``FEHData`` document for the chosen name, or ``None`` when
        no candidate was chosen
    """
    matches = process.extract(query, self.index.keys(), scorer=fuzz.ratio)

    if query[-1] == '+':
        plus_names = (candidate[0] for candidate in matches
                      if candidate[0].find('+') != -1)
        chosen = next(plus_names, None)
    else:
        chosen = matches[0][0]

    if not chosen:
        return chosen

    collection = self.db[self.db.db_cfg.database].FEHData
    return collection.find_one({'id': self.index[chosen]})
def wiki(self, query, amount=5, threshold=50):
    """Return the wiki pages that best match ``query``.

    The 'HOME' page is always registered with score 0. For a non-empty
    query every wiki entry is scored against the last
    ``ARROW_CHARACTER``-separated segment of its name, case-insensitively.

    :param query: search text
    :param amount: passed to ``BestHandler.to_list``
    :param threshold: passed to ``BestHandler.to_list``
    :return: the list produced by ``BestHandler.to_list``
    """
    best = BestHandler()
    best.add(0, ('HOME', WIKI_URL))

    if query == '':
        return best.to_list(amount, threshold)

    for name, link in self._wiki.items():
        page_title = name.split(ARROW_CHARACTER)[-1].strip().lower()
        best.add(fuzz.ratio(query.lower(), page_title), (name, link))
    return best.to_list(amount, threshold)
def fuzzy_korean_ratio(str1: str, str2: str) -> int:
    """Fuzzy Search with Korean.

    Both strings go through ``normalize_korean_nfc_to_nfd`` before being
    compared with ``fuzz.ratio``.
    """
    decomposed1 = normalize_korean_nfc_to_nfd(str1)
    decomposed2 = normalize_korean_nfc_to_nfd(str2)
    return fuzz.ratio(decomposed1, decomposed2)
async def html(bot, event: Message, sess, keyword: str):
    """ HTML ???? ?? `{PREFIX}html tbody` (`tbody` TAG? ?? ???? ??) """
    # Looks up ``keyword`` in the cached 'html' reference and replies in the
    # event's channel with the closest entry.
    # Declared ``async`` because the body awaits ``bot.say``; a plain ``def``
    # containing ``await`` does not compile.
    # NOTE(review): the docstring and reply texts appear mis-encoded ('?');
    # they are left untouched because they may be shown to users.
    try:
        ref = sess.query(JSONCache).filter_by(name='html').one()
    except NoResultFound:
        # The reference cache has not been populated yet.
        await bot.say(
            event.channel,
            '?? ???? ?? ???? ????? ? ????. ??? ??????!'
        )
        return

    name = None
    link = None
    ratio = -1
    # Keep the first entry with the highest similarity to the keyword.
    for _name, _link in ref.body:
        _ratio = fuzz.ratio(keyword, _name)
        if _ratio > ratio:
            name = _name
            link = _link
            ratio = _ratio

    if ratio > 40:
        await bot.say(
            event.channel,
            f':html: `{name}` - {link}'
        )
    else:
        # Nothing similar enough was found.
        await bot.say(
            event.channel,
            '??? HTML Element? ?? ?????!'
        )
async def css(bot, event: Message, sess, keyword: str):
    """ CSS ???? ?? `{PREFIX}css color` (`color` ? ?? ???? ??) """
    # Looks up ``keyword`` in the cached 'css' reference and replies in the
    # event's channel with the closest entry.
    # Declared ``async`` because the body awaits ``bot.say``; a plain ``def``
    # containing ``await`` does not compile.
    # NOTE(review): the docstring and reply texts appear mis-encoded ('?');
    # they are left untouched because they may be shown to users.
    try:
        ref = sess.query(JSONCache).filter_by(name='css').one()
    except NoResultFound:
        # The reference cache has not been populated yet.
        await bot.say(
            event.channel,
            '?? ???? ?? ???? ????? ? ????. ??? ??????!'
        )
        return

    name = None
    link = None
    ratio = -1
    # Keep the first entry with the highest similarity to the keyword.
    for _name, _link in ref.body:
        _ratio = fuzz.ratio(keyword, _name)
        if _ratio > ratio:
            name = _name
            link = _link
            ratio = _ratio

    if ratio > 40:
        await bot.say(
            event.channel,
            f':css: `{name}` - {link}'
        )
    else:
        # Nothing similar enough was found.
        await bot.say(
            event.channel,
            '??? CSS ?? ??? ?? ?????!'
        )
async def python(bot, event: Message, sess, keyword: str):
    """ Python library ???? ?? `{PREFIX}py re` (`re` ?? ??? ?? ???? ??) """
    # Looks up ``keyword`` in the cached 'python' reference and replies in
    # the event's channel with the closest entry.
    # Declared ``async`` because the body awaits ``bot.say``; a plain ``def``
    # containing ``await`` does not compile.
    # NOTE(review): the docstring and reply texts appear mis-encoded ('?');
    # they are left untouched because they may be shown to users.
    try:
        ref = sess.query(JSONCache).filter_by(name='python').one()
    except NoResultFound:
        # The reference cache has not been populated yet.
        await bot.say(
            event.channel,
            '?? ???? ?? ???? ????? ? ????. ??? ??????!'
        )
        return

    name = None
    link = None
    ratio = -1
    # Keep the first entry with the highest similarity to the keyword.
    for code, _name, _link in ref.body:
        # Entries with a code are matched on it, the others on their name.
        _ratio = fuzz.ratio(keyword, code if code else _name)
        if _ratio > ratio:
            name = _name
            link = _link
            ratio = _ratio

    if ratio > 40:
        await bot.say(
            event.channel,
            f':python: {name} - {link}'
        )
    else:
        # Nothing similar enough was found.
        await bot.say(
            event.channel,
            '??? Python library? ?? ?????!'
        )
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzzy scores for two strings.

    The scorers are ``ratio``, ``partial_ratio``, ``token_sort_ratio`` and
    ``token_set_ratio``; the result is never below 0. Each scorer is called
    exactly once.
    """
    return max(
        0,
        fuzz.ratio(string_one, string_two),
        fuzz.partial_ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def compare_strings(string_one, string_two):
    """Return the highest of three fuzzy scores for two strings.

    The scorers are ``ratio``, ``token_sort_ratio`` and ``token_set_ratio``;
    the result is never below 0. Each scorer is called exactly once.
    """
    return max(
        0,
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def _match_place_name_to_wiki_page(place_name, wiki_page_titles):
    """Work horse of `geosearch`: separated for easier testing & debugging.

    Returns the first wiki page title scoring at or above `_THRESHOLD` for
    the first processor that yields any match, or None when no processor
    does.

    For example places we can't yet match, see
    `test_wp._CHALLENGE_PLACE_NAME_TO_WIKI`.

    Potential improvements:

    - Change existing dials (for each pass?): local vars (e.g. _THRESHOLD),
      radius/limit kwarg to Wikipedia API
    - Changes scorers on different passes, e.g. partial_ratio is more
      lenient than ratio.
    - Modify full_process processor: it removes non-letter-number characters
      so wiki disambiguation markup can cause undesired matching. For
      example, "Boulevard (restaurant)" becomes "boulevard restaurant",
      which matches "mourad restaurant" at 79.
    - Add additional processors:
        - Modify plurals, articles, accents (full_process will just remove
          accented characters :( ).
        - Remove city/state name occurences in wiki pages, e.g.
          "San Francisco Ferry Building" -> "Ferry Building" could better
          match the Yelp "Ferry Building Marketplace" (disclaimer:
          US-centric)
    - Modify place_name query string. These may be better than their
      "remove" counterparts because adding more characters gives more
      information to try to match against and may produce more accurate
      results than removing characters.
        - (reverse ^) add city/state to place names:
          "Ferry Building Marketplace" ->
          "San Francisco Ferry Building Marketplace"
        - Reverse wiki_disambiguation_processor: add common wikipedia
          endings: (restaurant), (California), etc.
    - Consider running most lenient processors first, moving towards more
      strict, like a filter. Right now we run the strictest first.
    """
    # We run multiple processor passes: if there is no match, the next
    # processor may be more lenient.
    for processor in _PLACE_NAME_TO_WIKI_PAGE_PROCESSORS:
        matches = process.extractBests(
            place_name,
            wiki_page_titles,
            scorer=_SCORER,
            processor=processor,
            score_cutoff=_THRESHOLD,
        )
        if len(matches) < 1:
            continue
        if len(matches) > 1:
            print('More than one match above threshold', matches, file=sys.stderr)
        return matches[0][0]
    return None
def is_almost_equal(self, other):
    """Return whether both names are fuzzily equal, ignoring case.

    The lower-cased names must reach ``MIN_FUZZY_RATIO``.
    """
    similarity = fuzz.ratio(self.name.lower(), other.name.lower())
    return similarity >= MIN_FUZZY_RATIO
def fuzzy_fit(x, y):
    """
    Returns whether x and y are similar in fuzzy string matching
    :param x: the first mention
    :param y: the second mention
    :return: whether x and y are similar in fuzzy string matching
    """
    if fuzz.ratio(x, y) >= 90:
        return True

    # Convert numbers to words
    spelled = []
    for mention in (x, y):
        words = [num2words(int(word)).replace('-', ' ') if word.isdigit() else word
                 for word in mention.split()]
        spelled.append(' '.join(words))

    return fuzz.ratio(spelled[0], spelled[1]) >= 85
def update_ner_pubtator(self):
    '''
    Process sentence tokens and see if any match to PubTator entity mentions.
    If so, replace their token['ner'] with the PubTator NER class (CHEMICAL, DISEASE, etc.)

    A token matches a mention when its character offsets equal the mention's
    CoreNLP offsets, or - when fuzzy matching is enabled - when the
    lower-cased texts score above self.fuzzy_ner_match.

    Returns self.pubtator_ner_updated.

    NOTE(review): the source this was recovered from had lost its
    indentation; the layout below assumes the flag is set once after all
    sentences were processed - confirm against the original.
    '''
    if self.pubtator:
        for sent in self.sentences:
            sentence_index = sent['index']
            # are there any PubTator NER tags for this sentence?
            if not self.pubtator.sentence_ner[sentence_index]:
                continue
            # process pubtator NER! (read CoreNLP tokens, see any of them match exactly...)
            for t in sent['tokens']:
                for biothing in self.pubtator.sentence_ner[sentence_index]:
                    start, end = biothing.corenlp_offsets
                    if t['characterOffsetBegin'] == start and t['characterOffsetEnd'] == end:
                        # exact match! update CoreNLP NER with PubTator NER
                        biothing.matched_corenlp_token = t['index']
                        t['ner'] = biothing.ner_type
                        break
                    elif fuzz and self.fuzzy_ner_match:
                        # fuzzy fallback: only when fuzz is available and a threshold is set
                        if fuzz.ratio(t['originalText'].lower(), biothing.token.lower()) > self.fuzzy_ner_match:
                            biothing.matched_corenlp_token = t['index']
                            t['ner'] = biothing.ner_type
                            break
        self.pubtator_ner_updated = True
    return self.pubtator_ner_updated
def get_best_match(self, input, corpus, tolerance):
    """Return the most similar ``(input item, corpus item)`` pair.

    Every combination of an item from ``input`` and one from ``corpus`` is
    scored with ``fuzz.ratio``; the first pair with the highest score wins.

    :param input: iterable of candidate strings
    :param corpus: iterable of reference strings
    :param tolerance: accepted but not used
    :return: ``(p, q)`` of the best pair, or ``("", "")`` when no pair
        scores above 0
    """
    best = (0, "", "")
    for candidate, reference in itr.product(input, corpus):
        score = fuzz.ratio(candidate, reference)
        if score > best[0]:
            best = (score, candidate, reference)
    return best[1], best[2]
def normalizeMalwareNamesStep1(malwarenames):
    """Join malware names into one normalised string.

    The names are joined with spaces; for every separator in ``TRENNER`` the
    separator is replaced by a space and the text is lower-cased; finally
    each ``MAPPING`` key (longest first) is replaced by its mapped value.

    :param malwarenames: list of malware name strings
    :return: the normalised string
    """
    # malwarenames-list to string
    joined = " ".join(malwarenames)

    for separator in TRENNER:
        joined = joined.replace(separator, " ")
        joined = joined.lower()

    longest_first = sorted(MAPPING, key=len, reverse=True)
    for alias in longest_first:
        joined = joined.replace(alias, MAPPING[alias])
    return joined


# similarity from the ratio, token_sort and token_set ratio methods in FuzzyWuzzy
def computeSimilarity(s1, s2):
    """Return a distance in [0.0, 1.0] between two strings.

    The best of ``ratio``, ``token_sort_ratio`` and ``token_set_ratio`` is
    taken; 0.0 means that score was 100 and 1.0 means it was 0.
    """
    best_score = max(
        fuzz.ratio(s1, s2),
        fuzz.token_sort_ratio(s1, s2),
        fuzz.token_set_ratio(s1, s2),
    )
    return 1.0 - (0.01 * best_score)