Python fuzzywuzzy.fuzz 模块,partial_ratio() 实例源码


项目:kaggle-quora-dup    作者:aerdem4    | 项目源码 | 文件源码
def extract_features(df):
    """Add token-level and fuzzy-matching similarity columns to ``df``.

    Both question columns are null-filled with "" and passed through
    ``preprocess`` first; every feature column is then computed row by row
    from the cleaned text. ``df`` is modified in place and also returned.
    """
    for question_col in ("question1", "question2"):
        df[question_col] = df[question_col].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1,
    )
    # Position i of each get_token_features() result feeds the i-th column.
    token_columns = (
        "cwc_min", "cwc_max",
        "csc_min", "csc_max",
        "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq",
        "abs_len_diff", "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        # Bind scorer as a default so each lambda keeps its own function.
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["question1"], row["question2"]),
            axis=1,
        )
    return df
项目:DVH-Analytics    作者:cutright    | 项目源码 | 文件源码
def get_combined_fuzz_score(a, b, **kwargs):
    """Return a combined fuzzy-match score for two names.

    Both names are normalized with ``clean_name``. The result is the product
    of ``fuzz.ratio`` and ``fuzz.partial_ratio``, each scaled by its weight,
    divided by 10000.

    :param a: first name
    :param b: second name
    :param kwargs: optional weights ``simple`` (applied to ``fuzz.ratio``) and
        ``partial`` (applied to ``fuzz.partial_ratio``); anything ``float()``
        accepts. Each weight defaults to 1 when not supplied.
    :return: the combined score as a float
    """
    a = clean_name(a)
    b = clean_name(b)

    # Use the caller's weight when given and fall back to 1 otherwise. The
    # earlier form overwrote a supplied weight with 1 and left the weight
    # unbound when the keyword was absent.
    w_simple = float(kwargs.get('simple', 1))
    w_partial = float(kwargs.get('partial', 1))

    simple = fuzz.ratio(a, b) * w_simple
    partial = fuzz.partial_ratio(a, b) * w_partial
    return float(simple) * float(partial) / float(10000)
项目:bridgy    作者:wagoodman    | 项目源码 | 文件源码
def search(self, targets, partial=True, fuzzy=False):
    """Return the instances whose aliases match any of ``targets``.

    Matching is case-insensitive. An exact alias match scores 100 and, when
    ``partial`` is true, a substring match scores 99. With ``fuzzy`` enabled
    the ``fuzz.partial_ratio`` score is also recorded when it exceeds 85 or
    the target is a substring of the alias.

    The result lists each matched instance once, in the order produced by
    sorting the collected ``(score, instance)`` pairs.
    """
    scored = set()
    candidates = self.instances()

    for host in targets:
        wanted = host.lower()
        for instance in candidates:
            aliases = []
            if instance.aliases is not None:
                aliases.extend(instance.aliases)
            for alias in aliases:
                known = alias.lower()
                contained = wanted in known

                if wanted == known:
                    scored.add((100, instance))
                elif partial and contained:
                    scored.add((99, instance))

                if fuzzy:
                    score = fuzz.partial_ratio(wanted, known)
                    if score > 85 or contained:
                        scored.add((score, instance))

    # An instance can be matched more than once; keep only its first
    # occurrence in the sorted sequence so it appears once in the result.
    ranked = (instance for _, instance in sorted(scored))
    return list(collections.OrderedDict.fromkeys(ranked))
项目:the-magical-csv-merge-machine    作者:entrepreneur-interet-general    | 项目源码 | 文件源码
def score_chars(src, ref):
    """Score the character-level similarity of ``src`` against ``ref``.

    Returns a score in [0, 100]:

    * 100 when the acronym form of one input equals the upper-cased ASCII
      form of the other;
    * 0 when ``fuzz.ratio`` is below 20 or ``fuzz.partial_ratio`` is below 30
      on the ``justCase`` forms;
    * otherwise the product of both ratios divided by 100.
    """
    a0 = toASCII(src)
    b0 = toASCII(ref)
    a1 = acronymizePhrase(a0)
    b1 = acronymizePhrase(b0)
    if len(a1) > 0 and len(b1) > 0 and (a1 == b0.upper() or a0.upper() == b1):
        # Log the ASCII forms: `a` and `b` are only assigned further down, so
        # formatting them here raised UnboundLocalError on every acronym hit.
        logging.debug('Accepted for ACRO : {} / {}'.format(a0, b0))
        return 100
    a = justCase(src)
    b = justCase(ref)
    absCharRatio = fuzz.ratio(a, b)
    if absCharRatio < 20:
        logging.debug('Rejected for ABS : {} / {}'.format(a, b))
        return 0
    partialCharRatio = fuzz.partial_ratio(a, b)
    if partialCharRatio < 30:
        logging.debug('Rejected for PARTIAL : {} / {}'.format(a, b))
        return 0
    return absCharRatio * partialCharRatio / 100
项目:kaggle    作者:rbauld    | 项目源码 | 文件源码
def fuzzy_feats(train_in, test_in, qcolumns=['question1', 'question2'], append=''):
    """Build fuzzy-match feature frames for a train/test pair.

    Each input is copied and reduced to the two columns named in ``qcolumns``.
    Four fuzzywuzzy scores are then added per row, stored under ``fuzz_r``,
    ``fuzz_pr``, ``fuzz_tsr`` and ``fuzz_tsor``, each suffixed with ``append``.

    Returns the tuple ``(train, test)`` of the new frames.
    """
    from fuzzywuzzy import fuzz
    import pandas as pd

    scorers = (
        ('fuzz_r', fuzz.ratio),
        ('fuzz_pr', fuzz.partial_ratio),
        ('fuzz_tsr', fuzz.partial_token_set_ratio),
        ('fuzz_tsor', fuzz.partial_token_sort_ratio),
    )

    train = train_in.copy().loc[:, qcolumns]
    test = test_in.copy().loc[:, qcolumns]

    # Train columns are filled first, then test, one scorer at a time.
    for frame in (train, test):
        for prefix, scorer in scorers:
            frame[prefix + append] = frame.apply(
                lambda row, scorer=scorer: scorer(row[qcolumns[0]], row[qcolumns[1]]),
                axis=1,
            )

    return (train, test)
项目:pyree-old    作者:DrLuke    | 项目源码 | 文件源码
def filterModule(self, module):
        # Decide whether `module` passes the filter held in self.modfilter and
        # return a bool.
        # NOTE(review): this listing is damaged -- the two statements flagged
        # below are cut off mid-expression and the `else:` lines that the
        # indentation implies are missing, so the block does not parse as shown.
        ratio = 0
        compatibleType = False
        if "type" in self.modfilter:
            # Type filter: look through the module's pin definitions of the
            # requested direction for one whose pintype equals the requested type.
            if self.modfilter["type"]["dir"] == "input":
                for input in module.inputDefs:
                    if input.pintype == self.modfilter["type"]["type"]:
                        compatibleType = True
            elif self.modfilter["type"]["dir"] == "output":
                for output in module.outputDefs:
                    if output.pintype == self.modfilter["type"]["type"]:
                        compatibleType = True

            # No pin of the requested type: reject regardless of the text filter.
            if not compatibleType:
                return False

        if "text" in self.modfilter:    # Filter by text input
            # NOTE(review): the right-hand operand of `in` is missing here.
            if self.modfilter["text"] in
                return True
            if not self.modfilter["text"]:  # Text entry is empty
                return True
            # NOTE(review): the second argument and closing parenthesis are missing here.
            ratio = fuzz.ratio(self.modfilter["text"],
            # Keep the better of the score above and the partial score against module.desc.
            ratio = max(ratio, fuzz.partial_ratio(self.modfilter["text"], module.desc))
            # NOTE(review): presumably the body of a lost `else:` branch -- confirm upstream.
            return True     # Don't filter by text? Return all remaining

        # Accept when the best fuzzy score exceeds 40.
        if ratio > 40:
            return True
            # NOTE(review): unreachable as written; presumably under a lost `else:`.
            return False
项目:newsname-match    作者:bahadasx    | 项目源码 | 文件源码
def similarity(n1, n2):
    # NOTE(review): the prose lines directly below read as this function's
    # docstring whose triple-quote delimiters were lost; restore them before use.
    Returns the mean of the partial_ratio score for each field in the two
    entities. Note that if they don't have fields that match, the score will
    be zero.

    # NOTE(review): the list literal is cut off after its first expression (the
    # remaining lines and the closing bracket are missing) -- recover upstream.
    scores = [
        fuzz.partial_ratio(n1, n2)

    # Arithmetic mean of the collected scores; divides by len(scores).
    return float(sum(s for s in scores)) / float(len(scores))
项目:the-magical-csv-merge-machine    作者:entrepreneur-interet-general    | 项目源码 | 文件源码
def address_filter_score(src, ref):
    """Return ``fuzz.partial_ratio`` plus ``fuzz.ratio`` for the two inputs.

    Both inputs are passed through ``case_phrase`` before scoring.
    """
    cased_src = case_phrase(src)
    cased_ref = case_phrase(ref)
    partial_score = fuzz.partial_ratio(cased_src, cased_ref)
    full_score = fuzz.ratio(cased_src, cased_ref)
    return partial_score + full_score

# Acronym handling
项目:skills-ml    作者:workforce-data-initiative    | 项目源码 | 文件源码
def fuzzy_matches_in_sentence(self, skill, sentence):
        # Generator: compares `skill` against every n-gram of `sentence`, where
        # n is the number of whitespace-separated words in `skill`.
        N = len(skill.split())
        doc = self.ngrams(sentence, N)
        # Each n-gram is joined with a bytes separator, so the n-gram items
        # must be bytes -- NOTE(review): confirm what self.ngrams yields.
        doc_join = [b" ".join(d) for d in doc]

        for dj in doc_join:
            ratio = fuzz.partial_ratio(skill, dj)
            # Only n-grams scoring above 88 produce a candidate.
            if ratio > 88:
                # NOTE(review): the CandidateSkill(...) call is cut off in this
                # listing (arguments and closing parenthesis are missing).
                yield CandidateSkill(
项目:skills-ml    作者:workforce-data-initiative    | 项目源码 | 文件源码
def candidate_skills(self, job_posting):
        # Generator over candidate skills found in job_posting.text: every
        # entry of self.lookup is checked against every preprocessed sentence.
        # NOTE(review): this listing is damaged -- the `if` on the exact-match
        # path lost its call head, the CandidateSkill(...) call is cut off, and
        # the `else:` that should introduce the fuzzy branch is missing.
        document = job_posting.text
        sentences = self.ie_preprocess(document)

        for skill in self.lookup:
            len_skill = len(skill.split())
            for sent in sentences:
                sent = sent.encode('utf-8')

                # Exact matching
                if len_skill == 1:
                    # Single-word skills are matched on the decoded text.
                    sent = sent.decode('utf-8')
                    # NOTE(review): presumably a regex search call with
                    # re.IGNORECASE -- the function name was lost; confirm upstream.
                    if'\b' + skill + r'\b', sent, re.IGNORECASE):
                        yield CandidateSkill(
                # Fuzzy matching
                    ratio = fuzz.partial_ratio(skill, sent)
                    # You can adjust the partial of matching here:
                    # 100 => exact matching 0 => no matching
                    if ratio > 88:
                        # Sentence-level hit: re-yield the n-gram level matches.
                        for match in self.fuzzy_matches_in_sentence(skill, sent):
                            yield match
项目:Chirps    作者:vered1986    | 项目源码 | 文件源码
def is_aligned_arg(x, y):
    """
    Return whether these two arguments are aligned: they occur in the same WordNet synset.

    The checks run in order:

    1. a space-padded ``fuzz.partial_ratio`` of exactly 100 counts as aligned;
    2. if either argument has no non-stop words, they are not aligned;
    3. for single-word arguments, any shared non-stop lemma counts as aligned;
    4. otherwise they are aligned when the number of (x word, y word) pairs
       sharing a non-stop lemma is at least 75% of the longer word list.

    :param x: the first argument
    :param y: the second argument
    :return: Whether they are aligned
    """
    global nlp

    # Allow partial matching
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True

    x_words = [w for w in x.split() if not nlp.is_stop(w)]
    y_words = [w for w in y.split() if not nlp.is_stop(w)]

    if not x_words or not y_words:
        return False

    # One set of lower-cased lemma names per word, gathered over all of the
    # word's synsets; underscores in lemma names become spaces.
    x_synonyms = [{lemma.lower().replace('_', ' ') for synset in wn.synsets(w) for lemma in synset.lemma_names()}
                  for w in x_words]
    y_synonyms = [{lemma.lower().replace('_', ' ') for synset in wn.synsets(w) for lemma in synset.lemma_names()}
                  for w in y_words]

    # One word - check whether there is intersection between synsets
    if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
            any(not nlp.is_stop(w) for w in x_synonyms[0] & y_synonyms[0]):
        return True

    # More than one word - align words from x with words from y by counting
    # the word pairs that share at least one non-stop lemma.
    overlapping_pairs = sum(
        1
        for s1 in x_synonyms
        for s2 in y_synonyms
        if any(not nlp.is_stop(w) for w in s1 & s2)
    )

    return overlapping_pairs >= 0.75 * max(len(x_synonyms), len(y_synonyms))
项目:ModTools    作者:MattBSG    | 项目源码 | 文件源码
def strict_compare_strings(string_one, string_two):
    """Return the highest of four fuzzywuzzy scores for the two strings.

    The scorers are ``fuzz.ratio``, ``fuzz.partial_ratio``,
    ``fuzz.token_sort_ratio`` and ``fuzz.token_set_ratio``. The result is
    never below 0.

    :param string_one: first string to compare
    :param string_two: second string to compare
    :return: the largest score produced by any scorer, floored at 0
    """
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    # Each scorer runs exactly once; comparing and then re-computing the same
    # score doubled the work for every winning scorer.
    best_score = max(scorer(string_one, string_two) for scorer in scorers)
    return max(0, best_score)
项目:watcher    作者:nosmokingbandit    | 项目源码 | 文件源码
def fuzzy_title(self, titles):
        ''' Score and remove results based on title match
        titles: list of titles to match against

        If titles is an empty list every result is treated as a perfect match

        Iterates through self.results and removes any entry that does not
            fuzzy match 'title' > 60.
        Adds fuzzy_score / 20 points to ['score']

        *If title is passed as None, assumes perfect match and scores +20

        Does not return
        ''''Checking title match.')

        # NOTE(review): this listing is damaged. The text trailing the docstring
        # above and the text after `self.results = lst` below look like remnants
        # of stripped logging calls, and the `else:` branches and the statements
        # that add kept results to `lst` are missing -- recover from upstream.
        # NOTE(review): the docstring says "> 60" and "/ 20", while the visible
        # code tests `> 70` and adds `max(matches) / 5`; confirm which is intended.
        lst = []
        if titles == []:
            for result in self.results:
                result['score'] += 20
            for result in self.results:
                if result['type'] == 'import' and result not in lst:
                    result['score'] += 20
                # Compare the encoded result title with every encoded candidate title.
                test = Url.encode(result['title'])
                matches = [fuzz.partial_ratio(Url.encode(title), test) for title in titles]
                if any([match > 70 for match in matches]):
                    result['score'] += (max(matches) / 5)
                    logging.debug(u'{} best title match was {}%, removing search result.'.format(test, max(matches)))
        self.results = lst'Keeping {} results.'.format(len(self.results)))
项目:watcher    作者:nosmokingbandit    | 项目源码 | 文件源码
def fuzzy_match(self, items, test):
    ''' Fuzzy matches title with predb rss titles
    :param items: list of titles in predb rss
    :param test: str to match to rss titles

    Returns True if any one of 'items' fuzzy matches 'test' above 50%
    (fuzz.partial_ratio > 50), otherwise False. An empty 'items' yields False.
    '''
    # any() stops at the first item that clears the threshold.
    return any(fuzz.partial_ratio(item, test) > 50 for item in items)
项目:BuboQA    作者:castorini    | 项目源码 | 文件源码
def reverseLinking(sent, text_candidate):
    # Labels the tokens of `sent` with "I" (inside the entity) or "O" and
    # returns (entity_text, label, exact_match).
    # NOTE(review): this listing is damaged -- the `if` inside the first loop
    # lost its call head, and the indentation further down implies missing
    # `else:` / `try:` / `except:` lines plus the statement that appends tokens
    # to entity_text. Recover from upstream before use.
    tokens = sent.split()
    label = ["O"] * len(tokens)
    text_attention_indices = None
    exact_match = False

    # No candidates: return the '<UNK>' placeholder with an all-"O" label list.
    if text_candidate is None or len(text_candidate) == 0:
        return '<UNK>', label, exact_match

    # sorted by length
    for text in sorted(text_candidate, key=lambda x:len(x), reverse=True):
        # Candidate must appear delimited by whitespace or the string edges.
        pattern = r'(^|\s)(%s)($|\s)' % (re.escape(text))
        # NOTE(review): presumably a regex search of `pattern` in `sent`;
        # the call head is missing -- confirm upstream.
        if, sent):
            text_attention_indices = get_indices(tokens, text.split())
    if text_attention_indices != None:
        exact_match = True
        for i in text_attention_indices:
            label[i] = 'I'
            # NOTE(review): from here the nesting looks wrong; these lines
            # presumably belong to a lost else/try/except -- confirm upstream.
            v, score = process.extractOne(sent, text_candidate, scorer=fuzz.partial_ratio)
            print("Extraction Error with FuzzyWuzzy : {} || {}".format(sent, text_candidate))
            return '<UNK>', label, exact_match
        v = v.split()
        # Rank n-gram candidates by fuzz.ratio against the extracted text and
        # mark the index range of the best one.
        n_gram_candidate = get_ngram(tokens)
        n_gram_candidate = sorted(n_gram_candidate, key=lambda x: fuzz.ratio(x[0], v), reverse=True)
        top = n_gram_candidate[0]
        for i in range(top[1], top[2]):
            label[i] = 'I'
    entity_text = []
    for l, t in zip(label, tokens):
        # NOTE(review): the body of this `if` is missing in the listing.
        if l == 'I':
    entity_text = " ".join(entity_text)
    label = " ".join(label)
    return entity_text, label, exact_match
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def main(conf):
    # Loads the train and test datasets named by conf['fuzzy.dataset'],
    # computes the fuzzy features and writes train.csv / test.csv under
    # conf['fuzzy.dump.dir'].
    # NOTE(review): this listing is damaged -- the string fragments trailing
    # several statements (e.g. 'Loading train dataset') look like remnants of
    # stripped logging calls, and the two `]].to_csv(...)` lines lost the
    # expression that selects the columns to write. Recover from upstream.
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)'Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])'Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)'Writing train dataset to disk')
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)'Writing test dataset to disk')
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
项目:matchtools    作者:matchtools    | 项目源码 | 文件源码
# NOTE(review): this listing is damaged. The signature below is cut off after
# `tolerance=None,` (the `method` parameter the body uses, and its default, are
# not visible), the docstring lost its triple-quote delimiters and its doctest
# outputs, and the `else:` between the two tolerance assignments is missing.
# Recover from upstream before use.
def compare_strings(cls, string1, string2, *, tolerance=None,
        Check if the strings provided have a similarity ratio within the
        specified tolerance.

        Return True if yes, otherwise return False.

        Use fuzzywuzzy (

        :param string1: str
        :param string2: str
        :param tolerance: number
        :param method: str, one of: 'uwratio', 'partial_ratio',
                                    'token_sort_ratio', 'token_set_ratio',
        :rtype: bool


        >>> MatchBlock.compare_strings('Beatles', 'The Beatles', tolerance=10)

        >>> MatchBlock.compare_strings('AB', 'AC', tolerance=0, method='ratio')

        # True when either string contains at least one digit.
        str_number = any(
            char.isdigit() for string in (string1, string2) for char in string)

        # Default tolerance comes from class attributes.
        # NOTE(review): as shown, the second assignment always overrides the
        # first; presumably it sat under a lost `else:`.
        if tolerance is None:
            if str_number:
                tolerance = cls.str_number_tolerance
                tolerance = cls.string_tolerance

        # Digit-free strings that are abbreviations of one another match outright.
        if not str_number:
            if cls.is_abbreviation(string1, string2):
                return True

        # Dispatch table from method name to fuzzywuzzy scorer.
        methods = {'uwratio': fuzz.UWRatio,
                   'partial_ratio': fuzz.partial_ratio,
                   'token_sort_ratio': fuzz.token_sort_ratio,
                   'token_set_ratio': fuzz.token_set_ratio,
                   'ratio': fuzz.ratio}

        if method not in methods:
            msg = 'wrong method, use available: {}'
            raise ValueError(msg.format(', '.join(sorted(methods))))

        # The score must be within `tolerance` points of a perfect 100.
        return methods[method](string1, string2) >= 100 - tolerance
项目:OKR    作者:vered1986    | 项目源码 | 文件源码
def partial_match(x, y):
    """
    Return whether these two mentions have a partial match in WordNet synset.

    The checks run in order:

    1. a space-padded ``fuzz.partial_ratio`` of exactly 100 counts as a match;
    2. if either mention has no non-stop words, there is no match;
    3. for single-word mentions, any shared non-stop lemma counts as a match;
    4. otherwise the words are aligned through ``Munkres`` on the negated
       shared-lemma counts, and the mentions match when the mean count over
       the chosen alignment is at least 0.75.

    :param x: the first mention
    :param y: the second mention
    :return: Whether they are aligned
    """
    # Allow partial matching
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True

    x_words = [w for w in x.split() if not is_stop(w)]
    y_words = [w for w in y.split() if not is_stop(w)]

    if len(x_words) == 0 or len(y_words) == 0:
        return False

    # One set of lower-cased lemma names per word, gathered over all of the
    # word's synsets; underscores in lemma names become spaces.
    x_synonyms = [set([lemma.lower().replace('_', ' ') for synset in wn.synsets(w) for lemma in synset.lemma_names()])
                  for w in x_words]
    y_synonyms = [set([lemma.lower().replace('_', ' ') for synset in wn.synsets(w) for lemma in synset.lemma_names()])
                  for w in y_words]

    # One word - check whether there is intersection between synsets
    if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
                    len([w for w in x_synonyms[0].intersection(y_synonyms[0]) if not is_stop(w)]) > 0:
        return True

    # More than one word - align words from x with words from y.
    # One row per y word and one column per x word; each entry is the negated
    # count of shared non-stop lemmas.
    # NOTE(review): the negation presumably lets a cost-minimizing solver pick
    # the pairs with the largest overlap -- confirm against the Munkres docs.
    cost = -np.vstack([np.array([len([w for w in s1.intersection(s2) if not is_stop(w)]) for s1 in x_synonyms])
                       for s2 in y_synonyms])
    m = Munkres()
    cost = pad_to_square(cost)
    indices = m.compute(cost)

    # Compute the average score of the alignment
    average_score = np.mean([-cost[row, col] for row, col in indices])

    if average_score >= 0.75:
        return True

    return False
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def compute_features(train_df, test_df):
    # Adds one column per fuzzywuzzy scorer to both frames (in place), scoring
    # the str() of each row's two question fields, and calls compute_quality
    # on the train frame for every new column.
    # NOTE(review): the `quality = dict(` statement near the end is cut off in
    # this listing (arguments and closing parenthesis are missing); presumably
    # it collected the quality_* values -- recover from upstream.

    # QRatio
    train_df[Fields.qratio] = train_df.apply(
        lambda row: fuzz.QRatio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.qratio] = test_df.apply(
        lambda row: fuzz.QRatio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_qratio = compute_quality(train_df, Fields.qratio)

    # WRatio
    train_df[Fields.wratio] = train_df.apply(
        lambda row: fuzz.WRatio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.wratio] = test_df.apply(
        lambda row: fuzz.WRatio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_wratio = compute_quality(train_df, Fields.wratio)

    # partial_ratio
    train_df[Fields.partial_ratio] = train_df.apply(
        lambda row: fuzz.partial_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_ratio] = test_df.apply(
        lambda row: fuzz.partial_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_ratio = compute_quality(train_df, Fields.partial_ratio)

    # partial_token_set_ratio
    train_df[Fields.partial_token_set_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_token_set_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_token_set_ratio = compute_quality(train_df, Fields.partial_token_set_ratio)

    # partial_token_sort_ratio
    train_df[Fields.partial_token_sort_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.partial_token_sort_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_partial_token_sort_ratio = compute_quality(train_df, Fields.partial_token_sort_ratio)

    # token_set_ratio
    train_df[Fields.token_set_ratio] = train_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.token_set_ratio] = test_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_token_set_ratio = compute_quality(train_df, Fields.token_set_ratio)

    # token_sort_ratio
    train_df[Fields.token_sort_ratio] = train_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[FieldsTrain.question1]), str(row[FieldsTrain.question2])), axis=1)
    test_df[Fields.token_sort_ratio] = test_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[FieldsTest.question1]), str(row[FieldsTest.question2])), axis=1)
    quality_token_sort_ratio = compute_quality(train_df, Fields.token_sort_ratio)

    quality = dict(

    return quality