The following 50 code examples, extracted from open source Python projects, illustrate how to use Levenshtein.distance().
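Before the project examples, here is a minimal usage sketch of the call itself. It assumes the python-Levenshtein package is installed and imported under its top-level name `Levenshtein`; the input strings are arbitrary illustrative values.

import Levenshtein

# Levenshtein.distance(a, b) returns the minimum number of single-character
# insertions, deletions, and substitutions needed to turn a into b.
print(Levenshtein.distance('kitten', 'sitting'))   # 3
print(Levenshtein.distance('flaw', 'lawn'))        # 2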
def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = {ss: ii for ii, ss in enumerate(b)}

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
def levenshtein_distance(a, b):
    """Return the Levenshtein edit distance between two strings *a* and *b*."""
    if a == b:
        return 0
    if len(a) < len(b):
        a, b = b, a
    if not a:
        return len(b)
    previous_row = range(len(b) + 1)
    for i, column1 in enumerate(a):
        current_row = [i + 1]
        for j, column2 in enumerate(b):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (column1 != column2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]
def getStopFromString(self, candidate):
    normalizedCandidate = Stop.normalizeStopName(candidate)
    if not Tpg.getTodaysStops():
        return None
    for stop in Tpg.getTodaysStops():
        if candidate.upper() == stop.code:
            return stop
        if normalizedCandidate == stop.normalizedName:
            return stop
    for stop in Tpg.getTodaysStops():
        if normalizedCandidate in stop.normalizedName:
            return stop
    # calculate the Levenshtein distance to all stop names
    codeToLevenshtein = {stop: Levenshtein.distance(normalizedCandidate, stop.normalizedName)
                         for stop in Tpg.getTodaysStops()}
    # smallest Levenshtein distance
    minimum = min(codeToLevenshtein, key=codeToLevenshtein.get)
    return minimum
def test_parse(self):
    for file in os.listdir(SAMPLE_DIR):
        if not file.endswith('.rst'):
            continue
        filename = os.path.join(SAMPLE_DIR, file)
        article = parse_article(filename)
        rendered = article.render().strip()
        with open(filename) as f:
            source = f.read().strip()
        source = source.expandtabs(4).decode('utf8')
        if source != rendered:
            lev_ = distance(source, rendered)
            jaro_ = jaro(source, rendered)
            if lev_ > 10 and jaro_ < 0.8 and file not in MUTATED_FILES:
                print('%d %f %s' % (lev_, jaro_, filename))
                raise AssertionError(filename)
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session:
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distance of each utterance
    """
    # build the SparseTensors from the corresponding inputs
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)
    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)
    return edit_distances
def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.
    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of str_true
    Returns:
        per (float): Phone Error Rate between str_true and str_pred
    """
    # Build mapping of phone to index
    phone_set = set(ref + hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))

    # Map phones to a single char array
    # NOTE: Levenshtein packages only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]

    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per
def inference(predictions_op, true_labels_op, display, sess):
    """ Perform inference per batch on pre-trained model.
    This function performs inference and computes the CER per utterance.
    Args:
        predictions_op: Prediction op
        true_labels_op: True Labels op
        display: print sample predictions if True
        sess: default session to evaluate the ops.
    Returns:
        char_err_rate: list of CER per utterance.
    """
    char_err_rate = []
    # Perform inference of batch worth of data at a time.
    [predictions, true_labels] = sess.run([predictions_op, true_labels_op])
    pred_label = sparse_to_labels(predictions[0][0])
    actual_label = sparse_to_labels(true_labels)
    for (label, pred) in zip(actual_label, pred_label):
        char_err_rate.append(distance(label, pred) / len(label))
    if display:
        # Print sample responses
        for i in range(ARGS.batch_size):
            print(actual_label[i] + ' vs ' + pred_label[i])
    return char_err_rate
def get_strings_for_search(value):
    """
    Returns all statements which have a substring of the given value
    :param value: String
    :return: dict() with Statements.uid as key and 'text', 'distance' as well as 'arguments' as values
    """
    tmp_dict = OrderedDict()
    db_statements = get_not_disabled_statement_as_query().join(TextVersion, Statement.textversion_uid == TextVersion.uid).all()
    for stat in db_statements:
        if value.lower() in stat.textversions.content.lower():
            # get distance between input value and saved value
            rd = __get_fuzzy_string_dict(current_text=value, return_text=stat.textversions.content, uid=stat.uid)
            tmp_dict[str(stat.uid)] = rd
    tmp_dict = __sort_dict(tmp_dict)
    return_index = list(islice(tmp_dict, list_length))
    return_dict = OrderedDict()
    for index in return_index:
        return_dict[index] = tmp_dict[index]
    return return_dict
def get_strings_for_public_nickname(value, nickname):
    """
    Returns dictionaries with public nicknames of users, where the nickname contains the value
    :param value: String
    :param nickname: current user's nickname
    :return: dict()
    """
    db_user = DBDiscussionSession.query(User).filter(
        func.lower(User.public_nickname).contains(func.lower(value)),
        ~User.public_nickname.in_([nickname, 'admin', nick_of_anonymous_user])).all()
    return_array = []
    for index, user in enumerate(db_user):
        dist = get_distance(value, user.public_nickname)
        return_array.append({'index': index,
                             'distance': dist,
                             'text': user.public_nickname,
                             'avatar': get_public_profile_picture(user)})
    return_array = __sort_array(return_array)
    return return_array[:list_length]
def __sort_array(list):
    """
    Returns sorted array, based on the distance
    :param list: Array
    :return: Array
    """
    return_list = []
    newlist = sorted(list, key=lambda k: k['distance'])
    if mechanism == 'SequenceMatcher':  # sort descending
        newlist = reversed(newlist)
    # add index
    for index, dict in enumerate(newlist):
        dict['index'] = index
        return_list.append(dict)
    return return_list
def __sort_dict(dictionary):
    """
    Returns sorted dictionary, based on the distance
    :param dictionary: dict()
    :return: dict()
    """
    dictionary = OrderedDict(sorted(dictionary.items()))
    return_dict = OrderedDict()
    for i in list(dictionary.keys())[0:return_count]:
        return_dict[i] = dictionary[i]
    if mechanism == 'SequenceMatcher':  # sort descending
        return_dict = OrderedDict(sorted(dictionary.items(), key=lambda kv: kv[0], reverse=True))
    else:  # sort ascending
        return_dict = OrderedDict()
        for i in list(dictionary.keys())[0:return_count]:
            return_dict[i] = dictionary[i]
    return return_dict
def getSignificantItems(item_list):
    tokenised_list = []
    logging.info('Tokenising input data.')
    for item in item_list:
        tokenised_list.append(tokeniseUrl(item))
    items = np.asarray(item_list)
    tokenised_items = np.asarray(tokenised_list)
    logging.info('Calculating Levenshtein distances between items.')
    lev_similarity = -1 * np.array([[Levenshtein.distance(i1, i2) for i1 in tokenised_items] for i2 in tokenised_items])
    logging.info('Applying affinity propagation to data.')
    aff_prop = sklearn.cluster.AffinityPropagation(affinity='precomputed', damping=0.7)
    aff_prop.fit(lev_similarity)
    logging.info('Completed! Assembling list.')
    output_list = []
    for cluster_id in np.unique(aff_prop.labels_):
        exemplar = items[aff_prop.cluster_centers_indices_[cluster_id]]
        output_list.append(exemplar)
    return output_list
def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
def maybe_same(str1, str2):
    '''Heuristically judge whether two strings are likely to be the same.'''
    if len(str1) > len(str2):
        temp = str1
        str1 = str2
        str2 = temp
    # if the lengths differ too much, treat the strings as different
    if float(len(str2)) / len(str1) > 2 and len(str1) >= 4:
        return False
    # otherwise decide by edit-distance thresholds that scale with the string length
    distance = Levenshtein.distance(str1, str2)
    if distance <= 3 and len(str1) >= 10:
        return True
    if distance <= 4 and len(str1) >= 13:
        return True
    if distance <= 1 and len(str1) >= 5:
        return True
    if distance > 2 and len(str1) <= 6:
        return False
    if distance > 3:
        return False
    return True
def find_knn(self, train_strings, train_labels, test_strings):
    """Find 3 nearest neighbors of each item in test_strings in train_strings
    and report their labels as the prediction.
    Args:
        train_strings (ndarray): Numpy array with strings in training set
        train_labels (ndarray): Numpy array with labels of train_strings
        test_strings (ndarray): Numpy array with string to be predict for
    """
    prediction = np.zeros((len(test_strings), self.num_classes))
    for i in range(len(test_strings)):
        a_str = test_strings[i]
        dists = np.array([0] * len(train_strings))
        for j in range(len(train_strings)):
            b_str = train_strings[j]
            dists[j] = lev.distance(a_str, b_str)
        # finding the top 3
        top3 = dists.argsort()[:3]
        for ind in top3:
            prediction[i][self.column_index[train_labels[ind]]] += 1.0 / 3
    return prediction
def gitignores(*args):
    to_send = []
    gitignore_list = list()
    for arg in set(args):
        if arg in gitignore_list:
            to_send.append(arg)
        elif __name__ == '__main__':
            possibles = []
            for gitignore in gitignore_list:
                if Levenshtein.distance(gitignore, arg) == 1:
                    possibles.append(gitignore)
            print('WARNING: {} is not in gitignore list.'.format(arg), file=sys.stderr, end='')
            if possibles:
                if len(possibles) == 1:
                    possible_string = possibles[0]
                else:
                    possible_string = ', '.join(possibles[:-1]) + ' or ' + possibles[-1]
                print(' Did you mean {}?'.format(possible_string), file=sys.stderr)
            else:
                print('', file=sys.stderr)
    if not to_send:
        return '\n'
    text = _get_text_from_url('{}/{}'.format(API_URL, ','.join(to_send)))
    return '\n'.join(text.split('\n')[2:])
def prune_useless_elements(path_root):
    to_remove = []
    for c in path_root.children:
        for useless in USELESS_KEYWORDS:
            if c.id and (distance(c.id, useless) <= MAX_DISTANCE or useless in c.id):
                #print('Removing {0} because of id {1}'.format(c, useless))
                to_remove.append(c)
            if c.cls:
                for cl in c.cls:
                    if distance(cl, useless) <= MAX_DISTANCE or useless in cl:
                        #print('Removing {0} because of class name {1}'.format(c, cl))
                        to_remove.append(c)
    path_root.children = [c for c in path_root.children if c not in to_remove]
    for c in path_root.children:
        prune_useless_elements(c)
def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    similarDomain = ""
    minDistance = sys.maxint
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxint)
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    for popularDomain in DomainDict:
        distance = Levenshtein.distance(domain2LD.decode('utf-8'), popularDomain.decode('utf-8'))
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain
    # debug
    # sys.stdout.write("subdomain: %s, similarDomain: %s, minDistance: %d\n" % (subdomain, similarDomain, minDistance))
    if len(similarDomain) > 0:
        return (similarDomain, minDistance / float(len(similarDomain)))
    else:
        return (domain2LD, 0)

# check whether a domain contains invalid TLD
def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    s1 = s1.replace(' ', '')
    s2 = s2.replace(' ', '')
    b = set(s1.split('<space>') + s2.split('<space>'))
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings); split on the '<space>' token, since literal spaces were
    # removed above
    w1 = [chr(word2char[w]) for w in s1.split('<space>')]
    w2 = [chr(word2char[w]) for w in s2.split('<space>')]

    return Lev.distance(''.join(w1), ''.join(w2))
def distance(self):
    if not self._distance:
        self._distance = distance(self._str1, self._str2)
    return self._distance
def levenshtein(self, msg, args):
    """Calculate levenshtein distance between two words"""
    if len(args) == 2:
        result = "Levenshtein distance: " + str(pylev.distance(args[0], args[1]))
    else:
        result = "Two words are needed to calculate Levenshtein distance"
    return result
def filter_hits_by_distance(hits, source_text, min_similarity=DEFAULT_MIN_SIMILARITY):
    """Returns ES `hits` filtered according to their Levenshtein distance
    to the `source_text`.

    Any hits with a similarity value (0..1) lower than `min_similarity` will
    be discarded. It's assumed that `hits` is already sorted from higher to
    lower score.
    """
    if min_similarity <= 0 or min_similarity >= 1:
        min_similarity = DEFAULT_MIN_SIMILARITY

    filtered_hits = []
    for hit in hits:
        hit_source_text = hit['_source']['source']
        distance = Levenshtein.distance(source_text, hit_source_text)
        similarity = (
            1 - distance / float(max(len(source_text), len(hit_source_text)))
        )
        logger.debug(
            'Similarity: %.2f (distance: %d)\nOriginal:\t%s\nComparing with:\t%s',
            similarity, distance, source_text, hit_source_text
        )
        if similarity < min_similarity:
            break
        filtered_hits.append(hit)
    return filtered_hits
def fuzzy_match(self, locale, condition_name):
    condition_name = self.normalize_input(condition_name)
    conditions_candidates = self.get_condition_candidates(locale, condition_name)
    sorted_candidates = sorted(conditions_candidates.items(),
                               cmp=lambda x, y: Levenshtein.distance(condition_name, x[1]) -
                                                Levenshtein.distance(condition_name, y[1]))
    return sorted_candidates[0][0]
def get_condition_candidates(self, locale, condition_name):
    return {condition: min(mappings[condition][locale], key=lambda s: Levenshtein.distance(condition_name, s))
            for condition in list(SnipsWeatherConditions)}
def match_something(item, list):
    item = item.replace(" ", "")
    item = item.replace(".", "")
    item = item.replace(",", "")
    lowest = list[0]
    lowestdelta = Levenshtein.distance(item, list[0])
    for entry in list:
        delta = Levenshtein.distance(item, entry)
        if delta < lowestdelta:
            lowestdelta = delta
            lowest = entry
        print(delta, item, entry)
    return lowest
def cer(self, s1, s2):
    """
    Computes the Character Error Rate, defined as the edit distance.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    return Lev.distance(s1, s2)
def compare_strings_concat_levenshtein(sample, ref):
    """
    Concatenates all strings from `sample` into one, and all strings from
    `ref` into another. They are then compared by their Levenshtein distance.
    This results in a fuzzy comparison: it detects changes within strings and
    within the list of strings.
    """
    if hasattr(ref, 'strs') and ref.strs is not None:
        i = 0
        ratios = 0
        for section in ref.strs:
            if section not in sample.strs:
                continue
            strs_a_concat = ''.join(sample.strs[section])
            strs_b_concat = ''.join(ref.strs[section])
            if len(strs_a_concat) == 0 or len(strs_b_concat) == 0:
                continue
            # Similarity measurement from
            # Gheorghescu, M. (2005). An Automated Virus Classification System.
            # Virus Bulletin Conference, (October), 294-300.
            # (although they use it on a list of basic blocks instead of a
            # character string)
            ratio_sec = 1 - (Levenshtein.distance(strs_a_concat, strs_b_concat) /
                             float(max(len(strs_a_concat), len(strs_b_concat))))
            ratios += ratio_sec
            i += 1
        ratio = ratios / i if i > 0 else 0.0
    else:
        ratio = 0.0
    return (ratio * 100, ref.name, ref.version)
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist) /
                     float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0
    return (ratio * 100, ref.name, ref.version)
def setup_argparser(parser):
    parser.add_argument('-m', '--patternmodel', type=str, help="Pattern model of a background corpus (training data; Colibri Core unindexed patternmodel)", action='store', required=True)
    parser.add_argument('-l', '--lexicon', type=str, help="Lexicon file (training data; plain text, one word per line)", action='store', required=False)
    parser.add_argument('-L', '--lm', type=str, help="Language model file in ARPA format", action='store', required=False)
    parser.add_argument('-c', '--classfile', type=str, help="Class file of background corpus", action='store', required=True)
    parser.add_argument('-k', '--neighbours', '--neighbors', type=int, help="Maximum number of anagram distances to consider (the actual amount of anagrams is likely higher)", action='store', default=3, required=False)
    parser.add_argument('-K', '--candidates', type=int, help="Maximum number of candidates to consider per input token/pattern", action='store', default=100, required=False)
    parser.add_argument('-n', '--topn', type=int, help="Maximum number of candidates to return", action='store', default=10, required=False)
    parser.add_argument('-N', '--ngrams', type=int, help="N-grams to consider (max value of n). Ensure that your background corpus is trained for at least the same length for this to have any effect!", action='store', default=3, required=False)
    parser.add_argument('-D', '--maxld', type=int, help="Maximum Levenshtein distance", action='store', default=5, required=False)
    parser.add_argument('-M', '--maxvd', type=int, help="Maximum vector distance", action='store', default=5, required=False)
    parser.add_argument('-t', '--minfreq', type=int, help="Minimum frequency threshold (occurrence count) in background corpus", action='store', default=1, required=False)
    parser.add_argument('-a', '--alphafreq', type=int, help="Minimum alphabet frequency threshold (occurrence count); characters occurring less are not considered in the anagram vectors", action='store', default=10, required=False)
    parser.add_argument('-b', '--beamsize', type=int, help="Beamsize for the decoder", action='store', default=100, required=False)
    parser.add_argument('--maxdeleteratio', type=float, help="Do not allow a word to lose more than this fraction of its letters", action='store', default=0.34, required=False)
    parser.add_argument('--lexfreq', type=int, help="Artificial frequency (occurrence count) for items in the lexicon that are not in the background corpus", action='store', default=1, required=False)
    parser.add_argument('--ldweight', type=float, help="Levenshtein distance weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--vdweight', type=float, help="Vector distance weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--freqweight', type=float, help="Frequency weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--lexweight', type=float, help="Lexicon distance weight for candidate ranking", action='store', default=1, required=False)
    parser.add_argument('--lmweight', type=float, help="Language Model weight for Language Model selection (together with --correctionweight)", action='store', default=1, required=False)
    parser.add_argument('--correctionweight', type=float, help="Correction Model weight for Language Model selection (together with --lmweight)", action='store', default=1, required=False)
    parser.add_argument('--correctscore', type=float, help="The score a word must reach to be marked correct prior to decoding", action='store', default=0.60, required=False)
    parser.add_argument('--correctfreq', type=float, help="The frequency a word must have for it to be marked correct prior to decoding", action='store', default=200, required=False)
    parser.add_argument('--punctweight', type=int, help="Punctuation character weight for anagram vector representation", action='store', default=1, required=False)
    parser.add_argument('--unkweight', type=int, help="Unknown character weight for anagram vector representation", action='store', default=1, required=False)
    parser.add_argument('--ngramboost', type=float, help="Boost unigram candidates that are also predicted as part of larger ngrams, by the specified factor", action='store', default=0.25, required=False)
    parser.add_argument('-1', '--simpledecoder', action='store_true', help="Use only unigrams in decoding")
    parser.add_argument('--lmwin', action='store_true', help="Boost the scores of the LM selection (to 1.0) just prior to output")
    parser.add_argument('--locallm', action='store_true', help="Use a local LM to select a preferred candidate in each candidate list instead of the LM integrated in the decoder")
    parser.add_argument('--blocksize', type=int, action='store', help="Block size: determines the amount of test tokens to process in one go (dimensions of the anavec test matrix), setting this helps reduce memory at the cost of speed (0 = unlimited)", default=1000)
    parser.add_argument('--report', action='store_true', help="Output a full report")
    parser.add_argument('--json', action='store_true', help="Output JSON")
    parser.add_argument('--tok', action='store_true', help="Input is already tokenized")
    parser.add_argument('--noout', dest='output', action='store_false', help="Do not output")
    parser.add_argument('-d', '--debug', action='store_true')
def PopulateAmCacheTemporalCollaterals(fileName, sqlTweak, DB, collateralDBTableName, reconWindow=3):
    countHostsProcessed = 0
    # Process each occurrence of the FileName
    if sqlTweak == "":
        data = DB.Query("SELECT RowID, HostID, FileName, FirstRun from Entries WHERE EntryType = %s AND FileName = '%s'" % (settings.__AMCACHE__, fileName))
    else:
        data = DB.Query("SELECT RowID, HostID, FileName, FirstRun from Entries_FilePaths WHERE EntryType = %s AND FileName = '%s' AND %s" % (settings.__AMCACHE__, fileName, sqlTweak))
    rowList = []
    countRowsToProcess = len(data)
    countRowsProcessed = 0
    # Executed before
    for row in data:
        rowID = row[0]
        hostID = row[1]
        fileName = row[2]
        firstRun = row[3]
        # Insert entry into DB
        DB.Execute("INSERT INTO " + collateralDBTableName + " VALUES (NULL,%s, 0, 0, 0, 0)" % (rowID))
        # Check recon window
        countRowsProcessed += 1
        update_progress(float(countRowsProcessed) / float(countRowsToProcess), fileName)
        minFirstRun = firstRun - datetime.timedelta(0, 60 * reconWindow)
        maxFirstRun = firstRun + datetime.timedelta(0, 60 * reconWindow)
        reconEntries = DB.Query("SELECT RowID, HostID, FileName, FirstRun FROM Entries WHERE EntryType = %s AND (FirstRun >= '%s' AND FirstRun <= '%s')" % (settings.__AMCACHE__, minFirstRun, maxFirstRun))
        # Filter out incorrect correlations when RowID jumps from one host to the next
        # Weight correlation value according to temporal execution distance
        for entry in reconEntries:
            if entry[1] == hostID and entry[2] != fileName:
                weight = (1.0 / (math.pow(abs(rowID - entry[0]), 2)) * 10)
                if entry[3] < firstRun:
                    rowList.append(tuple((int(entry[0]), 1, 0, weight)))
                else:
                    rowList.append(tuple((int(entry[0]), 0, 1, weight)))
    DB.ExecuteMany("INSERT INTO " + collateralDBTableName + " VALUES (NULL,?, ?, ?, ?, 0)", rowList)
def distance_to(self, other):
    '''
    Length-adjusted Levenshtein "distance" to other OTU

    other: OTU
      distance to this OTU

    returns: float
    '''
    return Levenshtein.distance(self.sequence, other.sequence) / (0.5 * (len(self.sequence) + len(other.sequence)))
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
    '''
    seq_table: pandas.DataFrame
      Samples on the columns; sequences on the rows
    records: index of Bio.Seq
      Indexed, unaligned input sequences. This could come from BioPython's
      SeqIO.to_dict or SeqIO.index.
    max_dist: float
      genetic distance cutoff above which a sequence will not be merged into an OTU
    min_fold: float
      Multiply the sequence's abundance by this fold to get the minimum abundance
      of an OTU for merging
    threshold_pval: float
      P-value below which a sequence will not be merged into an OTU
    log: filehandle
      Log file reporting the abundance, genetic, and distribution checks.
    '''
    self.seq_table = seq_table
    self.records = records
    self.max_dist = max_dist
    self.min_fold = min_fold
    self.threshold_pval = threshold_pval
    self.log = log

    # get a list of the names of the sequences in order of their (decreasing) abundance
    self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

    # check that all sequence IDs in the table are in the fasta
    missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
    if len(missing_ids) > 0:
        raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

    # initialize OTU information
    self.membership = {}
    self.otus = []
def ga_matches(self, candidate):
    '''
    OTUs that meet the genetic and abundance criteria

    candidate: OTU
      sequence to evaluate
    '''
    # find abundance matches
    min_abundance = self.min_fold * candidate.abundance
    abundance_matches = [otu for otu in self.otus if otu.abundance > min_abundance]

    if self.log is not None:
        print(candidate.name, 'abundance_check', *[otu.name for otu in abundance_matches], sep='\t', file=self.log)

    if len(abundance_matches) == 0:
        return []
    else:
        # find genetic matches (in order of increasing genetic distance)
        matches_distances = [(otu.distance_to(candidate), otu) for otu in abundance_matches]
        matches_distances.sort(key=lambda x: (x[0], -x[1].abundance, x[1].name))
        matches = [otu for dist, otu in matches_distances if dist < self.max_dist]

        if self.log is not None:
            print(candidate.name, 'genetic_check', *[otu.name for otu in matches], sep='\t', file=self.log)

        return matches
def call_otus(seq_table_fh, fasta_fh, output_fh, dist_crit, abund_crit, pval_crit, log=None, membership=None):
    '''
    Read in input files, call OTUs, and return output.

    seq_table_fh: filehandle
      sequence count table
    fasta_fh: filehandle or filename
      sequences fasta
    output_fh: filehandle
      place to write main output OTU table
    dist_crit, abund_crit, pval_crit: float
      threshold values for distance, abundance, and pvalue
    log, membership: filehandles
      places to write supplementary output
    '''
    # read in the sequences table
    seq_table = read_sequence_table(seq_table_fh)

    # set up the input fasta records
    records = SeqIO.index(fasta_fh, 'fasta')

    # generate the caller object
    caller = DBCaller(seq_table, records, dist_crit, abund_crit, pval_crit, log)
    caller.generate_otu_table()

    caller.write_otu_table(output_fh)

    if membership is not None:
        caller.write_membership(membership)
def compute_cer(str_pred, str_true, normalize=True):
    """Compute Character Error Rate.
    Args:
        str_pred (string): a sentence without spaces
        str_true (string): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of str_true
    Returns:
        cer (float): Character Error Rate between str_true and str_pred
    """
    cer = lev.distance(str_pred, str_true)
    if normalize:
        cer /= len(list(str_true))
    return cer
def _execute(self, str1, str2):
    LDAlgorithm._execute(self, str1, str2)
    return levenshtein_distance(str1, str2)
def get_ratio(old, new):
    """Return a "similarity ratio" (in percent) representing the similarity
    between the two strings, where 0 means identical and larger values mean
    less similar.
    """
    if not all([old, new]):
        return VERSIONING_RATIO
    if IS_SPEEDUP:
        return Levenshtein.distance(old, new) / (len(old) / 100.0)
    else:
        return levenshtein_distance(old, new) / (len(old) / 100.0)
def testDefaultParseValueFuzz(self, value):
    try:
        result = parser.DefaultParseValue(value)
    except TypeError:
        # It's OK to get a TypeError if the string has the null character.
        if u'\x00' in value:
            return
        raise
    except MemoryError:
        if len(value) > 100:
            # This is not what we're testing.
            return
        raise

    try:
        uvalue = unicode(value)
        uresult = unicode(result)
    except UnicodeDecodeError:
        # This is not what we're testing.
        return

    # Check that the parsed value doesn't differ too much from the input.
    distance = Levenshtein.distance(uresult, uvalue)
    max_distance = (
        2 +  # Quotes or parenthesis can be implicit.
        sum(c.isspace() for c in value) +
        value.count('"') + value.count("'") +
        3 * (value.count(',') + 1) +  # 'a,' can expand to "'a', "
        3 * (value.count(':')) +  # 'a:' can expand to "'a': "
        2 * value.count('\\'))
    if '#' in value:
        max_distance += len(value) - value.index('#')
    if not isinstance(result, six.string_types):
        max_distance += value.count('0')  # Leading 0s are stripped.

    # Note: We don't check distance for dicts since item order can be changed.
    if '{' not in value:
        self.assertLessEqual(distance, max_distance,
                             (distance, max_distance, uvalue, uresult))
def closest_token(stemmed_token_lst, merchant_info):
    score = 0
    merchant_tokens = merchant_info.split()  # only split (no stemming) is applied to merchant_info here
    for t in stemmed_token_lst:
        min_dist = sys.maxint
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
        score += min_dist
    return score
def closest_token(stemmed_token_lst, merchant_info):
    score = 0
    merchant_tokens = [stemmer.stem(m) for m in merchant_info.split()]  # stem merchant tokens here
    for t in stemmed_token_lst:
        min_dist = sys.maxint
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
        score += min_dist
    return score
def closest_token(stemmed_token_lst, merchant_info):
    min_dist = sys.maxint  # only use the min_dist for all as the score
    merchant_tokens = [stemmer.stem(m) for m in merchant_info.split()]  # stem merchant tokens here
    for t in stemmed_token_lst:
        for m in merchant_tokens:
            tmp_dist = distance(t, m)
            if min_dist > tmp_dist:
                min_dist = tmp_dist
    return min_dist
def hamming_distance(string1, string2):
    """
    Computes the Hamming distance between two strings.

    The Hamming distance between two strings of equal length is the number of
    positions at which the corresponding symbols are different. Put another
    way, it measures the minimum number of substitutions required to change
    one string into the other, or the minimum number of errors that could have
    transformed one string into the other.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Hamming distance (int)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.
        ValueError : If the input strings are not of same length

    Examples:
        >>> hamming_distance('', '')
        0
        >>> hamming_distance('alex', 'john')
        4
        >>> hamming_distance(' ', 'a')
        1
        >>> hamming_distance('JOHN', 'john')
        4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # for Hamming Distance string length should be same
    utils.sim_check_for_same_len(string1, string2)

    # sum all the mismatch characters at the corresponding index of
    # input strings
    return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string
    into the other. Transforming a string is carried out using a sequence of
    the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6

    Note:
        This implementation internally uses python-levenshtein package to
        compute the Levenshtein distance
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
def get_edit_distance(str1, str2):
    return Levenshtein.distance(str1, str2)
def __get_fuzzy_string_dict(index=0, current_text='', return_text='', uid=0):
    """
    Returns dictionary with index, distance, text and statement_uid as keys
    :param index: int
    :param current_text: string
    :param return_text: string
    :param uid: int
    :return: dict()
    """
    return {'index': index,
            'distance': get_distance(current_text.lower(), return_text.lower()),
            'text': return_text,
            'statement_uid': uid}