We extracted the following 43 code examples from open-source Python projects to illustrate how to use Levenshtein.ratio().
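For orientation before the project snippets, here is a minimal sketch of the call itself. Levenshtein.ratio(a, b) from the python-Levenshtein package returns a normalized similarity score between 0.0 and 1.0, where 1.0 means the two strings are identical. The example strings below are made up for illustration and do not come from any of the projects listed here.

import Levenshtein

# Identical strings always score 1.0.
print(Levenshtein.ratio("apple", "apple"))   # 1.0

# A small edit keeps the score high but below 1.0.
print(Levenshtein.ratio("apple", "appel"))

# Very different strings score close to 0.0.
print(Levenshtein.ratio("apple", "banana"))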
def match_elements(self, text1, text2):
    """
    utility function to match two strings,
    makes use of match config initiated in __init__
    returns the output as confidence score of flexible match
    """
    conf = 0
    if self.m_config['exact']:
        if text1 == text2:
            conf += 1
    if self.m_config['levenshtein']:
        conf += ratio(text1, text2)
    if self.m_config['soundex']:
        if soundex(text1) == soundex(text2):
            conf += 1
    if self.m_config['nysiis']:
        if fuzzy.nysiis(text1) == fuzzy.nysiis(text2):
            conf += 1
    return conf
def getVec(kb, id1, id2):
    if kb == 'bh':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('hudong', id2)
    if kb == 'bw':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
    if kb == 'hw':
        title1, context1, category1 = getMsgbyId('hudong', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
    title_r = Levenshtein.ratio(title1, title2)
    context_r = cosine(context1, context2)
    category_r = sameCategory(category1, category2)
    return (title_r, context_r, category_r, 0.0)
def frame_similarity(frame1, frame2):
    similarity = 1
    if 'Type' in frame1:
        if frame1['Type'] != frame2['Type']:
            similarity = 0.0
    if similarity == 1:
        if 'PlaceMention' in frame1:
            # if PlaceMention is normalized use simple string comparison
            if not Levenshtein_arg:
                if frame1['PlaceMention'] != frame2['PlaceMention']:
                    similarity = 0.0
            else:
                # PlaceMention is not normalized so use Levenshtein distance
                similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
    #print("similarity: ", similarity)
    return similarity

# evaluate at the document level -----------------------------------------------
def get_message_change_ratio(status_update):
    """Expects a status update instance, returns a number representing
    how much a message has been edited (1.0 completely changed,
    0.0 unchanged) based on Levenshtein ratio.

    If a status update has no associated notification, returns None
    https://github.com/ztane/python-Levenshtein
    """
    if hasattr(status_update, 'notification'):
        author_profile = status_update.author.profile
        intro_text = get_notification_intro(author_profile) + '\n\n'
        return 1.0 - Levenshtein.ratio(
            *[message.replace(intro_text, '') for message in (
                status_update.notification.base_message,
                status_update.notification.sent_message)])
    else:
        return None
def getCandidatesForLemma(lemma, min_size, max_size):
    hits = []
    for match in ["phrase", "conjunct"]:
        url = ("http://lotus.lodlaundromat.org/retrieve?size=" + str(max_size) +
               "&match=" + match + "&rank=psf&noblank=true&" +
               urllib.parse.urlencode({"string": lemma,
                                       "predicate": "label",
                                       "subject": "\"http://dbpedia.org/resource\""}))
        r = requests.get(url=url)
        content = r.json()
        these_hits = content["hits"]
        hits = hits + these_hits
        if content["numhits"] >= min_size or len(lemma.split(' ')) == 1:
            break
    subjects = {}
    for hit in hits:
        lev_sim = Levenshtein.ratio(hit["string"].lower(), lemma.lower())
        if "Disambiguation" not in hit["subject"].lower() and "Category" not in hit["subject"]:
            if hit["subject"] not in subjects:
                #subjects[hit["subject"]]=hit["length"]*len(lemma.split())
                subjects[hit["subject"]] = {"ss": lev_sim, "count": 1}
            else:
                subjects[hit["subject"]]["ss"] = max(subjects[hit["subject"]]["ss"], lev_sim)
                subjects[hit["subject"]]["count"] += 1
    return subjects
def mostCommon(spoken, lst, threshold):
    highestCountItem = max(lst, key=lst.count)
    highestCount = lst.count(highestCountItem)
    contenders = []
    for item in lst:
        if (lst.count(item) == highestCount) and (item not in contenders):
            contenders.append(item)
    if len(contenders) > 1:
        print "\nContending"
        bestMatch = [None, 0]
        for ayah in contenders:
            score = ratio(spoken, ayah)
            print ayah
            print score
            if score > threshold and score > bestMatch[1]:
                bestMatch = [ayah, score]
        return bestMatch[0]
    elif ratio(spoken, highestCountItem) > threshold:
        return highestCountItem
    else:
        return None

# Takes in a query and list of matches
# Returns the match with the highest similarity to the query
def print_matched_groups(extracted_combo_lst):
    dst_dct = {}
    for itm in extracted_combo_lst:
        dst_dct.setdefault(itm, [])
        if len(extracted_combo_lst) == 1:
            break
        match_dct = {}
        for i in range(len(extracted_combo_lst)):
            if extracted_combo_lst[i] == itm:
                continue
            dst = Levenshtein.ratio(itm, extracted_combo_lst[i])
            match_dct[extracted_combo_lst[i]] = dst
        sorted_match_lst = sorted(match_dct.items(), key=operator.itemgetter(1), reverse=True)
        top_n = 2
        dst_dct[itm] = [e[0] for e in sorted_match_lst[0:top_n]]
        extracted_combo_lst.remove(itm)
        for e in dst_dct[itm]:
            extracted_combo_lst.remove(e)
    for k, v in dst_dct.items():
        print k, v
        print
def should_run(self):
    data = self.item_options.get('compare_url')
    if data:
        if isinstance(data, Dict):
            self.fuzzy = data.get('fuzzy', 1.0)
            self.url2 = data.get('url')
            if not self.url2:
                logger.debug('compare_url must contain a url')
                return False
        else:
            logger.debug('compare_url must be a nested dictionary containing url and ratio properties')
            return False
        return True
    return False
def ratio(self):
    if not self._ratio:
        self._ratio = ratio(self._str1, self._str2)
    return self._ratio
def quick_ratio(self):
    # This is usually quick enough :o)
    if not self._ratio:
        self._ratio = ratio(self._str1, self._str2)
    return self._ratio
def getCandidates(self, mention, threshold=0.7):
    res = []
    # candidates from the title table
    for id, title, link_count in self.db_titles:
        m_score = Levenshtein.ratio(title, mention)
        if m_score > threshold:
            self.cur.execute("select abstract from abstract where id = %s" % id)
            context = self.cur.fetchall()
            if context != ():
                context = json.loads(context[0][0])
            RE = []
            self.cur.execute("select to_id from link where from_id = %s" % id)
            linkto_ids = self.cur.fetchall()
            if linkto_ids != ():
                for to_id in linkto_ids:
                    RE.append(to_id[0])
            res.append(Entity(title, id, m_score, context, link_count, RE))
    # candidates from the disambiguation table
    for id, title, dis_context, link_count in self.db_disambiguations:
        m_score = Levenshtein.ratio(title, mention)
        if m_score > threshold:
            title += '[%s]' % dis_context
            self.cur.execute("select abstract from abstract where id = %s" % id)
            context = self.cur.fetchall()
            if context != ():
                context = json.loads(context[0][0])
                context.append(dis_context)
            RE = []
            self.cur.execute("select to_id from link where from_id = %s" % id)
            linkto_ids = self.cur.fetchall()
            if linkto_ids != ():
                for to_id in linkto_ids:
                    RE.append(to_id[0])
            res.append(Entity(title, id, m_score, context, link_count, RE))
    return res
def frame_similarity(frame1, frame2):
    similarity = 1
    if 'Type' in frame1:
        if frame1['Type'] != frame2['Type']:
            similarity = 0
    if similarity == 1:
        if 'PlaceMention' in frame1:
            similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
    return similarity

# evaluate at the document level -----------------------------------------------
def get_name_similarity_ratio(a, b):
    names = (get_full_lowercase_name(sub) for sub in (a, b))
    return Levenshtein.ratio(*names)
def interesting_party(*a, **k):
    while True:
        while True:
            phrase = get_name()
            if len(phrase) < 100:
                break
        steps = party(phrase, *a, **k)
        result = steps[-1][-1]
        if ratio(phrase.lower(), result.lower()) < 0.7:
            return steps
def play(self, guess):
    return ratio(normalise(guess), normalise(self.original))
def moreLocalCandidates(m, previous, candidates):
    for pm, pl in previous.items():
        if is_abbrev(m, pm):
            for prevLink in previous[pm]:
                prevLinkDB = utils.makeDbpedia(prevLink)
                candidates.append(tuple([prevLinkDB, {"ss": 1.0, "count": 0.0}]))
        elif isEnoughSubset(m, pm):
            for prevLink in previous[pm]:
                prevLinkDB = utils.makeDbpedia(prevLink)
                candidates.append(tuple([prevLinkDB, {"ss": Levenshtein.ratio(m.lower(), pm.lower()), "count": 0.0}]))
    return candidates
def bestLevMatch(spoken, lst):
    print " "
    bestMatch = [None, 0.65]
    for ayah in lst:
        score = ratio(spoken, ayah)
        print ayah
        print score
        if score > bestMatch[1]:
            bestMatch = [ayah, score]
    return bestMatch[0]

# Takes in an ayah object from alfanous
# Returns a cleaned-up ayah object
def checkForWordInQuran(value):
    wordMatch = dbGet(models.QuranWord, value)
    if wordMatch:
        return wordMatch.text
    else:
        # The original word is not in the Quran so we try alfanous' suggestions
        wordSuggestionList = []
        wordSuggestions = alfanous.do({"action": "suggest", "query": value})["suggest"]
        for word in wordSuggestions:
            for suggestion in wordSuggestions[word]:
                wordMatch = dbGet(models.QuranWord, value)
                if wordMatch:
                    wordSuggestionList.append(wordMatch.text)
        if len(wordSuggestionList) > 1:
            topRatioValue = 0
            topSuggestion = ""
            while len(wordSuggestionList) > 0:
                suggestion = wordSuggestionList.pop(0)
                suggestionRatio = ratio(value, suggestion)
                if suggestionRatio > topRatioValue:
                    topRatioValue = suggestionRatio
                    topSuggestion = suggestion
            return topSuggestion
        elif len(wordSuggestionList) == 1:
            return wordSuggestionList[0]
        else:
            return None

# Takes in a query and checks if any part of it is in the Quran
# Return the part in the Quran if one is found, otherwise it returns None
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Compute the similarity ratio between two strings.
    If the ratio exceeds the threshold, return it; otherwise, return 0.

    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio if ratio > threshold else 0.

################################
# NERValueGenerator
def getLevenshteinDistance(item, keyword):
    item = item.lower().replace(' ', '').replace('-', '').replace('_', '')
    keyword = keyword.lower().replace(' ', '').replace('-', '').replace('_', '')
    return Levenshtein.ratio(item, keyword)
def generateStemmingDict(inputPath='stemmer.txt', outputPath='stemmingDict'):
    inputEncoding = 'utf8'
    outputEncoding = 'utf8'
    distance = Levenshtein.ratio
    fi = open(inputPath, 'r', encoding=inputEncoding)
    fo = open(outputPath, 'w', encoding=outputEncoding)
    stemmingDict = {}
    for line in fi:
        if line.strip() == '':
            continue
        tmpList = line.strip().split(' => ')
        for word in tmpList[0].split(', '):
            if word not in stemmingDict:
                stemmingDict[word] = set()
            stemmingDict[word].add(tmpList[1])
    for key in stemmingDict:
        stemmingDict[key] = list(stemmingDict[key])
        for i in range(len(stemmingDict[key])):
            stemmingDict[key][i] = [stemmingDict[key][i], distance(stemmingDict[key][i], key)]
    json.dump(stemmingDict, fo)
    fi.close()
    fo.close()
    fotxt = open(outputPath + '.txt', 'w', encoding=outputEncoding)
    for key in stemmingDict:
        fotxt.write(key + ' ' + str(stemmingDict[key]) + '\n')
    fotxt.close()
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis=1)
    print('nones')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_nones')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'], 3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x: skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x: skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x: kurtosis(x))
    del df_features['question1_w2v']
    del df_features['question2_w2v']
    print('all done')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features.fillna(0.0)
    return df_features
def get_features(df_features):
    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_w2v'] = df_features.apply(lambda x: w2v_cos_sim(x['question1'], x['question2']), axis=1)
    return df_features
def get_registrar(
    cls,
    subject,
):
    cls.check_and_update_registrars()
    edited_subject = re.sub(
        pattern='[^\d\w]',
        repl='',
        string=subject,
    )
    edited_subject = edited_subject.lower()
    for registrar in cls.registrars:
        if edited_subject in registrar['edited'].lower():
            return registrar['original']
    most_close_registrar = ''
    most_close_registrar_distance_ratio = 0
    for registrar in cls.registrars:
        registrar_distance_ratio = Levenshtein.ratio(
            edited_subject,
            registrar['edited'],
        )
        if registrar_distance_ratio > most_close_registrar_distance_ratio:
            most_close_registrar = registrar['original']
            most_close_registrar_distance_ratio = registrar_distance_ratio
    return most_close_registrar
def adjective_fuzzy_matching(token, adjectives, match):
    """
    Given a token and a list of terms to match, returns True if the stem
    of the token matches any of the items in the list.

    Input:
        token: Token object to match
        adjectives: list of items to match the Token
        match: minimum ratio (0-100) for matching
    """
    for adjective in adjectives:
        if Levenshtein.ratio(str(token.stem), str(adjective)) >= match:
            return True
    return False
def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in
    the transactions. For clustering, the similarity ratio as given by
    fuzzywuzzy.ratio is used as the distance measure

    Input:
        transactions: list of tuples representing items on each transaction
        match: minimum similarity ratio (0 to 100) for clustering

    Output:
        transactions: new version of the transactions, where each item has
            been replaced by the first item on its corresponding cluster
        word_clusters: dictionary that maps the cluster for each item in
            the transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)
    l = [((a, b), 100 - Levenshtein.ratio(str(a), str(b))) for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')
    clusters_index = fcluster(r, 100 - match, "distance")
    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]
    word_clusters = {word: clusters[clusters_index[i]] for i, word in enumerate(words)}
    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set(([word_clusters[word][0] for word in transaction])))
        new_transactions.append(new_transaction)
    return new_transactions, word_clusters
def __init__(self, match=90, key=lambda x: x.string.lower()):
    """
    Fuzzy matching between the given token and term objects.
    For comparison applies the function given in the "key" parameter to the
    Token/tuple of Tokens. Parameter match defines the minimum similarity
    ratio for a match when comparing.

    Input:
        match : minimum similarity for fuzzy matching (%)
        key : function to apply to the token, default=lambda x: x.string.lower()
    """
    self.match = match
    self.key = key
def __call__(self, token_tuple, terms):
    """
    Input:
        token_tuple : Token or tuple of Token objects
        terms : term or iterable of terms to match

    Output:
        Returns None if no match is found.
        Returns the first matched in case many of them show the same
        similarity ratio.
    """
    if not hasattr(terms, '__iter__'):
        terms = [terms]
    if not isinstance(token_tuple, tuple):
        token_tuple = (token_tuple,)
    try:
        token_tuple = tuple(self.key(token) for token in token_tuple)
    except Exception:  # as e
        token_tuple = tuple(str(token) for token in token_tuple)
    best_term = None
    best_ratio = 0
    for term in terms:
        ratio = max([Levenshtein.ratio(unicode(" ".join(token_tuple)),
                                       unicode(" ".join(term_i))) * 100
                     for term_i in term])
        if ratio >= self.match and ratio > best_ratio:
            best_term = term
            best_ratio = ratio
    return best_term

# ------- UTIL FUNCTIONS ------------------------------------------------------
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
def similar_link_visited(link_url, links, fuzzy):
    for link in links:
        if ratio(link_url, link) >= fuzzy:
            # Link already accessed, return
            return True
    return False
def check(self):
    headers = self.item_options.get('headers', {})
    cookies = self.item_options.get('cookies', {})
    username = self.global_options.get('username')
    password = self.global_options.get('password')
    r2 = requests.get(self.url2, headers=headers,
                      auth=HTTPBasicAuth(username, password),
                      allow_redirects=True, cookies=cookies)
    logger.info("Comparing urls...")
    if self.fuzzy == 1.0:
        self.ok(self.response.text == r2.text,
                'Urls don\'t have equal content: {tested} and {reference}'.format(
                    tested=self.url, reference=self.url2))
    else:
        actual_ratio = ratio(self.response.text, r2.text)
        self.ok(actual_ratio > self.fuzzy,
                """
                Urls don't have sufficiently similar content:
                {tested} and {reference} (expected {expected}, got {actual})
                """.format(
                    tested=self.url, reference=self.url2,
                    expected=self.fuzzy, actual=actual_ratio))
    return self.is_ok()
def set_levenshtein(self):
    '''
    Mean and max Levenshtein ratio for all labels.
    '''
    if not [f for f in self.features if f.startswith('match_str_lsr')]:
        return

    ne = self.cluster.entities[0].norm

    # Pref label
    l = self.document.get('pref_label')
    self.match_str_lsr_pref = Levenshtein.ratio(ne, l)

    # Wikidata alt labels
    if self.document.get('wd_alt_label'):
        wd_labels = self.document.get('wd_alt_label')
        ratios = [Levenshtein.ratio(ne, l) for l in wd_labels]
        self.match_str_lsr_wd_max = max(ratios) - 0.5
        self.match_str_lsr_wd_mean = (sum(ratios) / float(len(wd_labels))) - 0.375
    else:
        wd_labels = []

    # Any other alt labels
    if self.document.get('alt_label'):
        labels = self.document.get('alt_label')
        labels = [l for l in labels if l not in wd_labels]
        if labels:
            ratios = [Levenshtein.ratio(ne, l) for l in labels]
            self.match_str_lsr_alt_max = max(ratios) - 0.5
            self.match_str_lsr_alt_mean = (sum(ratios) / float(len(labels))) - 0.375
def appendWordNetStemmingDict(inputPath='stemmingDict.old', outputPath='stemmingDict', outputEncoding='utf8'):
    oldDict = json.load(open(inputPath, 'r', encoding='utf8'))
    distance = Levenshtein.ratio
    fi = open('wordnet.map', 'r', encoding='utf8')
    fo = open(outputPath, 'w', encoding='utf8')
    for m in list(oldDict):
        tmp = set()
        for l in list(oldDict[m]):
            tmp.add(l[0])
        oldDict[m] = set(tmp)
    for line in fi:
        m = line.strip().split(' ')
        if len(m) == 0:
            continue
        if m[0] not in oldDict:
            oldDict[m[0]] = set()
        oldDict[m[0]].add(m[1])
    for m in list(oldDict):
        oldDict[m] = list(oldDict[m])
        for i in range(len(oldDict[m])):
            if type(oldDict[m][i]) != str or type(m) != str:
                print(oldDict[m])
                input()
                continue
            oldDict[m][i] = [oldDict[m][i], distance(oldDict[m][i], m)]
    json.dump(oldDict, fo)
    fotxt = open(outputPath + '.txt', 'w', encoding=outputEncoding)
    for key in oldDict:
        fotxt.write(key + ' ' + str(oldDict[key]) + '\n')
    fotxt.close()

##
##print('Dumping stemming mapping to json format......')
##generateStemmingDict()
##appendWordNetStemmingDict()
##print('Done!')
def calScoreSub(self, countCharDict):
    distance = Levenshtein.ratio
    q = self.qRaw
    scoreSub = 0
    sub = ''
    if type(self.sub) == str:
        sub = self.sub
        subSplit = sub.split(' ')
        if sub in q:
            for w in subSplit:
                if w in countCharDict:
                    scoreSub += 1 / (countCharDict[w] + 1)
                else:
                    scoreSub += 1
        else:
            subSet = set(subSplit)
            qSet = set(q.split(' '))
            for w in (subSet & qSet):
                if w in countCharDict:
                    scoreSub += 1 / (countCharDict[w] + 1)
                else:
                    scoreSub += 1
            if len(subSet) != 0:
                scoreSub = scoreSub / len(subSet)
    if type(self.sub) == list:
        for s in self.sub[0]:
            sub += s + ' '
        sub = sub.strip()
    if type(self.sub) == list:
        if len(self.sub[0]) == len(self.sub[1]):
            lenSub = len(self.sub[0])
            for i in range(lenSub):
                w = self.sub[0][i]
                wC = self.sub[1][i]
                if w in countCharDict:
                    scoreSub += 1 / (countCharDict[w] + 1) * distance(w, wC)
                else:
                    scoreSub += 1 * distance(w, wC)
            scoreSub = scoreSub / lenSub
        else:
            subIntersaction = set(self.sub[0]) & set(self.sub[1])
            scoreSub = len(subIntersaction) / len(set(self.sub[0]) | set(self.sub[1]))
    self.scoreSub = scoreSub
    return scoreSub
def calScorePreLast(self, countCharDict, qWithoutSubSet, stemmingDict):
    distance = Levenshtein.ratio
    pre = self.pre
    scorePre = 0
    lastPreIndex = pre.rfind('.')
    if lastPreIndex != -1:
        preLowerSet = set(re.split(r' ', pre[lastPreIndex + 1:]))
    else:
        preLowerSet = set(re.split(r' ', pre))
    preLower = list(preLowerSet)
    preLowerSet = set()
    for i in range(len(preLower)):
        if preLower[i] in stemmingDict:
            preLower[i] = stemmingDict[preLower[i]][0][0]
        preLowerSet.add(preLower[i])
    maxIntersection = qWithoutSubSet & preLowerSet
    preFactor = 0
    for char in maxIntersection:
        if char in countCharDict:
            preFactor += 1 / (countCharDict[char] + 1)
        else:
            preFactor += 1
    if len(maxIntersection) == 0:
        for w1 in qWithoutSubSet:
            for w2 in preLowerSet:
                if w1 == '' or w2 == '' or w1[0] != w2[0]:
                    continue
                div = 1
                if w1 in countCharDict:
                    div = countCharDict[w1] + 1
                dWord = distance(w1, w2) / div
                if preFactor < dWord:
                    preFactor = dWord
    if len(pre) != 0:
        scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
    else:
        scorePre = 0
    self.scorePreLast = scorePre
    return scorePre
def calScorePreAll(self, countCharDict, qWithoutSubSet, stemmingDict):
    distance = Levenshtein.ratio
    pre = self.pre
    scorePre = 0
    preLowerSet = set(re.split(r' |\.', pre))
    preLower = list(preLowerSet)
    preLowerSet = set()
    for i in range(len(preLower)):
        if preLower[i] in stemmingDict:
            preLower[i] = stemmingDict[preLower[i]][0][0]
        preLowerSet.add(preLower[i])
    maxIntersection = qWithoutSubSet & preLowerSet
    preFactor = 0
    for char in maxIntersection:
        if char in countCharDict:
            preFactor += 1 / (countCharDict[char] + 1)
        else:
            preFactor += 1
    if len(maxIntersection) == 0:
        for w1 in qWithoutSubSet:
            for w2 in preLowerSet:
                if w1 == '' or w2 == '' or w1[0] != w2[0]:
                    continue
                div = 1
                if w1 in countCharDict:
                    div = countCharDict[w1] + 1
                dWord = distance(w1, w2) / div
                if preFactor < dWord:
                    preFactor = dWord
    if len(pre) != 0:
        scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
    else:
        scorePre = 0
    self.scorePreAll = scorePre
    return scorePre
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    print('get_w2v')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features.fillna(0.0)
    return df_features
def __init__(self, stopwords=NLTKStopwords(), min_support=MIN_SUPPORT, max_words=MAX_WORDS,
             min_psupport=MIN_PSUPPORT, min_compact_support=MIN_COMPACT_SUPPORT,
             max_compact_distance=MAX_COMPACT_DISTANCE, adj_key=StemKey(),
             adj_win_size=ADJ_NEARBY_DISTANCE, match=85, compactness=True,
             redundancy=True, infrequent=True):
    """
    Model to extract aspects using the algorithm by Hu et al. (2004)

    stopwords : iterable of strings to use as stopwords
    min_support : int, minimum support of an item set
        (positive: percentage, negative: absolute number of transactions)
    min_compact_support : int, minimum number of compact sentences of an aspect
    max_words : int, maximum number of words on each aspect
    max_compact_distance : int, maximum distance between consecutive words in an aspect
    adj_win_size : int, maximum distance to look for adjectives near an aspect on a sentence
    min_psupport : int, minimum pure support of an aspect
    adj_key : lambda function to extract adjectives
    match : int, minimum similarity ratio (0-100] for matching (use <100 for fuzzy)
    compactness : boolean, True to run "compactness pruning"
    redundancy : boolean, True to run "redundancy pruning"
    infrequent : boolean, True to also extract infrequent aspects
    """
    self.params = {"stopwords": stopwords, "min_support": min_support,
                   "max_words": max_words, "min_psupport": min_psupport,
                   "min_compact_support": min_compact_support,
                   "max_compact_distance": max_compact_distance,
                   "adj_key": adj_key, "adj_win_size": adj_win_size,
                   "match": match, "compactness": compactness,
                   "redundancy": redundancy, "infrequent": infrequent}
def map_discipl(self, invalue, disctab):
    """
    Convert disciplines along B2FIND disciplinary list

    Copyright (C) 2014 Heinrich Widmann
    Licensed under AGPLv3.
    """
    retval = list()
    if type(invalue) is not list:
        inlist = re.split(r'[;&\s]\s*', invalue)
        inlist.append(invalue)
    else:
        seplist = [re.split(r"[;&]", i) for i in invalue]
        swlist = [re.findall(r"[\w']+", i) for i in invalue]
        inlist = swlist + seplist
        inlist = [item for sublist in inlist for item in sublist]
    for indisc in inlist:
        ##indisc=indisc.encode('ascii','ignore').capitalize()
        indisc = indisc.encode('utf8').replace('\n', ' ').replace('\r', ' ').strip().title()
        maxr = 0.0
        maxdisc = ''
        for line in disctab:
            try:
                disc = line[2].strip()
                r = lvs.ratio(indisc, disc)
            except Exception as e:
                logging.error('[ERROR] %s in map_discipl : %s can not compared to %s !' % (e, indisc, disc))
                continue
            if r > maxr:
                maxdisc = disc
                maxr = r
            ##HEW-T print('--- %s \n|%s|%s| %f | %f' % (line,indisc,disc,r,maxr))
        if maxr == 1 and indisc == maxdisc:
            logging.debug(' | Perfect match of %s : nothing to do' % indisc)
            retval.append(indisc.strip())
        elif maxr > 0.90:
            logging.debug(' | Similarity ratio %f is > 0.90 : replace value >>%s<< with best match --> %s' % (maxr, indisc, maxdisc))
            ##return maxdisc
            retval.append(indisc.strip())
        else:
            logging.debug(' | Similarity ratio %f is < 0.90 compare value >>%s<< and discipline >>%s<<' % (maxr, indisc, maxdisc))
            continue
    if len(retval) > 0:
        retval = list(OrderedDict.fromkeys(retval))  ## this eliminates real duplicates
        return ';'.join(retval)
    else:
        return 'Not stated'