The following code examples, extracted from open-source Python projects, illustrate how to use nltk.WordNetLemmatizer().
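Most of the snippets below assume the usual imports (import nltk or from nltk.stem import WordNetLemmatizer) and that the WordNet corpus has already been downloaded. A minimal usage sketch of the lemmatizer itself:

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # one-time corpus download

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('churches'))          # 'church' (pos defaults to 'n')
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'
print(lemmatizer.lemmatize('better', pos='a'))   # 'good'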
def LemTokens(tokens):
    lemmer = WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]
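For example, combined with nltk.word_tokenize (assuming the punkt tokenizer data is installed):

tokens = nltk.word_tokenize("the cats sat on the mats")
print(LemTokens(tokens))  # ['the', 'cat', 'sat', 'on', 'the', 'mat']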
def add_from_lemma_definitions(self, vocab, try_lower=False):
    """Add lemma definitions for non-lemmas.

    This code covers the following scenario: suppose a dictionary is
    crawled, but only for word lemmas.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    added = 0
    for word in vocab.words:
        word_list = [word, word.lower()] if try_lower else [word]
        for word_to_lemma in word_list:
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                    lemma_defs = self._data.get(lemma)
                    if lemma != word and lemma_defs:
                        # This can be quite slow. But this code will not
                        # be used very often.
                        for def_ in lemma_defs:
                            if def_ not in self._data[word]:
                                added += 1
                                self._data[word].append(def_)
            except Exception:
                logger.error("lemmatizer crashed on {}".format(word))
                logger.error(traceback.format_exc())
    logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
    self.save()
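The five tags passed to lemmatize() cover WordNet's entire part-of-speech inventory: 'n' (noun), 'v' (verb), 'a' (adjective), 's' (satellite adjective) and 'r' (adverb); trying all of them recovers the lemma whatever the word's actual part of speech.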
def crawl_lemmas(self, vocab):
    """Add WordNet lemmas as definitions."""
    lemmatizer = nltk.WordNetLemmatizer()
    for word in vocab.words:
        definitions = []
        try:
            for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                lemma = lemmatizer.lemmatize(word, part_of_speech)
                if lemma != word and [lemma] not in definitions:
                    definitions.append([lemma])
        except Exception:
            logger.error("lemmatizer crashed on {}".format(word))
        if definitions:
            self._data[word] = definitions
    self.save()
def __init__(self):
    self._wordnet = nltk.WordNetLemmatizer()
    self._cache = {}
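The constructor above only sets up state; a minimal sketch of the companion lookup method it implies (the method name lemmatize and the no-POS default are assumptions, not shown in the original project):

def lemmatize(self, word):
    # Hypothetical companion method: consult the cache first and fall
    # back to WordNet only for unseen words.
    if word not in self._cache:
        self._cache[word] = self._wordnet.lemmatize(word)
    return self._cache[word]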
def __init__(self):
    self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
    self.tokenizer = self.make_tokenizer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.sent_tokenizer = None
def lemmatize_individual_text(tokens):
    '''
    Given a list of tokens, return a list of lemmatized strings.
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    # Wrap in list() so the result is a list, as the docstring promises,
    # rather than a lazy map object under Python 3.
    return list(map(lemmatizer.lemmatize, tokens))
def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        # Map WordNet's satellite-adjective tag to the plain adjective
        # tag, which the lemmatizer expects.
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
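Usage sketch: with an explicit POS the result is deterministic; with pos=None it depends on WordNet's sense ordering for the term:

print(lemmatize_term('better', pos=wordnet.ADJ))    # 'good'
print(lemmatize_term('running', pos=wordnet.VERB))  # 'run'
print(lemmatize_term('corpora'))  # POS inferred from the first synset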
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
def __init__(self, require_unique_match, lemmatizer="word_net",
             empty_question_features=False, stop_words=None):
    self.lemmatizer = lemmatizer
    self.stop_words = stop_words
    self.empty_question_features = empty_question_features
    if lemmatizer == "word_net":
        self._lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError()
    self._cache = {}
    self.require_unique_match = require_unique_match
def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
    self.lower = lower
    self.strip = strip
    #self.stopwords = stopwords or set(sw.words('english'))
    self.punct = punct or set(string.punctuation)
    self.lemmatizer = WordNetLemmatizer()
def __init__(self, exclude_stopwords=False, lemmatize=True):
    try:
        import nltk
        _NLTK_DISABLED = False
    except ImportError:
        _NLTK_DISABLED = True
    self.vocas = []         # id to word
    self.token2id = dict()  # word to id
    self.docfreq = []       # id to document frequency
    self.exclude_stopwords = exclude_stopwords
    self.stopwords_list = []
    if exclude_stopwords:
        stopwords_list = []
        # Too strict:
        #with open(os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
        #    stopwords_list = _f.read().replace('\n', '').split()
        if not _NLTK_DISABLED:
            stopwords_list += nltk.corpus.stopwords.words('english')
        self.stopwords_list = set(stopwords_list)
    if lemmatize:
        if not _NLTK_DISABLED:
            self.wlemm = nltk.WordNetLemmatizer()
        else:
            print('Warning: no lemmatizer!')
def __init__(self, stopwords=None):
    self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english'))
    self.lemmatizer = nltk.WordNetLemmatizer()
def test_lemmatize_with_pos():
    text = "The restaurants nearby are better than the shops further away"
    words = nltk.word_tokenize(text)
    lemmatizer = nltk.WordNetLemmatizer()
    print(utility.lemmatize_with_pos(lemmatizer, words))
def noun_phrases(text, included_unnormalized=False):
    '''applies normalization to the terms found by noun_phrases_as_tokens
    and joins on '_'.

    :rtype: list of phrase strings with spaces replaced by ``_``.
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    def normalize(word):
        '''Normalizes a word to lowercase, then stems and lemmatizes it.'''
        word = word.lower()
        try:
            # PorterStemmer.stem_word was removed in newer NLTK; use stem.
            word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word)
        except Exception:
            pass
        return word

    normalizations = defaultdict(list)
    for terms in noun_phrases_as_tokens(text):
        key = u'_'.join(map(normalize, terms))
        normalizations[key].append(u' '.join(terms))

    if included_unnormalized:
        return list(normalizations.keys()), normalizations
    else:
        return list(normalizations.keys())
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    # Map Penn Treebank tag prefixes to WordNet POS tags.
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
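A plausible sketch of how this tagger/dictionary pair gets used downstream (the method name lemmatize_text and the fallback to wn.NOUN are assumptions, not from the original project):

def lemmatize_text(self, text):
    # Hypothetical helper: tag each token, translate the first letter of
    # its Penn Treebank tag into a WordNet POS, and lemmatize accordingly.
    tagged = self.tagger.tag(nltk.word_tokenize(text))
    return [self.lem.lemmatize(token, self.translation_dict.get(tag[0], wn.NOUN))
            for token, tag in tagged]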
def __init__(self):
    self.lemmatizer = WordNetLemmatizer()