Python nltk module: WordNetLemmatizer() source-code examples

We extracted the following 17 code examples from open-source Python projects to demonstrate how to use nltk.WordNetLemmatizer().
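Before the project examples, here is a minimal usage sketch (not drawn from any project below): lemmatize() treats its input as a noun unless a part-of-speech tag is passed, and it needs the WordNet corpus downloaded first.

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # one-time download of the WordNet data

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cars'))              # 'car'   (default POS is noun)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'
print(lemmatizer.lemmatize('better', pos='a'))   # 'good'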

Project: LDA-REST    Author: valentinarho    | Project source | File source
from nltk.stem import WordNetLemmatizer

def LemTokens(tokens):
    # Lemmatize every token with WordNet's default noun part of speech.
    lemmer = WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]
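For illustration (this call is not part of the LDA-REST source), the helper expects pre-tokenized text; note that with the default noun POS, verb forms pass through unchanged:

import nltk  # word_tokenize requires the 'punkt' models: nltk.download('punkt')
print(LemTokens(nltk.word_tokenize("The cats are running")))
# ['The', 'cat', 'are', 'running']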
Project: dict_based_learning    Author: tombosc    | Project source | File source
def add_from_lemma_definitions(self, vocab, try_lower=False):
        """Add lemma definitions for non-lemmas.

        This code covers the following scenario: suppose a dictionary is crawled,
        but only for word lemmas.

        """
        lemmatizer = nltk.WordNetLemmatizer()
        added = 0
        for word in vocab.words:
            word_list = [word, word.lower()] if try_lower else [word]

            for word_to_lemma in word_list:
                try:
                    for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                        lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                        lemma_defs = self._data.get(lemma)
                        if lemma != word and lemma_defs:
                            # This can be quite slow. But this code will not be used
                            # very often.
                            for def_ in lemma_defs:
                                if def_ not in self._data[word]:
                                    added += 1
                                    self._data[word].append(def_)
                except Exception:
                    logger.error("lemmatizer crashed on {}".format(word))
                    logger.error(traceback.format_exc())
        logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
        self.save()
Project: dict_based_learning    Author: tombosc    | Project source | File source
def crawl_lemmas(self, vocab):
        """Add Wordnet lemmas as definitions."""
        lemmatizer = nltk.WordNetLemmatizer()
        for word in vocab.words:
            definitions = []
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word, part_of_speech)
                    if lemma != word and [lemma] not in definitions:
                        definitions.append([lemma])
            except Exception:
                logger.error("lemmatizer crashed on {}".format(word))
            if definitions:
                self._data[word] = definitions
        self.save()
Project: minke    Author: DistrictDataLabs    | Project source | File source
def __init__(self):
        self._wordnet = nltk.WordNetLemmatizer()
        self._cache   = {}
Project: vec4ir    Author: lgalke    | Project source | File source
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None
Project: Hacker_News_Article_Topics    Author: reeddunkle    | Project source | File source
import nltk

def lemmatize_individual_text(tokens):
    '''
    Given a list of tokens, return a list of lemmatized strings.
    '''

    lemmatizer = nltk.WordNetLemmatizer()

    # A list comprehension (rather than a bare map) keeps the promised
    # return type a list under Python 3.
    return [lemmatizer.lemmatize(token) for token in tokens]
Project: natural-language-preprocessings    Author: Hironsan    | Project source | File source
import nltk
from nltk.corpus import wordnet

def lemmatize_term(term, pos=None):
    if pos is None:
        # Infer the part of speech from the term's first WordNet synset.
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            # Satellite adjectives ('s') are lemmatized as plain adjectives ('a').
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
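A quick illustration of the helper above (the example calls are ours, not from the project):

print(lemmatize_term('geese'))         # 'goose' -- POS inferred from the first synset (noun)
print(lemmatize_term('running', 'v'))  # 'run'   -- an explicit POS skips the synset lookup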
Project: document-qa    Author: allenai    | Project source | File source
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {}
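The excerpt only initializes the cache; here is a sketch of how such a cache is typically consulted (the method name normalize is an assumption, not taken from document-qa):

    def normalize(self, token):
        # Hypothetical companion method: lower-case if configured, then
        # stem/lemmatize through the cache so repeated words cost a
        # dictionary lookup instead of a stemmer call.
        key = token.lower() if self.lower else token
        if key not in self.normalize_cache:
            self.normalize_cache[key] = self._stem(key)
        return self.normalize_cache[key]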
Project: document-qa    Author: allenai    | Project source | File source
def __init__(self, require_unique_match, lemmatizer="word_net",
                 empty_question_features=False, stop_words=None):
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError()
        self._cache = {}
        self.require_unique_match = require_unique_match
Project: ai-chatbot-framework    Author: alfredfrancis    | Project source | File source
def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        #self.stopwords  = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
Project: pymake    Author: dtrckd    | Project source | File source
def __init__(self, exclude_stopwords=False, lemmatize=True):

        try:
            import nltk
            _NLTK_DISABLED = False
        except ImportError:
            _NLTK_DISABLED = True

        self.vocas = []        # id to word
        self.token2id = dict() # word to id
        self.docfreq = []      # id to document frequency
        self.exclude_stopwords = exclude_stopwords

        self.stopwords_list = []
        if exclude_stopwords:
            stopwords_list = []  # local accumulator
            # Too strict:
            # with open(os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
            #     stopwords_list = _f.read().replace('\n', '').split()
            if not _NLTK_DISABLED:
                stopwords_list += nltk.corpus.stopwords.words('english')
            self.stopwords_list = set(stopwords_list)

        if lemmatize:
            if not _NLTK_DISABLED:
                self.wlemm = nltk.WordNetLemmatizer()
            else:
                print('Warning: no lemmatizer!')
Project: partisan-discourse    Author: DistrictDataLabs    | Project source | File source
def __init__(self, stopwords=None):
        self.stopwords  = set(stopwords or nltk.corpus.stopwords.words('english'))
        self.lemmatizer = nltk.WordNetLemmatizer()
Project: OpinionMining728    Author: stasi009    | Project source | File source
def test_lemmatize_with_pos():
    text = "The restaurants nearby are better than the shops further away"
    words = nltk.word_tokenize(text)
    lemmatizer = nltk.WordNetLemmatizer()
    print(utility.lemmatize_with_pos(lemmatizer, words))
Project: memex-dossier-open    Author: dossier    | Project source | File source
def noun_phrases(text, included_unnormalized=False):
    '''applies normalization to the terms found by noun_phrases_as_tokens
    and joins on '_'.

    :rtype: list of phrase strings with spaces replaced by ``_``.

    '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    def normalize(word):
        '''Normalize a word: lowercase it, then stem and lemmatize it.'''
        word = word.lower()
        try:
            word = stemmer.stem(word)  # 'stem_word' is gone from modern NLTK; 'stem' is the supported API
            word = lemmatizer.lemmatize(word)
        except Exception:
            pass
        return word

    normalizations = defaultdict(list)
    for terms in noun_phrases_as_tokens(text):
        key = u'_'.join(map(normalize, terms))
        normalizations[key].append(u' '.join(terms))

    if included_unnormalized:
        return list(normalizations.keys()), normalizations
    else:
        return list(normalizations.keys())
Project: Quadflor    Author: quadflor    | Project source | File source
def __init__(self):
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
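translation_dict maps the first letter of a Penn Treebank tag to a WordNet POS constant. A sketch of how such a setup is usually applied (the method name and the noun fallback are assumptions, not taken from Quadflor):

    def lemmatize_tokens(self, tokens):
        # Tag the tokens, map each Treebank tag's first letter to a
        # WordNet POS, and fall back to noun for unmapped tags.
        return [self.lem.lemmatize(token, self.translation_dict.get(tag[0], wn.NOUN))
                for token, tag in self.tagger.tag(tokens)]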
Project: Quadflor    Author: quadflor    | Project source | File source
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None
Project: nltk-api    Author: szyku    | Project source | File source
def __init__(self):
        self.lemmatizer = WordNetLemmatizer()