我们从 Python 开源项目中提取了以下 9 个代码示例，用于说明如何使用 nltk.BigramTagger()。
def train(self):
    """Train a bigram part-of-speech tagger on the Brown "news" sentences.

    Builds a backoff chain: an ``nltk.BigramTagger`` backs off to an
    ``nltk.UnigramTagger``, which backs off to an ``nltk.RegexpTagger``
    built from the pattern rules below. The finished tagger is stored on
    ``self.tagger`` and ``self._trained`` is set to ``True``.

    Returns:
        None.
    """
    train_data = nltk.corpus.brown.tagged_sents(categories='news')

    # RegexpTagger tries the rules in order and uses the first one that
    # matches, so the catch-all '.*' rule must stay last.
    regexp_tagger = nltk.RegexpTagger([
        # Cardinal numbers. The decimal point is escaped so that only a
        # literal "." may separate the integer and fractional digits; an
        # unescaped "." would accept any character there (e.g. "2x4").
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'(-|:|;)$', ':'),
        # NOTE(review): as written this only matches runs of apostrophes
        # (and the empty string); the intended target is unclear -- confirm.
        (r'\'*$', 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        # NOTE(review): 'NNP' looks like a Penn Treebank tag, whereas the
        # Brown training data appears to use 'NP' for proper nouns --
        # confirm which tag downstream code expects before changing it.
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ])

    unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
    self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
    self._trained = True
def __init__(self, train_sents):
    """Build the chunker's bigram tagger from chunked training sentences.

    Each sentence in ``train_sents`` is converted with
    ``nltk.chunk.tree2conlltags`` into triples; the first element of every
    triple is discarded and the remaining pair is kept. The resulting
    list of pair sequences trains an ``nltk.BigramTagger`` that is stored
    on ``self.tagger``.

    Args:
        train_sents: Iterable of sentences accepted by
            ``nltk.chunk.tree2conlltags``.
    """
    pair_sents = []
    for tree in train_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        # Keep only the second and third fields of each triple.
        pair_sents.append([(pos, chunk) for _word, pos, chunk in triples])

    self.tagger = nltk.BigramTagger(pair_sents)
def train(self):
    """Train the chunker on the CoNLL-2000 corpus.

    Loads the NP-chunked sentences from ``train.txt``, reduces each
    ``tree2conlltags`` triple to its last two fields, and fits a
    ``BigramTagger`` that backs off to a ``UnigramTagger`` trained on the
    same data. The tagger is stored on ``self.tagger`` and
    ``self._trained`` is set to ``True``.
    """
    np_trees = nltk.corpus.conll2000.chunked_sents('train.txt',
                                                   chunk_types=['NP'])

    def to_pairs(tree):
        # Drop the first field of every triple, keep the other two.
        return [(pos, chunk)
                for _, pos, chunk in nltk.chunk.tree2conlltags(tree)]

    training_pairs = [to_pairs(tree) for tree in np_trees]

    fallback = nltk.UnigramTagger(training_pairs)
    self.tagger = nltk.BigramTagger(training_pairs, backoff=fallback)
    self._trained = True