The following 4 code examples, extracted from open-source Python projects, illustrate how to use nltk.chunk().
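Before the project examples, here is a minimal, self-contained sketch of the NLTK chunking API they all build on: RegexpParser chunks a POS-tagged sentence into a tree according to a grammar, and tree2conlltags flattens that tree into (word, tag, IOB-chunk) triples. The grammar string below is illustrative, not taken from any of the projects.

import nltk
from nltk.chunk import RegexpParser
from nltk.chunk.util import tree2conlltags

# Illustrative grammar: a noun phrase (NP) is an optional determiner,
# any number of adjectives, and one or more nouns.
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
chunker = RegexpParser(grammar)

# Requires the NLTK tokenizer and tagger models to be downloaded.
sent = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog"))
tree = chunker.parse(sent)     # an nltk.Tree with NP subtrees
iob = tree2conlltags(tree)     # [(word, pos, 'B-NP' / 'I-NP' / 'O'), ...]
print(iob)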
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):

    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
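The function above relies on names supplied by its surrounding module: a GRAMMAR string, a normalize() helper, and the nltk/RegexpParser/tree2conlltags/groupby imports. A hypothetical way to exercise it is sketched below; the grammar and the pass-through normalize() are stand-ins, not the original project's definitions, and they must exist before the def above runs because GRAMMAR is used as a default argument.

import nltk
from itertools import groupby
from nltk.chunk import RegexpParser
from nltk.chunk.util import tree2conlltags

# Stand-in definitions (hypothetical); the source project supplies its own.
GRAMMAR = "KT: {<JJ>*<NN.*>+}"   # keyterm grammar: adjectives followed by nouns
def normalize(sent):
    # The original project cleans the tagged sentence here (stopwords, punctuation).
    return sent

for phrase in extract_candidate_phrases(["Natural language processing with Python is fun."]):
    print(phrase)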
def measure_pattern_time_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,       # Split punctuation marks from words?
    tags = True,           # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,        # Parse chunks? (NP, VP, PNP, ...)
    relations = False,     # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,       # Parse lemmata? (ate => eat)
    encoding = 'utf-8',    # Input string encoding.
    tagset = None)         # Penn Treebank II (default) or UNIVERSAL.

from pattern.search import search

def measure_pattern_search():
    global pattern_search_result  # Make measure_pattern_search able to modify the value
    pattern_search_result = search("%s", text_tree)

from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)

def pattern_search_timeit():
    runtimes = [pattern_search_time.timeit(number=1) for i in range(0, %s)]
    average = sum(runtimes) / len(runtimes)
    # return ''.join(['timeit: #runs=', str(%s), ' ; average=', str(average), ' ; min=', str(min(runtimes))])
    return [runtimes, average, min(runtimes)]

channel.send(pattern_search_timeit())
""" % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
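This function benchmarks pattern.search over a parse tree that is built inside a separate Python 2.7 interpreter spawned through execnet. A hypothetical call is sketched below; it assumes execnet is installed locally and that the pattern library and the NLTK Brown corpus are available to the python2.7 interpreter. The pattern string is interpolated into the remote source, so it must be a valid pattern.search query, for example a POS-tag sequence.

import execnet  # needed by measure_pattern_time_v2

# Hypothetical call: time 10 searches for the tag sequence "JJ NN"
# over a parse tree of the first 1000 words of the Brown corpus.
runtimes, average, fastest = measure_pattern_time_v2(10, 1000, "JJ NN")
print("runs=%d average=%.4fs min=%.4fs" % (len(runtimes), average, fastest))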
def pyrata2conll(dictList, **kwargs):
    """
    See 3.1 Reading IOB Format and the CoNLL 2000 Corpus http://www.nltk.org/book/ch07.html
    can be used with nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    """
    if 'raw' in kwargs.keys():
        rawFeatureName = kwargs['raw']
    if 'pos' in kwargs.keys():
        posFeatureName = kwargs['pos']
    if 'chunk' in kwargs.keys():
        chunkFeatureName = kwargs['chunk']

    text = ''
    for e in dictList:
        text += ' '.join([e[rawFeatureName], e[posFeatureName], e[chunkFeatureName], '\n'])
    return text

# extend a given dictList
# merge dictList

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Run all the tests
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
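pyrata2conll takes a list of token dictionaries plus keyword arguments naming the surface-form, POS, and chunk keys, and returns a string in the CoNLL 2000 IOB format, which nltk.chunk.conllstr2tree can turn back into a chunk tree (as the docstring notes). A hypothetical call, with illustrative key names:

import nltk

tokens = [
    {'raw': 'We',     'pos': 'PRP', 'chunk': 'B-NP'},
    {'raw': 'saw',    'pos': 'VBD', 'chunk': 'B-VP'},
    {'raw': 'the',    'pos': 'DT',  'chunk': 'B-NP'},
    {'raw': 'yellow', 'pos': 'JJ',  'chunk': 'I-NP'},
    {'raw': 'dog',    'pos': 'NN',  'chunk': 'I-NP'},
]
text = pyrata2conll(tokens, raw='raw', pos='pos', chunk='chunk')
tree = nltk.chunk.conllstr2tree(text, chunk_types=['NP'])
print(tree)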
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        # Yield candidates that are not filtered by stopwords and punctuation.
        for chunk in normalizer.normalize(chunks):
            yield chunk
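This variant differs from the first example mainly in its tokenizer (wordpunct_tokenize instead of word_tokenize) and in deferring stopword and punctuation filtering to a Normalizer object applied to the extracted chunks, rather than normalizing the tagged sentence up front. GRAMMAR, Normalizer, and the imports again come from the surrounding module; a hypothetical stand-in Normalizer for experimentation could be as simple as:

import string
from nltk.corpus import stopwords

class Normalizer(object):
    """Hypothetical stand-in: drops candidates that are stopwords or pure punctuation."""
    def __init__(self, language='english', **kwargs):
        self.stopwords = set(stopwords.words(language))
        self.punct = set(string.punctuation)

    def normalize(self, chunks):
        for chunk in chunks:
            if chunk not in self.stopwords and not all(c in self.punct for c in chunk):
                yield chunk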