The following 14 code examples, extracted from open-source Python projects, illustrate how to use nltk.RegexpParser().
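Before the project examples, here is a minimal, self-contained sketch of the typical workflow: POS-tag a sentence, build a RegexpParser from a tag-pattern grammar, and parse the tagged tokens into a chunk tree. The sentence and grammar below are illustrative only (not taken from the projects) and assume the standard NLTK tokenizer and tagger data packages are installed.

import nltk

# Illustrative grammar: an NP is an optional determiner, any number of
# adjectives, and one or more nouns.
grammar = r"NP: {<DT>?<JJ>*<NN.*>+}"
chunker = nltk.RegexpParser(grammar)

# Hypothetical input sentence.
tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog"))
tree = chunker.parse(tagged)  # an nltk.tree.Tree with NP subtrees
tree.pprint()
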
def nltk_parse_clause(sentence):
    """
    Natural Language Toolkit: code_cascaded_chunker
    http://www.nltk.org/book/ch07.html#code-cascaded-chunker
    """
    grammar = r"""
    NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
    PP: {<IN><NP>}               # Chunk prepositions followed by NP
    VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
    CLAUSE: {<NP><VP>}           # Chunk NP, VP
    """
    cp = nltk.RegexpParser(grammar)
    #sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
    parsed_sentence = cp.parse(sentence)
    #print('parsed_sentence=', parsed_sentence)

import itertools
import string

import nltk


def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives followed by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)

    # Get grammatical functions of words
    # What this is doing: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # Join phrases based on IOB syntax.
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]

    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))

    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words and not all(c in punct for c in l), candidates))

    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]

    return candidates

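A usage sketch for the keyphrase extractor above; the input text is invented, and the call assumes the NLTK 'punkt', tagger, 'stopwords', and 'wordnet' data packages are available.

text = ("Supervised machine learning methods are widely used for "
        "keyphrase extraction from scientific articles.")
# Prints a list of lowercased candidate keyphrases of at most three words each.
print(extract_chunks(text, max_words=3, lemmatize=False))
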
from nltk import pos_tag, word_tokenize, RegexpParser


def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    try:
        tagged = pos_tag(word_tokenize(sent))
        # Maybe actually better if possessives aren't included.
        # At least one Proper Noun (NNP) should be included in the noun chunk. Also a single NNP is
        # probably not enough information to identify a data source
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ""
            for leave in subtree.leaves():
                chunk += leave[0] + ' '
            chunks.append(chunk.strip())
        return chunked, chunks
    except Exception as e:
        print(str(e))

def setup_extractor(self):
    # PunktSentenceSplitter, self.grammars and self.lemma_to_token are
    # provided by the surrounding project.
    self.splitter = PunktSentenceSplitter(self.language)

    grammar = self.grammars.get(self.language)
    if grammar:
        self.parser = RegexpParser(grammar)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, self.grammars.keys())
        )

    for lemma, match_tokens in self.lemma_to_token.items():
        self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])

def main():
    verb = "<ADV>*<AUX>*<VERB><PART>*<ADV>*"
    word = "<NOUN|ADJ|ADV|DET|ADP>"
    preposition = "<ADP|ADJ>"

    rel_pattern = "( %s (%s* (%s)+ )? )+ " % (verb, word, preposition)
    grammar_long = '''REL_PHRASE: {%s}''' % rel_pattern
    print(grammar_long)

    reverb_pattern = nltk.RegexpParser(grammar_long)
    # test_patterns(reverb_pattern)
    process_chave(reverb_pattern)  # process_chave() is defined elsewhere in the project

def determine_entities(self):
    """ Determines noun entities within a patent claim.
    param: pos - list of tuples from nltk pos tagger"""
    # Define grammar for chunking
    grammar = '''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
        '''
    cp = nltk.RegexpParser(grammar)
    # Or store as part of claim object property?
    # Option: split into features / clauses, run over clauses and
    # then re-correlate
    return cp.parse(self.pos)

import logging

import nltk

# module-level logger (configured by the surrounding project)
logger = logging.getLogger(__name__)


def find_chunk(sent, chunk_rule=None):
    if not chunk_rule:
        chunk_rule = 'QWORD: <W.*><V.*><DT>*{<.*>*?<N.*>+}'
    logger.debug(chunk_rule)

    label = chunk_rule.split(':')[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            subtree = ' '.join([a[0] for a in subtree])
            return subtree

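A small usage sketch for the find_chunk above; the question sentence is invented, and the exact chunk returned depends on how the standard NLTK tagger tags it.

import nltk

sent = nltk.pos_tag(nltk.word_tokenize("What is the capital of France"))
# Returns the text of the first QWORD chunk, or None if the rule does not match.
print(find_chunk(sent))
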
def find_chunk(sent, chunk_rule=None):
    if not chunk_rule:
        chunk_rule = 'HCHUNK: <W.*><.*>*?{<N.*>+}'

    label = chunk_rule.split(':')[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            subtree = ' '.join([a[0] for a in subtree])
            print(subtree)
            return subtree


##this is required only once

from collections import OrderedDict

from nltk import RegexpParser
from nltk.tree import Tree


def get_parse_info(parsestr, stemmer, language, stoplist):
    hash_token_pos = OrderedDict()
    if language == 'german':
        grammar = r"""
            NBAR: {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP: {<V.*>}                # terminated with Verbs
            NP: {<NBAR>}
                {<NBAR><APPR><NBAR>}   # Above, connected with in/of/etc...
        """
    if language == 'english':
        # Taken from Su Nam Kim Paper...
        grammar = r"""
            NBAR: {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
            VP: {<V.*>}               # terminated with Verbs
            NP: {<NBAR>}
                {<NBAR><IN><NBAR>}    # Above, connected with in/of/etc...
        """
    chunker = RegexpParser(grammar)

    postoks = []
    for i in Tree.fromstring(parsestr).subtrees():
        if i.height() == 2:
            word, pos = i[0], i.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))

    chunk_tree = chunker.parse(postoks)
    # get_terms() is a helper defined elsewhere in the project
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list

def drawNamedEntityTree(self, text):
    tokenized_text = self.tokenizer.tokenize(text)
    tagged_text = self.tagWords(tokenized_text)
    grammar = "ENT: {<PESSOA>*}"
    cp = RegexpParser(grammar)
    res = cp.parse(tagged_text)
    res.draw()

# Tokenize sentences into words. Returns a list of the words that make up the text.

def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)

    # Custom grammar with NLTK
    # NP - Noun Phrase
    # NN - Noun
    # NNP - Proper Noun
    # V - Verb
    # JJ - Adjective

    # In a sentence that contains NN NNP V NN NN JJ NN,
    # the noun-phrases fetched are:
    # NP: NN NNP
    # NP: NN NN
    # NP: NN

    # Ex, "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)

    avoid_organizations = utilities.get_avoid_organizations()

    for sentence in tokenized_sentences:
        # tags all parts of speech in the tokenized sentences
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # then chunks with the custom grammar
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []

        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # if np_chunk is of grammar 'NP' then create a space separated string of all leaves under the 'NP' tree
                noun_phrase = ""
                for (org, tag) in np_chunk.leaves():
                    noun_phrase += org + ' '
                noun_phrases.append(noun_phrase.rstrip())

        # Using the named entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]

                # if organization is in the noun_phrase, there is a high chance of the noun_phrase containing the employer name
                # e.g., Delta Force is added to organizations even if only Delta is recognized as an organization, because Delta Force is a noun-phrase
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())

    return organizations

def label_nounphrases(self):
    """ Label noun phrases in the output from pos chunking. """
    grammar = '''
        NP: {<DT|PRP\$> <VBG> <NN.*>+}
            {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
            {<DT|PRP\$>? <JJ>* <NN.*>+ }
        '''
    cp = nltk.RegexpParser(grammar)
    result = cp.parse(self.pos)
    ptree = nltk.tree.ParentedTree.convert(result)
    subtrees = ptree.subtrees(filter=lambda x: x.label() == 'NP')

    # build up mapping dict - if not in dict add new entry id+1;
    # if in dict label using key
    mapping_dict = {}
    pos_to_np = {}
    for st in subtrees:
        np_string = " ".join(
            [leaf[0] for leaf in st.leaves() if leaf[1] not in ("DT", "PRP$")]
        )
        np_id = mapping_dict.get(np_string, None)
        if not np_id:
            # put ends_with here (ends_with() is a helper defined elsewhere in the project)
            nps = [i[0] for i in mapping_dict.items()]
            ends_with_list = [np for np in nps if ends_with(np_string, np)]
            if ends_with_list:
                np_id = mapping_dict[ends_with_list[0]]
            else:
                np_id = len(mapping_dict) + 1
            mapping_dict[np_string] = np_id
        pos_to_np[st.parent_index()] = np_id

    # Label Tree with entities
    flat_list = []
    for i in range(0, len(ptree)):
        # print(i)
        # Label
        if isinstance(ptree[i], nltk.tree.Tree):
            for leaf in ptree[i].leaves():
                # Unpack leaf and add label as triple
                flat_list.append((leaf[0], leaf[1], pos_to_np.get(i, "")))
        else:
            flat_list.append((ptree[i][0], ptree[i][1], pos_to_np.get(i, "")))
    return (flat_list, mapping_dict)

def noun_phrases_as_tokens(text):
    '''Generate a bag of lists of unnormalized tokens representing noun
    phrases from ``text``.

    This is built around python's nltk library for getting Noun Phrases
    (NPs). This is all documented in the NLTK Book
    http://www.nltk.org/book/ch03.html and blog posts that cite the book.

    :rtype: list of lists of strings
    '''
    ## from NLTK Book:
    sentence_re = r'''(?x)      # set flag to allow verbose regexps
          ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*            # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
    '''

    ## From Su Nam Kim paper:
    ## http://www.comp.nus.edu.sg/~kanmy/papers/10.1007_s10579-012-9210-3.pdf
    grammar = r'''
        NBAR: {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP: {<NBAR>}
            {<NBAR><IN><NBAR>}    # Above, connected with in/of/etc...
    '''
    if len(text.strip()) == 0:
        return []

    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    #print postoks

    tree = chunker.parse(postoks)
    stops = stopwords.words('english')
    stops += dossier_stopwords()  # dossier_stopwords() is a project-specific helper

    ## These next four functions are standard uses of NLTK illustrated by
    ## http://alexbowe.com/au-naturale/
    ## https://gist.github.com/alexbowe/879414
    def leaves(tree):
        '''Finds NP (nounphrase) leaf nodes of a chunk tree.'''
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        '''Checks conditions for acceptable word: length, stopword.'''
        return 2 <= len(word) <= 40 and word.lower() not in stops

    def get_terms(tree):
        for leaf in leaves(tree):
            yield [w for w, t in leaf if acceptable_word(w)]

    return list(get_terms(tree))