The following 26 code examples, extracted from open-source Python projects, illustrate how to use nltk.ne_chunk().
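Before the project-specific examples, here is a minimal, self-contained sketch of the usual nltk.ne_chunk() workflow: tokenize into sentences and words, POS-tag, run ne_chunk(), and collect the nltk.Tree subtrees whose labels mark entity types. The helper name extract_named_entities and the sample sentence are only for illustration and do not come from any of the projects below; the exact entity labels you get back depend on the NLTK models installed.

# Minimal sketch of the nltk.ne_chunk() pipeline (assumes the standard NLTK data
# packages -- punkt, averaged_perceptron_tagger, maxent_ne_chunker, words --
# have already been fetched with nltk.download()).
import nltk

def extract_named_entities(text):
    entities = []
    for sent in nltk.sent_tokenize(text):
        # ne_chunk() expects POS-tagged tokens and returns an nltk.Tree
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for chunk in nltk.ne_chunk(tagged):
            # entity chunks come back as subtrees labeled PERSON, ORGANIZATION, GPE, ...
            if isinstance(chunk, nltk.tree.Tree):
                name = ' '.join(token for token, pos in chunk.leaves())
                entities.append((chunk.label(), name))
    return entities

print(extract_named_entities("Mark works at Google in New York."))
# typically something like [('PERSON', 'Mark'), ('ORGANIZATION', 'Google'), ('GPE', 'New York')]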
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk
def process(self, fc, context=None):
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                #print chunk.node, name
                names[label][name] += 1
    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
def get_continuous_chunks(self, text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == nltk.Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #     if type(chunk) is nltk.tree.Tree:
    #         if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #             score += 10

    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4

    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6

    return score

# WHEN RULES
def performNameExtraction(text):
    # Returns a list of what NLTK defines as persons after processing the text passed into it.
    try:
        entity_names = []
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label:
                    if chunk.label() == 'PERSON':
                        name_value = ' '.join(child[0] for child in chunk.leaves())
                        if name_value not in entity_names:
                            entity_names.append(name_value)
    except:
        print "Unexpected error:", sys.exc_info()[0]
    return entity_names
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
def fetch_name(resume_text):
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    for sentence in tokenized_sentences:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
            if hasattr(chunk, 'label'):  # and chunk.label() == 'PERSON':
                chunk = chunk[0]
                (name, tag) = chunk
                if tag == 'NOUN':
                    return name
    return "Applicant name couldn't be processed"
def extract_entities(text):
    result = dict()
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
        # chunk.draw()
        if isinstance(chunk, nltk.tree.Tree):
            for subtree in chunk.subtrees(filter=lambda t: (t.label() == 'PERSON' or t.label() == 'GPE' or t.label() == 'LOCATION')):
                for leave in subtree.leaves():
                    if leave[0].lower() not in irrelevant_loc_words:
                        result[leave[0].lower()] = subtree.label()
    # print result
    return result
def find_named_entities(sent):
    tree = nltk.ne_chunk(sent)
    for st in tree.subtrees():
        if st.label() != 'S':
            logger.debug(st)
def extract(self, text, entity_description=False):
    # We need to clean the text in each method otherwise when we present it
    # to the user, it will have a different format
    text = self.remove_return_lines_and_quotes(text)
    sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
    # This function is quite expensive
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    entities_all = {} if entity_description else []
    #stop = stopwords.words('english')
    # more_stop_words = ['(' , ')', "'s" , ',', ':' , '<' , '>' , '.' , '-' , '&' ,'*','...' , 'therefore' , '.vs','hence']
    # stop = stopwords.words('english')
    # stop = stop + more_stop_words
    stop = ["a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b",
            "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e",
            "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h",
            "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k",
            "keep keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n",
            "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs",
            "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p",
            "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s",
            "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
            "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure"]
    for s in sentences:
        chunked = nltk.ne_chunk(s, binary=True)
        for n in chunked:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NE':
                    entities_all = self.getEntity(n, stop, entities_all, entity_description)
    if entity_description:
        return entities_all
    else:
        return list(set(entities_all))
def extract_org(sent):
    pos = pos_tag(nltk.tokenize.word_tokenize(sent))
    sentt = nltk.ne_chunk(pos, binary=False)
    org = []
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'GPE' or t.label() == 'ORGANIZATION'):
        for leave in subtree.leaves():
            org.append(leave)
    return org
def create_phrase(self, phrase_str):
    tokenized_phrase = nltk.word_tokenize(phrase_str)
    tagged_phrase = nltk.pos_tag(tokenized_phrase)
    ne_chunk_tree = nltk.ne_chunk(tagged_phrase)
    #if (line_num in bluh):
    #    print(str(line_num)+". "+str(ne_chunk_tree))
    merge_tokens = self._find_multi_token_nnp(ne_chunk_tree)
    ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)
    #if (line_num in bluh):
    #    print(str(line_num)+". "+str(ne_chunk_list))
    tokens = []  # list of tagged tuples
    for token in ne_chunk_list:
        if type(token) is nltk.tree.Tree:
            tokens.append(self._tree_to_tuple(token))
        else:
            if (token[0] in self._keywords):
                token = (token[0], self._keywords[token[0]])
            tokens.append(token)
    #if (line_num in bluh):
    #    print(str(line_num)+". "+str(tokens))
    phrase = Phrase(tokens)
    return phrase

# input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
# output: list of tuples/trees containing nltk tokens
# purpose: merge tokens in ne_chunk_tree using index ranges listed in the merge_tokens input argument; flatten ne_chunk_tree from an nltk tree to a list
def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)

    # Custom grammar with NLTK
    # NP - Noun Phrase
    # NN - Noun
    # NNP - Proper Noun
    # V - Verb
    # JJ - Adjective
    #
    # In a sentence that contains NN NNNP V NN NN JJ NN,
    # the noun-phrases fetched are:
    # NP: NN NNP
    # NP: NN NN
    # NP: NN
    #
    # Ex, "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)

    avoid_organizations = utilities.get_avoid_organizations()

    for sentence in tokenized_sentences:
        # tags all parts of speech in the tokenized sentences
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # then chunks with the custom grammar
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []

        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # if np_chunk is of grammar 'NP', create a space-separated string of all leaves under the 'NP' tree
                noun_phrase = ""
                for (org, tag) in np_chunk.leaves():
                    noun_phrase += org + ' '
                noun_phrases.append(noun_phrase.rstrip())

        # Using the named entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]
                # if the organization is in the noun_phrase, there is a high chance the noun_phrase contains the employer name
                # e.g. Delta Force is added to organizations even if only Delta is recognized as an organization, because Delta Force is a noun-phrase
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())

    return organizations
def extract_all(use_random_forest):
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])

    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data
def extract_one(email):
    # use random-forest to find email category
    category = rf_categorize(email)
    if category != 'negatives_clean':
        fields = features[category]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})

        # return categorized email with field guess probabilities
        return [category, email, matches]
def whoRules(question, sentenceOriginal):
    score = 0
    hasNameQuestion = False
    hasNameSentence = False
    hasnameSentence = False
    hasHumanSentence = False
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #     if type(chunk) is nltk.tree.Tree:
    #         if 'PERSON' in chunk.label() or 'ORGANIZATION' in chunk.label():
    #             score += 10

    for item in question:
        if item in NAME:
            hasNameQuestion = True
            #break
        if item in HUMAN and item in sentence:
            score += 10

    for item in sentence:
        if item in NAME:
            hasNameSentence = True
        if 'name' in item:
            hasnameSentence = True
        if item in HUMAN:
            hasHumanSentence = True

    # RULE 2
    if not hasNameQuestion and hasNameSentence:
        score += 6

    # RULE 3
    if not hasNameQuestion and hasnameSentence:
        score += 4

    # RULE 4
    if hasNameSentence or hasHumanSentence:
        score += 4

    return score

# WHAT RULES