The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.tag().
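Before the project code, here is a minimal sketch (not taken from any of the projects below) of the two most common entry points in nltk.tag: the pre-trained pos_tag() function and a trainable tagger class. The sentence and training slice are illustrative only, and the snippet assumes the punkt, averaged_perceptron_tagger, and treebank data packages have been downloaded via nltk.download().

# Minimal sketch of the common nltk.tag entry points; the inputs are
# illustrative and the required NLTK data packages are assumed to be installed.
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

tokens = nltk.word_tokenize("NLTK tags each token with a part of speech.")
print(nltk.pos_tag(tokens))                    # list of (token, tag) pairs

# Train a simple tagger on tagged sentences from the Treebank sample corpus.
tagger = UnigramTagger(treebank.tagged_sents()[:1000])
print(tagger.tag(tokens))                      # unseen words get the tag None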
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'" %
            (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
def _pos1_gram_after_answer(self, row, flag):
    """The first POS tag following the answer span is [FLAG]

    - Args:
        row(pandas.dataframe): input pandas dataframe
        flag(string): symbol to match first tagger
    - Returns:
        binary(int): 1 match, 0 not match
    """
    question = row.Question
    if question:
        first_tagger = self._first_tagger_after_answer_span(question)
        if first_tagger == flag:
            return 1
        else:
            return 0
    else:
        return 0
def _pos1_gram_before_answer(self, row, flag):
    """The first POS tag before the answer span is [FLAG]

    - Args:
        row(pandas.dataframe): input pandas dataframe
        flag(string): symbol to match first tagger
    - Returns:
        binary(int): 1 match, 0 not match
    """
    question = row.Question
    if question:
        first_tagger = self._first_tagger_before_answer_span(question)
        if first_tagger == flag:
            return 1
        else:
            return 0
    else:
        return 0
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""

    def find_ngrams(input_list, num_):
        """Get ngrams of length num_ from the input list."""
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

    def stringify(s):
        return "".join(a[1] for a in s)

    def positionify(s):
        return tuple(a[0] for a in s)

    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

########
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
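For comparison, a minimal sketch (not part of the project above) of loading the same perceptron tagger through NLTK's default data path rather than a bundled pickle; it assumes the averaged_perceptron_tagger package has been downloaded.

# Sketch only: PerceptronTagger() with no arguments loads NLTK's own pickled
# model from nltk_data instead of the bundled phrasemachine pickle used above.
from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()   # load=True by default
tokens = TreebankWordTokenizer().tokenize("The perceptron tagger needs no Java backend.")
print(tagger.tag(tokens))     # list of (token, tag) pairs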
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0], tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def load_xml(self, xmldir):
    '''
    for KDD/WWW/UMD only
    :return: doclist
    '''
    for filename in os.listdir(xmldir):
        with open(xmldir + filename) as textfile:
            doc = Document()
            doc.name = filename[:filename.find('.xml')]

            import string
            printable = set(string.printable)

            # print((filename))
            try:
                lines = textfile.readlines()
                xml = ''.join([filter(lambda x: x in printable, l) for l in lines])
                root = ET.fromstring(xml)
                doc.title = root.findall("title")[0].text
                doc.abstract = root.findall("abstract")[0].text
                doc.phrases = [n.text for n in root.findall("*/tag")]

                self.doclist.append(doc)

            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % filename)
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))

        tagged_source.append(text)

    return tagged_source
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to a list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.
    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
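The (tag, chunk_tag) pairs produced here are training data for the tagger that the parse() methods below call. The sketch that follows shows one plausible wiring, assuming the conll2000 corpus has been downloaded; the unigram/bigram backoff chain is an assumption, not necessarily the original project's training code.

# Sketch: training a tagger-based chunker on conll_tag_chunks() output.
# The unigram/bigram backoff choice is an assumption, not the original code.
from nltk.corpus import conll2000                     # requires the conll2000 download
from nltk.tag import UnigramTagger, BigramTagger

train_chunks = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
train_data = conll_tag_chunks(train_chunks)           # [[('DT', 'B-NP'), ('NN', 'I-NP')], ...]

u_tagger = UnigramTagger(train_data)
ub_tagger = BigramTagger(train_data, backoff=u_tagger)  # would serve as self.tagger in parse()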
def parse(self, tagged_sent):
    '''Parse tagged tokens into a parse Tree of chunks.'''
    if not tagged_sent:
        return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll str for tree parsing
    wtc = zip(words, chunks)
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
def parse(self, tagged_sent):
    if not tagged_sent:
        return None
    chunks = self.tagger.tag(tagged_sent)
    return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
def parse(self, tagged_sent):
    iobs = []
    in_person = False
    for word, tag in tagged_sent:
        if word in self.name_set and in_person:
            iobs.append((word, tag, 'I-PERSON'))
        elif word in self.name_set:
            iobs.append((word, tag, 'B-PERSON'))
            in_person = True
        else:
            iobs.append((word, tag, 'O'))
            in_person = False
    return conlltags2tree(iobs)
def iob_locations(self, tagged_sent):
    i = 0
    l = len(tagged_sent)
    inside = False

    while i < l:
        word, tag = tagged_sent[i]
        j = i + 1
        k = j + self.lookahead
        nextwords, nexttags = [], []
        loc = False

        # lookahead in the sentence to find multi-word locations
        while j < k:
            if ' '.join([word] + nextwords) in self.locations:
                # combine multiple separate locations into a single location chunk
                if inside:
                    yield word, tag, 'I-LOCATION'
                else:
                    yield word, tag, 'B-LOCATION'

                # every next word is inside the location chunk
                for nword, ntag in zip(nextwords, nexttags):
                    yield nword, ntag, 'I-LOCATION'

                # found a location, so we're inside a chunk
                loc, inside = True, True
                # move forward to the next word since the current words
                # are already chunked
                i = j
                break

            if j < l:
                nextword, nexttag = tagged_sent[j]
                nextwords.append(nextword)
                nexttags.append(nexttag)
                j += 1
            else:
                break

        # if no location was found, then we're outside the location chunk
        if not loc:
            inside = False
            i += 1
            yield word, tag, 'O'
def ieer_chunked_sents(tag=nltk.tag.pos_tag):
    for doc in ieer.parsed_docs():
        tagged = ieertree2conlltags(doc.text, tag)
        yield conlltags2tree(tagged)
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if (len(synonym.text.split()) > 2) or \
       (synonym.lemma == token.lemma) or \
       (synonym.tag != token.tag) or \
       (token.text.lower() == 'be'):
        return False
    else:
        return True
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tag tuples into tag strings or just words.

    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
def map_words(self, _text):
    mapping = defaultdict(list)
    tagged_words = pos_tag(set(self.get_words(_text)))
    for word, tag in tagged_words:
        mapping[tag].append(word)
    return mapping
def postagger(sent):
    text = nltk.word_tokenize(sent)
    posTagged = pos_tag(text)
    # simplifiedTags = [map_tag('en-ptb', 'universal', tag) for word, tag in posTagged]
    return posTagged
def tag(text):
    """Take a ``list`` of tokens and return a ``list`` of tagged tuples
    of the form [('word', 'tag')]."""
    tagger = nltk.tag.UnigramTagger(model=data())  # backoff=default_tagger)
    return tagger.tag(text)
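UnigramTagger(model=...) expects a plain {word: tag} lookup dictionary, which is presumably what data() returns here. A minimal illustration with made-up entries:

# Illustrative only: UnigramTagger(model=...) takes a {word: tag} lookup dict;
# data() above presumably returns such a dict for the target language.
import nltk

model = {'the': 'DT', 'cat': 'NN', 'sat': 'VBD'}
tagger = nltk.tag.UnigramTagger(model=model)
print(tagger.tag(['the', 'cat', 'sat']))   # [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]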
def logmsg(s):
    # would be better to use python logger
    print>>sys.stderr, "[phrasemachine] %s" % s

############## SimpleNP
## Uses a five-tag coarse grammar.
## tagset: A D P N O
# Requires conversion from PTB or Petrov/Gimpel tags to our system.
# "Coarse*" indicates petrov/gimpel
# Grammar change from the FST version: can't repeat NUM in both adj and noun.
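The tag2coarse dictionary used throughout these phrasemachine excerpts is not shown. The sketch below is a rough guess at what such a PTB-to-coarse mapping looks like; the exact entries are an assumption, not the project's actual table.

# Rough sketch of a PTB -> coarse {A, D, P, N, O} mapping like tag2coarse;
# the exact entries are an assumption, not the project's real table.
ptb_groups = {
    'A': ['JJ', 'JJR', 'JJS'],            # adjectives
    'D': ['DT'],                          # determiners
    'P': ['IN'],                          # prepositions
    'N': ['NN', 'NNS', 'NNP', 'NNPS'],    # nouns
}
tag2coarse_sketch = {ptb: c for c, tags in ptb_groups.items() for ptb in tags}
# everything else falls through to 'O' via tag2coarse.get(tag, 'O')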
def coarse_tag_str(pos_seq):
    """Convert a POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming a list of POS tags as input.
# >>> pyre.extract_finditer(["VB", "JJ", "NN", "NN", "QQ", "QQ"])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB", "JJ", "NN", "NN", "QQ", "QQ"])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
def tag_text(self, text):
    '''take input text and return tokens w/ part of speech tags using NLTK'''
    # putting import here instead of top of file b.c. not all will have nltk installed
    sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think it assumes ascii
    word_pos_pairs = []
    all_tokens = []
    for sent in sents:
        tokens = self.tokenize(sent)
        all_tokens = all_tokens + tokens
        word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
def _ner_features(self, row):
    """Named entity recognition features

    - Args:
        row(pandas.dataframe): dataframe of current row
    - Returns:
        row(pandas.dataframe): a pandas dataframe with the new features
    """
    answer = row.Answer
    question = row.Question
    if answer is not None and question is not None:
        sentence_len = len(row.Sentence.split())
        ners_answer = self.st.tag(answer.split())
        ners_question = self.st.tag(question.split())
        ner_values_answer = [v for k, v in ners_answer if v in [
            'PERSON', 'ORGANIZATION', 'LOCATION']]
        ner_values_question = [v for k, v in ners_question if v in [
            'PERSON', 'ORGANIZATION', 'LOCATION']]
    else:
        return None

    # NER IN ANSWER
    if 'PERSON' in ner_values_answer:
        row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'] = 1
    else:
        row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'] = 0
    if 'ORGANIZATION' in ner_values_answer:
        row['NAMED_ENTITY_IN_ANSWER_COUNT_ORG'] = 1
    else:
        row['NAMED_ENTITY_IN_ANSWER_COUNT_ORG'] = 0
    if 'LOCATION' in ner_values_answer:
        row['NAMED_ENTITY_IN_ANSWER_COUNT_LOC'] = 1
    else:
        row['NAMED_ENTITY_IN_ANSWER_COUNT_LOC'] = 0

    # NER IN QUESTION
    if 'PERSON' in ner_values_question:
        row['NAMED_ENTITY_OUT_ANSWER_COUNT_PERS'] = 1
    else:
        row['NAMED_ENTITY_OUT_ANSWER_COUNT_PERS'] = 0
    if 'ORGANIZATION' in ner_values_question:
        row['NAMED_ENTITY_OUT_ANSWER_COUNT_ORG'] = 1
    else:
        row['NAMED_ENTITY_OUT_ANSWER_COUNT_ORG'] = 0
    if 'LOCATION' in ner_values_question:
        row['NAMED_ENTITY_OUT_ANSWER_COUNT_LOC'] = 1
    else:
        row['NAMED_ENTITY_OUT_ANSWER_COUNT_LOC'] = 0

    row['NUM_NAMED_ENTITIES_IN_ANSWER'] = len(ner_values_answer)
    row['NUM_NAMED_ENTITIES_OUT_ANSWER'] = len(ner_values_question)
    row['ANSWER_NAMED_ENTITY_DENSITY'] = float(
        len(ner_values_answer)) / sentence_len
    row['QUESTION_NAMED_ENTITY_DENSITY'] = float(
        len(ner_values_question)) / sentence_len
    return row
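self.st here is presumably a Stanford NER tagger wrapper, given the PERSON/ORGANIZATION/LOCATION labels. A sketch of constructing one with nltk.tag.StanfordNERTagger follows; the model and jar paths are placeholders, not the project's real configuration.

# Sketch only: self.st is presumably something like StanfordNERTagger; the
# model and jar paths below are placeholders, not the project's real paths.
from nltk.tag import StanfordNERTagger

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',   # placeholder path
                       'stanford-ner.jar')                        # placeholder path
print(st.tag('Barack Obama visited Paris'.split()))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'), ('Paris', 'LOCATION')]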
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])
        test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))

        test_data_plain = zip(*(test_set['source'], test_set['target']))
        test_size = len(test_data_plain)

        # Alternatively to setting the CLASSPATH, add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in xrange(len(test_data_plain)):  # len(test_data_plain)
            test_s_o, test_t_o = test_data_plain[idx]
            source = keyphrase_utils.cut_zero(test_s_o, idx2word)

            print(source)

            # Add other jars from the Stanford directory
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text)
def build_history(data_list, supported_tags_phones, supported_tags):
    history_list = []  # list of all histories
    sents = []
    count = 0
    expected = []
    for data in data_list:
        # data is the inputs entered by a given student
        data1 = data['data']
        # data1 is for every sentence entered by user
        for rec in data1:
            updates = rec['updates']
            sent = rec['sentence']
            relatedTags = []
            relations = []
            if "rels" in rec.keys():
                relatedEntities = rec['rels']
                expected.append(relatedEntities)
                for i in relatedEntities:
                    relations.append(i.keys())
                    for j in i[i.keys()[0]]:
                        relatedTags.append(j)
            words = []
            posTaggedSent = postagger(sent)
            # chunkPhrases = chunker(sent)
            if len(updates) == len(posTaggedSent):
                for i in range(len(updates)):
                    words.append({"word": updates[i]['word'], "pos": posTaggedSent[i], "tag": updates[i]['tag']})
                    # ------------------------------------------------------------------------------------------------
                    # NOTE: below code is a temporary hack to build the MaxEnt for just 2 tags - we will change this later
                    if updates[i]['tag'] not in supported_tags_phones:
                        if updates[i]['tag'] == "Model":
                            updates[i]['tag'] = "Version"
                        else:
                            updates[i]['tag'] = "Other"
                    # ------------------------------------------------------------------------------------------------
            sents.append(words)
            history = {}
            history['sentence'] = words
            history['i'] = count + 1
            # history['phrases'] = chunkPhrases
            history['relatedTags'] = relatedTags
            if len(relations) > 0:
                history_list.append((history, relations[0][0],))
            else:
                history_list.append((history, "None",))
            count += 1
    return (history_list, sents, expected)
def chunker(sent):
    # a = [("I","PRP"),("hear","VBP"),("Jerusalem","NNP"),("bells","NNS"),("ringing","VBG")]
    # input_sent = " Rockwell said the agreement calls for it to supply 200 addititonal so-called shipsets for the planes."
    input_sent = sent
    text = nltk.word_tokenize(input_sent)
    a = nltk.pos_tag(text)
    phrases = []
    tup = ()

    '''test_sents = conll2000.chunked_sents('test.txt', chunk_types=['VP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])'''

    NP_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    VP_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])

    class ChunkParser(nltk.ChunkParserI):
        def __init__(self, train_sents):
            train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
            self.tagger = nltk.TrigramTagger(train_data)

        def parse(self, sentence):
            pos_tags = [pos for (word, pos) in sentence]
            tagged_pos_tags = self.tagger.tag(pos_tags)
            chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
            conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
            return nltk.chunk.util.conlltags2tree(conlltags)

    NPChunker = ChunkParser(NP_sents)
    VPChunker = ChunkParser(VP_sents)
    # print(NPChunker.parse("I hear Jerusalem bells ringing"))

    parsed_sent = NPChunker.parse(a)
    for i in parsed_sent:
        if type(i) != type(tup):
            l = []
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"NP": " ".join(l)})

    parsed_sent = VPChunker.parse(a)
    for i in parsed_sent:
        if type(i) != type(tup):
            l = []
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"VP": " ".join(l)})

    return phrases