The following 50 code examples, extracted from open source Python projects, illustrate how to use nltk.pos_tag().
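Before the project excerpts, here is a minimal sketch of the basic calling convention (assuming the punkt and averaged_perceptron_tagger NLTK data packages have been downloaded): pos_tag() takes a list of tokens, not a raw string.

import nltk

# nltk.pos_tag() expects a list of tokens, so tokenize the sentence first.
tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging easy.")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...]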
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk
def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))
        #TODO this is duplicated job, should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []

    return result

# TODO definitely can be better if we knew where content is
def _identify_pronoun(self, answer):
    """Calculate percentage of pronouns within answer

    - Args:
        answer(str): answer text
    - Returns:
        percentage(float): ratio of pronouns in answer
    """
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    # init variables
    num_pronouns = 0
    num_terms = len(post)
    percentage = 0
    for k, v in post:
        if v in pronoun_list:
            num_pronouns += 1
    percentage = float(num_pronouns) / num_terms
    return percentage
def _identify_pronoun2(self, sentence):
    """Calculate percentage of pronouns in the sentence that are in the answer

    - Args:
        sentence(str): question sentence
    - Returns:
        pronoun_in_sentence(list): pronouns in sentence
        sentence_len(int): length of current sentence
    """
    text = nltk.word_tokenize(sentence)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    pronoun_in_sentence = []
    sentence_len = len(post)
    for k, v in post:
        if v in pronoun_list:
            pronoun_in_sentence.append(k)
    return pronoun_in_sentence, sentence_len
def _first_tagger_after_answer_span(self, question):
    """Get the first tagger after answer span

    - Args:
        question(string): string of current question
    - Returns:
        tagger(string): tagger of first term after span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == '_____':
            index = idx + 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def _first_tagger_before_answer_span(self, question):
    """Get the first tagger before answer span

    - Args:
        question(string): string of current question
    - Returns:
        tagger(string): tagger of first term before span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == "_____":
            index = idx - 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def tag(self, lines):
    '''
    Tokenize and categorise the words in the collection of text

    :param lines: The list of strings with the text to match
    :type lines: ``list`` of ``str``

    :rtype: :class:
    :return:
    '''
    try:
        tokenized_words = nltk.word_tokenize(lines)
        return nltk.pos_tag(tokenized_words)
    except LookupError as le:
        print("Run install_words.py first")
        raise le
def brown_data():
    """return the text_length first tokens of the brown corpus tagged in pyrata format"""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def tag_one(self, text, skip_unknown=True, **kwargs):
    """ POS-Tags the given text, optionally skipping unknown lemmas

        :param unicode text: Text to be tagged
        :param bool skip_unknown: Automatically remove unrecognized tags from the result

        Sample usage:

        >>> from strephit.commons.pos_tag import TTPosTagger
        >>> from pprint import pprint
        >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
        [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
         Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
         Tag(word=u'to', pos=u'TO', lemma=u'to'),
         Tag(word=u'be', pos=u'VB', lemma=u'be'),
         Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
    """
    return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)), skip_unknown)
def _get_base_doge_words(self, eng_text):
    """
    Get all base words from text to make doge phrases from.
    eg. 'Hello there, I am happy' -> ['hello', 'are', 'happy']

    Args:
        eng_text (str): Text to get words from.

    Returns:
        list[str]: List of lower case words to use from text.
    """
    phrase_no_punct = "".join([ch for ch in eng_text if ch not in string.punctuation])
    tagged_words = nltk.pos_tag([w.lower() for w in phrase_no_punct.split(' ') if w.isalpha()])
    chosen_words = []
    for word, tag in tagged_words:
        if tag[0] in ['N', 'V', 'J']:
            # make noun singular
            if tag[0] == 'N':
                word = self._lemmatizer.lemmatize(word, pos='n')
            # make verb infinitive
            elif tag[0] == 'V':
                word = self._lemmatizer.lemmatize(word, pos='v')
            chosen_words.append(word.encode('ascii', 'ignore'))  # lemmatize makes word unicode
    return list(set(chosen_words))
def _get_doge_descriptors(self, word_ls):
    """
    Get descriptors for a set of doge words.
    eg. ['person', 'run'] -> ['much', 'very']

    Args:
        word_ls (list[str]): List of words to use.

    Returns:
        list[str]: List of doge descriptors, eg. 'much', 'very', in order.
    """
    tagged_words = nltk.pos_tag(word_ls)
    chosen_descriptors = []
    for word, tag in tagged_words:
        possible_descs = [MUCH, MANY, SUCH, SO, VERY]
        if tag[0] == 'J':
            possible_descs.remove(VERY)
            possible_descs.remove(SO)
        if len(chosen_descriptors) >= 2:
            allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors[-2:]]
        else:
            allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors]
        chosen_descriptors.append(random.choice(allowed_descriptors))
    return chosen_descriptors
def extract_candidate_words(sents, tags=GOODTAGS, tagged=False, **kwargs):
    """
    Extracts key words based on a list of good part of speech tags.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Identify only good words by their tag
        for token, tag in sent:
            if tag in tags:
                for token in normalizer.normalize([token]):
                    yield token


##########################################################################
## Key phrase by text scoring mechanisms
##########################################################################
def normalize(self, words):
    """
    Normalizes a list of words.
    """
    # Add part of speech tags to the words
    words = nltk.pos_tag(words)

    for word, tag in words:
        if self.lower:
            word = word.lower()
        if self.strip:
            word = word.strip()

        if word not in self.stopwords:
            if not all(c in self.punct for c in word):
                if self.lemmatize:
                    word = self.lemmatizer.lemmatize(word, tag)
                yield word
def tagged_abstracts(RS_pmids_tokenizedabstracts_dict):
    """ Takes a dict of tokenized abstracts and tags them
    using the NLTK module for Natural Language Entities.
    Input dictionary: key is the RS ID, value is a dictionary
    where key is the pmid and value is a list of tokens"""
    RS_pmids_taggedabstracts_dict = {}
    for each_RS in RS_pmids_tokenizedabstracts_dict:
        pmids_taggedabstracts = {}
        pmids_tokenizedabstracts = RS_pmids_tokenizedabstracts_dict[each_RS]
        for pmid in pmids_tokenizedabstracts:
            taggedabstracts_list = []
            for token in pmids_tokenizedabstracts[pmid]:
                tagged = nltk.pos_tag(token)
                taggedabstracts_list.append(tagged)
            pmids_taggedabstracts[pmid] = taggedabstracts_list
        RS_pmids_taggedabstracts_dict[each_RS] = pmids_taggedabstracts
    return RS_pmids_taggedabstracts_dict
def from_sentence(sent):
    tokens = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokens)

    dg = DependencyGraph()
    for (index, (word, tag)) in enumerate(tagged):
        dg.nodes[index + 1] = {
            'word': word,
            'lemma': '_',
            'ctag': tag,
            'tag': tag,
            'feats': '_',
            'rel': '_',
            'deps': defaultdict(),
            'head': '_',
            'address': index + 1,
        }
    dg.connect_graph()
    return dg
def prepare_sentence(words, vectorizer=None, lemmatizer=None, max_words=78, return_output=True):
    X = np.ones((max_words, 300)) * ZERO_EPSILON
    if return_output:
        y = np.ones((max_words, 300)) * ZERO_EPSILON

    raw_pos = [p[1] for p in pos_tag(words)]
    pos = [str(treebank_to_simple(p, default=wordnet.NOUN)) for p in raw_pos]
    lemmas = [str(lemmatizer(w, pos=p)) for (w, p) in zip(words, pos)]

    num_words = len(words) if len(words) <= max_words else max_words
    for word_i in range(num_words):
        word_vector = vectorizer(words[word_i])
        X[word_i, :] = word_vector

        if return_output:
            lemma_vector = lemmas[word_i]
            y[word_i, :] = vectorizer(lemma_vector)

    if return_output:
        return X, y
    return X
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):
    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
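The GRAMMAR constant is not shown in the excerpt above. A plausible noun-phrase chunk grammar for RegexpParser might look like the following; this is an illustrative assumption, not the project's actual definition.

from nltk.chunk.regexp import RegexpParser

# Hypothetical keyphrase grammar: zero or more adjectives followed by one or more nouns.
GRAMMAR = r'KT: {<JJ>* <NN.*>+}'
chunker = RegexpParser(GRAMMAR)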
def get_counts():
    global unigrams
    global bigrams
    global sentences

    for i in xrange(1, NUM_FILES + 1):
        if i in SKIP:
            continue
        with open("Shakespeare_parsed/%03d" % i) as f:
            for line in f:
                tokens = get_tokens(line)
                tokens = [t.lower() for t in tokens]
                tags = nltk.pos_tag(tokens)
                if len(tokens) == 0:
                    continue
                sentences.append(tokens)
                prev_word = ""
                for token in tokens:
                    unigrams[token] += 1
                    if not prev_word == "":
                        bigrams[(prev_word, token)] += 1
                    prev_word = token

    top10_uni = unigrams.most_common()[:10]
    top10_bi = bigrams.most_common()[:10]
def tag_contexts(doc_id):
    global tags
    if not tags:
        tags = nltk.data.load("help/tagsets/upenn_tagset.pickle")

    words = defaultdict(Counter)
    count = Counter()

    for context in get_contexts(doc_id):
        for word, tag in nltk.pos_tag(tokenize(context)):
            words[tag].update([word])
            count.update([tag])

    tag_common_words = {tag: ' '.join(zip(*tag_words.most_common(10))[0])
                        for tag, tag_words in words.items()}

    for tag, freq in count.most_common(15):
        print "%4d\t%45s\t%s" % (freq, tags[tag][0], tag_common_words[tag])
def get_NN_entities(post):
    sentences = nltk.tokenize.sent_tokenize(post)
    token_sets = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_token_sets = [nltk.pos_tag(t) for t in token_sets]
    pos_tagged_tokens = [t for v in pos_tagged_token_sets for t in v]

    all_entities = []
    previous_pos = None
    current_entities = []
    for (entity, pos) in pos_tagged_tokens:
        if previous_pos == pos and pos.startswith('NN'):
            current_entities.append(entity.lower())
        elif pos.startswith('NN'):
            if current_entities != []:
                all_entities.append(' '.join(current_entities))
            current_entities = [entity.lower()]
        previous_pos = pos
    return all_entities
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
def clean_text(raw_text, filtered_word_types):
    """Clean raw text for bag-of-words model"""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)

    # Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))

    # Remove stop words if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if not wtype in filtered_word_types]

    # join together
    return " ".join(stemmed_words)
def get_lemmas(sent, lemmatizer):
    stop_words = []
    res = []
    for word in sent:
        pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
        if pos == '':
            lemma = lemmatizer.lemmatize(word)
        else:
            lemma = lemmatizer.lemmatize(word, pos)
        #if(type(lemma) == unicode):
        #    lemma = lemma.encode('ascii', 'ignore')
        if lemma.isdigit():
            res.append('number')
        else:
            res.append(lemma)
    return res
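The get_wordnet_pos() helper used above is not included in the excerpt. A minimal sketch of what such a mapping typically looks like follows; it is an assumption for illustration, not the project's original helper.

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag onto the WordNet POS constants expected by
    # WordNetLemmatizer; return '' when no sensible mapping exists.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''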
def pos_tag_questions(qstn_list):
    res = []
    count = 0
    for i in qstn_list:
        r = []
        i = i.split(':')
        r.append(i[0])
        r.append(i[1].split()[0])
        i = i[1].split()
        del i[0]
        sent = nltk.word_tokenize(' '.join(i))
        r.append(nltk.pos_tag(sent))
        res.append(tuple(r))
        count += 1
        if (count % 100) == 0:
            print("processed : " + str(count))
    return res

# experiment with different features to get better accuracy
# also dont forget to include the same feature extractor in the process_grammar.py
def __init__(self):
    super(RssSkill, self).__init__('RssSkill')
    self._is_reading_headlines = False
    self.feeds = {}
    self.cached_items = {}
    self.cache_time = {}
    try:
        pos_tag('advance')
    except LookupError:
        logger.debug('Tagger not installed... Trying to download')
        dler = Downloader()
        if not dler.download('averaged_perceptron_tagger'):
            logger.debug('Trying alternative source...')
            dler = Downloader(ALT_NLTK_DATA)
            dler.download('averaged_perceptron_tagger', raise_on_error=True)
def pos_tag_text(line, token_pattern=token_pattern, exclude_stopword=stopwords, encode_digit=False):
    token_pattern = re.compile(token_pattern, flags=re.UNICODE | re.LOCALE)
    for name in ["question1", "question2"]:
        l = line[name]
        ## tokenize
        tokens = [x.lower() for x in token_pattern.findall(l)]
        ## stem
        #tokens = l.lower().split()
        #print tokens
        tokens = stem_tokens(tokens, english_stemmer)
        line[name + '_stem'] = ' '.join(tokens)
        #print tokens
        if exclude_stopword:
            tokens = [x for x in tokens if x not in stopwords]
        tags = pos_tag(tokens)
        tags_list = [t for w, t in tags]
        tags_str = " ".join(tags_list)
        #print tags_str
        line[name + '_pos_tag'] = tags_str
    return line[[u'question1_stem', u'question1_pos_tag',
                 u'question2_stem', u'question2_pos_tag']]
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
def getPOSLinks(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)
    pos = nltk.pos_tag(text)
    links = []
    link = []
    active = False
    for w in pos:
        part = w[1]
        word = w[0]
        if(not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN")):
            active = True
        if(active):
            link.append(wordnet_lemmatizer.lemmatize(word))
        # extract main body
        if(active and (part == "PRP" or part[:2] == "NN" or part == ".")):
            active = False
            links.append(" ".join(link))
            link = []
    return links
def tag(path, filename):
    print("Tagging " + path)
    WRITE_HANDLER = open(PREPROCESSED_DATA + filename.strip() + "_features", 'w')
    for line in open(path, 'r'):
        tokens = line.split()
        if(len(tokens) == 0):
            continue

        tags = pos_tag(tokens)  # tag
        features = list()
        for token in tags:
            tok = token[0]
            tag = token[1]
            if tok.lower() not in stop_words:
                features.append(tok + ":" + tag)

        if(len(features) > 0):
            WRITE_HANDLER.write(str(features) + '\n\n')
        else:  ## EMPTY lines
            WRITE_HANDLER.write('\n\n')
def _analyze_query(self):
    tagged = nltk.pos_tag(self.ir_query)
    ir_query_tagged = []
    for word, pos in tagged:
        # Map the Penn Treebank tag onto a WordNet POS constant. The dict
        # keys are booleans, so the lookup key must be True (not the tag itself).
        pos = {
            pos.startswith('N'): wordnet.NOUN,
            pos.startswith('V'): wordnet.VERB,
            pos.startswith('J'): wordnet.ADJ,
            pos.startswith('R'): wordnet.ADV,
        }.get(True, None)
        if pos:
            synsets = wordnet.synsets(word, pos=pos)
        else:
            synsets = wordnet.synsets(word)
        ir_query_tagged.append((word, synsets))

    # Add additional special hidden term
    ir_query_tagged.append(('cause', [wordnet.synset('cause.v.01')]))

    self.ir_query_tagged = ir_query_tagged
def combine_pos_tag(self, pos_tag):
    noun = ['NN', 'NNS', 'NNP', 'NNPS']
    adjective = ['JJ', 'JJR', 'JJS']
    adverb = ['RB', 'RBR', 'RBS']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    wh = ['WDT', 'WP', 'WRB']
    if pos_tag in noun:
        return 'NN'
    elif pos_tag in adjective:
        return 'JJ'
    elif pos_tag in adverb:
        return 'RB'
    elif pos_tag in verb:
        return 'VB'
    elif pos_tag in wh:
        return 'WP'
    else:
        return pos_tag
def branch(words):
    """
    This is the initial filter of our input sentence. It tokenizes the words
    and tags the words with parts of speech. It then passes the tokenized and
    tagged words to 1 of 3 functions. A sentence is either declarative(),
    interrogative(), or imperative().

    Args:
        words (String): The words inputted by the user

    Returns:
        String: response from one of the three functions that handle type of sentences.
    """
    parts_of_speech = nltk.pos_tag(nltk.word_tokenize(words))
    leading_word = parts_of_speech[0][1][0]
    if leading_word == 'W':
        return interrogative(parts_of_speech[1:])
    elif leading_word == "V":
        return imperative(parts_of_speech)
    else:
        return declarative(parts_of_speech)
def tokenize(data, language="english", filterStopWords=False, tagging=False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*",
                   "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if not w in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if not w in stop_words]
    if tagging is True:
        tags = nltk.pos_tag(word_token)
    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False))
def change_sentence(self):
    text = nltk.tokenize.word_tokenize(self._sentence)
    changed = False
    for cur in nltk.pos_tag(text):
        if (cur[1] == "NN" or cur[1] == "NNP" or cur[1] == "RPR"):
            foundedTmura = self.getFromDB(cur[0])
            if foundedTmura == None:
                foundedTmura = getTmura(cur[0])
                if foundedTmura != "not found":
                    self.add2DB(cur[0], foundedTmura)
            if foundedTmura != "not found" and changed == False:
                if (foundedTmura.find("OR")):
                    foundedTmura = foundedTmura.replace('OR', 'or')
                if randrange(2) == 0:
                    rep = cur[0] + ", " + foundedTmura + ", "
                else:
                    rep = cur[0] + "(" + foundedTmura + ") "
                self._sentence = self._sentence.replace(cur[0], rep)
                changed = True
    return self._sentence
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n", FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n", FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #     if type(chunk) is nltk.tree.Tree:
    #         if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #             score += 10

    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4

    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6

    return score

# WHEN RULES
def check_imperative(self, paragraph):
    """
    Check the given sentence/s for Imperatives.

    :param paragraph: The input paragraph to be tested.
    :return: A list of tuples having 2 elements (invalid word, parts of
        speech) or an empty list if no invalid words are found.
    """
    words = nltk.word_tokenize(nltk.sent_tokenize(paragraph)[0])
    # VBZ : Verb, 3rd person singular present, like 'adds', 'writes' etc
    # VBD : Verb, Past tense, like 'added', 'wrote' etc
    # VBG : Verb, Present participle, like 'adding', 'writing'
    word, tag = nltk.pos_tag(['I'] + words)[1:2][0]
    if(tag.startswith('VBZ') or
       tag.startswith('VBD') or
       tag.startswith('VBG') or
       word.endswith('ing')):  # Handle special case for VBG
        return (word, tag)
    else:
        return None
def word_split(self, sentence):
    words = re.split(self.word_split_pattern, sentence)
    words = [w for w in words if len(w) > 0]
    words = ["::".join(tag) for tag in nltk.pos_tag(words)]
    return words
def _find_nouns(self, sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    nouns = [word for word, pos in tagged
             if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    filter_keywords = ['chuck', 'norris', 'quot']
    filtered = [i for i in nouns if not any(f in i.lower() for f in filter_keywords)]
    return filtered
def _count_token_with_match(self, answer, match):
    """Count answer match FLAG
    """
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    count = 0
    for k, v in post:
        if v == match:
            count += 1
    return count
def is_noun(word):
    POS = nltk.pos_tag([word])[0][1]
    return POS.startswith('NN')
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def normalize_tokens(self):
    if len(self.stindices) != len(self.enindices):
        sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
        return
    start = {}
    idx = 0
    for s in sorted(self.stindices):
        self.stindices[s] = idx
        start[idx] = s
        idx += 1
    end = {}
    idx = 0
    for t in sorted(self.enindices):
        self.enindices[t] = idx
        end[idx] = t
        if idx > 0 and end[idx - 1] > start[idx]:
            sys.stderr.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
            return
        token = self.text[start[idx] : t + 1].strip()
        if " " in token:
            sys.stderr.write("\t\tIssue: incorrect tokenization " + token + "\n")
            return
        if token == "":
            continue
        self.tokens.append(token)
        idx += 1
    try:
        self.nltkpostags = [ele[1] for ele in pos_tag(self.tokens)]
        for idx in xrange(len(self.tokens)):
            tok = self.tokens[idx]
            if self.nltkpostags[idx].startswith("V"):
                self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
            else:
                self.nltklemmas.append(lemmatizer.lemmatize(tok))
    except IndexError:
        print self.tokens
        print pos_tag(self.tokens)
    return True
def tag(self, tokens):
    """
    add pos tags to token objects

    :param tokens: list of token objects
    :type tokens: list(Token)
    :return: label augmented list of Token objects
    :rtype: list(Token)
    """
    tags = pos_tag([token.get_text() for token in tokens])
    for token, tag in zip(tokens, tags):
        token.add_a_label('pos', tag[1])
    return tokens
def pos(text):
    tokens = nltk.word_tokenize(text)
    wordpos = nltk.pos_tag(tokens)
    return wordpos
def __tagPartsOfSpeech(words):
    return [pair[1] for pair in nltk.pos_tag(words)]
def tag(text, tt_home):
    # Default NLTK's tokenizer
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # Default NLTK's POS tagger
    # ?
    # Use tagset='universal' for universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)

    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
def tag_one(self, text, tagset, **kwargs):
    """ POS-Tags the given text """
    # The tagset belongs to pos_tag, not to the tokenizer.
    return pos_tag(word_tokenize(text), tagset)