The following code examples, extracted from open-source Python projects, illustrate how to use nltk.pos_tag_sents().
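Before the project examples, here is a minimal sketch of the call pattern they all share: sentence-tokenize the text, word-tokenize each sentence, then tag every sentence in a single call. The sample text and variable names are illustrative only, and the punkt and averaged_perceptron_tagger resources are assumed to be downloaded.

import nltk

text = "NLTK ships a sentence tokenizer. It also ships several part-of-speech taggers."
sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
tagged_sents = nltk.pos_tag_sents(sentences)
# tagged_sents holds one list per sentence; each list contains (word, tag) tuples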
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    '''
    Extract text matching a specific POS sequence rather than just noun phrases.
    '''
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
import itertools, nltk, string


def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives followed by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)

    # Get grammatical functions of words
    # What this is doing: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # Join phrases based on IOB syntax.
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O')
                  if key]

    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))

    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words and not all(c in punct for c in l), candidates))

    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]

    return candidates
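A possible way to exercise the extract_chunks example above; the input sentence is made up, and the punkt, averaged_perceptron_tagger, stopwords and wordnet data are assumed to be downloaded:

text = "Deep convolutional networks have improved image classification accuracy."
print(extract_chunks(text, max_words=3))
# prints a list of lower-cased candidate keyphrases matched by the grammar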
def tag_sentences(sentences, pos_symbol=False):
    # tokenizer, tagger, POS_TAGS, _IDX_WORD and _IDX_SYMBOL are defined elsewhere
    # in the source module (tagger presumably wraps nltk.pos_tag_sents)
    tokenized = []
    for sent in sentences:
        tokenized.append(tokenizer(sent))
    processed_list = tagger(tokenized)
    if not pos_symbol:
        output_list = []
        for sentence in processed_list:
            new_sentence = []
            for word in sentence:
                new_sentence.append((word[_IDX_WORD], POS_TAGS[word[_IDX_SYMBOL]]))
            output_list.append(new_sentence)
    else:
        output_list = processed_list
    return output_list
def tag_many(self, documents, tagset=None, **kwargs):
    """ POS-Tag many documents. """
    return pos_tag_sents((word_tokenize(d) for d in documents), tagset)
def extract_candidate_words(text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))

    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates
import itertools, nltk


def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):
    # stopword_list is defined at module level in the source project
    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)

    for sentence in sentences:
        tagged_sents = nltk.pos_tag_sents([nltk.word_tokenize(sentence)])
        chunks = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
        wtc_sents = [nltk.chunk.tree2conlltags(chunk) for chunk in chunks]
        flattened_chunks = list(itertools.chain.from_iterable(wtc_sent for wtc_sent in wtc_sents))

        valid_chunks_tagged = [(status, [wtc for wtc in chunk])
                               for status, chunk in itertools.groupby(flattened_chunks,
                                                                      lambda wpc: wpc[2] != 'O')]
        valid_chunks = [' '.join(word.lower() for word, tag, chunk in wtc_group
                                 if word.lower() not in stopword_list)
                        for status, wtc_group in valid_chunks_tagged if status]

        all_chunks.append(valid_chunks)

    return all_chunks
def tokenize(str_stream, eos=True, remove_punct=False):
    """
    Given a str or str_stream (f.read()), convert the str to a list of sentences,
    e.g.: [[word, word], [word, word, ...], ...]

    :param str_stream: a str or a str stream
    :param eos: whether to turn '.' into the <eos> tag
    :param remove_punct: whether to remove punctuation: ':', ';', '--', ',', "'"
    :return: a list of sentences (each a list of word strings) and a parallel list of POS-tag tuples
    """
    # lazy import because importing nltk is very slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punkt resource not found, using nltk.download("punkt") to download resource data...')
        nltk.download('punkt')

    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]

    # get POS tags
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)

    # tag numbers (isfloat is defined elsewhere in the source module)
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]

    if eos:
        for token in tokens:
            token[-1] = '<eos>'

    if remove_punct:
        # __punct_set is defined at module level in the source project
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]

    return tokens, pos_tags
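The tagset='universal' argument used in the example above maps the default Penn Treebank tags onto NLTK's coarse universal tags (NOUN, VERB, ADJ, ADV, and so on). A quick, illustrative comparison of the two, assuming the punkt, averaged_perceptron_tagger and universal_tagset resources are downloaded (the sample sentence is made up):

import nltk

sent = [nltk.word_tokenize("The cat sat quietly.")]
print(nltk.pos_tag_sents(sent))                      # Penn Treebank tags such as 'DT', 'NN', 'VBD'
print(nltk.pos_tag_sents(sent, tagset='universal'))  # coarse tags such as 'DET', 'NOUN', 'VERB'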
from itertools import chain, groupby

from nltk import RegexpParser, pos_tag_sents
from nltk.chunk import tree2conlltags
from nltk.tokenize import sent_tokenize, word_tokenize


def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'
    remove_punctuation: bool, whether to strip punctuation from each sentence first

    Returns
    -------
    candidates: list, list of candidate words
    """
    # punct_re and stop_words are defined at module level in the source project
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable([tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")

    return candidates
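A sketch of how the last example might be driven, assuming the module-level punct_re and stop_words it relies on are in place; the input string is invented:

doc = "Topic models discover latent topics in large document collections."
print(generate_candidate(doc, method='word'))    # individual noun/adjective candidates
print(generate_candidate(doc, method='phrase'))  # multi-word keyphrase candidates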