We extracted the following 12 code examples from open source Python projects to illustrate how to use nltk.tokenize.WordPunctTokenizer().
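Before the project examples, here is a minimal, self-contained usage sketch (the sample sentence is invented for illustration). WordPunctTokenizer splits text into runs of alphabetic and non-alphabetic characters, so punctuation becomes separate tokens:

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']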
from nltk.tokenize import WordPunctTokenizer

def getredundantComponents(sentences):
    window_size = 4
    introList = []
    midlist = []
    endlist = []
    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)
        # f_point is computed but not used in the original snippet.
        f_point = length_sent // 3
        m_point = length_sent // 2
        index_span = window_size // 2
        # First window_size tokens, the tokens around the midpoint, and the last window_size tokens.
        intro = ' '.join(word for word in words[0:window_size])
        mid = ' '.join(word for word in words[m_point - index_span:m_point + index_span])
        end = ' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist
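A hypothetical call of the function above (the sample sentences are invented for illustration):

sentences = [
    "The quick brown fox jumps over the lazy dog near the river bank.",
    "Extractive summarizers often reuse the opening words of each sentence verbatim.",
]
intros, mids, ends = getredundantComponents(sentences)
# intros holds the first four tokens of each sentence,
# mids the four tokens around the sentence midpoint,
# and ends the last four tokens.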
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- ie, tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    iterator = elt.getiterator()
    out = []

    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)
    return out
def __init__(self):
    self.model = WordPunctTokenizer()
def __init__(self):
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }

    self.tokenizer = TweetTokenizer()
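The snippet above only builds the per-language mapping; one plausible way to use it is sketched below (the method name and the fallback behaviour are assumptions, not taken from the source project):

def tokenize(self, text, lang='default'):
    # Fall back to the 'default' tokenizer for languages without an entry.
    tokenizer = self.tokenizers.get(lang, self.tokenizers['default'])
    return tokenizer.tokenize(text)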