The following 10 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.WhitespaceTokenizer().
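Before the project examples, here is a minimal standalone sketch of what WhitespaceTokenizer does: it splits a string on runs of spaces, tabs, and newlines (keeping punctuation attached to words), and it can also report the character span of each token. The sample sentence is illustrative only.

    from nltk.tokenize import WhitespaceTokenizer

    # Illustrative input string (not taken from any of the projects below).
    text = "Good muffins cost $3.88\nin New York."

    tokenizer = WhitespaceTokenizer()

    # tokenize() returns the whitespace-separated tokens.
    print(tokenizer.tokenize(text))
    # ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']

    # span_tokenize() yields (start, end) character offsets for each token.
    print(list(tokenizer.span_tokenize(text)))
    # [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36)]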
def __init__(self, root, fileids, sep='/',
             word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding='latin1'):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
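In this first example (the constructor of NLTK's AlignedCorpusReader), WhitespaceTokenizer() is the default word_tokenizer, so words in an aligned corpus are split on whitespace unless the caller overrides it. A hedged sketch of how a caller might rely on, or replace, that default; the corpus path is hypothetical, so the read calls are left commented out.

    from nltk.corpus.reader.aligned import AlignedCorpusReader
    from nltk.tokenize import WordPunctTokenizer

    # Hypothetical corpus location; any directory of aligned-sentence files works.
    root = '/path/to/aligned/corpus'

    # Default behaviour: words are produced by WhitespaceTokenizer().
    reader = AlignedCorpusReader(root, r'.*\.txt')

    # Overriding the default with a different tokenizer, e.g. WordPunctTokenizer,
    # changes only how each line is segmented into words.
    reader_wp = AlignedCorpusReader(root, r'.*\.txt',
                                    word_tokenizer=WordPunctTokenizer())

    # Both readers expose the same API; only the word segmentation differs.
    # print(reader.words()[:10])
    # print(reader_wp.words()[:10])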
def preprocess(s, max_tokens):
    #s = unicode(s, ignore="errors")
    # Normalize case and strip non-ASCII characters.
    s = s.lower()
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    # Remove sentence-boundary markers left over from earlier processing.
    s = re.sub("<s>", "", s)
    s = re.sub("<eos>", "", s)
    s = remove_punctuation(s)
    # Mask digits and collapse newlines and commas into spaces.
    s = re.sub(r'\d', '#', s)
    s = re.sub('\n', ' ', s)
    s = re.sub(',', ' ', s)
    # Split on whitespace and truncate to at most max_tokens tokens.
    tokens = WhitespaceTokenizer().tokenize(s)
    #s = replace_the_unfrequent(tokens)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    s = " ".join(tokens)
    return s, len(tokens)
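A hedged usage sketch for the example above: remove_punctuation is not shown in the original snippet, so a simple stand-in is defined here, and the input string is invented for illustration.

    import re
    import string
    from nltk.tokenize import WhitespaceTokenizer

    def remove_punctuation(s):
        # Hypothetical stand-in: the project's real helper is not shown above.
        return s.translate(str.maketrans('', '', string.punctuation))

    # With the preprocess() definition above in scope:
    cleaned, n_tokens = preprocess("The <s> QUICK, brown fox <eos> has 4 legs!\n",
                                   max_tokens=5)
    print(cleaned)    # e.g. "the quick brown fox has"
    print(n_tokens)   # 5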
def tokenize(self, text):
    """
    Tokenize text into a list of Token objects.

    :param text: text to be tokenized (might contain several sentences)
    :type text: str
    :return: list of Token objects
    :rtype: list(Token)
    """
    tokens = []
    if self.tokenizer_type == "SpaceTokenizer":
        operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
        operator = WhitespaceTokenizer()
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "PTBTokenizer":
        ptb_tokens = word_tokenize(text)
        counter = 0
        for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
            new_token = Token(counter, token, span[0], span[1])
            counter += 1
            tokens.append(new_token)
    return tokens
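The NLTKWhiteSpaceTokenizer branch above relies on span_tokenize() to recover each token's character offsets, which is what lets the Token objects carry start and end positions. A minimal standalone sketch of that idea; the project's Token class is replaced here by a plain tuple.

    from nltk.tokenize import WhitespaceTokenizer

    text = "Colorless green ideas sleep furiously"
    operator = WhitespaceTokenizer()

    # Build (index, surface form, start, end) records, mirroring the Token fields.
    records = [
        (counter, text[start:end], start, end)
        for counter, (start, end) in enumerate(operator.span_tokenize(text))
    ]
    print(records)
    # [(0, 'Colorless', 0, 9), (1, 'green', 10, 15), (2, 'ideas', 16, 21),
    #  (3, 'sleep', 22, 27), (4, 'furiously', 28, 37)]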