We extracted the following 12 code examples from open source Python projects to illustrate how to use nltk.tokenize.WordPunctTokenizer().
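Before the project examples, here is a minimal, self-contained usage sketch (the sample sentence is invented for illustration). WordPunctTokenizer splits text into runs of alphabetic and non-alphabetic characters, so punctuation becomes separate tokens:

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']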
from nltk.tokenize import WordPunctTokenizer

def getredundantComponents(sentences):
    window_size = 4
    introList = []
    midlist = []
    endlist = []
    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)
        # f_point is computed but not used in the original snippet.
        f_point = length_sent // 3
        m_point = length_sent // 2
        index_span = window_size // 2
        # First window_size tokens, the tokens around the midpoint, and the last window_size tokens.
        intro = ' '.join(word for word in words[0:window_size])
        mid = ' '.join(word for word in words[m_point - index_span:m_point + index_span])
        end = ' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist
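A hypothetical call of the function above (the sample sentences are invented for illustration):

sentences = [
    "The quick brown fox jumps over the lazy dog near the river bank.",
    "Extractive summarizers often reuse the opening words of each sentence verbatim.",
]
intros, mids, ends = getredundantComponents(sentences)
# intros holds the first four tokens of each sentence,
# mids the four tokens around the sentence midpoint,
# and ends the last four tokens.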
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- ie, tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    iterator = elt.getiterator()
    out = []

    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)
    return out
def __init__(self):
    self.model = WordPunctTokenizer()
def __init__(self):
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }

    self.tokenizer = TweetTokenizer()
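The snippet above only builds the per-language mapping; one plausible way to use it is sketched below (the method name and the fallback behaviour are assumptions, not taken from the source project):

def tokenize(self, text, lang='default'):
    # Fall back to the 'default' tokenizer for languages without an entry.
    tokenizer = self.tokenizers.get(lang, self.tokenizers['default'])
    return tokenizer.tokenize(text)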