Python nltk.tokenize 模块,WordPunctTokenizer() 实例源码


项目:AbTextSumm    作者:StevenLOL    | 项目源码 | 文件源码
def getredundantComponents(sentences, window_size=4):
    """
    Collect the opening, middle and closing word windows of each sentence.

    Every sentence is split with ``WordPunctTokenizer`` and three
    space-joined snippets are taken from the token list: the first
    ``window_size`` tokens, a window of ``window_size`` tokens centred on the
    midpoint, and the last ``window_size`` tokens.

    NOTE(review): the window width, the half-window span and the midpoint
    were not defined anywhere in the visible snippet; ``window_size=4``,
    ``window_size // 2`` and ``len(words) // 2`` are reconstructions --
    confirm against the upstream project.

    :param sentences: iterable of sentence strings.
    :param window_size: number of tokens per snippet; expected to be a
        positive integer.
    :return: three lists ``(introList, midlist, endlist)``, each holding one
        snippet per input sentence, in input order.
    :rtype: tuple(list(str), list(str), list(str))
    """
    # One tokenizer is enough; it carries no per-sentence state we rely on.
    tokenizer = WordPunctTokenizer()
    index_span = window_size // 2

    introList = []
    midlist = []
    endlist = []

    for sent in sentences:
        words = tokenizer.tokenize(sent)
        m_point = len(words) // 2
        # Clamp the start at 0: on short sentences a negative start would
        # wrap around and slice from the end of the list instead.
        mid_start = max(m_point - index_span, 0)

        introList.append(' '.join(words[:window_size]))
        midlist.append(' '.join(words[mid_start:m_point + index_span]))
        endlist.append(' '.join(words[-window_size:]))
    return introList, midlist, endlist
项目:minke    作者:DistrictDataLabs    | 项目源码 | 文件源码
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             encoding='utf8', word_tokenizer=None, sent_tokenizer=None,
             **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  ``root``, ``fileids``
        and ``encoding`` are passed to the ``CorpusReader`` constructor.

        :param root: corpus root, forwarded to ``CorpusReader``.
        :param fileids: file id pattern, forwarded to ``CorpusReader``.
        :param tags: stored as ``self._good_tags``; falls back to
            ``self.TAGS`` when falsy.
        :param encoding: file encoding, forwarded to ``CorpusReader``.
        :param word_tokenizer: object stored as ``self._word_tokenizer``;
            a fresh ``WordPunctTokenizer`` is created when omitted.
        :param sent_tokenizer: object stored as ``self._sent_tokenizer``.
        :param kwargs: categorization arguments (keys starting with
            ``cat_``).
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # The keyword dict is handed over as a single positional argument,
        # not unpacked.
        # NOTE(review): this matches NLTK's CategorizedCorpusReader, which
        # expects the dict itself -- confirm for the version in use.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Built per instance rather than in the signature so that readers
        # never share one default tokenizer object.
        if word_tokenizer is None:
            word_tokenizer = WordPunctTokenizer()

        self._word_tokenizer = word_tokenizer
        # NOTE(review): no default sentence tokenizer is visible in this
        # snippet, so None is stored when the caller does not supply one --
        # confirm the intended default upstream.
        self._sent_tokenizer = sent_tokenizer
        self._good_tags = tags or self.TAGS
项目:Price-Comparator    作者:Thejas-1    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:neighborhood_mood_aws    作者:jarrellmark    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:hate-to-hugs    作者:sdoran35    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:FancyWord    作者:EastonLee    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:beepboop    作者:nicolehe    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:kind2anki    作者:prz3m    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:but_sentiment    作者:MixedEmotions    | 项目源码 | 文件源码
def words(self, fileid=None):
        """
        Return all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml()
        method, fileid can only specify one file.

        :param fileid: identifier of the single file to read; forwarded
            unchanged to ``self.xml()`` and ``self.encoding()``.
        :return: the given file's text nodes as a list of words and
            punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        # The tokenizer has to be created here; nothing else in this method
        # binds the name.
        word_tokenizer = WordPunctTokenizer()
        out = []

        # Element.getiterator() was removed in Python 3.9; iter() yields the
        # same element-and-descendants sequence.
        # NOTE(review): assumes self.xml() returns an ElementTree element.
        for node in elt.iter():
            text = node.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            # Accumulate the tokens; without this the method always returns
            # an empty list.
            out.extend(word_tokenizer.tokenize(text))
        return out
项目:adaware-nlp    作者:mhw32    | 项目源码 | 文件源码
def __init__(self):
        """Create a ``WordPunctTokenizer`` and keep it as ``self.model``."""
        word_punct_tokenizer = WordPunctTokenizer()
        self.model = word_punct_tokenizer
项目:deep-mlsa    作者:spinningbytes    | 项目源码 | 文件源码
def __init__(self):
        """
        Set up the per-language tokenizer table and the active tokenizer.

        ``self.tokenizers`` maps a language code to a tokenizer instance:
        ``'en'`` gets a ``TweetTokenizer``, while ``'de'``, ``'it'``, ``'fr'``
        and the ``'default'`` entry each get their own
        ``WordPunctTokenizer``. ``self.tokenizer`` starts out as a separate
        ``TweetTokenizer``.
        """
        self.tokenizers = {
            'en': TweetTokenizer(),
            'de': WordPunctTokenizer(),
            'it': WordPunctTokenizer(),
            'fr': WordPunctTokenizer(),
            'default': WordPunctTokenizer(),
        }

        self.tokenizer = TweetTokenizer()