Python nltk.tokenize module: TreebankWordTokenizer() example source code

The following 10 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.TreebankWordTokenizer().
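
Before the project examples, here is a minimal standalone sketch of what the tokenizer does: it applies the Penn Treebank regex rules, so contractions and punctuation are split off as separate tokens, and no corpora or models need to be downloaded. Exact output may vary slightly between nltk versions.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# Penn Treebank rules: contractions and punctuation become separate tokens.
print(tokenizer.tokenize("They'll save and invest more."))
# ['They', "'ll", 'save', 'and', 'invest', 'more', '.']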

Project: phrasemachine    Author: slanglab
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html
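
The comment in the snippet above is the main point of this example: nltk.word_tokenize runs the Punkt sentence splitter first, which requires downloaded nltk_data, while TreebankWordTokenizer is regex-only. A small sketch of the distinction (the download call is only needed for the word_tokenize path):

import nltk
from nltk.tokenize import TreebankWordTokenizer

text = "Dr. Smith isn't here."

# Regex-only; works without any nltk_data downloads.
print(TreebankWordTokenizer().tokenize(text))

# nltk.word_tokenize sentence-splits first, so it needs the Punkt model:
# nltk.download('punkt')   # 'punkt_tab' on newer nltk releases
# print(nltk.word_tokenize(text))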
Project: variational-text-tensorflow    Author: carpedm20
def get(self, text=["medical"]):
    if type(text) == str:
      text = text.lower()
      text = TreebankWordTokenizer().tokenize(text)

    try:
      data = np.array(map(self.vocab.get, text))
      return self.onehot(data), data
    except:
      unknowns = []
      for word in text:
        if self.vocab.get(word) == None:
          unknowns.append(word)
      raise Exception(" [!] unknown words: %s" % ",".join(unknowns))
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing
def __init__(self, word_tokenizer=TreebankWordTokenizer(),
                 sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/english.pickle'),
                 **kwargs):
        self._seq = MongoDBLazySequence(**kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize
Project: Search-Engine    Author: SoufianEly
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
Project: BioNLP-2016    Author: cambridgeltl
def SentenceTokenize(self, text):
        tokens = TreebankWordTokenizer().tokenize(text)

        return tokens
Project: variational_inference    Author: carpeanon
def get(self, text=["medical"]):
    if type(text) == str:
      text = text.lower()
      text = TreebankWordTokenizer().tokenize(text)

    try:
      data = np.array(map(self.vocab.get, text))
      return self.onehot(data), data
    except:
      unknowns = []
      for word in text:
        if self.vocab.get(word) == None:
          unknowns.append(word)
      raise Exception(" [!] unknown words: %s" % ",".join(unknowns))
Project: PredPatt    Author: hltcoe
def tokenize(sentence):
    "Tokenize sentence the way parser expects."
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character replacements
    s = ''.join(REPLACEMENTS_R.get(x,x) for x in s)
    return s
Project: PredPatt    Author: hltcoe
def fresh(self, s, tokenized=False):
        """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).

        Pass in `tokenized=True` if `s` has already been tokenized, otherwise we
        apply `nltk.tokenize.TreebankWordTokenizer`.

        """
        if self.process is None:
            self._start_subprocess()
        s = str(s.strip())
        if not tokenized:
            s = tokenize(s)
        s = s.strip()
        assert '\n' not in s, "No newline characters allowed %r" % s
        try:
            self.process.stdin.write(s.encode('utf-8'))
        except IOError as e:
            #if e.errno == 32:          # broken pipe
            #    self.process = None
            #    return self(s)  # retry will restart process
            raise e
        self.process.stdin.write(b'\n')
        self.process.stdin.flush()
        out = self.process.stdout.readline()
        if sys.version_info[0] == 3:
            out = out.decode()
        return self.to_ud(out)
Project: seq2seq-lasagne    Author: erfannoury
def __prepare__(self):
        """

        """
        conversations = open(path.join(self.BASE_PATH, self.CONVS_FILE), 'r').readlines()
        movie_lines = open(path.join(self.BASE_PATH, self.LINES_FILE), 'r').readlines()
        tbt = TreebankWordTokenizer().tokenize
        self.words_set = set()
        self.lines_dict = {}
        for line in movie_lines:
            # list comprehension (rather than map) so the parts can be indexed under Python 3
            parts = [part.strip() for part in line.lower().split(self.FILE_SEP)]
            tokens = tbt(parts[-1])
            self.lines_dict[parts[0]] = tokens
            self.words_set |= set(tokens)
        self.word2idx = {}
        self.word2idx[self.PAD_TOKEN] = 0
        self.word2idx[self.EOS_TOKEN] = 1
        self.word2idx[self.GO_TOKEN] = 2
        for i, word in enumerate(self.words_set):
            self.word2idx[word] = i + 3
        self.idx2word = [0] * len(self.word2idx)
        for w, i in self.word2idx.items():
            self.idx2word[i] = w

        # extract pairs of lines in a conversation (s0, s1, s2) -> {(s0, s1), (s1, s2)}
        utt_pairs = []
        for line in conversations:
            fields = [part.strip() for part in line.lower().split(self.FILE_SEP)]
            # the last field is a string like "['l194', 'l195', ...]"; strip brackets and quotes
            parts = [utt[1:-1] for utt in fields[-1][1:-1].split(', ')]
            utt_pairs += list(pairwise(parts))
        utt_pairs = np.random.permutation(utt_pairs)
        train_utt_pairs = utt_pairs[self.VAL_COUNT:]
        self.val_pairs = utt_pairs[:self.VAL_COUNT]

        def find_bucket(enc_size, dec_size, buckets):
            return next(dropwhile(lambda x: enc_size > x[0] or dec_size > x[1], buckets), None)

        for pair in train_utt_pairs:
            bckt = find_bucket(len(self.lines_dict[pair[0]]), len(self.lines_dict[pair[1]]), self.bucket_sizes)
            if bckt is None:
                self.bucket_pairs[(-1, -1)].append(pair)
            else:
                self.bucket_pairs[bckt].append(pair)

        self.bucket_ordering = []
        for bckt, _ in sorted(map(lambda x: (x[0], len(x[1])), self.bucket_pairs.items()), key=lambda x: x[1], reverse=True):
            self.bucket_ordering.append(bckt)
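
The pairing step above relies on a pairwise helper (and itertools.dropwhile) imported elsewhere in the project and not shown in this snippet. Assuming it follows the standard itertools recipe, it would look like this:

from itertools import tee

def pairwise(iterable):
    # itertools recipe: s -> (s0, s1), (s1, s2), (s2, s3), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)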
Project: hatespeech    Author: lukovnikov
def transform(self, X, **transform_params):

        # n x m occurrence matrix (built densely here, returned as a csr_matrix)
        #   n: number of docs
        #   m: size of the lexicon
        features = np.empty((len(X), len(self.lexicon)))

        for docid,doc in enumerate(X):
            if self.preprocessor is not None:
                doc = self.preprocessor(doc)

            tokens = TreebankWordTokenizer().tokenize(doc)
            bigrams = [" ".join(i) for i in ngrams(tokens,2)]
            doctokens = tokens + bigrams

            tokencounts = Counter(doctokens)            
            match = set(tokencounts.keys()) & set(self.lexicon["ngram"])

            if len(match) > 0:
                # occurrence vector: which lexicon ngrams appear in the document
                occurrences = self.lexicon["ngram"].map(lambda w: w in match)
                ovec = csr_matrix(occurrences)
                # polarity vector
                pvec = csr_matrix(self.lexicon["polarity"])
                # count vector: how often each matched ngram occurs
                counts = self.lexicon["ngram"].map(lambda w: tokencounts[w] if w in match else 0)
                cvec = csr_matrix(counts)

                if self.polarity:
                    if self.weightedcount:
                        vector = ovec.multiply(pvec).multiply(cvec)
                    else:
                        vector = ovec.multiply(pvec)
                else:
                    if self.weightedcount:
                        vector = ovec.multiply(cvec)
                    else:
                        vector = ovec
                vector = vector.todense()
            else:
                # np.empty leaves arbitrary values, so the row must be zeroed explicitly
                vector = np.zeros(len(self.lexicon))

            features[docid] = vector

        return csr_matrix(features)
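
For context, the document representation that transform() matches against the lexicon is simply the Treebank tokens of a document plus their space-joined bigrams. Shown standalone below with a made-up sample document (the lexicon matching itself depends on the class's self.lexicon DataFrame, which is not part of this snippet):

from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams

doc = "I really don't like this movie"   # hypothetical sample document
tokens = TreebankWordTokenizer().tokenize(doc)
bigrams = [" ".join(b) for b in ngrams(tokens, 2)]
tokencounts = Counter(tokens + bigrams)  # unigram + bigram counts, as in transform()
print(tokencounts)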