The code examples below, extracted from open-source Python projects, illustrate how to use nltk.tokenize.TreebankWordTokenizer().
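For orientation, here is a minimal standalone sketch (not drawn from any of the projects below): instantiate the tokenizer once and apply its tokenize() method to a sentence.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# Penn Treebank conventions: contractions and punctuation are split off,
# e.g. "don't" -> ['do', "n't"] and "$3.88" -> ['$', '3.88'].
print(tokenizer.tokenize("They'll save and invest more, don't you think?"))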
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
    #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads
    #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
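The comment in the example above points at a practical distinction worth spelling out: nltk.word_tokenize() runs Punkt sentence splitting first and therefore needs the downloaded 'punkt' resource, while TreebankWordTokenizer.tokenize() is regex-only and works offline. A minimal sketch of that difference, using an invented sample sentence:

import nltk
from nltk.tokenize import TreebankWordTokenizer

s = "A sentence that should tokenize without any downloads."

# nltk.word_tokenize() first runs Punkt sentence splitting, so it raises a
# LookupError unless the 'punkt' resource has been fetched via nltk.download().
# nltk.word_tokenize(s)

# The Treebank tokenizer itself is just a set of regexes, so calling it
# directly works without any downloaded resources:
print(TreebankWordTokenizer().tokenize(s))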
def get(self, text=["medical"]): if type(text) == str: text = text.lower() text = TreebankWordTokenizer().tokenize(text) try: data = np.array(map(self.vocab.get, text)) return self.onehot(data), data except: unknowns = [] for word in text: if self.vocab.get(word) == None: unknowns.append(word) raise Exception(" [!] unknown words: %s" % ",".join(unknowns))
def __init__(self, word_tokenizer=TreebankWordTokenizer(),
             sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/english.pickle'),
             **kwargs):
    self._seq = MongoDBLazySequence(**kwargs)
    self._word_tokenize = word_tokenizer.tokenize
    self._sent_tokenize = sent_tokenizer.tokenize
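The word_tokenizer/sent_tokenizer keyword pattern above mirrors NLTK's own corpus readers. As a hedged illustration (the corpus directory and file pattern are placeholders, not part of the project above), PlaintextCorpusReader accepts a TreebankWordTokenizer in place of its default word tokenizer:

from nltk.corpus.reader import PlaintextCorpusReader
from nltk.tokenize import TreebankWordTokenizer

# 'corpus_dir' and the .txt pattern stand in for a local plain-text corpus.
reader = PlaintextCorpusReader('corpus_dir', r'.*\.txt',
                               word_tokenizer=TreebankWordTokenizer())
print(reader.words()[:20])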
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
def SentenceTokenize(self, text):
    tokens = TreebankWordTokenizer().tokenize(text)
    return tokens
def tokenize(sentence):
    "Tokenize sentence the way parser expects."
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character replacements
    s = ''.join(REPLACEMENTS_R.get(x, x) for x in s)
    return s
def fresh(self, s, tokenized=False):
    """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).

    Pass in `tokenized=True` if `s` has already been tokenized, otherwise we
    apply `nltk.tokenize.TreebankWordTokenizer`.

    """
    if self.process is None:
        self._start_subprocess()
    s = str(s.strip())
    if not tokenized:
        s = tokenize(s)
    s = s.strip()
    assert '\n' not in s, "No newline characters allowed %r" % s
    try:
        self.process.stdin.write(s.encode('utf-8'))
    except IOError as e:
        #if e.errno == 32:   # broken pipe
        #    self.process = None
        #    return self(s)  # retry will restart process
        raise e
    self.process.stdin.write(b'\n')
    self.process.stdin.flush()
    out = self.process.stdout.readline()
    if sys.version_info[0] == 3:
        out = out.decode()
    return self.to_ud(out)
def __prepare__(self):
    """
    """
    conversations = open(path.join(self.BASE_PATH, self.CONVS_FILE), 'r').readlines()
    movie_lines = open(path.join(self.BASE_PATH, self.LINES_FILE), 'r').readlines()

    tbt = TreebankWordTokenizer().tokenize

    self.words_set = set()
    self.lines_dict = {}
    for i, line in enumerate(movie_lines):
        parts = map(lambda x: x.strip(), line.lower().split(self.FILE_SEP))
        tokens = tbt(parts[-1])
        self.lines_dict[parts[0]] = tokens
        self.words_set |= set(tokens)

    self.word2idx = {}
    self.word2idx[self.PAD_TOKEN] = 0
    self.word2idx[self.EOS_TOKEN] = 1
    self.word2idx[self.GO_TOKEN] = 2
    for i, word in enumerate(self.words_set):
        self.word2idx[word] = i + 3

    self.idx2word = [0] * len(self.word2idx)
    for w, i in self.word2idx.items():
        self.idx2word[i] = w

    # extract pairs of lines in a conversation (s0, s1, s2) -> {(s0, s1), (s1, s2)}
    utt_pairs = []
    for line in conversations:
        parts = map(lambda x: x[1:-1],
                    map(lambda x: x.strip(), line.lower().split(self.FILE_SEP))[-1][1:-1].split(', '))
        utt_pairs += list(pairwise(parts))

    utt_pairs = np.random.permutation(utt_pairs)
    train_utt_pairs = utt_pairs[self.VAL_COUNT:]
    self.val_pairs = utt_pairs[:self.VAL_COUNT]

    def find_bucket(enc_size, dec_size, buckets):
        return next(dropwhile(lambda x: enc_size > x[0] or dec_size > x[1], buckets), None)

    for pair in train_utt_pairs:
        bckt = find_bucket(len(self.lines_dict[pair[0]]), len(self.lines_dict[pair[1]]), self.bucket_sizes)
        if bckt is None:
            self.bucket_pairs[(-1, -1)].append(pair)
        else:
            self.bucket_pairs[bckt].append(pair)

    self.bucket_ordering = []
    for bckt, _ in sorted(map(lambda x: (x[0], len(x[1])), self.bucket_pairs.items()),
                          key=lambda x: x[1], reverse=True):
        self.bucket_ordering.append(bckt)
def transform(self, X, **transform_params):
    # sparse matrix with occurrences, n x m
    #   n : number of docs
    #   m : size of lexicon
    features = np.empty((len(X), len(self.lexicon)))
    for docid, doc in enumerate(X):
        if self.preprocessor is not None:
            doc = self.preprocessor(doc)
        tokens = TreebankWordTokenizer().tokenize(doc)
        bigrams = [" ".join(i) for i in ngrams(tokens, 2)]
        doctokens = tokens + bigrams

        tokencounts = Counter(doctokens)

        match = set(tokencounts.keys()) & set(self.lexicon["ngram"])
        if len(match) > 0:
            # occurrences vector
            occurrences = self.lexicon["ngram"].map(lambda w: w in match)
            ovec = csr_matrix(occurrences)
            # polarity vector
            pvec = csr_matrix(self.lexicon["polarity"])
            # counts vector
            counts = self.lexicon["ngram"].map(lambda w: tokencounts[w] if w in match else 0)
            cvec = csr_matrix(counts)

            if self.polarity:
                if self.weightedcount:
                    vector = ovec.multiply(pvec).multiply(cvec)
                else:
                    vector = ovec.multiply(pvec)
            else:
                if self.weightedcount:
                    vector = ovec.multiply(cvec)
                else:
                    vector = ovec
            vector = vector.todense()
        else:
            # can't skip because np.empty is > 0
            vector = np.zeros(len(self.lexicon))
        features[docid] = vector
    return csr_matrix(features)