The following code examples, extracted from open-source Python projects, illustrate how to use nltk.word_tokenize().
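Before the project excerpts, here is a minimal standalone sketch of the basic call (the sample sentence and the printed output are only illustrative): nltk.word_tokenize() splits a string into word and punctuation tokens, and it relies on NLTK's Punkt tokenizer models, which may need to be downloaded once.

import nltk

# word_tokenize() depends on the Punkt tokenizer models; download them once
# if they are not already installed (newer NLTK releases may also require 'punkt_tab').
nltk.download('punkt')

text = "Bob dropped the apple. Where is the apple?"
tokens = nltk.word_tokenize(text)
print(tokens)
# ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']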
def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word to ensure that punctuation
    is caught as its own token
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'intern' == token:
                token = ''
            if 'student' == token:
                token = ''
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk
def get_sentence_tokens(text):
    '''
    Given a text (review), return the token list of each sentence
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)
    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        sent_token = [token for token in sent_token if ((not token.strip() == '') and (token not in stopwords))]
        sent_tokens.append(sent_token)
    # remove stop words and short tokens
    # stemming, experiment shows that stemming works nothing...
    # if (stemming):
    #     stemmer = PorterStemmer()
    #     texts = [[stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
def createbigramvocabulary(reviewfile, vocabfile):
    createvocabulary(reviewfile, vocabfile)
    finput = open(reviewfile, "r")
    foutput = open(vocabfile, "a")

    all_bigrams = []
    for line in finput:
        tokenized_line = []
        tokenized_line.append('*')
        tokenized_line.extend(word_tokenize(line[1:]))
        tokenized_line.append('$')
        bgrms = bigrams(tokenized_line)
        all_bigrams.extend(bgrms)

    c = Counter(all_bigrams)
    for b in c:
        if (b[0] != "+" and b[0] != "-" and c[b] >= 3):
            foutput.write(b[0] + " " + b[1] + "\n")
    finput.close()
    foutput.close()
def word_count(message, word):
    """
    Computes the number of times a word appears in a message (case-insensitive).

    Args:
        message: A Message object.
        word: A string with no spaces.

    Returns:
        An int representing the number of times word (case-insensitive) appears
        among the word tokens of message's text.
    """
    if ' ' in word:
        raise ValueError('word cannot contain spaces')
    lowercase_tokens = [token.lower() for token in nltk.word_tokenize(message.text)]
    return lowercase_tokens.count(word.lower())
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV here because there are not too many words in dataset
    word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v: k for k, v in word2idx.items()}
    return word2idx, idx2word
def tokenizeVal(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    tokenizedSent = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]
    tokenIdx2CharIdx = [None] * len(tokenizedSent)
    idx = 0
    token_idx = 0
    while idx < len(sent) and token_idx < len(tokenizedSent):
        word = tokenizedSent[token_idx]
        if sent[idx:idx + len(word)] == word:
            tokenIdx2CharIdx[token_idx] = idx
            idx += len(word)
            token_idx += 1
        else:
            idx += 1
    return tokenizedSent, tokenIdx2CharIdx
def tokenize_text(sample_text):
    global sequence_lengths
    processed_text = []

    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text

    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')

    if cfg.remove_stopwords:
        tokens = [w for w in tokens if w not in stopwords.words('english')]

    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)

    return processed_text
def process_imdb(fname, setting):
    labels, sentences = [], []
    filename = setting + ".csv"
    quota = [0, 0]
    if setting == 'test':
        maxquota = 5000
    else:
        maxquota = 15000
    with open(os.path.join(fname, filename), 'rb') as f:
        csvreader = csv.reader(f)
        for line in csvreader:
            label = 0 if line[0] == "1" else 1
            quota[label] += 1
            if quota[label] > maxquota:
                continue
            sentence = line[2].replace("\"", "")
            text = nltk.word_tokenize(sentence.decode('utf-8'))
            labels.append(int(label))
            sentences.append(text)
    return sentences, labels
def tokenize(self, sentence):
    """
    Given a string, tokenize it into words (with the conventional notion of word).

    Parameters
    ----------
    sentence: str
        The string to tokenize.

    Returns
    -------
    tokenized_sentence: List[str]
        The tokenized representation of the string, as a list of tokens.
    """
    return nltk.word_tokenize(sentence.lower())
def add(self, filename, document):
    """
    Add a document string to the index.
    """
    # You can uncomment the following line to see the words found in each
    # image.
    # print("Words found in %s: %s" % (filename, document))
    for token in [t.lower() for t in nltk.word_tokenize(document)]:
        if token in self.stopwords:
            continue
        if token in ['.', ',', ':', '']:
            continue
        if self.stemmer:
            token = self.stemmer.stem(token)
        # Add the filename to the set associated with the token.
        self.redis_token_client.sadd(token, filename)
    # store the 'document text' for the filename.
    self.redis_docs_client.set(filename, document)
def _identify_pronoun(self, answer):
    """Calculate percentage of pronouns within answer

    - Args:
        answer(str): answer text
    - Returns:
        percentage(float): ratio of pronouns in answer
    """
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    # init variables
    num_pronouns = 0
    num_terms = len(post)
    percentage = 0
    for k, v in post:
        if v in pronoun_list:
            num_pronouns += 1
    percentage = float(num_pronouns) / num_terms
    return percentage
def _identify_pronoun2(self, sentence):
    """Calculate percentage of pronouns in the sentence that are in the answer

    - Args:
        sentence(str): question sentence
    - Returns:
        pronoun_in_sentence(list): pronouns in sentence
        sentence_len(int): length of current sentence
    """
    text = nltk.word_tokenize(sentence)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    pronoun_in_sentence = []
    sentence_len = len(post)
    for k, v in post:
        if v in pronoun_list:
            pronoun_in_sentence.append(k)
    return pronoun_in_sentence, sentence_len
def _first_tagger_after_answer_span(self, question):
    """Get the first tagger after answer span

    - Args:
        question(string): string of current question
    - Returns:
        tagger(string): tagger of first term after span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == '_____':
            index = idx + 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def _first_tagger_before_answer_span(self, question):
    """Get the first tagger before answer span

    - Args:
        question(string): string of current question
    - Returns:
        tagger(string): tagger of first term before span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == "_____":
            index = idx - 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("env_data", type=str,
                        help="Generated environment data filename in JSON format")
    args = parser.parse_args()

    print("-- Initialized environment")

    env = SquadEnv(args.env_data)

    context, question = env.reset()
    done = False
    while not done:
        print("Context ids: {}".format(context))
        print("Question ids: {}".format(question))
        print("Context tokens: {}".format(ids2tokens(context, env.id2token)))
        print("Question tokens: {}".format(ids2tokens(question, env.id2token)))

        answer_tokens = tokens2ids(word_tokenize(input("Answer: ")) + ["#eos#"], env.token2id)

        question_reward = 0
        for token in answer_tokens:
            (context, question), reward, done, _ = env.step(token)
            question_reward += reward

        print("You got {} reward".format(question_reward))
def LemNormalize(text):
    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # remove short words (3 characters or fewer)
    tokenized = [w for w in tokenized if len(w) > 3]

    tokenizer = LemTokens(tokenized)
    return tokenizer
def LemNormalizeIt(text):
    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # apply lemmatization with Morph-it
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]

    return tokenized
def tag(self, lines):
    '''
    Tokenize and categorise the words in the collection of text

    :param lines: The list of strings with the text to match
    :type lines: ``list`` of ``str``

    :rtype: :class:
    :return:
    '''
    try:
        tokenized_words = nltk.word_tokenize(lines)
        return nltk.pos_tag(tokenized_words)
    except LookupError as le:
        print("Run install_words.py first")
        raise le
def _generate_candidate_keywords(self, sentences, max_length=3):
    """Creates a list of candidate keywords, or phrases of at most max_length words,
    from a set of sentences"""
    phrase_list = []
    for sentence in sentences:
        words = map(lambda x: "|" if x in self.stopwords else x,
                    nltk.word_tokenize(sentence.lower()))
        phrase = []
        for word in words:
            if word == "|" or is_punctuation(word):
                if len(phrase) > 0:
                    if len(phrase) <= max_length:
                        phrase_list.append(phrase)
                    phrase = []
            else:
                phrase.append(word)
    return phrase_list
def get_tokenizer(name, lowercase):
    if name == 'char':
        if lowercase:
            return (lambda s: list(s.strip().lower()))
        else:
            return (lambda s: list(s.strip()))
    elif (name == 'space') or (name == 'bpe'):
        if lowercase:
            return (lambda s: s.lower().split())
        else:
            return str.split
    elif name == 'word':
        if lowercase:
            return (lambda s: word_tokenize(s.lower()))
        else:
            return word_tokenize
    else:
        raise ValueError('Unknown tokenizer: "%s"' % name)
def _set_tokenizer(self, tokenizer):
    """
    Set tokenizer

    :param tokenizer: tokenization method
    :return: None
    """
    if tokenizer == "nltk":
        self.tokenizer = nltk.word_tokenize
    elif tokenizer == "spacy":
        spacy_en = spacy.load("en")

        def spacy_tokenizer(seq):
            return [w.text for w in spacy_en(seq)]

        self.tokenizer = spacy_tokenizer
    else:
        raise ValueError("Invalid tokenizing method %s" % tokenizer)
def map_coocurence(context_size, data):
    coocurrence_list = []
    try:
        if detect(data) == 'en':
            region = nltk.word_tokenize(data)
            for l_context, word, r_context in _context_windows(region, context_size, context_size):
                if isWord(word):
                    for i, context_word in enumerate(l_context[::-1]):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
                    for i, context_word in enumerate(r_context):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
    except LangDetectException:
        return coocurrence_list
    return coocurrence_list
def from_sentence(sent):
    tokens = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokens)
    dg = DependencyGraph()
    for (index, (word, tag)) in enumerate(tagged):
        dg.nodes[index + 1] = {
            'word': word,
            'lemma': '_',
            'ctag': tag,
            'tag': tag,
            'feats': '_',
            'rel': '_',
            'deps': defaultdict(),
            'head': '_',
            'address': index + 1,
        }
    dg.connect_graph()
    return dg
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist

    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)

    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
def train(self, chain_len=None):
    """ Trains the markov data structure by creating chains of desired length """
    if not chain_len:
        chain_len = self.CHAIN_LENGTH
    self.CHAIN_LEN = chain_len

    self.everything['corpus'] = {}
    self.corpus = self.everything['corpus']

    for f in self.everything['input']:
        for line in sent_tokenize(self.everything['input'][f]):
            words = word_tokenize(line)
            for chain in self._make_chains(words):
                k = " ".join(chain[:-1])  # key is everything but last word
                v = chain[-1]             # value is last word
                try:
                    self.corpus[k].append(v)
                except:
                    self.corpus[k] = [v]
def parse_gender(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]
        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
def _convert_obj(self, obj):
    pre_sentence = obj['sentence1']
    hyp_sentence = obj['sentence2']
    if self.lower:
        pre_sentence = pre_sentence.lower()
        hyp_sentence = hyp_sentence.lower()
    pre_words = word_tokenize(pre_sentence)
    hyp_words = word_tokenize(hyp_sentence)
    pre = [self.word_vocab.word_to_id(w) for w in pre_words]
    hyp = [self.word_vocab.word_to_id(w) for w in hyp_words]
    pre_length = len(pre)
    hyp_length = len(hyp)
    label = obj['gold_label']
    if len(pre) > self._max_length or len(hyp) > self._max_length:
        return None
    if label == '-':
        return None
    label = self.label_vocab.word_to_id(label)
    return pre, hyp, pre_length, hyp_length, label
def tokenize_me(file_text):
    # firstly let's apply nltk tokenization
    tokens = nltk.word_tokenize(file_text)

    # let's delete punctuation symbols
    tokens = [i for i in tokens if i not in string.punctuation]

    # deleting stop_words
    tokens = [i for i in tokens if i not in stop_words]

    # cleaning words
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    tokens = [stemmer.stem(i) for i in tokens]

    return set(tokens)
def tokenize_sentences(self):
    # tokenize the sentences into words and count the word frequencies
    # get most common words, build index_to_word and word_to_index vectors
    self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in self.sentences]
    word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
    print("Found %d unique word tokens." % len(word_freq.items()))

    vocab = word_freq.most_common(self.vocabulary_size - 1)
    self.index_to_word = [x[0] for x in vocab]
    self.index_to_word.append(self.unknown_token)
    self.word_to_index = dict(
        [(w, i) for i, w in enumerate(self.index_to_word)])

    print("Using vocabulary size %d." % self.vocabulary_size)
    print(
        "The least frequent word is '%s' appearing %d times." % (
            vocab[-1][0], vocab[-1][1]))

    # replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(self.tokenized_sentences):
        self.tokenized_sentences[i] = [
            w if w in self.word_to_index else self.unknown_token for w in sent]
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())


###############################################################################
# Train
def get_review_sentences():
    '''
    Read the yelp review and return after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []

    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n', '').lower()
        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
def createTrainingList(reviewLst):
    sds = SupervisedDataSet(100, 1)
    for review in reviewLst:
        revString = unicode(review[1], errors='ignore')
        revSentences = nltk.word_tokenize(revString.strip())
        revWords = []
        for i in revSentences:
            revWords += i.lower().split()
        vec = 0
        for i in revWords:
            try:
                vec += model[i] / 2
            except:
                pass
        vec = vec / len(revWords)
        sds.addSample(vec, review[0])
    net = buildNetwork(100, 20, 1, hiddenclass=TanhLayer, outclass=SoftmaxLayer, bias=True)
    trainer = BackpropTrainer(net, sds)
    print "Error score:", trainer.train()
    print trainer.trainUntilConvergence(verbose=True, maxEpochs=100)
def token_func(input_string):
    tokens = nltk.word_tokenize(input_string)
    long_tokens = []
    refined_tokens = []
    # lemmatized_tokens = []
    stopwordlist = get_stopwordlist("../data/first_stopwordlist.txt")
    regex = re.compile('[^1-9a-zA-Z]')

    for token in tokens:
        token = regex.sub('', token)
        if len(token) > 3:
            long_tokens.append(token)
    lemmatized_tokens = dhh_preprocess_tools.hfst_words(
        long_tokens, filter=('VERB', 'NOUN', 'ADJ', 'PROPN'))

    for token in lemmatized_tokens:
        token = token.lower()
        if token not in stopwordlist:
            refined_tokens.append(token)
    return refined_tokens
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase
    # (group consecutive (word, pos, chunk) triples whose chunk tag is not 'O')
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def txt_to_sent(sentences, word_vec, tokenize=True):
    sentences = [['<s>'] + s.split() + ['</s>'] if not tokenize else
                 ['<s>'] + nltk.word_tokenize(s) + ['</s>'] for s in sentences]
    n_w = np.sum([len(x) for x in sentences])

    # filters words without glove vectors
    for i in range(len(sentences)):
        s_f = [word for word in sentences[i] if word in word_vec]
        if not s_f:
            import warnings
            warnings.warn('No words in "{0}" (idx={1}) have glove vectors. '
                          'Replacing by "</s>"..'.format(sentences[i], i))
            s_f = ['</s>']
        sentences[i] = s_f

    lengths = np.array([len(s) for s in sentences])
    n_wk = np.sum(lengths)

    print('Nb words kept : {0}/{1} ({2} %)'.format(n_wk, n_w, round((100.0 * n_wk) / n_w, 2)))
    return sentences
def __init__(self, text):
    self.text = text
    self.tokens = nltk.word_tokenize(text)
    self.lowercase_tokens = [t.lower() for t in self.tokens]
    self.alpha_tokens = [t for t in self.lowercase_tokens if t.isalpha()]
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs".format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
                vocab[c[0]] = i + 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete".format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################
def get_maxlens(train_data, test_data):
    story_maxlen, question_maxlen = 0, 0
    for stories, questions, _ in [train_data, test_data]:
        for story in stories:
            story_len = 0
            for sent in story:
                swords = nltk.word_tokenize(sent)
                story_len += len(swords)
            if story_len > story_maxlen:
                story_maxlen = story_len
        for question in questions:
            question_len = len(nltk.word_tokenize(question))
            if question_len > question_maxlen:
                question_maxlen = question_len
    return story_maxlen, question_maxlen
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    Xs, Xq, Y = [], [], []
    stories, questions, answers = data
    for story, question, answer in zip(stories, questions, answers):
        xs = [[word2idx[w.lower()] for w in nltk.word_tokenize(s)] for s in story]
        xs = list(itertools.chain.from_iterable(xs))
        xq = [word2idx[w.lower()] for w in nltk.word_tokenize(question)]
        Xs.append(xs)
        Xq.append(xq)
        Y.append(word2idx[answer.lower()])
    return pad_sequences(Xs, maxlen=story_maxlen), \
           pad_sequences(Xq, maxlen=question_maxlen), \
           np_utils.to_categorical(Y, num_classes=len(word2idx))
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [token.replace("``", '"').replace("''", '"')
            for token in nltk.word_tokenize(sent)]