The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.word_tokenize().
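Before the extracted examples, here is a minimal usage sketch (not taken from any of the projects below); it assumes the NLTK "punkt" tokenizer data has already been downloaded, for example via nltk.download('punkt').

from nltk.tokenize import word_tokenize

# Minimal sketch: split one sentence into word-level tokens.
# Assumes the 'punkt' models are available locally (nltk.download('punkt')).
tokens = word_tokenize("NLTK makes tokenization easy, doesn't it?")
print(tokens)
# expected: ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']

The examples that follow show the same call embedded in real preprocessing, summarization, and feature-extraction code.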
def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))
        #TODO this is duplicated job, should be improved
        words = word_tokenize(strip_tags(text))
        taggged = pos_tag(words)
        cleaned = filter_insignificant(taggged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []
    return result

# TODO definitely can be better if we knew where content is
def create_batch(self, sentence_li):
    """Create a batch for a list of sentences."""
    embeddings_batch = []
    for sen in sentence_li:
        embeddings = []
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            embeddings.append(self.embdict.tok2emb.get(tok))
        if len(tokens) < self.max_sequence_length:
            pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
            embeddings = pads + embeddings
        else:
            embeddings = embeddings[-self.max_sequence_length:]
        embeddings = np.asarray(embeddings)
        embeddings_batch.append(embeddings)
    embeddings_batch = np.asarray(embeddings_batch)
    return embeddings_batch
def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]   # extract the text
    title = article[1]  # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(a.lower()) for a in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)  # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, less stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
def summarize(self, article, n):
    text = article[0]
    # article[1] is the title; only the body text is summarized here
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
def similarity(c1, c2):
    '''stop words are words like "it" and "the", that have no massive impact on the sentence'''
    stop_words = list(stopwords.words("english"))

    # Removes stop words in both sentences
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]

    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words

    similarity_between_words = 0
    for key, val in total_words.items():
        # Looks at whether the two articles share a word
        if total_words[key] > 1:
            similarity_between_words += 1

    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
def process_line(line):
    tokens = word_tokenize(line)
    output_tokens = []
    for token in tokens:
        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())
    return untokenize(" ".join(output_tokens) + " ")
def check_sent(s):
    count = 0
    for r in s:
        #words = word_tokenize(r)
        #for w in words:
        for w in r:
            if type(w) != str:
                print(w)
                count += 1
                continue
            if w in inv_words or w in oov_words_in_train:
                continue
            if w not in word2vec:
                count += 1
                oov_words_in_train.add(w)
            else:
                inv_words[w] = word2vec.vocab[w].index
    return count
def preprocess_questions(examples, nlp='nltk'):
    if nlp == 'nltk':
        from nltk.tokenize import word_tokenize
    print('Example of generated tokens after preprocessing some questions:')
    for i, ex in enumerate(examples):
        s = ex['question']
        if nlp == 'nltk':
            ex['question_words'] = word_tokenize(str(s).lower())
        elif nlp == 'mcb':
            ex['question_words'] = tokenize_mcb(s)
        else:
            ex['question_words'] = tokenize(s)
        if i < 10:
            print(ex['question_words'])
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(examples), i*100.0/len(examples)))
            sys.stdout.flush()
    return examples
def summarize(self, text, n):
    """
    Return a list of n sentences which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
def load_jacana(fname, regexen):
    samples = []
    with open(fname, 'rt') as inp:
        for line in inp:
            line = line.strip()
            if line.startswith('<Q> '):
                qorig = line[len('<Q> '):]
                q = word_tokenize(qorig)
            else:
                l = line.split(' ')
                label = int(l[0])
                kwweight = float(l[1])
                aboutkwweight = float(l[2])
                text = word_tokenize(' '.join(l[3:]))
                toklabels = regex_overlap(text, regexen[qorig])
                samples.append({'qtext': ' '.join(q), 'label': label,
                                'atext': ' '.join(text),
                                'kwweight': kwweight, 'aboutkwweight': aboutkwweight,
                                'toklabels': ' '.join([str(0+tl) for tl in toklabels])})
    return samples
def load_sts(dsfile, skip_unlabeled=True):
    """ load a dataset in the sts tsv format """
    s0 = []
    s1 = []
    labels = []
    with codecs.open(dsfile, encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            label, s0x, s1x = line.split('\t')
            if label == '':
                if skip_unlabeled:
                    continue
                else:
                    labels.append(-1.)
            else:
                labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
def load_quora(dsfile):
    """ load a dataset in the quora csv format """
    s0 = []
    s1 = []
    labels = []
    with open(dsfile, encoding='utf8') as csvfile:
        f = csv.reader(csvfile)
        firstline = True
        for line in f:
            if firstline:
                firstline = False
                continue
            s0x = line[3]
            s1x = line[4]
            label = line[5]
            labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
def make_word_feature(df, embeddings):
    # use embeddings to vectorize merchant description
    # currently using averaging to combine words in merchant
    # there are other options: http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence
    merchants = df.merchant.tolist()
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants), veclen))
    for idx, merchant in enumerate(merchants):
        num_known = 0
        try:
            words = tokenize.word_tokenize(merchant)
            words = [word.lower() for word in words]
            for word in words:
                wordvec = embeddings[word]
                word_feature[idx, :] += wordvec
                num_known += 1
        except:
            pass
        word_feature[idx, :] = word_feature[idx, :] / float(max(num_known, 1))
    return word_feature
def predict(testSet, PP, PN, positive_probabilities, negative_probabilities, unseen_pos_prob, unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = word_tokenize(review)
        for w in review_words:
            if w in negative_probabilities:
                negative_probab = negative_probab + math.log10(negative_probabilities[w])
            else:
                negative_probab = negative_probab + math.log10(unseen_neg_prob)
            if w in positive_probabilities:
                positive_probab = positive_probab + math.log10(positive_probabilities[w])
            else:
                positive_probab = positive_probab + math.log10(unseen_pos_prob)
        if negative_probab > positive_probab:
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
def create_vocab(self, dataset_path, vocab_path, max_vocab_size):
    print("generating vocab from dataset at {}".format(dataset_path))
    all_words = []
    for dataset in ["snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl"]:
        for line in open(os.path.join(dataset_path, dataset), "r").readlines():
            data = json.loads(line)
            all_words += word_tokenize(data["sentence1"].lower())
            all_words += word_tokenize(data["sentence2"].lower())
    counter = Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    words = ["PAD"] + ["UNK"] + list(words)
    word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))
    with open(vocab_path, "w") as file:
        for word, id in word_to_id.items():
            file.write("{}\t{}\n".format(word, id))
    print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(max_vocab_size, vocab_path))
def getFreqWords(directoryPath):
    files = getListOfFilesInDir(directoryPath, "*")  # get list of files in directory
    allWords = []
    count = 0
    if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
        files = random.sample(files, MAX_FILES_PER_CLASS)
    for ifile, fi in enumerate(files):  # for each file in current class:
        with open(fi) as f:
            content = f.read()
            words = word_tokenize(content.decode('utf-8'))
            words = [w.lower() for w in words if w.lower() not in stop]
            words = list(set(words))
            allWords += words
        count += 1
    #print allWords
    C = Counter(allWords)
    C = sorted(C.items(), key=itemgetter(1), reverse=True)
    for c in C:
        if c[1] > 0.05 * float(count):
            print c[0], c[1] / float(count)
def prepro_question(imgs, params):
    # preprocess all the question
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10:
            print txt
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with open(fi, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l)
                lexicon += list(all_words)

    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    """
    This is done in the tutorial. Seems like a brute force method of removing stopwords.
    TODO: Use NLTK stopwords to remove stop words?
    """
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
def sample_handling(sample, lexicon, classification):
    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def _avgrank_corp(inp_dir, hdv_vocab, num=5000):
    cnt, vocab = Counter(), []  # Counter for all words in the corpus
    for (root, dirs, files) in os.walk(inp_dir):
        files = [f for f in files if not f[0] == '.']
        for f in files:
            filepath = os.path.join(root, f)
            with codecs.open(filepath, 'r', encoding="utf-8") as f:
                tok_txt = word_tokenize(f.read())
                for word in tok_txt:
                    cnt[word] += 1
    for word in hdv_vocab:
        if word in cnt.keys():
            del cnt[word]
    for word in cnt.most_common(num):
        try:
            vocab.append(str(word[0]))
        except:
            continue
    return vocab
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with io.open(fi, 'r', encoding='utf-8') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l.lower())
                lexicon += list(all_words)

    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
def sample_handling(sample, lexicon, classification):
    featureset = []
    with io.open(sample, 'r', encoding='utf-8') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def custom_tokenizer(sentence, delimiters=['|', ','], remove_puncs=True, get_unique=False):
    # tokens = re.split('(\W)', sentence)
    for delimiter in delimiters:
        sentence = re.sub(re.escape(delimiter), " " + delimiter + " ", sentence)
    tokens = word_tokenize(sentence)

    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))

    if remove_puncs:
        tokens = [token for token in tokens if not ((len(token.strip()) == 1) and bool(re.search("[^a-zA-Z0-9]", token)))]

    tokens = [token for token in tokens if (not bool(re.search("\s", token)) and token != '')]

    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))

    return tokens
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
def prepro_question(imgs, params):
    # preprocess all the question
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10:
            print txt
        if i % 100 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    try:
        tagged = pos_tag(word_tokenize(sent))
        # Maybe actually better if possessives aren't included.
        # At least one Proper Noun (NNP) should be included in the noun chunk. Also a single NNP is
        # probably not enough information to identify a data source
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ""
            for leave in subtree.leaves():
                chunk += leave[0] + ' '
            chunks.append(chunk.strip())
        return chunked, chunks
    except Exception as e:
        print(str(e))
def train_model(documents, labels, sample_size=.3, verbose=True):
    if verbose:
        print('starting to generate training data...', end='', flush=True)
    labeled_feature_set = list()
    for n, doc in enumerate(documents):
        feature = word_tokenize(' '.join(doc))
        label = labels[n]
        resampled = resample(feature, label, sample_size)
        labeled_feature_set += resampled
    if verbose:
        print('done', flush=True)
        print('training model...this may take a few minutes.', flush=True, end='')
    trained_model = NaiveBayesClassifier.train(iter(labeled_feature_set))
    if verbose:
        print('done', flush=True)
    return trained_model
def create_lexicon(fin):
    lexicon = []
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            counter = 1
            content = ''
            for line in f:
                counter += 1
                if (counter/2500.0).is_integer():
                    tweet = line.split(':::')[1]
                    content += ' ' + tweet
                    words = word_tokenize(content)
                    words = [lemmatizer.lemmatize(i) for i in words]
                    lexicon = list(set(lexicon + words))
                    print(counter, len(lexicon))
        except Exception as e:
            print(str(e))

    with open('lexicon.pickle', 'wb') as f:
        pickle.dump(lexicon, f)
def convert_to_vec(fin, fout, lexicon_pickle):
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    outfile = open(fout, 'a')
    with open(fin, buffering=20000, encoding='latin-1') as f:
        counter = 0
        for line in f:
            counter += 1
            label = line.split(':::')[0]
            tweet = line.split(':::')[1]
            current_words = word_tokenize(tweet.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            outline = str(features) + '::' + str(label) + '\n'
            outfile.write(outline)
        print(counter)
def sample_handling(sample, lexicon, classification):
    featureset = []
    # [1 0] pos sentiment, [0 1] negative sentiment
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            #print(features)
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())  # like the example discussed earlier
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
            #print(featureset)
    return featureset
def runprops_data(self, docs):
    new_docs = []
    for doc_name, doc in docs:
        print 'Processing:', doc_name
        doc_new = []
        doc = self.props_exception(doc_name, doc)
        for index, sent in enumerate(doc):
            doc_new.append(' '.join(word_tokenize(sent)))
            print index+1, doc_new[index]
        triples = []
        for i, sent in enumerate(doc_new):
            try:
                tmp_triples = self.props_parser.extract_triples([sent])
                triples.append(tmp_triples)
            except:
                print('Error: failed for line %s' % (sent))
                continue
        parse_sents = create_trees(triples, doc_new)
        sents = []
        new_docs.append((doc_name, parse_sents))
    return new_docs
def wordMatch(question, line, storyPOS_dict):
    wordsInAQuestion = word_tokenize(question)
    rootsInAQuestion = set()
    for word in wordsInAQuestion:
        root = lancaster_stemmer.stem(word)
        rootsInAQuestion.add(root)
    if line in storyPOS_dict:
        verbmatch_score = 0
        rootmatch_score = 0
        scoreOfALine = {}
        for (word, tag) in storyPOS_dict[line]:
            if 'V' in tag:
                verb_root = lancaster_stemmer.stem(word)
                if verb_root in rootsInAQuestion:
                    verbmatch_score = verbmatch_score + 6
            else:
                word_root = lancaster_stemmer.stem(word)
                if word_root in rootsInAQuestion:
                    rootmatch_score = rootmatch_score + 3
        scoreOfALine[line] = rootmatch_score + verbmatch_score
        return rootmatch_score + verbmatch_score
def preprocess_sentence(sentence):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.

    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    sentence = sentence.lower()
    return word_tokenize(sentence)
def create_paper_dictionaries(filename="", readin=True, paper=None):
    """
    Creates the metadata data structures for a specific paper required to compute the extra
    features which are appended to the sentence vector.

    :param filename: the filename only, not the path, for the paper to create dictionaries for.
    :return: a tuple of the metadata data structures for the paper.
    """
    if readin and filename != "":
        # Read the paper in as a dictionary, keys are sections and values are the section text
        paper = read_in_paper(filename)

    # Extract paper keyphrases
    keyphrases = set(filter(None, " ".join(paper["KEYPHRASES"].lower().split("\n")).split(" ")))

    # Get the paper's vocab
    full_paper = " ".join([val for _, val in paper.iteritems()]).lower()
    paper_words = word_tokenize(full_paper)
    vocab = set(paper_words)

    # Create a bag of words for the paper
    paper_bag_of_words = defaultdict(int)
    for word in paper_words:
        paper_bag_of_words[word] += 1

    # Get the title words
    title_words = set([x.lower() for x in word_tokenize(paper["MAIN-TITLE"]) if x not in STOPWORDS])

    return keyphrases, vocab, paper_bag_of_words, title_words
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
def get_latitude(self, user_input):
    """
    Returns the latitude extracted from the input.
    """
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'latitude=' in token:
            return re.sub('latitude=', '', token)
    return ''
def get_longitude(self, user_input):
    """
    Returns the longitude extracted from the input.
    """
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'longitude=' in token:
            return re.sub('longitude=', '', token)
    return ''
def split_ingr(x):
    wnl = WordNetLemmatizer()
    cleanlist = []
    lst = x.strip('[]').split(',')
    cleanlist = [' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]', ' ', item))) for item in lst]
    return cleanlist

# remove low-information words from ingredients, could use more
def add_items(self, sentence_li):
    """Add new items to the tok2emb dictionary from a given text."""
    for sen in sentence_li:
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            if self.tok2emb.get(tok) is None:
                self.tok2emb[tok] = self.fasttext_model[tok]
def sentiment(request):
    open_file = open("wordfeature5k.pickle", "rb")
    word_features = pickle.load(open_file)
    open_file.close()

    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    open_file = open("naivebayesclassifier.pickle", "rb")
    classifier = pickle.load(open_file)
    open_file.close()

    sentence = request.POST['sentence']
    result = classifier.classify(find_features(sentence))
    if result == "positive":
        return render(request, "home/index.html", {"sentence": sentence, "positive": "positive"})
    elif result == "negative":
        return render(request, "home/index.html", {"sentence": sentence, "negative": "negative"})
def word_seg_en(docs):
    docs = [word_tokenize(sent) for sent in tqdm(docs)]  # show the progress of word segmentation with tqdm
    '''docs_seg = []
    print('docs size', len(docs))
    for i in tqdm(range(len(docs))):
        docs_seg.append(word_tokenize(docs[i]))'''
    return docs
def get_word_dict(self, sentences, tokenize=True):
    # create vocab of words
    word_dict = {}
    if tokenize:
        from nltk.tokenize import word_tokenize
    sentences = [s.split() if not tokenize else word_tokenize(s) for s in sentences]
    for sent in sentences:
        for word in sent:
            if word not in word_dict:
                word_dict[word] = ''
    word_dict['<s>'] = ''
    word_dict['</s>'] = ''
    return word_dict
def visualize(self, sent, tokenize=True):
    if tokenize:
        from nltk.tokenize import word_tokenize

    sent = sent.split() if not tokenize else word_tokenize(sent)
    sent = [['<s>'] + [word for word in sent if word in self.word_vec] + ['</s>']]

    if ' '.join(sent[0]) == '<s> </s>':
        import warnings
        warnings.warn('No words in "{0}" have glove vectors. \
                       Replacing by "<s> </s>"..'.format(sent))
    batch = Variable(self.get_batch(sent), volatile=True)

    if self.use_cuda:
        batch = batch.cuda()
    output = self.enc_lstm(batch)[0]
    output, idxs = torch.max(output, 0)
    # output, idxs = output.squeeze(), idxs.squeeze()
    idxs = idxs.data.cpu().numpy()
    argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

    # visualize model
    import matplotlib.pyplot as plt
    x = range(len(sent[0]))
    y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
    plt.xticks(x, sent[0], rotation=45)
    plt.bar(x, y)
    plt.ylabel('%')
    plt.title('Visualisation of words importance')
    plt.show()

    return output, idxs
def extractRawFrequencies(self, article):
    # this method is similar to the one above but returns
    # the raw frequencies (all word counts)
    text = article[0]
    # article[1] is the title
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    freq = defaultdict(int)
    for s in word_sent:
        for word in s:
            if word not in self._stopwords:
                freq[word] += 1
    return freq
def split_words(self, sentence: str) -> List[Token]:
    # Import is here because it's slow, and by default unnecessary.
    from nltk.tokenize import word_tokenize
    return [Token(t) for t in word_tokenize(sentence.lower())]