The following 49 code examples were extracted from open-source Python projects to show how to use nltk.tokenize.wordpunct_tokenize().
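Before the project examples, here is a minimal standalone sketch of the call itself. wordpunct_tokenize() is a regexp tokenizer that splits text into alternating runs of alphabetic and non-alphabetic characters (pattern \w+|[^\w\s]+), so punctuation comes back as separate tokens:

# Minimal usage sketch of nltk.tokenize.wordpunct_tokenize()
from nltk.tokenize import wordpunct_tokenize

text = "Good muffins cost $3.88 in New York."
print(wordpunct_tokenize(text))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']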
def generate_vocabulary(self, review_summary_file):
    """
    :param review_summary_file:
    :return:
    """
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)

    # Now store the "" empty string as the last word of the vocabulary,
    # keeping the forward and reverse maps consistent
    empty_idx = len(self.map)
    self.map[""] = empty_idx
    self.revmap[empty_idx] = ""
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ',
                   text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #     not token.isdigit() and not token in stop_words]
def tokenize(directory):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        with open(directory+_file, 'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):
                #     print sentence
                #     time.sleep(2)
                #     disp_count -= 1
                #     if not disp_count:
                #         print '*'*100
                #         break
                # else:
                #     print '#'
    return wordpunct_tokenize(full_content.lower())
def load_unannotated_file(filepath='test.txt', nb_instances=None, tokenized_input=False):
    if tokenized_input:
        instances = []
        for line in codecs.open(filepath, 'r', 'utf8'):
            line = line.strip()
            if line:
                instances.append(line)
                if nb_instances:
                    nb_instances -= 1
                    if nb_instances <= 0:
                        break
        return instances
    else:
        from nltk.tokenize import wordpunct_tokenize
        W = re.compile(r'\s+')
        with codecs.open(filepath, 'r', 'utf8') as f:
            # collapse runs of whitespace into single spaces
            text = W.sub(' ', f.read())
        tokens = wordpunct_tokenize(text)
        if nb_instances:
            return tokens[:nb_instances]
        else:
            return tokens
def _extract_tokens(self, file_text):
    """Extract tokens from a file and return a Counter dictionary.

    This method is designed specifically so that it can be overridden
    easily while maintaining _get_file_tokens and _get_dir_tokens.
    """
    token_dict = collections.Counter()
    # does a simple word and punctuation tokenization on the text
    tokens = wordpunct_tokenize(file_text)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
def _extract_tokens(self, file_text):
    """Extract tokens from a Babel file and return a Counter dictionary."""
    token_dict = collections.Counter()
    # matches and removes beginning and end tags
    regex = re.compile(r'\[\d*\.\d*\]\n(.*)')
    matches = regex.findall(file_text)
    tokens = set()
    for match in matches:
        wp_tokenized = wordpunct_tokenize(match)
        tokens.update(wp_tokenized)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
def _get_revision_word_dist(self, page_title, revid):
    """"""
    revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]
    if revid in revids_to_word_dist:
        return revids_to_word_dist[revid]
    text = self._get_revision_text(page_title, revid)
    text = [word.lower() for word in wordpunct_tokenize(text)
            if word.lower() not in STOPWORDS and word.lower() not in PUNCTUATION]
    pdist = StatsCounter(text).normalize()
    revids_to_word_dist[revid] = pdist
    return pdist
def tokenize(text):
    """
    :param text: a paragraph string
    :return: a list of words
    """
    try:
        try:
            txt = unicode(text, 'utf-8')  # py2
        except NameError:
            txt = text  # py3
        words = wordpunct_tokenize(txt)
        length = len(words)
    except TypeError:
        words, length = ['NA'], 0
    return words, length
def augment(texts, dic_thes):
    if prm.aug < 2:
        return texts

    out = []
    for text in texts:
        words_orig = wordpunct_tokenize(text)
        # define how many words will be replaced. For now, leave the maximum number as 10% of the words
        maxrep = max(2, int(0.1 * len(words_orig)))
        for j in range(prm.aug):
            words = list(words_orig)  # copy
            for k in range(randint(1, maxrep)):
                idx = randint(0, len(words) - 1)
                word = words[idx]
                if word in dic_thes:
                    # choose the synonym based on a geometric distribution
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)
                    #print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]
            out.append(" ".join(words))
    return out
def __init__(self, lines):
    self.lookup = {}
    self.max_len = 0
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    for line in lines:
        word_data = json.loads(line)
        # capture both positive and negative, choose one at scoring time
        pos_score, neg_score = word_data['pos'], word_data['neg']
        terms = [word_data['word']]
        # TODO: make the sentiment scorer configurable
        if 'word_ar' in word_data:
            terms.append(word_data['word_ar'])
        if 'word_ur' in word_data:
            terms.append(word_data['word_ur'])
        for term in terms:
            # if a score exists for a term, use the least neutral score
            existing_scores = (0., 0.)
            if term in self.lookup:
                existing_scores = self.lookup[term]
            self.lookup[term] = (max(pos_score, existing_scores[0]),
                                 max(neg_score, existing_scores[1]))
            # update the maximum token length to check
            self.max_len = max(len(tokenize(term)), self.max_len)
def extract_keywords(sentence, keywords):
    # check if there are keywords for the sentence language
    language = sentence['Language']
    if language in keywords:
        languageKeywords = keywords[language]
        keywordMatches = []
        if languageKeywords != None:
            message = sentence['Sentence']
            # tokenize the sentence
            for keyword in sorted(languageKeywords):
                keywordRegex = languageKeywords[keyword]
                if keywordRegex.search(message):
                    # if match, add keyword canonical form to list
                    keywordMatches.append(keyword)
        sentence['Keywords'] = keywordMatches
    return sentence
def parseDocument(doc, vocab):
    wordslist = list()
    countslist = list()
    doc = doc.lower()
    tokens = wordpunct_tokenize(doc)

    dictionary = dict()
    for word in tokens:
        if word in vocab:
            wordtk = vocab[word]
            if wordtk not in dictionary:
                dictionary[wordtk] = 1
            else:
                dictionary[wordtk] += 1

    wordslist.append(dictionary.keys())
    countslist.append(dictionary.values())
    return (wordslist[0], countslist[0])
def __generate_tensor(self, is_review, reverse=False):
    """
    :param is_review:
    :param reverse:
    :return:
    """
    seq_length = self.review_max_words if is_review else self.summary_max_words
    total_rev_summary_pairs = self.rev_sum_pair.shape[0]
    data_tensor = np.zeros([total_rev_summary_pairs, seq_length])

    sample = self.rev_sum_pair[0::, 0] if is_review else self.rev_sum_pair[0::, 1]

    for index, entry in enumerate(sample.tolist()):
        index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
        # reverse if want to get backward form
        if reverse:
            index_lst = index_lst[::-1]
        # Pad the list
        if len(index_lst) <= seq_length:
            index_lst = np.lib.pad(index_lst, (0, seq_length - index_lst.size),
                                   'constant', constant_values=(0, 0))
        else:
            index_lst = index_lst[0:seq_length]
        data_tensor[index] = index_lst

    return data_tensor
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token
            for token in wordpunct_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), ' ',
                       text.encode(encoding='ascii', errors='ignore')))
            if not token.isdigit() and not token in stop_words]
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = wordpunct_tokenize(text)
    for token in tokens:
        freq_dict[token] += 1
    return sorted(freq_dict, key=freq_dict.get, reverse=True)
def wikipediaAction(message):
    """Makes the appropriate calls to the wikipedia API for answering wiki queries.

    Args:
        message: An incoming text message
        processer: Instance of NLProcessor class

    Returns:
        A message indicating what action was taken with the wikipedia API
    """
    # tokenize input
    tokens = tokenize.wordpunct_tokenize(message)
    # filter stopwords; additionally, remove 'wiki' or 'wikipedia'
    tokens_filtered = remove_stopwords(tokens)
    tokens_filtered = [token for token in tokens_filtered
                       if token != 'wiki' and token != 'wikipedia']
    # join filtered message
    message = ' '.join(tokens_filtered)
    # for debugging/testing
    print("(Highly) processed input: ", message)

    # Get the wikipedia summary for the request
    try:
        summary = wikipedia.summary(message, sentences=1)
        url = wikipedia.page(message).url
        answer = summary + "\nSee more here: " + url
        if len(answer) > 500:
            answer = answer[0:500] + "\nSee wikipedia for more..."
    except:
        # handle all errors
        answer = "Request was not found using Wikipedia. Be more specific?"

    return answer
def create_tags_for_package(package_name):
    """Create tags for a package based on its name."""
    stop_words = set(['org', 'com', 'io', 'ch', 'cn'])
    tags = set([tag.lower() for tag in wordpunct_tokenize(package_name)
                if tag not in string.punctuation and tag not in stop_words])
    return list(tags)[:MAX_TAG_COUNT]
def analyze_false(validData, validDataNumbers, validLabels, model):
    'Calculating precision and recall for best model...'
    predictions = np.squeeze((model.predict(validDataNumbers) > 0.5).astype('int32'))
    c1_inds = np.where(validLabels == 1)[0]
    pos_inds = np.where((predictions + validLabels) == 2)[0]  # np.squeeze(predictions) == validLabels
    neg_inds = np.setdiff1d(c1_inds, pos_inds)

    seq_lengths = np.zeros((validData.shape[0]))
    for ind, row in np.ndenumerate(validData):
        seq_lengths[ind] = len(wordpunct_tokenize(row.lower().strip()))

    mean_true_length = np.mean(seq_lengths[pos_inds])
    mean_false_length = np.mean(seq_lengths[neg_inds])

    return mean_false_length, mean_true_length
def tokenize(directory, exclude_files):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        if exclude_files and (_file in exclude_files):
            continue
        with open(directory+_file, 'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):
                #     print sentence
                #     time.sleep(2)
                #     disp_count -= 1
                #     if not disp_count:
                #         print '*'*100
                #         break
                # else:
                #     print '#'
    return wordpunct_tokenize(full_content.lower())
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
def _extract_tokens(self, file_text):
    """Extract tokens from a file and return a Counter dictionary."""
    token_dict = collections.Counter()
    # matches and removes beginning and end tags
    regex = re.compile(r'(<doc id.*>|<\/doc>)')
    data = regex.sub('', file_text)
    tokens = wordpunct_tokenize(data)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
def get_words(sents=[]):
    from nltk.tokenize import wordpunct_tokenize
    words = []
    for sent in sents:
        words.append(wordpunct_tokenize(sent))
    return words

# file_name = sys.argv[1]
def tokenize_into_words(sents=[]):
    words = []
    for sent in sents:
        words.append(wordpunct_tokenize(sent))
    return words
def _extract_text_ngram_freqs(self, text):
    """Tokenize the text.

    For each token in the text, extract ngrams of different length (from 1 to 5).
    Compute how many times each of these ngrams occurs in the text.
    Then return a dictionary of { ngram: frequencies }.

    >>> implementation = CavnarTrenkleImpl()
    >>> ngrams = implementation._extract_text_ngram_freqs("HeLLo")
    >>> ngrams == {'h':1, 'e': 1, 'l': 2, 'o': 1, 'he': 1, 'el': 1, 'll': 1, \
        'lo': 1, 'hel': 1, 'ell': 1, 'llo': 1, 'hell': 1, 'ello': 1, 'hello': 1}
    True
    >>> ngrams = implementation._extract_text_ngram_freqs("CIAO")
    >>> ngrams == {'c':1, 'i': 1, 'a': 1, 'o': 1, 'ci': 1, 'ia': 1, 'ao': 1, \
        'cia': 1, 'iao': 1, 'ciao': 1}
    True
    """
    tokens = wordpunct_tokenize(text.lower())  # Force lower case
    # TODO: Delete numbers and punctuation
    # TODO: Should we use nltk twitter tokenizer?

    ngram_freqs = defaultdict(int)
    for token in tokens:
        for n in range(1, 6):  # Use 1-grams to 5-grams
            for ngram in ngrams(token, n):
                ngram_string = ''.join(ngram)
                ngram_freqs[ngram_string] += 1
            # ngram_freqs[ngrams(token, n)] += 1

    return ngram_freqs
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    print "text_to_sentence"
    #from nltk.tokenize import wordpunct_tokenize
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    #text = text.decode("utf8")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    #raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print "finish tokenize sentence", len(raw_sentences)
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        #print "sentence:", raw_sentence
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            #sentences.append( text_to_wordlist( raw_sentence, \
            #    remove_stopwords ))
            #print removePunctuation(raw_sentence).lower().split()
            print raw_sentence
            sentences.append(wordpunct_tokenize(raw_sentence))  #raw_sentence.split())
            print wordpunct_tokenize(raw_sentence)
            #print text_to_wordlist( raw_sentence, remove_stopwords )
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0]
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data
def vis_att(pages_idx, query, alpha, wiki, vocab, idx):
    rows = [prm.root_page.title()]
    for pageidx in pages_idx[:-1]:
        if pageidx != -1:
            rows.append(wiki.get_article_title(pageidx).decode('utf-8', 'ignore').title())
        else:
            break
            #rows.append('Stop')
    rows = rows[::-1]

    columns = []
    for word in wordpunct_tokenize(query):
        if word.lower() in vocab:
            columns.append(str(word))
    columns = columns[:prm.max_words_query*prm.n_consec]

    alpha = alpha[:len(rows), :len(columns)]
    alpha = alpha[::-1]

    fig, ax = plt.subplots(figsize=(27, 10))
    # Advance color controls
    norm = matplotlib.colors.Normalize(0, 1)
    im = ax.pcolor(alpha, cmap=plt.cm.gray, edgecolors='w', norm=norm)
    fig.colorbar(im)
    ax.set_xticks(np.arange(0, len(columns)) + 0.5)
    ax.set_yticks(np.arange(0, len(rows)) + 0.5)
    ax.tick_params(axis='x', which='minor', pad=15)
    # Here we position the tick labels for x and y axis
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    ax.axis('tight')  # correcting pyplot bug that adds extra white columns.
    plt.xticks(rotation=90)
    fig.subplots_adjust(bottom=0.2)
    fig.subplots_adjust(left=0.2)
    # Values against each label
    ax.set_xticklabels(columns, minor=False, fontsize=18)
    ax.set_yticklabels(rows, minor=False, fontsize=18)
    plt.savefig('vis' + str(idx) + '.svg')
    plt.close()
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i, :len(bow[0])] = bow[0]
        mask[i, :len(bow[1])] = bow[1]

    return out, mask
def Word2Vec_encode(texts, wemb):
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i, :] += wemb[word]
                n += 1.
        out[i, :] /= max(1., n)
    return out
def _generate_phrases(self, sentences):
    """Method to generate contender phrases given the sentences of the text document.

    :param sentences: List of strings where each string represents a sentence which forms the text.
    :return: Set of string tuples where each tuple is a collection of words forming a contender phrase.
    """
    phrase_list = set()
    # Create contender phrases from sentences.
    for sentence in sentences:
        word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
        phrase_list.update(self._get_phrase_list_from_words(word_list))
    return phrase_list
def _on_start(self, utterance):
    # do all on-start things:
    # maybe clear all chart data structures
    # maybe clear agenda data structures
    self.agenda.clear()
    tokenized_utterance = tokenizer(utterance)
    self.utter_len = self.settings.utter_len = len(tokenized_utterance)
    self.left_buckets = [set() for _ in xrange(self.utter_len+1)]
    self.right_buckets = [set() for _ in xrange(self.utter_len+1)]
    self.initialize_agenda(tokenized_utterance)
    # Buckets are over dot indices, so are len=1
    # self._print_buckets()
def score(self, sentence):
    # track both positive and negative scores for sentence
    pos_score, neg_score = 0., 0.
    # assuming no contextual forms are used for Arabic
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    tokens = tokenize(sentence.lower())
    term_count = 0

    # using nested while loops here to accommodate early termination of
    # inner loop, and updating the index of the outer loop based on the
    # number of tokens used in the sub-phrase
    i = 0
    while i < len(tokens):
        matched = False
        j = min(self.max_len, len(tokens) - i)  # check phrase lengths up to `max_len`
        while j > 0 and (i + j) <= len(tokens):
            sub_tokens = tokens[i : i + j]
            sub_word = ' '.join(sub_tokens)
            # if a match exists for the phrase, update scores and counts
            if sub_word in self.lookup:
                sub_word_scores = self.lookup[sub_word]
                pos_score += sub_word_scores[0]
                neg_score += sub_word_scores[1]
                term_count += 1
                matched = True
                i += j
                break
            j -= 1
        # if not matched, skip token
        if not matched:
            i += 1

    # if no terms matched, or scores are equal, return a neutral score
    if pos_score == neg_score:
        return 0.5
    # if sentence is more positive than negative, use positive word sense
    elif pos_score > neg_score:
        return 0.5 + pos_score / term_count / 2
    # if sentence is more negative than positive, use negative word sense
    else:
        return 0.5 - neg_score / term_count / 2
def create_keyword_regex(keyword):
    print 'create_keyword_regex'
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    print 'tokenize ==> %s' % (keyword)
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    print 'compile pattern ==> %s' % (pattern)
    return re.compile(pattern, re.I | re.UNICODE)
def tokenize(text, filter_stopwords=False, lowercase=True):
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]
        for j, word in enumerate(words):
            if word in vocab:
                out[i, j] = vocab[word]
            else:
                out[i, j] = -1  # Unknown words
        out_lst.append(words)
        if use_mask:
            mask[i, :j] = 1.

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst
def get_syllables(sonnet):
    from nltk.tokenize import wordpunct_tokenize
    tokens = [wordpunct_tokenize(s) for s in sonnet]
    punct = set(['.', ',', '!', ':', ';'])
    filtered = [[w for w in sentence if w not in punct] for sentence in tokens]
    last = [sentence[len(sentence) - 1] for sentence in filtered]
    syllables = [[(word, len(pron), pron) for (word, pron) in cmu_dict if word == w]
                 for w in last]
    return syllables
def compute_idx(pages_path_in, pages_path_out, vocab):

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5
    fout = h5py.File(pages_path_out, 'a')

    if prm.att_doc:
        shape = (f['text'].shape[0], prm.max_segs_doc, prm.max_words)
    else:
        shape = (f['text'].shape[0], prm.max_words)

    idxs = fout.create_dataset('idx', shape=shape, dtype=np.int32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type.lower() == 'section' or prm.att_segment_type.lower() == 'subsection':
                segs = ['']
                for line in text.split('\n'):
                    if prm.att_segment_type == 'section':
                        line = line.replace('===', '')
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line.lower() + '\n'
            elif prm.att_segment_type.lower() == 'sentence':
                segs = tokenizer.tokenize(text.lower().decode('ascii', 'ignore'))
            elif prm.att_segment_type.lower() == 'word':
                segs = wordpunct_tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter. Valid options are "section", "subsection", "sentence", or "word".')

            segs = segs[:prm.max_segs_doc]
            idxs_, _ = utils.text2idx2(segs, vocab, prm.max_words)
            idxs[i, :len(idxs_), :] = idxs_
            mask[i] = len(idxs_)
        else:
            idx, _ = utils.text2idx2([text.lower()], vocab, prm.max_words)
            idxs[i, :] = idx[0]

        i += 1
        #if i > 3000:
        #    break
        print 'processing article', i, 'time', time.time()-st

    f.close()
    fout.close()
def get_candidates(qatp):

    print 'loading data...'
    idf = pkl.load(open(prm.idf_path, "rb"))
    wk = wiki.Wiki(prm.pages_path)

    print 'creating vocabulary...'
    vocab = {}
    for q, _, _, _ in qatp:
        words = wordpunct_tokenize(q.lower())
        for word in words:
            if word in idf:
                vocab[word] = {}

    print 'creating inverted index...'
    i = 0
    for text in wk.get_text_iter():
        if i % 10000 == 0:
            print 'article', i
        words = wordpunct_tokenize(text.lower())
        for word in words:
            if word in vocab:
                vocab[word][i] = 0
        #if i > 500000:
        #    break
        i += 1

    print 'selecting pages...'
    candidates = []
    for i, [q, _, _, _] in enumerate(qatp):
        st = time.time()
        words = wordpunct_tokenize(q.lower())
        scores = {}
        for word in words:
            if word in vocab:
                if len(vocab[word]) < 100000:
                    for pageid in vocab[word].keys():
                        if pageid not in scores:
                            scores[pageid] = 0.
                        scores[pageid] += idf[word]
        idxs = np.argsort(np.asarray(scores.values()))[::-1]
        pages = scores.keys()
        if len(idxs) == 0:
            print 'error question:', q

        c = OrderedDict()
        for idx in idxs[:prm.max_candidates]:
            c[pages[idx]] = 0
        candidates.append(c)
        print 'sample ' + str(i) + ' time ' + str(time.time()-st)
        #if i > 10000:
        #    break

    return candidates