The following 49 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.RegexpTokenizer().
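Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the two ways RegexpTokenizer is usually constructed: with a pattern that matches the tokens themselves, or with gaps=True so the pattern describes the separators between tokens.

from nltk.tokenize import RegexpTokenizer

# Pattern matches the tokens: keep runs of word characters, drop punctuation.
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize("Hello, world! It's 2024."))
# -> ['Hello', 'world', 'It', 's', '2024']

# Pattern matches the gaps: split on whitespace, punctuation stays attached.
gap_tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(gap_tokenizer.tokenize("Hello, world!"))
# -> ['Hello,', 'world!']
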
def __init__(self, root, fileids, sep='/',
             word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding='latin1'):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader

def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if not x in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed

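As a rough usage sketch (assuming the NLTK imports used above and the English stopword corpus are available), process() turns free text into stemmed content words:

print(process("The quick brown foxes were running over the lazy dogs"))
# approximately: ['quick', 'brown', 'fox', 'run', 'lazi', 'dog']
# (exact stems depend on the Snowball stemmer version)
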
def paragraph_to_words(paragraph, remove_stopwords=False, lemmatize=True, stem=False):
    words = BeautifulSoup(paragraph["review"], "html.parser").get_text()
    words = re.sub("[^a-zA-Z]", " ", words)
    # tokenizer = RegexpTokenizer(r'\w+')
    # words = tokenizer.tokenize(words.strip().lower())
    words = words.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    if lemmatize:
        words = [lemmatizer.lemmatize(w) for w in words]
    if stem:
        words = [stemmer.stem(w) for w in words]
    return LabelDoc(words, paragraph["id"])

def tweets(word_len, sent_len, train_valid_ratio=[5, 1]):
    df = pandas.read_csv('tweets_large.csv')
    field = 'text'
    label = 'label'
    tokenizer = RegexpTokenizer(r'\w+')

    # encode characters into numbers
    encoder = CharNumberEncoder(df[field].values, tokenizer=tokenizer,
                                word_len=word_len, sent_len=sent_len)
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()

    # encode categories into one hot array
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()
    encode_y = cat_encoder.make_cat_embed()

    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)

    return encode_X, encode_y, nclass

def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)

def __init__(self, fname):
    words_map = {}
    for line in csv.reader(open(fname)):
        word, syn = line
        if word.startswith('#'):
            continue
        words_map[word] = syn
    super(CSVWordReplacer, self).__init__(words_map)


######### for now just a wrapper to RegexpTokenizer #########

def __init__(self, pattern):
    self.pattern = pattern
    self.tokenizer = RegexpTokenizer(self.pattern)


######## defining a default stopwords set #############

def rm_punctuation(data, pattern=r'[a-zA-Z]+-?[0-9]*', silent=1):
    if silent == 0:
        print("remove punctuation ...")
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(pattern)
    return tokenizer.tokenize(" ".join(data))

def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # get the list of stop words
    self.stop_words_english = stopwords.words('english')

    # Create a Snowball stemmer
    self.stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming

def get_list():
    stop_words = set(stopwords.words('english'))
    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k, v in data.items():
        if k == "WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])

    s = "two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")

    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')
    for index, para in enumerate(paragraph_list):
        raw = para.lower()
        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if not t in stop_words]
        number_tokens = [x for x in stopped_tokens if x.isalpha()]
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])
        taggeddoc.append(td)
    return taggeddoc

def get_summarized(self, input_data, num_sentences):
    # TODO: allow the caller to specify the tokenizer they want
    # TODO: allow the user to specify the sentence tokenizer they want
    # TODO multilingual!
    tokenizer = RegexpTokenizer('\w+')
    stopwords_ = [smart_text(word) for word in stopwords.words('english')]

    # get the frequency of each word in the input
    base_words = [smart_text(word.lower())
                  for word in tokenizer.tokenize(smart_text(input_data))]
    words = [smart_text(word) for word in base_words if word not in stopwords_]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in list(word_frequencies.items())[:100]]

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input_data)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    return self.reorder_sentences(output_sentences=output_sentences,
                                  input_data=input_data)

def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])

    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words

def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.

def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())

def remove_punctuation(str):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(str)

def tokenize(text, level):
    """Tokenize a text into a list of strings.

    Args:
        text (str): An arbitrary string.
        level (str): Either "char" or "word". For "char", the string is split into
            characters. For "word", letters and numbers are glued to themselves and
            everything else is split.
            Example: "asdf df!?123 as12" -> "asdf", " ", "df", "!", "?", "123",
            " ", "as", "12"

    Returns:
        list[str]: The tokens

    Raises:
        ValueError: If the level is not "char" or "word"
    """
    if level == "char":
        # No need for tokenizing
        return list(text)
    elif level == "word":
        # Tokenize while keeping indentation. Glue letters and numbers to
        # themselves but keep all other chars isolated.
        tokenizer = RegexpTokenizer(r'\w+|\S|\s')
        return tokenizer.tokenize(text)
    else:
        raise ValueError("Unknown token level: {}".format(level))

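A quick, hypothetical check of the word-level behaviour (whitespace and punctuation come back as single-character tokens; note that r'\w+' also keeps a mixed run of letters and digits together as one token):

print(tokenize("Hello,  world 42!", level="word"))
# -> ['Hello', ',', ' ', ' ', 'world', ' ', '42', '!']
print(tokenize("ab c", level="char"))
# -> ['a', 'b', ' ', 'c']
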
def remove_punc(string):
    # '''Description: This function takes in a string of descriptions and return a tokenized string without punctuation
    # Parameters: String of descriptions
    # Output: Tokenized string with punctuation removed'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(string)
    return " ".join(tokens)

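For instance (a hypothetical call, not from the original project):

print(remove_punc("Hello, world... is this thing on?!"))
# -> 'Hello world is this thing on'
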
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview

def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview

def get_vocabulary(doc_set):
    tokenizer = RegexpTokenizer(r'\w+')
    distinctwords = {}
    i = 0
    # loop through document list
    for text in doc_set:
        raw = text.lower()
        tokens = tokenizer.tokenize(raw)
        for word in tokens:
            if word not in distinctwords:
                distinctwords[word] = i
                i += 1
    return distinctwords

def get_frequency_table(titles, vocab):
    tokenizer = RegexpTokenizer(r'\w+')
    freqtable = np.ndarray(shape=(len(titles), len(vocab)), dtype=int, order='C')
    freqtable.fill(0)
    for i in range(0, len(titles)):
        raw = titles[i].lower()
        tokens = tokenizer.tokenize(raw)
        for token in tokens:
            index = vocab[token]
            freqtable[i][index] += 1
    return freqtable

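A small, hypothetical example of the two helpers above working together (vocabulary indices follow first-seen order, so the exact layout depends on the input):

titles = ["The cat sat", "The cat ran fast"]
vocab = get_vocabulary(titles)
# vocab -> {'the': 0, 'cat': 1, 'sat': 2, 'ran': 3, 'fast': 4}
table = get_frequency_table(titles, vocab)
# table[0] -> [1, 1, 1, 0, 0]   (counts for "The cat sat")
# table[1] -> [1, 1, 0, 1, 1]   (counts for "The cat ran fast")
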
def _test(text=None):
    if text is None:
        text = "What are the prerequisites for csc369?\n"
    tokenizer = RegexpTokenizer('[\w\d]+')
    word_tokens = tokenizer.tokenize(text)
    # TODO: how to tokenize 'u of t' and 'uoft'
    # TODO: use Bing Spell Check API
    tokens = [Token(tk) for tk in word_tokens]
    ser = RegexpEntityRecognizer()
    ser.process(tokens)

def tokenize(data):
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(d) for d in data]

def build_lda_model(self, data, docs, n_topics=5):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    for d in data:
        raw = d.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = self.remove_stopwords(tokens)
        stemmed_tokens = stopped_tokens
        # stemmer = PorterStemmer()
        # stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]
        texts.append(stemmed_tokens)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                         num_topics=n_topics)
    index = similarities.MatrixSimilarity(corpus)

    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)

    return dictionary, texts, lda_model

def preprocess(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return tokens

def preprocess_imageclef(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)

def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)

def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Return the top bi-grams (by likelihood ratio) from a group of documents

    :param statements: list of strings
    :param freq_filter: minimum # of appearances for a bi-gram to be kept
    :param nbest: number of bi-grams to rank by likelihood ratio
    """
    words = list()
    print('Generating word list...')
    # tokenize sentences into words
    for statement in statements:
        # remove non-words
        tokenizer = RegexpTokenizer(r'\w+')
        words.extend(tokenizer.tokenize(statement))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)

    # only bi-grams that appear n+ times
    bigram_finder.apply_freq_filter(freq_filter)

    # TODO: use custom stop words
    bigram_finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in nltk.corpus.stopwords.words('english'))

    bigram_results = bigram_finder.nbest(bigram_measures.likelihood_ratio, nbest)

    return bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)
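
A hypothetical call (requires the NLTK 'stopwords' corpus to be downloaded); score_ngrams() returns ((word1, word2), score) pairs sorted by likelihood ratio:

docs = ["machine learning models need data"] * 5 + ["deep learning needs more data"] * 5
for bigram, score in get_bigram_likelihood(docs, freq_filter=3)[:3]:
    print(bigram, score)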