The following 50 code examples, extracted from open source Python projects, illustrate how to use nltk.corpus.stopwords.words().
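Before the project examples, here is a minimal, self-contained sketch of the basic call (it assumes the NLTK stopword corpus has already been downloaded via nltk.download('stopwords'); the sample sentence and variable names are illustrative only):

from nltk.corpus import stopwords

# Load the English stopword list once; a set makes membership checks O(1).
stop_words = set(stopwords.words('english'))

# Filter stopwords out of a tokenized sentence.
tokens = "this is a simple example sentence".split()
content_tokens = [t for t in tokens if t not in stop_words]
print(content_tokens)  # -> ['simple', 'example', 'sentence']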
def SpeechToText():
    r = sr.Recognizer()
    # Speech recognition
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    message = r.recognize_google(audio)
    print("Check: " + message)
    try:
        print("User: " + r.recognize_google(audio))
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
    return message

# function to find importance of words to use them to deduce that which thing is being asked more
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def collection_stats():
    # list of documents
    documents_stat = reuters.fileids()
    print(str(len(documents_stat)) + " documents")

    train_docs_stat = list(filter(lambda doc: doc.startswith("train"), documents_stat))
    print(str(len(train_docs_stat)) + " total training documents")

    test_docs_stat = list(filter(lambda doc: doc.startswith("test"), documents_stat))
    print(str(len(test_docs_stat)) + " total test documents")

    # list of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # get the documents in a category
    category_docs = reuters.fileids("acq")

    # words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # print the raw document
    print(reuters.raw(document_id))
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
def get_user_to_word_proportion(user_to_text, word):
    """
    Maps each user to the proportion of his words that consist of a
    specified word.
    """
    user_to_word_proportion = {}
    for user in user_to_text:
        lm = LanuageModel(user_to_text[user])
        n_tokens = len(lm.lowercase_tokens)
        if n_tokens > 0:
            fd = nltk.FreqDist(lm.lowercase_tokens)
            user_to_word_proportion[user] = fd[word] / float(n_tokens)
        else:
            user_to_word_proportion[user] = 0.0
        print 'Finished user {}'.format(user.encode('utf-8'))
    return user_to_word_proportion
def generate(cfd, start_word, n):
    word = start_word
    words = []
    for i in range(n):
        words.append(word)
        # word = cfd[word].max()
        fd = cfd[word]
        n_next_words = sum(fd.values())
        if n_next_words > 0:
            probabilities = [fd[w] / float(n_next_words) for w in sorted(fd.keys())]
            word = choice(sorted(fd.keys()), p=probabilities)
        else:
            # Pick random word
            old_word = word
            # TODO: use unigram probabilities later
            word = choice(cfd.keys())
            words.append(word)
    sentence = ' '.join(words)
    # TODO: modify above for punctuation
    return sentence
def rm_stop_words(data, mode="nltk", silent=1):
    """
    Input: data is a set, {} or Counter
    """
    if silent == 0:
        print("remove stop words ...")
    if mode == "nltk":
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
    else:
        print("unknown mode", mode)
        assert 0
    if isinstance(data, list):
        data = [i for i in data if i.lower() not in stop_words]
        return data
    else:
        for word in stop_words:
            if word in data:
                del data[word]
def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #         re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #         not token.isdigit() and not token in stop_words]
def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
    """
    threshold only take effects when topn is None.
    words are indexed by overall frequency in the dataset.
    """
    word_freq = sorted(word_freq.iteritems(), key=lambda d: d[1], reverse=True)
    if topn:
        word_freq = zip(*word_freq[:topn])[0]
        vocab_dict = dict(zip(word_freq, range(start_idx, len(word_freq) + start_idx)))
    else:
        idx = start_idx
        vocab_dict = {}
        for word, freq in word_freq:
            if freq < threshold:
                return vocab_dict
            vocab_dict[word] = idx
            idx += 1
    return vocab_dict
def bigrams(words, join_string, skip=0):
    """
    Input: a list of words, e.g., ["I", "am", "Denny"]
    Output: a list of bigram, e.g., ["I_am", "am_Denny"]
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            for k in range(1, skip + 2):
                if i + k < L:
                    lst.append(join_string.join([words[i], words[i + k]]))
    else:
        # set it as unigram
        lst = NgramUtil.unigrams(words)
    return lst
def trigrams(words, join_string, skip=0):
    """
    Input: a list of words, e.g., ["I", "am", "Denny"]
    Output: a list of trigram, e.g., ["I_am_Denny"]
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L - 2):
            for k1 in range(1, skip + 2):
                for k2 in range(1, skip + 2):
                    if i + k1 < L and i + k1 + k2 < L:
                        lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
    else:
        # set it as bigram
        lst = NgramUtil.bigrams(words, join_string, skip)
    return lst
def biterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
    Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            for j in range(i + 1, L):
                lst.append(join_string.join([words[i], words[j]]))
    else:
        # set it as uniterm
        lst = NgramUtil.uniterms(words)
    return lst
def triterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
    Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in xrange(L - 2):
            for j in xrange(i + 1, L - 1):
                for k in xrange(j + 1, L):
                    lst.append(join_string.join([words[i], words[j], words[k]]))
    else:
        # set it as biterm
        lst = NgramUtil.biterms(words, join_string)
    return lst
def fourterms(words, join_string):
    """
    Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
    Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
    """
    assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        for i in xrange(L - 3):
            for j in xrange(i + 1, L - 2):
                for k in xrange(j + 1, L - 1):
                    for l in xrange(k + 1, L):
                        lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
    else:
        # set it as triterm
        lst = NgramUtil.triterms(words, join_string)
    return lst
def ngrams(words, ngram, join_string=" "):
    """
    wrapper for ngram
    """
    if ngram == 1:
        return NgramUtil.unigrams(words)
    elif ngram == 2:
        return NgramUtil.bigrams(words, join_string)
    elif ngram == 3:
        return NgramUtil.trigrams(words, join_string)
    elif ngram == 4:
        return NgramUtil.fourgrams(words, join_string)
    elif ngram == 12:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram
def build_vocabulary(words, max_size):
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size - 2))
    vocabulary = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))

    # start at 2 to leave room for padding & unknown
    pb = Progress_bar(len(d) - 1)
    for i, (key, value) in enumerate(vocabulary.items(), start=2):
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()
    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1

    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
    return vocab
def tokenize_text(sample_text):
    global sequence_lengths
    processed_text = []

    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text

    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')

    if cfg.remove_stopwords:
        tokens = [w for w in tokens if not w in stopwords.words('english')]

    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)

    return processed_text
def __init__(self, min_cut=0.1, max_cut=0.9):
    # indentation changes - we are inside the constructor
    # here we set up the behaviour
    # this is called each time an object of the freq summ class is
    # created or instantiated
    self._min_cut = min_cut  # self = keyword that refers to the instance
    self._max_cut = max_cut
    # we save the values of the 2 parameters passed by assigning them to
    # two member variables - the 'self.' prefix identifies them as part
    # of the self argument - using underscore as first char.
    self._stopwords = set(stopwords.words('english') + list(punctuation))
    # this is a list of all common words and punctuation symbols

# indentation changes - we are out of the constructor here.
# This is still the body of the class.
# Defining a var here (outside a member function) but within the class
# makes the member var STATIC. This means it belongs to the class, and not
# to any specific individual instance (object) of the class
def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]   # extract the text
    title = article[1]  # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(s.lower()) for s in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, minus stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
def similarity(c1, c2):
    '''stop words are words like "it" and "the", that have no massive impact on the sentence'''
    stop_words = list(stopwords.words("english"))

    # Remove stop words in both sentences
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]

    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words

    similarity_between_words = 0
    for key, val in total_words.items():
        # Looks at whether the two articles share a word
        if total_words[key] > 1:
            similarity_between_words += 1

    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
def _answer_stop_word_density(self, row):
    """Percentage of tokens in the answer that are stopwords

    - Args:
        row(pandas.dataframe): input row vector
    - Returns:
        row(pandas.dataframe): output vector with new feature
    """
    stop = stopwords.words('english')
    answer = row.Answer
    if answer:
        tokens = answer.split()
        num_tokens = len(tokens)
        stop_word_in_answer = [i for i in tokens if i in stop]
        num_stop_word_in_answer = len(stop_word_in_answer)
        row['ANSWER_STOPWORD_DENSITY'] = float(num_stop_word_in_answer) / num_tokens
        return row
    else:
        row['ANSWER_STOPWORD_DENSITY'] = 0
        return row
def _answer_quantifier_density(self, row):
    """Percentage of tokens in the answer that are quantifier words

    - Args:
        row(pandas.dataframe): input pandas dataframe
    - Returns:
        row(pandas.dataframe): result a pandas dataframe with new feature
    """
    answer = row.Answer
    if answer:
        tokens = answer.split()
        answer_len = len(tokens)
        quantifier_tokens = [i for i in tokens if i in ling.QUANTIFIER_WORDS]
        quantifier_tokens_len = len(quantifier_tokens)
        row['ANSWER_QUANTIFIER_DENSITY'] = float(quantifier_tokens_len) / answer_len
        return row
    else:
        row['ANSWER_QUANTIFIER_DENSITY'] = 0
        return row
def _percentage_capitalized_word_in_answer(self, row):
    """Percentage of capitalized words in the sentence that are in the answer

    - Args:
        row(pandas.dataframe): input pandas dataframe
    - Returns:
        row(pandas.dataframe): result a pandas dataframe with new feature
    """
    answer = row.Answer
    sentence = row.Sentence
    if answer is not None and sentence is not None:
        tokens = sentence.split()
        num_tokens = len(tokens)
        cap_tokens = [i for i in tokens if i.isupper() == True]
        cap_tokens_in_answer = [i for i in cap_tokens if i in answer]
        row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = float(len(cap_tokens_in_answer)) / num_tokens
        return row
    else:
        row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = 0
        return row
def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
    """
    Get overlap, idf weighted overlap, overlap excluding stopwords,
    and idf weighted overlap excluding stopwords.
    """
    stoplist = set(stopwords.words('english'))
    num_docs = len(sent_list_1)
    overlap_feats = []

    for s1, s2 in zip(sent_list_1, sent_list_2):
        tokens_a_set, tokens_b_set = set(s1), set(s2)
        intersect = tokens_a_set & tokens_b_set
        overlap = len(intersect) / (len(tokens_a_set) + len(tokens_b_set))
        idf_intersect = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect)
        idf_weighted_overlap = idf_intersect / (len(tokens_a_set) + len(tokens_b_set))

        tokens_a_set_no_stop = set(w for w in s1 if w not in stoplist)
        tokens_b_set_no_stop = set(w for w in s2 if w not in stoplist)
        intersect_no_stop = tokens_a_set_no_stop & tokens_b_set_no_stop
        overlap_no_stop = len(intersect_no_stop) / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        idf_intersect_no_stop = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect_no_stop)
        idf_weighted_overlap_no_stop = idf_intersect_no_stop / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))

        overlap_feats.append([overlap, idf_weighted_overlap, overlap_no_stop, idf_weighted_overlap_no_stop])

    return overlap_feats
def get_similar_documents_for_query(model_id, text):
    """
    Return documents similar to the query or an empty set if an error occurs
    or the query has no words after preprocessing
    :param model_id:
    :param text:
    :return:
    """
    model = db_utils.get_model(model_id)
    topics_assignment = assign_topics_for_query(model_id, text)
    if len(topics_assignment) != 0:
        topics_vector = transform_topics_assignment_from_lda_to_vector(model['number_of_topics'],
                                                                       topics_assignment[0])
        # print(topics_vector)
        return get_similar_documents_by_vector(model_id, topics_vector)
    else:
        return []
def get_binary(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                  average=False,
                                                  class_weight=None,
                                                  epsilon=0.1,
                                                  eta0=0.0,
                                                  fit_intercept=True,
                                                  l1_ratio=0.15,
                                                  learning_rate='optimal',
                                                  loss='log',
                                                  n_iter=10,
                                                  n_jobs=1,
                                                  penalty='l2',
                                                  power_t=0.5,
                                                  random_state=None,
                                                  shuffle=True,
                                                  verbose=0,
                                                  warm_start=False)))
    ])
def get_sgdc(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', SGDClassifier(alpha=0.0001,
                              average=False,
                              class_weight=None,
                              epsilon=0.1,
                              eta0=0.0,
                              fit_intercept=True,
                              l1_ratio=0.15,
                              learning_rate='optimal',
                              loss='log',
                              n_iter=10,
                              n_jobs=1,
                              penalty='l2',
                              power_t=0.5,
                              random_state=None,
                              shuffle=True,
                              verbose=0,
                              warm_start=False))
    ])
def wash(fileList):
    # denyPos = ['CC', 'CD', 'DT', 'TO', '']
    st = LancasterStemmer()
    for f in tqdm(fileList):
        fr = open('./washFile/' + f, 'r')
        fw = open("./washFile_stem/" + f, 'w')
        for line in fr.read().splitlines():
            line = remove_punctuation(line).lower()
            # wordpos = pos(remove_punctuation(line).lower())
            # for turple in wordpos:
            #     if (turple[0] not in stopwords.words('english')):
            #         fw.write(turple[0] + ' ')
            # fw.write(x + ' ' for x in line.split() if x not in stopwords.words('english'))
            # stopw = stopwords.words('english')
            words = [x for x in line.split()]
            for x in words:
                try:
                    fw.write(st.stem(x) + ' ')
                except:
                    print x
        fr.close()
        fw.close()
def count_entries(file_list):
    """Performs a count of the number of words in the corpus

    Args:
        file_list (list): list of file names.

    Returns:
        list: A list of json objects containing the count per file name
    """
    result = []
    for obj in file_list:
        with open(CSV_PATH + obj + '.csv', "r") as entry:
            reader = csv.reader(entry, delimiter=",")
            col_count = len(reader.next())
            res = {"Filename": obj, "Count": col_count}
            result.append(res)
    return result
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]

    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]

    # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Now create the dictionary
    word_dict = {}
    # For each word that we want in the dictionary, add it, then make it
    # the value of the prior dictionary length
    for word, word_count in count:
        word_dict[word] = len(word_dict)

    return word_dict

# Turn text data into lists of integers from dictionary
def load_text_vec(alphabet, filename="", embedding_size=100):
    vectors = {}
    with open(filename) as f:
        i = 0
        for line in f:
            i += 1
            if i % 100000 == 0:
                print 'epoch %d' % i
            items = line.strip().split(' ')
            if len(items) == 2:
                vocab_size, embedding_size = items[0], items[1]
                print (vocab_size, embedding_size)
            else:
                word = items[0]
                if word in alphabet:
                    vectors[word] = items[1:]
    print 'embedding_size', embedding_size
    print 'done'
    print 'words found in word2vec embedding ', len(vectors.keys())
    return vectors
def add_list_of_words_in_w2v_model(self, unknown_words):
    huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
    current_w2v_model_file = open(self.w2v_model_path, "a")
    line = huge_w2v_model_file.readline()
    unknown_words_left = len(unknown_words)
    while line and unknown_words_left:
        word = line.split()[0]
        if word in unknown_words:
            current_w2v_model_file.write(line)
            unknown_words = unknown_words - set([word])
            unknown_words_left -= 1
        line = huge_w2v_model_file.readline()
    for word in list(unknown_words):
        random_position = random(self.w2v_model.vector_size) * 2 - 1
        current_w2v_model_file.write(" ".join(([word] + [str(x) for x in random_position])))
        print "warning random positions introduced for new words ... in the future this should be solved"
    current_w2v_model_file.close()
    huge_w2v_model_file.close()
def extract_NPs(chunk):
    """
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    we extract the NPs with stop and location words filtered out, and return a list of noun phrases.
    """
    forbid_wds = stop_words + location_words
    NPs = []
    for phrase, ptype in chunk:
        if ptype == 'NP':
            filtered_wds = []
            for wd in phrase.split():
                if wd not in forbid_wds:
                    filtered_wds += [wd]
            if len(' '.join(filtered_wds)) > 0:
                NPs += [' '.join(filtered_wds)]
    return NPs
def extract_NNs(chunk, pos):
    """
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    and pos [(word, pos)], e.g., [('man', 'NN')],
    we extract from the NPs, with stop, location, color, and size words filtered out,
    and return a list of NN words only.
    """
    forbid_wds = stop_words + location_words + color_words + size_words
    NNs = []
    for phrase, ptype in chunk:
        if ptype == 'NP':
            filtered_wds = []
            for wd in phrase.split():
                wd_pos = [p[1] for p in pos if p[0] == wd][0]
                if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD':
                    # we don't need JJ nor CD words either
                    filtered_wds += [wd]
            if len(' '.join(filtered_wds)) > 0:
                NNs += [' '.join(filtered_wds)]
    return NNs
def process_text(self, text):
    flags = (UNICODE if sys.version < '3' and type(text) is unicode else 0)
    regexp = self.regexp if self.regexp is not None else r"\w[\w']+"

    words = findall(regexp, text, flags)
    # remove stopwords
    words = [word for word in words]
    # remove 's
    words = [word[:-2] if word.lower().endswith("'s") else word for word in words]
    # remove numbers
    words = [word for word in words if not word.isdigit()]

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nl.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))

    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(KaggleWord2VecUtility.review_to_wordlist(raw_sentence,
                                                                      remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in set(document)
    return features
def __init__(self, w=20, k=10,
             similarity_method=BLOCK_COMPARISON,
             stopwords=None,
             smoothing_method=DEFAULT_SMOOTHING,
             smoothing_width=2,
             smoothing_rounds=1,
             cutoff_policy=HC,
             demo_mode=False):

    if stopwords is None:
        from nltk.corpus import stopwords
        stopwords = stopwords.words('english')
    self.__dict__.update(locals())
    del self.__dict__['self']
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()

    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
def from_words(cls, words, window_size=3):
    """Construct a TrigramCollocationFinder for all trigrams in the given
    sequence.
    """
    if window_size < 3:
        raise ValueError("Specify window_size at least 3")

    wfd = FreqDist()
    wildfd = FreqDist()
    bfd = FreqDist()
    tfd = FreqDist()
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        for w2, w3 in _itertools.combinations(window[1:], 2):
            wfd[w1] += 1
            if w2 is None:
                continue
            bfd[(w1, w2)] += 1
            if w3 is None:
                continue
            wildfd[(w1, w3)] += 1
            tfd[(w1, w2, w3)] += 1
    return cls(wfd, bfd, wildfd, tfd)