The following 4 code examples, extracted from open-source Python projects, illustrate how to use nltk.stem.LancasterStemmer().
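Before the project examples, here is a minimal sketch of the API they all build on: LancasterStemmer() is constructed without arguments, and its stem() method maps a single word to an aggressively shortened stem.

from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()
for word in ["running", "maximum", "friendship"]:
    print(word, "->", stemmer.stem(word))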
from nltk import stem
from nltk.corpus import stopwords

def preprocessor_data(data, ids, test=0):
    # Stopword removal + Lancaster stemming for lines of the form "<label> <word> <word> ...".
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    data_in_preprocessed = []
    labels = []
    for line in data:
        words_preprocessed = []
        line = line.lower()  # keep the return value; str.lower() does not modify in place
        label, words = line.split()[0], line.split()[1:]
        labels.append(int(label))
        for word in words:
            if word in stopwords_set:
                continue
            lemmatized = stemmer.stem(word)
            if test == 0:
                # Register the stem in the vocabulary map; ids is expected to assign an index on first lookup.
                ids[lemmatized]
            words_preprocessed.append(lemmatized)
        data_in_preprocessed.append(words_preprocessed)
    return data_in_preprocessed, labels
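A hedged usage sketch for the function above: each input line is assumed to start with an integer label followed by raw tokens, and ids is assumed to be a vocabulary map that assigns an index to every new stem on first lookup (a self-growing defaultdict matches that access pattern). The sample lines are made up for illustration.

from collections import defaultdict

ids = defaultdict(lambda: len(ids))  # hypothetical vocabulary map: new stem -> next free integer ID
sample = ["1 cats are chasing mice", "0 the weather was uneventful"]
features, labels = preprocessor_data(sample, ids)
print(labels)    # [1, 0]
print(features)  # stemmed, stopword-filtered tokens, one list per input line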
from collections import defaultdict
from nltk import stem

def getFeature(word_list):
    # Bag-of-stems feature counts; isStopWords() is a project-specific helper not shown here.
    stemmer = stem.LancasterStemmer()
    # stemmer2 = stem.PorterStemmer()
    feature = defaultdict(lambda: 0)
    for word in word_list:
        if not isStopWords(word):
            word_stem = stemmer.stem(word)
            feature[word_stem] += 1
    return dict(feature)
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
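stem_text() depends on a tokenize_text() helper that is not part of this excerpt; assuming it is a plain word tokenizer, nltk.word_tokenize can stand in for it, as in this sketch:

from nltk import word_tokenize

def tokenize_text(text):
    # Hypothetical stand-in for the helper the example assumes: simple word tokenization.
    return word_tokenize(text)

print(stem_text("Stemming reduces related words to a common stem"))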
from nltk import stem
from nltk.corpus import stopwords

def preprocessor_words(words):
    # Drop English stopwords and stem everything else with the Lancaster stemmer.
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()
    words_preprocessed = []
    for word in words:
        if word in stopwords_set:
            continue
        lemmatized = stemmer.stem(word)
        words_preprocessed.append(lemmatized)
    return words_preprocessed