我们从Python开源项目中,提取了以下14个代码示例,用于说明如何使用nltk.stem.SnowballStemmer()。
def snowball(inputpath=None, text=None): """ docstring """ data = '' sb = SnowballStemmer('english') if inputpath: filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)] sbstemmed_list = [] for file in filenames: with open(file, 'r') as f: data = f.read() if data: texts = data.split(',') stemmedfile = [] for text in texts: sbstemmed = sb.stem(text) stemmedfile.append(sbstemmed) sbstemmed_list.extend(stemmedfile) return sbstemmed_list if text: sbstemmed = sb.stem(text) return sbstemmed
def snowball_stem(word, lang='english'): global _nltk_snowball_stemmer try: _nltk_snowball_stemmer except NameError: available_languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'german2', 'hungarian', 'italian', 'kraaij_pohlmann', 'lovins', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish'] assert lang in available_languages, "Snowball Stemmer for {} not available".format(lang) # Checks that the snowball data was previously downloaded. download('snowball_data', quiet=True) _nltk_snowball_stemmer = SnowballStemmer(lang) return _nltk_snowball_stemmer.stem(word)
def _transform(self, dataset): opinion_stemm = SnowballStemmer('english') udfStemmer = udf(lambda tokens: [opinion_stemm.stem(word) for word in tokens], ArrayType(StringType())) inCol = self.getInputCol() outCol = self.getOutputCol() return dataset.withColumn(outCol, udfStemmer(inCol))
def _set_treetagger(self, language): import treetaggerwrapper as ttw try: self._tagger = ttw.TreeTagger(TAGLANG=language) self.morphy = self._treetagger_morphy except ttw.TreeTaggerError: raise(ImportError) # sets SnowballStemmer as the morphology analyzer
def _set_snowball(self, language): from nltk.stem import SnowballStemmer self._stemmer = SnowballStemmer(_langmap[language]) self.morphy = self._snowball_morphy # sets custom morphology analyzer
def get_stemmer(language, stemmers={}): if language in stemmers: return stemmers[language] from nltk.stem import SnowballStemmer try: stemmers[language] = SnowballStemmer(language) except Exception: stemmers[language] = 0 return stemmers[language]
def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Clean the text, with the option to remove stopwords and to stem words. # Convert words to lower case and split them text = text.lower().split() # Optionally, remove stop words if remove_stopwords: stops = set(stopwords.words("english")) text = [w for w in text if not w in stops] text = " ".join(text) # Clean the text text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) text = re.sub(r"what's", "what is ", text) text = re.sub(r"\'s", " ", text) text = re.sub(r"\'ve", " have ", text) text = re.sub(r"can't", "cannot ", text) text = re.sub(r"n't", " not ", text) text = re.sub(r"i'm", "i am ", text) text = re.sub(r"\'re", " are ", text) text = re.sub(r"\'d", " would ", text) text = re.sub(r"\'ll", " will ", text) text = re.sub(r",", " ", text) text = re.sub(r"\.", " ", text) text = re.sub(r"!", " ! ", text) text = re.sub(r"\/", " ", text) text = re.sub(r"\^", " ^ ", text) text = re.sub(r"\+", " + ", text) text = re.sub(r"\-", " - ", text) text = re.sub(r"\=", " = ", text) text = re.sub(r"'", " ", text) text = re.sub(r"(\d+)(k)", r"\g<1>000", text) text = re.sub(r":", " : ", text) text = re.sub(r" e g ", " eg ", text) text = re.sub(r" b g ", " bg ", text) text = re.sub(r" u s ", " american ", text) text = re.sub(r"\0s", "0", text) text = re.sub(r" 9 11 ", "911", text) text = re.sub(r"e - mail", "email", text) text = re.sub(r"j k", "jk", text) text = re.sub(r"\s{2,}", " ", text) # Optionally, shorten words to their stems if stem_words: text = text.split() stemmer = SnowballStemmer('english') stemmed_words = [stemmer.stem(word) for word in text] text = " ".join(stemmed_words) # Return a list of words return (text)
def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Convert words to lower case and split them text = str(text).lower().split() # Optionally, remove stop words if remove_stopwords: stops = set(stopwords.words("english")) text = [w for w in text if not w in stops] text = " ".join(text) # Clean the text text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) text = re.sub(r"what's", "what is ", text) text = re.sub(r"\'s", " ", text) text = re.sub(r"\'ve", " have ", text) text = re.sub(r"can't", "cannot ", text) text = re.sub(r"n't", " not ", text) text = re.sub(r"i'm", "i am ", text) text = re.sub(r"\'re", " are ", text) text = re.sub(r"\'d", " would ", text) text = re.sub(r"\'ll", " will ", text) text = re.sub(r" e g ", " eg ", text) text = re.sub(r" b g ", " bg ", text) text = re.sub(r"e-mail", "email", text) text = re.sub(r"imrovement", "improvement", text) text = re.sub(r"intially", "initially", text) text = re.sub(r"demonitization", "demonetization", text) text = re.sub(r"actived", "active", text) text = re.sub(r",", " ", text) text = re.sub(r"\.", " ", text) text = re.sub(r"!", " ! ", text) text = re.sub(r"\/", " ", text) text = re.sub(r"\^", " ^ ", text) text = re.sub(r"\+", " + ", text) text = re.sub(r"\-", " - ", text) text = re.sub(r"\=", " = ", text) text = re.sub(r"'", " ", text) text = re.sub(r"(\d+)(k)", r"\g<1>000", text) text = re.sub(r":", " : ", text) text = re.sub(r" u s ", " american ", text) text = re.sub(r"\0s", "0", text) text = re.sub(r" 9 11 ", " 911 ", text) text = re.sub(r"e - mail", "email", text) text = re.sub(r"j k", "jk", text) text = re.sub(r"\s{2,}", " ", text) # Optionally, shorten words to their stems if stem_words: text = text.split() stemmer = SnowballStemmer('english') stemmed_words = [stemmer.stem(word) for word in text] text = " ".join(stemmed_words) # Return a list of words return(text)
def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Clean the text, with the option to remove stopwords and to stem words. # Convert words to lower case and split them text = text.lower().split() # Optionally, remove stop words if remove_stopwords: stops = set(stopwords.words("english")) text = [w for w in text if not w in stops] text = " ".join(text) # Clean the text text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) text = re.sub(r"what's", "what is ", text) text = re.sub(r"\'s", " ", text) text = re.sub(r"\'ve", " have ", text) text = re.sub(r"can't", "cannot ", text) text = re.sub(r"n't", " not ", text) text = re.sub(r"i'm", "i am ", text) text = re.sub(r"\'re", " are ", text) text = re.sub(r"\'d", " would ", text) text = re.sub(r"\'ll", " will ", text) text = re.sub(r",", " ", text) text = re.sub(r"\.", " ", text) text = re.sub(r"!", " ! ", text) text = re.sub(r"\/", " ", text) text = re.sub(r"\^", " ^ ", text) text = re.sub(r"\+", " + ", text) text = re.sub(r"\-", " - ", text) text = re.sub(r"\=", " = ", text) text = re.sub(r"'", " ", text) text = re.sub(r"(\d+)(k)", r"\g<1>000", text) text = re.sub(r":", " : ", text) text = re.sub(r" e g ", " eg ", text) text = re.sub(r" b g ", " bg ", text) text = re.sub(r" u s ", " american ", text) text = re.sub(r"\0s", "0", text) text = re.sub(r" 9 11 ", "911", text) text = re.sub(r"e - mail", "email", text) text = re.sub(r"j k", "jk", text) text = re.sub(r"\s{2,}", " ", text) # Optionally, shorten words to their stems if stem_words: text = text.split() # stemmer = SnowballStemmer('english') stemmer = PorterStemmer() stemmed_words = [] for word in text: try: stemmed_words.append(stemmer.stem(word)) except: print word stemmed_words.append(word) # stemmed_words = [stemmer.stem(word) for word in text] text = " ".join(stemmed_words) # Return a list of words return(text)
def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Clean the text, with the option to remove stopwords and to stem words. # Convert words to lower case and split them text = text.lower().split() # Optionally, remove stop words if remove_stopwords: stops = set(stopwords.words("english")) text = [w for w in text if not w in stops] text = " ".join(text) # Clean the text text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) text = re.sub(r"what's", "what is ", text) text = re.sub(r"\'s", " ", text) text = re.sub(r"\'ve", " have ", text) text = re.sub(r"can't", "cannot ", text) text = re.sub(r"n't", " not ", text) text = re.sub(r"i'm", "i am ", text) text = re.sub(r"\'re", " are ", text) text = re.sub(r"\'d", " would ", text) text = re.sub(r"\'ll", " will ", text) text = re.sub(r",", " ", text) text = re.sub(r"\.", " ", text) text = re.sub(r"!", " ! ", text) text = re.sub(r"\/", " ", text) text = re.sub(r"\^", " ^ ", text) text = re.sub(r"\+", " + ", text) text = re.sub(r"\-", " - ", text) text = re.sub(r"\=", " = ", text) text = re.sub(r"'", " ", text) text = re.sub(r"(\d+)(k)", r"\g<1>000", text) text = re.sub(r":", " : ", text) text = re.sub(r" e g ", " eg ", text) text = re.sub(r" b g ", " bg ", text) text = re.sub(r" u s ", " american ", text) text = re.sub(r"\0s", "0", text) text = re.sub(r" 9 11 ", "911", text) text = re.sub(r"e - mail", "email", text) text = re.sub(r"j k", "jk", text) text = re.sub(r"\s{2,}", " ", text) # Optionally, shorten words to their stems if stem_words: text = text.split() stemmer = SnowballStemmer('english') stemmed_words = [stemmer.stem(word) for word in text] text = " ".join(stemmed_words) # Return a list of words return(text)
def text_to_wordlist(text,remove_stopwords=False,stem_words=False): text = text.lower().split() if remove_stopwords: stops = set(stopwords.words('english')) text = [w for w in text if not w in stops] text = " ".join(text)#to str #clean the text text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text) text = re.sub(r"what's", "what is ", text) text = re.sub(r"\'s", " ", text) text = re.sub(r"\'ve", " have ", text) text = re.sub(r"can't", "cannot ", text) text = re.sub(r"n't", " not ", text) text = re.sub(r"i'm", "i am ", text) text = re.sub(r"\'re", " are ", text) text = re.sub(r"\'d", " would ", text) text = re.sub(r"\'ll", " will ", text) #punction replace text = re.sub(r",", " ", text) text = re.sub(r"\.", " ", text) text = re.sub(r"!", " ! ", text) text = re.sub(r"\/", " ", text) text = re.sub(r"\^", " ^ ", text)#change to 3 words text = re.sub(r"\+", " + ", text) text = re.sub(r"\-", " - ", text) text = re.sub(r"\=", " = ", text) text = re.sub(r"'", " ", text) text = re.sub(r"60k", " 60000 ", text) #text = re.sub(r"(\d+)(k)", r"\g<1>000", text) text = re.sub(r":", " : ", text) text = re.sub(r" e g ", " eg ", text) text = re.sub(r" b g ", " bg ", text) text = re.sub(r" u s ", " american ", text) text = re.sub(r"\0s", "0", text) text = re.sub(r" 9 11 ", "911", text) text = re.sub(r"e - mail", "email", text) text = re.sub(r"j k", "jk", text) text = re.sub(r"\s{2,}", " ", text) # Optionally, shorten words to their stems if stem_words: text = text.split() stemmer = SnowballStemmer('english') stemmed_words = [stemmer.stem(word) for word in text] text = " ".join(stemmed_words) # Return a list of words return (text)