The following 33 code examples, extracted from open-source Python projects, illustrate how to use nltk.stem.PorterStemmer().
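As a quick orientation before the extracted examples, here is a minimal usage sketch; the sample words are illustrative, and the stems shown in the comment are the expected output of NLTK's Porter stemmer in its default mode:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ["caresses", "ponies", "running", "happily"]:
    print(word, "->", stemmer.stem(word))
# caresses -> caress, ponies -> poni, running -> run, happily -> happili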
def __repr__(self):
    return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
def mk_feature():
    d = defaultdict(lambda: 0)
    stoplist = stopwords.words("english") + [",", ".", "!", "?", ";", ":", "\n", "\t", "(", ")", " ", ""]
    stemmer = stem.PorterStemmer()
    l = list()
    # count each stemmed, non-stopword token across the corpus
    for line in open("sentiment.txt", "r"):
        y = line.split(" ")[0]
        for item in line.strip("\n").split(" ")[1:]:
            item = stemmer.stem(item)
            if item not in stoplist:
                d[item] += 1
    # prune features that occur fewer than 5 times
    for key, value in d.items():
        if value < 5:
            l += [key]
    for key in l:
        del d[key]
    return d
def demo(): """ A demonstration of the porter stemmer on a sample from the Penn Treebank corpus. """ from nltk.corpus import treebank from nltk import stem stemmer = stem.PorterStemmer() orig = [] stemmed = [] for item in treebank.fileids()[:3]: for (word, tag) in treebank.tagged_words(item): orig.append(word) stemmed.append(stemmer.stem(word)) # Convert the results to a string, and word-wrap them. results = ' '.join(stemmed) results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip() # Convert the original to a string, and word wrap it. original = ' '.join(orig) original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip() # Print the results. print('-Original-'.center(70).replace(' ', '*').replace('-', ' ')) print(original) print('-Results-'.center(70).replace(' ', '*').replace('-', ' ')) print(results) print('*'*70)
def make_feature(file_name, flag):
    feature = defaultdict(int)
    stemmer = stem.PorterStemmer()
    with open(file_name) as f:
        for i, line in enumerate(f):
            print(i)
            y, x = line.split('\t')
            y = int(y)
            words = x.split()
            for word in words:
                if not stop_word_check(word):
                    word = stemmer.stem(word)
                    if flag == 0:
                        feature[word] += 1
                    elif flag == 1:
                        if y == 1:
                            feature[word] += 1
                        elif y == -1:
                            feature[word] -= 1
    return feature
def __init__(self):
    self.stemmer = NltkPorterStemmer()
def demo(): """ A demonstration of the porter stemmer on a sample from the Penn Treebank corpus. """ from nltk.corpus import treebank from nltk import stem stemmer = stem.PorterStemmer() orig = [] stemmed = [] for item in treebank.files()[:3]: for (word, tag) in treebank.tagged_words(item): orig.append(word) stemmed.append(stemmer.stem(word)) # Convert the results to a string, and word-wrap them. results = ' '.join(stemmed) results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip() # Convert the original to a string, and word wrap it. original = ' '.join(orig) original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip() # Print the results. print('-Original-'.center(70).replace(' ', '*').replace('-', ' ')) print(original) print('-Results-'.center(70).replace(' ', '*').replace('-', ' ')) print(results) print('*'*70) ##--NLTK--
def simple_stemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(w) for w in tokens]
    return stemmed_tokens
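For example, fed pre-tokenized text (assuming NLTK's punkt data is installed for word_tokenize), the expected output would be:

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

print(simple_stemming(word_tokenize("the ponies were running")))
# ['the', 'poni', 'were', 'run']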
def feature_making(sentence):
    stemming = stem.PorterStemmer()
    ans_list = list()
    for word in sentence:
        word = stemming.stem(word)
        if not stop(word):
            ans_list.append(word)
    return ans_list
def extract_feature(sentence):
    features = defaultdict(lambda: 0)
    stemmer = stem.PorterStemmer()
    for word in sentence.split():
        if not include_stopword(word):
            features[stemmer.stem(word)] += 1
    return features
def getFeature(word_list):
    stemmer = stem.LancasterStemmer()
    # stemmer2 = stem.PorterStemmer()
    feature = defaultdict(lambda: 0)
    for word in word_list:
        if not isStopWords(word):
            word_stem = stemmer.stem(word)
            feature[word_stem] += 1
    return dict(feature)
def mk_label(line, d, thre):
    import math
    from nltk import stem
    stemmer = stem.PorterStemmer()
    score = 0
    for item in line.strip("\n").split(" "):
        item = stemmer.stem(item)
        score += d[item]
    # logistic transform of the summed feature weights
    p_pos = math.exp(score) / (1 + math.exp(score))
    if p_pos > thre:
        return line.split(" ")[0] + "\t+1\t" + str(p_pos)
    else:
        return line.split(" ")[0] + "\t-1\t" + str(p_pos)
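A toy invocation, given the function above; the weight dict and its values are hypothetical, and d must tolerate unseen stems (e.g. a defaultdict):

from collections import defaultdict
from nltk import stem

weights = defaultdict(float, {"good": 2.0, "movi": 1.5})
print(mk_label("doc1 good movie", weights, 0.5))
# 'doc1\t+1\t0.9706...' since exp(3.5) / (1 + exp(3.5)) is approximately 0.9707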
def _preprocess(self, text):
    """Return a list of lists. Each list is a preprocessed sentence of
    text in bag-of-words format."""
    stemmer = PorterStemmer()
    self._sents = sent_tokenize(text)
    # tokenize sentences
    word_sents = [word_tokenize(sent.lower()) for sent in self._sents]
    # remove stop-words and stem words
    word_sents = [[stemmer.stem(word) for word in sent if word not in self._stopwords]
                  for sent in word_sents]
    return word_sents
def stemmer(text):
    '''Description: This function takes in the string of descriptions and returns a string with all words stemmed
    Parameters: String of descriptions
    Output: String with all words stemmed (ex. "meeting" and "meetings" to "meeting")'''
    stemmer = PorterStemmer()
    # Python 2: decode the byte string to unicode before splitting
    lis = unicode(str(text), 'utf-8').split(" ")
    stemmed_words = [str(stemmer.stem(word)) for word in lis]
    return " ".join(stemmed_words)
def __init__(self, mode=NLTK_EXTENSIONS):
    if mode not in (
        self.NLTK_EXTENSIONS,
        self.MARTIN_EXTENSIONS,
        self.ORIGINAL_ALGORITHM
    ):
        raise ValueError(
            "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
            "PorterStemmer.MARTIN_EXTENSIONS, or "
            "PorterStemmer.ORIGINAL_ALGORITHM"
        )

    self.mode = mode

    if self.mode == self.NLTK_EXTENSIONS:
        # This is a table of irregular forms. It is quite short,
        # but still reflects the errors actually drawn to Martin
        # Porter's attention over a 20 year period!
        irregular_forms = {
            "sky": ["sky", "skies"],
            "die": ["dying"],
            "lie": ["lying"],
            "tie": ["tying"],
            "news": ["news"],
            "inning": ["innings", "inning"],
            "outing": ["outings", "outing"],
            "canning": ["cannings", "canning"],
            "howe": ["howe"],
            "proceed": ["proceed"],
            "exceed": ["exceed"],
            "succeed": ["succeed"],
        }

        self.pool = {}
        for key in irregular_forms:
            for val in irregular_forms[key]:
                self.pool[val] = key

    self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
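A brief sketch of how the mode changes observable behavior; the contrast below relies on the irregular-forms table quoted above, so the outputs are expected values worth checking against your NLTK version:

from nltk.stem.porter import PorterStemmer

default = PorterStemmer()  # mode=PorterStemmer.NLTK_EXTENSIONS
original = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)

# "dying" is in the irregular-forms pool, so the NLTK mode returns "die";
# the original algorithm just strips the "-ing" suffix.
print(default.stem("dying"))   # die
print(original.stem("dying"))  # dy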
def __repr__(self):
    return '<PorterStemmer>'
def feature(sentence):
    features = []
    stemmer = PorterStemmer()
    for word in sentence:
        stem_word = stemmer.stem(word)
        if not stopwords(stem_word):
            features.append(stem_word)
    return features
def make_sentence_feature(sen, ids):
    sentence_feature = [0 for i in range(len(ids))]
    words = sen.split()
    stemmer = stem.PorterStemmer()
    for word in words:
        word = stemmer.stem(word)
        if word in ids:
            sentence_feature[ids[word]] += 1
    return sentence_feature
def stemming_porter():
    stemmer = stem.PorterStemmer()
    for word_stem in separate_word():
        yield (word_stem, stemmer.stem(word_stem))
def __init__(self, stemmer=None):
    '''
    @param stemmer: an object or module with a 'stem' method
        (defaults to nltk.stem.PorterStemmer)
    @returns: a new L{Stemmer} object
    '''
    if not stemmer:
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
    self.stemmer = stemmer
def preprocessing(text):
    # replace punctuation with spaces and collapse whitespace
    text2 = " ".join("".join([" " if ch in string.punctuation else ch
                              for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag)
                              for token, tag in tagged_corpus])

    return pre_proc_text
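A hedged usage sketch for the pipeline above; the imports reflect what the function body references (NLTK's stopwords, punkt, tagger, and wordnet data must be downloaded), and the commented output is the expected result rather than a verified one:

import string
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

print(preprocessing("The cats were running quickly across the gardens."))
# expected: 'cat run quickli across garden'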
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:
        # include only nouns and adjectives, per the only_N_J switch
        tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stemmer = PorterStemmer()
        terms = [stemmer.stem(t) for t in terms]
    return terms
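Given the function above, a minimal usage sketch; the keyword arguments act as on/off switches, since the body only tests them against None:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

terms = ['cats', 'running', 'beautiful', 'the', 'quickly']
print(clean_terms(terms, stopwords=set(stopwords.words('english')), stem=True))
# expected: ['cat', 'run', 'beauti', 'quickli']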