The following 13 code examples, extracted from open source Python projects, illustrate how to use nltk.PorterStemmer().
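Before the extracted examples, a minimal usage sketch (the sample words are illustrative; the stems shown in the comments assume NLTK's default Porter mode):

import nltk

stemmer = nltk.PorterStemmer()
print(stemmer.stem("running"))     # -> "run"
print(stemmer.stem("connection"))  # -> "connect"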
def test_stems(self):
    import nltk
    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stops = frozenset(nltk.corpus.stopwords.words('english'))
    tests = [("foo bar", ['foo', 'bar']),
             ("foo $1.23 is the bar", ['foo', 'bar']),
             ("a b c d", []),  # assume single char stems are useless
             ("ab cd", ['ab', 'cd']),
             ("-1.23 1.23 foo", ['foo']),
             ("-123 foo 123", ['foo']),
             ("8:12 12:34am foo", ['foo']),
             ("ab. foo, then bar", ['ab', 'foo', 'bar']),
             ("crying infants", ["cry", "infant"]),
             ("drop 12 all 3.45 the 0.123 numbers", ['drop', 'number'])]

    for test, exp in tests:
        obs = list(stems(stops, stemmer, test))
        self.assertEqual(obs, exp)
def clean_text(raw_text, filtered_word_types):
    """Clean raw text for bag-of-words model"""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)

    # Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))

    # Remove stop words if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if not wtype in filtered_word_types]

    # join together
    return " ".join(stemmed_words)
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    porter = nltk.PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)
    return " ".join(processed_tokens)

# Split to train and test sample sets:
def stemming(sentence):
    st = nltk.PorterStemmer()
    words = [st.stem(word.lower()) for word in
             re.sub("[\.\,\!\?;\:\(\)\[\]\'\"]$", '', sentence.rstrip()).split()]
    words = [word for word in words if word not in stopwords.words('english')]
    return words
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
def df_to_stems(df):
    """Convert a DataFrame to stem -> index associations

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame to index

    Returns
    -------
    dict
        {stem: {set of indices}}
    """
    from collections import defaultdict
    import functools
    import nltk

    # not using nltk default as we want this to be portable so that, for
    # instance, a javascript library can query
    stemmer = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stops = frozenset(nltk.corpus.stopwords.words('english'))
    stem_f = functools.partial(stems, stops, stemmer)

    d = defaultdict(set)
    for sample, row in df.iterrows():
        for value in row.values:
            for stem in stem_f(value):
                d[stem].add(sample)
    return dict(d)
def str_stemmer(s):
    return " ".join([nltk.PorterStemmer().stem_word(word) for word in s.lower().split()])
def setmword(word):
    return PorterStemmer().stem_word(word)
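Note that the two helpers above (and the tfidf example further down) call stem_word(), which appears to come from older NLTK releases; in NLTK 3.x the Porter stemmer exposes stem() instead, as the other examples on this page use. A minimal equivalent sketch, assuming NLTK 3.x (the helper name is illustrative):

from nltk.stem import PorterStemmer

def stem_single_word(word):
    # same idea as setmword() above, but using the stem() method
    # available in current NLTK releases
    return PorterStemmer().stem(word)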
def tweetMeaning(self, term):
    self.dbout = self.searcher(term)
    with open("data/words.json") as filedata:
        self.wordList = json.load(filedata)
    threading.Thread(target=self.dis.spinner, args=("Analysing Tweets ",)).start()
    self.tweetList = []
    for self.i in self.dbout:
        self.procounter = 0
        self.negcounter = 0
        for self.word in nltk.word_tokenize(self.i["tweet"]):
            #print("Analysing word: "+self.word)
            try:
                if nltk.PorterStemmer().stem(self.word) in self.wordList["good"]:
                    #print("Found good world")
                    self.procounter = + 1
                if nltk.PorterStemmer().stem(self.word) in self.wordList["bad"]:
                    #print("Found bad world")
                    self.negcounter = + 1
                # if nltk.PorterStemmer().stem(self.word) in self.wordList["swear"]:
                #     print("Found bad world")
                #     self.negcounter = + 1
                else:
                    self.neucounter = + 1
            except IndexError:
                print("Ignoring tweet:", self.i["tweet"])
        self.view = "unknown"
        if self.procounter > self.negcounter:
            self.view = "pro"
        if self.negcounter > self.procounter:
            self.view = "neg"
        self.tweetDict = {
            "id": self.i["_id"],
            "tweet": self.i["tweet"],
            "procount": self.procounter,
            "negcount": self.negcounter,
            # "view": "pro" if self.procounter > self.negcounter else "neg"
            "view": self.view
        }
        self.tweetList.append(self.tweetDict)
    self.dis.stop()
    return self.tweetList

# This method gets the poll data from the JSON file it is
# stored in, it then adds them up to get a total.
def processFile(file_name):
    # read file from provided folder path
    f = open(file_name, 'r')
    text_0 = f.read()

    # extract content in TEXT tag and remove tags
    text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
    text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
    text_1 = re.sub("\n</TEXT>", "", text_1)

    # replace all types of quotations by normal quotes
    text_1 = re.sub("\n", " ", text_1)
    text_1 = re.sub("\"", "\"", text_1)
    text_1 = re.sub("''", "\"", text_1)
    text_1 = re.sub("``", "\"", text_1)
    text_1 = re.sub(" +", " ", text_1)

    # segment data into a list of sentences
    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sentence_token.tokenize(text_1.strip())

    # setting the stemmer
    sentences = []
    porter = nltk.PorterStemmer()

    # modelling each sentence in file as sentence object
    for line in lines:
        # original words of the sentence before stemming
        originalWords = line[:]
        line = line.strip().lower()

        # word tokenization
        sent = nltk.word_tokenize(line)

        # stemming words
        stemmedSent = [porter.stem(word) for word in sent]
        stemmedSent = filter(lambda x: x != '.' and x != '`' and x != ',' and x != '?' and x != "'"
                             and x != '!' and x != '''"''' and x != "''" and x != "'s", stemmedSent)

        # list of sentence objects
        if stemmedSent != []:
            sentences.append(sentence.sentence(file_name, stemmedSent, originalWords))

    return sentences

#---------------------------------------------------------------------------------
# Description : Function to find the term frequencies of the words in the
#               sentences present in the provided document cluster
# Parameters  : sentences, sentences of the document cluster
# Return      : dictionary of word, term frequency score
#---------------------------------------------------------------------------------
def processFile(self, file_path_and_name):
    try:
        f = open(file_path_and_name, 'rb')
        text = f.read()

        # soup = BeautifulSoup(text,"html.parser")
        # text = soup.getText()
        # text = re.sub("APW19981212.0848","",text)
        # text = re.sub("APW19981129.0668","",text)
        # text = re.sub("NEWSWIRE","",text)

        text_1 = re.search(r"<TEXT>.*</TEXT>", text, re.DOTALL)
        text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
        text_1 = re.sub("\n</TEXT>", "", text_1)

        # replace all types of quotations by normal quotes
        text_1 = re.sub("\n", " ", text_1)
        text_1 = re.sub(" +", " ", text_1)
        # text_1 = re.sub("\'\'","\"",text_1)
        # text_1 = re.sub("\`\`","\"",text_1)

        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        lines = sent_tokenizer.tokenize(text_1.strip())
        text_1 = lines

        sentences = []
        porter = nltk.PorterStemmer()

        for sent in lines:
            OG_sent = sent[:]
            sent = sent.strip().lower()
            line = nltk.word_tokenize(sent)

            stemmed_sentence = [porter.stem(word) for word in line]
            stemmed_sentence = filter(lambda x: x != '.' and x != '`' and x != ',' and x != '?' and x != "'"
                                      and x != '!' and x != '''"''' and x != "''" and x != "'s", stemmed_sentence)

            if stemmed_sentence != []:
                sentences.append(sentence(file_path_and_name, stemmed_sentence, OG_sent))

        return sentences

    except IOError:
        print 'Oops! File not found', file_path_and_name
        return [sentence(file_path_and_name, [], [])]
def get_features(df_features):
    # now = datetime.datetime.now()
    # print now.strftime('%Y-%m-%d %H:%M:%S')
    # print "matchnouns"
    # df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    # df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    # #df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  #takes long
    # df_features['z_noun_match'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1_nouns, r.question2_nouns), axis=1)

    # now = datetime.datetime.now()
    # print now.strftime('%Y-%m-%d %H:%M:%S')
    # print "matchverb"
    # df_features['question1_verbs'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[0] == 'V' and t[1] == 'B'])
    # df_features['question2_verbs'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[0] == 'V' and t[1] == 'B'])
    # #df_features['z_verb_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_verbs if w in r.question2_verbs]), axis=1)  #takes long
    # df_features['z_verb_match'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1_verbs, r.question2_verbs), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print "stem_tfidf"
    df_features['q1_stem'] = df_features.question1.map(lambda x: [w for w in nltk.PorterStemmer().stem_word(str(x).lower()).split(' ')])
    df_features['q2_stem'] = df_features.question2.map(lambda x: [w for w in nltk.PorterStemmer().stem_word(str(x).lower()).split(' ')])
    #df_features['z_adj_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_adjs if w in r.question2_adjs]), axis=1)  #takes long
    df_features['z_stem_tfidf'] = df_features.apply(lambda r: tfidf_word_match_share(r.q1_stem, r.q2_stem), axis=1)

    now = datetime.datetime.now()
    # print now.strftime('%Y-%m-%d %H:%M:%S')
    # print('w2v tfidf...')
    # df_features['z_tfidf_w2v'] = df_features.apply(lambda r: tfidf_word_match_share(r.question1.tolist(), r.question2.tolist()), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  #takes long

    print('lengths...')
    df_features['z_len1'] = df_features.question1.map(lambda x: len(str(x)))
    df_features['z_len2'] = df_features.question2.map(lambda x: len(str(x)))
    df_features['z_word_len1'] = df_features.question1.map(lambda x: len(str(x).split()))
    df_features['z_word_len2'] = df_features.question2.map(lambda x: len(str(x).split()))

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print('difflib...')
    df_features['z_match_ratio'] = df_features.apply(lambda r: diff_ratios(r.question1, r.question2), axis=1)  #takes long

    print('word match...')
    df_features['z_word_match'] = df_features.apply(word_match_share, axis=1, raw=True)

    print('tfidf...')
    df_features['z_tfidf_sum1'] = df_features.question1.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_sum2'] = df_features.question2.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean1'] = df_features.question1.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean2'] = df_features.question2.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len1'] = df_features.question1.map(lambda x: len(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len2'] = df_features.question2.map(lambda x: len(tfidf.transform([str(x)]).data))

    return df_features.fillna(0.0)