The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.stem().
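Before the project examples, here is a minimal, self-contained sketch of the main classes exposed by nltk.stem (PorterStemmer, SnowballStemmer, WordNetLemmatizer). It is illustrative only and is not taken from any of the projects below; the WordNet lemmatizer additionally requires the 'wordnet' corpus.

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# nltk.download('wordnet')               # needed once for the lemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

print(porter.stem('running'))            # 'run'
print(snowball.stem('meetings'))         # 'meet'
print(lemmatizer.lemmatize('geese'))     # 'goose'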
def master_clean(df, column, html, email, punc, non_ascii, stopwords, number,
                 nonenglish, stemorlem):
    # apply the selected cleaning steps to one text column, then stem or lemmatize it
    if punc:
        df[column] = df[column].apply(remove_punc).to_frame()
    if html:
        df[column] = df[column].apply(remove_html).to_frame()
    if email:
        df[column] = df[column].apply(remove_email).to_frame()
    if non_ascii:
        df[column] = df[column].apply(remove_non_ascii).to_frame()
    if stopwords:
        df[column] = df[column].apply(remove_stop).to_frame()
    if number:
        df[column] = df[column].apply(remove_numbers).to_frame()
    if nonenglish:
        # assumes a module-level remove_nonenglish() cleaner, matching the other remove_* helpers
        df[column] = df[column].apply(remove_nonenglish).to_frame()
    if stemorlem == 'stem':
        df[column] = df[column].apply(stemmer).to_frame()
    elif stemorlem == 'lem':
        df[column] = df[column].apply(lemmatizer).to_frame()
    return df
def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for twitter in content:
        words_set += (word_tokenizer.tokenize(twitter['twitter_content']))
    words_set = list(set(words_set))
    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # keep only alphabetic words that are neither punctuation nor stop words
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))

    # training whole set
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
def resource_similarity_score_via_word_net_1(need_res_set, offer_tweet_list):
    if len(need_res_set) == 0:
        return 0
    value = 0
    offer_res_list = []
    for i in offer_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                offer_res_list.append(stemmer.stem(j.lower()))
    for word in need_res_set:
        temp = get_similarity_score_1(word, offer_res_list)
        if temp > 0.6:
            value = value + temp
    return value / len(need_res_set)
def __init__(self, string, stem=None, rating=1.0, proper=False, terminal=False):
    '''
    @param string: the actual representation of the tag
    @param stem: the internal (usually stemmed) representation;
                 tags with the same stem are regarded as equal
    @param rating: a measure of the tag's relevance in the interval [0,1]
    @param proper: whether the tag is a proper noun
    @param terminal: set to True if the tag is at the end of a phrase
                     (or anyway it cannot be logically merged to the following one)
    @returns: a new L{Tag} object
    '''

    self.string = string
    self.stem = stem or string
    self.rating = rating
    self.proper = proper
    self.terminal = terminal
def __init__(self, tail, head=None):
    '''
    @param tail: the L{Tag} object to add to the first part (head)
    @param head: the (eventually absent) L{MultiTag} to be extended
    @returns: a new L{MultiTag} object
    '''

    if not head:
        Tag.__init__(self, tail.string, tail.stem, tail.rating,
                     tail.proper, tail.terminal)
        self.size = 1
        self.subratings = [self.rating]
    else:
        self.string = ' '.join([head.string, tail.string])
        self.stem = ' '.join([head.stem, tail.stem])
        self.size = head.size + 1
        self.proper = (head.proper and tail.proper)
        self.terminal = tail.terminal
        self.subratings = head.subratings + [tail.rating]
        self.rating = self.combined_rating()
def respond(sentences):
    tokenized_sentence = sent_tokenize(sentences)
    stop_words = set(stopwords.words("english"))  # Getting the stop words from the Local DB
    if len(tokenized_sentence) > 1:  # if the length of the tokenized sentence is greater than one
        # for sentence in tokenized_sentence:
        #     words = word_tokenize(sentence)  # Each word is tokenized
        pos_tagged = parts_of_speechtag(sentences)
        print(tuple(pos_tagged))
        # filtered_words = [w for w in words if w not in stop_words]  # removing the additional stop words
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
    else:
        pos_tagged = parts_of_speechtag(sentences)
        print(type(pos_tagged))
        # words = word_tokenize(sentences)
        # filtered_words = [w for w in words if w not in stop_words]
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
def stem(self, word, pos=u'n'):
    return self.lemmatize(word, pos)

######## Wrapper for all of the popular stemmers ###########
def __init__(self, stemmer_type):
    self.stemmer_type = stemmer_type
    if (self.stemmer_type == 'porter'):
        self.stemmer = nltk.stem.PorterStemmer()
    elif (self.stemmer_type == 'snowball'):
        self.stemmer = nltk.stem.SnowballStemmer('english')
    elif (self.stemmer_type == 'lemmatize'):
        self.stemmer = WordNetStemmer()
    else:
        raise NameError("'" + stemmer_type + "'" + " not supported")

######## Simple wordreplacer object using a dictionary ############
def normalize(self, text):
    return [self.stemmer.stem(token)
            for token in self.tokenizer.tokenize(text.lower())
            if token not in self.stop_words]

######### defining a default normalizer ##########
def build_analyzer(self):
    analyzer = super(TfidfVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

########## Stemmer + CountVectorizer wrapper #############
def build_analyzer(self):
    analyzer = super(CountVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

########## Defaults TF-IDF & Count Vectorizers ########
#======== TF-IDF Vectorizer =========#
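The two build_analyzer overrides above come from vectorizer subclasses that stem every token produced by scikit-learn's default analyzer. For context, a minimal sketch of how such a wrapper is typically assembled; the class name StemmedTfidfVectorizer and the module-level stemmer are assumptions rather than code from the original project, and the example expects a reasonably recent scikit-learn.

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.PorterStemmer()  # module-level stemmer, as assumed by the snippets above

class StemmedTfidfVectorizer(TfidfVectorizer):  # hypothetical class name
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(["a meeting was held", "meetings were held"])
print(vectorizer.get_feature_names_out())  # stemmed vocabulary, e.g. ['held', 'meet']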
def Stem(self):
    # stem the POS-tagged signature file line by line
    fin = open('../file/pos_signature.txt', 'r')
    fout = open('../file/stem_signature.txt', 'w+')
    while True:
        line = fin.readline()
        if line:
            if '***' in line:
                fout.write(line)
            elif '---------' in line:
                fout.write(line)
            else:
                num, line = line.split(':', 1)
                line = self.RemSingleWord(line)   # remove single-character words
                line = self.CleanStopWords(line)  # remove stop words
                line = self.CleanLines(line)      # clean the line
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                line = ' '.join(word_list)
                fout.write(num + ':' + line + '\n')
        else:
            break
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):
    self.itemId = itemId
    self.questionType = questionType
    self.answerType = answerType
    self.question = question
    self.answer = answer
    self.Question = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question)
                     if stemmer.stem(word) in WordIDMap]
    self.Answer = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer)
                   if stemmer.stem(word) in WordIDMap]
    self.qFeature = {}
    self.aFeature = {}
    self.create_QAFeature()
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):
    self.itemId = itemId
    self.sent = Review
    self.rObj = ReviewObj
    self.Sent = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review)
                 if stemmer.stem(word) in WordIDMap]
    self.sFeature = {}
def get_lemma_sentences(sentences):
    lemma_sentences = []
    for s in sentences:
        words = [w for w in nltk.word_tokenize(s) if w]
        w_s = [stemmer.stem(w) for w in words]
        l_s = ' '.join(w_s)
        lemma_sentences.append(l_s)
    return lemma_sentences
def tokenizeDocument(document):
    # remove punctuation (otherwise we have a bunch of empty tokens at the end)
    translate_table = dict((ord(char), " ") for char in string.punctuation)
    document = document.translate(translate_table)
    # tokenize
    tokenized_doc = nltk.word_tokenize(document)
    # stem
    snowball = stem.snowball.EnglishStemmer()
    tokenized_doc = [snowball.stem(word) for word in tokenized_doc]
    # remove stop words
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    return tokenized_doc

# given the dictionary, return an array of all the tokenized comments
def stemmer(text):
    # '''Description: This function takes in the string of descriptions and returns a string with all words stemmed
    # Parameters: String of descriptions
    # Output: String with all words stemmed (ex. "meeting" and "meetings" to "meeting")'''
    stemmer = PorterStemmer()
    lis = unicode(str(text), 'utf-8').split(" ")
    stemmed_words = [str(stemmer.stem(word)) for word in lis]
    return " ".join(stemmed_words)
def extract_keywords(text):
    tokens = [i.lower() for i in nltk.word_tokenize(text) if i not in stop_words]
    pos_tagged_tokens = nltk.pos_tag(tokens)
    result = []
    for token in pos_tagged_tokens:
        # print token
        if token[1] in POS_KEYS:
            result.append(token[0])
    return [ps.stem(w) for w in result]
def getKeywords(question):
    tagged = nltk.tag.pos_tag(question)
    tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
    return {ps.stem(tag[0]) for tag in tagged}

# Given a question, return a list of each sentence in the article
# with a score attached to it
def score(question, sentence):
    score = 0
    sentence = map(ps.stem, sentence)
    keywords = getKeywords(question)
    question = map(ps.stem, question)
    score += proximity(keywords, sentence)
    question_ngrams = count_ngrams(question, MAX_NGRAMS, True)
    sentence_ngrams = count_ngrams(sentence, MAX_NGRAMS, True)
    precision, recall = bleu_score(question_ngrams, len(question),
                                   sentence_ngrams, len(sentence), 5)
    f1 = (2 * precision * recall) / (precision + recall)
    score += 2 * f1
    return score

# Finds the shortest window in the target sentence
# in which all keywords appear, and assigns a score.
def _stem_(s):
    from nltk.stem.lancaster import LancasterStemmer
    rs = LancasterStemmer()
    rs = rs.stem(s)
    return rs
def _lemma_(token):
    if isinstance(token, str):
        return _stem_(token)
    if isinstance(token, unicode):
        return _stem_(token)

    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def __call__(self, doc):
    return [self.stemmer.stem(t) for t in word_tokenize(doc)]
def stem_str(sen):
    sen = text.re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return (' '.join(sen)).lower()
def get_list_1(need_tweet_list):
    need_res_set = []
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
def get_list_2(need_tweet_list):
    need_res_set = []
    for i in need_tweet_list:
        for j in i.split():
            if stem2.stem(j.lower()) not in lanc_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
def get_set_1(need_tweet_list):
    need_res_set = set()
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.add(stemmer.stem(j.lower()))
    return need_res_set
def resource_similarity_score_via_exact_word_match_1(need_res_set, offer_tweet_list):
    if len(need_res_set) == 0:
        return 0
    offer_res_set = set()
    for i in offer_tweet_list:
        for j in i.split():
            if j not in out_stem_list:
                offer_res_set.add(stemmer.stem(j.lower()))
    return (len(offer_res_set & need_res_set) / len(need_res_set))
def __eq__(self, other):
    return self.stem == other.stem
def __hash__(self):
    return hash(self.stem)
def __init__(self, stemmer=None):
    '''
    @param stemmer: an object or module with a 'stem' method (defaults
                    to NLTK's PorterStemmer)
    @returns: a new L{Stemmer} object
    '''

    if not stemmer:
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
    self.stemmer = stemmer
def rate_tags(self, tags):
    '''
    @param tags: a list of tags to be assigned a rating
    '''

    term_count = collections.Counter(tags)

    for t in tags:
        # rating of a single tag is term frequency * weight
        t.rating = term_count[t] / len(tags) * self.weights.get(t.stem, 1.0)
def review_to_words(review):
    if isinstance(review, float):
        review = str(review).encode("utf-8")
    letters_only = re.sub("\W+", " ", review, flags=re.UNICODE)
    words = letters_only.lower().split()
    #nltk.data.path.append('./nltk_data/')
    #stops = set(nltk.corpus.stopwords.words("portuguese"))
    meaningful_words = words  #[w for w in words if not w in stops]
    #stemmer = RSLPStemmer()
    meaningful_stemmed = meaningful_words  #[stemmer.stem(w) for w in meaningful_words]
    return (" ".join(meaningful_stemmed))
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # include only nouns and adjectives
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
def extract_terms_from_file(file_location, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    with open(file_location, 'r', encoding='iso-8859-1') as doc:
        terms = []
        for line in doc:
            terms.extend(re.compile('\w+').findall(line.lower()))
        # terms = re.compile('\w+').findall(doc
        #                                   .read()
        #                                   .replace('\n', '')
        #                                   .lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
def extract_terms_from_sentence(sentence, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    terms = re.compile('\w+').findall(sentence.lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
def addToSentenceScore(question, sentence):
    score = 0

    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?", "")))

    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?", "")))

    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))

    common = ' '.join(sentenceSet.intersection(questionSet))
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3

    # Add sentence and score to a hashmap
    sentenceScore[sentence] = score + (jaccard * 10)
    return score

# PARSER TO TOKENIZE, REMOVE STOP WORDS, MORPHOLOGY, ADD TO SET
def parser(line):
    tokLine = nltk.word_tokenize(line)
    keywords = list(set(tokLine) - set(stopwords))
    lineSet = set()
    for item in keywords:
        lineSet.add(morpher.stem(item.replace("?", "")))
    return lineSet

# WORD MATCH
def worMatch(question, sentence):
    score = 0

    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?", "")))

    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?", "")))

    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))

    common = ' '.join(sentenceSet & questionSet)
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3

    return score + (jaccard * 10)

# GET INPUT FILE NAME
def steam_words(self, word):
    ps_obj = PorterStemmer()  # creating the Porter stemmer
    steamed_word = ps_obj.stem(word)
    return steamed_word  # returns the stemmed word to the main file

# Natural Language: displaying sentences
def CleanReVerb(self):
    fin_seed = open('../file/seed_ReVerb.txt', 'r')
    fout_seed = open('../file/seed_ReVerb_clean.txt', 'w+')
    fin_signature = open('../file/signature_ReVerb.txt', 'r')
    fout_signature = open('../file/signature_ReVerb_clean.txt', 'w+')
    while True:
        line = fin_seed.readline()
        if line:
            if '***' in line:
                fout_seed.write(line)
            else:
                mark, line = line.split(':', 1)
                line = self.CleanStopWords(line)  # remove stop words
                # stem each remaining word
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                if len(word_list) > 0:
                    line = ' '.join(word_list)
                    fout_seed.write(mark + ':' + line + '\n')
        else:
            break
    while True:
        line = fin_signature.readline()
        if line:
            if '***' in line:
                fout_signature.write(line)
            else:
                mark, line = line.split(':', 1)
                line = self.CleanStopWords(line)  # remove stop words
                # stem each remaining word
                line = line.split()
                word_list = []
                s = nltk.stem.SnowballStemmer('english')
                for w in line:
                    w = s.stem(w)
                    word_list.append(w)
                if len(word_list) > 0:
                    line = ' '.join(word_list)
                    fout_signature.write(mark + ':' + line + '\n')
        else:
            break
    fin_signature.close()
    fout_signature.close()
def __call__(self, tags):
    '''
    @param tags: a list of (preferably stemmed) tags
    @returns: a list of unique (multi)tags sorted by relevance
    '''
    # print tags
    self.rate_tags(tags)
    multitags = self.create_multitags(tags)

    # keep most frequent version of each tag
    clusters = collections.defaultdict(collections.Counter)
    proper = collections.defaultdict(int)
    ratings = collections.defaultdict(float)

    for t in multitags:
        clusters[t][t.string] += 1
        if t.proper:
            proper[t] += 1
            ratings[t] = max(ratings[t], t.rating)

    term_count = collections.Counter(multitags)

    for t, cnt in term_count.iteritems():
        t.string = clusters[t].most_common(1)[0][0]
        proper_freq = proper[t] / cnt
        if proper_freq >= 0.5:
            t.proper = True
            t.rating = ratings[t]

    # purge duplicates, one-character tags and stopwords
    unique_tags = set(t for t in term_count
                      if len(t.string) > 1 and t.rating > 0.0)
    # remove redundant tags
    for t, cnt in term_count.iteritems():
        words = t.stem.split()
        for l in xrange(1, len(words)):
            for i in xrange(len(words) - l + 1):
                s = Tag(' '.join(words[i:i + l]))
                relative_freq = cnt / term_count[s]
                if ((relative_freq == 1.0 and t.proper) or
                        (relative_freq >= 0.5 and t.rating > 0.0)):
                    unique_tags.discard(s)
                else:
                    unique_tags.discard(t)
    # print unique_tags
    return sorted(unique_tags)
def docs_to_networkx(dataset, cats, window_size=2, vocabulary_creation=True):
    ds = './datasets/%s/' % dataset
    Gs = []
    labels = []
    type_ = 2
    vocab_creation = vocabulary_creation
    words = []  # for vocabulary

    for doc in os.listdir(ds):
        if 'train.txt' in doc:
            type_ = 1

    if type_ == 1:
        if os.path.exists("ds/vocab.txt"):
            vocab_creation = False
        with open(ds + '/train.txt', 'r', encoding='iso-8859-1') as doc:
            dc = 1
            for line in doc:
                label = line[0]
                labels.append(label)
                terms = extract_terms_from_sentence(line[1:],
                                                    stopwords=stopwords.words('english'),
                                                    lemmatize=True, stem=True, only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=label + '_' + str(dc))
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, 'label', dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                dc += 1
    else:
        if os.path.exists("ds/vocab.txt"):
            vocab_creation = False
        for cat in cats.keys():
            for doc in os.listdir(ds + cat):
                terms = extract_terms_from_file(ds + cat + '/' + doc,
                                                stopwords=stopwords.words('english'),
                                                lemmatize=True, stem=True, only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=cat + doc.split('.')[0])
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                labels.append(cats[cat])

    if vocab_creation:
        vocab = dict(Counter(words))
        create_vocabulary_file(fname, vocab)

    return Gs, labels

# needs fix or discard