The following 38 code examples, extracted from open-source Python projects, illustrate how to use nltk.stem.porter.PorterStemmer().
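For quick orientation before the project examples, here is a minimal usage sketch (the sample words are illustrative only): construct a single PorterStemmer and call its stem() method on each token.

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()          # reuse one instance rather than creating one per token
print(stemmer.stem("running"))     # -> 'run'
print(stemmer.stem("studies"))     # -> 'studi' (Porter stems are not always dictionary words)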
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #         re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore')))
    #         if not token.isdigit() and not token in stop_words]
def select_top_words(word_list, n=10):
    """ Filter out cluster term names"""
    import re
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

    st = PorterStemmer()
    out_st = []
    out = []
    for word in word_list:
        word_st = st.stem(word)
        if len(word_st) <= 2 or \
                re.match('\d+', word_st) or \
                re.match('[^a-zA-Z0-9]', word_st) or \
                word in COMMON_FIRST_NAMES or \
                word in CUSTOM_STOP_WORDS or \
                word in ENGLISH_STOP_WORDS or \
                word_st in out_st:  # ignore stemming duplicate
            continue
        out_st.append(word_st)
        out.append(word)
        if len(out) >= n:
            break
    return out
def porter(inputpath=None, text=None):
    """ docstring """
    data = ''
    p = PorterStemmer()
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        pstemmed_list = []
        for file in filenames:
            with open(file, 'r') as f:
                data = f.read()
            if data:
                texts = data.split(',')
                stemmedfile = []
                for text in texts:
                    pstemmed = p.stem(text)
                    stemmedfile.append(pstemmed)
                pstemmed_list.extend(stemmedfile)
        return pstemmed_list
    if text:
        pstemmed = p.stem(text)
        return pstemmed
def stem_split(tokens):
    """
    Takes a list of tokens and splits stemmed tokens into stem, ending -
    inserting ending as extra token.

    returns: revised (possibly longer) list of tokens.
    """
    stemmer = PorterStemmer()
    token_list = list()
    for token in tokens:
        stem = stemmer.stem(token)
        split_list = token.split(stem)
        if token == stem:
            token_list.append(token)
        elif len(split_list) > 1:
            token_list.append(stem)
            token_list.append(split_list[1])
        else:
            token_list.append(split_list[0])
    return token_list
def stem(words, stem_dic, mode="nltk", silent=1):
    if silent == 0:
        print("stem ...")
    if mode == "nltk":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    else:
        print("unknown mode", mode)
        assert 0
    for word in set(words):
        if word not in stem_dic:
            stem_dic[word] = stemmer.stem(word)
    words = [stem_dic[word] for word in words]
    return words
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore')))
            if not token.isdigit() and not token in stop_words]
def __porter_stemmer(self):
    """Initializes PorterStemmer

    Returns:
        Initializes PorterStemmer
    """
    self.stemmer = PorterStemmer()
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = (list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length, tokens))
    return filtered_tokens
def __init__(self, full_word):
    self.full_word = full_word
    # TODO: Lemmatization requires downloads
    # wnl = WordNetLemmatizer()
    # lemmas = [wnl.lemmatize(token) for token in tokens]
    self.stem = PorterStemmer().stem(full_word).lower()
def get_list():
    stop_words = set(stopwords.words('english'))
    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k, v in data.items():
        if k == "WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])

    s = "two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")

    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')
    for index, para in enumerate(paragraph_list):
        raw = para.lower()
        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if not t in stop_words]
        number_tokens = [x for x in stopped_tokens if x.isalpha]
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])
        taggeddoc.append(td)
    return taggeddoc
def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split() if word not in stop]

# Switched to this stemmer because it has support for Spanish
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def tweet_stemming(tweet, token_freqs):
    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode
    :param token_freqs: counter of words frequency
    :type token_freqs: Counter
    :returns: words added to token_freqs
    :rtype: int
    """
    pattern_url = '((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()

    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URL
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashes, and stemmed

    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1

    return counter_tokens
def stem_list(word_list):
    """
    Return a stemmed word list.

    :param word_list: word list to be stemmed.
    :return: list
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_list]
def __init__(self):
    # TODO: placeholder for password. Will eventually take
    # as an arg of some sort
    self.password = b"password"

    # TODO: need to sort out use of salt. Previously, salt was
    # randomly generated in initKeys, but the resulting passwords
    # k & kPrime were different on each execution, and decryption
    # was impossible. Hardcoding salt makes decryption possible
    # but may be a bad short cut
    self.iv = None
    self.salt = "$2b$12$ddTuco8zWXF2.kTqtOZa9O"

    # Two keys, generated/initialized by KDF
    (self.k, self.kPrime) = self.initKeys()

    # Two K's: generated/initialized by PRF
    self.k1 = None
    self.k2 = None

    # client's cipher (AES w/ CBC)
    self.cipher = self.initCipher()

    # Stemming tool (cuts words to their roots/stems)
    self.stemmer = PorterStemmer()
def stem(tokens):
    """ Stem passed text tokens. """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
def __init__(self):
    self.ps = PorterStemmer()
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview
def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview
def tokenizer_porter(text):
    return [PorterStemmer().stem(word) for word in text.split()]
def __init__(self):
    self.stemmer = PorterStemmer()
def __init__(self, lang="spanish"):
    """
    Initializes the parameters for specific language
    """
    self.languages = ["spanish", "english", "italian", "german"]
    self.lang = lang

    if self.lang not in self.languages:
        raise LangDependencyError("Language not supported: " + lang)

    self.stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
    if self.stopwords is None:
        self.stopwords = self.load_stopwords(os.path.join(PATH, "{0}.stopwords".format(lang)))
        LangDependency.STOPWORDS_CACHE[lang] = self.stopwords

    self.neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
    if self.neg_stopwords is None:
        self.neg_stopwords = self.load_stopwords(os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
        LangDependency.NEG_STOPWORDS_CACHE[lang] = self.neg_stopwords

    if self.lang not in SnowballStemmer.languages:
        raise LangDependencyError("Language not supported for stemming: " + lang)

    if self.lang == "english":
        self.stemmer = PorterStemmer()
    else:
        self.stemmer = SnowballStemmer(self.lang)
def bag_of_words(list_of_strings, remove_puncs=True, remove_digits=True, remove_alnums=True):
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    # empty bag of words
    bag_of_words = []

    # Iterate for string
    for string in tqdm(list_of_strings):
        string_tokens = custom_tokenizer(string, remove_puncs=remove_puncs, get_unique=True)
        bag_of_words.extend(string_tokens)

    if remove_alnums:
        bag_of_words = [bag for bag in bag_of_words if bag.isalpha()]
    elif remove_digits:
        bag_of_words = [bag for bag in bag_of_words if (not isNumber(bag))]

    bag_of_words.sort()

    # Stem and Lemmatize the data
    bag_of_words_stemmed = []
    for word in bag_of_words:
        try:
            bag_of_words_stemmed.append(porter.stem(lmtz.lemmatize(word)))
        except:
            bag_of_words_stemmed.append(word)
    bag_of_words = list(bag_of_words_stemmed)

    # Remove stop words
    stop = set(stopwords.words('english'))
    print('Removing Stop words...')
    bag_of_words = [bag.strip().lower() for bag in bag_of_words if (bag.strip().lower() not in stop)]

    bow_counter = Counter(bag_of_words)
    bow_counter = OrderedDict(sorted(bow_counter.items()))

    return bow_counter
def build_lda_model(self, data, docs, n_topics=5):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    for d in data:
        raw = d.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = self.remove_stopwords(tokens)
        stemmed_tokens = stopped_tokens
        # stemmer = PorterStemmer()
        # stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]
        texts.append(stemmed_tokens)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
    index = similarities.MatrixSimilarity(corpus)

    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)

    return dictionary, texts, lda_model
def extract_bigrams(self, text):
    text = self.remove_return_lines_and_quotes(text)
    bigrams = []

    st = PorterStemmer()
    stop = stopwords.words('english')

    more_stop_words = ['(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
    stop = stopwords.words('english')
    stop = stop + more_stop_words

    tokens = st.stem(text)
    tokens = nltk.word_tokenize(tokens.lower())
    tokens = [i for i in tokens if i not in stop]
    tokens = [word for word in tokens if len(word) > 2]

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)

    for bg in top_bigrams:
        bg = " ".join(bg)
        tag = nltk.pos_tag([bg])[0]
        if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
            bigrams.append(tag[0])

    return bigrams
def stem_tokens(tokens, stemmer=PorterStemmer()):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
def k_tokenizer(text):
    text = text.encode('ascii', errors='ignore').replace('-', '')
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """
    mwe_tokenizer = MWETokenizer([('ios', '9'),])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """
    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    # we preserve the original form of any unknown word
    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc = []
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
def get_encoded_vector(list_of_words, new_string):
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')

    tokens = text_to_word_sequence(new_string, lower=True, split=" ")

    # Stem and Lemmatize the data
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except:
            token_stemmed.append(token)
    tokens = list(token_stemmed)

    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)

    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out