The following 34 code examples, extracted from open-source Python projects, illustrate how to use nltk.stem.WordNetLemmatizer().
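Before the project examples, here is a minimal usage sketch (it assumes the WordNet corpus has already been fetched with nltk.download('wordnet')); note that lemmatize() treats every word as a noun unless a pos tag is supplied:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cars'))              # 'car'   (default pos='n')
print(lemmatizer.lemmatize('running'))           # 'running' (treated as a noun)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'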
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def test_ranker(options):
    lemmatizer = WordNetLemmatizer()
    words, answers, candidate_lfs = load_lf_test(options.data_dir)
    r = ranker.LogLinear(options.word_dim, options.embedding_file, options.stopwords_file)
    assert(os.path.exists(options.ranker_model_dir))
    r.load_model(options.ranker_model_dir)
    result_file = os.path.join(options.result_dir, 'test')
    rf = open(result_file, 'w')
    print('testing...')
    for word, answer, lf in iter_lf_test(words, answers, candidate_lfs):
        lemma = [lemmatizer.lemmatize(w) for w in word]
        selected = r.test(word, lemma, lf)
        write_file(rf, selected[0], answer, selected[1])
    rf.close()
    print(getResults(result_file))
def getPOSLinks(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)
    pos = nltk.pos_tag(text)
    links = []
    link = []
    active = False
    for w in pos:
        part = w[1]
        word = w[0]
        if(not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN")):
            active = True
        if(active):
            link.append(wordnet_lemmatizer.lemmatize(word))
        # extract main body
        if(active and (part == "PRP" or part[:2] == "NN" or part == ".")):
            active = False
            links.append(" ".join(link))
            link = []
    return links
def wordnet_lemmatize(word, pos='n'):
    global _nltk_wordnet_lemmatizer
    try:
        _nltk_wordnet_lemmatizer
    except NameError:
        _nltk_wordnet_lemmatizer = WordNetLemmatizer()
    return _nltk_wordnet_lemmatizer.lemmatize(word, penn2morphy(pos))
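The penn2morphy helper used above is not part of this snippet; presumably it maps Penn Treebank tags to the single-letter WordNet POS codes that lemmatize() accepts. A minimal sketch of such a helper under that assumption (the name, mapping, and default are hypothetical):

def penn2morphy(penntag, default='n'):
    # Hypothetical mapping from a Penn Treebank tag prefix to a WordNet POS letter.
    mapping = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    return mapping.get(penntag[:2], default)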
def split_ingr(x):
    wnl = WordNetLemmatizer()
    cleanlist = []
    lst = x.strip('[]').split(',')
    cleanlist = [' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]', ' ', item))) for item in lst]
    return cleanlist

# remove low-information words from ingredients, could use more
def __wordnet_lemmatizer(self):
    """Initializes WordNetLemmatizer

    Returns:
        Initializes WordNetLemmatizer
    """
    self.lemmatizer = WordNetLemmatizer()
    # Call lemmatize to avoid lazy load
    _ = self.lemmatizer.lemmatize('start')
def __init__(self):
    self.WN_TAGS = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    self.wnl = WordNetLemmatizer()
    self.dictionary = enchant.Dict('en')
    self.inflengine = inflect.engine()
def __init__(self):
    self.WN_TAGS = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    self.wnl = WordNetLemmatizer()
    self.dictionary = enchant.Dict('en')
    self.lookup_table = {}
def __tokenizeWholeCorpora(self, pathToCorpora):
    print 'Start tokenzing the corpora: %s' % (pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}

    for f in glob(pathToCorpora+'/*'):
        filereader = open(f, 'r')
        article = filereader.readlines()
        filereader.close()
        text = ''
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue
        # Skip document length < min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)          # Remove all punctuations
        tokens = nltk.word_tokenize(text)   # Tokenize the whole text
        # Lemmatize every word and add to tokens list if the word is not in stopword
        train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword])
        # Build doc-mapping
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count = doc_count + 1
        if doc_count % 10000 == 0:
            print 'Have processed %i documents' % (doc_count)

    print 'Finished tokenzing the copora: %s' % (pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping
def preprocess(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1),
               apply_tfidf=True, apply_norm=True, lemmatize=False):
    """
    Preprocess a list containing text documents stored as strings.
    """
    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha())]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms)
def __init__(self):
    self.model = WordNetLemmatizer()
def get_lemma(word):
    l = WordNetLemmatizer()
    return l.lemmatize(word)
def simple_lemmatizing(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return lemmatized_tokens
def __lemmatize(self, lemma):
    """
    Internal method used for applying the nltk.stem.WordNetStemmer() to
    the (word, pos) pair lemma.
    """
    string, tag = lemma
    if tag in ('a', 'n', 'r', 'v'):
        wnl = WordNetLemmatizer()
        string = wnl.lemmatize(string, tag)
    return (string, tag)

######################################################################
# POSITIONING.
def tokenizer(document):
    """
    input: a string
    output: a list of strings
    converts a string into tokens and performs the following steps:
    1. eliminates non-alphabetical characters
    2. converts to lower case
    3. lemmatizes using the nltk.stem.WordNetLemmatizer
    4. splits into tokens
    """
    text = re.sub('[^a-zA-Z]', ' ', document)
    tokens = text.lower().split()
    tokens = [lemmatizer(tkn) for tkn in tokens]
    return tokens
def __wn_lemmatize(self, lemma):
    """
    Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always returns
    a (string, pos) pair. Lemmatizes even when the tag isn't helpful,
    by ignoring it for stemming.
    """
    string, tag = lemma
    wnl = WordNetLemmatizer()
    if tag in ('a', 'n', 'r', 'v'):
        string = wnl.lemmatize(string, tag)
    else:
        string = wnl.lemmatize(string)
    return (string, tag)
def clean_review(review, stopwords):
    result = ""
    lemmatizer = WordNetLemmatizer()
    for word in review:
        # converts the word to its lemma form
        word = lemmatizer.lemmatize(word)
        # adds the word to the resultant review only if its not a stopword
        if word not in stopwords:
            # removes all non-alphabet characters
            word = re.sub('[^A-Za-z ]', '', word)
            if(len(word) != 0):
                result += word + " "
    return result
def lemmatizer(text):
    # '''Description: This function takes in the string of descriptions and return string with all words lemmatized
    # Parameters: String of descriptions
    # Output: String with all words lemmatized (ex. "meeting" to "meeting" if noun and "meet" if verb)'''
    lemmatizer = WordNetLemmatizer()
    lis = unicode(str(text), 'utf-8').split(" ")
    lemm_words = [lemmatizer.lemmatize(word) for word in lis]
    return " ".join(lemm_words)
def _lemma_(token):
    if isinstance(token, str):
        return _stem_(token)
    if isinstance(token, unicode):
        return _stem_(token)

    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
def train_ranker(options):
    lemmatizer = WordNetLemmatizer()
    words, answers, good_lfs, bad_lfs = load_lf_train(options.data_dir)
    r = ranker.LogLinear(options.word_dim, options.embedding_file, options.stopwords_file)
    trainer = optimizers[options.optimizer](r.model)
    sents = 0
    total_loss = 0.0
    train_size = len(words)
    i = 0
    for epoch in range(options.epochs):
        for word, answer, good_lf, bad_lf in iter_lf_train(words, answers, good_lfs, bad_lfs):
            if len(good_lf) == 0:
                continue
            lemma = [lemmatizer.lemmatize(w) for w in word]
            loss = r.train(word, lemma, good_lf, bad_lf)
            sents += 1
            if loss is not None:
                total_loss += loss.scalar_value()
                loss.backward()
                trainer.update()
            e = float(i) / train_size
            if i % options.print_every == 0:
                print('epoch {}: loss per sentence: {}'.format(e, total_loss / sents))
                sents = 0
                total_loss = 0.0
            i += 1
        print('saving model...')
        save_as = '%s/epoch%03d.ranker' % (options.result_dir, epoch)
        r.save_model(save_as)
def find_match_word(hash_content, wordlist):
    split_words = []
    while len(hash_content) != 0:
        # return the index of the matched word
        word, index = check_match(hash_content, wordlist)
        split_words.append(word)
        # remove the matched words from the original tokens
        hash_content = hash_content[len(hash_content)*(-1):index]
    return split_words

# use WordNetLemmatizer to lemmatize the word
def text_clean(filename):
    '''
    Input: File path of script.
    Output: List of all words in script lowercased, lemmatized, without punctuation.
    '''
    wnl = WordNetLemmatizer()
    word_list = [word.decode("utf8", errors='ignore') for line in open(filename, 'r') for word in line.split()]
    lemma_list = [wnl.lemmatize(word.lower()) for word in word_list]
    return lemma_list
def preprocess(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return tokens
def preprocess_imageclef(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # include only nouns and verbs
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
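The module-level tags collection referenced by clean_terms is not included in this excerpt. Judging by the parameter name only_N_J, it presumably whitelists noun and adjective tags; a hypothetical definition, under that assumption, might look like:

# Assumed Penn Treebank tag whitelist for clean_terms (nouns and adjectives); not part of the original excerpt.
tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}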
def __init__(self):
    """
    Initialize members:
    question_dist - generalized-question distribution of the assigned extraction location.
    """
    self.question_dist = defaultdict(lambda: defaultdict(lambda: 0))
    self.lmtzr = WordNetLemmatizer()
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())
def word_lemma(doc_unis_list):
    wnl = WordNetLemmatizer()
    doc_stems_list = []
    for doc_unis in doc_unis_list:
        doc_stems = []
        for uni in doc_unis:
            stem_uni = wnl.lemmatize(uni)
            doc_stems.append(stem_uni)
        doc_stems_list.append(doc_stems)
    return doc_stems_list

########## Text Statistic Functions ##########
def POStagging(self):
    fin = open('../file/entity_signature.txt', 'r')
    fout = open('../file/pos_signature.txt', 'w+')
    lemmatizer = WordNetLemmatizer()
    j = 0
    num = 0
    while True:
        line = fin.readline()
        if line:
            if '***' in line:
                # print j, num
                fout.write(line)
                pro_num, pro = line.split('.')
                pro, num = pro.split()
                pro1, pro2 = pro.split('***')
                j = 0
            elif '------' in line:
                fout.write(line)
            else:
                # split text into tokens
                num, line = line.split(':', 1)
                fout.write(num + ':')
                text_tokens = nltk.word_tokenize(line)
                t = 0
                # tag the sentence, using the default NLTK English tagger
                # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
                sentence_tag = nltk.pos_tag(text_tokens)
                for i in range(len(sentence_tag)):
                    word = sentence_tag[i][0]
                    tag = sentence_tag[i][1]
                    if word == 'Entity1':
                        fout.write('#' + pro1 + '# ')
                    elif word == 'Entity2':
                        fout.write('#' + pro2 + '# ')
                    else:
                        if (re.match('(V|N)', tag)) and (not re.match('(NNP)', tag)):
                            # if re.match('(V|N)', tag):
                            # if re.match('V', tag):
                            word = lemmatizer.lemmatize(word)
                            t = t + 1
                        fout.write(word + ' ')
                fout.write('\n')
                if t > 0:
                    j = j + 1
        else:
            break
    fin.close()
    fout.close()
def gen_dataset(sentences, max_words=78, train_test_split=True):
    '''
    Generate a dataset of (input, output) pairs where the
    input is an embedded vector and output the category (one-hotted)

    Args
    ----
    sentences : list
        list of sentences where each sentence is list of tokens
    max_words : integer
        maximum number of words allowed in sentence
    train_test_split : boolean
        whether to split data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatizer = lambda x: wordnet_lemmatizer.lemmatize(x)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i in I:
        words = sentences[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
def gen_dataset(sentences, max_words=78, train_test_split=True):
    '''
    Generate a dataset of (input, output) pairs where the
    input is an embedded vector and output is an embedded vector
    for the lemmatized form.

    Args
    ----
    sentences : list
        list of sentences where each sentence is list of tokens
    max_words : integer
        maximum number of words allowed in sentence
    train_test_split : boolean
        whether to split data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    lemmatizer = WordNetLemmatizer().lemmatize

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i, words in enumerate(sentences):
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict