We extracted the following 10 code examples from open-source Python projects to illustrate how to use nltk.wordpunct_tokenize().
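Before the project examples, here is a minimal sketch of what wordpunct_tokenize() does: it splits text into runs of alphanumeric characters and runs of punctuation, so punctuation comes back as separate tokens. The sample sentence is illustrative only.

from nltk import wordpunct_tokenize

# Split on word/punctuation boundaries; punctuation is kept as its own tokens.
tokens = wordpunct_tokenize("Good muffins cost $3.88 in New York.")
print(tokens)
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']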
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
def parseTweetSet(tweets_data_path):
    tweets_text = []
    tweets_file = open(tweets_data_path, "r")
    english_stopwords_set = set(stopwords.words('english'))
    for line in tweets_file:
        tweet = json.loads(line)
        text = tweet['text']
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        words_set = set(words)
        # Keep only tweets that share more than two English stopwords with the
        # stopword list, i.e. tweets that are likely written in English.
        common_elements = words_set.intersection(english_stopwords_set)
        if len(common_elements) > 2:
            tweets_text.append(tweet['text'])
    tweets_text_set = set(tweets_text)
    #print len(tweets_text)
    #print len(tweets_text_set)
    #print tweets_text_set
    return list(tweets_text_set)
def tokenize(self, text):
    """
    Performs tokenization in addition to normalization.
    """
    return self.normalize(nltk.wordpunct_tokenize(text))
def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
def tokenize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(string.lower().strip())
    else:
        return nltk.wordpunct_tokenize(string.strip())
def tokenize_and_normalize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(normalize(string).lower().strip())
    else:
        return nltk.wordpunct_tokenize(normalize(string).strip())
def nonenglish(string):
    '''Description: This function takes in a string of descriptions and returns the string with non-English words removed (useful for course syllabi)
    Parameters: string of descriptions
    Output: the string with non-English words removed'''
    words = set(nltk.corpus.words.words())
    result = [w for w in nltk.wordpunct_tokenize(string) if w.lower() in words]
    return " ".join(result)
def calculate_languages_ratios(text):
    """
    Compute, for each language included in NLTK, the number of unique
    stopwords of that language appearing in the analyzed text.
    """
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    words = {word.lower() for word in tokens}

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words & stopwords_set
        languages_ratios[language] = len(common_elements)

    return languages_ratios
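A common follow-up (not part of the example above) is to pick the language with the largest stopword overlap. A minimal sketch, assuming the calculate_languages_ratios() function defined above and the NLTK stopwords corpus already downloaded; detect_language is a hypothetical helper name:

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def detect_language(text):
    # Hypothetical helper: reuse the per-language stopword counts computed by
    # calculate_languages_ratios() and return the language with the highest count.
    ratios = calculate_languages_ratios(text)
    return max(ratios, key=ratios.get)

# e.g. detect_language("Yo creo que mañana va a llover") would typically return 'spanish'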
def translateHinglishTweets(tweets_text):
    counter = 0
    tweets_text_translated = []
    n = len(tweets_text)

    # Load the Hinglish-to-English word dictionary from a pickle file
    open_file = open("dictionary.pickle", "rb")
    dictionary = pickle.load(open_file)
    open_file.close()

    english_stopwords_set = set(stopwords.words('english'))

    for i in range(n):
        text = tweets_text[i]
        translated_text = ""
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        for word in words:
            # English stopwords and unknown words are kept as-is;
            # dictionary hits are replaced by their English translation.
            if word in english_stopwords_set:
                translated_text = translated_text + " " + word
            elif word in dictionary:
                #print word + "-" + dictionary[word]
                translated_text = translated_text + " " + dictionary[word]
                counter = counter + 1
            else:
                translated_text = translated_text + " " + word
        tweets_text_translated.append(translated_text)
    #print counter
    return tweets_text_translated
def __call__(self, text):
    '''
    @param text: the string of text to be tagged
    @returns: a list of tags respecting the order in the text
    '''
    sentences = nltk.sent_tokenize(text)
    punctuation = set(string.punctuation)
    proper_noun = lambda x: True if x == 'NN' else False
    tags = []

    # Giving importance to first sentence words.
    if len(sentences) > 0:
        # stripping away punctuation
        words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sentences[0])
                              if word not in punctuation])
        if len(words) > 1:
            tags.append(Tag(str(words[0][0])))
            for word, tag in words[1:-1]:
                tags.append(Tag(str(word), proper=proper_noun(tag)))
            tags.append(Tag(str(words[-1][0]), proper=proper_noun(str(words[-1][1])), terminal=True))
        elif len(words) == 1:
            tags.append(Tag(str(words[0][0]), terminal=True))

    # Rest of the sentences
    for sent in sentences[1:]:
        words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sent)
                              if word not in punctuation])
        if len(words) > 1:
            for word, tag in words[:-1]:
                tags.append(Tag(str(word), proper=proper_noun(tag)))
        if len(words) > 0:
            tags.append(Tag(str(words[-1][0]), proper=proper_noun(str(words[-1][1])), terminal=True))

    return tags