我们从Python开源项目中,提取了以下5个代码示例,用于说明如何使用 nltk.text 模块。
def tokenize(text):
    """Yield stemmed, lower-cased tokens from *text*, skipping punctuation.

    Uses NLTK's English Snowball stemmer; tokens that are (single-character)
    punctuation marks per ``string.punctuation`` are dropped.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    lowered = text.lower()
    for word in nltk.word_tokenize(lowered):
        # Guard clause inverted: emit only non-punctuation tokens.
        if word not in string.punctuation:
            yield stemmer.stem(word)
def sklearn_frequency_vectorize(corpus):
    """Vectorize *corpus* into a sparse term-frequency matrix.

    Returns the result of ``CountVectorizer().fit_transform(corpus)``:
    one row per document, one column per vocabulary term, raw counts.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    # Construct and apply the vectorizer in one step — no state is kept.
    return CountVectorizer().fit_transform(corpus)
def sklearn_one_hot_vectorize(corpus):
    """One-hot vectorize *corpus* with Scikit-Learn.

    Builds a term-frequency matrix with ``CountVectorizer``, then binarizes
    it so each cell is 1 if the term occurs in the document, else 0.
    Prints the vocabulary width before and after binarization (unchanged by
    design — ``Binarizer`` only thresholds values).

    Returns the dense binarized array. (Bug fix: the original computed the
    one-hot matrix but never returned it, so callers always got ``None``.)
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer
    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)
    print(len(vectors.toarray()[0]))
    onehot = Binarizer()
    # toarray() densifies the sparse count matrix before thresholding.
    vectors = onehot.fit_transform(vectors.toarray())
    print(len(vectors[0]))
    return vectors
def nltk_tfidf_vectorize(corpus):
    """Yield one ``{term: tf-idf score}`` dict per document in *corpus*.

    Each document is tokenized with the module-level ``tokenize`` helper,
    then scored against an NLTK ``TextCollection`` built over the whole
    tokenized corpus.
    """
    from nltk.text import TextCollection
    # Tokenize every document up front; the collection needs the full corpus.
    tokenized = [list(tokenize(document)) for document in corpus]
    collection = TextCollection(tokenized)
    for document in tokenized:
        yield {term: collection.tf_idf(term, document) for term in document}
def sklearn_tfidf_vectorize(corpus):
    """Vectorize *corpus* into a sparse TF-IDF matrix via Scikit-Learn.

    Returns the result of ``TfidfVectorizer().fit_transform(corpus)``:
    one row per document, one column per vocabulary term.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Fit and transform in a single expression — the vectorizer is throwaway.
    return TfidfVectorizer().fit_transform(corpus)