The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.feature_extraction.text.CountVectorizer().
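Before the project examples, here is a minimal sketch of the typical fit/transform workflow; the toy corpus and variable names are illustrative only and are not taken from any of the projects below.

from sklearn.feature_extraction.text import CountVectorizer

# Illustrative toy corpus (not from the examples below).
docs = ["the cat sat on the mat", "the dog sat on the log"]

vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(docs)   # learn the vocabulary and build a sparse document-term matrix
print(vectorizer.vocabulary_)        # mapping: term -> column index
print(X.toarray())                   # dense view of the per-document term counts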
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
    """
    Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
    :param ngram_range: n-grams are created for all numbers within this range
    :param min_df: min document frequency of features
    :param max_df: max document frequency of features
    :return:
    """
    if self.is_weight == 'FP':  # Feature Presence
        vectorizer = CountVectorizer(ngram_range=ngram_range,
                                     tokenizer=self.tokenize,
                                     min_df=min_df,
                                     max_df=max_df,
                                     binary=True,
                                     stop_words='english')

    if self.is_weight == 'TF-IDF':  # TF-IDF weighting
        vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                     tokenizer=self.tokenize,
                                     min_df=min_df,
                                     max_df=max_df,
                                     binary=True,
                                     stop_words='english')
    return vectorizer
def getTFV(token_pattern=token_pattern,
           norm=tfidf__norm,
           max_df=tfidf__max_df,
           min_df=tfidf__min_df,
           ngram_range=(1, 1),
           vocabulary=None,
           stop_words='english'):
    tfv = TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=token_pattern, ngram_range=ngram_range,
                          use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words=stop_words, norm=norm, vocabulary=vocabulary)
    return tfv


#========= CountVectorizer =========#
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized_matrix * normalized_matrix.T  # term frequency/inverse doc frequency applied
    similarity_graph.toarray()
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)  # TextRank applied
    ranked = sorted(((scores[i], s) for i, s in enumerate(paragraphs)), reverse=True)  # Sorts all paragraphs from highest to lowest scores
    ten_percent = int(round(10.00 / 100.00 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores]  # Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
    return "\n\n".join(summary)  # Text taken from the user's uploaded PDF or URL, cleaned and formatted.
def getBOW(token_pattern=token_pattern,
           max_df=bow__max_df,
           min_df=bow__min_df,
           ngram_range=(1, 1),
           vocabulary=None,
           stop_words='english'):
    bow = CountVectorizer(min_df=min_df, max_df=max_df, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=token_pattern, ngram_range=ngram_range,
                          stop_words=stop_words, vocabulary=vocabulary)
    return bow


########################################################
# --------------------------------
# Simple text cleaning using      #
#   -replacement dict             #
#         or                      #
#   -WordReplacer object          #
# --------------------------------
def predict_job(job_list):
    """Assign a classification to a url"""
    # TODO: Add case where len is 1 or 0....
    job_list = [job for j in job_list for job in j]
    new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
    new_job_list = [' '.join(job) for job in new_job_list]
    vect = CountVectorizer()
    x_series = pd.Series(X)
    X_train_dtm = vect.fit_transform(x_series)
    y_train = pd.Series(y)
    job_list_series = pd.Series(new_job_list)
    job_list_dtm = vect.transform(job_list_series)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(job_list_dtm)
    # for i in range(len(job_list)):
    #     print(job_list[i], y_pred[i])
    return y_pred

# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
def run(self):
    all_file_names = []
    all_labels = []
    for n, folder_name in enumerate(os.listdir(self.in_txtdir().path)):
        full_folder_name = self.in_txtdir().path + '/' + folder_name
        if os.path.isfile(full_folder_name):
            continue
        for file_name in os.listdir(full_folder_name):
            all_labels.append(n)
            all_file_names.append(full_folder_name + '/' + file_name)

    vectorizer = CountVectorizer(input='filename')
    vector = vectorizer.fit_transform(all_file_names)
    numpy.save(self.out_npy().path, vector)
    numpy.save('labels', numpy.array(all_labels))  # Where and how do we want to save this?

# This is just to test the tasks above
def gen_lstm_status(screen_name, timeline, short_url, depth):
    # Create a vector of words and their frequency on the user's timeline.
    # Experimentation shows that requiring a word to occur at least 4 * depth
    # times to be considered gives good results.
    with open("stopwords.txt", 'r') as stopwords_file:
        stopwords = [line.strip() for line in stopwords_file]
    processed_timeline_text = [preprocess_post(post) for post in timeline]
    vectorizer = CountVectorizer(min_df=4 * depth, stop_words=stopwords)
    X = vectorizer.fit_transform(processed_timeline_text)
    vocab = vectorizer.get_feature_names()
    topic = random.choice(vocab)

    # Generates a status using a helper bash script.
    proc = subprocess.Popen([NN_SAMPLE_COMMAND, topic], stdout=subprocess.PIPE)
    status = topic + " " + proc.stdout.read().split("\n")[-2].strip()
    return "@" + screen_name + " " + status + " " + short_url
def count_features(self, X, verbose=False):
    '''
    Count how often each feature in self.columns appears on the decision path
    taken by each sample, for every fitted estimator in self.estimators_.
    X: DataFrame whose columns contain self.columns.
    Returns a list of DataFrames (one per estimator), indexed like X, with one
    column per feature name.
    '''
    result = []
    for i, estimator in enumerate(self.estimators_):
        tmp = pd.Series(estimator.apply(X[self.columns]))
        tmp.index = X.index
        tmp = tmp.map(lambda xx: ' '.join([yy[0] for yy in self.paths[i][xx]]))
        vect = CountVectorizer(vocabulary=self.columns, lowercase=False)
        tmp = vect.transform(tmp).toarray()
        tmp = pd.DataFrame(tmp)
        vocabulary_inverse = {vect.vocabulary_[key]: key for key in vect.vocabulary_}
        tmp.columns = [vocabulary_inverse[k] for k in range(tmp.shape[1])]
        tmp.index = X.index
        tmp.index.name = X.index.name
        tmp = tmp.fillna(0)
        result.append(tmp.copy())
        if verbose:
            print('Done:', i)
    return result
def textToTokens(text):
    """Converts input string to a corpus of tokenized sentences.

    Assumes that the sentences are divided by newlines (but will ignore empty
    sentences). You can use this to try out your own datasets, but it is not
    needed for reading the homework data.
    """
    corpus = []
    sents = text.split("\n")
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    count_vect.fit(sents)
    tokenizer = count_vect.build_tokenizer()
    for s in sents:
        toks = tokenizer(s)
        if len(toks) > 0:
            corpus.append(toks)
    return corpus
def _vectorize_documents(self, method='tfidf', max_features=100):
    stop_words = []
    try:
        for lexicon_id in self.params['cluster_lexicons']:
            lexicon = Lexicon.objects.get(id=int(lexicon_id))
            words = Word.objects.filter(lexicon=lexicon)
            stop_words += [word.wrd for word in words]
    except KeyError:
        pass

    if method == 'count':
        vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
    if method == 'tfidf':
        vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)

    document_vectors = vectorizer.fit_transform(self.documents)
    document_vectors = document_vectors.toarray()

    return document_vectors, vectorizer.get_feature_names()
def __init__(self, match_fn=TermMatch, binary=True, dtype=np.bool_, **cv_params):
    """initializes a Matching object

    :match_fn: A matching function of signature `docs, query` -> indices of matching docs
    :binary: Store only binary term occurrences.
    :dtype: Data type of internal feature matrix
    :cv_params: Parameters for the count vectorizer such as lowercase=True
    """
    # RetrievalBase.__init__(self)
    self._match_fn = match_fn
    self._vect = CountVectorizer(binary=binary, dtype=dtype, **cv_params)
def is_embedded(sentence, embedding, analyzer):
    """
    >>> embedding = ["a", "b", "c"]
    >>> queries = ["a b c", "a", "b", "c", "a b c d", "d", "a b c"]
    >>> analyzer = lambda x: x.split()
    >>> [query for query in queries if is_embedded(query, embedding, analyzer)]
    ['a b c', 'a', 'b', 'c', 'a b c']
    >>> analyzer = CountVectorizer().build_analyzer()
    >>> [query for query in queries if is_embedded(query, embedding, analyzer)]
    ['a b c', 'a', 'b', 'c', 'a b c']
    """
    for word in analyzer(sentence):
        if word not in embedding:
            print("Dropping:", sentence, file=sys.stderr)
            return False
    return True
def train_feature_finder(self, training_db, clf):
    training_sentences = []
    c = 0
    training_classes = []
    self.class_names = []
    self.vectorizer = CountVectorizer(analyzer="word",
                                      tokenizer=None,
                                      preprocessor=None,
                                      stop_words=None,
                                      max_features=500)
    for key, value in training_db.iteritems():
        training_sentences += value
        training_classes += [c for i in range(len(value))]
        c += 1
        self.class_names.append(key)

    train_data_features = self.vectorizer.fit_transform(training_sentences)
    train_data_features = train_data_features.toarray()

    clf = clf.fit(train_data_features, training_classes)
    return clf
def getDatas(dataset_dir_name):
    movie_reviews = load_files(dataset_dir_name)

    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    # Binary bag-of-words vectorizer; decoding errors in the raw documents are ignored.
    vectorizer = CountVectorizer(binary=True, decode_error=u'ignore')
    word_tokenizer = vectorizer.build_tokenizer()

    # Build the term list for each training/test document (Chinese segmentation via getChList).
    doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
    doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)

    return vectorizer, doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test, doc_terms_list_train
def run():
    py2neo.authenticate("localhost:7474", "neo4j", "neo4j1")
    graph = Graph("http://localhost:7474/db/data/")
    result = graph.data('''MATCH (n:Product)-[r:BELONGS_TO]->(c:Category)
        WITH n, rand() AS number
        RETURN n.name, n.description, n.catName order by number limit 3000''')
    st = ""
    for x in result:
        p = ','.join(str(val).strip(string.punctuation) for (key, val) in x.items())
        st = st + p
        p = ""
    vectorizer = CountVectorizer(strip_accents='ascii')
    tokenizer = vectorizer.build_tokenizer()
    preprocessor = vectorizer.build_preprocessor()
    tokens = set()
    for item in tokenizer(st):
        tokens.add(preprocessor(item))
    with codecs.open(path_config.PERSONAL_WORD_DICTIONARY_FILE, mode='wb', encoding='utf-8') as f:
        for token in tokens:
            f.write(token + '\n')
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
def get_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

    vectorizer = CountVectorizer()
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target

    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target

    return X_train, y_train, X_test, y_test
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000,
            method='counts', binary=True, sparse=False):
    # choosing the particular flavor of vectorizer
    if method == 'counts':
        vectorizer = CountVectorizer(max_features=max_features,
                                     ngram_range=(1, ngrams),
                                     decode_error='replace',
                                     binary=binary)
    elif method == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=max_features,
                                     ngram_range=(1, ngrams),
                                     decode_error='replace')

    # fitting the vectorizer and converting the counts to an array
    full_fit = vectorizer.fit_transform(df[x_name])
    full_counts = full_fit.toarray()
    self.vocabulary_ = vectorizer.vocabulary_

    # passing the attributes up to the class instance
    self.data = df
    if sparse:
        full_counts = csr_matrix(full_counts)
    self.X = full_counts
    if y_name != None:
        self.y = np.array(df[y_name])
    return

# splits the data into training and test sets; either called from process()
# or on its own when your text is already vectorized and divided into x and y
def new(n_feature=128):
    vectorizer = CountVectorizer(
        encoding='utf-8',
        ngram_range=(1, 1),      # Unigram only
        max_features=n_feature,
        binary=True
    )

    # Fill the gap (missing expected tags)
    # ---
    # Hypothesis: Some tags are somehow related so
    # we smoothen the missing values with matrix factorisation.
    smoother = NMF(n_components=n_feature)

    # Binarise the vector's individual values
    binariser = Binarizer(copy=True)

    # Count vectoriser => NMF as smoother => Binariser
    print(colored('Taghasher model created', 'yellow'))
    return [vectorizer, smoother, binariser]
def bag_of_words(messages, model=None, weighting=''):
    # TODO: Add stemming or baseform here
    messages, stemmings2baseform = texttools.stemming_messages_snowball(messages)

    # Create new model for extracting text features if None is given
    if model is None:
        if weighting == 'tfidf':
            model = TfidfVectorizer()
        else:
            model = CountVectorizer()
        model.fit(messages)

    # Extract features
    x = model.transform(messages)
    return x
def test_build(self):
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer

    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def make_lda(self, nt, iterations):
    # '''
    # description: sets important attributes and creates lda model
    # params: nt - number of topics for lda
    #         iterations - number of iterations for lda
    #         dim - 2d or 3d graph
    #         threshold - minimum percentage of the maximum topic in a document which can be included in a "cluster"
    # '''
    self.nt = nt
    self.cvectorizer = CountVectorizer(min_df=5, stop_words='english')
    cvz = self.cvectorizer.fit_transform(self.descriptions)

    # train an LDA model
    self.lda_model = lda.LDA(n_topics=nt, n_iter=iterations)
    self.X_topics_original = self.lda_model.fit_transform(cvz)

    # initialize current stuff
    self.X_topics_current = self.X_topics_original
    self.titles_current = self.titles_original
def countvectorizer(inputpath=None, text=None):
    """Build a term-count matrix from the files under `inputpath`, or from `text`."""
    vectorizer = CountVectorizer(min_df=1)
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        corpus = []
        for file in filenames:
            with open(file, 'r') as f:
                data = f.read()
                corpus.append(data)
    if text:
        corpus = text

    X = vectorizer.fit_transform(corpus)
    print(X.toarray())
    print(vectorizer.get_feature_names())
def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    feature_type = feature_type.lower().strip()

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
def getTFIDF():
    """
    :return:
    """
    corpus, textList = getFenCiWords()
    vectorizer = CountVectorizer()    # converts the corpus into a term-count matrix: a[i][j] is the count of word j in document i
    transformer = TfidfTransformer()  # computes TF-IDF weights from the counts
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # first fit_transform builds the counts, second computes TF-IDF
    word = vectorizer.get_feature_names()  # all words in the bag-of-words model
    weight = tfidf.toarray()  # a[i][j] is the TF-IDF weight of word j in document i
    print "Total " + str(len(weight)) + " documents" + ", " + str(len(word)) + " words"
    return weight, textList

    # for i in range(len(weight)):  # print the TF-IDF weights document by document: outer loop over documents, inner loop over words
    #     print u"------- TF-IDF weights of document", i, u"------"
    #     for j in range(len(word)):
    #         print word[j], weight[i][j]
def __init__(self, corpus, pairtype, relations, modelname="mil_classifier.model", test=False,
             ner="goldstandard", generate=True):
    super(MILClassifier, self).__init__()
    self.modelname = modelname
    self.pairtype = pairtype
    self.pairs = {}       # (e1.normalized, e2.normalized) => (e1, e2)
    self.instances = {}   # bags of instances (e1.normalized, e2.normalized) -> all instances with these two entities
    self.labels = {}      # (e1.normalized, e2.normalized) => label (-1/1)
    self.bag_labels = []  # ordered list of labels for each bag
    self.bag_pairs = []   # ordered list of pair labels (e1.normalized, e2.normalized)
    self.data = []        # ordered list of bags, each is a list of feature vectors
    self.predicted = []   # ordered list of predictions for each bag
    self.resultsfile = None
    self.examplesfile = None
    self.ner_model = ner
    self.vectorizer = CountVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b')
    self.corpus = corpus
    # self.vectorizer = TfidfVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b', max_features=)
    # self.classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=20)
    self.classifier = misvm.sMIL(kernel='linear', C=1)
    # self.classifier = misvm.MissSVM(kernel='linear', C=100)  # , max_iters=20)
    # if generate:
    #     self.generateMILdata(test=test, pairtype=pairtype, relations=relations)
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
        # ('vect', CountVectorizer(ngram_range=(1, 3), binary=False, max_features=None)),
        # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        # ('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01))
        # ('clf', RandomForestClassifier(class_weight={False: 1, True: 2}, n_jobs=-1))
        ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
        # ('clf', DummyClassifier(strategy="constant", constant=True))
    ])
def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    tf-idf feature
    """
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)
    transformer = TfidfTransformer()
    tfidf_train = transformer.fit(count_train).transform(count_train)
    tfidf_test = transformer.fit(count_test).transform(count_test)
    return tfidf_train.toarray(), tfidf_test.toarray()
def __init__(self, ngram_range=(1, 1), analyzer='word', count=True, n_features=200):
    """Initializes the classifier.

    Args:
        ngram_range (tuple): Pair of ints specifying the range of ngrams.
        analyzer (string): Determines what type of analyzer to be used.
            Setting it to 'word' will consider each word as a unit of language
            and 'char' will consider each character as a unit of language.
        count (boolean): Determines if features are counts of n-grams
            versus a binary value encoding if the n-gram is present or not.
        n_features (int): Maximum number of features used.
    """
    # checking what type of vectorizer to create
    if count:
        self.vectorizer = CountVectorizer(analyzer=analyzer,
                                          ngram_range=ngram_range,
                                          max_features=n_features)
    else:
        self.vectorizer = HashingVectorizer(analyzer=analyzer,
                                            ngram_range=ngram_range,
                                            n_features=n_features)
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    fe = CountVectorizer(
        preprocessor=normalize,
        tokenizer=micro_tokenize,
        binary=True,
    )
    predictor = NBSVM_predictor(
        kernel=conf.SVM_KERNEL,
        class_weight=conf.SVM_CLWEIGHT,
        C=conf.SVM_C,
    )
    fe.fit(txt_train)
    X = fe.transform(txt_train)
    predictor.fit(X, y_train)
    X_test = fe.transform(txt_test)
    y_pred = predictor.predict(X_test)

    return y_pred
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    fe = CountVectorizer(
        preprocessor=normalize,
        tokenizer=micro_tokenize,
        binary=True,
    )
    predictor = SVC(
        kernel=conf.SVM_KERNEL,
        class_weight=conf.SVM_CLWEIGHT,
        C=conf.SVM_C,
        random_state=conf.SEED,
    )
    fe.fit(txt_train)
    X = fe.transform(txt_train)
    predictor.fit(X, y_train)
    X_test = fe.transform(txt_test)
    y_pred = predictor.predict(X_test)

    return y_pred
def compute_VwS(self, s):
    """Compute V(w,S) as defined by Cohen et al.'s IJCAI03 paper"""
    # Get term-frequency vectors and vocab list for string
    cv = CountVectorizer(min_df=0.0, token_pattern=u'(?u)\\b\\w+\\b')
    tf = cv.fit_transform([s])
    tf = tf.tocsr()
    vocab = cv.vocabulary_

    # Compute V(w,S) for string
    vprime_ws = dict()
    vprime_ws_norm = 0
    for w in vocab:
        if w in self.CORPUS_VOCAB:
            vprime_ws[w] = math.log(tf[0, vocab[w]] + 1) * self.LOG_IDF[self.CORPUS_VOCAB[w]]
        else:
            # if the word is not in the corpus vocabulary, default to OOV_IDF_VAL
            vprime_ws[w] = math.log(tf[0, vocab[w]] + 1) * self.OOV_IDF_VAL
        vprime_ws_norm += vprime_ws[w] ** 2
    vprime_ws_norm = math.sqrt(vprime_ws_norm)

    return (vocab, vprime_ws, vprime_ws_norm)
def bow_to_npy(vocabulary_fname, bow_fname, npy_fname):
    '''
    Vectorize bag-of-words dump and save in NumPy file

    PARAMETERS
    -----------
    vocabulary_fname: str or Path
        Vocabulary text file name, with one word on each line.
    bow_fname: str or Path
        Bag-of-words .txt.gz file name. When uncompressed, each line
        represents a document with only lower-case words separated by space.
    npy_fname: str or Path
        NumPy .npy file name to write the word count vectors into.
    '''
    with Path(vocabulary_fname).open('r') as vocabulary_file:
        vocabulary = [line.strip() for line in vocabulary_file]
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    with gzip.open(bow_fname, 'rt') as bow_file:
        word_counts = vectorizer.transform(bow_file)
    np.save(npy_fname, word_counts)
def test_read_files(self):
    docs = ['Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET', 'consectetur adipisici elit']
    thesaurus = {'13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'], 'related': ['0r'],
                             'narrower': ['0n'], 'altLabel': []},
                 '13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'], 'related': ['1r'],
                             'narrower': ['1n'], 'altLabel': ['amet']},
                 }
    vocabulary = {'13542-1': 1, '13542-4': 0}
    fnames = []
    for doc in docs:
        file = NamedTemporaryFile(mode='w', delete=False)
        fnames.append(file.name)
        print(doc, file=file)

    cf = ConceptAnalyzer(thesaurus, input='filename')
    counter = CountVectorizer(analyzer=cf.analyze, vocabulary=vocabulary, input='filename')
    res = counter.fit_transform(fnames).todense()
    np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
def get_topic_distributions(examples, vectorizer, lda_model):
    """
    Retrieve the topic distributions of a collection of documents.
    :param examples: a list of tokenised documents
    :param vectorizer: the CountVectorizer used for transforming the documents
    :param lda_model: the trained LDA model
    :return: an array of shape (num_examples, num_topics) containing the topic
             distribution of each example
    """
    vectorized_corpus = vectorizer.transform(examples)
    gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_corpus, documents_columns=False)
    topic_representations = []
    for doc in gensim_corpus:
        topic_representations.append(
            [topic_prob for (topic_id, topic_prob) in
             lda_model.get_document_topics(doc, minimum_probability=0.)])
    return np.array(topic_representations)


# PRE-TRAINED WORD EMBEDDINGS METHODS
def get_word_counts(input_str, limit=100):
    input_str = PreprocessManager.remove_non_ascii(input_str)
    wordnet_lemmatizer = WordNetLemmatizer()
    snowball_stemmer = EnglishStemmer()
    tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
    tokenized_text = [word for word in tokenized_text if len(word) > 1]  # Filter some small words
    # tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
    filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
    stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]

    # Calculate frequency distribution
    frequency_dist = nltk.FreqDist(stemmed_list)

    # Output the most common words, up to `limit`
    result = dict()
    for word, frequency in frequency_dist.most_common(limit):
        # print(u'{};{}'.format(word, frequency))
        result[word] = frequency
    return result

# This function just splits the words and gives the words, that's all!
def getModels(self):
    with open(self.data_path + '/categories.pkl', 'rb') as f:
        categories = cPickle.load(f)
    with open(self.data_path + '/category_map.pkl', 'rb') as f:
        category_map = cPickle.load(f)
    with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
        clf = cPickle.load(f)

    count_vect = CountVectorizer()
    with open(self.data_path + '/count_vect.pkl', 'rb') as f:
        count_vect = cPickle.load(f)

    tfidf_transformer = TfidfTransformer()
    with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
        tfidf_transformer = cPickle.load(f)

    with open(self.data_path + '/tree.pkl', 'rb') as f:
        tree = cPickle.load(f)

    return categories, category_map, clf, count_vect, tfidf_transformer, tree
def get_topic_idf(self, sentences):
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix)
    tfidf = tfidf.toarray()

    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())
    # print(centroid_vector.max())

    feature_names = vectorizer.get_feature_names()
    word_list = []
    for i in range(centroid_vector.shape[0]):
        if centroid_vector[i] > self.topic_threshold:
            # print(feature_names[i], centroid_vector[i])
            word_list.append(feature_names[i])

    return word_list
def build_analyzer(self):
    analyzer = super(TfidfVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


########## Stemmer + CountVectorizer wrapper #############
def build_analyzer(self):
    analyzer = super(CountVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


########## Defaults TF-IDF & Count Vectorizers ########
#======== TF-IDF Vectorizer =========#
def train_test():
    """Identify accuracy via training set"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)  # creates vocab set and dtm for each raw document!
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)  # make class predictions for X_test_dtm
    # w = list(X_test)
    return metrics.accuracy_score(y_test, y_pred_class)

# print(train_test())
def __init__(self):
    self.clf = LinearSVC()
    self.scores = []
    self.vectorizer = CountVectorizer(token_pattern=r'[A-z]+',
                                      stop_words=english_stops,
                                      ngram_range=(1, 1))
def create_speech(self):
    self.speech = dict.fromkeys(self.archives, [])
    # blacklist = []  # ids to be ignored, not implemented yet
    self.vectorizer = dict.fromkeys(self.archives, [])
    self.mat = dict.fromkeys(self.archives, [])
    for key in self.speech:
        self.speech[key] = [[], []]  # messages / ids / (maybe timestamps?)
        self.vectorizer[key] = CountVectorizer(min_df=1)
        if key >= 0:
            continue  # why create dictionaries for private messages right now...
        logfile = "{}.gz".format(os.path.join(self.logpath, str(key)))
        try:
            ziplines = gzip.open(logfile).read().decode("utf-8").strip("\r\n").split("\n")[-15000:]
        except IOError:
            print("{} not found".format(logfile))
            continue
        prev_id = -1
        for msg_line in ziplines:
            msg = Msg(json.loads(msg_line))
            text = msg.get_text()
            chat_id = msg.get_chat_id()
            if key != chat_id:
                input("Error in your logfile (key {} / chat {})!".format(key, chat_id))
            sent_id = msg.get_sent_id()
            # sadly, @like will come through
            if text and text[0] not in ["/", "!"] and msg.get_edit_date() == 0 \
                    and not self.is_blacklisted(text) and (not self.find_name(text)) \
                    and chat_id and sent_id:
                if sent_id == prev_id:
                    self.speech[key][0][-1] += "\n{}".format(text)
                else:
                    self.speech[key][0].append(text)
                    self.speech[key][1].append(sent_id)
                prev_id = sent_id
        if self.speech[key][0]:
            self.mat[key] = self.vectorizer[key].fit_transform(self.speech[key][0])
def compute_tf(data, stopwords_list, language, use_lemmer=True, min_df=2, max_df=0.8):
    """
    Compute the tf matrix for the provided data
    :param language: 'en' or 'it'
    :param data:
    :param stopwords_list:
    :param use_lemmer:
    :param min_df:
    :param max_df:
    :return:
    """
    lemmer_tokenizer = None

    if use_lemmer:
        if language == 'it':
            lemmer_tokenizer = LemNormalizeIt
        else:
            lemmer_tokenizer = LemNormalize

    min_df = min_df if len(data) > min_df else 1
    max_df = max_df if max_df * len(data) >= min_df else 1.0

    # tf
    tf_vectorizer = CountVectorizer(tokenizer=lemmer_tokenizer,
                                    max_df=max_df, min_df=min_df,
                                    max_features=None,
                                    stop_words=stopwords_list,
                                    token_pattern="[a-zA-Z]{3,}")

    try:
        tf = tf_vectorizer.fit_transform(data)
        tf_features_names = tf_vectorizer.get_feature_names()
    except:
        logging.warning('The computed tf matrix is empty. Check stopwords.')
        tf = []
        tf_features_names = []

    return tf, tf_features_names
def voc_count_bag(self):
    if (self.wordbag_path == "" or self.vocabulary_count_bag_name == ""
            or self.stopword_path == ""):
        print "wordbag_path, vocabulary_count_bag_name or stopword_path can not be empty."
        return
    file_obj = open(self.wordbag_path + self.trainset_name, 'rb')
    self.data_set = pickle.load(file_obj)
    file_obj.close()

    # Populate the vocabulary_count_bag metadata from the training set.
    self.vocabulary_count_bag.target_name = self.data_set.target_name
    self.vocabulary_count_bag.label = self.data_set.label
    self.vocabulary_count_bag.filenames = self.data_set.filenames

    corpus = self.data_set.content
    stopword_list = self.getstopword(self.stopword_path)

    # Count term occurrences, excluding stopwords and capping the vocabulary size.
    vectorizer = CountVectorizer(stop_words=stopword_list, max_df=500, min_df=1, max_features=10000)
    y = vectorizer.fit_transform(corpus)
    self.vocabulary_count_bag.vcm = y
    self.vocabulary_count_bag.vcm_sum = y.toarray().sum(axis=0)
    self.vocabulary_count_bag.vocabulary = vectorizer.get_feature_names()

    if not os.path.exists(self.wordbag_path):
        os.makedirs(self.wordbag_path)
    file_obj1 = open(self.wordbag_path + self.vocabulary_count_bag_name, 'wb')
    pickle.dump(self.vocabulary_count_bag, file_obj1)
    file_obj1.close()
    print "The vocabulary_count_bag has been built and saved under wordbag_path as vocabulary_count_bag_name."
    print "#######################################"
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
    """Defined in the class declaration.

    Attributes:
        texts (list of str): Texts to classify.
        ids (list of str): Unique identifiers for each text (must have the
            same length as `texts`).
        vocabulary (list): Optional. Vocabulary to consider when vectorizing
            the texts. Default: uses every word present in the texts, except
            those in ES_stopwords.txt.
        encoding (str): Encoding of the texts in `texts` and in `ids`.
    """
    this_dir, this_filename = os.path.split(__file__)
    es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                               header=None, encoding='utf-8')
    es_stopwords = list(np.squeeze(es_stopwords.values))
    self._check_id_length(ids)
    self.vectorizer = CountVectorizer(
        input='content', encoding=encoding, decode_error='strict',
        strip_accents='ascii', lowercase=True, preprocessor=None,
        tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
        analyzer='word', max_df=0.8, min_df=1, max_features=None,
        vocabulary=vocabulary, binary=False)
    self.transformer = TfidfTransformer()
    self.ids = None        # Keeps an ordered list of text ids.
    self.term_mat = None   # Matrix of term counts per text.
    self.tfidf_mat = None  # Matrix of term relevance (TF-IDF).
    self.reload_texts(texts, ids)