The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_extraction.text.TfidfTransformer().
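Before the extracted examples, here is a minimal sketch of the CountVectorizer + TfidfTransformer workflow that most of them build on; the toy corpus is made up purely for illustration.

# Minimal sketch: count term occurrences, then re-weight them with TF-IDF.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["the cat sat on the mat",          # toy corpus for illustration only
          "the dog sat on the log"]

count_vect = CountVectorizer()
counts = count_vect.fit_transform(corpus)    # sparse term-frequency matrix

tfidf = TfidfTransformer()                   # defaults: use_idf=True, smooth_idf=True, norm='l2'
weights = tfidf.fit_transform(counts)        # TF-IDF weighted, L2-normalized rows

print(weights.shape)                         # (n_documents, n_vocabulary_terms)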
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized_matrix * normalized_matrix.T  # term frequency / inverse doc frequency applied
    similarity_graph.toarray()
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)  # TextRank applied
    ranked = sorted(((scores[i], s) for i, s in enumerate(paragraphs)), reverse=True)  # Sorts all paragraphs from highest to lowest scores
    ten_percent = int(round(10.00 / 100.00 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores]  # Takes top 10%, i.e. the paragraphs with the highest scores (does not disturb the rank order)
    return "\n\n".join(summary)  # Text taken from the user's uploaded PDF or URL, cleaned and formatted.
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    docs = []
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    #simMatrix = (normalized[0:] * normalized[0:].T).A
    simindices = []
    #print 'Num original, ', len(originalSentences)
    for i in xrange(len(generatedSentences)):
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            simindices.append(i)

    #print simindices
    finalGen = [sentence for k, sentence in enumerate(generatedSentences) if k not in simindices]
    #print len(generatedSentences), len(finalGen)
    return finalGen
def extract_feature(self):
    """Extract TF and TF-IDF features from the training set."""
    # Build the document-term (word count) matrix
    self.train_dtm = self.count_vect.fit_transform(self.data['train'].data)
    # Compute plain TF features
    tf_transformer = TfidfTransformer(use_idf=False).fit(self.train_dtm)
    self.train_tf = tf_transformer.transform(self.train_dtm)
    # Compute TF-IDF features
    tfidf_transformer = TfidfTransformer().fit(self.train_dtm)
    self.train_tfidf = tfidf_transformer.transform(self.train_dtm)
def feature(terms):
    dataMatrix = np.genfromtxt(finaltest, delimiter='|', dtype=None, skip_header=True)
    n = dataMatrix.size
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].lower().decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfdif = transformer.fit_transform(occurence)
    occurence = tfdif.toarray()
    np.savetxt('occurencetest.csv', occurence, delimiter=',')
    return occurence, dataMatrix
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
def Training_model():
    # Load the word-count feature file
    f = open("f://emotion/mysite/weibo_emotion/emotion_file/data_count.txt")
    f.readline()  # skip the header line
    data = np.loadtxt(f)

    # Load the class labels
    f1 = open("f://emotion/mysite/weibo_emotion/emotion_file/data_jixing.txt")
    leibie = np.loadtxt(f1)
    f.close()
    f1.close()

    # TF-IDF weighting
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(data)
    data1 = tfidf.toarray()

    # Train the SVM classifier
    clf = svm.SVC()
    clf.fit(data1, leibie)  # training the SVC model
    return clf
def test_main(self):
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
def get_logistic_regression_coefs_l2(self, category, clf=RidgeClassifierCV()):
    '''Computes l2-penalized logistic regression score.

    Parameters
    ----------
    category : str
        category name to score

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y)
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
def get_logistic_regression_coefs_l1(self, category,
                                     clf=LassoCV(alphas=[0.1, 0.001],
                                                 max_iter=10000,
                                                 n_jobs=-1)):
    '''Computes l1-penalized logistic regression score.

    Parameters
    ----------
    category : str
        category name to score

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    y_continuous = self._get_continuous_version_boolean_y(y)
    # X = TfidfTransformer().fit_transform(self._X)
    X = self._X

    clf.fit(X, y_continuous)
    y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    clf.fit(X, y_continuous)
    return clf.coef_, acc, baseline
def getTFIDF():
    """Compute the TF-IDF weight matrix for the segmented corpus."""
    corpus, textList = getFenCiWords()
    # Convert the words in each text into a term-frequency matrix; element a[i][j] is the count of word j in text i
    vectorizer = CountVectorizer()
    # This class computes the tf-idf weight of each word
    transformer = TfidfTransformer()
    # The first fit_transform computes tf-idf, the second converts the texts into a term-frequency matrix
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # Get all the words in the bag-of-words model
    word = vectorizer.get_feature_names()
    # Extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in text i
    weight = tfidf.toarray()
    print "Total " + str(len(weight)) + " texts, " + str(len(word)) + " words"
    return weight, textList
    # for i in range(len(weight)):  # print the tf-idf weight of every word for every text
    #     print u"------- tf-idf weights for text", i, u"-------"
    #     for j in range(len(word)):
    #         print word[j], weight[i][j]
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
                              #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              #('clf', SGDClassifier())
                              #('clf', svm.NuSVC(nu=0.01))
                              #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1)),
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              #('clf', DummyClassifier(strategy="constant", constant=True))
                             ])
def tfidf_feature(xtrain, xtest, stopwords_path):
    """tf-idf feature"""
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)
    transformer = TfidfTransformer()
    # Fit the IDF weights on the training counts only, then apply them to both sets
    tfidf_train = transformer.fit(count_train).transform(count_train)
    tfidf_test = transformer.transform(count_test)
    return tfidf_train.toarray(), tfidf_test.toarray()
def getModels(self):
    with open(self.data_path + '/categories.pkl', 'rb') as f:
        categories = cPickle.load(f)
    with open(self.data_path + '/category_map.pkl', 'rb') as f:
        category_map = cPickle.load(f)
    with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
        clf = cPickle.load(f)
    count_vect = CountVectorizer()
    with open(self.data_path + '/count_vect.pkl', 'rb') as f:
        count_vect = cPickle.load(f)
    tfidf_transformer = TfidfTransformer()
    with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
        tfidf_transformer = cPickle.load(f)
    with open(self.data_path + '/tree.pkl', 'rb') as f:
        tree = cPickle.load(f)
    return categories, category_map, clf, count_vect, tfidf_transformer, tree
def get_topic_idf(self, sentences):
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix)
    tfidf = tfidf.toarray()

    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())
    # print(centroid_vector.max())

    feature_names = vectorizer.get_feature_names()
    word_list = []
    for i in range(centroid_vector.shape[0]):
        if centroid_vector[i] > self.topic_threshold:
            # print(feature_names[i], centroid_vector[i])
            word_list.append(feature_names[i])

    return word_list
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
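For reference, a small sketch (not taken from any of the projects above) of the smoothed IDF that this test exercises: with smooth_idf=True, scikit-learn is documented to use idf(t) = ln((1 + n) / (1 + df(t))) + 1, which keeps the weights finite even for terms that occur in every document.

# Sketch: compare a hand-computed smoothed IDF with TfidfTransformer's idf_ attribute.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

X = np.array([[1, 1, 1],
              [1, 1, 0],
              [1, 0, 0]])
n = X.shape[0]                              # number of documents
df = (X > 0).sum(axis=0)                    # document frequency of each term

manual_idf = np.log((1 + n) / (1 + df)) + 1
fitted_idf = TfidfTransformer(smooth_idf=True).fit(X).idf_

print(np.allclose(manual_idf, fitted_idf))  # expected: True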
def _vectorize(self, corpus, fit):
    assert isinstance(corpus, kindred.Corpus)

    matrices = []
    for feature in self.chosenFeatures:
        assert feature in self.featureInfo.keys()
        featureFunction = self.featureInfo[feature]['func']
        never_tfidf = self.featureInfo[feature]['never_tfidf']
        data = featureFunction(corpus)
        notEmpty = any(len(d) > 0 for d in data)
        if fit:
            if notEmpty:
                self.dictVectorizers[feature] = DictVectorizer()
                if self.tfidf and not never_tfidf:
                    self.tfidfTransformers[feature] = TfidfTransformer()
                    intermediate = self.dictVectorizers[feature].fit_transform(data)
                    matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].fit_transform(data))
        else:
            if feature in self.dictVectorizers:
                if self.tfidf and not never_tfidf:
                    intermediate = self.dictVectorizers[feature].transform(data)
                    matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].transform(data))

    mergedMatrix = hstack(matrices)
    return mergedMatrix
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
    """Defined in the class declaration.

    Attributes:
        texts (list of str): Texts to classify.
        ids (list of str): Unique identifiers for each text (must have the
            same length as `texts`).
        vocabulary (list): Optional. Vocabulary to consider when vectorizing
            the texts. Default: uses every word present in the texts, except
            those in ES_stopwords.txt.
        encoding (str): Encoding of the texts in `texts` and in `ids`.
    """
    this_dir, this_filename = os.path.split(__file__)
    es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                               header=None, encoding='utf-8')
    es_stopwords = list(np.squeeze(es_stopwords.values))
    self._check_id_length(ids)
    self.vectorizer = CountVectorizer(
        input='content', encoding=encoding, decode_error='strict',
        strip_accents='ascii', lowercase=True, preprocessor=None,
        tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
        analyzer='word', max_df=0.8, min_df=1, max_features=None,
        vocabulary=vocabulary, binary=False)
    self.transformer = TfidfTransformer()
    self.ids = None        # Maintains an ordered list of text ids.
    self.term_mat = None   # Matrix of term counts per text.
    self.tfidf_mat = None  # Matrix of term relevance (tf-idf).
    self.reload_texts(texts, ids)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)
    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10, :10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10, :10].toarray()

    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)

    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, n_iter=5,
                                               random_state=42)),
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def cvectorize(f, c, n):
    r"""Use the Count Vectorizer and TF-IDF Transformer.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the text column in the dataframe ``f``.
    n : int
        The number of n-grams.

    Returns
    -------
    new_features : sparse matrix
        The transformed features.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more information
    here [TFE]_.

    .. [TFE] http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

    """
    fc = f[c]
    fc.fillna(BSEP, inplace=True)
    cvect = CountVectorizer(ngram_range=[1, n], analyzer='char')
    cfeat = cvect.fit_transform(fc)
    tfidf_transformer = TfidfTransformer()
    new_features = tfidf_transformer.fit_transform(cfeat).toarray()
    return new_features


#
# Function apply_treatment
#
def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
             sublinear_tf=False, **kwargs):
    self.tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                  smooth_idf=smooth_idf,
                                  sublinear_tf=sublinear_tf)

    # override defaults since we need the counts here
    self.verbose = kwargs.get('verbose', 0)
    binary = kwargs.pop('binary', False)
    dtype = kwargs.pop('dtype', np.int64)

    # pass remaining args to countvectorizer
    self._init_params(name="TFIDF", binary=binary, dtype=dtype, **kwargs)
def predict(self, docs):
    """Predict the category of each document in ``docs``."""
    X_new_counts = self.count_vect.transform(docs)
    tfidf_transformer = TfidfTransformer().fit(X_new_counts)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    return self.clf.predict(X_new_tfidf)
def normalize(counts):
    transformer = TfidfTransformer(smooth_idf=1)
    return transformer.fit_transform(counts).toarray()
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(data[0], data[1])
    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='log', penalty='l2',
                                               alpha=1e-3, n_iter=5,
                                               random_state=42)),
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def feature():
    global termcount
    dataMatrix = np.genfromtxt(finaltrial, delimiter='|', dtype=None, skip_header=True)
    terms = []
    n = dataMatrix.size
    for row in dataMatrix:
        row[0] = row[0].lower().decode('UTF-8')
        temp = row[0].decode('UTF-8').replace(' ', '+')
        temp = (get.urlopen("http://localhost:5095/parser?sentence=" + temp).read()).decode('UTF-8')
        terms.extend([x.split('/')[0] for x in temp.split(' ')
                      if x.split('/')[1] == 'JJ' or x.split('/')[1].startswith('VB')])
        tfidf(temp)
    s = sum(list(termcount.values()))
    termcount = {x: (y * 100 / s) for x, y in zip(termcount.keys(), termcount.values())}
    # terms.extend([x for x in termcount.keys()])
    terms = list(set(terms))
    stop = open('stop.csv', 'r').read().splitlines()
    terms = [x for x in terms if x not in stop]
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfdif = transformer.fit_transform(occurence)
    occurence = tfdif.toarray()
    np.savetxt('occurence.csv', occurence, delimiter=',')
    return occurence, dataMatrix, terms
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)
def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
def TFIDF_result():
    # Read the 30 processed documents and the test document, each as a single string
    str_handel_list = read_handel_list()
    str_test = read_test_list()

    # Build the TF-IDF corpus: the test document is appended as the last entry
    corpus = str_handel_list[:]
    corpus.append(str_test)
    print "TF-IDF corpus building success..."

    ######################### Compute TF-IDF weights with scikit-learn
    # Convert the words in each text into a term-frequency matrix; element a[i][j] is the count of word j in text i
    vectorizer = CountVectorizer()
    # This class computes the tf-idf weight of each word
    transformer = TfidfTransformer()
    # The first fit_transform computes tf-idf, the second converts the texts into a term-frequency matrix
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # Get all the words in the bag-of-words model
    word = vectorizer.get_feature_names()
    # Extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in text i
    weight = tfidf.toarray()
    print "TF-IDF score is calcuated success..."

    # Collect the TF-IDF weights of the test document (row 30, the last row)
    results = []
    for j in range(len(word)):
        if word[j] == '??' or word[j] == '??' or len(word[j]) == 1:
            # skip excluded words and single-character terms
            continue
        results.append((word[j], weight[30][j]))

    # Sort by weight in descending order
    sorted_results = sorted(results, key=lambda result: result[1], reverse=True)

    # Write the top 100 TF-IDF terms to file
    fp_tfidf_result = open("f://emotion/mysite/Label_extract/result_tfidf.txt", 'w+')
    tfidf_results = []
    for i in range(100):
        tfidf_results.append((sorted_results[i][0], sorted_results[i][1]))
        fp_tfidf_result.write(sorted_results[i][0] + ' ' + str(round(sorted_results[i][1], 10)))
        fp_tfidf_result.write('\n')
    fp_tfidf_result.close()

    return tfidf_results
def _fit_tfidf_model(self, category, clf):
    y = self._get_mask_from_category(category)
    y_continuous = self._get_continuous_version_boolean_y(y)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y_continuous)
def fit_tfidf(count_vector):
    '''
    Fits a term frequency matrix on a count vector.
    '''
    tfidf_vector = TfidfTransformer(use_idf=False).fit(count_vector)
    return tfidf_vector
def fit_tfidf(count_vector):
    '''
    Transforms a count vector into a tf vector.
    TF: count vector normalized on length of docs.
    '''
    tfidf = TfidfTransformer(use_idf=False)
    tfidf_vector = tfidf.fit(count_vector)
    return tfidf_vector
def fit_tfidf(count_vector):
    tfidf = TfidfTransformer(use_idf=False)
    tfidf_vector = tfidf.fit(count_vector)
    return tfidf_vector
def train_sgdc(training_list):
    footnotes = []
    cate = []
    for i in training_list:
        footnotes.append(i[0])
        cate.append(i[1])
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, n_iter=5,
                                               random_state=42)),
                         ])
    _ = text_clf.fit(footnotes, cate)
    return text_clf
def parseToBOW():
    vectorizer = CountVectorizer(min_df=1)
    texts = pickle.load(open(OUTFILE, 'rb'))[0]
    tdm = vectorizer.fit_transform(texts)
    transformer = TfidfTransformer()
    tdidf = transformer.fit_transform(tdm)
    f = open(DATASET_PATH + "BOW.p", "wb")
    pickle.dump(tdm, f)
    f.close()
    f = open(DATASET_PATH + "BOW_TDIDF.p", "wb")
    pickle.dump(tdidf, f)
    f.close()
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
def transformTFIDF(X_train_all, X_test_all):
    """Transform bag-of-events using TF-IDF.

    Arguments
    ---------
    X_train_all: pandas DataFrame
    X_test_all: pandas DataFrame

    Returns
    -------
    X_train_t: CSR matrix
    X_test_t: CSR matrix
    """
    tfidf_t = TfidfTransformer(norm='l2', use_idf=True,
                               sublinear_tf=True, smooth_idf=True)
    X_train = scipy.sparse.csr_matrix(X_train_all)
    X_test = scipy.sparse.csr_matrix(X_test_all)

    # Fit TFIDF using training data.
    tfidf_t.fit(X_train)

    # Transform both training and test data.
    X_train_t = tfidf_t.transform(X_train)
    X_test_t = tfidf_t.transform(X_test)

    return X_train_t, X_test_t
def get_pipeline(name):
    x = TrainingSet.objects.filter(classifier=name).values_list('body', flat=True)
    y = TrainingSet.objects.filter(classifier=name).values_list('target', flat=True)
    pipeline = Pipeline([
        ('vector', CountVectorizer()),
        ('transform', TfidfTransformer()),
        ('bayes', MultinomialNB())
    ])
    pipeline.fit(x, y)
    return pipeline
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1, verbose=1, class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw, ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw, y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    accuracy_score(y_true=ytest_raw, y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw, y_pred=ytest_predict)
def kmeans(class_num):
    """
    KMeans clustering.
    :param class_num: number of clusters
    :return: class_list, e.g. [[sentence1, sentence2], [sentence1, sentence2], ...]
    """
    class_list = list()
    sentences_words, sentences = loadFile()
    # Convert the words in each sentence into a term-frequency matrix; element a[i][j] is the count of word j in sentence i
    vectorizer = CountVectorizer()
    # This class computes the tf-idf weight of each word
    transformer = TfidfTransformer()
    # The first fit_transform computes tf-idf, the second converts the texts into a term-frequency matrix.
    # sentences_words is a list of whitespace-joined segmented sentences, one string per sentence.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences_words))
    # weight has shape [n_sentences, n_words]; element a[i][j] is the tf-idf weight of word j in sentence i
    weight = tfidf.toarray()
    clf = KMeans(n_clusters=class_num)
    s = clf.fit(weight)
    for i in range(class_num):
        class_list.append(list())
    print clf.labels_
    # clf.labels_ assigns a cluster label to each sentence, e.g. [1, 3, 2, 5, 0, 3, 5, 4, 1]
    for i in range(len(clf.labels_)):
        class_label = clf.labels_[i]
        class_list[class_label].append(sentences[i])
        # print "####### cluster " + str(clf.labels_[i]) + ": " + words_list[i]
    return class_list
def __init__(self, min_df=2, norm="l2"):
    """Constructor"""
    self.cv = CountVectorizer(min_df=min_df)
    self.tfidf = TfidfTransformer(norm=norm)
    self.LOG_IDF = None
    self.CORPUS_VOCAB = None
    self.OOV_IDF_VAL = 0  # min idf value to assign for out-of-vocabulary terms
    self.IDF_MODEL = dict()
def compute_query_idf(self, corpus):
    """Compute IDF from s and t in case you have no externally computed IDF to use."""
    cv = CountVectorizer(min_df=0.0)
    cv.fit_transform(corpus)
    self.logger.debug(cv.vocabulary_)
    freq_term_matrix = cv.transform(corpus)

    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    log_idf = tfidf.idf_

    self.LOG_IDF = log_idf
    self.CORPUS_VOCAB = cv.vocabulary_
def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_
def tfidf_pipeline(df, ngram_range, lowercase, binary, min_df=2, max_df=1.0,
                   caps_features=False, pos_features=False, clf=LinearSVC()):
    return Pipeline([
        ('mapper', mapper(df, ngram_range, lowercase, binary, min_df, max_df,
                          caps_features, pos_features)),
        ('scaler', TfidfTransformer()),
        ('clf', clf),
    ])
def file2mat(filename):
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1))
    data = load(filename)
    reviews = [each_data['review'] for each_data in data]
    bag_of_word = vectorizer.fit_transform(reviews)
    tfidf = transformer.fit_transform(bag_of_word)
    aspect_label = collect_aspect_label(data)
    rating_label = collect_rating_label(data)
    return tfidf, aspect_label, rating_label