The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_extraction.text.TfidfVectorizer().
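Before the project code, here is a minimal, self-contained sketch of the fit/transform workflow that most of the examples below follow. The toy corpus and parameter values are illustrative only and are not taken from any of the listed projects:

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus; the examples below use each project's own documents instead.
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "dogs and cats make good pets",
]

# Learn the vocabulary and IDF weights on the training corpus...
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1)
X_train = vectorizer.fit_transform(corpus)  # sparse matrix of shape (3, n_features)

# ...then reuse the fitted vectorizer to transform unseen text with the same vocabulary.
X_new = vectorizer.transform(["a cat sat near the dog"])

print(X_train.shape)
# get_feature_names_out() exists in scikit-learn >= 1.0; older versions (as in the examples below) use get_feature_names()
print(list(vectorizer.get_feature_names_out())[:10])

Every example below is a variation on this pattern: configure the vectorizer (tokenizer, n-gram range, min_df/max_df, stop words), fit it on training text, and transform held-out text with the same fitted instance.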
def represent(documents):
    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    # Tokenization
    vectorizer = TfidfVectorizer(tokenizer=tokenize)

    # Learn and transform train documents
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # Transform multilabel labels
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

    return vectorised_train_documents, train_labels, vectorised_test_documents, test_labels
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
    """
    Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
    :param ngram_range: n-grams are created for all numbers within this range
    :param min_df: min document frequency of features
    :param max_df: max document frequency of features
    :return:
    """
    if self.is_weight == 'FP':  # Feature Presence
        vectorizer = CountVectorizer(ngram_range=ngram_range,
                                     tokenizer=self.tokenize,
                                     min_df=min_df,
                                     max_df=max_df,
                                     binary=True,
                                     stop_words='english')

    if self.is_weight == 'TF-IDF':  # TF-IDF weighting
        vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                     tokenizer=self.tokenize,
                                     min_df=min_df,
                                     max_df=max_df,
                                     binary=True,
                                     stop_words='english')
    return vectorizer
def getTFV(token_pattern=token_pattern,
           norm=tfidf__norm,
           max_df=tfidf__max_df,
           min_df=tfidf__min_df,
           ngram_range=(1, 1),
           vocabulary=None,
           stop_words='english'):
    tfv = TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=token_pattern, ngram_range=ngram_range,
                          use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words=stop_words, norm=norm, vocabulary=vocabulary)
    return tfv


#========= CountVectorizer =========#
def __init__(self, column_descriptions=None):
    self.column_descriptions = column_descriptions
    self.text_col_indicators = set(['text', 'nlp'])

    self.text_columns = {}
    for key, val in self.column_descriptions.items():
        if val in self.text_col_indicators:
            self.text_columns[key] = TfidfVectorizer(
                # If we have any documents that cannot be decoded properly, just ignore them and keep going as planned with everything else
                decode_error='ignore'
                # Try to strip accents from characters. Using unicode is slightly slower but more comprehensive than 'ascii'
                , strip_accents='unicode'
                # Can also choose 'character', which will likely increase accuracy, at the cost of much more space, generally
                , analyzer='word'
                # Remove commonly found english words ('it', 'a', 'the') which do not typically contain much signal
                , stop_words='english'
                # Convert all characters to lowercase
                , lowercase=True
                # Only consider words that appear in fewer than max_df percent of all documents
                # In this case, ignore all words that appear in 90% of all documents
                , max_df=0.9
                # Consider only the most frequently occurring 3000 words, after taking into account all the other filtering going on
                , max_features=3000
            )
def fit_tfidf(self, df):
    '''
    Function to fit a TF-IDF matrix to a corpus of text

    INPUT:
        df: df with 'lemmatized_text' to analyze
    '''
    self.tfidf = TfidfVectorizer(input='content',
                                 use_idf=True,
                                 lowercase=True,
                                 max_features=self.tfidf_max_features,
                                 max_df=self.tfidf_max_df,
                                 min_df=self.tfidf_min_df)
    self.tfidf_matrix = self.tfidf.fit_transform(
        df['lemmatized_text']).toarray()
    self.tfidf_features = np.array(self.tfidf.get_feature_names())
    self.tfidf_reverse_lookup = {
        word: idx for idx, word in enumerate(self.tfidf_features)}
def create_vectorizer_selector(train_data, train_labels, model_file,
                               ngram_list=[1], max_num_features_list=[100], analyzer_type_list=['word']):
    """Call creation and save of vectorizers and selectors including special cases.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        model_file: model filename
        ngram_list: list of ranges of n-grams
        max_num_features_list: list of maximum number of features to select
        analyzer_type_list: list of analyzer types for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    for i in range(len(ngram_list)):
        ngrams_selection(train_data, train_labels, 'general_' + str(i), model_file,
                         ngram_range_=(ngram_list[i], ngram_list[i]),
                         max_num_features=max_num_features_list[i],
                         analyzer_type=analyzer_type_list[i])
    you_are_data = ngrams_you_are(train_data)
    ngrams_selection(you_are_data, train_labels, 'special', model_file,
                     ngram_range_=(1, 1), max_num_features=100)
    return
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset
    :return:
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    # Convert data to tf-idf
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)

    train_data = train_data.todense()
    test_data = test_data.todense()
    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target

    return train_data, train_labels, test_data, test_labels
def fit(self, X_df, y=None):
    # See if we should fit TfidfVectorizer or not
    for key in X_df.columns:
        if key in self.text_columns:
            X_df[key].fillna('nan', inplace=True)
            text_col = X_df[key].astype(str, raise_on_error=False)
            self.text_columns[key].fit(text_col)

            col_names = self.text_columns[key].get_feature_names()

            # Make weird characters play nice, or just ignore them :)
            for idx, word in enumerate(col_names):
                try:
                    col_names[idx] = str(word)
                except:
                    col_names[idx] = 'non_ascii_word_' + str(idx)

            col_names = ['nlp_' + key + '_' + str(word) for word in col_names]

            self.text_columns[key].cleaned_feature_names = col_names

    return self
def train(self, train_size=0.8, k_folds=5):
    # retrieve data from DB and pre-process
    self._get_data()

    # perform train/test split
    self._get_train_test_split(train_size=train_size)

    # define text pre-processing pipeline
    text_pipeline = Pipeline([
        ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
        ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
    ])

    # define pipeline for pre-processing of numeric features
    numeric_pipeline = Pipeline([
        ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
        ('scaler', MinMaxScaler())
    ])

    # combine both steps into a single pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_processing', text_pipeline),
            ('num_processing', numeric_pipeline)
        ])),
        ('clf', self._estimator)
    ])

    self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
    gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

    X = self.data.iloc[self.train_inds_, :]
    y = self.data[LABEL].values[self.train_inds_]

    gs.fit(X, y)

    self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

    self.gs_ = gs
    self.model_ = gs.best_estimator_
def tfidf(self):
    # keep both hashtags and mentions
    # token_pattern = r'(?u)@?#?\b\w\w+\b'
    # remove hashtags and mentions
    # token_pattern = r'(?u)(?<![#@])\b\w+\b'
    # just remove mentions and remove hashsign from hashtags
    # token_pattern = r'(?u)(?<![@])\b\w+\b'
    # remove mentions but keep hashtags with their sign
    # token_pattern = r'(?u)(?<![@])#?\b\w\w+\b'
    # collapse repeated characters after 2 occurrences, e.g. yesss => yess
    # re.sub(r"(.)\1+", r"\1\1", s)
    self.vectorizer = TfidfVectorizer(tokenizer=self.tokenizer, token_pattern=self.token_pattern,
                                      use_idf=self.idf, norm=self.norm, binary=self.btf,
                                      sublinear_tf=self.subtf, min_df=self.mindf, max_df=self.maxdf,
                                      ngram_range=(1, 1), stop_words=self.stops,
                                      vocabulary=self.vocab, encoding=self.encoding, dtype='float32')
    logging.info(self.vectorizer)
    self.X_train = self.vectorizer.fit_transform(self.df_train.text.values)
    self.X_dev = self.vectorizer.transform(self.df_dev.text.values)
    self.X_test = self.vectorizer.transform(self.df_test.text.values)
    logging.info("training n_samples: %d, n_features: %d" % self.X_train.shape)
    logging.info("development n_samples: %d, n_features: %d" % self.X_dev.shape)
    logging.info("test n_samples: %d, n_features: %d" % self.X_test.shape)
def loadDataset():
    '''Load the dataset (feature matrix X and labels y) from df_vec.csv'''
    df = pd.read_csv('df_vec.csv')
    # print df.shape
    X = np.array(df.iloc[:, 1:])
    y = np.array(df.iloc[:, 0])
    # print y
    # bet_list = list(df.iloc[:, 0])
    # dataset = []
    # for bet in bet_list:
    #     s, bet = bet.split(':')
    #     dataset.append(bet)
    # print dataset
    # print X
    # print y
    return X, y


# def transform(dataset, n_features=1000):
#     vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
#     X = vectorizer.fit_transform(dataset)
#     print X
#     # print vectorizer
#     return X, vectorizer
def get_binary(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                  average=False,
                                                  class_weight=None,
                                                  epsilon=0.1,
                                                  eta0=0.0,
                                                  fit_intercept=True,
                                                  l1_ratio=0.15,
                                                  learning_rate='optimal',
                                                  loss='log',
                                                  n_iter=10,
                                                  n_jobs=1,
                                                  penalty='l2',
                                                  power_t=0.5,
                                                  random_state=None,
                                                  shuffle=True,
                                                  verbose=0,
                                                  warm_start=False)))
    ])
def get_sgdc(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', SGDClassifier(alpha=0.0001,
                              average=False,
                              class_weight=None,
                              epsilon=0.1,
                              eta0=0.0,
                              fit_intercept=True,
                              l1_ratio=0.15,
                              learning_rate='optimal',
                              loss='log',
                              n_iter=10,
                              n_jobs=1,
                              penalty='l2',
                              power_t=0.5,
                              random_state=None,
                              shuffle=True,
                              verbose=0,
                              warm_start=False))
    ])
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """ Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.

        :param str verb_token: Surface form of a verb, e.g., *born*
        :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer used to transform verbs into vectors
        :return: cosine similarity score
        :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
    logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
    return scores
def preprocess_simple(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1),
                      apply_tfidf=True, apply_norm=True):
    """
    Preprocess a list containing text documents stored as strings, where the documents
    have already been tokenized and are separated by whitespace
    """
    token_pattern = re.compile(r"[\s\-]+", re.U)

    def custom_tokenizer(s):
        return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length)]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)

    # store the vocabulary map
    terms = []
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms)
def _vectorize_documents(self, method='tfidf', max_features=100):
    stop_words = []
    try:
        for lexicon_id in self.params['cluster_lexicons']:
            lexicon = Lexicon.objects.get(id=int(lexicon_id))
            words = Word.objects.filter(lexicon=lexicon)
            stop_words += [word.wrd for word in words]
    except KeyError:
        pass

    if method == 'count':
        vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
    if method == 'tfidf':
        vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)

    document_vectors = vectorizer.fit_transform(self.documents)
    document_vectors = document_vectors.toarray()

    return document_vectors, vectorizer.get_feature_names()
def generateTfIdfVectorizer(data, stop='english', max_df=0.08, min_df=8):
    tokenizer = tokenizer_snowball if stop != 'english' else tokenizer_porter

    tfidf = TfidfVectorizer(strip_accents=None,
                            max_df=max_df,
                            min_df=min_df,
                            lowercase=True,
                            stop_words=stop,
                            sublinear_tf=True,
                            tokenizer=tokenizer,
                            analyzer='word',
                            max_features=16,
                            preprocessor=preprocessor)
    X = tfidf.fit_transform(data)
    print('%d Features: %s' % (len(tfidf.get_feature_names()), tfidf.get_feature_names()))
    return X
def gridSearch(data, params, true_k):
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')
    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])
    gsTfIdf = GridSearchCV(lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def get_word_clouds(tweets, users, words_n=50, lang='english'):
    default_stopwords = set(nltk.corpus.stopwords.words(lang))
    stopwords_file = '../data/stopwords.txt'
    custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
    all_stopwords = default_stopwords | custom_stopwords

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
    X = vectorizer.fit_transform(tweets)
    terms = vectorizer.get_feature_names()

    word_cloud_per_person = {}
    for doc in range(len(tweets)):
        feature_index = X[doc, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
        doc_terms = []
        for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
            doc_terms.append((word, score))
        important_terms = [(word, score) for word, score in
                           sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
        word_cloud_per_person[users[doc]] = important_terms
    return word_cloud_per_person
def delegate_create(self, top, bottom, sample_size=1000, source=sfsf_config.EPUB):
    top_sellers, bottom_sellers = top, bottom
    if source == sfsf_config.EPUB:
        training_data_top = self.sample_epubs(top_sellers, sample_size)
        training_data_bottom = self.sample_epubs(bottom_sellers, sample_size)
    else:
        training_data_top = self.sample_txts(top_sellers, sample_size)
        training_data_bottom = self.sample_txts(bottom_sellers, sample_size)
    training_samples_top = [sample for training_data in training_data_top for sample in training_data[1]]
    training_samples_bottom = [sample for training_data in training_data_bottom for sample in training_data[1]]
    isbns = [training_data[0] for training_data in training_data_top for sample in training_data[1]] + \
            [training_data[0] for training_data in training_data_bottom for sample in training_data[1]]
    y_narr = numpy.array([1] * len(training_samples_top) + [0] * len(training_samples_bottom))
    vect = TfidfVectorizer(tokenizer=MorePunctuationTokenizer())
    x_tdm = vect.fit_transform(training_samples_top + training_samples_bottom)
    print('Created training data', ':')
    print('x shape', ':', x_tdm.shape)
    print('y shape', ':', y_narr.shape)
    # TODO: make a nicer return structure
    return {'x': x_tdm, 'y': y_narr, 'vectorizer': vect, 'isbns': isbns}
def create_model_from_training_data(self):
    training_comments = []
    training_ratings = []
    print("Training classifier model..")
    for sentidata in self.training_data:
        comments = preprocess_text(sentidata.text)
        training_comments.append(comments)
        training_ratings.append(sentidata.rating)

    # discard stopwords, apply stemming, and discard words present in less than 3 comments
    self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, sublinear_tf=True,
                                      max_df=0.5, stop_words=mystop_words, min_df=3)
    X_train = self.vectorizer.fit_transform(training_comments).toarray()
    Y_train = np.array(training_ratings)

    # Apply SMOTE to improve ratio of the minority class
    smote_model = SMOTE(ratio=0.5, random_state=None, k=None, k_neighbors=15,
                        m=None, m_neighbors=15, out_step=.0001,
                        kind='regular', svm_estimator=None, n_jobs=1)
    X_resampled, Y_resampled = smote_model.fit_sample(X_train, Y_train)

    model = self.get_classifier()
    model.fit(X_resampled, Y_resampled)

    return model
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
def construct_tf_idf_matrix(data, store=False):
    print("TF-IDF Normalized Matrix Construction...")
    vectorizer = TfidfVectorizer(stop_words='english')
    print(data)
    training_data = vectorizer.fit_transform(data)
    print("Done Constructing Matrix")
    print(training_data.toarray())

    if store:
        print("Pickling Trained Transformer...")
        pickle.dump(vectorizer, open(path_config.TRANSFORMER_PICKLING_FILE, 'wb'))
        print("Pickling Done.")

    return training_data
def rf_categorize(email):
    # get training corpus
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # vectorize corpus
    labels = [row[0] for row in emails]
    data = [row[1] for row in emails]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # vectorize input
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(X[0]))) + 1)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def TL():
    allurls = './data/data.csv'  # path to our all urls file
    allurlscsv = pd.read_csv(allurls, ',', error_bad_lines=False)  # reading file
    allurlsdata = pd.DataFrame(allurlscsv)  # converting to a dataframe

    allurlsdata = np.array(allurlsdata)  # converting it into an array
    random.shuffle(allurlsdata)  # shuffling

    y = [d[1] for d in allurlsdata]  # all labels
    corpus = [d[0] for d in allurlsdata]  # all urls corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)  # get a vector for each url but use our customized tokenizer
    X = vectorizer.fit_transform(corpus)  # get the X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # split into training and testing set, 80/20 ratio

    lgs = LogisticRegression()  # using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))  # print the score; it comes out to be 98%
    return vectorizer, lgs
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000,
            method='counts', binary=True, sparse=False):
    # choosing the particular flavor of vectorizer
    if method == 'counts':
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams),
                                     decode_error='replace', binary=binary)
    elif method == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams),
                                     decode_error='replace')

    # fitting the vectorizer and converting the counts to an array
    full_fit = vectorizer.fit_transform(df[x_name])
    full_counts = full_fit.toarray()
    self.vocabulary_ = vectorizer.vocabulary_

    # passing the attributes up to the class instance
    self.data = df
    if sparse:
        full_counts = csr_matrix(full_counts)
    self.X = full_counts
    if y_name != None:
        self.y = np.array(df[y_name])
    return


# splits the data into training and test sets; either called from process()
# or on its own when your text is already vectorized and divided into x and y
def bag_of_words(messages, model=None, weighting=''):
    # TODO: Add stemming or baseform here
    messages, stemmings2baseform = texttools.stemming_messages_snowball(messages)

    # Create new model for extracting text features if None is given
    if model is None:
        if weighting == 'tfidf':
            model = TfidfVectorizer()
        else:
            model = CountVectorizer()
        model.fit(messages)

    # Extract features
    x = model.transform(messages)
    return x
def spams_count(texts):
    """
    Returns the number of spams from a list of (type, text) tuples.

    Args:
        texts: a list of (type, text) tuples.

    Returns:
        an integer representing the number of spams.
    """
    spams_count = 0
    for t, _ in texts:
        # t=1 if it's a spam, 0 if not
        spams_count += t
    return spams_count


# Ex 1.3
# See http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# for the parameters
def transform_text(pairs):
    """
    Transforms the pair data into a matrix X containing tf-idf values for the
    messages and a vector y containing 0s and 1s (for hams and spams respectively).
    Row i in X corresponds to the i-th element of y.

    Args:
        pairs: a list of (type, message) tuples.

    Returns:
        X: a sparse TF-IDF matrix where each row represents a message and each column represents a word.
        Y: a vector whose i-th element is 0 if the i-th message is a ham, else 1.
    """
    tfidf = TfidfVectorizer(stop_words="english")
    types, texts = zip(*pairs)
    X = tfidf.fit_transform(texts)
    # Convert the list to a Numpy array because some sklearn objects don't
    # accept lists.
    y = np.array(types)
    return X, y


# Ex 2
def prune(self, question, paragraphs: List[ExtractedParagraph]):
    if not self.filter_dist_one and len(paragraphs) == 1:
        return paragraphs

    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    # in case of ties, use the earlier paragraph
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))

    if self.filter_dist_one:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select]]
def dists(self, question, paragraphs: List[ExtractedParagraph]):
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    # in case of ties, use the earlier paragraph
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))

    if self.filter_dist_one:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select]]
def texts_tfidf(ids, important_texts, citations_texts):
    '''
    Generates tf-idf vectors for each text then calculates cosine similarity between the vectors.
    '''
    tfidf = TfidfVectorizer(strip_accents='ascii',
                            stop_words='english',
                            ngram_range=(1, 2),
                            min_df=2)

    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()

    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()

    return terms1, terms2, freqs1, freqs2
def precomputed_similarity(self):
    # calculate similarity between train and test set job descriptions
    # this is of high order complexity - test it on a subset of the data
    corpus_list = pandas_vector_to_list(self.description_train_data)
    queries_list = pandas_vector_to_list(self.description_test_data)
    self.free_memory()

    print('{}: starting to vectorize description'.format(self.__class__.__name__))
    # use a custom vectorizer to cut off very rare and very common terms since they carry little information
    vectorizer = TfidfVectorizer(stop_words='english',
                                 lowercase=True,
                                 min_df=0.05,
                                 max_df=0.99)
    vectorizer, corpus_vector, queries_vector = tfidf_vectorize(corpus_list,
                                                                queries_list,
                                                                tfidf_vectorizer=vectorizer)
    print("vocabulary size: {}".format(len(vectorizer.get_feature_names())))

    self.store_precomputed_data(corpus_vector, queries_vector, self.y_train, self.y_test)
    self.load_precomputed_data()
def tfidf_vectorize(documents, queries=[''],
                    tfidf_vectorizer=TfidfVectorizer(stop_words='english', lowercase=True)):
    """
    vectorize job_descriptions using tfidf

    :param documents: list of text (training data)
    :param queries: list of text (test data) - can be empty [''] (default)
                    in case we just want to vectorize a single corpus
    :param tfidf_vectorizer: to overwrite with an existing/trained vectorizer or different parameters
    :return: (tfidf_vectorizer, document_vector, queries_vector)
    """
    # easier to test with smaller data set
    # use this to overwrite the incoming corpus/queries
    # documents = ['aaa bbb', 'ccc eee', 'aaa ddd', 'ddd ddd', 'ccc aaa']
    # queries = ['aaa bbb', 'ddd ddd']

    tfidf_vectorizer.fit(documents, queries)
    document_vector = tfidf_vectorizer.transform(documents)
    queries_vector = tfidf_vectorizer.transform(queries)
    return tfidf_vectorizer, document_vector, queries_vector
def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
def build_feature_matrix(documents, feature_type='frequency', ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    feature_type = feature_type.lower().strip()

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
def __getTFIDFVectors(self, question1, question2):
    question1 = question1.lower()
    question2 = question2.lower()
    question1 = question1 if question1 != "nan" else ""
    question2 = question2 if question2 != "nan" else ""
    question1 = re.sub('\W+', ' ', question1)
    question2 = re.sub('\W+', ' ', question2)
    question1_tokens = question1.split()
    question2_tokens = question2.split()
    vocabulary = question1_tokens + question2_tokens
    vocabulary = list(set(vocabulary))
    vectorizer = TfidfVectorizer(analyzer='word', vocabulary=vocabulary)
    vectorized_q1 = vectorizer.fit_transform([question1])
    vectorized_q2 = vectorizer.transform([question2])
    return vectorized_q1, vectorized_q2
def run(self):
    contents = [self.__read_file(novel['_id']) for novel in self.novels]
    vectorizer = TfidfVectorizer(input="file", stop_words=stop_words, max_features=50000)
    print("start vectorizing...")
    t0 = time()
    # vectorize the documents
    X = vectorizer.fit_transform(contents)
    print("done in %0.3fs" % (time() - t0))
    with open("dataset.pickle", "w") as f:
        print("saving dataset.....")
        pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)
    # save the vectorizer model
    with open("vectorizer.pickle", "w") as f:
        print("saving vectorizer model.....")
        pickle.dump(vectorizer, f)
    # clean up
    self.__close()
    print("Finished!! All documents has been vectorized.")
def __init__(self, corpus, pairtype, relations, modelname="mil_classifier.model", test=False,
             ner="goldstandard", generate=True):
    super(MILClassifier, self).__init__()
    self.modelname = modelname
    self.pairtype = pairtype
    self.pairs = {}  # (e1.normalized, e2.normalized) => (e1, e2)
    self.instances = {}  # bags of instances (e1.normalized, e2.normalized) -> all instances with these two entities
    self.labels = {}  # (e1.normalized, e2.normalized) => label (-1/1)
    self.bag_labels = []  # ordered list of labels for each bag
    self.bag_pairs = []  # ordered list of pair labels (e1.normalized, e2.normalized)
    self.data = []  # ordered list of bags, each is a list of feature vectors
    self.predicted = []  # ordered list of predictions for each bag
    self.resultsfile = None
    self.examplesfile = None
    self.ner_model = ner
    self.vectorizer = CountVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b')
    self.corpus = corpus

    # self.vectorizer = TfidfVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b', max_features=)
    # self.classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=20)
    self.classifier = misvm.sMIL(kernel='linear', C=1)
    # self.classifier = misvm.MissSVM(kernel='linear', C=100)  # , max_iters=20)

    # if generate:
    #     self.generateMILdata(test=test, pairtype=pairtype, relations=relations)
def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
    tfidf = TfidfVectorizer(min_df=3,
                            max_df=0.75,
                            max_features=None,
                            norm="l2",
                            strip_accents="unicode",
                            analyzer="word",
                            token_pattern=r"\w{1,}",
                            ngram_range=(1, ngram),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            # stop_words="english",
                            vocabulary=vocabulary)
    return tfidf


## char based
def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
    tfidf = TfidfVectorizer(min_df=3,
                            max_df=0.75,
                            max_features=None,
                            norm="l2",
                            strip_accents="unicode",
                            analyzer="char",
                            token_pattern=r"\w{1,}",
                            ngram_range=(1, ngram),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            # stop_words="english",
                            vocabulary=vocabulary)
    return tfidf


# ------------------------ LSA -------------------------------
def extractDoc(ext):
    root = 'data'
    data = []
    for f in os.listdir(os.path.join(root, ext))[:5]:
        with open(os.path.join(root, ext, f), 'r') as sc:
            sc = clean(sc.read(), 'cpp')
            data.append(sc)
            print "[SUCCESS] Read", os.path.join(root, ext, f)
    vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))
    X = vectorizer.fit_transform(data)
    del data
    features_by_gram = defaultdict(list)
    for f, w in zip(vectorizer.get_feature_names(), vectorizer.idf_):
        features_by_gram[len(f.split(' '))].append((f, w))
    top_n = 50
    for gram, features in features_by_gram.iteritems():
        top_features = sorted(features, key=lambda x: x[1], reverse=True)[:top_n]
        top_features = [f[0] for f in top_features]
        print '{}-gram top:'.format(gram), top_features
def extract_terms_with_corpus_sklearn(text_files, number_of_terms=10, max_features=20,
                                      max_words=3, lemmatize=True, train_on_script=True):
    # tokenizer
    analyzer = lambda s: extract_chunks(read_txt(s), lemmatize=lemmatize, max_words=max_words)

    # All-in-one object for tfidf calculation
    tfidf_vectorizer = TfidfVectorizer(input='filename', analyzer=analyzer, max_features=max_features)

    # fit training data & get tfidf matrix
    if train_on_script:
        tfidf_mat = tfidf_vectorizer.fit(text_files[0:])
    else:
        tfidf_mat = tfidf_vectorizer.fit(text_files[1:])

    # transform first file
    tfidf_script = tfidf_vectorizer.transform([text_files[0]])

    # get map between id and term
    id2term = tfidf_vectorizer.get_feature_names()

    return [(id2term[i], tfidf_script[0, i])
            for i in tfidf_script.toarray()[0, :].argsort()[::-1][0:number_of_terms]]
def tfidf(corpus, corpusKeys):
    # TODO clean this up
    # discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend([str(x) for x in range(9999)])
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 3))

    # matrix of input set
    X = (vectorizer.fit_transform(corpus)).toarray()
    size_matrix = X.shape[0]
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    terms = vectorizer.get_feature_names()

    records = []
    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
        # List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]
        # List with Article ID and list of tfidf terms
        records.append((vader(corpusKeys[i], termList), termList))
    return records
def GaussianNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header=0, sep=",")
    test = pd.read_csv('../feature/testQlist.csv', header=0, sep=",")
    print "Train tf-idf vector Model..."
    encode = TfidfVectorizer(decode_error='ignore', norm="l2", binary=False, sublinear_tf=True, min_df=50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)
    print localTrainFeature.shape, localTestFeature.shape
    print 'train...'
    model = GaussianNB()
    model.fit(X=localTrainFeature.toarray(), y=localTrainLabel)
    print 'predict...'
    if config['prob'] == False:
        return model.predict(localTestFeature.toarray()), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature.toarray()), test['uid'].values


#-- Multinomial Naive Bayes cross validation model frame
def MultinomialNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header=0, sep=",")
    test = pd.read_csv('../feature/testQlist.csv', header=0, sep=",")
    print "Train tf-idf vector Model..."
    encode = TfidfVectorizer(decode_error='ignore', norm="l2", binary=False, sublinear_tf=True, min_df=50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)
    print localTrainFeature.shape, localTestFeature.shape
    print 'train...'
    model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    model.fit(X=localTrainFeature, y=localTrainLabel)
    print 'predict...'
    if config['prob'] == False:
        return model.predict(localTestFeature), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature), test['uid'].values


#-- xgboost local cross validation model frame