The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_extraction.DictVectorizer().
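Before the project examples, here is a minimal, self-contained sketch of the typical DictVectorizer workflow (it is not taken from any of the projects below; the feature names and values are invented for illustration):

from sklearn.feature_extraction import DictVectorizer

# Each sample is a dict of feature name -> value; string values are one-hot
# encoded, numeric values are passed through unchanged.
train = [
    {"city": "London", "temperature": 12.0},
    {"city": "Paris", "temperature": 18.0},
]

vec = DictVectorizer(sparse=False)   # sparse=True (the default) returns a SciPy CSR matrix
X = vec.fit_transform(train)         # shape (2, 3): city=London, city=Paris, temperature

print(vec.get_feature_names())       # column names learned during fit
                                     # (newer scikit-learn versions use get_feature_names_out())
print(X)

# transform() reuses the fitted vocabulary; unseen feature values get all-zero columns
X_new = vec.transform([{"city": "Berlin", "temperature": 9.5}])
print(X_new)                         # both city columns are 0, temperature is 9.5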
def onehot_encode_bar(tr, te, cols=None, bar=10000):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    cat, num = [], []
    for col in cols:
        nu = tr[col].unique().shape[0]
        if (nu < bar and nu > 2) or tr[col].dtype == 'object':
            cat.append(col)
            tr[col] = tr[col].map(str)
            te[col] = te[col].map(str)
        else:
            num.append(col)
    print("start fitting num of cat features:", len(cat))
    X = vec.fit_transform(tr[cat].T.to_dict().values())
    Xt = vec.transform(te[cat].T.to_dict().values())
    print("done fitting", X.shape, Xt.shape)
    X = sparse.hstack([X, tr[num].values], format='csr')
    Xt = sparse.hstack([Xt, te[num].values], format='csr')
    return X, Xt
def __init__(self, a_clf=None, a_grid_search=False): """Class constructor. Args: a_clf (classifier or None): classifier to use or None for default a_grid_search (bool): use grid search for estimating hyper-parameters """ classifier = a_clf self._gs = a_grid_search if a_clf is None: classifier = XGBClassifier(max_depth=MAX_DEPTH, n_estimators=NTREES, learning_rate=ALPHA, objective="multi:softprob") self._clf = classifier # latest version of XGBoost cannot deal with non-sparse feature vectors self._model = Pipeline([("vect", DictVectorizer()), ("clf", classifier)])
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    # First, restrict our DictVectorizer or DataFrameVectorizer
    # This goes through and has DV only output the items that have passed our support mask
    # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
    # It also significantly reduces the size of dv.vocabulary_ which can get quite large
    dv = transformation_pipeline.named_steps['dv']

    try:
        feature_selection = transformation_pipeline.named_steps['feature_selection']
        feature_selection_mask = feature_selection.support_mask
        dv.restrict(feature_selection_mask)
    except KeyError:
        pass

    # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
    # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
    trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)

    return trained_pipeline_without_feature_selection
def ohEncoding(data, cols, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.iteritems():
            if v == 'object':
                if el == 'key':
                    pass
                else:
                    cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))

    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

# df, t, v = ohEncoding(df, col, replace=True)
def ohEncoding(data, cols=None, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.iteritems():
            if v == 'object':
                cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))

    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
    """ Initializes the extractor.

        :param language: The language of the sentences that will be used
        :param window_width: how many tokens to look before and after each token
         when building its features.
        :param collapse_fes: Whether to collapse FEs to a single token
         or to keep them split.
    """
    self.language = language
    self.tagger = TTPosTagger(language)
    self.window_width = window_width
    self.collapse_fes = collapse_fes
    self.unk_feature = 'UNK'
    self.vectorizer = DictVectorizer()
    self.target_size = target_size
    self.reducer = TruncatedSVD(target_size) if target_size else None
    self.vocabulary = set()
    self.label_index = {}
    self.lu_index = {}
    self.stopwords = set(w.lower() for w in StopWords().words(language))
    self.start()
def __init__(self, a_clf=None, a_grid_search=False): """Class constructor. Initialize classifier. Args: a_clf (classifier or None): classifier to use or None for default a_grid_search (bool): use grid search for estimating hyper-parameters """ classifier = a_clf or LinearSVC(C=DFLT_C, **DFLT_PARAMS) self._gs = a_grid_search self._model = Pipeline([("vect", DictVectorizer()), ("clf", classifier)])
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)
    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
        return

    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)

    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_

    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
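The docstring above explains the sparse flag; as a quick illustration of what it changes (a minimal sketch with invented toy features, separate from the project code above), DictVectorizer returns a SciPy sparse matrix by default and a dense NumPy array when sparse=False:

from sklearn.feature_extraction import DictVectorizer

D = [{"word=good": 1, "len": 4}, {"word=bad": 1, "len": 3}]  # invented toy features

X_sparse = DictVectorizer(sparse=True).fit_transform(D)   # SciPy CSR matrix
X_dense = DictVectorizer(sparse=False).fit_transform(D)   # plain numpy.ndarray

print(type(X_sparse))   # a scipy.sparse csr_matrix
print(type(X_dense))    # <class 'numpy.ndarray'>
print(X_dense)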
def predict_function():
    x_list = []
    line_list = []
    line_dict = {}
    predict_doc = joblib.load('logreg.pkl')
    feature_doc = joblib.load("word_vec.pkl")
    y_train, x_train = get_feature()
    line = "bad bad good good"
    line_list = line.split()
    for line in x_train:
        for key in line:
            line_dict[key] = 0
    line_dict.update(dict(Counter(line_list)))
    for a in sorted(line_dict.items(), key=lambda x: x[1]):
        print(a)
    x_list.append(line_dict)
    print(x_list)
    exit()
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for pred, prob in zip(pred, prob):
        print(pred, prob)
def dimension_reduction():
    X = PPMI_matrix()
    word_list = list()
    vecdict_list = list()
    for word, vector in sorted(X.items()):
        word_list.append(word)
        vecdict_list.append(dict(vector))
    Dic2Vec = DictVectorizer(sparse=True)
    vector_list = Dic2Vec.fit_transform(vecdict_list)
    X_svd = svds(vector_list, 300)
    X_pca = np.dot(X_svd[0], np.diag(X_svd[1]))
    word_matrix = dict()
    for word, vector in zip(word_list, X_pca):
        word_matrix[word] = vector
    return word_matrix
def dimension_compression():
    X_t_c = make_matrix()
    token_list = []
    contexts_list = []
    for token, contexts in sorted(X_t_c.items()):
        token_list.append(token)
        contexts_list.append(contexts)
    pca = PCA(n_components=300)
    DictoVec = DictVectorizer(sparse=True)
    sparse = DictoVec.fit_transform(contexts_list)
    print(sparse.shape)
    vec_list = pca.fit_transform(sparse.todense())
    word_vec = {}
    for token, vec in zip(token_list, vec_list):
        word_vec[token] = vec
    return word_vec
def dim_reduction():
    dic2vec = DictVectorizer(sparse=True)
    PPMI = getPPMI()
    tc = list()
    token_list = list()
    for token, contexts in sorted(PPMI.items()):
        token_list.append(token)
        contexts = dict(contexts)
        tc.append(contexts)
    tc_vec = dic2vec.fit_transform(tc)
    tc_svd = svds(tc_vec, 300)
    tc_pca = np.dot(tc_svd[0], np.diag(tc_svd[1]))
    word_vec = dict()
    for token, vec in zip(token_list, tc_pca):
        word_vec[token] = vec
    return word_vec
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    # First, restrict our DictVectorizer or DataFrameVectorizer
    # This goes through and has DV only output the items that have passed our support mask
    # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
    # It also significantly reduces the size of dv.vocabulary_ which can get quite large
    try:
        feature_selection = transformation_pipeline.named_steps['feature_selection']
        feature_selection_mask = feature_selection.support_mask
        transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
    except KeyError:
        pass

    # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
    # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
    trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)

    return trained_pipeline_without_feature_selection
def data2Vector(self):
    vec = DictVectorizer()
    dummy_x = vec.fit_transform(self.feature_list).toarray()
    lb = LabelBinarizer()
    dummy_y = lb.fit_transform(self.label_list)
    return dummy_x, dummy_y

# here the decision tree uses the ID3 algorithm; ID3 uses
# information gain for feature selection
def transform(self, documents):
    """
    Returns a dictionary of text features in advance of a DictVectorizer.
    """
    for document in documents:
        # Collect token and vocabulary counts
        counts = Counter(
            item[0] for para in document for sent in para for item in sent
        )

        # Yield structured information about the document
        yield {
            'paragraphs': len(document),
            'sentences': sum(len(para) for para in document),
            'words': sum(counts.values()),
            'vocab': len(counts),
        }

##########################################################################
## Model Building Functions
##########################################################################
def create_feature(sent_list):
    feature_ = []
    polarity = []    # polarity labels
    features_ = []   # feature dicts for all sentences

    # vectorizer
    vec = DictVectorizer()
    for line in sent_list:
        sentence = line.strip('\n').split()
        sentence2 = sentence.pop(0)
        polarity.append(int(sentence2))
        #print(polarity)
        feature_ = feature(sentence)
        '''
        for word in feature(sentence):
            feature_.append(word)
        print(feature_)
        '''
        features_.append(feature_vector(feature_))
    x_feature = vec.fit_transform(features_)
    return x_feature, polarity
def getFeatures(numWordsToUse, allTweets, allTweetsSentiment):
    # each corpus's getFeatures function is responsible for somehow loading in their own allTweets and allTweetsSentiment data
    # then they have to ensure that data is tokenized (leveraging the modular tokenization functionality in utils)
    # then shuffle the dataset
    # then create the frequency distribution and popularWords
    # then extract features from each tweet, and un-combine the sentiment again
    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            allTweets, allTweetsSentiment, 0, numWordsToUse, 'counts'
        )

    # right now we have a data structure roughly equivalent to a dense matrix, except each row is a dictionary
    # DictVectorizer performs two key functions for us:
    #   1. transforms each row from a dictionary into a vector using consistent placing of keys into indexed positions within each vector
    #   2. returns sparse vectors, saving enormous amounts of memory which becomes very useful when training our models
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment
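The comments above describe the two things DictVectorizer provides here: a consistent key-to-column mapping and sparse output. A minimal sketch of both behaviors, using made-up per-tweet word-count dictionaries rather than the project's own data:

from sklearn.feature_extraction import DictVectorizer

# made-up word-count dictionaries, one per tweet
rows = [{"good": 2, "movie": 1}, {"bad": 1, "movie": 1}]

dv = DictVectorizer()          # sparse=True by default
X = dv.fit_transform(rows)     # SciPy CSR matrix, shape (2, 3)

print(dv.vocabulary_)          # stable mapping: feature name -> column index
print(X.toarray())

# transform() keeps the same columns for new data; unseen words are dropped
print(dv.transform([{"good": 1, "terrible": 3}]).toarray())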
def test_unseen_or_no_features(): D = [{"camelot": 0, "spamalot": 1}] for sparse in [True, False]: v = DictVectorizer(sparse=sparse).fit(D) X = v.transform({"push the pram a lot": 2}) if sparse: X = X.toarray() assert_array_equal(X, np.zeros((1, 2))) X = v.transform({}) if sparse: X = X.toarray() assert_array_equal(X, np.zeros((1, 2))) try: v.transform([]) except ValueError as e: assert_in("empty", str(e))
def onehot_encode(tr, te, cols=None):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    for col in cols:
        tr[col] = tr[col].map(str)
        te[col] = te[col].map(str)
    print("start fitting")
    X = vec.fit_transform(tr[cols].T.to_dict().values())
    Xt = vec.transform(te[cols].T.to_dict().values())
    print("done fitting", X.shape, Xt.shape)
    return X, Xt
def _validate_input_col_descriptions(self): found_output_column = False self.cols_to_ignore = [] expected_vals = set(['categorical', 'text', 'nlp']) for key, value in self.column_descriptions.items(): value = value.lower() self.column_descriptions[key] = value if value == 'output': self.output_column = key found_output_column = True elif value == 'date': self.date_cols.append(key) elif value == 'ignore': self.cols_to_ignore.append(key) elif value in expected_vals: pass else: raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".') if found_output_column is False: print('Here is the column_descriptions that was passed in:') print(self.column_descriptions) raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.') # We will be adding one new categorical variable for each date col # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column for date_col in self.date_cols: self.column_descriptions[date_col + '_day_part'] = 'categorical' # We use _construct_pipeline at both the start and end of our training. # At the start, it constructs the pipeline from scratch # At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
def varianceFilter(train_data, train_classes, threshold):
    #if True:
    #    return frequencyFilter(train_data, train_classes, threshold)
    '''
    Variance filter
    '''
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    #y_train = train_classes

    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)

    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):
    labels = [x.severity for x in data]

    generatePrimaryFeats(data, featTypes)

    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a bow word, the old code
            # would overwrite one of the features.
            instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})

        featurized.append(instance)

    d = DictVectorizer()
    x_train = d.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True, random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)

    train_feats = [featurized[idx] for idx in range(0, len(trainSet), 1)]
    test_feats = [featurized[idx] for idx in range(len(trainSet), len(trainAndBSData), 1)]

    # Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(train_feats, y_train, [i for i in range(0, len(trainSet), 1)], es)

    # calculate Inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)

    y_pred_prob = model.predict_proba(x_test)
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train)  # update none to confidence vector
def _vectorize(self, corpus, fit):
    assert isinstance(corpus, kindred.Corpus)

    matrices = []
    for feature in self.chosenFeatures:
        assert feature in self.featureInfo.keys()
        featureFunction = self.featureInfo[feature]['func']
        never_tfidf = self.featureInfo[feature]['never_tfidf']
        data = featureFunction(corpus)
        notEmpty = any(len(d) > 0 for d in data)
        if fit:
            if notEmpty:
                self.dictVectorizers[feature] = DictVectorizer()
                if self.tfidf and not never_tfidf:
                    self.tfidfTransformers[feature] = TfidfTransformer()
                    intermediate = self.dictVectorizers[feature].fit_transform(data)
                    matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].fit_transform(data))
        else:
            if feature in self.dictVectorizers:
                if self.tfidf and not never_tfidf:
                    intermediate = self.dictVectorizers[feature].transform(data)
                    matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].transform(data))

    mergedMatrix = hstack(matrices)
    return mergedMatrix
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
def __init__(self, name, warm_start=True):
    self.vocal = DictVectorizer()
    self.model = linear_model.LogisticRegression(warm_start=warm_start,
                                                 solver='sag',
                                                 max_iter=200,
                                                 verbose=0,
                                                 penalty='l2',
                                                 n_jobs=4)
def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
    return make_pipeline(DictVectorizer(sparse=False), clf)
def predict_function():
    x_list = []
    predict_doc = joblib.load('logreg.pkl')
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    return pred, y_train, prob
def log_regression():
    x_list = []
    logreg = LogisticRegression()
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    word_vec = DictVectorizer()
    X = word_vec.fit_transform(x_list)
    logreg.fit(X, y_train)
    joblib.dump(logreg, 'logreg.pkl')
    joblib.dump(word_vec, "word_vec.pkl")
def cv_prediction(feature_dict, feature, polarity, threshold, folds):
    accuracy = 0
    precision = 0
    recall = 0
    f1 = 0
    count = 0
    dicvec = DictVectorizer()
    LR = LogisticRegression()
    kfold = KFold(len(polarity), n_folds=folds)
    for train, test in kfold:
        count += 1
        x = list()
        y = list()
        [x.append(feature[i]) for i in train]
        [y.append(polarity[i]) for i in train]
        x.append(feature_dict)
        y.append(0)
        LR.fit(dicvec.fit_transform(x), y)
        test_label = list()
        answer_label = list()
        [answer_label.append(polarity[j]) for j in test]
        for j in test:
            query = fit_feature(feature[j], feature_dict)
            result = -1 if query.shape[1] != len(feature_dict) else prediction(LR, query, threshold)
            test_label.append(result)
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))
    return accuracy, precision, recall, f1
def main():
    lr = joblib.load('./lr.pkl')
    dic2vec = DictVectorizer()
    features = list()
    y = list()
    for line in open('sentiment.txt'):
        word_list = line[3:].strip('\n').strip().split()
        features.append(getFeature(word_list))
    x = dic2vec.fit_transform(features)
    with open('sentiment_prediction.txt', 'w') as fp:
        for sentiment, prob in zip(lr.predict(x), lr.predict_proba(x)):
            print('{}\t{}'.format(sentiment, prob), file=fp)
def fit(self, X, y=None):
    # assumes all columns of X are strings
    Xdict = X.to_dict('records')
    self.dv = DictVectorizer(sparse=False)
    self.dv.fit(Xdict)
    return self
def test_dictvectorizer(self):
    D = [{"foo": 1, "bar": 3}, {"bar": 4, "baz": 2}, {"bar": 1, "quux": 1, "quuux": 2}]
    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                v = v.fit(D)
                self._test_conversion(D, v)
def test_unseen_or_no_features(self):
    D1 = [{"camelot": 0, "spamalot": 1}]
    D2 = [{}, {"nothing": 21}]
    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                v = v.fit(D1)
                self._test_conversion(D2, v)
def test_int_features_in_pipeline(self):
    import numpy.random as rn
    import pandas as pd

    rn.seed(0)

    x_train_dict = [dict((rn.randint(100), 1) for i in range(20)) for j in range(100)]
    y_train = [0, 1] * 50

    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.linear_model import LogisticRegression

    pl = Pipeline([("dv", DictVectorizer()), ("lm", LogisticRegression())])
    pl.fit(x_train_dict, y_train)

    import coremltools
    model = coremltools.converters.sklearn.convert(pl, input_features="features", output_feature_names="target")

    x = pd.DataFrame({"features": x_train_dict, "prediction": pl.predict(x_train_dict)})

    cur_eval_metics = evaluate_classifier(model, x)
    self.assertEquals(cur_eval_metics['num_errors'], 0)
def _validate_input_col_descriptions(self): found_output_column = False self.cols_to_ignore = [] expected_vals = set(['categorical', 'text', 'nlp']) for key, value in self.column_descriptions.items(): value = value.lower() self.column_descriptions[key] = value if value == 'output': self.output_column = key found_output_column = True elif value == 'date': self.date_cols.append(key) elif value == 'ignore': self.cols_to_ignore.append(key) elif value in expected_vals: pass else: raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".') if found_output_column is False: print('Here is the column_descriptions that was passed in:') print(self.column_descriptions) raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.') # We will be adding one new categorical variable for each date col # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column for date_col in self.date_cols: self.column_descriptions[date_col + '_day_part'] = 'categorical' self.cols_to_ignore = set(self.cols_to_ignore) # We use _construct_pipeline at both the start and end of our training. # At the start, it constructs the pipeline from scratch # At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it