The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.LabelEncoder().
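Before the project excerpts, here is a minimal sketch of the core API: fit() learns the sorted set of unique classes, transform() maps labels to integer codes, and inverse_transform() maps codes back. The city labels are purely illustrative.

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    labels = ['paris', 'tokyo', 'paris', 'amsterdam']
    le.fit(labels)
    print(le.classes_)                   # ['amsterdam' 'paris' 'tokyo'] -- classes are sorted
    codes = le.transform(labels)
    print(codes)                         # [1 2 1 0]
    print(le.inverse_transform(codes))   # ['paris' 'tokyo' 'paris' 'amsterdam']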
def lbl_encode(df_tr, df_te=None, cols=None, objonly=True):
    print("label encode ...")
    lbl = LabelEncoder()
    if df_te is not None:
        df = df_tr.append(df_te)
        if cols is None:
            cols = set(df_tr.columns.values).intersection(set(df_te.columns.values))
    else:
        df = df_tr
        if cols is None:
            cols = df_tr.columns.values
    encoded = []
    for col in cols:
        if objonly and df[col].dtype != 'object':
            continue
        encoded.append(col)
        lbl.fit(df[col].map(str))
        df_tr[col] = lbl.transform(df_tr[col].map(str))
        if df_te is not None:
            df_te[col] = lbl.transform(df_te[col].map(str))
    print('lbl encode:', encoded)
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)
def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    hand_cleaned_adult_data = adult_data.copy()

    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    cleaned_adult_data = autoclean(adult_data)

    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
def initialize_labels(self, Y):
    y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
    y_links_flat = [y_val for y in Y for y_val in y.links]
    self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
    self.link_encoder_ = LabelEncoder().fit(y_links_flat)

    self.n_prop_states = len(self.prop_encoder_.classes_)
    self.n_link_states = len(self.link_encoder_.classes_)

    self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_, dtype=np.double)
    self.link_cw_ = compute_class_weight(self.class_weight,
                                         self.link_encoder_.classes_,
                                         y_links_flat)

    self.link_cw_ /= self.link_cw_.min()

    logging.info('Setting node class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(
            self.prop_encoder_.classes_, self.prop_cw_))))

    logging.info('Setting link class weights {}'.format(", ".join(
        "{}: {}".format(lbl, cw) for lbl, cw in zip(
            self.link_encoder_.classes_, self.link_cw_))))
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)

    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(
            sources[0].stream_id, time_interval))
        return

    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)

    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_

    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp for ii, pp in
                                  enumerate(clf.predict_proba(dd)[0])})
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn
        classifiers do (see their respective documentation and look for
        "sparse matrix"). The default value is True, since most NLP
        problems involve sparse feature sets. Setting this to False may
        take a great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None,
             score='r2', classifier=RegressorWrapper, random_state=None):
    assert ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
    self.score = score
    self.le = preprocessing.LabelEncoder().fit(y)
    self.create_classifier = classifier
    if test_ratio is None:
        test_ratio = 1.0 - ratio
    I = list(range(len(y)))
    np.random.shuffle(I)
    s = int(np.ceil(len(y) * ratio))
    s_end = int(np.ceil(len(y) * test_ratio))
    y = self.le.transform(y)
    train, test = I[:s], I[s:s+s_end]
    self.train_corpus = [X[i] for i in train]
    self.train_corpus.extend(Xstatic)
    if len(ystatic) > 0:
        ystatic = self.le.transform(ystatic)
        self.train_y = np.hstack((y[train], ystatic))
    else:
        self.train_y = y[train]
    self.test_corpus = [X[i] for i in test]
    self.test_y = y[test]
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None,
             score='macrof1', classifier=ClassifierWrapper, random_state=None):
    assert ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
    self.score = score
    self.le = preprocessing.LabelEncoder().fit(y)
    self.create_classifier = classifier
    if test_ratio is None:
        test_ratio = 1.0 - ratio
    I = list(range(len(y)))
    np.random.shuffle(I)
    s = int(np.ceil(len(y) * ratio))
    s_end = int(np.ceil(len(y) * test_ratio))
    y = self.le.transform(y)
    train, test = I[:s], I[s:s+s_end]
    self.train_corpus = [X[i] for i in train]
    self.train_corpus.extend(Xstatic)
    if len(ystatic) > 0:
        ystatic = self.le.transform(ystatic)
        self.train_y = np.hstack((y[train], ystatic))
    else:
        self.train_y = y[train]
    self.test_corpus = [X[i] for i in test]
    self.test_y = y[test]
def score_model(model, data_test, labeler):
    '''
    Print evaluation reports for a trained model on the test sample:
    feature importances, a per-class classification report, and a
    cross-class report.

    Parameters:
        model - the trained model
        data_test - the test sample
        labeler - the LabelEncoder fitted on the class labels
    Returns: nothing
    '''
    X_test = data_test.drop(["proto"], axis=1)
    y_test = data_test["proto"]
    y_predicted = model.predict(X_test)
    true_labels = labeler.inverse_transform(y_test)
    predicted_labels = labeler.inverse_transform(y_predicted)
    print feature_importances_report(model, X_test.columns)
    print "\n", classification_report(true_labels, predicted_labels)
    print cross_class_report(true_labels, predicted_labels)
def doDescartes(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = X_train.append(X_test, ignore_index=True)
    del X_train, X_test
    gc.collect()
    for feat_1 in ['maybe_0', 'maybe_2']:
        for feat_2 in ['connectionType', 'creativeID', 'positionID']:
            le = LabelEncoder()
            data[feat_1 + '_' + feat_2] = le.fit_transform(
                data[feat_1].astype('str') + data[feat_2].astype('str'))
    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :]
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test
def preprocessData(dataset):
    le = preprocessing.LabelEncoder()

    # in case of divide-by-zero
    dataset.Open[dataset.Open == 0] = 1

    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset.UpDown[dataset.UpDown >= threshold] = 'Up'
    dataset.UpDown[dataset.UpDown < threshold] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    dataset.UpDown = dataset.UpDown.shift(-1)  # shift 1, so the y is actually next day's up/down
    dataset = dataset.drop(dataset.index[-1])  # drop last one because it has no up/down value
    return dataset
def generate_test_data():
    with open('./test.csv', 'r') as test_file:
        test_csv = csv.reader(test_file, delimiter=',')
        next(test_csv)
        test_data = list(test_csv)
        test_data = numpy.array(test_data)

    # delete id column
    # test_data = numpy.delete(test_data, 0, 1)

    # Integer encoding of categorical data
    encoder = preprocessing.LabelEncoder()
    for j in (1, 2, 3, 4, 5, 6, 7, 8, 9, 14):
        test_data[:, j+1] = encoder.fit_transform(test_data[:, j+1])

    # Converting numpy strings to floats
    test_data = test_data.astype(numpy.float)

    missValueIndex = 7
    Xy_test = test_data[test_data[:, 3+1] == missValueIndex]
    Xy_train = test_data[test_data[:, 3+1] != missValueIndex]
    X_train = numpy.delete(Xy_train, 3+1, 1)
    y_train = Xy_train[:, 3+1]
    X_test = numpy.delete(Xy_test, 3+1, 1)
    market_test_data = MarketingData(X_train, y_train, X_test)
    return market_test_data, test_data

# use knn for impute missing values
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(),
                    filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
def _fit_targets(self, y, classes=None):
    self.multilabel_ = self._is_multilabel(y)

    # If provided, use classes to fit the encoder and set classes_.
    # Otherwise, find the unique classes in y.
    if classes is not None:
        y = classes

    if self.multilabel_:
        self._enc = None
        self.classes_ = np.arange(y.shape[1])
        self.n_classes_ = y.shape[1]
    else:
        self._enc = LabelEncoder().fit(y)
        self.classes_ = self._enc.classes_
        self.n_classes_ = len(self.classes_)
def label_encoding(self, dataset):
    """
    :param dataset: array whose columns 1-3 hold categorical values
    :return: dataset with columns 1-3 label-encoded
    """
    le_1 = preprocessing.LabelEncoder()
    le_2 = preprocessing.LabelEncoder()
    le_3 = preprocessing.LabelEncoder()
    le_1.fit(np.unique(dataset[:, 1]))
    le_2.fit(np.unique(dataset[:, 2]))
    le_3.fit(np.unique(dataset[:, 3]))
    dataset[:, 1] = le_1.transform(dataset[:, 1])
    dataset[:, 2] = le_2.transform(dataset[:, 2])
    dataset[:, 3] = le_3.transform(dataset[:, 3])
    return dataset
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, 1, inplace=True)
    return Df
def create_codes(df, column_name, revive=False, model_code=0):
    print('Encoding', column_name, '...')

    # get unique data
    nms_unique = df[column_name].unique().tolist()

    # fit model
    if not revive:
        print('Creating new Label Encoder...')
        le = LabelEncoder()
        le.fit(nms_unique)
    else:
        # Reload LE
        le_file_name = "LE_" + str(model_code)
        le = load_pickle(ROOT_PATH + '\\Data\\PickleJar\\' + le_file_name + '.pkl')

    # get all data
    nms = df[column_name].tolist()

    return le.transform(nms), le
def addDailyReturn(dataset): """ Adding in daily return to create binary classifiers (Up or Down in relation to the previous day) """ #will normalize labels le = preprocessing.LabelEncoder() dataset['UpDown'] = -(dataset['Adj_Close']-dataset['Adj_Close'].shift(-1))/dataset['Adj_Close'].shift(-1) print dataset['UpDown'] # will be denoted by 2 when transformed dataset.UpDown[dataset.UpDown >= 0] = "up" # will be denoted by 1 when transformed dataset.UpDown[dataset.UpDown < 0] = "down" dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown) print dataset['UpDown']
def addDailyReturn(dataset): """ Adding in daily return to create binary classifiers (Up or Down in relation to the previous day) """ #will normalize labels le = preprocessing.LabelEncoder() #print "dataset['Adj_Close']\n", dataset['Adj_Close'][:5] #print "dataset['Adj_Close'].shift(-1)\n", dataset['Adj_Close'].shift(1)[:5] dataset['UpDown'] = (dataset['Adj_Close']-dataset['Adj_Close'].shift(1))/dataset['Adj_Close'].shift(1) #print dataset['UpDown'][240:] # will be denoted by 3 when transformed dataset.UpDown[dataset.UpDown > 0] = "sell" dataset.UpDown[dataset.UpDown == 0] = "hold" dataset.UpDown[dataset.UpDown < 0] = "buy" #print dataset['UpDown'][:10] dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown) #print dataset['UpDown']
def addDailyReturn(dataset): """ Adding in daily return to create binary classifiers (Up or Down in relation to the previous day) """ #will normalize labels le = preprocessing.LabelEncoder() dataset['UpDown'] = -(dataset['Adj_Close']-dataset['Adj_Close'].shift(-1))/dataset['Adj_Close'].shift(-1) print dataset['UpDown'][:5] # will be denoted by 2 when transformed dataset.UpDown[dataset.UpDown >= 0] = "up" # will be denoted by 1 when transformed dataset.UpDown[dataset.UpDown < 0] = "down" print dataset['UpDown'] dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown) # print dataset['UpDown'][:5]
def create_id_df(cls, df, is_train):
    """
    :rtype: DataFrame
    :return: dataFrame, sorted by id, columns are ["label", "id0", "id", "id_tr", "id_te"]
    """
    df = df[["id0", "label"]].copy()
    df = df.reset_index(drop=True)
    is_train = np.array(is_train)
    le_tr = LabelEncoder().fit(df.id0[is_train])
    le_te = LabelEncoder().fit(df.id0[~is_train])
    df["id_tr"] = np.nan
    df["id_te"] = np.nan
    df.loc[is_train, "id_tr"] = le_tr.transform(df.id0[is_train])
    df.loc[~is_train, "id_te"] = le_te.transform(df.id0[~is_train])
    df["id"] = np.where(np.isnan(df["id_tr"]),
                        len(le_tr.classes_) + df["id_te"],
                        df["id_tr"])
    df = df.fillna(-1)
    df = df.sort("id")
    df = df[["label", "id0", "id", "id_tr", "id_te"]]
    return df
def create_model(context, data):
    # Get the relevant daily prices
    recent_prices = data.history(context.assets, 'price', context.history_range, '1d')
    context.ma_50 = recent_prices.values[-50:].mean()
    context.ma_200 = recent_prices.values[-200:].mean()
    #print context.ma_50
    #print context.ma_200

    time_lags = pd.DataFrame(index=recent_prices.index)
    time_lags['price'] = recent_prices.values
    time_lags['daily_returns'] = time_lags['price'].pct_change()
    time_lags['multiple_day_returns'] = time_lags['price'].pct_change(3)
    time_lags['rolling_mean'] = time_lags['daily_returns'].rolling(window=4, center=False).mean()
    time_lags['time_lagged'] = time_lags['price'] - time_lags['price'].shift(-2)

    X = time_lags[['price', 'daily_returns', 'multiple_day_returns', 'rolling_mean']].dropna()

    time_lags['updown'] = time_lags['daily_returns']
    time_lags.updown[time_lags['daily_returns'] >= 0] = 'up'
    time_lags.updown[time_lags['daily_returns'] < 0] = 'down'
    le = preprocessing.LabelEncoder()
    time_lags['encoding'] = le.fit(time_lags['updown']).transform(time_lags['updown'])

    # X = time_lags[['lag1','lag2']]  # Independent, or input variables
    # Y = time_lags['direction']  # Dependent, or output variable
    context.model.fit(X, time_lags['encoding'][4:])  # Generate our model
def deserialise_encoder(
        encoder: acton_pb.Database.LabelEncoder
) -> sklearn.preprocessing.LabelEncoder:
    """Deserialises a LabelEncoder protobuf.

    Parameters
    ----------
    encoder
        LabelEncoder protobuf.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        LabelEncoder (or None if no encodings were specified).
    """
    encodings = []
    for encoding in encoder.encoding:
        encodings.append((encoding.class_int, encoding.class_label))
    encodings.sort()
    encodings = numpy.array([c[1] for c in encodings])

    encoder = SKLabelEncoder()
    encoder.classes_ = encodings
    return encoder
def fit(self, X, y):
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)
    self.num_classes = np.unique(y).shape[0]

    sf = xgb.DMatrix(X, y)
    params = {"objective": 'multi:softprob',
              "eta": self.eta,
              "gamma": self.gamma,
              "max_depth": self.max_depth,
              "min_child_weight": self.min_child_weight,
              "max_delta_step": self.max_delta_step,
              "subsample": self.subsample,
              "silent": self.silent,
              "colsample_bytree": self.colsample_bytree,
              "seed": self.seed,
              "lambda": self.l2_reg,
              "alpha": self.l1_reg,
              "num_class": self.num_classes}
    self.model = xgb.train(params, sf, self.num_round)
    return self
def execute_inplace(self, data):
    df = data.df
    meta = data.metadata
    classes = {}
    cols_to_encode = meta[meta.type == ColType.CATEGORICAL].index
    for col in cols_to_encode:
        enc = LE()
        df.loc[df[col].notnull(), col] = enc.fit_transform(df.loc[df[col].notnull(), col])
        df[col] = df[col].astype(float)
        meta.loc[col, 'type'] = ColType.INT_ENCODING
        meta.loc[col, 'derived_from'] = col
        classes[col] = enc.classes_
        self.logger.info('LabelEncoder: encoded %s', col)
    self.state = {'classes': classes}
def pre_process_data():
    for col in categorical_fields:
        data_frame[col].fillna('default', inplace=True)
        data_frame_test[col].fillna('default', inplace=True)

    for col in numerical_fields:
        data_frame[col].fillna(0, inplace=True)
        data_frame_test[col].fillna(0, inplace=True)

    encode = LabelEncoder()
    for col in categorical_fields:
        data_frame[col] = encode.fit_transform(data_frame[col])
        data_frame_test[col] = encode.fit_transform(data_frame_test[col])

    data_frame['SalePrice'].fillna(0, inplace=True)
def labels_to_categories(y):
    """
    Labels to categories
    :param y: list of labels, ex. ['positive', 'negative', 'positive', 'neutral', 'positive', ...]
    :return: list of categories, ex. [2, 0, 2, 1, 2, ...]
    """
    encoder = LabelEncoder()
    encoder.fit(y)
    y_num = encoder.transform(y)
    return y_num
def label_classes(df, estimated_var):
    le = LabelEncoder()
    le.fit(df[estimated_var].values)
    return le.classes_
def __init__(self, classifier=None):
    if classifier:
        self.clf = classifier
    else:
        self.clf = SGDClassifier(loss="log", penalty="l2", shuffle=True, n_iter=2500)
    self.labels = preprocessing.LabelEncoder()
    self.feature_length = -1
def get_dataset():
    list_folder = os.listdir('data/')
    list_images = []
    for i in xrange(len(list_folder)):
        images = os.listdir('data/' + list_folder[i])
        for x in xrange(len(images)):
            image = [list_folder[i] + '/' + images[x], list_folder[i]]
            list_images.append(image)
    list_images = np.array(list_images)
    np.random.shuffle(list_images)
    print "before cleaning got: " + str(list_images.shape[0]) + " data"
    list_temp = []
    for i in xrange(list_images.shape[0]):
        image = misc.imread('data/' + list_images[i, 0])
        if len(image.shape) < 3:
            continue
        list_temp.append(list_images[i, :].tolist())
    list_images = np.array(list_temp)
    print "after cleaning got: " + str(list_images.shape[0]) + " data"
    label = np.unique(list_images[:, 1]).tolist()
    list_images[:, 1] = LabelEncoder().fit_transform(list_images[:, 1])
    return list_images, np.unique(list_images[:, 1]).shape[0], label
def __do_label_encoding(self):
    df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    le = LabelEncoder()
    cross_feature_dict = self.__get_label_encode_dict()
    for _, new_feature_name in cross_feature_dict.iteritems():
        to_be_stacked = [df_train[new_feature_name],
                         df_testset1[new_feature_name],
                         df_testset2[new_feature_name]]
        le.fit(pd.concat(to_be_stacked, axis=0))
        df_train[new_feature_name] = le.transform(df_train[new_feature_name])
        df_testset1[new_feature_name] = le.transform(df_testset1[new_feature_name])
        df_testset2[new_feature_name] = le.transform(df_testset2[new_feature_name])
    return
def fit(self, column):
    self.encoder_ = LabelEncoder().fit(h2o_col_to_numpy(column))
    self.classes_ = self.encoder_.classes_
    return self
def __init__(self, multilabel=False):
    self.multilabel = multilabel
    if self.multilabel:
        self.le = MultiLabelBinarizer(sparse_output=True)
    else:
        self.le = LabelEncoder()
    self.from_classes = False
def __init__(self):
    self.label_encoder = preprocessing.LabelEncoder()
def test_autoclean_no_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    hand_cleaned_data = data.copy()
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)