Python sklearn.preprocessing module: LabelEncoder() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.LabelEncoder().
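
Before the project snippets, a minimal self-contained sketch of the API every example below builds on: fit() learns the sorted class vocabulary, transform() maps labels to integer codes, and inverse_transform() maps codes back (the labels here are illustrative):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(['paris', 'tokyo', 'paris'])
print(le.classes_)                       # ['paris' 'tokyo']
print(le.transform(['tokyo', 'paris']))  # [1 0]
print(le.inverse_transform([0, 1]))      # ['paris' 'tokyo']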

Project: kaggle-review    Author: daxiongshu    | project source | file source
def lbl_encode(df_tr,df_te=None,cols=None,objonly=True):
    print("label encode ...")
    lbl = LabelEncoder()
    if df_te is not None:
        df = df_tr.append(df_te)
        if cols is None:
            cols = set(df_tr.columns.values).intersection(set(df_te.columns.values))
    else:
        df = df_tr
        if cols is None:
            cols = df_tr.columns.values
    encoded = []
    for col in cols:
        if objonly and df[col].dtype!='object':
            continue
        encoded.append(col)
        lbl.fit(df[col].map(str))
        df_tr[col] = lbl.transform(df_tr[col].map(str))
        if df_te is not None:
            df_te[col] = lbl.transform(df_te[col].map(str))
    print('lbl encode:',encoded)
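
A hedged usage sketch for the helper above, assuming pandas and the LabelEncoder import are in scope; the frames and the 'city' column are illustrative:

import pandas as pd

df_tr = pd.DataFrame({'city': ['london', 'paris'], 'n': [1, 2]})
df_te = pd.DataFrame({'city': ['paris', 'london'], 'n': [3, 4]})
lbl_encode(df_tr, df_te)       # encodes the shared object column 'city' in place
print(df_tr['city'].tolist())  # [0, 1]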
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
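
Note the pattern above: the encoder is fit on the training split and merely applied to the test split, so both splits share one mapping. A label that appears only at transform time raises a ValueError, as this minimal sketch shows:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder().fit(['apples', 'bananas'])
try:
    encoder.transform(['oranges'])  # never seen during fit
except ValueError:
    print('unseen label')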
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    hand_cleaned_adult_data = adult_data.copy()

    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    cleaned_adult_data = autoclean(adult_data)

    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
Project: marseille    Author: vene    | project source | file source
def initialize_labels(self, Y):

        y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
        y_links_flat = [y_val for y in Y for y_val in y.links]
        self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
        self.link_encoder_ = LabelEncoder().fit(y_links_flat)

        self.n_prop_states = len(self.prop_encoder_.classes_)
        self.n_link_states = len(self.link_encoder_.classes_)

        self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_,
                                     dtype=np.double)
        self.link_cw_ = compute_class_weight(self.class_weight,
                                             self.link_encoder_.classes_,
                                             y_links_flat)

        self.link_cw_ /= self.link_cw_.min()

        logging.info('Setting node class weights {}'.format(", ".join(
            "{}: {}".format(lbl, cw) for lbl, cw in zip(
                self.prop_encoder_.classes_, self.prop_cw_))))

        logging.info('Setting link class weights {}'.format(", ".join(
            "{}: {}".format(lbl, cw) for lbl, cw in zip(
                self.link_encoder_.classes_, self.link_cw_))))
Project: SPHERE-HyperStream    Author: IRC-SPHERE    | project source | file source
def _execute(self, sources, alignment_stream, interval):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        param_doc = sources[0].window(time_interval, force_calculation=True).last()
        if param_doc is None:
            logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
            return

        steps = deserialise_json_pipeline({
            'vectorisation': DictVectorizer(sparse=False),
            'fill_missing': FillZeros(),
            'classifier': LinearDiscriminantAnalysis(),
            'label_encoder': LabelEncoder()
        }, param_doc.value)

        clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
        locations = steps['label_encoder'].classes_

        data = sources[1].window(interval, force_calculation=True)
        for tt, dd in data:
            yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
Project: Price-Comparator    Author: Thejas-1    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
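
For context, this constructor follows NLTK's SklearnClassifier pattern: the DictVectorizer turns feature dicts into a numeric matrix and the LabelEncoder turns string labels into integer codes. Training then reduces to the sketch below, which mirrors NLTK's train() method and is shown here as an assumption, not as part of the snippet:

    def train(self, labeled_featuresets):
        X, y = list(zip(*labeled_featuresets))
        X = self._vectorizer.fit_transform(X)  # feature dicts -> numeric matrix
        y = self._encoder.fit_transform(y)     # string labels -> integer codes
        self._clf.fit(X, y)
        return self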
Project: microTC    Author: INGEOTEC    | project source | file source
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder

    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
Project: microTC    Author: INGEOTEC    | project source | file source
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None, score='r2', classifier=RegressorWrapper, random_state=None):
        assert ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
        self.score = score
        self.le = preprocessing.LabelEncoder().fit(y)
        self.create_classifier = classifier
        if test_ratio is None:
            test_ratio = 1.0 - ratio

        I = list(range(len(y)))
        np.random.shuffle(I)
        s = int(np.ceil(len(y) * ratio))
        s_end = int(np.ceil(len(y) * test_ratio))
        y = self.le.transform(y)
        train, test = I[:s], I[s:s+s_end]
        self.train_corpus = [X[i] for i in train]
        self.train_corpus.extend(Xstatic)

        if len(ystatic) > 0:
            ystatic = self.le.transform(ystatic)
            self.train_y = np.hstack((y[train], ystatic))
        else:
            self.train_y = y[train]

        self.test_corpus = [X[i] for i in test]
        self.test_y = y[test]
Project: microTC    Author: INGEOTEC    | project source | file source
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None, score='macrof1', classifier=ClassifierWrapper, random_state=None):
        assert ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
        self.score = score
        self.le = preprocessing.LabelEncoder().fit(y)
        self.create_classifier = classifier
        if test_ratio is None:
            test_ratio = 1.0 - ratio

        I = list(range(len(y)))
        np.random.shuffle(I)
        s = int(np.ceil(len(y) * ratio))
        s_end = int(np.ceil(len(y) * test_ratio))
        y = self.le.transform(y)
        train, test = I[:s], I[s:s+s_end]
        self.train_corpus = [X[i] for i in train]
        self.train_corpus.extend(Xstatic)

        if len(ystatic) > 0:
            ystatic = self.le.transform(ystatic)
            self.train_y = np.hstack((y[train], ystatic))
        else:
            self.train_y = y[train]

        self.test_corpus = [X[i] for i in test]
        self.test_y = y[test]
Project: traffic-v2    Author: vnetserg    | project source | file source
def score_model(model, data_test, labeler):
    '''
        Scores a trained model and prints a report:
        the feature importances, a per-class classification
        report (precision, recall, F1) and a cross-class
        report comparing true and predicted labels.
        Parameters:
            model - the trained model
            data_test - the test data set
            labeler - the LabelEncoder used for the "proto" labels
        Returns:
            nothing
    '''
    X_test = data_test.drop(["proto"], axis=1)
    y_test = data_test["proto"]
    y_predicted = model.predict(X_test)

    true_labels = labeler.inverse_transform(y_test)
    predicted_labels = labeler.inverse_transform(y_predicted)

    print(feature_importances_report(model, X_test.columns))
    print("\n", classification_report(true_labels, predicted_labels))
    print(cross_class_report(true_labels, predicted_labels))
Project: Tencent_Social_Ads    Author: freelzy    | project source | file source
def doDescartes(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = X_train.append(X_test, ignore_index=True)
    del X_train, X_test
    gc.collect()

    for feat_1 in ['maybe_0', 'maybe_2']:
        for feat_2 in ['connectionType', 'creativeID', 'positionID']:
            le = LabelEncoder()
            data[feat_1 + '_' + feat_2] = le.fit_transform(data[feat_1].astype('str') + data[feat_2].astype('str'))
    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :]
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test
Project: stock-price-prediction    Author: chinuy    | project source | file source
def preprocessData(dataset):

    le = preprocessing.LabelEncoder()

    # guard against divide-by-zero
    dataset.Open[dataset.Open == 0] = 1

    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset.UpDown[dataset.UpDown >= threshold] = 'Up'
    dataset.UpDown[dataset.UpDown < threshold] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    dataset.UpDown = dataset.UpDown.shift(-1) # shift 1, so the y is actually next day's up/down
    dataset = dataset.drop(dataset.index[-1]) # drop last one because it has no up/down value
    return dataset
Project: playground    Author: Pennsy    | project source | file source
def generate_test_data():
    with open('./test.csv', 'r') as test_file:
        test_csv = csv.reader(test_file, delimiter=',')
        next(test_csv)
        test_data = list(test_csv)
    test_data = numpy.array(test_data)
    # delete id column
    # test_data = numpy.delete(test_data, 0, 1)
    # integer-encode the categorical columns (LabelEncoder assigns ordinal codes, not one-of-K)
    encoder = preprocessing.LabelEncoder()
    for j in (1, 2, 3, 4, 5, 6, 7, 8, 9, 14):
        test_data[:, j+1] = encoder.fit_transform(test_data[:, j+1])
    # Converting numpy strings to floats
    test_data = test_data.astype(numpy.float)
    missValueIndex = 7
    Xy_test = test_data[test_data[:, 3+1]==missValueIndex]
    Xy_train = test_data[test_data[:, 3+1]!=missValueIndex]
    X_train = numpy.delete(Xy_train, 3+1 ,1)
    y_train = Xy_train[:, 3+1]
    X_test = numpy.delete(Xy_test, 3+1 ,1)
    market_test_data = MarketingData(X_train, y_train, X_test)
    return market_test_data, test_data


# use knn to impute missing values
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: scikit-garden    Author: scikit-garden    | project source | file source
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)

    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
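
The empirical-distribution line above is just normalized label counts; a quick standalone check with illustrative labels:

import numpy as np

y_enc = np.array([0, 0, 1, 2, 2, 2])
print(np.bincount(y_enc) / float(len(y_enc)))  # approx. [0.333 0.167 0.5]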
Project: scikit-garden    Author: scikit-garden    | project source | file source
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
Project: neighborhood_mood_aws    Author: jarrellmark    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: hate-to-hugs    Author: sdoran35    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: whereareyou    Author: futurice    | project source | file source
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(), filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
Project: FancyWord    Author: EastonLee    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: muffnn    Author: civisanalytics    | project source | file source
def _fit_targets(self, y, classes=None):
        self.multilabel_ = self._is_multilabel(y)

        # If provided, use classes to fit the encoder and set classes_.
        # Otherwise, find the unique classes in y.
        if classes is not None:
            y = classes

        if self.multilabel_:
            self._enc = None
            self.classes_ = np.arange(y.shape[1])
            self.n_classes_ = y.shape[1]
        else:
            self._enc = LabelEncoder().fit(y)
            self.classes_ = self._enc.classes_
            self.n_classes_ = len(self.classes_)
Project: beepboop    Author: nicolehe    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: kdd99-scikit    Author: PENGZhaoqing    | project source | file source
def label_encoding(self, dataset):
        """

        :param data_set:
        :param data_target:
        :return: data_set
        """

        le_1 = preprocessing.LabelEncoder()
        le_2 = preprocessing.LabelEncoder()
        le_3 = preprocessing.LabelEncoder()

        le_1.fit(np.unique(dataset[:, 1]))
        le_2.fit(np.unique(dataset[:, 2]))
        le_3.fit(np.unique(dataset[:, 3]))

        dataset[:, 1] = le_1.transform(dataset[:, 1])
        dataset[:, 2] = le_2.transform(dataset[:, 2])
        dataset[:, 3] = le_3.transform(dataset[:, 3])

        return dataset
Project: kind2anki    Author: prz3m    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: but_sentiment    Author: MixedEmotions    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
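
gen_features sidesteps the unseen-label problem by fitting each encoder on the union of the train and test values, so transform() can never hit a label outside the vocabulary. A compact standalone illustration with made-up values:

from sklearn import preprocessing

train_vals = ['no', 'yes']
test_vals = ['yes', 'sometimes']  # 'sometimes' never appears in train
le = preprocessing.LabelEncoder()
le.fit(train_vals + test_vals)    # fit on the union, as above
print(le.transform(test_vals))    # [2 1], no ValueError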
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
Project: Kaggle-Competition-Sberbank    Author: LenzDu    | project source | file source
def FeatureCombination(Df,s='',num_feature=2): 
    feature_set = []
    for c in Df.columns:
        if c.startswith(s): feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0,num_feature):
        Df[s+'_%d'%(i+1)] = trans[:,i]
    Df.drop(feature_set,1,inplace=True)
    return Df
Project: TextClassification    Author: AlgorTroy    | project source | file source
def create_codes(df, column_name, revive=False, model_code=0):
    print('Encoding', column_name, '...')
    # get unique data
    nms_unique = df[column_name].unique().tolist()

    # fit model

    if not revive:
        print('Creating new Label Encoder...')
        le = LabelEncoder()
        le.fit(nms_unique)
    else:
        # Reload LE
        le_file_name = "LE_" + str(model_code)
        le = load_pickle(ROOT_PATH + '\\Data\\PickleJar\\' + le_file_name + '.pkl')
    # get all data
    nms = df[column_name].tolist()

    return le.transform(nms), le
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def addDailyReturn(dataset):
    """
    Adding in daily return to create binary classifiers (Up or Down in relation to the previous day)
    """

    #will normalize labels
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = -(dataset['Adj_Close']-dataset['Adj_Close'].shift(-1))/dataset['Adj_Close'].shift(-1)
    print(dataset['UpDown'])
    # 'up' will be denoted by 1 when transformed
    dataset.UpDown[dataset.UpDown >= 0] = "up"
    # 'down' will be denoted by 0 when transformed
    dataset.UpDown[dataset.UpDown < 0] = "down"
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    print(dataset['UpDown'])
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def addDailyReturn(dataset):
    """
    Adding in daily return to create binary classifiers (Up or Down in relation to the previous day)
    """

    #will normalize labels
    le = preprocessing.LabelEncoder()

    #print "dataset['Adj_Close']\n", dataset['Adj_Close'][:5]

    #print "dataset['Adj_Close'].shift(-1)\n", dataset['Adj_Close'].shift(1)[:5]

    dataset['UpDown'] = (dataset['Adj_Close']-dataset['Adj_Close'].shift(1))/dataset['Adj_Close'].shift(1)
    #print dataset['UpDown'][240:]

    # 'sell' will be denoted by 2 when transformed
    dataset.UpDown[dataset.UpDown > 0] = "sell"

    # 'hold' will be denoted by 1 when transformed
    dataset.UpDown[dataset.UpDown == 0] = "hold"

    # 'buy' will be denoted by 0 when transformed
    dataset.UpDown[dataset.UpDown < 0] = "buy"
    #print dataset['UpDown'][:10]
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    #print dataset['UpDown']
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def addDailyReturn(dataset):
    """
    Adding in daily return to create binary classifiers (Up or Down in relation to the previous day)
    """

    #will normalize labels
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = -(dataset['Adj_Close']-dataset['Adj_Close'].shift(-1))/dataset['Adj_Close'].shift(-1)
    print(dataset['UpDown'][:5])
    # 'up' will be denoted by 1 when transformed
    dataset.UpDown[dataset.UpDown >= 0] = "up"
    # 'down' will be denoted by 0 when transformed
    dataset.UpDown[dataset.UpDown < 0] = "down"
    print(dataset['UpDown'])
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
#   print dataset['UpDown'][:5]
Project: kaggle-prudential-sample    Author: threecourse    | project source | file source
def create_id_df(cls, df, is_train):
        """
        :rtype: DataFrame
        :return: dataFrame, sorted by id, 
                 columns are ["label", "id0", "id", "id_tr", "id_te"]
        """

        df = df[["id0", "label"]].copy()
        df = df.reset_index(drop=True)
        is_train = np.array(is_train)

        le_tr = LabelEncoder().fit(df.id0[is_train])
        le_te = LabelEncoder().fit(df.id0[~is_train])

        df["id_tr"] = np.nan
        df["id_te"] = np.nan
        df.loc[is_train, "id_tr"] = le_tr.transform(df.id0[is_train])
        df.loc[~is_train, "id_te"] = le_te.transform(df.id0[~is_train])
        df["id"] = np.where(np.isnan(df["id_tr"]), len(le_tr.classes_) + df["id_te"], df["id_tr"])

        df = df.fillna(-1)
        df = df.sort("id")
        df = df[["label", "id0", "id", "id_tr", "id_te"]]

        return df
Project: quantopian-machinelearning    Author: arshpreetsingh    | project source | file source
def create_model(context, data):
    # Get the relevant daily prices
    recent_prices = data.history(context.assets, 'price',context.history_range, '1d')

    context.ma_50 =recent_prices.values[-50:].mean()     
    context.ma_200 = recent_prices.values[-200:].mean() 
    #print context.ma_50
    #print context.ma_200
    time_lags = pd.DataFrame(index=recent_prices.index)
    time_lags['price']=recent_prices.values
    time_lags['daily_returns']=time_lags['price'].pct_change()
    time_lags['multiple_day_returns'] =  time_lags['price'].pct_change(3)
    time_lags['rolling_mean'] = time_lags['daily_returns'].rolling(window = 4,center=False).mean()

    time_lags['time_lagged'] = time_lags['price']-time_lags['price'].shift(-2)
    X = time_lags[['price','daily_returns','multiple_day_returns','rolling_mean']].dropna()

    time_lags['updown'] = time_lags['daily_returns']
    time_lags.updown[time_lags['daily_returns']>=0]='up'
    time_lags.updown[time_lags['daily_returns']<0]='down'
    le = preprocessing.LabelEncoder()
    time_lags['encoding']=le.fit(time_lags['updown']).transform(time_lags['updown'])
  #  X = time_lags[['lag1','lag2']] # Independent, or input variables
   # Y = time_lags['direction'] # Dependent, or output variable
    context.model.fit(X,time_lags['encoding'][4:]) # Generate our model
Project: acton    Author: chengsoonong    | project source | file source
def deserialise_encoder(
            encoder: acton_pb.Database.LabelEncoder
        ) -> sklearn.preprocessing.LabelEncoder:
    """Deserialises a LabelEncoder protobuf.

    Parameters
    ----------
    encoder
        LabelEncoder protobuf.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        LabelEncoder (or None if no encodings were specified).
    """
    encodings = []
    for encoding in encoder.encoding:
        encodings.append((encoding.class_int, encoding.class_label))
    encodings.sort()
    encodings = numpy.array([c[1] for c in encodings])

    encoder = SKLabelEncoder()
    encoder.classes_ = encodings
    return encoder
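
The function works because a LabelEncoder is fully determined by its classes_ array, so assigning that attribute by hand yields a working encoder. A minimal sketch with illustrative labels:

import numpy
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.classes_ = numpy.array(['NEG', 'NEU', 'POS'])  # must be sorted
print(encoder.transform(['POS', 'NEG']))               # [2 0]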
Project: Prudential-Life-Insurance-Assessment    Author: AntonUBC    | project source | file source
def fit(self, X, y):
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)
        self.num_classes = np.unique(y).shape[0]
        sf = xgb.DMatrix(X, y)
        params = {"objective": 'multi:softprob',
          "eta": self.eta,
          "gamma": self.gamma,
          "max_depth": self.max_depth,
          "min_child_weight": self.min_child_weight,
          "max_delta_step": self.max_delta_step,
          "subsample": self.subsample,
          "silent": self.silent,
          "colsample_bytree": self.colsample_bytree,
          "seed": self.seed,
          "lambda": self.l2_reg,
          "alpha": self.l1_reg,
          "num_class": self.num_classes}
        self.model = xgb.train(params, sf, self.num_round)

        return self
Project: guacml    Author: guacml    | project source | file source
def execute_inplace(self, data):
        df = data.df
        meta = data.metadata

        classes = {}
        cols_to_encode = meta[meta.type == ColType.CATEGORICAL].index
        for col in cols_to_encode:
            enc = LE()
            df.loc[df[col].notnull(), col] = enc.fit_transform(df.loc[df[col].notnull(), col])
            df[col] = df[col].astype(float)
            meta.loc[col, 'type'] = ColType.INT_ENCODING
            meta.loc[col, 'derived_from'] = col
            classes[col] = enc.classes_
            self.logger.info('LabelEncoder: encoded %s', col)

        self.state = {'classes': classes}
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def pre_process_data():
    for col in categorical_fields:
        data_frame[col].fillna('default',inplace=True)
        data_frame_test[col].fillna('default',inplace=True)

    for col in numerical_fields:
        data_frame[col].fillna(0,inplace=True)
        data_frame_test[col].fillna(0,inplace=True)

    encode=LabelEncoder()
    for col in categorical_fields:
        data_frame[col]=encode.fit_transform(data_frame[col])
        # note: refitting on the test frame can assign different codes than the train frame
        data_frame_test[col]=encode.fit_transform(data_frame_test[col])
    data_frame['SalePrice'].fillna(0,inplace=True)
Project: keras-utilities    Author: cbaziotis    | project source | file source
def labels_to_categories(y):
    """
    Labels to categories
    :param y: list of labels, ex. ['positive', 'negative', 'positive', 'neutral', 'positive', ...]
    :return: list of categories, ex. [2, 0, 2, 1, 2, ...]
    """
    encoder = LabelEncoder()
    encoder.fit(y)
    y_num = encoder.transform(y)
    return y_num
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def label_classes(df, estimated_var):
        le = LabelEncoder()
        le.fit(df[estimated_var].values)
        return le.classes_
Project: TrackToTrip    Author: ruipgil    | project source | file source
def __init__(self, classifier=None):
        if classifier:
            self.clf = classifier
        else:
            self.clf = SGDClassifier(loss="log", penalty="l2", shuffle=True, n_iter=2500)
        self.labels = preprocessing.LabelEncoder()
        self.feature_length = -1
Project: YOLO-Object-Detection-Tensorflow    Author: huseinzol05    | project source | file source
def get_dataset():

    list_folder = os.listdir('data/')
    list_images = []
    for i in range(len(list_folder)):
        images = os.listdir('data/' + list_folder[i])
        for x in range(len(images)):
            image = [list_folder[i] + '/' + images[x], list_folder[i]]
            list_images.append(image)
    list_images = np.array(list_images)
    np.random.shuffle(list_images)

    print "before cleaning got: " + str(list_images.shape[0]) + " data"

    list_temp = []
    for i in xrange(list_images.shape[0]):
        image = misc.imread('data/' + list_images[i, 0])
        if len(image.shape) < 3:
            continue
        list_temp.append(list_images[i, :].tolist())

    list_images = np.array(list_temp)
    print "after cleaning got: " + str(list_images.shape[0]) + " data"
    label = np.unique(list_images[:, 1]).tolist()
    list_images[:, 1] = LabelEncoder().fit_transform(list_images[:, 1])
    return list_images, np.unique(list_images[:, 1]).shape[0], label
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def __do_label_encoding(self):
        df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        le = LabelEncoder()
        cross_feature_dict = self.__get_label_encode_dict()
        for _, new_feature_name in cross_feature_dict.items():
            to_be_stacked = [df_train[new_feature_name], df_testset1[new_feature_name], df_testset2[new_feature_name]]
            le.fit(pd.concat(to_be_stacked, axis=0))
            df_train[new_feature_name] = le.transform(df_train[new_feature_name])
            df_testset1[new_feature_name] = le.transform(df_testset1[new_feature_name])
            df_testset2[new_feature_name] = le.transform(df_testset2[new_feature_name])

        return
Project: skutil    Author: tgsmith61591    | project source | file source
def fit(self, column):
        self.encoder_ = LabelEncoder().fit(h2o_col_to_numpy(column))
        self.classes_ = self.encoder_.classes_
        return self
Project: ltls    Author: kjasinska    | project source | file source
def __init__(self, multilabel=False):
        self.multilabel = multilabel
        if self.multilabel:
            self.le = MultiLabelBinarizer(sparse_output=True)
        else:
            self.le = LabelEncoder()
        self.from_classes = False
Project: quoll    Author: LanguageMachines    | project source | file source
def __init__(self):
        self.label_encoder = preprocessing.LabelEncoder()
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_no_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    hand_cleaned_data = data.copy()
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)