Python sklearn.preprocessing module: LabelEncoder() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.LabelEncoder().
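
Before the project snippets, a minimal self-contained sketch of the API every example below builds on: fit() learns the sorted class vocabulary, transform() maps labels to integer codes, and inverse_transform() maps codes back (the labels here are illustrative):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(['paris', 'tokyo', 'paris'])
print(le.classes_)                       # ['paris' 'tokyo']
print(le.transform(['tokyo', 'paris']))  # [1 0]
print(le.inverse_transform([0, 1]))      # ['paris' 'tokyo']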

Project: kaggle-review    Author: daxiongshu    | project source | file source
def lbl_encode(df_tr,df_te=None,cols=None,objonly=True):
    print("label encode ...")
    lbl = LabelEncoder()
    if df_te is not None:
        df = df_tr.append(df_te)
        if cols is None:
            cols = set(df_tr.columns.values).intersection(set(df_te.columns.values))
    else:
        df = df_tr
        if cols is None:
            cols = df_tr.columns.values
    encoded = []
    for col in cols:
        if objonly and df[col].dtype!='object':
            continue
        encoded.append(col)
        lbl.fit(df[col].map(str))
        df_tr[col] = lbl.transform(df_tr[col].map(str))
        if df_te is not None:
            df_te[col] = lbl.transform(df_te[col].map(str))
    print('lbl encode:',encoded)
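
A hedged usage sketch for the helper above, assuming pandas and the LabelEncoder import are in scope; the frames and the 'city' column are illustrative:

import pandas as pd

df_tr = pd.DataFrame({'city': ['london', 'paris'], 'n': [1, 2]})
df_te = pd.DataFrame({'city': ['paris', 'london'], 'n': [3, 4]})
lbl_encode(df_tr, df_te)       # encodes the shared object column 'city' in place
print(df_tr['city'].tolist())  # [0, 1]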
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
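
Note the pattern above: the encoder is fit on the training split and merely applied to the test split, so both splits share one mapping. A label that appears only at transform time raises a ValueError, as this minimal sketch shows:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder().fit(['apples', 'bananas'])
try:
    encoder.transform(['oranges'])  # never seen during fit
except ValueError:
    print('unseen label')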
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    hand_cleaned_adult_data = adult_data.copy()

    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    cleaned_adult_data = autoclean(adult_data)

    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
Project: marseille    Author: vene    | project source | file source
def initialize_labels(self, Y):

        y_nodes_flat = [y_val for y in Y for y_val in y.nodes]
        y_links_flat = [y_val for y in Y for y_val in y.links]
        self.prop_encoder_ = LabelEncoder().fit(y_nodes_flat)
        self.link_encoder_ = LabelEncoder().fit(y_links_flat)

        self.n_prop_states = len(self.prop_encoder_.classes_)
        self.n_link_states = len(self.link_encoder_.classes_)

        self.prop_cw_ = np.ones_like(self.prop_encoder_.classes_,
                                     dtype=np.double)
        self.link_cw_ = compute_class_weight(self.class_weight,
                                             self.link_encoder_.classes_,
                                             y_links_flat)

        self.link_cw_ /= self.link_cw_.min()

        logging.info('Setting node class weights {}'.format(", ".join(
            "{}: {}".format(lbl, cw) for lbl, cw in zip(
                self.prop_encoder_.classes_, self.prop_cw_))))

        logging.info('Setting link class weights {}'.format(", ".join(
            "{}: {}".format(lbl, cw) for lbl, cw in zip(
                self.link_encoder_.classes_, self.link_cw_))))
Project: SPHERE-HyperStream    Author: IRC-SPHERE    | project source | file source
def _execute(self, sources, alignment_stream, interval):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        param_doc = sources[0].window(time_interval, force_calculation=True).last()
        if param_doc is None:
            logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
            return

        steps = deserialise_json_pipeline({
            'vectorisation': DictVectorizer(sparse=False),
            'fill_missing': FillZeros(),
            'classifier': LinearDiscriminantAnalysis(),
            'label_encoder': LabelEncoder()
        }, param_doc.value)

        clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
        locations = steps['label_encoder'].classes_

        data = sources[1].window(interval, force_calculation=True)
        for tt, dd in data:
            yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
Project: Price-Comparator    Author: Thejas-1    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
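
For context, this constructor follows NLTK's SklearnClassifier pattern: the DictVectorizer turns feature dicts into a numeric matrix and the LabelEncoder turns string labels into integer codes. Training then reduces to the sketch below, which mirrors NLTK's train() method and is shown here as an assumption, not as part of the snippet:

    def train(self, labeled_featuresets):
        X, y = list(zip(*labeled_featuresets))
        X = self._vectorizer.fit_transform(X)  # feature dicts -> numeric matrix
        y = self._encoder.fit_transform(y)     # string labels -> integer codes
        self._clf.fit(X, y)
        return self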
Project: microTC    Author: INGEOTEC    | project source | file source
def test_predict_from_file():
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder

    import os
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)
    t = TextModel(corpus)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    c = ClassifierWrapper()
    X = [t[x] for x in corpus]
    c.fit(X, y)
    hy = le.inverse_transform(c.predict(X))
    for i in hy:
        assert i in ['POS', 'NEU', 'NEG']
Project: microTC    Author: INGEOTEC    | project source | file source
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None, score='r2', classifier=RegressorWrapper, random_state=None):
        assert ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
        self.score = score
        self.le = preprocessing.LabelEncoder().fit(y)
        self.create_classifier = classifier
        if test_ratio is None:
            test_ratio = 1.0 - ratio

        I = list(range(len(y)))
        np.random.shuffle(I)
        s = int(np.ceil(len(y) * ratio))
        s_end = int(np.ceil(len(y) * test_ratio))
        y = self.le.transform(y)
        train, test = I[:s], I[s:s+s_end]
        self.train_corpus = [X[i] for i in train]
        self.train_corpus.extend(Xstatic)

        if len(ystatic) > 0:
            ystatic = self.le.transform(ystatic)
            self.train_y = np.hstack((y[train], ystatic))
        else:
            self.train_y = y[train]

        self.test_corpus = [X[i] for i in test]
        self.test_y = y[test]
Project: microTC    Author: INGEOTEC    | project source | file source
def __init__(self, X, y, Xstatic=[], ystatic=[], ratio=0.8, test_ratio=None, score='macrof1', classifier=ClassifierWrapper, random_state=None):
        assert ratio < 1, "ratio {0} is invalid, valid values are 0 < ratio < 1".format(ratio)
        self.score = score
        self.le = preprocessing.LabelEncoder().fit(y)
        self.create_classifier = classifier
        if test_ratio is None:
            test_ratio = 1.0 - ratio

        I = list(range(len(y)))
        np.random.shuffle(I)
        s = int(np.ceil(len(y) * ratio))
        s_end = int(np.ceil(len(y) * test_ratio))
        y = self.le.transform(y)
        train, test = I[:s], I[s:s+s_end]
        self.train_corpus = [X[i] for i in train]
        self.train_corpus.extend(Xstatic)

        if len(ystatic) > 0:
            ystatic = self.le.transform(ystatic)
            self.train_y = np.hstack((y[train], ystatic))
        else:
            self.train_y = y[train]

        self.test_corpus = [X[i] for i in test]
        self.test_y = y[test]
Project: traffic-v2    Author: vnetserg    | project source | file source
def score_model(model, data_test, labeler):
    '''
        Scores a trained model and prints a report:
        the feature importances, a per-class classification
        report (precision, recall, F1) and a cross-class
        report comparing true and predicted labels.
        Parameters:
            model - the trained model
            data_test - the test data set
            labeler - the LabelEncoder used for the "proto" labels
        Returns:
            nothing
    '''
    X_test = data_test.drop(["proto"], axis=1)
    y_test = data_test["proto"]
    y_predicted = model.predict(X_test)

    true_labels = labeler.inverse_transform(y_test)
    predicted_labels = labeler.inverse_transform(y_predicted)

    print(feature_importances_report(model, X_test.columns))
    print("\n", classification_report(true_labels, predicted_labels))
    print(cross_class_report(true_labels, predicted_labels))
Project: Tencent_Social_Ads    Author: freelzy    | project source | file source
def doDescartes(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = X_train.append(X_test, ignore_index=True)
    del X_train, X_test
    gc.collect()

    for feat_1 in ['maybe_0', 'maybe_2']:
        for feat_2 in ['connectionType', 'creativeID', 'positionID']:
            le = LabelEncoder()
            data[feat_1 + '_' + feat_2] = le.fit_transform(data[feat_1].astype('str') + data[feat_2].astype('str'))
    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :]
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test
Project: stock-price-prediction    Author: chinuy    | project source | file source
def preprocessData(dataset):

    le = preprocessing.LabelEncoder()

    # guard against divide-by-zero
    dataset.Open[dataset.Open == 0] = 1

    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset.UpDown[dataset.UpDown >= threshold] = 'Up'
    dataset.UpDown[dataset.UpDown < threshold] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    dataset.UpDown = dataset.UpDown.shift(-1) # shift 1, so the y is actually next day's up/down
    dataset = dataset.drop(dataset.index[-1]) # drop last one because it has no up/down value
    return dataset
Project: playground    Author: Pennsy    | project source | file source
def generate_test_data():
    with open('./test.csv', 'r') as test_file:
        test_csv = csv.reader(test_file, delimiter=',')
        next(test_csv)
        test_data = list(test_csv)
    test_data = numpy.array(test_data)
    # delete id column
    # test_data = numpy.delete(test_data, 0, 1)
    # integer-encode the categorical columns (LabelEncoder assigns ordinal codes, not one-of-K)
    encoder = preprocessing.LabelEncoder()
    for j in (1, 2, 3, 4, 5, 6, 7, 8, 9, 14):
        test_data[:, j+1] = encoder.fit_transform(test_data[:, j+1])
    # Converting numpy strings to floats
    test_data = test_data.astype(numpy.float)
    missValueIndex = 7
    Xy_test = test_data[test_data[:, 3+1]==missValueIndex]
    Xy_train = test_data[test_data[:, 3+1]!=missValueIndex]
    X_train = numpy.delete(Xy_train, 3+1 ,1)
    y_train = Xy_train[:, 3+1]
    X_test = numpy.delete(Xy_test, 3+1 ,1)
    market_test_data = MarketingData(X_train, y_train, X_test)
    return market_test_data, test_data


# use knn to impute missing values
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: scikit-garden    Author: scikit-garden    | project source | file source
def check_proba_classif_convergence(X_train, y_train, mc):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)

    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = mc.predict_proba(X_train)
    labels = mc.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    # X is scaled between -1.0 and 1.0
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = mc.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba])
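
The empirical-distribution line above is just normalized label counts; a quick standalone check with illustrative labels:

import numpy as np

y_enc = np.array([0, 0, 1, 2, 2, 2])
print(np.bincount(y_enc) / float(len(y_enc)))  # approx. [0.333 0.167 0.5]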
Project: scikit-garden    Author: scikit-garden    | project source | file source
def check_proba_classif_convergence(est, X_train, y_train):
    lb = LabelBinarizer()
    y_bin = lb.fit_transform(y_train)
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    proba = est.predict_proba(X_train)
    labels = est.predict(X_train)
    assert_array_equal(proba, y_bin)
    assert_array_equal(labels, lb.inverse_transform(y_bin))

    # For points completely far away from the training data, this
    # should converge to the empirical distribution of labels.
    X_inf = np.vstack((30.0 * np.ones(X_train.shape[1]),
                       -30.0 * np.ones(X_train.shape[1])))
    inf_proba = est.predict_proba(X_inf)
    emp_proba = np.bincount(y_enc) / float(len(y_enc))
    assert_array_almost_equal(inf_proba, [emp_proba, emp_proba], 3)
Project: neighborhood_mood_aws    Author: jarrellmark    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: hate-to-hugs    Author: sdoran35    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: whereareyou    Author: futurice    | project source | file source
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(), filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
Project: FancyWord    Author: EastonLee    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: muffnn    Author: civisanalytics    | project source | file source
def _fit_targets(self, y, classes=None):
        self.multilabel_ = self._is_multilabel(y)

        # If provided, use classes to fit the encoder and set classes_.
        # Otherwise, find the unique classes in y.
        if classes is not None:
            y = classes

        if self.multilabel_:
            self._enc = None
            self.classes_ = np.arange(y.shape[1])
            self.n_classes_ = y.shape[1]
        else:
            self._enc = LabelEncoder().fit(y)
            self.classes_ = self._enc.classes_
            self.n_classes_ = len(self.classes_)
Project: beepboop    Author: nicolehe    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: kdd99-scikit    Author: PENGZhaoqing    | project source | file source
def label_encoding(self, dataset):
        """

        :param data_set:
        :param data_target:
        :return: data_set
        """

        le_1 = preprocessing.LabelEncoder()
        le_2 = preprocessing.LabelEncoder()
        le_3 = preprocessing.LabelEncoder()

        le_1.fit(np.unique(dataset[:, 1]))
        le_2.fit(np.unique(dataset[:, 2]))
        le_3.fit(np.unique(dataset[:, 3]))

        dataset[:, 1] = le_1.transform(dataset[:, 1])
        dataset[:, 2] = le_2.transform(dataset[:, 2])
        dataset[:, 3] = le_3.transform(dataset[:, 3])

        return dataset
Project: kind2anki    Author: prz3m    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: but_sentiment    Author: MixedEmotions    | project source | file source
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
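
gen_features sidesteps the unseen-label problem by fitting each encoder on the union of the train and test values, so transform() can never hit a label outside the vocabulary. A compact standalone illustration with made-up values:

from sklearn import preprocessing

train_vals = ['no', 'yes']
test_vals = ['yes', 'sometimes']  # 'sometimes' never appears in train
le = preprocessing.LabelEncoder()
le.fit(train_vals + test_vals)    # fit on the union, as above
print(le.transform(test_vals))    # [2 1], no ValueError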
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
Project: mlbootcamp_5    Author: ivan-filonov    | project source | file source
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
Project: Kaggle-Competition-Sberbank    Author: LenzDu    | project source | file source
def FeatureCombination(Df,s='',num_feature=2): 
    feature_set = []
    for c in Df.columns:
        if c.startswith(s): feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0,num_feature):
        Df[s+'_%d'%(i+1)] = trans[:,i]
    Df.drop(feature_set,1,inplace=True)
    return Df
Project: TextClassification    Author: AlgorTroy    | project source | file source
def create_codes(df, column_name, revive=False, model_code=0):
    print('Encoding', column_name, '...')
    # get unique data
    nms_unique = df[column_name].unique().tolist()

    # fit model

    if not revive:
        print('Creating new Label Encoder...')
        le = LabelEncoder()
        le.fit(nms_unique)
    else:
        # Reload LE
        le_file_name = "LE_" + str(model_code)
        le = load_pickle(ROOT_PATH + '\\Data\\PickleJar\\' + le_file_name + '.pkl')
    # get all data
    nms = df[column_name].tolist()

    return le.transform(nms), le
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def addDailyReturn(dataset):
    """
    Adding in daily return to create binary classifiers (Up or Down in relation to the previous day)
    """

    #will normalize labels
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = -(dataset['Adj_Close']-dataset['Adj_Close'].shift(-1))/dataset['Adj_Close'].shift(-1)
    print(dataset['UpDown'])
    # 'up' will be denoted by 1 when transformed
    dataset.UpDown[dataset.UpDown >= 0] = "up"
    # 'down' will be denoted by 0 when transformed
    dataset.UpDown[dataset.UpDown < 0] = "down"
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    print(dataset['UpDown'])
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def addDailyReturn(dataset):
    """
    Adding in daily return to create binary classifiers (Up or Down in relation to the previous day)
    """

    #will normalize labels
    le = preprocessing.LabelEncoder()

    #print "dataset['Adj_Close']\n", dataset['Adj_Close'][:5]

    #print "dataset['Adj_Close'].shift(-1)\n", dataset['Adj_Close'].shift(1)[:5]

    dataset['UpDown'] = (dataset['Adj_Close']-dataset['Adj_Close'].shift(1))/dataset['Adj_Close'].shift(1)
    #print dataset['UpDown'][240:]

    # 'sell' will be denoted by 2 when transformed
    dataset.UpDown[dataset.UpDown > 0] = "sell"

    # 'hold' will be denoted by 1 when transformed
    dataset.UpDown[dataset.UpDown == 0] = "hold"

    # 'buy' will be denoted by 0 when transformed
    dataset.UpDown[dataset.UpDown < 0] = "buy"
    #print dataset['UpDown'][:10]
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    #print dataset['UpDown']
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def addDailyReturn(dataset):
    """
    Adding in daily return to create binary classifiers (Up or Down in relation to the previous day)
    """

    #will normalize labels
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = -(dataset['Adj_Close']-dataset['Adj_Close'].shift(-1))/dataset['Adj_Close'].shift(-1)
    print(dataset['UpDown'][:5])
    # 'up' will be denoted by 1 when transformed
    dataset.UpDown[dataset.UpDown >= 0] = "up"
    # 'down' will be denoted by 0 when transformed
    dataset.UpDown[dataset.UpDown < 0] = "down"
    print(dataset['UpDown'])
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
#   print dataset['UpDown'][:5]
Project: kaggle-prudential-sample    Author: threecourse    | project source | file source
def create_id_df(cls, df, is_train):
        """
        :rtype: DataFrame
        :return: dataFrame, sorted by id, 
                 columns are ["label", "id0", "id", "id_tr", "id_te"]
        """

        df = df[["id0", "label"]].copy()
        df = df.reset_index(drop=True)
        is_train = np.array(is_train)

        le_tr = LabelEncoder().fit(df.id0[is_train])
        le_te = LabelEncoder().fit(df.id0[~is_train])

        df["id_tr"] = np.nan
        df["id_te"] = np.nan
        df.loc[is_train, "id_tr"] = le_tr.transform(df.id0[is_train])
        df.loc[~is_train, "id_te"] = le_te.transform(df.id0[~is_train])
        df["id"] = np.where(np.isnan(df["id_tr"]), len(le_tr.classes_) + df["id_te"], df["id_tr"])

        df = df.fillna(-1)
        df = df.sort("id")
        df = df[["label", "id0", "id", "id_tr", "id_te"]]

        return df
Project: quantopian-machinelearning    Author: arshpreetsingh    | project source | file source
def create_model(context, data):
    # Get the relevant daily prices
    recent_prices = data.history(context.assets, 'price',context.history_range, '1d')

    context.ma_50 =recent_prices.values[-50:].mean()     
    context.ma_200 = recent_prices.values[-200:].mean() 
    #print context.ma_50
    #print context.ma_200
    time_lags = pd.DataFrame(index=recent_prices.index)
    time_lags['price']=recent_prices.values
    time_lags['daily_returns']=time_lags['price'].pct_change()
    time_lags['multiple_day_returns'] =  time_lags['price'].pct_change(3)
    time_lags['rolling_mean'] = time_lags['daily_returns'].rolling(window = 4,center=False).mean()

    time_lags['time_lagged'] = time_lags['price']-time_lags['price'].shift(-2)
    X = time_lags[['price','daily_returns','multiple_day_returns','rolling_mean']].dropna()

    time_lags['updown'] = time_lags['daily_returns']
    time_lags.updown[time_lags['daily_returns']>=0]='up'
    time_lags.updown[time_lags['daily_returns']<0]='down'
    le = preprocessing.LabelEncoder()
    time_lags['encoding']=le.fit(time_lags['updown']).transform(time_lags['updown'])
  #  X = time_lags[['lag1','lag2']] # Independent, or input variables
   # Y = time_lags['direction'] # Dependent, or output variable
    context.model.fit(X,time_lags['encoding'][4:]) # Generate our model
Project: acton    Author: chengsoonong    | project source | file source
def deserialise_encoder(
            encoder: acton_pb.Database.LabelEncoder
        ) -> sklearn.preprocessing.LabelEncoder:
    """Deserialises a LabelEncoder protobuf.

    Parameters
    ----------
    encoder
        LabelEncoder protobuf.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        LabelEncoder (or None if no encodings were specified).
    """
    encodings = []
    for encoding in encoder.encoding:
        encodings.append((encoding.class_int, encoding.class_label))
    encodings.sort()
    encodings = numpy.array([c[1] for c in encodings])

    encoder = SKLabelEncoder()
    encoder.classes_ = encodings
    return encoder
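
The function works because a LabelEncoder is fully determined by its classes_ array, so assigning that attribute by hand yields a working encoder. A minimal sketch with illustrative labels:

import numpy
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.classes_ = numpy.array(['NEG', 'NEU', 'POS'])  # must be sorted
print(encoder.transform(['POS', 'NEG']))               # [2 0]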
Project: Prudential-Life-Insurance-Assessment    Author: AntonUBC    | project source | file source
def fit(self, X, y):
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)
        self.num_classes = np.unique(y).shape[0]
        sf = xgb.DMatrix(X, y)
        params = {"objective": 'multi:softprob',
          "eta": self.eta,
          "gamma": self.gamma,
          "max_depth": self.max_depth,
          "min_child_weight": self.min_child_weight,
          "max_delta_step": self.max_delta_step,
          "subsample": self.subsample,
          "silent": self.silent,
          "colsample_bytree": self.colsample_bytree,
          "seed": self.seed,
          "lambda": self.l2_reg,
          "alpha": self.l1_reg,
          "num_class": self.num_classes}
        self.model = xgb.train(params, sf, self.num_round)

        return self
Project: guacml    Author: guacml    | project source | file source
def execute_inplace(self, data):
        df = data.df
        meta = data.metadata

        classes = {}
        cols_to_encode = meta[meta.type == ColType.CATEGORICAL].index
        for col in cols_to_encode:
            enc = LE()
            df.loc[df[col].notnull(), col] = enc.fit_transform(df.loc[df[col].notnull(), col])
            df[col] = df[col].astype(float)
            meta.loc[col, 'type'] = ColType.INT_ENCODING
            meta.loc[col, 'derived_from'] = col
            classes[col] = enc.classes_
            self.logger.info('LabelEncoder: encoded %s', col)

        self.state = {'classes': classes}
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def pre_process_data():
    for col in categorical_fields:
        data_frame[col].fillna('default',inplace=True)
        data_frame_test[col].fillna('default',inplace=True)

    for col in numerical_fields:
        data_frame[col].fillna(0,inplace=True)
        data_frame_test[col].fillna(0,inplace=True)

    encode=LabelEncoder()
    for col in categorical_fields:
        data_frame[col]=encode.fit_transform(data_frame[col])
        # note: refitting on the test frame can assign different codes than the train frame
        data_frame_test[col]=encode.fit_transform(data_frame_test[col])
    data_frame['SalePrice'].fillna(0,inplace=True)
Project: keras-utilities    Author: cbaziotis    | project source | file source
def labels_to_categories(y):
    """
    Labels to categories
    :param y: list of labels, ex. ['positive', 'negative', 'positive', 'neutral', 'positive', ...]
    :return: list of categories, ex. [2, 0, 2, 1, 2, ...]
    """
    encoder = LabelEncoder()
    encoder.fit(y)
    y_num = encoder.transform(y)
    return y_num
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def label_classes(df, estimated_var):
        le = LabelEncoder()
        le.fit(df[estimated_var].values)
        return le.classes_
Project: TrackToTrip    Author: ruipgil    | project source | file source
def __init__(self, classifier=None):
        if classifier:
            self.clf = classifier
        else:
            self.clf = SGDClassifier(loss="log", penalty="l2", shuffle=True, n_iter=2500)
        self.labels = preprocessing.LabelEncoder()
        self.feature_length = -1
Project: YOLO-Object-Detection-Tensorflow    Author: huseinzol05    | project source | file source
def get_dataset():

    list_folder = os.listdir('data/')
    list_images = []
    for i in range(len(list_folder)):
        images = os.listdir('data/' + list_folder[i])
        for x in range(len(images)):
            image = [list_folder[i] + '/' + images[x], list_folder[i]]
            list_images.append(image)
    list_images = np.array(list_images)
    np.random.shuffle(list_images)

    print "before cleaning got: " + str(list_images.shape[0]) + " data"

    list_temp = []
    for i in xrange(list_images.shape[0]):
        image = misc.imread('data/' + list_images[i, 0])
        if len(image.shape) < 3:
            continue
        list_temp.append(list_images[i, :].tolist())

    list_images = np.array(list_temp)
    print "after cleaning got: " + str(list_images.shape[0]) + " data"
    label = np.unique(list_images[:, 1]).tolist()
    list_images[:, 1] = LabelEncoder().fit_transform(list_images[:, 1])
    return list_images, np.unique(list_images[:, 1]).shape[0], label
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def __do_label_encoding(self):
        df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        le = LabelEncoder()
        cross_feature_dict = self.__get_label_encode_dict()
        for _, new_feature_name in cross_feature_dict.items():
            to_be_stacked = [df_train[new_feature_name], df_testset1[new_feature_name], df_testset2[new_feature_name]]
            le.fit(pd.concat(to_be_stacked, axis=0))
            df_train[new_feature_name] = le.transform(df_train[new_feature_name])
            df_testset1[new_feature_name] = le.transform(df_testset1[new_feature_name])
            df_testset2[new_feature_name] = le.transform(df_testset2[new_feature_name])

        return
Project: skutil    Author: tgsmith61591    | project source | file source
def fit(self, column):
        self.encoder_ = LabelEncoder().fit(h2o_col_to_numpy(column))
        self.classes_ = self.encoder_.classes_
        return self
Project: ltls    Author: kjasinska    | project source | file source
def __init__(self, multilabel=False):
        self.multilabel = multilabel
        if self.multilabel:
            self.le = MultiLabelBinarizer(sparse_output=True)
        else:
            self.le = LabelEncoder()
        self.from_classes = False
Project: quoll    Author: LanguageMachines    | project source | file source
def __init__(self):
        self.label_encoder = preprocessing.LabelEncoder()
Project: datacleaner    Author: rhiever    | project source | file source
def test_autoclean_no_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    hand_cleaned_data = data.copy()
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)

    assert cleaned_data.equals(hand_cleaned_data)