Python sklearn.preprocessing 模块,OneHotEncoder() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.preprocessing.OneHotEncoder()

项目:Supply-demand-forecasting    作者:LevinJ    | 项目源码 | 文件源码
def __do_one_hot_encodings(self):
        """One-hot encode the cross features shared by all data splits.

        A single encoder is fit on the stacked train + test1 + test2 frames so
        every split shares the same category-to-column mapping, then each split
        is transformed separately and written back into res_data_dict.
        """
        df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        enc = OneHotEncoder(sparse=False)
        cross_feature_dict = self.__get_label_encode_dict()
        # list(dict.values()) works on both Python 2 and 3; the original
        # .iteritems() loop was Python-2-only and only collected the values.
        to_be_encoded = list(cross_feature_dict.values())
        # fit on all data sources so every split sees the same categories
        to_be_stacked_df = pd.concat([df_train[to_be_encoded], df_testset1[to_be_encoded], df_testset2[to_be_encoded]], axis = 0)
        enc.fit(to_be_stacked_df)

        enc, to_be_encoded = self.__filter_too_big_onehot_encoding(enc, to_be_encoded, df_train, df_testset1, df_testset2)
        # transform on separate data source; the train entry keeps its (df, cv) pair
        self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = self.__do_one_hot_encoding(df_train, enc, to_be_encoded), cv
        self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = self.__do_one_hot_encoding(df_testset1, enc, to_be_encoded)
        self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
        return
项目:char-rbm    作者:colinmorris    | 项目源码 | 文件源码
def vectors_from_txtfile(fname, codec, limit=-1, mutagen=None):
    """Encode each line of *fname* with *codec* and one-hot the results.

    Lines the codec rejects (too long, or containing illegal characters) are
    counted per rejection reason and skipped. At most *limit* vectors are
    gathered (-1 means no limit).
    """
    skipped = Counter()
    vecs = []
    # 'with' guarantees the handle is closed; the original leaked the file
    with open(fname) as f:
        for line in f:
            line = line.strip()
            try:
                vecs.append(codec.encode(line, mutagen=mutagen))
                if len(vecs) == limit:
                    break
            except NonEncodableTextException as e:
                # Too long, or illegal characters
                skipped[e.reason] += 1

    logging.debug("Gathered {} vectors. Skipped {} ({})".format(len(vecs), 
        sum(skipped.values()), dict(skipped)))
    vecs = np.asarray(vecs)
    # TODO: Why default to dtype=float? Seems wasteful? Maybe it doesn't really matter. Actually, docs here seem inconsistent? Constructor docs say default float. transform docs say int. Should file a bug on sklearn.
    return OneHotEncoder(len(codec.alphabet)).fit_transform(vecs)

# Adapted from sklearn.utils.extmath.softmax
项目:ottertune    作者:cmu-db    | 项目源码 | 文件源码
def __init__(self, n_values, feature_indices):
        """Wrap sklearn's OneHotEncoder for known categorical columns.

        n_values -- number of distinct labels for each categorical feature
        feature_indices -- column indices of the categorical features; must
                           have the same shape as n_values
        """
        import warnings
        from sklearn.preprocessing import OneHotEncoder

        # normalize to ndarrays so the shape comparison below is meaningful
        if not isinstance(n_values, np.ndarray):
            n_values = np.array(n_values)
        if not isinstance(feature_indices, np.ndarray):
            feature_indices = np.array(feature_indices)
        assert feature_indices.size > 0
        assert feature_indices.shape == n_values.shape
        # features with <= 2 labels are rejected — presumably binary features
        # are handled elsewhere without expansion (not visible here)
        for nv in n_values:
            if nv <= 2:
                raise Exception("Categorical features must have 3+ labels")

        self.feature_indices = feature_indices
        self.n_values = n_values
        # n_values= emits a deprecation warning on newer sklearn; silence it
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.encoder = OneHotEncoder(n_values=n_values, sparse=False)
        # filled in later by fitting code — not visible in this snippet
        self.columnlabels = None
        self.xform_start_indices = None
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_boston_OHE_plus_trees(self):
        """End-to-end check of a OneHotEncoder + GBT pipeline on Boston housing."""
        dataset = load_boston()

        pipeline = Pipeline([
            ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
            ("Trees", GradientBoostingRegressor(random_state=1)),
        ])
        pipeline.fit(dataset.data, dataset.target)

        # Convert the fitted pipeline
        spec = convert(pipeline, dataset.feature_names, 'target')

        # Reference predictions from the sklearn pipeline
        frame = pd.DataFrame(dataset.data, columns=dataset.feature_names)
        frame['prediction'] = pipeline.predict(dataset.data)

        # Converted model must match the reference almost exactly
        result = evaluate_regressor(spec, frame, 'target', verbose=False)
        assert result["max_error"] < 0.0001
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_boston_OHE(self):
        """Round-trip a standalone OneHotEncoder for several column choices."""
        dataset = load_boston()

        for cat_cols in [[3], [8], [3, 8], [8, 3]]:
            encoder = OneHotEncoder(categorical_features=cat_cols, sparse=False)
            encoder.fit(dataset.data, dataset.target)

            # Convert the fitted encoder
            spec = sklearn.convert(encoder, dataset.feature_names, 'out').get_spec()

            input_data = [dict(zip(dataset.feature_names, row)) for row in dataset.data]
            output_data = [{"out" : row} for row in encoder.transform(dataset.data)]

            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0

    # This test still isn't working
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_boston_OHE_pipeline(self):
        """OHE inside a pipeline; verifies output-dimension handling is correct."""
        dataset = load_boston()

        for cat_cols in [[3], [8], [3, 8], [8, 3]]:
            # The trailing Normalizer forces the converter to get the OHE's
            # output dimension right.
            pipeline = Pipeline([
                ("OHE", OneHotEncoder(categorical_features=cat_cols)),
                ("Normalizer", Normalizer()),
            ])
            pipeline.fit(dataset.data.copy(), dataset.target)

            # Convert the fitted pipeline
            spec = sklearn.convert(pipeline, dataset.feature_names, 'out').get_spec()

            input_data = [dict(zip(dataset.feature_names, row)) for row in dataset.data]
            output_data = [{"out" : row} for row in pipeline.transform(dataset.data.copy())]

            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
项目:Steal-ML    作者:ftramer    | 项目源码 | 文件源码
def __init__(self, X, y, multinomial, rounding=None):
        """Train a logistic-regression target model on one-hot encoded data.

        X -- pandas DataFrame of features; y -- target labels
        multinomial -- True when the model is multinomial (asserted to be
                       inconsistent with a binary problem below)
        rounding -- optional rounding setting stored for the extractor
        """
        self.input_features = X.columns.values

        X = X.values

        # Heuristic: treat any column whose minimum is 0 as categorical.
        # NOTE(review): this assumes categorical columns are 0-based integer
        # codes — confirm against the data-loading code.
        cat_idx = [i for i in range(X.shape[1]) if min(X[:, i]) == 0]
        self.encoder = OneHotEncoder(categorical_features=cat_idx, sparse=False)

        X = self.encoder.fit_transform(X)

        self.features = range(X.shape[1])
        self.rounding = rounding

        # train a model on the whole dataset
        self.model = LogisticRegression()
        self.model.fit(X, y)

        self.w = self.model.coef_
        self.intercept = self.model.intercept_
        self.multinomial = multinomial
        # multinomial mode makes no sense for a two-class problem
        assert not (multinomial and len(self.get_classes()) == 2)

        RegressionExtractor.__init__(self)
项目:RandomForestClustering    作者:joshloyal    | 项目源码 | 文件源码
def fit_transform(self, X, y=None, sample_weight=None):
        """Fit the embedding forest on X and return its encoded leaf indices.

        A discriminative dataset (real vs. synthetic samples) is generated so
        the forest can be trained without labels; y is ignored.
        """
        X = check_array(X, accept_sparse=['csc'], ensure_2d=False)

        if sp.issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        X_, y_ = generate_discriminative_dataset(X)

        super(RandomForestEmbedding, self).fit(X_, y_,
                                               sample_weight=sample_weight)

        self.one_hot_encoder_ = OneHotEncoder(sparse=True)
        if self.sparse_output:
            return self.one_hot_encoder_.fit_transform(self.apply(X))
        # NOTE(review): the non-sparse branch returns raw leaf indices without
        # one-hot encoding, unlike the sparse branch — confirm this asymmetry
        # is intentional.
        return self.apply(X)
项目:ML-note    作者:JasonK93    | 项目源码 | 文件源码
def test_OneHotEncoder():
    '''
    Demonstrate OneHotEncoder on a small dense integer matrix.
    :return: None
    '''
    samples = [
        [1, 2, 3, 4, 5],
        [5, 4, 3, 2, 1],
        [3, 3, 3, 3, 3],
        [1, 1, 1, 1, 1],
    ]
    print("before transform:", samples)
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(samples)
    print("active_features_:", encoder.active_features_)
    print("feature_indices_:", encoder.feature_indices_)
    print("n_values_:", encoder.n_values_)
    print("after transform:", encoder.transform([[1, 2, 3, 4, 5]]))
项目:kddcup2017    作者:floydwch    | 项目源码 | 文件源码
def get_one_hot_key():
    """Build a closure that one-hot encodes an (intersection, tollgate) key."""
    intersection_id_map = {'A': 0, 'B': 1, 'C': 2}
    encoder = OneHotEncoder(n_values=[3, 3], sparse=False)
    encoder.fit([[0, 0]])

    def one_hot_key(ix, **kargs):
        # map the letter id and shift the 1-based tollgate to 0-based
        row = [intersection_id_map[ix[-2]], ix[-1] - 1]
        return encoder.transform([row])[0].tolist()

    return one_hot_key
项目:AutoSleepScorerDev    作者:skjerns    | 项目源码 | 文件源码
def one_hot(hypno, n_categories):
    """One-hot encode hypnogram labels into a dense int32 array."""
    encoded = OneHotEncoder(n_values=n_categories).fit_transform(hypno)
    return np.array(encoded.toarray(), 'int32')
项目:PortfolioTimeSeriesAnalysis    作者:MizioAnd    | 项目源码 | 文件源码
def one_hot_encoder(df, estimated_var):
        """One-hot encode df[estimated_var + 'Num'] into new columns of df.

        Adds one column per factor level of df[estimated_var], named
        '<estimated_var>_<level>'. Rows where df[estimated_var] is null keep
        the original column value. Mutates df in place; returns None.
        """
        df_class = df.copy()
        ohe = OneHotEncoder()
        # factorize()[1] yields the unique non-null levels of the column
        label_classes = df_class[estimated_var].factorize()[1]
        new_one_hot_encoded_features = [''.join([estimated_var, '_', x]) for x in label_classes]
        mask = ~df[estimated_var].isnull()
        # encode the numeric companion column '<estimated_var>Num', using only
        # rows whose label is non-null
        feature_var_values = ohe.fit_transform(np.reshape(np.array(df[''.join([estimated_var, 'Num'])][mask].values),
                                                          (df[mask].shape[0], 1))).toarray().astype(int)
        # Create new feature_var columns with one-hot encoded values
        for ite in new_one_hot_encoded_features:
            df[ite] = df[estimated_var]
        df.loc[mask, tuple(new_one_hot_encoded_features)] = feature_var_values
项目:feagen    作者:ianlini    | 项目源码 | 文件源码
def gen_pclass(self, data):
        """One-hot encode the Titanic 'Pclass' column.

        Returns {'pclass': dense one-hot matrix} built from data['data_df'].
        """
        from sklearn.preprocessing import OneHotEncoder
        data_pclass = data['data_df'][['Pclass']]
        # set unknown as a class
        # NOTE(review): fillna(inplace=True) on a column selection may hit
        # pandas' SettingWithCopy behavior — confirm it mutates as intended.
        data_pclass.fillna(4, inplace=True)
        return {'pclass': OneHotEncoder(sparse=False)
                            .fit_transform(data_pclass.values)}
项目:mobike    作者:angryBird2014    | 项目源码 | 文件源码
def generate_train_random_batch(data, label, batch_size, is_train=True):
    """Sample a random batch (with replacement) from a pandas DataFrame.

    data -- DataFrame of features
    label -- Series/DataFrame of labels aligned with data
    batch_size -- number of rows to draw
    is_train -- when True, also return the matching label batch

    Returns (data_batch, label_batch, indicator) when is_train is True, else
    (data_batch, indicator); indicator is a 0/1 vector of length len(data)
    marking which rows were drawn.
    """
    indics = np.random.randint(0, len(data), size=batch_size)
    vector = np.zeros([len(data)])
    vector[indics] = 1

    data_batch = data.iloc[indics]
    if is_train:
        label_batch = label.iloc[indics]
        # .values replaces the deprecated .as_matrix(), which was removed
        # in pandas 0.25; the returned ndarray is identical
        return data_batch.values, label_batch.values, vector
    else:
        return data_batch.values, vector
项目:HousePrices    作者:MizioAnd    | 项目源码 | 文件源码
def one_hot_encoder(df, estimated_var):
        """Expand df[estimated_var + 'Num'] into one-hot columns on df.

        One new column is added per level of df[estimated_var], named
        '<estimated_var>_<level>'. Null-label rows are left with the original
        column value. df is modified in place.
        """
        df_class = df.copy()
        ohe = OneHotEncoder()
        # unique non-null levels of the categorical column
        label_classes = df_class[estimated_var].factorize()[1]
        new_one_hot_encoded_features = [''.join([estimated_var, '_', x]) for x in label_classes]
        mask = ~df[estimated_var].isnull()
        # the 'Num' companion column carries the integer codes to encode
        feature_var_values = ohe.fit_transform(np.reshape(np.array(df[''.join([estimated_var, 'Num'])][mask].values),
                                                          (df[mask].shape[0], 1))).toarray().astype(int)
        # Create new feature_var columns with one-hot encoded values
        for ite in new_one_hot_encoded_features:
            df[ite] = df[estimated_var]
        df.loc[mask, tuple(new_one_hot_encoded_features)] = feature_var_values
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""

        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = RandomForestClassifier()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""
        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = RandomForestClassifier()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        with self.assertRaises(Exception):
            from sklearn.preprocessing import OneHotEncoder
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise TypeError."""
        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = NuSVR()
            spec = scikit_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = scikit_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """libsvm converter must reject a non-libsvm model with TypeError."""
        # Check the expected class during conversion.
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = libsvm.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""

        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = DecisionTreeRegressor()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""

        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = GradientBoostingRegressor()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """xgboost converter must raise TypeError for unsupported models."""

        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = GradientBoostingRegressor()
            spec = xgb_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = xgb_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise TypeError."""
        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = SVR()
            spec = sklearn_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = sklearn_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """libsvm converter must reject a non-libsvm model with TypeError."""
        # Check the expected class during conversion.
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = libsvm.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_one_column(self):
        """Round-trip a single-column OneHotEncoder through the converter."""
        # Fit a single OHE
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data)
        spec = sklearn.convert(scikit_model, 'single_feature', 'out').get_spec()

        test_data = [{'single_feature' : row} for row in self.scikit_data]
        scikit_output = [{'out' : row} for row in scikit_model.transform(self.scikit_data).toarray()]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(metrics['num_errors'], 0)
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_many_columns(self):
        """Round-trip a multi-column OneHotEncoder through the converter."""
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data_multiple_cols)
        spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

        test_data = [{'feature_1': row[0], 'feature_2': row[1]} for row in self.scikit_data_multiple_cols]
        scikit_output = [{'out': row} for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(metrics['num_errors'], 0)
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_one_column_of_several(self):
        """Encode only the first of two columns and round-trip the converter."""
        scikit_model = OneHotEncoder(categorical_features = [0])
        scikit_model.fit(copy(self.scikit_data_multiple_cols))
        spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

        test_data = [{'feature_1': row[0], 'feature_2': row[1]} for row in self.scikit_data_multiple_cols]
        scikit_output = [{'out': row} for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(metrics['num_errors'], 0)
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """
        Failure testing for bad conversion: an untrained/unsupported model
        must raise TypeError.
        """
        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = converter.convert(model, 'data', 'out', 'regressor')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise TypeError."""
        from sklearn.preprocessing import OneHotEncoder

        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = SVC()
            spec = scikit_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = scikit_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise TypeError."""
        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = LinearRegression()
            spec = convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""

        # Error on converting an untrained model.
        # Sibling tests call skl_converter.convert(...); calling the module
        # object itself always raised TypeError, which masked the intended
        # untrained-model check.
        with self.assertRaises(Exception):
            model = DecisionTreeClassifier()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""

        # Error on converting an untrained model.
        # The original called the skl_converter module object instead of its
        # convert() function (as every sibling test does); the resulting
        # TypeError satisfied assertRaises without exercising the converter.
        with self.assertRaises(Exception):
            model = DecisionTreeClassifier()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""
        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = GradientBoostingClassifier()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise TypeError."""
        from sklearn.preprocessing import OneHotEncoder

        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = NuSVC()
            spec = scikit_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = scikit_converter.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """libsvm converter must reject a non-libsvm model with TypeError."""
        # Check the expected class during conversion.
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = libsvm.convert(model, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """Converting an untrained or unsupported model must raise."""

        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = RandomForestRegressor()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        from sklearn.preprocessing import OneHotEncoder
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
项目:mlprojects-py    作者:srinathperera    | 项目源码 | 文件源码
def encode_onehot(df, cols, vec=None):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.

    Modified from: https://gist.github.com/kljensen/5452382

    Details:

    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @param vec a fitted OneHotEncoder to reuse; when None a new one is fit
    @return (DataFrame with one-hot columns f0..fN joined on, fitted encoder)
    """
    x_data = df[cols]
    if vec is None:
        # first call: fit a fresh encoder on these columns
        vec = preprocessing.OneHotEncoder()
        results = vec.fit_transform(x_data).toarray()
    else:
        # reuse the encoder fitted on an earlier call (e.g. on train data)
        results = vec.transform(x_data).toarray()

    result_size = results.shape[1]

    #TODO bug in pca code, find and fix
    #after_pca_size = 4*len(cols)
    #if(result_size > 5 and after_pca_size < result_size):
    #    results = results[:min(500000,  results.shape[0])]
    #    results = doPCA(results, after_pca_size)
    #    result_size = after_pca_size

    # wrap the encoded matrix as columns f0..fN aligned to df's index
    vec_data = pd.DataFrame(results)
    vec_data.columns = ["f"+str(i) for i in range(0, result_size)]
    vec_data.index = df.index

    # original columns are intentionally kept (drop is commented out)
    #df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df, vec
项目:jsaicup2017    作者:SS1031    | 项目源码 | 文件源码
def day_of_week(encode_flg=False):
    """Day-of-week feature for the training datetimes.

    Returns a one-hot encoded DataFrame (columns dow0..dow6) when encode_flg
    is True, otherwise a single integer 'dow' column. Indexed by the training
    datetimes. (Original docstring was mojibake.)
    """
    train_df = data_loader.train()
    day_of_week_df = train_df.datetime.dt.dayofweek
    day_of_week_df.index = train_df.datetime
    if encode_flg:
        oh_encoder = OneHotEncoder(sparse=False)
        return pd.DataFrame(
            oh_encoder.fit_transform(train_df.datetime.dt.dayofweek.values.reshape(-1, 1)),
            index=train_df.datetime,
            columns=['dow0', 'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6']
        )

    return day_of_week_df.to_frame('dow')
项目:supervised-hashing-baselines    作者:facebookresearch    | 项目源码 | 文件源码
def getmAP(clf, X_base, y_base, X_query, y_query, id_label, y_label):
    """Compute mean average precision of retrieval over a faiss index.

    Base activations come from clf.predict_proba; the labeled subset
    (id_label rows) is replaced with the one-hot ground truth before
    indexing. args.code selects the representation: 'onehot' re-encodes
    the argmax, 'lsh' hashes to args.nbits bits, otherwise raw inner
    product is used.
    """
    # labels arrive as column vectors; keep only the first column
    y_base, y_query = y_base[:, 0], y_query[:, 0]

    oneh = OneHotEncoder(sparse=False)
    y_label_1h = oneh.fit_transform(y_label)
    activations = clf.predict_proba(X_base)
    # inject ground-truth one-hots for the labeled rows
    activations[id_label] = y_label_1h

    if args.code == "onehot":
        argmax = activations.argmax(axis=1).reshape((-1, 1))
        activations = oneh.fit_transform(argmax)

    if args.code == "lsh":
        index = faiss.IndexLSH(y_label_1h.shape[1], args.nbits, True, True)
    else:
        index = faiss.IndexFlatIP(y_label_1h.shape[1])

    index.train(activations.astype(np.float32))
    index.add(activations.astype(np.float32))

    queryAct = clf.predict_proba(X_query).astype(np.float32)

    # rank the entire base for every query, then score label matches
    _, idc = index.search(queryAct, y_base.shape[0])
    predictions = y_base[idc]
    results = np.equal(predictions, np.expand_dims(y_query, axis=1))

    return computemAP(results)
项目:sklearn-random-bits-forest    作者:tmadl    | 项目源码 | 文件源码
def getdataset(datasetname, onehot_encode_strings=True):
    """Fetch an mldata.org dataset and return (X, y) as numeric arrays.

    Python 2 code. String/unicode columns are one-hot encoded when
    onehot_encode_strings is True; y is converted to monotonically
    increasing ints. NaNs are zeroed.
    """
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print "WARNING: No target found. Taking last column of data matrix as target"
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape)>1 and target.shape[1]>X.shape[1]: # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        # degenerate X: pull in any other array-valued keys of matching width
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft=[i for i in range(X.shape[1]) if 'str' in str(type(unpack(X[0,i]))) or 'unicode' in str(type(unpack(X[0,i])))]
        if len(cat_ft): 
            for i in cat_ft:
                X[:,i] = tonumeric(X[:,i]) 
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)),y
项目:heamy    作者:rushter    | 项目源码 | 文件源码
def xgb_to_features(model, X_train, X_test):
    """Converts xgboost model into categorical features.
    Reference:
    "Practical Lessons from Predicting Clicks on Ads at Facebook"
    https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/

    Returns (train_features, test_features): sparse one-hot encodings of the
    leaf index each sample lands in, per tree. The encoder is fit on the
    training leaves only.
    """
    import xgboost as xgb
    # pred_leaf=True yields the leaf index per tree instead of a prediction
    f_train = model.predict(xgb.DMatrix(X_train), pred_leaf=True)
    f_test = model.predict(xgb.DMatrix(X_test), pred_leaf=True)
    enc = OneHotEncoder()
    enc.fit(f_train)
    return enc.transform(f_train), enc.transform(f_test)
项目:tc_koubei_newBird    作者:yangydeng    | 项目源码 | 文件源码
def make_OHE(names):
    """One-hot encode a sequence of values, one single-column sample each."""
    columns = [[name] for name in names]
    enc = preprocessing.OneHotEncoder()
    # fit + transform on the same data is equivalent to the original
    # separate fit()/transform() pair
    return enc.fit_transform(columns).toarray()
项目:Text-Classification-with-Tensorflow    作者:jrzaurin    | 项目源码 | 文件源码
def one_hot(x):
    """Densely one-hot encode a 1-D array of category labels."""
    column = x.reshape(-1, 1)
    encoded = OneHotEncoder().fit_transform(column)
    return np.array(encoded.todense())
项目:distracted-drivers-keras    作者:fomorians    | 项目源码 | 文件源码
def load_train(base):
    driver_imgs_list = pd.read_csv('driver_imgs_list.csv')
    driver_imgs_grouped = driver_imgs_list.groupby('classname')

    X_train = []
    y_train = []
    driver_ids = []

    print('Reading train images...')
    for j in range(NUM_CLASSES):
        print('Loading folder c{}...'.format(j))
        driver_ids_group = driver_imgs_grouped.get_group('c{}'.format(j))
        paths = os.path.join(base, 'c{}/'.format(j)) + driver_ids_group.img

        if SUBSET:
            paths = paths[:100]
            driver_ids_group = driver_ids_group.iloc[:100]

        driver_ids += driver_ids_group['subject'].tolist()

        for i, path in tqdm(enumerate(paths), total=len(paths)):
            img = load_image(path)
            if i == 0:
                imsave('c{}.jpg'.format(j), img)
            img = img.swapaxes(2, 0)

            X_train.append(img)
            y_train.append(j)

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    y_train = OneHotEncoder(n_values=NUM_CLASSES) \
        .fit_transform(y_train.reshape(-1, 1)) \
        .toarray()

    return X_train, y_train, driver_ids
项目:DeepIV    作者:jhartford    | 项目源码 | 文件源码
def one_hot(col, **kwargs):
    """Dense one-hot encoding of a 1-D array; extra kwargs reach OneHotEncoder."""
    return OneHotEncoder(sparse=False, **kwargs).fit_transform(col.reshape(-1, 1))
项目:MorphoBabushka    作者:nvanva    | 项目源码 | 文件源码
def mapper(df, ngram_range, lowercase, binary, min_df=2, max_df=1.0, caps_features=True, pos_features=False):
    """Build a sparse DataFrameMapper of per-column feature extractors.

    Columns named by small integer strings ('-100'..'99') get their own
    CountVectorizer; '*_cap' columns get OneHotEncoder when caps_features;
    '*_Extra*' columns get LabelBinarizer when pos_features.
    """
    # word-position columns are named by their integer offset
    numbers = set('%d' % i for i in range(-100,100))
    feature_extractors = []
    feature_extractors.extend([
        (col, vec(ngram_range, lowercase, binary, min_df, max_df)) for col in df.columns if col in numbers
    ])  # we are using separate CountVectorizer for each word position (for min_df to work correctly)
    if caps_features:
        feature_extractors.extend([
            ([col], OneHotEncoder(sparse=False, handle_unknown='ignore')) for col in df.columns if col.endswith('_cap')
        ])
    if pos_features:
        feature_extractors.extend([
              (col, LabelBinarizer()) for col in df.columns if '_Extra' in col
        ])
    return DataFrameMapper(feature_extractors, sparse=True)
项目:highdimensional-decision-boundary-plot    作者:tmadl    | 项目源码 | 文件源码
def getdataset(datasetname, onehot_encode_strings=True):
    """Fetch an mldata.org dataset and return (X, y) as numeric arrays.

    String/unicode columns are one-hot encoded when onehot_encode_strings is
    True; y becomes monotonically increasing ints; NaNs are zeroed.
    """
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape) > 1 and target.shape[1] > X.shape[1]:  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        # degenerate X: pull in any other array-valued keys of matching width
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [i for i in range(X.shape[1]) if 'str' in str(
            type(unpack(X[0, i]))) or 'unicode' in str(type(unpack(X[0, i])))]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
项目:Xserpy    作者:brmson    | 项目源码 | 文件源码
def encode(features, labels):
    """One-hot encode the values of each feature

    Keyword arguments:
    features -- feature matrix
    labels -- labels of samples

    Returns an integer array with one row per sample: the one-hot encoded
    features followed by the sample's label in the last column.
    """
    enc = OneHotEncoder()
    enc.fit(features)
    arr = enc.transform(features).toarray()
    # Stack the label column directly instead of pre-allocating a zero matrix
    # and copying row by row; .astype(int) reproduces the original integer
    # dtype (the zero matrix was int, so floats were truncated on assignment).
    labels_col = np.asarray(labels).reshape(-1, 1)
    return np.hstack([arr, labels_col]).astype(int)
项目:kddcup2017    作者:floydwch    | 项目源码 | 文件源码
def get_one_hot_weekday():
    """Return a closure mapping a date index to a one-hot weekday vector."""
    encoder = OneHotEncoder(n_values=[7], sparse=False)
    encoder.fit([[0]])

    def one_hot_weekday(ix, **kargs):
        weekday = datetime.date(2016, *ix_to_date(ix)).weekday()
        return encoder.transform([[weekday]])[0].tolist()

    return one_hot_weekday
项目:kddcup2017    作者:floydwch    | 项目源码 | 文件源码
def get_one_hot_key():
    """Return a closure that one-hot encodes a (tollgate, direction) key."""
    encoder = OneHotEncoder(n_values=[3, 2], sparse=False)
    encoder.fit([[0, 0]])

    def one_hot_key(ix, **kargs):
        # shift the 1-based tollgate id to 0-based; direction is already 0/1
        key = [ix[-2] - 1, ix[-1]]
        return encoder.transform([key])[0].tolist()

    return one_hot_key