Python sklearn.preprocessing 模块，Binarizer() 实例源码

我们从Python开源项目中，提取了以下8个代码示例，用于说明如何使用sklearn.preprocessing.Binarizer()。

项目：pantip-libr 作者：starcolon | 项目源码 | 文件源码

def new(n_feature=128):
    """Create the three stages of the tag-hashing model.

    Args:
        n_feature: Used both as ``max_features`` of the count vectoriser and
            as the number of NMF components.

    Returns:
        A list ``[vectorizer, smoother, binariser]`` holding the newly
        constructed (not yet fitted) estimators, in the intended order of
        application: count vectoriser => NMF smoother => binariser.
    """
    # Stage 1: binary bag-of-words restricted to unigrams.
    count_stage = CountVectorizer(
        encoding='utf-8',
        ngram_range=(1, 1),
        max_features=n_feature,
        binary=True,
    )

    # Stage 2: fill the gaps left by missing expected tags.
    # Hypothesis: some tags are related to each other, so the missing values
    # are smoothed with a matrix factorisation.
    smoothing_stage = NMF(n_components=n_feature)

    # Stage 3: binarise the individual values of the smoothed vector.
    threshold_stage = Binarizer(copy=True)

    print(colored('Taghasher model created', 'yellow'))
    return [count_stage, smoothing_stage, threshold_stage]

项目：atap 作者：foxbook | 项目源码 | 文件源码

def sklearn_one_hot_vectorize(corpus):
    """One-hot encode a corpus with scikit-learn and print the vector widths.

    The corpus is first turned into token counts with ``CountVectorizer`` and
    the dense count matrix is then passed through ``Binarizer``. The length of
    the first row is printed after each step. Nothing is returned.

    Args:
        corpus: Collection of documents accepted by
            ``CountVectorizer.fit_transform``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    # Dense token-count matrix, one row per document.
    counts = CountVectorizer().fit_transform(corpus).toarray()
    print(len(counts[0]))

    # Binarizer (default threshold 0) maps every positive count to 1.
    encoded = Binarizer().fit_transform(counts)
    print(len(encoded[0]))

项目：senti 作者：stevenxxiu | 项目源码 | 文件源码

def fit_voting(self):
        """Build a weighted voting ensemble from externally stored model results.

        For every model name an ``ExternalModel`` is created that points at the
        model's validation and test result files under ``self.data_dir``. The
        per-model weights are then searched with ``basinhopping`` so that the
        number of validation documents whose arg-max class matches the encoded
        validation label is maximised. The first model's weight is pinned to 1
        during the search and all weights are normalised to sum to 1 afterwards.

        Returns:
            A tuple ``(name, estimator)`` where ``name`` is
            ``'vote(<comma-separated model names>)'`` and ``estimator`` is a
            ``VotingClassifier`` whose ``le_`` and ``estimators_`` attributes
            are set by hand instead of by calling ``fit``.
        """
        voting = 'soft'
        names = [
            # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
            # 'elongated,negation_count)',
            # 'logreg(w2v_doc)',
            # 'logreg(w2v_word_avg_google)',
            'word2vec_bayes',
            'cnn_word(embedding=google)',
            'rnn_word(embedding=google)',
        ]
        # NOTE(review): ExternalModel presumably serves the scores stored in
        # these JSON files for the matching document set - confirm.
        classifiers = [ExternalModel({
            self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
            self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
        }) for name in names]
        all_scores = []
        for classifier in classifiers:
            scores = classifier.predict_proba(self.val_docs)
            if voting == 'hard':
                # threshold must be passed by keyword: scikit-learn estimators
                # no longer accept positional constructor arguments.
                scores = Binarizer(threshold=1 / 3).transform(scores)
            all_scores.append(scores)
        # Stacked scores; indexed below as (classifier, document, class).
        all_scores = np.array(all_scores)
        all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]
        le = LabelEncoder().fit(self.classes_)
        val_label_indexes = le.transform(self.val_labels())
        # assume w_0=1 as w is invariant to scaling
        # Objective: negated count of validation documents whose arg-max over
        # the weighted score sum equals the true label index.
        w = basinhopping(
            lambda w_: -(val_label_indexes == np.argmax((
                all_scores_first + all_scores_rest * w_.reshape((len(w_), 1, 1))
            ).sum(axis=0), axis=1)).sum(), np.ones(len(classifiers) - 1), niter=1000,
            minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * (len(classifiers) - 1))
        ).x
        # Re-attach the pinned first weight, then normalise to sum to 1.
        w = np.hstack([[1], w])
        w /= w.sum()
        logging.info('w: {}'.format(w))
        estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
        # The classifiers are not refitted; the attributes are assigned directly.
        estimator.le_ = le
        estimator.estimators_ = classifiers
        return 'vote({})'.format(','.join(names)), estimator

项目：kdd2017 作者：JinpengLI | 项目源码 | 文件源码

def transform_data(x_i, le, with_fit=True):
    """Apply a scikit-learn preprocessing transformer to one feature array.

    Args:
        x_i: NumPy array holding the values of a single feature.
        le: A ``MinMaxScaler``, ``Binarizer`` or ``LabelEncoder`` instance.
        with_fit: When true, ``le`` is fitted on ``x_i`` before transforming;
            otherwise ``le`` is used as it is.

    Returns:
        A tuple ``(transformed, le)`` where ``transformed`` is the transformed
        data flattened to one dimension and ``le`` is the same transformer
        object that was passed in.

    Raises:
        ValueError: If ``le`` is none of the supported transformer types.
    """
    if isinstance(le, (preprocessing.MinMaxScaler, preprocessing.Binarizer)):
        # The ``np.float`` alias was removed from NumPy; the builtin ``float``
        # selects the same float64 dtype on every NumPy version.
        column = x_i.astype(float).reshape((-1, 1))
        if with_fit:
            le.fit(column)
        x_i = le.transform(column)
    elif isinstance(le, preprocessing.LabelEncoder):
        if with_fit:
            le.fit(x_i)
        # LabelEncoder works on 1-D label arrays; handing it a column vector
        # only triggers a DataConversionWarning before it is flattened anyway.
        x_i = le.transform(x_i.reshape(-1))
    else:
        raise ValueError("unknow transform")
    return x_i.reshape((-1)), le

项目：ML-note 作者：JasonK93 | 项目源码 | 文件源码

def test_Binarizer():
    '''
    Demonstrate the Binarizer transform with a threshold of 2.5.

    Prints a small integer matrix before and after it is binarized.
    :return: None
    '''
    X = [
        [1, 2, 3, 4, 5],
        [5, 4, 3, 2, 1],
        [3, 3, 3, 3, 3],
        [1, 1, 1, 1, 1],
    ]
    print("before transform:", X)
    # transform() is called without a prior fit(), exactly as in the original.
    binarized = Binarizer(threshold=2.5).transform(X)
    print("after transform:", binarized)

项目：python_utils 作者：Jayhello | 项目源码 | 文件源码

def test_binarizer():
    """Print the result of binarizing ``[0, 1, 2, 3, 4]`` with threshold 2.

    Values strictly greater than the threshold become 1, all others 0.
    """
    from sklearn.preprocessing import Binarizer
    arr = np.array([0, 1, 2, 3, 4])
    # scikit-learn transformers expect 2-D input (n_samples, n_features), so
    # the vector is presented as a single sample with five features.
    print(Binarizer(threshold=2).fit_transform(arr.reshape(1, -1)))
    # [[0 0 0 1 1]]

项目：visual-search 作者：GYXie | 项目源码 | 文件源码

def main():
    """Query the stored features with one image and save a grid of matches.

    The image at ``args.img_file_path`` and four variants of it (watermark,
    rotate, crop, mirror) are used as queries. Their extracted features are
    binarized and compared by cosine distance against the features loaded
    from ``fc6.npy``. For every query the ten closest dataset images are
    printed and drawn next to the query, and the figure is written to
    ``<img_file_path>_search_result.jpg``. Timing information is printed.
    """
    started = time.time()
    source_img = imread(args.img_file_path)
    imgs = [
        source_img,
        watermark(source_img),
        rotate(source_img),
        crop(source_img),
        mirror(source_img),
    ]
    imgs_norm = image_normalize(imgs)
    dataset_features = np.load('fc6.npy')

    query_start = time.time()
    query_features = extract_feature(imgs_norm)
    # Only the query features are binarized here; the dataset features are
    # used exactly as loaded.
    query_features = preprocessing.Binarizer().fit(query_features).transform(query_features)
    print(dataset_features)
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    distances = distance.cdist(dataset_features, query_features, 'cosine')
    print(distances.shape)
    # Sort along axis 0 so that column i lists dataset row indices ordered by
    # ascending distance to query i.
    # https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
    inds_all = argsort(distances, axis=0)
    print('query cost: %f, dataset: %d, query: %d' % (time.time() - query_start, len(dataset_features), len(imgs)))
    img_names = load_image_names()

    # One row per query: the query image followed by its ten best matches.
    fig, axes = plt.subplots(5, 11, figsize=(22, 10), subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.15, wspace=0.01, left=.02, right=.98, top=.92, bottom=.08)
    titles = ['original', 'watermark', 'rotate', 'crop', 'mirror']
    for row, (query_img, label) in enumerate(zip(imgs, titles)):
        ranked = inds_all[:, row]
        top_names = []
        for rank in range(10):
            hit = ranked[rank]
            top_names.append(img_names[hit])
            print(hit, distances[hit, row], img_names[hit])

        query_ax = axes[row, 0]
        query_ax.set_title(label)
        query_ax.imshow(query_img)
        for col, match_name in enumerate(top_names, start=1):
            ax = axes[row, col]
            ax.imshow(imread(match_name))
            ax.set_title('%d : %f' % (col, distances[ranked[col - 1], row]))

    save_path = args.img_file_path + '_search_result.jpg'
    plt.savefig(save_path)
    print(time.time() - started)
    # os.system('open -a Preview.app -F ' + save_path)

项目：visual-search 作者：GYXie | 项目源码 | 文件源码

def main():
    """Extract fc6 features for all input images and save them to disk.

    The image names returned by ``load_image_names(args.input_data_dir)`` are
    written one per line to ``args.output_image_name_file``. The images are
    then pushed through the model in batches of 100, the ``fc6`` outputs are
    stacked into one array, and that array is stored with ``np.save`` in
    ``args.output_feature_file``. The elapsed extraction time is printed.
    """
    x, fc6 = initModel()
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    img_names = load_image_names(args.input_data_dir)

    with open(args.output_image_name_file, 'w') as img_names_file:
        for img_name in img_names:
            img_names_file.write(img_name + '\n')

    t = time.time()
    # Process the images in fixed-size batches.
    batch_size = 100
    features = []
    # Slicing clamps at the end of the list, so the final (possibly shorter)
    # batch needs no special case.
    for i, start in enumerate(range(0, len(img_names), batch_size)):
        print('batch: %d' % i)
        img_batch = load_images(img_names[start:start + batch_size])
        features.append(sess.run(fc6, feed_dict={x: img_batch}))
    features = np.vstack(features)
    # NOTE: the features are stored as extracted; a Binarizer step that used
    # to sit here was already disabled in the original code.

    # np.save writes bytes, so the file has to be opened in binary mode.
    with open(args.output_feature_file, 'wb') as output_file:
        np.save(output_file, features)

    print(time.time() - t)