我们从 Python 开源项目中，提取了以下 8 个代码示例，用于说明如何使用 sklearn.preprocessing.Binarizer()。
def new(n_feature=128):
    """Create the three unfitted stages of the tag-hasher model.

    The stages are meant to be chained as
    count vectoriser => NMF (smoother) => binariser.

    Args:
        n_feature: Used both as the vocabulary cap of the vectoriser and as
            the number of NMF components.

    Returns:
        A list ``[vectorizer, smoother, binariser]`` of unfitted estimators.
    """
    vectoriser_options = {
        'encoding': 'utf-8',
        'ngram_range': (1, 1),  # unigrams only
        'max_features': n_feature,
        'binary': True,
    }
    count_stage = CountVectorizer(**vectoriser_options)

    # Gap filling for missing expected tags.
    # Hypothesis: some tags are related to each other, so the missing values
    # are smoothed with a matrix factorisation.
    nmf_stage = NMF(n_components=n_feature)

    # Final stage binarises each individual value of the vector.
    threshold_stage = Binarizer(copy=True)

    print(colored('Taghasher model created', 'yellow'))
    return [count_stage, nmf_stage, threshold_stage]
def sklearn_one_hot_vectorize(corpus):
    """One-hot encode a text corpus with scikit-learn.

    Token counts are produced by ``CountVectorizer`` and then thresholded by
    a default ``Binarizer`` so that every non-zero count becomes 1.

    The width of the first row is printed before and after binarisation
    (the two numbers are the vocabulary size).

    Args:
        corpus: Iterable of raw text documents accepted by
            ``CountVectorizer.fit_transform``.

    Returns:
        The dense one-hot matrix, one row per document. Earlier versions of
        this function returned ``None``; callers that ignore the result are
        unaffected.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    counts = CountVectorizer().fit_transform(corpus)
    # Densify once and reuse the array instead of calling toarray() twice.
    dense_counts = counts.toarray()
    print(len(dense_counts[0]))

    vectors = Binarizer().fit_transform(dense_counts)
    print(len(vectors[0]))
    return vectors
def fit_voting(self):
    """Build a weighted voting ensemble over pre-computed external models.

    Each member is an ``ExternalModel`` that maps ``self.val_docs`` and
    ``self.test_docs`` to JSON result files under ``self.data_dir``. The
    per-model weights are searched with ``scipy.optimize.basinhopping`` so
    that the weighted sum of the members' validation probabilities gives
    the largest number of correct validation labels.

    Returns:
        A ``(name, estimator)`` tuple where ``name`` is
        ``'vote(<comma separated member names>)'`` and ``estimator`` is a
        ``VotingClassifier`` whose ``le_`` and ``estimators_`` attributes
        are assigned by hand (presumably so it can predict without
        ``fit()`` being called -- confirm against the callers).
    """
    voting = 'soft'
    names = [
        # Disabled ensemble members:
        # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
        # 'elongated,negation_count)',
        # 'logreg(w2v_doc)',
        # 'logreg(w2v_word_avg_google)',
        'word2vec_bayes',
        'cnn_word(embedding=google)',
        'rnn_word(embedding=google)',
    ]
    classifiers = [ExternalModel({
        self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
        self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
    }) for name in names]

    all_scores = []
    for classifier in classifiers:
        scores = classifier.predict_proba(self.val_docs)
        if voting == 'hard':
            # threshold must be passed by keyword on current scikit-learn;
            # 1.0 / 3 keeps the value one third under Python 2 as well.
            scores = Binarizer(threshold=1.0 / 3).transform(scores)
        all_scores.append(scores)
    all_scores = np.array(all_scores)
    all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]

    le = LabelEncoder().fit(self.classes_)
    val_label_indexes = le.transform(self.val_labels())

    def negative_hits(w_):
        """Return minus the number of correctly predicted validation labels."""
        # Collapse the weighted "rest" models BEFORE adding the first model.
        # Adding first and summing afterwards would broadcast the first
        # model's scores into every slice and count them len(w_) times,
        # contradicting the w_0 = 1 assumption used for the final weights.
        weighted_rest = (all_scores_rest * w_.reshape((len(w_), 1, 1))).sum(axis=0)
        predicted = np.argmax(all_scores_first + weighted_rest, axis=1)
        return -(val_label_indexes == predicted).sum()

    free_weights = len(classifiers) - 1
    # assume w_0=1 as w is invariant to scaling
    w = basinhopping(
        negative_hits,
        np.ones(free_weights),
        niter=1000,
        minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * free_weights),
    ).x
    w = np.hstack([[1], w])
    w /= w.sum()  # normalise so the weights sum to 1
    logging.info('w: {}'.format(w))

    estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
    estimator.le_ = le
    estimator.estimators_ = classifiers
    return 'vote({})'.format(','.join(names)), estimator
def transform_data(x_i, le, with_fit=True):
    """Apply a scikit-learn preprocessor to a single feature array.

    Args:
        x_i: NumPy array holding the values of one feature.
        le: A ``MinMaxScaler``, ``Binarizer`` or ``LabelEncoder`` instance.
        with_fit: When true, ``le`` is fitted on ``x_i`` before transforming.

    Returns:
        A ``(values, le)`` tuple: the transformed values flattened to 1-D
        and the (possibly newly fitted) preprocessor.

    Raises:
        ValueError: If ``le`` is none of the supported preprocessor types.
    """
    if isinstance(le, (preprocessing.MinMaxScaler, preprocessing.Binarizer)):
        # np.float was only an alias of the builtin float and was removed in
        # NumPy 1.24; the builtin gives the same float64 dtype everywhere.
        column = x_i.astype(float).reshape((-1, 1))
        if with_fit:
            le.fit(column)
        x_i = le.transform(column)
    elif isinstance(le, preprocessing.LabelEncoder):
        # LabelEncoder works on 1-D label arrays; a column vector is accepted
        # but emits a DataConversionWarning, so flatten first.
        labels = x_i.reshape(-1)
        if with_fit:
            le.fit(labels)
        x_i = le.transform(labels)
    else:
        raise ValueError("unknow transform")
    return x_i.reshape((-1)), le
def test_Binarizer():
    """Demonstrate the Binarizer transformer on a small integer matrix.

    Prints the matrix, then prints the result of thresholding it at 2.5.

    :return: None
    """
    samples = [
        list(range(1, 6)),
        list(range(5, 0, -1)),
        [3] * 5,
        [1] * 5,
    ]
    print("before transform:", samples)
    thresholded = Binarizer(threshold=2.5).transform(samples)
    print("after transform:", thresholded)
def test_binarizer():
    """Print the result of binarising ``[0, 1, 2, 3, 4]`` with threshold 2.

    Values strictly greater than the threshold become 1, the rest 0.
    """
    from sklearn.preprocessing import Binarizer

    arr = np.array([0, 1, 2, 3, 4])
    # scikit-learn transformers expect 2-D input (n_samples, n_features);
    # reshape to a single row, matching the documented output below.
    # print() with one argument behaves the same on Python 2 and 3.
    print(Binarizer(threshold=2).fit_transform(arr.reshape(1, -1)))  # [[0 0 0 1 1]]
def main():
    """Run an image similarity query and save a result grid.

    The image at ``args.img_file_path`` and four variants of it (produced by
    the ``watermark``, ``rotate``, ``crop`` and ``mirror`` helpers) are turned
    into features, binarised, and compared by cosine distance against the
    features stored in ``fc6.npy``. For every query the ten closest dataset
    images are printed and drawn next to the query in a 5 x 11 grid, which is
    written to ``<img_file_path>_search_result.jpg``.
    """
    started = time.time()

    source_img = imread(args.img_file_path)
    imgs = [
        source_img,
        watermark(source_img),
        rotate(source_img),
        crop(source_img),
        mirror(source_img),
    ]
    imgs_norm = image_normalize(imgs)
    dataset_features = np.load('fc6.npy')

    query_start = time.time()
    raw_query_features = extract_feature(imgs_norm)
    query_features = preprocessing.Binarizer().fit(raw_query_features).transform(raw_query_features)
    print(dataset_features)

    # Cosine distance, one row per dataset image and one column per query:
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    dis = distance.cdist(dataset_features, query_features, 'cosine')
    print(dis.shape)

    # Column-wise ascending order, i.e. closest dataset image first:
    # https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
    inds_all = argsort(dis, axis=0)
    print('query cost: %f, dataset: %d, query: %d'
          % (time.time() - query_start, len(dataset_features), len(imgs)))

    img_names = load_image_names()
    fig, axes = plt.subplots(5, 11, figsize=(22, 10),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.15, wspace=0.01,
                        left=.02, right=.98, top=.92, bottom=.08)
    titles = ['original', 'watermark', 'rotate', 'crop', 'mirror']

    for row, query_img in enumerate(imgs):
        ranked = inds_all[:, row]

        # Report the ten best hits for this query and remember their paths.
        top_names = []
        for rank in range(10):
            hit = ranked[rank]
            top_names.append(img_names[hit])
            print(hit, dis[hit, row], img_names[hit])

        # First column shows the query itself.
        query_ax = axes[row, 0]
        query_ax.set_title(titles[row])
        query_ax.imshow(query_img)

        # Remaining columns show the hits, titled "<rank> : <distance>".
        for col, hit_name in enumerate(top_names, start=1):
            hit_ax = axes[row, col]
            hit_ax.imshow(imread(hit_name))
            hit_ax.set_title('%d : %f' % (col, dis[ranked[col - 1], row]))

    savePath = args.img_file_path + '_search_result.jpg'
    plt.savefig(savePath)
    print(time.time() - started)
    # os.system('open -a Preview.app -F ' + savePath)
def main():
    """Extract fc6 features for a directory of images and save them.

    The image paths returned by ``load_image_names(args.input_data_dir)`` are
    written one per line to ``args.output_image_name_file``. The images are
    then pushed through the ``fc6`` tensor of the model in batches of 100 and
    the stacked result is stored with ``np.save`` in
    ``args.output_feature_file``. The elapsed extraction time is printed.

    Raises:
        ValueError: From ``np.vstack`` when there are no images to process.
    """
    x, fc6 = initModel()
    init = tf.global_variables_initializer()

    # The context manager closes the session (and frees its resources) even
    # when feature extraction fails part-way through.
    with tf.Session() as sess:
        sess.run(init)

        img_names = load_image_names(args.input_data_dir)
        with open(args.output_image_name_file, 'w') as img_names_file:
            for img_name in img_names:
                img_names_file.write(img_name + '\n')

        t = time.time()

        # Run the network batch by batch to bound memory use.
        batch_size = 100
        features = []
        # np.save writes bytes, so the target must be opened in binary mode;
        # text mode raises TypeError on Python 3.
        with open(args.output_feature_file, 'wb') as output_file:
            # Slicing clamps at the end of the list, so the last (short)
            # batch needs no special case.
            for i, start in enumerate(range(0, len(img_names), batch_size)):
                print('batch: %d' % i)
                img_names_batch = img_names[start:start + batch_size]
                img_batch = load_images(img_names_batch)
                output = sess.run(fc6, feed_dict={x: img_batch})
                features.append(output)
            features = np.vstack(features)
            # NOTE: the features are stored as-is; binarising them with
            # preprocessing.Binarizer before saving is currently disabled.
            np.save(output_file, features)

        print(time.time() - t)