The following are 50 code examples, extracted from open source Python projects, that illustrate how to use sklearn.utils.shuffle().
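For reference, sklearn.utils.shuffle takes any number of equal-length arrays (or other indexable containers) and returns them shuffled with the same permutation, so feature rows stay aligned with their labels; passing random_state makes the result reproducible. A minimal sketch with illustrative data (the array names here are only examples, not from any of the projects below):

import numpy as np
from sklearn.utils import shuffle

# Illustrative data: five samples with three features each, plus matching labels.
X = np.arange(15).reshape(5, 3)
y = np.array([0, 1, 0, 1, 1])

# Both arrays are shuffled with the same permutation, keeping rows and labels aligned;
# random_state makes the shuffle reproducible.
X_shuffled, y_shuffled = shuffle(X, y, random_state=0)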
def bagged_set(X, y, model, seed, estimators, xt, update_seed=True):
    # create array object to hold predictions
    baggedpred = [0.0 for d in range(0, xt.shape[0])]
    # loop for as many times as we want bags
    for n in range(0, estimators):
        # shuffle first, aids in increasing variance and forces different results
        X_t, y_c = shuffle(X, y, random_state=seed + n)
        if update_seed:  # update seed if requested, to give a slightly different model
            model.set_params(random_state=seed + n)
        model.fit(X_t, y_c)  # fit model
        preds = model.predict_proba(xt)[:, 1]  # predict probabilities
        # update bag's array
        for j in range(0, xt.shape[0]):
            baggedpred[j] += preds[j]
    # divide with number of bags to create an average estimate
    for j in range(0, len(baggedpred)):
        baggedpred[j] /= float(estimators)
    # return probabilities
    return np.array(baggedpred)
def load_data(): """ Get and normalize data with labels, split into training, validation and test set. Automatically shuffles the training batch. """ data = _load_data() X_train, y_train = data[0] X_valid, y_valid = data[1] X_test, y_test = data[2] X_train = X_train.reshape((-1, 1, 28, 28)).astype(np.float32) X_valid = X_valid.reshape((-1, 1, 28, 28)).astype(np.float32) X_test = X_test.reshape((-1, 1, 28, 28)).astype(np.float32) y_train = y_train.astype(np.int32) y_valid = y_valid.astype(np.int32) y_test = y_test.astype(np.int32) X_train, y_train = shuffle(X_train, y_train, random_state=0) return X_train, y_train, X_valid, y_valid, X_test, y_test # --------------- Network architectures ---------------
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't have all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
def generate_idxs(self, dataset_len):
    if self.subsample == 1:
        return repeat(list(range(dataset_len)))
    batch_size = int(dataset_len * self.subsample) \
        if self.subsample < 1 else self.subsample
    if batch_size > dataset_len:
        raise Exception("dataset subset is larger than dataset")

    def gen(bs):
        rs = np.random.RandomState(seed=self.seed + 1000)
        idxs = list(range(dataset_len))
        while True:
            rs.shuffle(idxs)
            yield idxs[:bs]

    return gen(batch_size)
def __init__(self, X, y, batch_size, shuffle=True, valid=False):
    """
    Constructor
    :param X: list of samples (list of lists of char_ids)
    :param y: list of labels (list of probability (of name being a male name))
    :param batch_size: the size of samples in a batch
    :param shuffle: if True, shuffle the data in every new epoch
    :param valid: if True, finish iterating on the data after one pass
    """
    assert isinstance(X, list), 'Invalid argument type type(X) = {}'.format(type(X))
    assert isinstance(y, list), 'Invalid argument type type(y) = {}'.format(type(y))
    assert len(X) == len(y), 'len(X) != len(y)'
    assert batch_size > 0, 'batch_size <= 0'

    # BatchGenerator shouldn't have a by-product
    self._X = deepcopy(X)
    self._y = deepcopy(y)

    self._batch_id = 0
    self._batch_size = batch_size
    self._shuffle = shuffle
    self._valid = valid
    self._data_size = len(self._X)
    self._finish = False
def _gen_batch(self, batch_id, batch_size, data_size):
    """Generate batch for given X, y, batch_id, batch_size, and data_size."""
    start_index = (batch_id * batch_size) % data_size
    end_index = ((batch_id + 1) * batch_size) % data_size
    if start_index < end_index:
        return (deepcopy(self._X[start_index: end_index]),
                deepcopy(self._y[start_index: end_index]))
    else:
        # executing here means you have gone over X and y already
        X_first = deepcopy(self._X[start_index:])
        y_first = deepcopy(self._y[start_index:])
        if self._valid:
            self._finish = True
            return X_first, y_first
        # shuffle X and y after going over them if shuffle is True
        if self._shuffle:
            self._X, self._y = shuffle(self._X, self._y)
        X_second = deepcopy(self._X[:end_index])
        y_second = deepcopy(self._y[:end_index])
        return X_first + X_second, y_first + y_second
def svc_model(self, X, y, x_test, y_test, x_val, y_val, i, j):
    X, y = shuffle(X, y, random_state=self.SEED)

    clf = SVC(C=self.C, kernel='rbf', gamma=self.gamma,
              cache_size=self.cache_size, verbose=0, random_state=self.SEED)
    model = clf.fit(X, y)

    yhat_train = model.predict(X)
    yhat_val = model.predict(x_val)
    yhat_test = model.predict(x_test)

    train_error = (1 - accuracy_score(y, yhat_train)) * 100
    val_error = (1 - accuracy_score(y_val, yhat_val)) * 100
    test_error = (1 - accuracy_score(y_test, yhat_test)) * 100

    self.warn_log.append([i, train_error, val_error, test_error])

    return model
def preprocess(image_shape, image_paths, labels=[]):
    features = []
    for image_path in tqdm(image_paths):
        image_data = list(Image.open(image_path).resize(image_shape[:2]).getdata())
        image_data = np.asarray(image_data).reshape(image_shape)
        features.append(image_data)

    # Normalize
    features = np.asarray(features)
    features = features / 255.0

    if labels:
        # one-hot encode
        label_binarizer = LabelBinarizer()
        labels = label_binarizer.fit_transform(labels)

        # Shuffle
        features, labels = shuffle(features, labels)

    return features, labels
def maybe_cache_featurs_labels(img_shape):
    chunk_size = 500
    data_folder = './data/train/*/*.jpg'
    windows_folder = './data/windows/'
    if not isdir(windows_folder):
        makedirs(windows_folder)

    for features_file, labels_files, image_batch in tqdm(
            list(missing_chunks(chunk_size, data_folder, windows_folder)), unit='batch'):
        features, labels = get_feature_labels(image_batch, img_shape)
        features = features / 255
        features, labels = shuffle(features, labels)

        # Save Features and Labels
        assert len(features) == len(labels)
        np.save(features_file, features)
        np.save(labels_files, labels)
def cutData():
    df_all = pd.read_csv('data/cutData/train_time_v9.csv')
    df_2 = pd.read_csv('day30/data/train30_v9.csv')
    print('????')
    df_all = df_all.append(df_2)
    del df_2
    df_all = shuffle(df_all, random_state=42)
    step = len(df_all) // 5
    train1 = df_all[0:step]
    train2 = df_all[step:2 * step]
    train3 = df_all[2 * step:3 * step]
    train4 = df_all[3 * step:4 * step]
    train5 = df_all[4 * step:]
    del df_all
    return train1, train2, train3, train4, train5
def dealingUnbalancedData(self):
    """ Dealing with unbalanced training data """
    len0 = np.count_nonzero(1 - self.y_train)
    len1 = np.count_nonzero(self.y_train)
    dup = int(len0 / len1)
    dup = int(dup * 1.5)  # change this value, make it more possible to predict buy.

    X1 = self.X_train[np.where(self.y_train == 1)[0], :]
    y1 = self.y_train[np.where(self.y_train == 1)[0], :]
    y2 = self.y_train_price[np.where(self.y_train == 1)[0], :]

    X1 = np.tile(X1, (dup - 1, 1))
    y1 = np.tile(y1, (dup - 1, 1))
    y2 = np.tile(y2, (dup - 1, 1))

    self.X_train = np.concatenate((self.X_train, X1), axis=0)
    self.y_train = np.concatenate((self.y_train, y1), axis=0)
    self.y_train_price = np.concatenate((self.y_train_price, y2), axis=0)

    # shuffle train data
    self.X_train, self.y_train, self.y_train_price = shuffle(
        self.X_train, self.y_train, self.y_train_price, random_state=42)
def display(self, count=10):
    """
    count : number of samples to display after prediction
    """
    self.images, self.labels = shuffle(self.images, self.labels)
    display_images, display_labels = self.images[:count], self.labels[:count]
    self.test(display_images, verbose=False)
    images_and_prediction = list(zip(display_images, self.prediction, display_labels))
    for index, (image, prediction, actual) in enumerate(images_and_prediction):
        plt.subplot(count / 3, count / 2, index + 1)
        plt.axis("off")
        image = image.reshape(112, 92)
        plt.imshow(image, cmap="Greys_r")
        plt.title("{}/{}".format(prediction, actual))
    plt.show()
def getKaggleMNIST():
    # MNIST data:
    # column 0 is labels
    # column 1-785 is data, with values 0 .. 255
    # total size of CSV: (42000, 1, 28, 28)
    train = pd.read_csv('../large_files/train.csv').as_matrix()
    train = shuffle(train)

    Xtrain = rearrange(train[:-1000, 1:])
    Ytrain = train[:-1000, 0]
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(train[-1000:, 1:])
    Ytest = train[-1000:, 0]
    Ytest_ind = y2indicator(Ytest)

    return Xtrain, Ytrain, Ytrain_ind, Xtest, Ytest, Ytest_ind
def loadImages(datadir, maxDirectoryCount=10, split=0.9):
    for dirPath, dirNames, fileNames in os.walk(datadir):
        fileNames = [f for f in fileNames if not f[0] == '.']
        dirNames[:] = [d for d in dirNames if not d[0] == '.']
        if (maxDirectoryCount != 0):
            fullSizeFileNames = [fileName for fileName in fileNames
                                 if fileName.endswith("@2x.png")
                                 and (fileName.replace("@2x", "") in fileNames)]
            for fullSizeFileName in fullSizeFileNames:
                inputImage = io.imread(dirPath + "/" + fullSizeFileName)
                targetImage = io.imread(dirPath + "/" + fullSizeFileName.replace("@2x", ""))
                # print(dirPath + "/" + fullSizeFileName)
                inputSlices, targetSlices = sliceImages(inputImage, targetImage)
                # print("got", len(inputSlices), "input splices and", len(targetSlices), "targetSlices")
                inputImages.extend(inputSlices)
                targetImages.extend(targetSlices)
            maxDirectoryCount -= 1

    x, y = np.asarray(inputImages), np.asarray(targetImages)

    x_train = x[:int(len(x) * split)]
    y_train = y[:int(len(y) * split)]
    x_test = x[int(len(x) * split):]
    y_test = y[int(len(y) * split):]

    # Shuffle training data so that repeats aren't in the same batch
    # x_train, y_train = shuffle(x_train, y_train, random_state=0)

    return (x_train, y_train, x_test, y_test)
def load_data(test=False):
    fname = FTEST if test else FTRAIN
    df = pd.read_csv(fname)
    cols = df.columns[:-1]

    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' ') / 255.0)
    df = df.dropna()

    X = np.vstack(df['Image'])
    X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)

    if not test:
        # y = (df[cols].values - 48) / 48.0
        y = df[cols].values / 96.0
        X, y = shuffle(X, y)
        joblib.dump(cols, 'data/cols.pkl', compress=3)
    else:
        y = None

    return X, y
def extract_train_data(df, flip_indices, cols):
    data = df[list(cols) + ['Image']].copy()
    data = data.dropna()

    X = np.vstack(data['Image'].values)
    X = X.astype(np.float32)
    X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)

    y = data[data.columns[:-1]].values

    if len(flip_indices) != 0:
        X_flip = X[:, :, ::-1, :]
        X = np.vstack([X, X_flip])

        y_flip = y.copy()
        y_flip[:, ::2] *= -1
        y_flip[:, ::2] += 1
        for a, b in flip_indices:
            y_flip[:, [a, b]] = y_flip[:, [b, a]]
        y = np.vstack([y, y_flip])

    X, y = shuffle(X, y, random_state=42)  # shuffle train data
    y = y.astype(np.float32)

    return X, y
def next_batch(self, batch_size, shuffle_data):
    start = self.__index_in_epoch
    self.__index_in_epoch += batch_size

    # Current epoch is finished (used all examples)
    if self.__index_in_epoch > self.__num_examples:
        self.__epochs_completed += 1

        # reshuffle data for next epoch
        if shuffle_data:
            self.__X, self.__y = shuffle(self.__X, self.__y)

        start = 0
        self.__index_in_epoch = batch_size

        # make sure batch size is smaller than the actual number of examples
        assert batch_size <= self.__num_examples

    end = self.__index_in_epoch
    return self.__X[start:end], self.__y[start:end]
def transform_feild4rnn(power, sequence_length):
    result = []
    for index in range(len(power) - sequence_length):
        result.append(power[index: index + sequence_length])
    result = np.array(result)  # shape (2049230, 50)
    # print("result", result)
    # print("result.shape", result.shape)

    # result_mean = result.mean()
    # result -= result_mean
    # print "Shift : ", result_mean
    # print "Data : ", result.shape

    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    # np.random.shuffle(train)
    # print("train", train.shape)
    # print("train", train)
    X_train = train[:, :-1]
    y_train = train[:, -1]
    X_test = result[row:, :-1]
    y_test = result[row:, -1]

    return [X_train, y_train, X_test, y_test]
def __init__(self, X_train, X_test, y_train, kf_n, verbose=1):
    '''Stacking for models except "neural network" and "xgboost"

    Parameters
    ----------
    X_train: numpy array
        training data
    X_test: numpy array
        testing data
    y_train: numpy array
        training target
    kf_n: KFold object from the model_selection module

    Example
    -------
    kf_5 = KFold(n_splits=5, random_state=413, shuffle=True)
    stack_generater = ka_stacking_generalization(X.values, X_test.values, y.values, kf_5)
    '''
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.kf_n = kf_n
    self.verbose = verbose
    self.cv_info = {'lgbm_info': {'cv_scores': [], 'cv_rounds': [], 'cv_losses': []}}
def ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed, bag_round, params,
                                  X_test, using_notebook=True, num_boost_round=0):
    '''
    early version
    '''
    # create array object to hold predictions
    baggedpred = np.zeros(shape=X_test.shape[0]).astype(np.float32)
    # loop for as many times as we want bags
    if using_notebook:
        for n in tqdm_notebook(range(0, bag_round)):
            # shuffle first, aids in increasing variance and forces different results
            X_train, y_train = shuffle(X_train, y_train, random_state=seed + n)
            params['seed'] = seed + n
            model = lightgbm.train(params, lightgbm.Dataset(X_train, y_train),
                                   num_boost_round=num_boost_round)
            pred = model.predict(X_test)
            baggedpred += pred / bag_round

    return baggedpred
def train_model(lrmodel, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping
    """
    done = False
    best = -1.0
    r = np.arange(1, 6)

    while not done:
        # Every 100 epochs, check Pearson on development set
        lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY))
        yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r)
        score = pearsonr(yhat, devscores)[0]
        if score > best:
            print score
            best = score
            bestlrmodel = copy.deepcopy(lrmodel)
        else:
            done = True

    yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r)
    score = pearsonr(yhat, devscores)[0]
    print 'Dev Pearson: ' + str(score)
    return bestlrmodel
def get_codes(flag=myenum.all, n=100):
    """?????????, enum????myenum
    flag : enum.all ??? , enum.exclude_cyb ?????, enum.rand10 ???10?
    n : enum.rand???
    return: list
    """
    def readTDXlist():
        # ???csv??
        fname = 'datas/tdx_codes.csv'
        df = pd.read_csv(fname, dtype=str, header=None)
        codes = df[0].tolist()
        if len(codes) > 0:
            # ?????????
            dapans = ['399001', '999999', '399005', '399002', '399006', '510050']
            codes = [unicode(code) for code in codes if code not in dapans]
            # codes = filter(lambda x: x[:2] != '88', codes)
        return codes

    key = myredis.enum.KEY_CODES

    # ??ths F10???
    # ???????THS??? ??????? ?????
    val = myredis.createRedisVal(key, readTDXlist)
    codes = val.get()
    if flag == myenum.randn:
        from sklearn.utils import shuffle
        codes = shuffle(codes)
        return list(codes[:n])
    return codes
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
             algorithm='l-bfgs', alpha=0.00001, batch_size=200,
             learning_rate="constant", learning_rate_init=0.5, power_t=0.5,
             max_iter=200, shuffle=False, random_state=None, tol=1e-5,
             verbose=False, warm_start=False):
    sup = super(MultilayerPerceptronClassifier, self)
    sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                 activation=activation, algorithm=algorithm, alpha=alpha,
                 batch_size=batch_size, learning_rate=learning_rate,
                 learning_rate_init=learning_rate_init, power_t=power_t,
                 max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                 random_state=random_state, tol=tol,
                 beta=0, sparsity_param=0,
                 verbose=verbose, warm_start=warm_start)

    self.label_binarizer_ = LabelBinarizer()
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
             algorithm='l-bfgs', alpha=0.00001, batch_size=200,
             learning_rate="constant", learning_rate_init=0.1, power_t=0.5,
             max_iter=100, shuffle=False, random_state=None, tol=1e-5,
             verbose=False, warm_start=False):
    sup = super(MultilayerPerceptronRegressor, self)
    sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                 activation=activation, algorithm=algorithm, alpha=alpha,
                 batch_size=batch_size, learning_rate=learning_rate,
                 learning_rate_init=learning_rate_init, power_t=power_t,
                 max_iter=max_iter, loss='squared_loss', shuffle=shuffle,
                 random_state=random_state, tol=tol,
                 beta=0, sparsity_param=0,
                 verbose=verbose, warm_start=warm_start)
def __init__(self, hidden_layer_sizes=(100,), algorithm='l-bfgs',
             batch_size=200, learning_rate="constant", learning_rate_init=0.5,
             alpha=3e-3, power_t=0.5, max_iter=100, shuffle=False,
             random_state=None, tol=1e-5, beta=3, sparsity_param=0.1,
             verbose=False, warm_start=False):
    sup = super(MultilayerPerceptronAutoencoder, self)
    sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                 activation='logistic', algorithm=algorithm, alpha=alpha,
                 batch_size=batch_size, beta=beta,
                 sparsity_param=sparsity_param,
                 learning_rate=learning_rate,
                 learning_rate_init=learning_rate_init, power_t=power_t,
                 max_iter=max_iter, loss='squared_loss', shuffle=shuffle,
                 random_state=random_state, tol=tol,
                 verbose=verbose, warm_start=warm_start)
def next_batch(self, batch_size, shuffle_data=True):
    start = self.__index_in_epoch
    self.__index_in_epoch += batch_size

    # Current epoch is finished (used all examples)
    if self.__index_in_epoch > self.__num_examples:
        self.__epochs_completed += 1

        # reshuffle data for next epoch
        if shuffle_data:
            self.__X, self.__y = shuffle(self.__X, self.__y)

        start = 0
        self.__index_in_epoch = batch_size

        # make sure batch size is smaller than the actual number of examples
        assert batch_size <= self.__num_examples

    end = self.__index_in_epoch
    return self.__X[start:end], self.__y[start:end]
def get_data(test=False, cols=None):
    """Loads data from FTEST if *test* is True, otherwise from FTRAIN.
    Pass a list of *cols* if you're only interested in a subset of the
    target columns.
    """
    fname = FTEST if test else FTRAIN
    df = pd.read_csv(fname)

    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' '))

    if cols:
        df = df[list(cols) + ['Image']]

    df = df.dropna()

    X = np.vstack(df['Image'].values).astype(np.float32) / 255.
    X = X.astype(np.float32)

    if not test:  # only FTRAIN has any target columns
        y = df[df.columns[:-1]].values
        y = (y - 48) / 48  # scale target coordinates to [-1, 1]
        X, y = shuffle(X, y, random_state=42)  # shuffle train data
        y = y.astype(np.float32)
    else:
        y = None

    return X, y
def batch_iter(doc, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = list()
    for iter in doc:
        data.append(iter)

    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
def batch_generator(q1, q2, y, batch_size=128, shuffle=True, maxlen=238):
    sample_size = q1.shape[0]
    index_array = np.arange(sample_size)

    while 1:
        if shuffle:
            np.random.shuffle(index_array)

        batches = make_batches(sample_size, batch_size)
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            batch_ids = index_array[batch_start:batch_end]
            X_batch_1 = pad_sequences(q1[batch_ids], padding='pre', maxlen=maxlen)
            X_batch_2 = pad_sequences(q2[batch_ids], padding='pre', maxlen=maxlen)
            X_batch = [X_batch_1, X_batch_2]
            y_batch = y[batch_ids]

            yield X_batch, y_batch
def get_batch_generator(x, y, y_error, batch_size):
    n_data = len(x)
    while True:
        if batch_size is None:
            yield x, y, y_error
        else:
            x, y, y_error = shuffle(x, y, y_error)  # XXX: copy ???
            batches = [
                (
                    x[k:k + batch_size],
                    y[k:k + batch_size],
                    y_error[k:k + batch_size],
                )
                for k in range(0, n_data, batch_size)
            ]
            yield batches
def reorderRandomly(X, Y, list_of_images):
    '''
    Reorder in the same way the vector of images and labels

    Parameters
    ------------
    numpy.ndarray, numpy.ndarray
        images and labels

    Returns
    ------------
    numpy.ndarray, numpy.ndarray
        images and labels shuffled in the same way
    '''
    X, Y, list_of_images = shuffle(X, Y, list_of_images, random_state=43)
    return (X, Y, list_of_images)
def make_data(random_state, n_samples_per_center, grid_size, scale):
    random_state = check_random_state(random_state)
    centers = np.array([[i, j]
                        for i in range(grid_size)
                        for j in range(grid_size)])
    n_clusters_true, n_features = centers.shape

    noise = random_state.normal(
        scale=scale, size=(n_samples_per_center, centers.shape[1]))

    X = np.concatenate([c + noise for c in centers])
    y = np.concatenate([[i] * n_samples_per_center
                        for i in range(n_clusters_true)])
    return shuffle(X, y, random_state=random_state)


# Part 1: Quantitative evaluation of various init methods
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
            'y_test': y_test}
    return data
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise, coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
def test_ovo_ties():
    # Test that ties are broken using the decision function,
    # not defaulting to the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron(shuffle=False))
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], normalized_confidences[0].argmax())
def patch_pixel_cluster(rgb_dat, n_colors):
    heb_label = [0]
    orig_img_rgb = rgb_dat
    orig_img = np.array(orig_img_rgb, dtype=np.float64) / 255

    # Load image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(orig_img.shape)
    assert d == 3
    image_array = np.reshape(orig_img, (w * h, d))
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)

    # Get labels for all points
    print("Predicting color indices on the full image (k-means)")
    labels = kmeans.predict(image_array)

    # sort patch labels: the first is nucleus, the second is cytoplasm, the last is background
    sort_label = label_sort(kmeans.cluster_centers_)

    # cluster the three kinds of pixels in the original image
    heb_label = pixels_classify(orig_img_rgb, sort_label, labels, w, h)

    return heb_label
def gen_list(prefix):
    ann_file = '%s2017.json' % prefix
    train_out = '%s.lst' % prefix

    # load annotations
    print('Loading annotations from: ' + os.path.basename(ann_file))
    with open(ann_file) as data_file:
        ann_data = json.load(data_file)

    # set up the filenames and annotations
    imgs = [aa['file_name'] for aa in ann_data['images']]
    im_ids = [aa['id'] for aa in ann_data['images']]

    if 'annotations' in ann_data.keys():
        # if we have class labels
        classes = [aa['category_id'] for aa in ann_data['annotations']]
    else:
        # otherwise we don't have class info, so set to 0
        classes = [0] * len(im_ids)

    idx_to_class = {cc['id']: cc['name'] for cc in ann_data['categories']}

    print('\t' + str(len(imgs)) + ' images')
    print('\t' + str(len(idx_to_class)) + ' classes')

    for index in range(10):
        path = imgs[index]
        target = str(classes[index])
        im_id = str(im_ids[index] - 1)
        print(im_id + '\t' + target + '\t' + path)

    import pandas as pd
    from sklearn.utils import shuffle
    df = pd.DataFrame(classes)
    df[1] = imgs
    df = shuffle(df)
    df.to_csv(train_out, sep='\t', header=None, index=False)
    df = pd.read_csv(train_out, delimiter='\t', header=None)
    df.to_csv(train_out, sep='\t', header=None)
def batch_generator(X_path, y_path, batch_size, horizontal_flip=False, vertical_flip=False):
    X_file_list = os.listdir(X_path)
    num_batches = int(len(X_file_list) / batch_size)
    while True:
        for batch_index in range(num_batches):
            X_batch = np.zeros((batch_size, img_rows, img_cols, num_channels))
            y_batch = np.zeros((batch_size, img_rows, img_cols))

            for i in range(batch_size):
                img_path = X_file_list[batch_size * batch_index + i]
                mask_path = X_file_list[batch_size * batch_index + i][:-3] + 'png'

                img = cv2.imread(os.path.join(X_path, img_path))
                img_mask = cv2.imread(os.path.join(y_path, mask_path), 0)

                yb = img_mask
                xb = img

                if horizontal_flip:
                    if np.random.random() < 0.5:
                        xb = flip_axis(xb, 0)
                        yb = flip_axis(yb, 0)

                if vertical_flip:
                    if np.random.random() < 0.5:
                        xb = flip_axis(xb, 1)
                        yb = flip_axis(yb, 1)

                y_batch[i] = yb
                X_batch[i] = xb

            # X_batch, y_batch = form_batch(X_path, y_path, batch_size, horizontal_flip, vertical_flip)
            # Add augmentations here
            X_batch = normalize(X_batch)
            yield X_batch, np.expand_dims(y_batch.astype(np.uint8), 3)

        X_file_list = shuffle(X_file_list)
def shuffle_lists(*args, **options):
    """
    function which shuffles two lists and keeps their elements aligned
    for now use sklearn, maybe later get rid of dependency
    """
    return shuffle(*args, **options)
def randomize(self):
    self.X, self.Y, self.rnd_idx = shuffle(self.X, self.Y, self.rnd_idx)
def quantize(cls, raster, n_colors, **kwargs):
    width, height, depth = raster.shape

    reshaped_raster = np.reshape(raster, (width * height, depth))

    palette = shuffle(reshaped_raster)[:n_colors]
    labels = pairwise_distances_argmin(reshaped_raster, palette)

    quantized_raster = cls._recreate_image(palette, labels, width, height)

    return quantized_raster
def balance_shuffle_indices(y, random_state=None, weight=BALANCE_WEIGHTS):
    y = np.asarray(y)
    counter = Counter(y)
    max_count = np.max(counter.values())
    indices = []
    for cls, count in counter.items():
        ratio = weight * max_count / count + (1 - weight)
        idx = np.tile(np.where(y == cls)[0],
                      np.ceil(ratio).astype(int))
        np.random.shuffle(idx)
        indices.append(idx[:max_count])
    return shuffle(np.hstack(indices), random_state=random_state)
def shuffle(*arrays, **options):
    if isinstance(arrays[0][0], basestring):
        return list_shuffle(*arrays)
    else:
        return skutils.shuffle(*arrays, random_state=np_rng)
def data_augmentation(dataset, future=False):
    x, y = [], []
    for row in dataset:
        # materialize the reversed sequence so len() and slicing below work
        row = list(reversed(row)) if future else row
        for idx in range(0, len(row) - 1):
            x.append([e + 1 for e in row[0:idx + 1]])
            y.append(row[idx + 1])
    return shuffle(x, y)
def reshuffle_dataset(X_data_set, y_data_set):
    return shuffle(np.stack(X_data_set, axis=0),
                   np.stack(y_data_set, axis=0),
                   random_state=0)
def __init__(self, filename='./corpus/train.csv'):
    if os.path.exists(filename):
        data = pd.read_csv(filename)
        self.data = shuffle(data)

        X_data = pd.DataFrame(data.drop('sentiment', axis=1))
        Y_data = column_or_1d(data[:]['sentiment'], warn=True)
        self.X_train, self.X_val, \
            self.y_train, self.y_val = train_test_split(X_data, Y_data,
                                                        test_size=0.3,
                                                        random_state=1)
        self.model = None
        self.load_model()
        self.preprocessor = Preprocessor.Preprocessor()
    else:
        print('No Source!')
        self.preprocessor.process_data()
def model_from_thumbnails(train_x, train_y, val_x, val_y):
    n_obs, n_channels, n_rows, n_cols = train_x.shape
    n_classes = train_y.shape[1]  # number of output classes from the one-hot training labels

    model = Sequential()
    model.add(Convolution2D(32, 2, 2, border_mode='valid', activation='relu',
                            input_shape=(n_channels, n_rows, n_cols)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 2, 2, border_mode='valid', activation='relu'))
    model.add(Convolution2D(64, 2, 2, border_mode='valid', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 2, 2, border_mode='valid', activation='relu'))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))

    optimizer = Adam()
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    stopper = EarlyStopping(monitor='val_loss', patience=15, verbose=0, mode='auto')

    model.fit(train_x, train_y, shuffle=True, nb_epoch=100,
              validation_data=(val_x, val_y), callbacks=[stopper])
    return model
def parseTestSet():
    # get classes of training set (subfolders as class labels; has to be the same
    # as during training, shuffled or alphabetically)
    classes = [folder for folder in sorted(os.listdir(TRAIN_DIR))][CLASS_RANGE[0]:CLASS_RANGE[1]]
    cls_index = classes

    # Only use specific classes?
    if len(CLS) > 0:
        classes = CLS

    # load ground truth
    gt = getGroundTruth(classes)

    # get list of test files
    test = []
    test_classes = [os.path.join(TEST_DIR, tc) for tc in sorted(os.listdir(TEST_DIR))]
    for tc in test_classes:
        if tc.rsplit("/", 1)[-1] in classes:
            test += [os.path.join(tc, fpath) for fpath in os.listdir(tc)]
    test = shuffle(test, random_state=RANDOM)[:MAX_SAMPLES]

    # stats
    # print classes
    print "NUMBER OF CLASSES:", len(classes)
    print "NUMBER OF TEST SAMPLES:", len(test)

    return gt, test, classes, cls_index