Python sklearn.utils module, shuffle() code examples

The following code examples, extracted from open-source Python projects, demonstrate how to use sklearn.utils.shuffle().
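sklearn.utils.shuffle(*arrays, random_state=None, n_samples=None) permutes any number of array-likes (numpy arrays, lists, sparse matrices, pandas DataFrames) with a single shared index order and returns shuffled copies. A minimal sketch of the core behavior:

import numpy as np
from sklearn.utils import shuffle

X = np.array([[1., 0.], [2., 1.], [0., 0.]])
y = np.array([0, 1, 2])

# X and y are permuted with the same indices, so rows stay aligned with labels.
X, y = shuffle(X, y, random_state=0)

# n_samples draws a random subset instead of a full permutation.
X_sub = shuffle(X, random_state=0, n_samples=2)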

Project: ensemble_amazon    Author: kaz-Anova
def bagged_set(X, y, model, seed, estimators, xt, update_seed=True):

    # create array object to hold predictions
    baggedpred = [0.0 for d in range(0, xt.shape[0])]
    # loop for as many times as we want bags
    for n in range(0, estimators):
        # shuffle first; aids in increasing variance and forces different results
        X_t, y_c = shuffle(X, y, random_state=seed + n)

        if update_seed:  # update seed if requested, to give a slightly different model
            model.set_params(random_state=seed + n)
        model.fit(X_t, y_c)  # fit the model
        preds = model.predict_proba(xt)[:, 1]  # predict probabilities
        # update bag's running sum
        for j in range(0, xt.shape[0]):
            baggedpred[j] += preds[j]
    # divide by the number of bags to create an average estimate
    for j in range(0, len(baggedpred)):
        baggedpred[j] /= float(estimators)
    # return probabilities
    return np.array(baggedpred)
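The random_state=seed + n pattern above makes each bag reproducible but distinct: the same seed always yields the same permutation, and bumping the seed yields a new one. A minimal sketch with illustrative arrays (not from the project):

import numpy as np
from sklearn.utils import shuffle

X = np.arange(10).reshape(5, 2)
y = np.arange(5)

X_a, y_a = shuffle(X, y, random_state=7)
X_b, y_b = shuffle(X, y, random_state=7)   # same seed -> identical permutation
X_c, y_c = shuffle(X, y, random_state=8)   # new seed -> (almost surely) a different order

assert (X_a == X_b).all() and (y_a == y_b).all()
print(y_a, y_c)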
Project: convnet-nolearn    Author: jcouvy
def load_data():
    """
    Get and normalize data with labels, split into training, validation and test set.
    Automatically shuffles the training batch.
    """
    data = _load_data()

    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]

    X_train = X_train.reshape((-1, 1, 28, 28)).astype(np.float32)
    X_valid = X_valid.reshape((-1, 1, 28, 28)).astype(np.float32)
    X_test = X_test.reshape((-1, 1, 28, 28)).astype(np.float32)

    y_train = y_train.astype(np.int32)
    y_valid = y_valid.astype(np.int32)
    y_test = y_test.astype(np.int32)

    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    return X_train, y_train, X_valid, y_valid, X_test, y_test


# --------------- Network architectures ---------------
Project: Parallel-SGD    Author: angadgill
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't have all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
Project: fastxml    Author: Refefer
def generate_idxs(self, dataset_len):
        if self.subsample == 1:
            return repeat(list(range(dataset_len)))

        batch_size = int(dataset_len * self.subsample) \
                if self.subsample < 1 else self.subsample

        if batch_size > dataset_len:
            raise Exception("dataset subset is larger than dataset")

        def gen(bs):
            rs = np.random.RandomState(seed=self.seed + 1000)
            idxs = list(range(dataset_len))
            while True:
                rs.shuffle(idxs)
                yield idxs[:bs]

        return gen(batch_size)
Project: chicksexer    Author: kensk8er
def __init__(self, X, y, batch_size, shuffle=True, valid=False):
        """
        Constructor

        :param X: list of samples (list of lists of char_ids)
        :param y: list of labels (list of probability (of name being a male name))
        :param batch_size: the size of samples in a batch
        :param shuffle: if True, shuffle the data in every new epoch
        :param valid: if True, finish iterating on the data after one pass
        """
        assert isinstance(X, list), 'Invalid argument type: type(X) = {}'.format(type(X))
        assert isinstance(y, list), 'Invalid argument type: type(y) = {}'.format(type(y))
        assert len(X) == len(y), 'len(X) != len(y)'
        assert batch_size > 0, 'batch_size <= 0'

        # deep-copy the inputs so BatchGenerator has no side effects on them
        self._X = deepcopy(X)
        self._y = deepcopy(y)

        self._batch_id = 0
        self._batch_size = batch_size
        self._shuffle = shuffle
        self._valid = valid
        self._data_size = len(self._X)
        self._finish = False
Project: chicksexer    Author: kensk8er
def _gen_batch(self, batch_id, batch_size, data_size):
        """Generate batch for given X, y, batch_id, batch_size, and data_size."""
        start_index = (batch_id * batch_size) % data_size
        end_index = ((batch_id + 1) * batch_size) % data_size

        if start_index < end_index:
            return (deepcopy(self._X[start_index: end_index]),
                    deepcopy(self._y[start_index: end_index]))
        else:  # executing here means you have gone over X and y already
            X_first = deepcopy(self._X[start_index:])
            y_first = deepcopy(self._y[start_index:])

            if self._valid:
                self._finish = True
                return X_first, y_first

            # shuffle X and y after going over them if shuffle is True
            if self._shuffle:
                self._X, self._y = shuffle(self._X, self._y)

            X_second = deepcopy(self._X[:end_index])
            y_second = deepcopy(self._y[:end_index])
            return X_first + X_second, y_first + y_second
Project: MixtureOfExperts    Author: krishnakalyan3
def svc_model(self, X, y, x_test, y_test, x_val, y_val, i, j):
        X, y = shuffle(X, y, random_state=self.SEED)
        clf = SVC(C=self.C, kernel='rbf', gamma=self.gamma, cache_size=self.cache_size,
                  verbose=0, random_state=self.SEED)
        model = clf.fit(X, y)

        yhat_train = model.predict(X)
        yhat_val = model.predict(x_val)
        yhat_test = model.predict(x_test)

        train_error = (1 - accuracy_score(y, yhat_train)) * 100
        val_error = (1 - accuracy_score(y_val, yhat_val)) * 100
        test_error = (1 - accuracy_score(y_test, yhat_test)) * 100

        self.warn_log.append([i, train_error, val_error, test_error])

        return model
Project: Nature-Conservancy-Fish-Image-Prediction    Author: Brok-Bucholtz
def preprocess(image_shape, image_paths, labels=[]):
    features = []
    for image_path in tqdm(image_paths):
        image_data = list(Image.open(image_path).resize(image_shape[:2]).getdata())
        image_data = np.asarray(image_data).reshape(image_shape)

        features.append(image_data)

    # Normalizer
    features = np.asarray(features)
    features = features / 255.0

    if labels:
        # one hot encode
        label_binarizer = LabelBinarizer()
        labels = label_binarizer.fit_transform(labels)
        # Shuffle
        features, labels = shuffle(features, labels)

    return features, labels
Project: Nature-Conservancy-Fish-Image-Prediction    Author: Brok-Bucholtz
def maybe_cache_featurs_labels(img_shape):
    chunk_size = 500
    data_folder = './data/train/*/*.jpg'
    windows_folder = './data/windows/'

    if not isdir(windows_folder):
        makedirs(windows_folder)

    for features_file, labels_files, image_batch in tqdm(
            list(missing_chunks(chunk_size, data_folder, windows_folder)),
            unit='batch'):
        features, labels = get_feature_labels(image_batch, img_shape)
        features = features/255
        features, labels = shuffle(features, labels)

        # Save Features and Labels
        assert len(features) == len(labels)
        np.save(features_file, features)
        np.save(labels_files, labels)
Project: Tencent2017_Final_Coda_Allegro    Author: BladeCoda
def cutData():
    df_all=pd.read_csv('data/cutData/train_time_v9.csv')

    df_2=pd.read_csv('day30/data/train30_v9.csv') 
    print('????')
    df_all=df_all.append(df_2)
    del df_2

    df_all = shuffle(df_all,random_state=42)  

    step=len(df_all)//5

    train1=df_all[0:step]
    train2=df_all[step:2*step]
    train3=df_all[2*step:3*step]
    train4=df_all[3*step:4*step]
    train5=df_all[4*step:]

    del df_all
    return train1,train2,train3,train4,train5
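shuffle also accepts a pandas DataFrame, as above, and returns a shuffled copy; note that it keeps the original (now out-of-order) index, so a reset_index is common afterwards, although the positional slices used here work either way. A small sketch with hypothetical data (not from the project):

import pandas as pd
from sklearn.utils import shuffle

df = pd.DataFrame({'a': range(5), 'b': list('vwxyz')})
df = shuffle(df, random_state=42)

print(df.index.tolist())        # original index labels, reordered
df = df.reset_index(drop=True)  # restore a clean 0..n-1 index if label-based access matters
print(df[0:2])                  # integer slicing is positional, so it works either way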
Project: AirTicketPredicting    Author: junlulocky
def dealingUnbalancedData(self):
        """
        Dealing with unbalanced training data
        """
        len0 = np.count_nonzero(1-self.y_train)
        len1 = np.count_nonzero(self.y_train)
        dup = int(len0/len1)
        dup = int(dup * 1.5)  # inflate the duplication factor to bias predictions toward 'buy'

        X1 = self.X_train[np.where(self.y_train==1)[0], :]
        y1 = self.y_train[np.where(self.y_train==1)[0], :]
        y2 = self.y_train_price[np.where(self.y_train==1)[0], :]

        X1 = np.tile(X1, (dup-1,1))
        y1 = np.tile(y1, (dup-1,1))
        y2 = np.tile(y2, (dup-1,1))

        self.X_train = np.concatenate((self.X_train, X1), axis=0)
        self.y_train = np.concatenate((self.y_train, y1), axis=0)
        self.y_train_price = np.concatenate((self.y_train_price, y2), axis=0)
        # shuffle train data
        self.X_train, self.y_train, self.y_train_price = shuffle(self.X_train, self.y_train, self.y_train_price, random_state=42)
Project: Face_recognition_SVM    Author: AshStuff
def display(self,count=10):
        """
            count : number of Samples to display after prediction
        """
        self.images,self.labels = shuffle(self.images,self.labels)
        display_images,display_labels = self.images[:count],self.labels[:count]
        self.test(display_images,verbose=False)
        images_and_prediction = list(zip(display_images,self.prediction,display_labels))
        for index,(image,prediction,actual) in enumerate(images_and_prediction):
            plt.subplot(count // 3, count // 2, index + 1)
            plt.axis("off")
            image = image.reshape(112,92)
            plt.imshow(image,cmap="Greys_r")
            plt.title("{}/{}".format(prediction,actual))

        plt.show()
Project: lazyprogrammer    Author: inhwane
def getKaggleMNIST():
    # MNIST data:
    # column 0 is the label
    # columns 1-785 are pixel values in 0 .. 255
    # shape after rearranging: (42000, 1, 28, 28)
    train = pd.read_csv('../large_files/train.csv').as_matrix()
    train = shuffle(train)

    Xtrain = rearrange( train[:-1000,1:] )
    Ytrain = train[:-1000,0]
    Ytrain_ind  = y2indicator(Ytrain)

    Xtest  = rearrange( train[-1000:,1:] )
    Ytest  = train[-1000:,0]
    Ytest_ind  = y2indicator(Ytest)
    return Xtrain, Ytrain, Ytrain_ind, Xtest, Ytest, Ytest_ind
Project: hintbot    Author: madebyollin
def loadImages(datadir, maxDirectoryCount=10, split=0.9):
    inputImages, targetImages = [], []  # collect image slices across directories
    for dirPath, dirNames, fileNames in os.walk(datadir):
        fileNames = [f for f in fileNames if not f[0] == '.']
        dirNames[:] = [d for d in dirNames if not d[0] == '.']
        if (maxDirectoryCount != 0):
            fullSizeFileNames = [fileName for fileName in fileNames if fileName.endswith("@2x.png") and (fileName.replace("@2x","") in fileNames)]
            for fullSizeFileName in fullSizeFileNames:
                inputImage = io.imread(dirPath + "/" + fullSizeFileName)
                targetImage = io.imread(dirPath + "/" + fullSizeFileName.replace("@2x",""))
                # print(dirPath + "/" + fullSizeFileName)
                inputSlices, targetSlices = sliceImages(inputImage, targetImage)
                # print("got", len(inputSlices), "input splices and",len(targetSlices),"targetSlices")
                inputImages.extend(inputSlices)
                targetImages.extend(targetSlices)
            maxDirectoryCount -= 1
    x, y = np.asarray(inputImages), np.asarray(targetImages)
    x_train = x[:int(len(x) * split)]
    y_train = y[:int(len(y) * split)]
    x_test = x[int(len(x) * split):]
    y_test = y[int(len(y) * split):]
    # Shuffle training data so that repeats aren't in the same batch
    # x_train, y_train = shuffle(x_train, y_train, random_state=0)
    return (x_train, y_train, x_test, y_test)
Project: facial-keypoints-detection    Author: saber1988
def load_data(test=False):
    fname = FTEST if test else FTRAIN
    df = pd.read_csv(fname)

    cols = df.columns[:-1]

    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' ') / 255.0)
    df = df.dropna()

    X = np.vstack(df['Image'])
    X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)
    if not test:
        # y = (df[cols].values -48) / 48.0
        y = df[cols].values / 96.0
        X, y = shuffle(X, y)
        joblib.dump(cols, 'data/cols.pkl', compress=3)

    else:
        y = None
    return X, y
Project: facial-keypoints-detection    Author: saber1988
def extract_train_data(df, flip_indices, cols):
    data = df[list(cols) + ['Image']].copy()
    data = data.dropna()

    X = np.vstack(data['Image'].values)
    X = X.astype(np.float32)
    X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)

    y = data[data.columns[:-1]].values
    if len(flip_indices) != 0:
        X_flip = X[:, :, ::-1, :]
        X = np.vstack([X, X_flip])
        y_flip = y.copy()
        y_flip[:, ::2] *= -1
        y_flip[:, ::2] += 1
        for a, b in flip_indices:
            y_flip[:, [a, b]] = y_flip[:, [b, a]]

        y = np.vstack([y, y_flip])

    X, y = shuffle(X, y, random_state=42)  # shuffle train data
    y = y.astype(np.float32)

    return X, y
Project: TF-Net    Author: Jorba123
def next_batch(self, batch_size, shuffle_data):
        start = self.__index_in_epoch
        self.__index_in_epoch += batch_size

        # Current epoch is finished (used all examples)
        if self.__index_in_epoch > self.__num_examples:
            self.__epochs_completed += 1

            # reshuffle data for next epoch
            if shuffle_data:
                self.__X, self.__y = shuffle(self.__X, self.__y)
            start = 0
            self.__index_in_epoch = batch_size

            # make sure batch size is smaller than the actual number of examples
            assert batch_size <= self.__num_examples
        end = self.__index_in_epoch
        return self.__X[start:end], self.__y[start:end]
Project: mlprojects-py    Author: srinathperera
def transform_feild4rnn(power, sequence_length):
    result = []
    for index in range(len(power) - sequence_length):
        result.append(power[index: index + sequence_length])
    result = np.array(result)  # shape (2049230, 50)

    #print("result",result)
    #print("result.shape", result.shape)

    #result_mean = result.mean()
    #result -= result_mean
    #print "Shift : ", result_mean
    #print "Data  : ", result.shape

    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    #np.random.shuffle(train)
    #print("train", train.shape)
    #print("train", train)

    X_train = train[:, :-1]
    y_train = train[:, -1]
    X_test = result[row:, :-1]
    y_test = result[row:, -1]
    return [X_train, y_train, X_test, y_test]
Project: Kaggle_Buddy    Author: NickYi1990
def __init__(self, X_train, X_test, y_train, kf_n, verbose=1):
        '''Stacking for models other than "neural network" and "xgboost"

           Parameters
           ----------
           X_train: numpy array
                training data
           X_test: numpy array
                testing data
           y_train: numpy array
                training target
           kf_n: KFold object from the model_selection module

           Example
           -------
           kf_5 = KFold(n_splits=5, random_state=413, shuffle=True)
           stack_generater = ka_stacking_generalization(X.values, X_test.values, y.values, kf_5)
        '''
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.kf_n = kf_n
        self.verbose = verbose
        self.cv_info = {'lgbm_info':{'cv_scores':[], 'cv_rounds':[], 'cv_losses':[]}}
Project: Kaggle_Buddy    Author: NickYi1990
def ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed, bag_round, params
                                 , X_test, using_notebook=True, num_boost_round=0):
    '''
        early version
    '''
    # create array object to hold predictions
    baggedpred=np.zeros(shape=X_test.shape[0]).astype(np.float32)
    #loop for as many times as we want bags
    if using_notebook:
        for n in tqdm_notebook(range(0, bag_round)):
            #shuffle first, aids in increasing variance and forces different results
            X_train, y_train=shuffle(X_train, y_train, random_state=seed+n)
            params['seed'] = seed + n
            model = lightgbm.train(params, lightgbm.Dataset(X_train, y_train), num_boost_round=num_boost_round)
            pred = model.predict(X_test)
            baggedpred += pred/bag_round

    return baggedpred
Project: ConversationalQA    Author: btjhjeon
def train_model(lrmodel, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping
    """
    done = False
    best = -1.0
    r = np.arange(1,6)

    while not done:
        # Every 100 epochs, check Pearson on development set
        lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY))
        yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r)
        score = pearsonr(yhat, devscores)[0]
        if score > best:
            print score
            best = score
            bestlrmodel = copy.deepcopy(lrmodel)
        else:
            done = True

    yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r)
    score = pearsonr(yhat, devscores)[0]
    print 'Dev Pearson: ' + str(score)
    return bestlrmodel
Project: autoxd    Author: nessessary
def get_codes(flag=myenum.all, n=100):
    """?????????, enum????myenum
    flag : enum.all ??? , enum.exclude_cyb ?????, enum.rand10 ???10?
    n : enum.rand???
    return: list """
    def readTDXlist():
        #???csv??
        fname = 'datas/tdx_codes.csv'
        df = pd.read_csv(fname, dtype=str, header=None)
        codes = df[0].tolist()    
        if len(codes)>0:
            #?????????
            dapans = ['399001', '999999','399005','399002','399006','510050']
            codes = [unicode(code) for code in codes if code not in dapans]
            #codes = filter(lambda x: x[:2] != '88', codes)
        return codes
    key = myredis.enum.KEY_CODES    #??ths F10???
    #???????THS??? ??????? ?????
    val = myredis.createRedisVal(key, readTDXlist)
    codes = val.get()
    if flag == myenum.randn:
        from sklearn.utils import shuffle
        codes = shuffle(codes)
        return list(codes[:n])
    return codes
Project: Identify-Numbers    Author: jinhang
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                 algorithm='l-bfgs', alpha=0.00001,
                 batch_size=200, learning_rate="constant",
                 learning_rate_init=0.5, power_t=0.5, max_iter=200,
                 shuffle=False, random_state=None, tol=1e-5,
                 verbose=False, warm_start=False):

        sup = super(MultilayerPerceptronClassifier, self)
        sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                     activation=activation, algorithm=algorithm, alpha=alpha,
                     batch_size=batch_size, learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init, power_t=power_t,
                     max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                     random_state=random_state, tol=tol,
                     beta=0, sparsity_param=0,
                     verbose=verbose, warm_start=warm_start)

        self.label_binarizer_ = LabelBinarizer()
Project: Identify-Numbers    Author: jinhang
def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                 algorithm='l-bfgs', alpha=0.00001,
                 batch_size=200, learning_rate="constant",
                 learning_rate_init=0.1,
                 power_t=0.5, max_iter=100, shuffle=False,
                 random_state=None, tol=1e-5,
                 verbose=False, warm_start=False):

        sup = super(MultilayerPerceptronRegressor, self)
        sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                     activation=activation, algorithm=algorithm, alpha=alpha,
                     batch_size=batch_size, learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init, power_t=power_t,
                     max_iter=max_iter, loss='squared_loss', shuffle=shuffle,
                     random_state=random_state, tol=tol,
                     beta=0, sparsity_param=0,
                     verbose=verbose, warm_start=warm_start)
Project: Identify-Numbers    Author: jinhang
def __init__(self, hidden_layer_sizes=(100,),
                 algorithm='l-bfgs', batch_size=200, learning_rate="constant",
                 learning_rate_init=0.5, alpha=3e-3,
                 power_t=0.5, max_iter=100, shuffle=False,
                 random_state=None, tol=1e-5,
                 beta=3, sparsity_param=0.1,
                 verbose=False, warm_start=False):

        sup = super(MultilayerPerceptronAutoencoder, self)
        sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                     activation='logistic', algorithm=algorithm, alpha=alpha,
                     batch_size=batch_size, beta=beta, sparsity_param=sparsity_param,
                     learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init, power_t=power_t,
                     max_iter=max_iter, loss='squared_loss', shuffle=shuffle,
                     random_state=random_state, tol=tol,
                     verbose=verbose, warm_start=warm_start)
Project: Defect-Prediction    Author: Jorba123
def next_batch(self, batch_size, shuffle_data=True):
        start = self.__index_in_epoch
        self.__index_in_epoch += batch_size

        # Current epoch is finished (used all examples)
        if self.__index_in_epoch > self.__num_examples:
            self.__epochs_completed += 1

            # reshuffle data for next epoch
            if shuffle_data:
                self.__X, self.__y = shuffle(self.__X, self.__y)
            start = 0
            self.__index_in_epoch = batch_size

            # make sure batch size is smaller than the actual number of examples
            assert batch_size <= self.__num_examples
        end = self.__index_in_epoch
        return self.__X[start:end], self.__y[start:end]
Project: Facial_KeyPoints_Detection    Author: wadhwasahil
def get_data(test=False, cols=None):
    """Loads data from FTEST if *test* is True, otherwise from FTRAIN.
    Pass a list of *cols* if you're only interested in a subset of the
    target columns.
    """
    fname = FTEST if test else FTRAIN
    df = pd.read_csv(fname)
    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' '))
    if cols:
        df = df[list(cols) + ['Image']]
    df = df.dropna()
    X = np.vstack(df['Image'].values).astype(np.float32) / 255.
    X = X.astype(np.float32)

    if not test:  # only FTRAIN has any target columns
        y = df[df.columns[:-1]].values
        y = (y - 48) / 48  # scale target coordinates to [-1, 1]
        X, y = shuffle(X, y, random_state=42)  # shuffle train data
        y = y.astype(np.float32)
    else:
        y = None

    return X, y
Project: Facial_KeyPoints_Detection    Author: wadhwasahil
def batch_iter(doc, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(list(doc))
    data_size = len(data)
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
Project: kaggle-quora-solution-8th    Author: qqgeogor
def batch_generator(q1,q2,y,batch_size=128,shuffle=True,maxlen=238):
    sample_size = q1.shape[0]
    index_array = np.arange(sample_size)

    while 1:
        if shuffle:
            np.random.shuffle(index_array)
        batches = make_batches(sample_size, batch_size)
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            batch_ids = index_array[batch_start:batch_end]

            X_batch_1 = pad_sequences(q1[batch_ids],padding='pre',maxlen=maxlen)
            X_batch_2 = pad_sequences(q2[batch_ids],padding='pre',maxlen=maxlen)

            X_batch = [X_batch_1,X_batch_2]
            y_batch = y[batch_ids]
            yield X_batch,y_batch
Project: nn4post    Author: shuiruge
def get_batch_generator(x, y, y_error, batch_size):

    n_data = len(x)

    while True:

        if batch_size is None:
            yield x, y, y_error

        else:
            x, y, y_error = shuffle(x, y, y_error)  # XXX: copy ???
            batches = [
                (  x[k:k+batch_size],
                   y[k:k+batch_size],
                   y_error[k:k+batch_size],
                )
                for k in range(0, n_data, batch_size)
            ]
            yield batches
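On the "copy ???" question in the snippet above: shuffle returns new, consistently permuted objects and leaves its inputs untouched (for numpy arrays it materializes copies via fancy indexing). A quick sketch with illustrative arrays:

import numpy as np
from sklearn.utils import shuffle

x = np.arange(6)
y = x * 10

x_s, y_s = shuffle(x, y, random_state=0)

assert (x == np.arange(6)).all()  # the original array is not mutated
assert (y_s == x_s * 10).all()    # rows stay aligned across arrays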
Project: Deep-Learning-para-diagnostico-a-partir-de-imagenes-Biomedicas    Author: pacocp
def reorderRandomly(X,Y,list_of_images):
    '''
    Reorder the images, labels and image-name list in the same way

    Parameters
    ------------
    numpy.ndarray, numpy.ndarray, list
        images, labels and image names

    Returns
    ------------
    numpy.ndarray, numpy.ndarray, list
        images, labels and image names shuffled in the same way

    '''
    X, Y,list_of_images = shuffle(X, Y,list_of_images, random_state=43)
    return (X,Y,list_of_images)
Project: Parallel-SGD    Author: angadgill
def make_data(random_state, n_samples_per_center, grid_size, scale):
    random_state = check_random_state(random_state)
    centers = np.array([[i, j]
                        for i in range(grid_size)
                        for j in range(grid_size)])
    n_clusters_true, n_features = centers.shape

    noise = random_state.normal(
        scale=scale, size=(n_samples_per_center, centers.shape[1]))

    X = np.concatenate([c + noise for c in centers])
    y = np.concatenate([[i] * n_samples_per_center
                        for i in range(n_clusters_true)])
    return shuffle(X, y, random_state=random_state)
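Here shuffle receives a RandomState instance rather than an int, so repeated calls draw fresh permutations from the same generator instead of repeating one fixed permutation. A tiny sketch of the difference:

import numpy as np
from sklearn.utils import shuffle

rng = np.random.RandomState(0)
print(shuffle(np.arange(5), random_state=rng))  # generator state advances...
print(shuffle(np.arange(5), random_state=rng))  # ...so this order differs

print(shuffle(np.arange(5), random_state=0))    # an int reseeds every call:
print(shuffle(np.arange(5), random_state=0))    # identical output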

# Part 1: Quantitative evaluation of various init methods
Project: Parallel-SGD    Author: angadgill
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
            'y_test': y_test}
    return data
Project: Parallel-SGD    Author: angadgill
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise, coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
Project: Parallel-SGD    Author: angadgill
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
Project: Parallel-SGD    Author: angadgill
def test_ovo_ties():
    # Test that ties are broken using the decision function,
    # not defaulting to the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron(shuffle=False))
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], normalized_confidences[0].argmax())
Project: wsics    Author: joneww
def patch_pixel_cluster(rgb_dat, n_colors):
    heb_label = [0]

    orig_img_rgb = rgb_dat
    orig_img = np.array(orig_img_rgb, dtype=np.float64) / 255

    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(orig_img.shape)
    assert d == 3
    image_array = np.reshape(orig_img, (w * h, d))

    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)

    # Get labels for all points
    print("Predicting color indices on the full image (k-means)")
    labels = kmeans.predict(image_array)

    # sort the cluster labels: first is nucleus, second is cytoplasm, last is background
    sort_label = label_sort(kmeans.cluster_centers_)

    #cluster the three kinds of pixels in the orig img
    heb_label = pixels_classify(orig_img_rgb,sort_label,labels, w, h)

    return  heb_label
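shuffle(image_array, random_state=0)[:1000] above copies the whole permuted pixel matrix just to keep 1,000 rows; the n_samples argument of shuffle does the subsampling directly. An equivalent sketch with an illustrative array shape:

import numpy as np
from sklearn.utils import shuffle

image_array = np.random.rand(500 * 400, 3)  # (w * h, 3) pixel matrix

# draw 1000 random pixels without copying the full permuted array
image_array_sample = shuffle(image_array, random_state=0, n_samples=1000)
print(image_array_sample.shape)  # (1000, 3)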
Project: iNaturalist    Author: phunterlau
def gen_list(prefix):
    ann_file = '%s2017.json'%prefix
    train_out = '%s.lst'%prefix
    # load annotations
    print('Loading annotations from: ' + os.path.basename(ann_file))
    with open(ann_file) as data_file:
        ann_data = json.load(data_file)

    # set up the filenames and annotations
    imgs = [aa['file_name'] for aa in ann_data['images']]
    im_ids = [aa['id'] for aa in ann_data['images']]
    if 'annotations' in ann_data.keys():
        # if we have class labels
        classes = [aa['category_id'] for aa in ann_data['annotations']]
    else:
        # otherwise we don't have class info, so set it to 0
        classes = [0]*len(im_ids)

    idx_to_class = {cc['id']: cc['name'] for cc in ann_data['categories']}

    print('\t' + str(len(imgs)) + ' images')
    print('\t' + str(len(idx_to_class)) + ' classes')

    for index in range(10):
        path = imgs[index]
        target = str(classes[index])
        im_id = str(im_ids[index]-1)
        print(im_id + '\t' + target + '\t' + path)

    import pandas as pd
    from sklearn.utils import shuffle

    df = pd.DataFrame(classes)
    df[1] = imgs
    df = shuffle(df)

    df.to_csv(train_out, sep='\t', header=None, index=False)
    df = pd.read_csv(train_out, delimiter='\t', header=None)
    df.to_csv(train_out, sep='\t', header=None)
Project: lsun_2017    Author: ternaus
def batch_generator(X_path, y_path, batch_size, horizontal_flip=False, vertical_flip=False):
    X_file_list = os.listdir(X_path)

    num_batches = int(len(X_file_list) / batch_size)
    while True:
        for batch_index in range(num_batches):
            X_batch = np.zeros((batch_size, img_rows, img_cols, num_channels))
            y_batch = np.zeros((batch_size, img_rows, img_cols))

            for i in range(batch_size):
                img_path = X_file_list[batch_size * batch_index + i]
                mask_path = X_file_list[batch_size * batch_index + i][:-3] + 'png'

                img = cv2.imread(os.path.join(X_path, img_path))
                img_mask = cv2.imread(os.path.join(y_path, mask_path), 0)
                yb = img_mask
                xb = img

                if horizontal_flip:
                    if np.random.random() < 0.5:
                        xb = flip_axis(xb, 0)
                        yb = flip_axis(yb, 0)

                if vertical_flip:
                    if np.random.random() < 0.5:
                        xb = flip_axis(xb, 1)
                        yb = flip_axis(yb, 1)

                y_batch[i] = yb

                X_batch[i] = xb

            # X_batch, y_batch = form_batch(X_path, y_path, batch_size, horizontal_flip, vertical_flip)

            # Add augmentations here

            X_batch = normalize(X_batch)

            yield X_batch, np.expand_dims(y_batch.astype(np.uint8), 3)

        X_file_list = shuffle(X_file_list)
Project: AutoSleepScorerDev    Author: skjerns
def shuffle_lists(*args, **options):
     """ shuffles several lists while keeping their elements aligned
         for now use sklearn, maybe later get rid of the dependency
     """
     return shuffle(*args, **options)
Project: AutoSleepScorerDev    Author: skjerns
def randomize(self):
        self.X, self.Y, self.rnd_idx = shuffle(self.X, self.Y, self.rnd_idx)
Project: image-quantizer    Author: se7entyse7en
def quantize(cls, raster, n_colors, **kwargs):
        width, height, depth = raster.shape
        reshaped_raster = np.reshape(raster, (width * height, depth))

        palette = shuffle(reshaped_raster)[:n_colors]
        labels = pairwise_distances_argmin(reshaped_raster, palette)

        quantized_raster = cls._recreate_image(palette, labels, width, height)

        return quantized_raster
Project: melanoma-transfer    Author: learningtitans
def balance_shuffle_indices(y, random_state=None, weight=BALANCE_WEIGHTS):
    y = np.asarray(y)
    counter = Counter(y)
    max_count = np.max(counter.values())
    indices = []
    for cls, count in counter.items():
        ratio = weight * max_count / count + (1 - weight)
        idx = np.tile(np.where(y == cls)[0],
                      np.ceil(ratio).astype(int))
        np.random.shuffle(idx)
        indices.append(idx[:max_count])
    return shuffle(np.hstack(indices), random_state=random_state)
Project: SteinGAN    Author: DartML
def shuffle(*arrays, **options):
    if isinstance(arrays[0][0], basestring):
        return list_shuffle(*arrays)
    else:
        return skutils.shuffle(*arrays, random_state=np_rng)
Project: rnn-playlist-prediction    Author: burakkose
def data_augmentation(dataset, future=False):
    x, y = [], []
    for row in dataset:
        row = list(reversed(row)) if future else row  # materialize: a bare reversed() iterator has no len() and can't be sliced below
        for idx in range(0, len(row) - 1):
            x.append([e + 1 for e in row[0:idx + 1]])
            y.append(row[idx + 1])
    return shuffle(x, y)
Project: OSBridge_machine_learning_101    Author: hanneshapke
def reshuffle_dataset(X_data_set, y_data_set):
    return shuffle(np.stack(X_data_set, axis=0), np.stack(y_data_set, axis=0), random_state=0)
Project: NLP-JD    Author: ZexinYan
def __init__(self, filename='./corpus/train.csv'):
        if os.path.exists(filename):
            data = pd.read_csv(filename)
            self.data = shuffle(data)
            X_data = pd.DataFrame(data.drop('sentiment', axis=1))
            Y_data = column_or_1d(data[:]['sentiment'], warn=True)
            self.X_train, self.X_val,\
            self.y_train, self.y_val = train_test_split(X_data, Y_data, test_size=0.3, random_state=1)
            self.model = None
            self.load_model()
            self.preprocessor = Preprocessor.Preprocessor()
        else:
            print('No Source!')
            self.preprocessor = Preprocessor.Preprocessor()  # must be created before use
            self.preprocessor.process_data()
Project: what-celebrity    Author: dansbecker
def model_from_thumbnails(train_x, train_y, val_x, val_y):
    n_obs, n_channels, n_rows, n_cols = train_x.shape
    n_classes = train_y.shape[1]

    model = Sequential()
    model.add(Convolution2D(32, 2, 2, border_mode='valid',
                            activation='relu',
                            input_shape=(n_channels, n_rows, n_cols)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 2, 2, border_mode='valid',
                            activation='relu'))
    model.add(Convolution2D(64, 2, 2, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 2, 2, border_mode='valid',
                            activation='relu'))

    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    optimizer = Adam()
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    stopper = EarlyStopping(monitor='val_loss', patience=15, verbose=0, mode='auto')

    model.fit(train_x, train_y, shuffle=True,
                        nb_epoch=100, validation_data=(val_x, val_y),
                        callbacks = [stopper])
    return model
Project: BirdCLEF2017    Author: kahst
def parseTestSet():    

    # get classes of the training set (subfolders as class labels; must be the same as during training, shuffled or alphabetical)
    classes = [folder for folder in sorted(os.listdir(TRAIN_DIR))][CLASS_RANGE[0]:CLASS_RANGE[1]]
    cls_index = classes

    #Only use specific classes?
    if len(CLS) > 0:
        classes = CLS

    #load ground truth
    gt = getGroundTruth(classes)

    #get list of test files
    test = []
    test_classes = [os.path.join(TEST_DIR, tc) for tc in sorted(os.listdir(TEST_DIR))]
    for tc in test_classes:
        if tc.rsplit("/", 1)[-1] in classes:
            test += [os.path.join(tc, fpath) for fpath in os.listdir(tc)]
    test = shuffle(test, random_state=RANDOM)[:MAX_SAMPLES]

    #stats
    #print classes
    print "NUMBER OF CLASSES:", len(classes)
    print "NUMBER OF TEST SAMPLES:", len(test)

    return gt, test, classes, cls_index