Python theano 模块,sandbox() 实例源码


项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def print_graph_linker(print_prog=True):
    """Return a Theano linker; ``blah`` is a per-node callback defined here.

    print_prog : when true, ``blah`` prints each node's index, the node
        itself and the ``imap`` labels of the owners of its inputs.

    NOTE(review): this snippet is truncated. The ``WrapLinkerMany(`` call is
    never closed (its arguments are missing) and the second ``return`` has no
    visible ``else:``. Restore both from the upstream file before using it.
    """
    if 1:
        # Label for every node seen so far; None stands for an input that
        # has no owner and is printed as '-'.
        imap = {None:'-'}
        def blah(i, node, thunk):
            # Callback signature is (index, node, thunk); thunk is unused.
            imap[node] = str(i)
            if print_prog:# and node.op.__class__ is T.DimShuffle:
                # ``if False and ...`` keeps this debugging branch disabled.
                if False and  node.op == T.DimShuffle((), ['x', 'x'], inplace = True):
                    print(node.op == T.DimShuffle((), ['x', 'x'],
                                                  inplace=True), end=' ')
                    print(node.inputs[0], type(node.inputs[0]), end=' ')
                    print(node.inputs[0].equals(T.constant(2)), end=' ')
                outputs = node.outputs
                # ``inputs`` is only used by the commented-out print below.
                inputs = theano.gof.graph.inputs(outputs)
                print('node ', i, node, end=' ')
                print(':'.join([imap[inp.owner] for inp in node.inputs]))
                #print theano.sandbox.pprint.pp.process_graph(inputs, outputs)
        # NOTE(review): the argument list of this call is missing.
        return theano.sandbox.wraplinker.WrapLinkerMany(
        return theano.gof.OpWiseCLinker()
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_output_broadcast_cuda(self):
        """Compile a ``VecAsRowAndCol`` graph over a CUDA float vector.

        Raises SkipTest when ``cuda.cuda_available`` is false.

        NOTE(review): this snippet is truncated. The
        ``if cuda.use.device_number is None:`` branch has only comments for a
        body (the GPU initialisation call is missing), and the compiled
        function ``f`` is never applied to ``v_val``. Restore the missing
        statements from the upstream file.
        """
        from theano.sandbox import cuda
        if not cuda.cuda_available:
            raise SkipTest("Optional package Cuda disabled")
        if cuda.use.device_number is None:
            # We should normally set VecAsRowAndCol as a GPUOp But we
            # don't want to do this here as this will disable others
            # tests in this file.  So we manually init the GPU if
            # needed to remove warning.
        v = cuda.fvector('v')
        c, r = VecAsRowAndCol()(v)
        f = theano.function([v], [c, r])

        # Five float32 values drawn from the test's own RNG.
        v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_simple_shared_mrg_random(self):
        """Run a scan that draws uniforms from an MRG random stream.

        The stream is seeded with ``utt.fetch_seed()`` and each scan step
        draws ``uniform((2,), -1, 1)``.

        NOTE(review): this snippet is truncated. Both the ``theano.scan(`` and
        the ``theano.function(`` calls are left unclosed; their remaining
        arguments must be restored from the upstream file.
        """
        theano_rng = theano.sandbox.rng_mrg.MRG_RandomStreams(utt.fetch_seed())

        values, updates = theano.scan(lambda: theano_rng.uniform((2,), -1, 1),
        my_f = theano.function([],

        # Just check for run-time errors
        # The function is called twice and the results are not inspected.
        theano_v = my_f()
        theano_v = my_f()
项目:VIMCO    作者:y0ast    | 项目源码 | 文件源码
def compile_sampling(self, data_train, data_valid, data_test, training_n_samples):
        """Compile the Theano functions that sample every layer of the model.

        Four functions are stored on ``self``:

        * ``sample_convergence(batch, n_samples)`` -- reads from ``data_valid``.
        * ``sample_train(batch)``, ``sample_valid(batch)`` and
          ``sample_test(batch)`` -- read from the matching data set with
          ``n_samples`` fixed to ``training_n_samples``.

        Each function returns the list of per-layer samples for the
        mini-batch ``batch`` (rows ``batch * batch_size`` up to
        ``(batch + 1) * batch_size``).

        data_train, data_valid, data_test : sliceable data sets usable as
            ``givens`` values -- presumably Theano shared variables; confirm
            against the caller.
        training_n_samples : number of samples per data point for the
            train/valid/test functions (cast to ``np.int32``).
        """
        X = tt.matrix('X')
        batch = tt.iscalar('batch')
        n_samples = tt.iscalar('n_samples')

        n_layers = len(self.layers)
        samples = [None] * n_layers

        samples[0] = replicate_batch(X, n_samples)

        # Pick the random stream once. Without the ``else`` the MRG stream was
        # always overwritten on GPU and ``srng`` was undefined on CPU.
        if "gpu" in theano.config.device:
            from theano.sandbox import rng_mrg
            srng = rng_mrg.MRG_RandomStreams(seed=42)
        else:
            srng = tt.shared_randomstreams.RandomStreams(seed=42)

        for layer in range(n_layers - 1):
            samples[layer + 1] = self.compute_samples(srng, samples[layer], layer)

        def minibatch(data):
            # Rows of ``data`` belonging to the symbolic mini-batch index.
            return data[batch * self.batch_size:(batch + 1) * self.batch_size]

        givens = dict()
        givens[X] = minibatch(data_valid)
        self.sample_convergence = theano.function([batch, n_samples], samples, givens=givens)

        # From here on the number of samples is a compile-time constant.
        givens[n_samples] = np.int32(training_n_samples)
        givens[X] = minibatch(data_train)
        self.sample_train = theano.function([batch], samples, givens=givens)

        givens[X] = minibatch(data_valid)
        self.sample_valid = theano.function([batch], samples, givens=givens)

        givens[X] = minibatch(data_test)
        self.sample_test = theano.function([batch], samples, givens=givens)
项目:DL-Benchmarks    作者:DL-Benchmarks    | 项目源码 | 文件源码
def time_theano_fn(fn, index, GPU_bool):
    """Return the elapsed wall-clock time, in milliseconds, of the timed region.

    fn, index : not used by any visible line -- presumably the timed call is
        ``fn(index)``; confirm against the upstream file.
    GPU_bool : guards two statements around the timed region whose bodies are
        missing here.

    NOTE(review): this snippet is truncated. Both ``if GPU_bool:`` statements
    have no body and the call being timed is absent, so the function does not
    parse as shown.
    """
    if GPU_bool:
    # time.time() is in seconds; * 1000 converts to milliseconds.
    start = time.time()*1000
    if GPU_bool:
    elapsed_time = time.time()*1000 - start
    return elapsed_time
项目:Attentive_reader    作者:caglar    | 项目源码 | 文件源码
def print_mem(context=None):
    """Print GPU memory usage when Theano's CUDA backend is enabled.

    Does nothing when ``theano.sandbox.cuda.cuda_enabled`` is false.

    context : optional label; when given (not None) it is converted with
        ``str`` and appended to the longer "GPU status" message. When None,
        only the short message is printed.
    """
    if theano.sandbox.cuda.cuda_enabled:
        rvals = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
        # Available memory in Mb
        available = float(rvals[0]) / 1024. / 1024.
        # Total memory in Mb
        total = float(rvals[1]) / 1024. / 1024.
        # Exactly one message is printed; the ``else`` was missing, so both
        # messages were emitted whenever ``context`` was None.
        if context is None:
            print('Used %.3f Mb Free  %.3f Mb, total %.3f Mb' %
                  (total - available, available, total))
        else:
            info = str(context)
            print(('GPU status : Used %.3f Mb Free %.3f Mb,'
                   'total %.3f Mb [context %s]') %
                  (total - available, available, total, info))
项目:Attentive_reader    作者:caglar    | 项目源码 | 文件源码
def gpu_mem_free():
    """
    Memory free on the GPU

    Returns
    -------
    megs_free : float
        Number of megabytes of memory free on the GPU used by Theano
    """
    # ``cuda`` is a module-level name; it is imported on first use and then
    # kept in the module globals for later calls.
    global cuda
    if cuda is None:
        from theano.sandbox import cuda
    # mem_info()[0] is divided by 1024 twice to obtain megabytes.
    return cuda.mem_info()[0]/1024./1024
项目:Attentive_reader    作者:caglar    | 项目源码 | 文件源码
def print_mem(context=None):
    """Print GPU memory usage when Theano's CUDA backend is enabled.

    Does nothing when ``theano.sandbox.cuda.cuda_enabled`` is false.

    context : optional label; when given (not None) it is converted with
        ``str`` and appended to the longer "GPU status" message. When None,
        only the short message is printed.
    """
    if theano.sandbox.cuda.cuda_enabled:
        rvals = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
        # Available memory in Mb
        available = float(rvals[0]) / 1024. / 1024.
        # Total memory in Mb
        total = float(rvals[1]) / 1024. / 1024.
        # Exactly one message is printed; the ``else`` was missing, so both
        # messages were emitted whenever ``context`` was None.
        if context is None:
            print('Used %.3f Mb Free  %.3f Mb, total %.3f Mb' %
                  (total - available, available, total))
        else:
            info = str(context)
            print(('GPU status : Used %.3f Mb Free %.3f Mb,'
                   'total %.3f Mb [context %s]') %
                  (total - available, available, total, info))
项目:Attentive_reader    作者:caglar    | 项目源码 | 文件源码
def gpu_mem_free():
    """
    Memory free on the GPU

    Returns
    -------
    megs_free : float
        Number of megabytes of memory free on the GPU used by Theano
    """
    # ``cuda`` is a module-level name; it is imported on first use and then
    # kept in the module globals for later calls.
    global cuda
    if cuda is None:
        from theano.sandbox import cuda
    # mem_info()[0] is divided by 1024 twice to obtain megabytes.
    return cuda.mem_info()[0]/1024./1024
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_multinomial_0():
    """Exercise the MultinomialFromUniform Op directly with the GPU mode.

    The Op is applied explicitly instead of being reached through the
    multinomial() call used by GPU random generation. For every output dtype
    the compiled function must contain a GPUAMultinomialFromUniform node and
    reproduce the expected (doubled) one-hot rows.
    """
    pvals_var = tensor.fmatrix()
    unis_var = tensor.fvector()

    # (pvals, uniforms, expected output), evaluated in this order.
    cases = (
        # both the first and the second label can be drawn
        ([[1, 0], [0, 1]], [.1, .1], [[2, 0], [0, 2]]),
        # uniforms of .31 select the second label in both rows
        ([[.2, .8], [.3, .7]], [.31, .31], [[0, 2], [0, 2]]),
        # uniforms of .21 select the second label, then the first
        ([[.2, .8], [.3, .7]], [.21, .21], [[0, 2], [2, 0]]),
        # a different size checks that the output is reallocated correctly
        # and that the GPU version keeps the right transposed-ness
        ([[.2, .8]], [.25], [[0, 2]]),
    )

    for dtype in ('int64', 'float32', 'auto'):
        op = theano.sandbox.multinomial.MultinomialFromUniform(dtype)
        drawn = op(pvals_var, unis_var)

        # multiplying by 2 allows the multinomial to reuse its output
        f = function([pvals_var, unis_var], drawn * 2,
                     allow_input_downcast=True, mode=mode_with_gpu)

        assert any(type(node.op) is GPUAMultinomialFromUniform
                   for node in f.maker.fgraph.toposort())

        for pvals, unis, expected in cases:
            utt.assert_allclose(f(pvals, unis), expected)

# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_multinomial_large():
    """Check MultinomialFromUniform on a 10000 x 4 input with the GPU mode.

    Verifies that the compiled graph contains a GPUAMultinomialFromUniform
    node, that the output shape matches the input, that the output dtype
    follows ``config.cast_policy``, and that every row (after the ``* 2``)
    equals ``[0, 0, 2, 0]``.

    Raises NotImplementedError for a cast policy this test does not know.
    """
    # DEBUG_MODE will test this on GPU
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(type(node.op) is GPUAMultinomialFromUniform
               for node in f.maker.fgraph.toposort())

    pval = numpy.arange(10000 * 4,
                        dtype='float32').reshape((10000, 4)) + 0.1
    # Normalise each row so it is a probability distribution.
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape
    if config.cast_policy == 'custom':
        assert mval.dtype == pval.dtype
    elif config.cast_policy == 'numpy+floatX':
        assert mval.dtype == config.floatX
    elif config.cast_policy == 'numpy':
        assert mval.dtype == 'float64'
    else:
        # The ``else`` was missing: the 'numpy' policy always raised and
        # unknown policies were silently accepted.
        raise NotImplementedError(config.cast_policy)
    utt.assert_allclose(mval.sum(axis=1), 2)
    asdf = numpy.asarray([0, 0, 2, 0]) + 0 * pval
    utt.assert_allclose(mval, asdf)  # broadcast over all rows
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_gpu_opt_dtypes():
    """Check that GPU samples come back with the requested dtype.

    For each dtype the graph is compiled with the GPU mode, must contain a
    GPUAMultinomialFromUniform node, and is evaluated on a 10000 x 4
    row-normalised probability matrix.
    """
    n_rows, n_cols = 10000, 4
    for dtype in ('uint32', 'float32', 'int64', 'float64'):
        p = tensor.fmatrix()
        u = tensor.fvector()
        op = theano.sandbox.multinomial.MultinomialFromUniform(dtype)
        m = op(p, u)

        f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
        gpu_op_found = any(type(node.op) is GPUAMultinomialFromUniform
                           for node in f.maker.fgraph.toposort())
        assert gpu_op_found

        raw = numpy.arange(n_rows * n_cols, dtype='float32')
        pval = raw.reshape((n_rows, n_cols)) + 0.1
        # Each row is divided by its own sum.
        row_sums = pval.sum(axis=1)[:, None]
        pval = pval / row_sums
        uval = numpy.ones_like(pval[:, 0]) * 0.5

        samples = f(pval, uval)
        assert samples.dtype == dtype, "%s != %s" % (samples.dtype, dtype)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_gpu_opt():
    """Check that the multinomial Op is moved to the GPU with its output.

    Overlaps partly with test_multinomial_0. The same scenario is run twice:
    with a matrix of probabilities (10000 rows) and with a single row, a case
    that used to fail.
    """
    p = tensor.fmatrix()
    u = tensor.fvector()

    def check(pvals_var, n_rows):
        # Build, compile and run the 'auto' dtype Op for one input variable.
        m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(
            pvals_var, u)
        assert m.dtype == 'float32', m.dtype

        f = function([pvals_var, u], m, allow_input_downcast=True,
                     mode=mode_with_gpu)
        assert any(type(node.op) is GPUAMultinomialFromUniform
                   for node in f.maker.fgraph.toposort())

        pval = numpy.arange(n_rows * 4, dtype='float32')
        pval = pval.reshape((n_rows, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = numpy.ones_like(pval[:, 0]) * 0.5
        f(pval, uval)

    # The op is put on the gpu when the output is moved to the gpu.
    check(p, 10000)

    # Test with a row, it was failing in the past.
    check(tensor.frow(), 1)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
        """Initialise the per-profile bookkeeping dictionaries and flags.

        atexit_print : when true, the at-exit printing globals are touched
            (the registration itself is not visible in this snippet).
        flag_time_thunks : when None, ``config.profiling.time_thunks`` is read.
        **kwargs : not used by any visible line.

        Raises Exception when the (truncated) CUDA check passes and the
        environment variable CUDA_LAUNCH_BLOCKING is not '1'.

        NOTE(review): this snippet is truncated. The opening ``if (`` condition
        is never closed, the two consecutive assignments to
        ``self.flag_time_thunks`` presumably belong to an if/else pair, and
        the statements that append to ``_atexit_print_list`` and register the
        at-exit hook are missing. Restore them from the upstream file.
        """
        if (hasattr(theano, 'sandbox') and
                hasattr(theano.sandbox, 'cuda') and
            if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
                raise Exception(
                    "You are running the Theano profiler with CUDA enabled."
                    " Theano GPU ops execution is asynchronous by default."
                    " So by default, the profile is useless."
                    " You must set the environment variable"
                    " CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
                    " synchronize the execution to get a meaningful profile.")

        # Per-instance accumulators, all starting empty.
        self.apply_callcount = {}
        self.output_size = {}
        self.apply_time = {}
        self.apply_cimpl = {}
        self.variable_shape = {}
        self.variable_strides = {}
        if flag_time_thunks is None:
            self.flag_time_thunks = config.profiling.time_thunks
            # NOTE(review): as shown, the next line overwrites the config
            # value with None; an ``else:`` appears to be missing.
            self.flag_time_thunks = flag_time_thunks
        if atexit_print:
            global _atexit_print_list
            global _atexit_registered
            if not _atexit_registered:
                _atexit_registered = True
        self.ignore_first_call = theano.config.profiling.ignore_first_call
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Select the CUDA backend and GPU modes, then run the parent initialiser.

        All positional and keyword arguments are forwarded unchanged to the
        parent class.
        """
        from theano.sandbox import cuda as cuda_backend

        # Attributes are set before the parent initialiser runs.
        self.mode_with_gpu_nodebug = mode_with_gpu_nodebug
        self.mode_with_gpu = mode_with_gpu
        self.gpu_backend = cuda_backend
        super(T_Scan_Cuda, self).__init__(*args, **kwargs)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_consistent_inner_fct(self):
        """Check a scan over MRG uniforms compiles to a single GPU scan node.

        NOTE(review): this snippet is truncated. The ``theano.scan(`` call is
        left unclosed; its remaining arguments must be restored from the
        upstream file.
        """
        # Test that scan does not falsely detect inconsistencies in a valid
        # inner graph

        rs = theano.sandbox.rng_mrg.MRG_RandomStreams(use_cuda=True)
        output, _ = theano.scan(lambda : rs.uniform((3,), dtype="float32"),

        # Also ensure that, after compilation, the Scan has been moved
        # on the gpu
        fct = theano.function([], output, mode=self.mode_with_gpu)
        scan_nodes = scan_nodes_from_fct(fct)
        # Exactly one scan node is expected, and it must be on the GPU.
        assert len(scan_nodes) == 1
        assert self.is_scan_on_gpu(scan_nodes[0])
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_n_samples_compatibility():
    """
    This test checks if the new change to MultinomialFromUniform is still compatible
    with old interface. Here I will load a graph created (using the old interface) as follows:
    RandomStreams = theano.sandbox.rng_mrg.MRG_RandomStreams
    th_rng = RandomStreams(12345)
    X = T.matrix('X')
    pvals = T.exp(X)
    pvals = pvals / pvals.sum(axis=1, keepdims=True)
    samples = th_rng.multinomial(pvals=pvals)
    pickle.dump([X, samples], open("multinomial_test_graph.pkl", "w"))

    The pickle is read from the directory containing this file. The test is
    turned into a SkipTest when unpickling fails with ImportError on Windows;
    on other platforms the ImportError is re-raised.
    """
    folder = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(folder, "multinomial_test_graph.pkl"),
              "rb") as pkl_file:
        # The "latin1" encoding is passed only on Python 3.
        if PY3:
            u = CompatUnpickler(pkl_file, encoding="latin1")
        else:
            u = CompatUnpickler(pkl_file)
        try:
            X, samples = u.load()
        except ImportError:
            # Windows sometimes fail with nonsensical errors like:
            #   ImportError: No module named type
            #   ImportError: No module named copy_reg
            # when "type" and "copy_reg" are builtin modules.
            if sys.platform == 'win32':
                exc_type, exc_value, exc_trace = sys.exc_info()
                reraise(SkipTest, exc_value, exc_trace)
            # Anywhere else the failure is genuine; swallowing it would only
            # surface later as a NameError on ``X``.
            raise

        f = theano.function([X], samples)
        res = f(numpy.random.randn(20, 10))
        # Every row of a single-draw multinomial sample sums to one.
        assert numpy.all(res.sum(axis=1) == 1)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
    """Run ``basictest`` on binomial samples from the MRG and numpy streams.

    The CPU MRG stream is always tested; the GPU MRG stream is tested when
    ``mode`` is not 'FAST_COMPILE' and ``cuda_available`` is true, and its
    output is compared with the CPU output; finally the shared_randomstreams
    generator is tested.

    mean : passed as ``p`` to ``binomial`` and as ``target_avg``.
    size : symbolic or constant size passed to ``binomial``.
    const_size, steps, rtol : forwarded to ``basictest``.
    var_input, input : inputs of the compiled functions and their values.

    NOTE(review): this snippet is truncated. ``if < 10:`` has lost its left
    operand, the two assignments to ``steps_`` presumably belong to an if/else
    pair, the ``theano.Out(`` call has lost its first argument, and the
    ``assert_array_almost_equal(`` call is unclosed. Restore them from the
    upstream file.
    """
    R = MRG_RandomStreams(234, use_cuda=False)
    u = R.binomial(size=size, p=mean)
    f = theano.function(var_input, u, mode=mode)
    out = f(*input)

    # Increase the number of steps if sizes implies only a few samples
    if < 10:
        steps_ = steps * 100
        steps_ = steps
    basictest(f, steps_, const_size, prefix='mrg  cpu',
              inputs=input, allow_01=True,
              target_avg=mean, mean_rtol=rtol)

    if mode != 'FAST_COMPILE' and cuda_available:
        # Same seed as the CPU stream so both outputs can be compared.
        R = MRG_RandomStreams(234, use_cuda=True)
        u = R.binomial(size=size, p=mean, dtype='float32')
        # well, it's really that this test w GPU doesn't make sense otw
        assert u.dtype == 'float32'
        f = theano.function(var_input, theano.Out(
            borrow=True), mode=mode_with_gpu)
        gpu_out = numpy.asarray(f(*input))

        basictest(f, steps_, const_size, prefix='mrg  gpu',
                  inputs=input, allow_01=True,
                  target_avg=mean, mean_rtol=rtol)
        numpy.testing.assert_array_almost_equal(out, gpu_out,

    RR = theano.tensor.shared_randomstreams.RandomStreams(234)

    uu = RR.binomial(size=size, p=mean)
    ff = theano.function(var_input, uu, mode=mode)
    # It's not our problem if numpy generates 0 or 1
    basictest(ff, steps_, const_size, prefix='numpy', allow_01=True,
              inputs=input, target_avg=mean, mean_rtol=rtol)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def gemm_conv_op(img, kern, border_mode):
        """Apply GpuCorrMM to ``img`` with ``kern`` reversed on its last two axes.

        The reversed kernel is made contiguous on the GPU before the
        correlation op, built with ``border_mode``, is applied.
        """
        flipped_kern = kern[:, :, ::-1, ::-1]
        contiguous_kern = theano.sandbox.cuda.basic_ops.gpu_contiguous(
            flipped_kern)
        corr_mm = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=border_mode)
        return corr_mm(img, contiguous_kern)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def gemm_op(mode, subsample):
    """Return a GpuCorrMM op built from ``mode`` and ``subsample``.

    Both arguments are passed positionally, in that order.
    """
    corr_mm_cls = theano.sandbox.cuda.blas.GpuCorrMM
    return corr_mm_cls(mode, subsample)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_viewop_gpu():
    """Check that ViewOp applied to a GPU variable returns the input data.

    Raises SkipTest when ``cuda.cuda_available`` is False.

    NOTE(review): this snippet is truncated. The ``theano.function([x],`` call
    is left unclosed; its remaining arguments must be restored from the
    upstream file.
    """
    from theano.sandbox import cuda
    if cuda.cuda_available is False:
        raise SkipTest('Optional package cuda disabled')
    _x = theano.tensor.fvector('x')
    # host -> gpu, identity view on the gpu, then gpu -> host.
    x = cuda.gpu_from_host(_x)
    _out = theano.compile.ViewOp()(x)
    out = cuda.host_from_gpu(_out)
    f = theano.function([x],
    data = numpy.array([1, 2, 3], dtype='float32')
    assert numpy.allclose(f(data), data)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def contains_inf(arr, node=None, var=None):
    # NOTE(review): the prose lines below are this function's docstring with
    # its triple-quote delimiters and section headings missing, so the block
    # does not parse as shown. The CUDA and pygpu branches further down are
    # truncated too (unclosed conditions and return expressions, missing
    # ``else:``). Restore them from the upstream file.
    Test whether a numpy.ndarray contains any `np.inf` values.

    arr : np.ndarray or output of any Theano op
    node : None or an Apply instance.
        If the output of a Theano op, the node associated to it.
    var : The Theano symbolic variable.

    contains_inf : bool
        `True` if the array contains any `np.inf` values, `False` otherwise.

    Tests for the presence of `np.inf`'s by determining whether the
    values returned by `np.nanmin(arr)` and `np.nanmax(arr)` are finite.
    This approach is more memory efficient than the obvious alternative,
    calling `np.any(np.isinf(ndarray))`, which requires the construction of a
    boolean array with the same shape as the input array.

    # These kinds of values are reported as containing no inf without being
    # inspected: C data, random states, rng-tagged variables, slices and
    # empty arrays.
    if isinstance(arr, theano.gof.type._cdata_type):
        return False
    elif isinstance(arr, np.random.mtrand.RandomState):
        return False
    elif var and getattr(var.tag, 'is_rng', False):
        return False
    elif isinstance(arr, slice):
        return False
    elif arr.size == 0:
        return False
    elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
        if (node and hasattr(theano.sandbox, 'rng_mrg') and
                # It store ints in float container
            return False
            compile_gpu_func(False, True, False)
            return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
    elif pygpu_available and isinstance(arr, GpuArray):
        return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or

    # Fallback: an infinite nan-ignoring max or min means inf is present.
    return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def traverse(out, x, x_copy, d, visited=None):
    """
    Function used by scan to parse the tree and figure out which nodes
    it needs to replace.

    There are two options :
        1) x and x_copy are on host, then you would replace x with x_copy
        2) x is on gpu, x_copy on host, then you need to replace
        host_from_gpu(x) with x_copy
    This happens because initially shared variables are on GPU... which is
    fine for the main computational graph but confuses things a bit for the
    inner graph of scan.

    Parameters
    ----------
    out : variable whose ancestors are searched.
    x : variable to look for.
    x_copy : replacement for ``x``.
    d : dict of replacements; updated in place and returned.
    visited : optional set of variables already examined; updated in place.

    Returns
    -------
    d : the same dictionary, with any new replacement added.
    """
    # ``visited`` is a set of nodes that are already known and don't need to be
    # checked again, speeding up the traversal of multiply-connected graphs.
    # if a ``visited`` set is given, it will be updated in-place so the caller
    # knows which nodes we have seen.
    if visited is None:
        visited = set()
    if out in visited:
        return d
    # Record the node; without this the set stayed empty and never pruned
    # anything.
    visited.add(out)
    from theano.sandbox import cuda
    from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
    from theano.gpuarray import pygpu_activated
    from theano.gpuarray.type import GpuArrayType
    if out == x:
        # ``x`` itself is replaced by ``x_copy`` moved to the backend that
        # ``x`` lives on.
        if isinstance(x.type, cuda.CudaNdarrayType):
            d[out] = cuda.gpu_from_host(x_copy)
        else:
            assert isinstance(x.type, GpuArrayType)
            d[out] = gpu_from_host(x.type.context_name)(x_copy)
        return d
    elif out.owner is None:
        # A graph input other than ``x``: nothing to replace.
        return d
    elif (cuda.cuda_available and
          out.owner.op == cuda.host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    elif (pygpu_activated and
          out.owner.op == host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    else:
        # Recurse into the inputs of the node that produced ``out``.
        for inp in out.owner.inputs:
            d = traverse(inp, x, x_copy, d, visited)
        return d

# Hashing a dictionary/list/tuple by xoring the hash of each element
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
def test_multinomial():
    """Run ``basic_multinomialtest`` on MRG multinomial samples (CPU, then GPU).

    The GPU part runs when ``mode`` is not 'FAST_COMPILE' and
    ``cuda_available`` is true; its output is compared with the CPU output to
    6 decimals.

    NOTE(review): this snippet is truncated. The two assignments to
    ``sample_size`` presumably belong to an if/else pair, and the GPU
    ``theano.function(`` call is left unclosed. Restore them from the upstream
    file.
    """
    steps = 100
    mode_ = mode
    # FAST_COMPILE is replaced by FAST_RUN for the compiled functions.
    if mode == 'FAST_COMPILE':
        mode_ = 'FAST_RUN'

    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
            mode == 'Mode' and config.linker in ['py']):
        sample_size = (49, 5)
        sample_size = (450, 6)
    mode_ = theano.compile.mode.get_mode(mode_)
    # print ''
    # print 'ON CPU:'

    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    # Normalise every row so it sums to one.
    pvals = numpy.apply_along_axis(lambda row: row / numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
    # Note: we specify `nstreams` to avoid a warning.
    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m, mode=mode_)
    # theano.printing.debugprint(f)
    out = f()
    basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                          prefix='mrg ')


    if mode != 'FAST_COMPILE' and cuda_available:
        # print ''
        # print 'ON GPU:'
        R = MRG_RandomStreams(234, use_cuda=True)
        pvals = numpy.asarray(pvals, dtype='float32')
        # We give the number of streams to avoid a warning.
        n = R.multinomial(pvals=pvals, dtype='float32', nstreams=30 * 256)
        # well, it's really that this test w GPU doesn't make sense otw
        assert n.dtype == 'float32'
        f = theano.function(

        # theano.printing.debugprint(f)
        gpu_out = f()
        basic_multinomialtest(f, steps, sample_size, pvals, n_samples=1,
                              prefix='gpu mrg ')
        numpy.testing.assert_array_almost_equal(out, gpu_out, decimal=6)
项目:Theano-Deep-learning    作者:GeekLiB    | 项目源码 | 文件源码
# Compare the GpuCorrMM family of ops against ``py_conv`` for one direction
# ('fprop', 'bprop img' or 'bprop kern') on random float32 inputs.
#
# Image shape is (bs, ch, rImg1, rImg2), kernel shape is
# (nf, ch, rFlt1, rFlt2) and the subsampling is (subsx, subsy).
#
# NOTE(review): this snippet is truncated. The signature below is never closed;
# the body reads ``direction``, which is presumably the missing last
# parameter. ``theano_mode`` and ``py_conv`` are not defined in view. Restore
# the signature from the upstream file.
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    if direction == 'fprop':
        # Dimensions of length 1 are declared broadcastable.
        i = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in npy_img.shape])()
        k = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in npy_kern.shape])()

        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        # The kernel is passed reversed on its last two axes.
        gpuval = f(npy_img, npy_kern[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        i = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in
                           npy_kern.transpose(1, 0, 2, 3).shape])()
        k = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in npy_img.shape])()

        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    elif direction == 'bprop kern':
        i = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in
                           npy_img.transpose(1, 0, 2, 3).shape])()
        k = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in
                           npy_kern.transpose(1, 0, 2, 3).shape])()

        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        # The result's first two axes are swapped back before comparison.
        gpuval = numpy.array(f(
            npy_img.transpose(1, 0, 2, 3),
            npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])
            ).transpose(1, 0, 2, 3)

    # NOTE(review): for any other ``direction`` value, ``cpuval`` and
    # ``gpuval`` are unbound at this point.
    assert_allclose(cpuval, gpuval, rtol=1e-4)