We extracted the following code examples from open-source Python projects to illustrate how to use theano.grad().
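Before the project code, here is a minimal sketch of the pattern all of these examples share: theano.grad() takes a scalar cost and the variable(s) to differentiate with respect to, and returns symbolic gradient expressions that can then be compiled with theano.function(). The toy cost below is hypothetical and not taken from any of the projects; it only illustrates the call.

import theano
import theano.tensor as T

# a symbolic scalar cost: c = x**2 + 3*x
x = T.dscalar('x')
cost = x ** 2 + 3 * x

# theano.grad builds the symbolic derivative dc/dx = 2*x + 3
g = theano.grad(cost, wrt=x)

# compile and evaluate the gradient at x = 2.0 (prints 7.0)
grad_fn = theano.function([x], g)
print(grad_fn(2.0))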
def sgd_optimizer(model, lr=0.001, momentum=0.9):
    lr = theano.shared(np.array(lr).astype(theano.config.floatX))

    # Make sure momentum is a sane value
    assert momentum < 1 and momentum >= 0

    # the updates of SGD with momentum
    updates = []
    grads = T.grad(model.costs[0], model.params)
    for param, grad in zip(model.params, grads):
        param_update = theano.shared(param.get_value() * 0.)
        updates.append((param, param - lr * param_update))
        updates.append((param_update,
                        momentum * param_update + (1. - momentum) * grad))

    train_func = theano.function(model.inputs, model.costs, updates=updates)
    valid_func = theano.function(model.inputs, model.costs)

    return train_func, valid_func
def e_step(self, epsilon, q, y, *params):
    model = self.model
    prior_params = model.get_prior_params(*params)

    h = model.prior.step_sample(epsilon, q)
    py = model.p_y_given_h(h, *params)

    consider_constant = [y] + list(params)
    log_py_h = -model.conditional.neg_log_prob(y[None, :, :], py)

    if model.prior.has_kl:
        KL_q_p = model.prior.step_kl_divergence(q, *prior_params)
    else:
        log_ph = -model.prior.neg_log_prob(h)
        log_qh = -model.posterior.neg_log_prob(h, q[None, :, :])
        KL_q_p = (log_qh - log_ph).mean(axis=0)

    y_energy = -log_py_h.mean(axis=0)
    cost = (y_energy + KL_q_p).mean(axis=0)
    grad = theano.grad(cost, wrt=q, consider_constant=consider_constant)

    cost = y_energy.mean()
    return cost, grad
def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    def Hx_plain():
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def create_updates(loss, network, opt, learning_rate, momentum, beta1, beta2):
    params = lasagne.layers.get_all_params(network, trainable=True)
    grads = theano.grad(loss, params)
    # if max_norm:
    #     names = ['crf.U', 'crf.W_h', 'crf.W_c', 'crf.b']
    #     constraints = [grad for param, grad in zip(params, grads) if param.name in names]
    #     assert len(constraints) == 4
    #     scaled_grads = total_norm_constraint(constraints, max_norm=max_norm)
    #     counter = 0
    #     for i in xrange(len(params)):
    #         param = params[i]
    #         if param.name in names:
    #             grads[i] = scaled_grads[counter]
    #             counter += 1
    #     assert counter == 4

    if opt == 'adam':
        updates = adam(grads, params=params, learning_rate=learning_rate,
                       beta1=beta1, beta2=beta2)
    elif opt == 'momentum':
        updates = nesterov_momentum(grads, params=params,
                                    learning_rate=learning_rate,
                                    momentum=momentum)
    else:
        raise ValueError('unknown optimization algorithm: %s' % opt)

    return updates
def fit(self, weights, o_error, tpo):
    gradients = T.grad(o_error, weights)
    updates = []
    for c, v, w, g in zip(self.t_cache, self.t_velocity, weights, gradients):
        new_velocity = T.sub(T.mul(tpo["momentum_rate"], v),
                             T.mul(tpo["learn_rate"], g))
        new_cache = T.add(T.mul(tpo["decay_rate"], c),
                          T.mul(T.sub(1, tpo["decay_rate"]), T.sqr(g)))
        new_weights = T.sub(T.add(w, new_velocity),
                            T.true_div(T.mul(g, tpo["learn_rate"]),
                                       T.sqrt(T.add(new_cache, 0.1**8))))
        updates.append((w, new_weights))
        updates.append((v, new_velocity))
        updates.append((c, new_cache))
    return updates

###### Nesterov momentum ########################################
def fit(self, weights, o_error, tpo):
    updates = []
    gradients = theano.grad(o_error, weights)
    for c, w, g in zip(self.t_cache, weights, gradients):
        new_cache = tpo["decay_rate"] * c + (1 - tpo["decay_rate"]) * T.sqr(g)
        new_weights = w - (g * tpo["learn_rate"]) / T.sqrt(new_cache + 0.1**8)
        updates.append((w, new_weights))
        updates.append((c, new_cache))
    return updates

###### ADADELTA ########################################
def fit(self, weights, o_error, tpo):
    gradients = theano.grad(o_error, weights)
    updates = []
    for v, w, g in zip(self.t_velocity, weights, gradients):
        # gradient = T.grad(o_error, w)
        new_velocity = tpo["momentum_rate"] * v - tpo["learn_rate"] * g
        new_weights = w + new_velocity
        updates.append((w, new_weights))
        updates.append((v, new_velocity))
    return updates

###### Vanilla SGD ########################################
def test_convert_conv2d_model_compute_scores(self):
    if (self.keras_version <= 0.2):
        pass
    else:
        deeplift_model = kc.convert_sequential_model(
            model=self.keras_model)
        deeplift_contribs_func = deeplift_model.\
            get_target_contribs_func(
                find_scores_layer_idx=0,
                target_layer_idx=-2)
        np.testing.assert_almost_equal(
            deeplift_contribs_func(task_idx=0,
                                   input_data_list=[self.inp],
                                   batch_size=10,
                                   progress_update=None),
            # when biases are 0 and ref is 0, deeplift is the same as grad*inp
            self.grad_func(self.inp) * self.inp,
            decimal=6)
def test_convert_conv1d_model_compute_scores(self):
    if (self.run_graph_tests == False):
        return
    deeplift_model = kc.convert_graph_model(
        model=self.keras_model,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    deeplift_contribs_func = deeplift_model.\
        get_target_contribs_func(
            find_scores_layer_name=["inp1", "inp2"],
            pre_activation_target_layer_name="output_preact")
    grads_inp1, grads_inp2 = self.grad_func(self.inp1, self.inp2)
    np.testing.assert_almost_equal(
        np.array(deeplift_contribs_func(
            task_idx=0,
            input_data_list={'inp1': self.inp1,
                             'inp2': self.inp2},
            input_references_list={'inp1': np.zeros_like(self.inp1),
                                   'inp2': np.zeros_like(self.inp2)},
            batch_size=10,
            progress_update=None)),
        # when biases are 0 and ref is 0, deeplift is the same as grad*inp
        np.array([grads_inp1 * self.inp1, grads_inp2 * self.inp2]),
        decimal=6)
def build_bprop_graph(self):
    optimizer = self.get_optimizer()
    # there are either costs assigned to specific params
    # OR let blocks do the gradient
    costs = self.link_here('costs').keys()

    isinstance_check = [isinstance(c, ParametersLink) for c in costs]
    if any(isinstance_check):
        assert all(isinstance_check), "Some costs have parameters associated " +\
            "to them and others don't. All costs need to be binded."
        grads = OrderedDict()
        for cost in costs:
            grads.update(zip(cost.parameters,
                             theano.grad(cost.model_var, cost.params)))
        cost = None
    else:
        cost = sum(costs)
        grads = None

    algorithm = GradientDescent(
        cost=cost, gradients=grads, parameters=self.model_parameters,
        step_rule=optimizer)

    self.algorithm = algorithm
def test_retNone1(self):
    """Test that it is not ok to return None from op.grad()"""
    class retNone(gof.op.Op):
        __props__ = ()

        def make_node(self):
            inputs = [theano.tensor.vector()]
            outputs = [theano.tensor.vector()]
            return gof.Apply(self, inputs, outputs)

        def grad(self, inp, grads):
            x, = inp
            gz, = grads
            pass

    a = retNone().make_node()
    self.assertRaises(TypeError, grad_sources_inputs, [(a.out, one)], None)
def test_1in_1out(self):
    """Test grad is called correctly for a 1-to-1 op"""
    gval = theano.tensor.matrix()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            inputs = [theano.tensor.matrix()]
            outputs = [theano.tensor.matrix()]
            return gof.Apply(self, inputs, outputs)

        def grad(self, inp, grads):
            return gval,

    a1 = O().make_node()
    g = grad_sources_inputs([(a1.outputs[0], one)], None)
    self.assertTrue(g[a1.inputs[0]] is gval)
def test_1in_Nout(self):
    """Test grad is called correctly for a 1-to-many op"""
    gval = theano.tensor.matrix()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            inputs = [theano.tensor.matrix()]
            outputs = [theano.tensor.scalar(), theano.tensor.scalar()]
            return gof.Apply(self, inputs, outputs)

        def grad(self, inp, grads):
            x, = inp
            gz1, gz2 = grads
            return gval,

    a1 = O().make_node()
    g = grad_sources_inputs([(a1.outputs[0], one)], None)
    self.assertTrue(g[a1.inputs[0]] is gval)
def test_Nin_1out(self):
    """Test grad is called correctly for a many-to-1 op"""
    gval0 = theano.tensor.scalar()
    gval1 = theano.tensor.scalar()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            inputs = [theano.tensor.scalar(), theano.tensor.scalar()]
            outputs = [theano.tensor.matrix()]
            return gof.Apply(self, inputs, outputs)

        def grad(self, inp, grads):
            x0, x1 = inp
            gz, = grads
            return (gval0, gval1)

    a1 = O().make_node()
    g = grad_sources_inputs([(a1.outputs[0], one)], None)
    self.assertTrue(g[a1.inputs[0]] is gval0)
    self.assertTrue(g[a1.inputs[1]] is gval1)
def test_Nin_Nout(self):
    """Test grad is called correctly for a many-to-many op"""
    gval0 = theano.tensor.matrix()
    gval1 = theano.tensor.matrix()

    class O(gof.op.Op):
        __props__ = ()

        def make_node(self):
            inputs = [theano.tensor.matrix(), theano.tensor.matrix()]
            outputs = [theano.tensor.matrix(), theano.tensor.matrix()]
            return gof.Apply(self, inputs, outputs)

        def grad(self, inp, grads):
            return gval0, gval1

    a1 = O().make_node()
    g = grad_sources_inputs([(a1.outputs[0], one)], None)
    self.assertTrue(g[a1.inputs[0]] is gval0)
    self.assertTrue(g[a1.inputs[1]] is gval1)
def test_unimplemented_grad_grad(self):
    # tests that unimplemented grads are caught in the grad method
    class DummyOp(gof.Op):
        __props__ = ()

        def make_node(self, x):
            return gof.Apply(self, [x], [x.type()])

        def grad(self, inputs, output_grads):
            return [theano.gradient.grad_not_implemented(self, 0, inputs[0])]

    a = theano.tensor.scalar()
    b = DummyOp()(a)

    self.assertRaises(TypeError, theano.gradient.grad, b, a)
def test_downcast_dtype(self):
    # Test that the gradient of a cost wrt a float32 variable does not
    # get upcasted to float64.
    # x has dtype float32, regardless of the value of floatX
    x = theano.tensor.fscalar('x')
    y = x * 2
    z = theano.tensor.lscalar('z')

    c = y + z
    dc_dx, dc_dy, dc_dz, dc_dc = theano.grad(c, [x, y, z, c])
    # The dtype of dc_dy and dc_dz can be either float32 or float64,
    # that might depend on floatX, but is not specified.
    assert dc_dc.dtype in ('float32', 'float64')
    assert dc_dz.dtype in ('float32', 'float64')
    assert dc_dy.dtype in ('float32', 'float64')

    # When the output gradient of y is passed to op.grad, it should
    # be downcasted to float32, so dc_dx should also be float32
    assert dc_dx.dtype == 'float32'
def test_grad_constant(self):
    # Test that the gradient handles Constants and consider_constant
    # variables consistently
    x = theano.tensor.scalar()
    y = theano.tensor.scalar()
    z_x = x + y
    z_one = one + y
    g_x = theano.tensor.grad(z_x, x, consider_constant=[x])
    g_one = theano.tensor.grad(z_one, one)

    f = theano.function([x, y], [g_x, g_one])

    g_x, g_one = f(1, .5)

    if not np.allclose(g_x, g_one):
        raise AssertionError("Gradient using consider constant is " +
                             str(g_x) +
                             " but gradient with respect to the same Constant is " +
                             str(g_one))
def test_dxdx():
    # Tests that the gradient of a scalar with respect to itself is 1
    # I use an integer in this case because people keep changing this
    # gradient to be 0 on integers but according to our interpretation
    # of the gradient as defined in the Op contract, it should be 1.
    # If you feel the need to change this unit test you are probably
    # modifying the Op contract and should definitely get the approval
    # of multiple people on theano-dev.

    x = theano.tensor.iscalar()
    g = theano.tensor.grad(x, x)

    g = g.eval({x: 12})

    assert np.allclose(g, 1.)
def test_undefined_cost_grad():
    # Tests that if we say the cost is not differentiable via the
    # known_grads mechanism, it is treated as such by the rest of the
    # system.
    # This is so that Ops that are built around minigraphs like OpFromGraph
    # and scan can implement Op.grad by passing ograds to known_grads

    x = theano.tensor.iscalar()
    y = theano.tensor.iscalar()
    cost = x + y
    assert cost.dtype in theano.tensor.discrete_dtypes
    try:
        theano.tensor.grad(cost, [x, y], known_grads={cost: NullType()()})
    except theano.gradient.NullTypeGradError:
        return
    raise AssertionError("An undefined gradient has been ignored.")
def test_disconnected_cost_grad():
    # Tests that if we say the cost is disconnected via the
    # known_grads mechanism, it is treated as such by the rest of the
    # system.
    # This is so that Ops that are built around minigraphs like OpFromGraph
    # and scan can implement Op.grad by passing ograds to known_grads

    x = theano.tensor.iscalar()
    y = theano.tensor.iscalar()
    cost = x + y
    assert cost.dtype in theano.tensor.discrete_dtypes
    try:
        theano.tensor.grad(cost, [x, y],
                           known_grads={cost: gradient.DisconnectedType()()},
                           disconnected_inputs='raise')
    except theano.gradient.DisconnectedInputError:
        return
    raise AssertionError("A disconnected gradient has been ignored.")
def test_grad(self):
    T = theano.tensor
    a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = T.matrix('x')

    expressions_gradients = [
        (x * gradient.consider_constant(x), x),
        (x * gradient.consider_constant(T.exp(x)), T.exp(x)),
        (gradient.consider_constant(x), T.constant(0.)),
        (x**2 * gradient.consider_constant(x), 2 * x**2),
    ]

    for expr, expr_grad in expressions_gradients:
        g = gradient.grad(expr.sum(), x)
        # gradient according to theano
        f = theano.function([x], g, on_unused_input='ignore')
        # desired gradient
        f2 = theano.function([x], expr_grad, on_unused_input='ignore')

        assert np.allclose(f(a), f2(a))
def test_grad(self):
    T = theano.tensor
    a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = T.matrix('x')

    expressions_gradients = [
        (x * gradient.zero_grad(x), x),
        (x * gradient.zero_grad(T.exp(x)), T.exp(x)),
        (gradient.zero_grad(x), T.constant(0.)),
        (x**2 * gradient.zero_grad(x), 2 * x**2),
    ]

    for expr, expr_grad in expressions_gradients:
        g = gradient.grad(expr.sum(), x)
        # gradient according to theano
        f = theano.function([x], g, on_unused_input='ignore')
        # desired gradient
        f2 = theano.function([x], expr_grad, on_unused_input='ignore')

        assert np.allclose(f(a), f2(a))
def test_csm_grad(self):
    for sparsetype in ('csr', 'csc'):
        x = tensor.vector()
        y = tensor.ivector()
        z = tensor.ivector()
        s = tensor.ivector()
        call = getattr(sp, sparsetype + '_matrix')
        spm = call(random_lil((300, 400), config.floatX, 5))
        out = tensor.grad(dense_from_sparse(
            CSM(sparsetype)(x, y, z, s)).sum(), x)
        self._compile_and_check([x, y, z, s],
                                [out],
                                [spm.data, spm.indices, spm.indptr,
                                 spm.shape],
                                (CSMGrad, CSMGradC))
def test_other_grad_tests(self):
    x = theano.tensor.dmatrix()
    x_val1 = numpy.array([[1, 2, 3], [0, 5, 6], [0, 0, 9]],
                         dtype='float32')
    x_val2 = numpy.array([[1, 2, 0], [0, 5, 6], [7, 8, 9], [9, 10, 0]],
                         dtype='float32')
    rng = numpy.random.RandomState(43)

    p = Prod(axis=1)
    grad_p = theano.tensor.grad(p(x).sum(), x)
    grad_fn = theano.function([x], grad_p, mode=self.mode)
    assert numpy.allclose(grad_fn(x_val1),
                          [[6., 3., 2.], [30., 0., 0.], [0., 0., 0.]])
    assert numpy.allclose(grad_fn(x_val2),
                          [[0., 0., 2.], [30., 0., 0.], [72., 63., 56.],
                           [0., 0., 90.]])

    p_axis0 = Prod(axis=0)
    grad_p_axis0 = theano.tensor.grad(p_axis0(x).sum(), x)
    grad_fn_axis0 = theano.function([x], grad_p_axis0, mode=self.mode)
    assert numpy.allclose(grad_fn_axis0(x_val2),
                          [[0., 400., 0.], [63., 160., 0.],
                           [0., 100., 0.], [0., 80., 0.]])

    tensor.verify_grad(p, [x_val1], rng=rng, mode=self.mode)
def test_gt_grad():
    """A user test that failed.

    Something about it made Elemwise.grad return something that was
    too complicated for get_scalar_constant_value to recognize as being 0,
    so gradient.grad reported that it was not a valid gradient of an
    integer.
    """
    floatX = config.floatX
    T = theano.tensor

    input_ = T.vector(dtype=floatX)
    random_values = numpy.random.RandomState(1234).uniform(
        low=-1, high=1, size=(2, 2))
    W_values = numpy.asarray(random_values, dtype=floatX)
    W = theano.shared(value=W_values, name='weights')
    correct_score = T.dot(input_, W)
    wrong_input = T.vector(dtype=floatX)
    wrong_score = theano.clone(correct_score, {input_: wrong_input})
    # Hinge loss
    scores = T.ones_like(correct_score) - correct_score + wrong_score
    cost = (scores * (scores > 0)).sum()
    T.grad(cost, input_)
def test_grad_2d_inc_set_subtensor(self):
    for n_shape, m_shape in [
        [(2, 3), (2, 2)],
        [(3, 2), (2, 2)],
        [(3, 2), (1, 2)],
        [(3, 2), (2,)],
    ]:
        for op in [inc_subtensor, set_subtensor]:
            subi = 2
            data = numpy.asarray(rand(*n_shape), dtype=self.dtype)
            n = self.shared(data)
            z = scal.constant(subi)
            m = matrix('m', dtype=self.dtype)
            mv = numpy.asarray(rand(*m_shape), dtype=self.dtype)

            t = op(n[:z, :z], m)
            gn, gm = theano.tensor.grad(theano.tensor.sum(t), [n, m])
            utt.verify_grad(lambda m: op(n[:z, :z], m), [mv])
            utt.verify_grad(lambda nn: op(nn[:z, :z], mv), [data])
def test_grad_0d(self):
    data = numpy.asarray(rand(2, 3), dtype=self.dtype)
    n = self.shared(data)
    t = n[1, 0]
    gn = theano.tensor.grad(theano.tensor.sum(theano.tensor.exp(t)), n)
    f = self.function([], gn)
    topo = f.maker.fgraph.toposort()
    topo_ = [node for node in topo
             if not isinstance(node.op, self.ignore_topo)]
    if not self.fast_compile:
        assert_equal(len(topo_), 6)
    assert numpy.sum([isinstance(node.op, self.inc_sub)
                      for node in topo_]) == 1
    assert numpy.sum([isinstance(node.op, self.sub)
                      for node in topo_]) == 1

    gval = f()
    good = numpy.zeros_like(data)
    good[1, 0] = numpy.exp(data[1, 0])
    self.assertTrue(numpy.allclose(gval, good), (gval, good))
def test_err_bound_list(self):
    n = self.shared(numpy.ones((2, 3), dtype=self.dtype) * 5)
    l = lvector()
    t = n[l]
    # We test again AdvancedSubtensor1 as we transfer data to the cpu.
    self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))

    f = self.function([l], t, op=self.adv_sub1)

    # the grad
    g = self.function([l],
                      inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
                      op=self.adv_incsub1)

    for shp in [[0, 4], [0, -3], [-10]]:
        self.assertRaises(IndexError, f, shp)
        self.assertRaises(IndexError, g, shp)
def test_grad_advanced_inc_subtensor(self):
    def inc_slice(*s):
        def just_numeric_args(a, b):
            cost = (a[s] + b).sum()
            cost_wrt_a = theano.tensor.grad(cost, a)
            cost_wrt_b = theano.tensor.grad(cost, b)
            grads = cost_wrt_a.sum() + cost_wrt_b.sum()
            return grads
        return just_numeric_args

    # vector
    utt.verify_grad(
        inc_slice(slice(2, 4, None)),
        (numpy.asarray([0, 1, 2, 3, 4, 5.]),
         numpy.asarray([9, 9.]),))

    # matrix
    utt.verify_grad(
        inc_slice(slice(1, 2, None), slice(None, None, None)),
        (numpy.asarray([[0, 1], [2, 3], [4, 5.]]),
         numpy.asarray([[9, 9.]]),))

    # single element
    utt.verify_grad(
        inc_slice(2, 1),
        (numpy.asarray([[0, 1], [2, 3], [4, 5.]]),
         numpy.asarray(9.),))
def test_inc_adv_subtensor_with_broadcasting(self):
    if inplace_increment is None:
        raise inplace_increment_missing

    inc = dscalar()
    a = inc_subtensor(self.m[self.ix1, self.ix12], inc)
    g_inc = tensor.grad(a.sum(), inc)

    assert a.type == self.m.type, (a.type, self.m.type)
    f = theano.function([self.m, self.ix1, self.ix12, inc], [a, g_inc],
                        allow_input_downcast=True)
    aval, gval = f([[.4, .9, .1],
                    [5, 6, 7],
                    [.5, .3, .15]],
                   [1, 2, 1],
                   [0, 1, 0],
                   2.1)
    assert numpy.allclose(aval,
                          [[.4, .9, .1],
                           [5 + 2.1 * 2, 6, 7],
                           [.5, .3 + 2.1, .15]]), aval
    assert numpy.allclose(gval, 3.0), gval
def test_inc_adv_subtensor1_with_broadcasting(self):
    if inplace_increment is None:
        raise inplace_increment_missing

    inc = dscalar()
    a = inc_subtensor(self.m[self.ix1], inc)
    g_inc = tensor.grad(a.sum(), inc)

    assert a.type == self.m.type, (a.type, self.m.type)
    f = theano.function([self.m, self.ix1, inc], [a, g_inc],
                        allow_input_downcast=True)
    aval, gval = f([[.4, .9, .1],
                    [5, 6, 7],
                    [.5, .3, .15]],
                   [0, 1, 0],
                   2.1)
    assert numpy.allclose(aval,
                          [[.4 + 2.1 * 2, .9 + 2.1 * 2, .1 + 2.1 * 2],
                           [5 + 2.1, 6 + 2.1, 7 + 2.1],
                           [.5, .3, .15]]), aval
    assert numpy.allclose(gval, 9.0), gval
def test_grad_argmin(self):
    data = rand(2, 3)
    n = as_tensor_variable(data)
    n.name = 'n'

    # test grad of argmin
    utt.verify_grad(lambda v: argmin(v, axis=-1), [data])
    utt.verify_grad(lambda v: argmin(v, axis=[0]), [data])
    utt.verify_grad(lambda v: argmin(v, axis=[1]), [data])
    utt.verify_grad(lambda v: argmin(v.flatten()), [data])

    try:
        cost = argmin(n, axis=-1)
        cost.name = None
        g = grad(cost, n)
        raise Exception('Expected an error')
    except TypeError:
        pass
def test_grad_argmax(self):
    data = rand(2, 3)
    n = as_tensor_variable(data)

    # test grad of argmax
    utt.verify_grad(lambda v: argmax(v, axis=-1), [data])
    utt.verify_grad(lambda v: argmax(v, axis=[0]), [data])
    utt.verify_grad(lambda v: argmax(v, axis=[1]), [data])
    utt.verify_grad(lambda v: argmax(v.flatten()), [data])

    try:
        grad(argmax(n, axis=-1), n)
        raise Exception('Expected an error')
    except TypeError:
        pass
def test_join_matrix_ints(self):
    if "float32" in self.shared.__name__:
        raise SkipTest(
            "The shared variable constructor"
            " need to support other dtype then float32")

    # Test mixed dtype. There was a bug that caused crash in the past.
    av = numpy.array([[1, 2, 3], [4, 5, 6]], dtype='int8')
    bv = numpy.array([[7], [8]], dtype='int32')
    a = self.shared(av)
    b = as_tensor_variable(bv)
    s = join(1, a, b)
    want = numpy.array([[1, 2, 3, 7], [4, 5, 6, 8]], dtype='float32')
    out = self.eval_outputs_and_check_join([s])
    self.assertTrue((out == want).all())

    assert (numpy.asarray(grad(s.sum(), b).eval()) == 0).all()
    assert (numpy.asarray(grad(s.sum(), a).eval()) == 0).all()
def test1(self):
    s = scal.constant(56)
    t = as_tensor_variable(s)
    self.assertTrue(t.owner.op is tensor_from_scalar)
    self.assertTrue(t.type.broadcastable == (), t.type.broadcastable)
    self.assertTrue(t.type.ndim == 0, t.type.ndim)
    self.assertTrue(t.type.dtype == s.type.dtype)

    v = eval_outputs([t])

    self.assertTrue(v == 56, v)
    self.assertTrue(isinstance(v, numpy.ndarray))
    self.assertTrue(v.shape == (), v.shape)

    g = grad(t, s)
    self.assertTrue(eval_outputs([g]) == 0.)
def test2(self):
    s = scal.constant(56.)
    t = as_tensor_variable(s)
    self.assertTrue(t.owner.op is tensor_from_scalar)
    self.assertTrue(t.type.broadcastable == (), t.type.broadcastable)
    self.assertTrue(t.type.ndim == 0, t.type.ndim)
    self.assertTrue(t.type.dtype == s.type.dtype)

    v = eval_outputs([t])

    self.assertTrue(v == 56., v)
    self.assertTrue(isinstance(v, numpy.ndarray))
    self.assertTrue(v.shape == (), v.shape)

    g = grad(t, s)
    self.assertTrue(eval_outputs([g]) == 1.)
def test0(self):
    tt = constant(56)  # scal.constant(56)
    ss = scalar_from_tensor(tt)
    self.assertTrue(ss.owner.op is scalar_from_tensor)
    self.assertTrue(ss.type.dtype == tt.type.dtype)

    v = eval_outputs([ss])

    self.assertTrue(v == 56, v)
    if config.cast_policy == 'custom':
        self.assertTrue(isinstance(v, numpy.int16))
    elif config.cast_policy in ('numpy', 'numpy+floatX'):
        self.assertTrue(isinstance(
            v, getattr(numpy, str(numpy.asarray(56).dtype))))
    else:
        raise NotImplementedError(config.cast_policy)
    self.assertTrue(v.shape == (), v.shape)

    tt = lscalar()
    ss = scalar_from_tensor(tt)
    g = ss.owner.op.grad([tt], [ss])
    fff = function([tt], ss)
    v = fff(numpy.asarray(5))
    self.assertTrue(v == 5, v)
    self.assertTrue(isinstance(v, numpy.int64))
    self.assertTrue(v.shape == (), v.shape)
def test_grad_keep_type(self):
    """Tests that the theano grad method returns a list if it is passed a list
    and a single variable if it is passed a single variable.
    pylearn2 depends on theano behaving this way. This functionality has been
    added three times and erroneously removed twice. If you do anything that
    requires changing this test or making it fail you are almost certainly
    making a common mistake, NOT fixing something. """

    X = tensor.matrix()
    y = X.sum()

    G = tensor.grad(y, [X])

    assert isinstance(G, list)

    G = tensor.grad(y, X)

    assert not isinstance(G, list)
def test_tile_grad():

    def grad_tile(x, reps, np_x):
        y = tile(x, reps)
        z = y.sum()
        g = theano.function([x], grad(z, x))
        grad_res = g(np_x)
        # The gradient should be the product of the tiling dimensions
        # (since the gradients are additive through the tiling operation)
        assert numpy.all(grad_res == numpy.prod(reps))

    rng = numpy.random.RandomState(utt.fetch_seed())

    # test vector
    grad_tile(vector('x'), [3], rng.randn(5).astype(config.floatX))
    # test matrix
    grad_tile(matrix('x'), [3, 4], rng.randn(2, 3).astype(config.floatX))
    # test tensor3
    grad_tile(tensor3('x'), [3, 4, 5],
              rng.randn(2, 4, 3).astype(config.floatX))
    # test tensor4
    grad_tile(tensor4('x'), [3, 4, 5, 6],
              rng.randn(2, 4, 3, 5).astype(config.floatX))
def test_broadcast_grad():
    # rng = numpy.random.RandomState(utt.fetch_seed())
    x1 = T.tensor4('x')
    # x1_data = rng.randn(1, 1, 300, 300)
    sigma = T.scalar('sigma')
    # sigma_data = 20
    window_radius = 3

    filter_1d = T.arange(-window_radius, window_radius + 1)
    filter_1d = filter_1d.astype(theano.config.floatX)
    filter_1d = T.exp(-0.5 * filter_1d**2 / sigma ** 2)
    filter_1d = filter_1d / filter_1d.sum()

    filter_W = filter_1d.dimshuffle(['x', 'x', 0, 'x'])

    y = theano.tensor.nnet.conv2d(x1, filter_W, border_mode='full',
                                  filter_shape=[1, 1, None, None])
    theano.grad(y.sum(), sigma)
def test_local_softmax_grad_optimization_and_big_input(self):
    """Test the Logsoftmax's grad substitution.

    Check that Log(Softmax(x))'s grad is substituted with Logsoftmax(x)'s
    grad and that the new operation does not explode for big inputs.
    Note that only the grad is checked.
    """
    m = theano.config.mode
    m = theano.compile.get_mode(m)
    m.check_isfinite = False
    # some inputs that are large to make the gradient explode in the non
    # optimized case
    a = numpy.exp(
        10 * numpy.random.rand(5, 10).astype(theano.config.floatX))

    def myfunc(x):
        sm = tensor.nnet.softmax(x)
        logsm = tensor.log(sm)
        return logsm

    # We set step to 0.1 because for big values we need a big epsilon
    utt.verify_grad(myfunc, [a], eps=0.1, mode=m)
    sa = theano.shared(a)
    f = theano.function([], myfunc(sa))
    self.assertTrue(check_stack_trace(f, ops_to_check='all'))
def test_grad(self):
    c = T.matrix()
    p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0, 'x')

    # test that function contains softmax and softmaxgrad
    w = T.matrix()
    backup = config.warn.sum_div_dimshuffle_bug
    config.warn.sum_div_dimshuffle_bug = False
    try:
        g = theano.function([c, w], T.grad((p_y * w).sum(), c))
    finally:
        config.warn.sum_div_dimshuffle_bug = backup
    g_ops = [n.op for n in g.maker.fgraph.toposort()]

    # print '--- g ='
    # printing.debugprint(g)
    # print '==='

    raise SkipTest('Optimization not enabled for the moment')
    assert len(g_ops) == 2
    assert softmax_op in g_ops
    assert softmax_grad in g_ops
    g(self.rng.rand(3, 4), self.rng.uniform(.5, 1, (3, 4)))
def test_transpose_basic(self):
    # this should be a transposed softmax
    c = T.matrix()
    p_y = T.exp(c) / T.exp(c).sum(axis=0)

    # test that function contains softmax and no div.
    theano.function([c], p_y)
    # printing.debugprint(f)

    # test that function contains softmax and no div.
    backup = config.warn.sum_div_dimshuffle_bug
    config.warn.sum_div_dimshuffle_bug = False
    try:
        theano.function([c], T.grad(p_y.sum(), c))
    finally:
        config.warn.sum_div_dimshuffle_bug = backup
    # printing.debugprint(g)
    raise SkipTest('Optimization not enabled for the moment')
def test_sparseblockgemv_grad_shape(self):
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    go = theano.grad(o.sum(), [b, W, h])

    f = theano.function([W, h, iIdx, b, oIdx], go, mode=self.mode)

    W_val, h_val, iIdx_val, b_val, oIdx_val = \
        BlockSparse_Gemv_and_Outer.gemv_data()

    # just make sure that it runs correctly and all the shapes are ok.
    b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)

    assert b_g.shape == b_val.shape
    assert h_g.shape == h_val.shape
    assert W_g.shape == W_val.shape
def test_only_nonseq_inputs(self):
    # Compile the Theano function
    n_steps = 2
    inp = tensor.matrix()
    broadcasted_inp, _ = theano.scan(lambda x: x,
                                     non_sequences=[inp],
                                     n_steps=n_steps)
    out = broadcasted_inp.sum()
    gr = tensor.grad(out, inp)
    fun = theano.function([inp], [broadcasted_inp, gr])

    # Execute the Theano function and compare outputs to the expected outputs
    inputs = numpy.array([[1, 2], [3, 4]], dtype=theano.config.floatX)
    expected_out1 = numpy.repeat(inputs[None], n_steps, axis=0)
    expected_out2 = numpy.ones(inputs.shape, dtype="int8") * n_steps

    out1, out2 = fun(inputs)

    utt.assert_allclose(out1, expected_out1)
    utt.assert_allclose(out2, expected_out2)

# simple rnn, one input, one state, weights for each; input/state
# are vectors, weights are scalars
def test_verify_second_grad_sitsot(self):

    def get_sum_of_grad(inp):

        scan_outputs, updates = theano.scan(fn=lambda x: x * 2,
                                            outputs_info=[inp],
                                            n_steps=5)

        # Take the gradient of each output wrt its corresponding initial
        # state
        return theano.grad(scan_outputs.sum(), inp).sum()

    # Call verify_grad to ensure the correctness of the second gradients
    floatX = theano.config.floatX
    inputs_test_values = [numpy.random.random((3)).astype(floatX)]
    theano.tests.unittest_tools.verify_grad(get_sum_of_grad,
                                            inputs_test_values)