The following are 50 code examples, extracted from open source Python projects, that illustrate how to use theano.gradient().
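Before the examples, here is a minimal sketch of the basic pattern they all build on: define a symbolic expression and ask theano.gradient.grad (also exposed as theano.grad) for its derivative. This snippet is my own illustration, not taken from any of the projects below.

import theano
import theano.tensor as T

# Symbolic scalar input and a simple expression of it.
x = T.dscalar('x')
y = x ** 2 + 3 * x

# theano.grad is an alias for theano.gradient.grad; it returns a new
# symbolic expression for dy/dx.
gy = theano.grad(y, x)

f = theano.function([x], gy)
print(f(2.0))  # 2*x + 3 at x=2 -> 7.0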
def grad(self, inputs, g):
    # g[1:] is all integers, so their Jacobian in this op
    # is 0. We thus don't need to worry about what their values
    # are.

    # if g[0] is disconnected, then this op doesn't contribute
    # any gradient anywhere. but we know that at least one of
    # g[1:] is connected, or this grad method wouldn't have been
    # called, so we should report zeros
    (csm,) = inputs
    if isinstance(g[0].type, DisconnectedType):
        return [csm.zeros_like()]

    data, indices, indptr, shape = csm_properties(csm)
    return [CSM(csm.format)(g[0], indices, indptr, shape)]

# don't make this a function or it breaks some optimizations below
def perform(self, node, inputs, outputs):
    (a_indices, a_indptr, b, g_ab) = inputs
    (out,) = outputs
    g_a_data = numpy.zeros(a_indices.shape, dtype=g_ab.dtype)
    for i in xrange(len(a_indptr) - 1):  # loop over rows
        ind0 = a_indptr[i]
        ind1 = a_indptr[i + 1]
        # loop over values in that row (columns)
        for j_idx in xrange(ind0, ind1):
            j = a_indices[j_idx]
            # grad is dot product of i-th row of gradient with j-th row of b
            # Depending on the type of g_ab and b (sparse or dense),
            # the following dot product can result in a scalar or
            # a (1, 1) sparse matrix.
            dot_val = numpy.dot(g_ab[i], b[j].T)
            if isinstance(dot_val, scipy.sparse.spmatrix):
                dot_val = dot_val[0, 0]
            g_a_data[j_idx] = dot_val
    out[0] = g_a_data
def grad(self, inputs, g_outputs):
    r"""The gradient function should return

        .. math:: V\frac{\partial X^{-1}}{\partial X},

    where :math:`V` corresponds to ``g_outputs`` and :math:`X` to
    ``inputs``. Using the `matrix cookbook
    <http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=3274>`_,
    one can deduce that the relation corresponds to

        .. math:: (X^{-1} \cdot V^{T} \cdot X^{-1})^T.

    """
    x, = inputs
    xi = self(x)
    gz, = g_outputs
    # TT.dot(gz.T,xi)
    return [-matrix_dot(xi, gz.T, xi).T]
def grad(self, inp, cost_grad):
    """
    Notes
    -----
    The gradient is currently implemented for matrices only.

    """
    a, val = inp
    grad = cost_grad[0]
    if (a.dtype.startswith('complex')):
        return [None, None]
    elif a.ndim > 2:
        raise NotImplementedError('%s: gradient is currently implemented'
                                  ' for matrices only' %
                                  self.__class__.__name__)
    wr_a = fill_diagonal(grad, 0)  # valid for any number of dimensions
    # diag is only valid for matrices
    wr_val = theano.tensor.nlinalg.diag(grad).sum()
    return [wr_a, wr_val]
def binary_crossentropy(output, target):
    """
    Compute the crossentropy of binary random variables.

    Output and target are each expectations of binary random
    variables; target may be exactly 0 or 1 but output must
    lie strictly between 0 and 1.

    Notes
    -----
    We could use the x log y op to support output=0 and output=1.
    The gradient would still be undefined though.

    We do not sum, crossentropy is computed by component.
    TODO : Rewrite as a scalar, and then broadcast to tensor.

    """
    return -(target * tensor.log(output) +
             (1.0 - target) * tensor.log(1.0 - output))
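A short usage sketch for the helper above, assuming it is exposed as theano.tensor.nnet.binary_crossentropy (the values below are made up for illustration):

import numpy
import theano
import theano.tensor as T

pred = T.dvector('pred')      # model outputs, strictly inside (0, 1)
target = T.dvector('target')  # binary targets

# Element-wise crossentropy; sum to get a scalar cost for the gradient.
cost = T.nnet.binary_crossentropy(pred, target).sum()
gpred = theano.grad(cost, pred)

f = theano.function([pred, target], [cost, gpred])
print(f(numpy.array([0.9, 0.2]), numpy.array([1.0, 0.0])))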
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """
    GPU convolution gradient with respect to weight using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    FIXME parameters doc

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns_shp)
    out = gpu_alloc_empty(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
                     subsample=(1, 1, 1), conv_mode='conv'):
    """
    GPU convolution gradient with respect to weight using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    FIXME parameters doc

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns_shp)
    out = gpu_alloc_empty(*kerns_shp)
    return GpuDnnConv3dGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """
    GPU convolution gradient with respect to input using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    FIXME parameters doc

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    img_shp = theano.tensor.as_tensor_variable(img_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img_shp, kerns.shape)
    out = gpu_alloc_empty(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
                    subsample=(1, 1), conv_mode='conv'):
    """
    GPU convolution gradient with respect to input using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    FIXME parameters doc

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    img_shp = theano.tensor.as_tensor_variable(img_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img_shp, kerns.shape)
    out = gpu_alloc_empty(*img_shp)
    return GpuDnnConv3dGradI()(kerns, topgrad, out, desc)
def grad(self, inputs, gout):
    (cond, ift, iff) = inputs
    (gz,) = gout
    first_part = switch(cond, gz, 0.)
    second_part = switch(cond, 0., gz)

    out = self(cond, ift, iff)
    if out.type.dtype in discrete_types:
        first_part = 0.
        second_part = 0.

    # cond does affect the elements of the output so it is connected.
    # For the sake of making the gradient convenient we assume that
    # condition + epsilon always triggers the same branch as condition
    condition_grad = cond.zeros_like().astype(theano.config.floatX)

    return (condition_grad, first_part, second_part)
def grad(self, inputs, gout):
    (x, y) = inputs
    (gz,) = gout
    if gz.type in complex_types:
        # max is currently defined for complex_types,
        # but the gradient for complex is not.
        raise NotImplementedError()

    output = self(x, y)
    if output.type in discrete_types:
        return [x.zeros_like().astype(theano.config.floatX),
                y.zeros_like().astype(theano.config.floatX)]
    gx = eq(output, x) * gz
    gy = eq(output, y) * gz
    return (gx, gy)
def grad(self, inputs, gout):
    (x, y) = inputs
    (gz,) = gout
    if x.type in complex_types:
        raise NotImplementedError()

    # If the output of this op is discrete, then it is
    # locally flat everywhere, so the gradient through it is 0.
    # This is different from it not being connected
    # to the output; x/y is still a function of x
    # and y; it's just a step function.
    if all(a.dtype in discrete_types for a in (x, y)):
        return [x.zeros_like(), y.zeros_like()]

    first_part = gz / y

    if y.type in complex_types:
        raise NotImplementedError()

    second_part = -(gz * x) / (y * y)

    return first_part, second_part
def grad(self, inputs, gout):
    (y, x) = inputs
    (gz,) = gout
    if gz.type in complex_types:
        raise NotImplementedError()
    else:
        if self(x, y).type in discrete_types:
            if x.type in discrete_types:
                gx = x.zeros_like(dtype=theano.config.floatX)
            else:
                gx = x.zeros_like()
            if y.type in discrete_types:
                gy = y.zeros_like(dtype=theano.config.floatX)
            else:
                gy = y.zeros_like()
            return [gx, gy]

        # If the output is float, the gradient should flow,
        # even if the inputs are ints
        return [gz * x / (sqr(x) + sqr(y)),
                gz * neg(y) / (sqr(x) + sqr(y))]
def grad_not_implemented(op, x_pos, x, comment=""):
    """
    Return an un-computable symbolic variable of type `x.type`.

    If any call to tensor.grad results in an expression containing this
    un-computable variable, an exception (NotImplementedError) will be
    raised indicating that the gradient on the
    `x_pos`'th input of `op` has not been implemented. Likewise if
    any call to theano.function involves this variable.

    Optionally adds a comment to the exception explaining why this
    gradient is not implemented.
    """

    return (NullType((
        "This variable is Null because the grad method for "
        "input %s (%s) of the %s op is not implemented. %s"
    ) % (x_pos, x, op, comment)))()
def grad_undefined(op, x_pos, x, comment=""):
    """
    Return an un-computable symbolic variable of type `x.type`.

    If any call to tensor.grad results in an expression containing this
    un-computable variable, an exception (GradUndefinedError) will be
    raised indicating that the gradient on the
    `x_pos`'th input of `op` is mathematically undefined. Likewise if
    any call to theano.function involves this variable.

    Optionally adds a comment to the exception explaining why this
    gradient is not defined.
    """

    return (NullType((
        "This variable is Null because the grad method for "
        "input %s (%s) of the %s op is mathematically undefined. %s"
    ) % (x_pos, x, op, comment)))()
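grad_not_implemented and grad_undefined are meant to be returned from an Op's grad method. The following toy Op is a hedged sketch of that pattern; the Op itself is invented for illustration and is not part of Theano.

import numpy
import theano
import theano.tensor as T
from theano.gradient import grad_undefined


class ToyArgmax(theano.Op):
    # Toy op returning the index of the maximum of a vector, used only
    # to show how an undefined gradient is reported.
    __props__ = ()

    def make_node(self, x):
        x = T.as_tensor_variable(x)
        return theano.Apply(self, [x], [T.lscalar()])

    def perform(self, node, inputs, output_storage):
        (x,) = inputs
        output_storage[0][0] = numpy.int64(numpy.argmax(x))

    def grad(self, inputs, output_grads):
        # The index jumps discontinuously as x changes, so the gradient
        # w.r.t. x is mathematically undefined; report that explicitly.
        # Any later call to theano.grad through this Op then raises an
        # informative error instead of returning nonsense.
        return [grad_undefined(self, 0, inputs[0],
                               "argmax is piecewise constant in x")]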
def abs_rel_errors(self, g_pt):
    """Return the abs and rel error of gradient estimate `g_pt`

    `g_pt` must be a list of ndarrays of the same length as self.gf,
    otherwise a ValueError is raised.

    Corresponding ndarrays in `g_pt` and `self.gf` must have the same
    shape or ValueError is raised.

    """
    if len(g_pt) != len(self.gf):
        raise ValueError('argument has wrong number of elements',
                         len(g_pt))
    errs = []
    for i, (a, b) in enumerate(zip(g_pt, self.gf)):
        if a.shape != b.shape:
            raise ValueError('argument element %i has wrong shape %s' % (
                i, str((a.shape, b.shape))))
        errs.append(numeric_grad.abs_rel_err(a, b))
    return errs
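The numeric_grad machinery above is what theano.gradient.verify_grad uses to compare finite-difference estimates against the symbolic gradient. A hedged sketch of calling verify_grad directly on a simple expression:

import numpy
import theano.tensor as T
from theano.gradient import verify_grad

rng = numpy.random.RandomState(42)

# verify_grad compiles the function, computes both the numeric and the
# symbolic gradient at the test point, and raises GradientError if they
# disagree beyond tolerance.
verify_grad(lambda x: T.sqr(x).sum(),
            [rng.rand(3, 4).astype('float64')],
            rng=rng)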
def zero_grad(x):
    """
    Consider an expression constant when computing gradients.

    The expression itself is unaffected, but when its gradient is
    computed, or the gradient of another expression that this
    expression is a subexpression of, it will be backpropagated
    through with a value of zero. In other words, the gradient of
    the expression is truncated to 0.

    :param x: A Theano expression whose gradient should be truncated.

    :return: The expression is returned unmodified, but its gradient
        is now truncated to 0.
    """
    return zero_grad_(x)
def disconnected_grad(x):
    """
    Consider an expression constant when computing gradients,
    while effectively not backpropagating through it.

    The expression itself is unaffected, but when its gradient is
    computed, or the gradient of another expression that this
    expression is a subexpression of, it will not be backpropagated
    through. This is effectively equivalent to truncating the gradient
    expression to 0, but is executed faster than zero_grad(), which still
    has to go through the underlying computational graph related to the
    expression.

    :param x: A Theano expression whose gradient should not be
        backpropagated through.

    :return: The expression is returned unmodified, but its gradient
        is now effectively truncated to 0.
    """
    return disconnected_grad_(x)
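A quick sketch (my own, not from the projects above) of how these two helpers behave when gradients are taken through them:

import theano
import theano.tensor as T
from theano.gradient import zero_grad, disconnected_grad

x = T.dscalar('x')

# Both treat their argument as a constant for differentiation, so
# d/dx [x * zero_grad(x)] is just zero_grad(x), which evaluates to x.
g1 = theano.grad(x * zero_grad(x), x)

# disconnected_grad gives the same value here, but cuts the graph
# instead of backpropagating zeros through it.
g2 = theano.grad(x * disconnected_grad(x), x)

f = theano.function([x], [g1, g2])
print(f(3.0))  # both gradients evaluate to 3.0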
def grad(self, inputs, ograds):
    ref, values, ref_dim, val_dim = inputs[:4]
    hash_struct = inputs[4:]
    ograd = ograds[0]

    ref_dim = get_scalar_constant_value(ref_dim)
    val_dim = get_scalar_constant_value(val_dim)

    def _conv(x):
        return GaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)

    # Since the kernels are separable and symmetric, the gradient w.r.t.
    # input is just the same filtering applied to the output grads.
    grad_i = _conv(ograd)

    def _gradr(r_i, vals, og, *args):
        return (og * (_conv(vals*r_i) - r_i*_conv(vals)) +
                vals * (_conv(og*r_i) - r_i*_conv(og)))

    grad_r, _ = theano.scan(fn=_gradr, sequences=[ref],
                            non_sequences=[values, ograd] + hash_struct,
                            outputs_info=None)

    grad_r = grad_r.sum(axis=1, acc_dtype="float32")

    grads = [DisconnectedType()() for i in range(len(inputs))]
    grads[0] = grad_r
    grads[1] = grad_i

    return grads
def grad(self, inputs, ograds):
    ref, values, ref_dim, val_dim = inputs[:4]
    hash_struct = inputs[4:]
    ograd = ograds[0]

    ref_dim = get_scalar_constant_value(ref_dim)
    val_dim = get_scalar_constant_value(val_dim)

    def _conv(x):
        return GpuGaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)

    # Since the kernels are separable and symmetric, the gradient w.r.t.
    # input is just the same filtering applied to the output grads.
    grad_i = _conv(ograd)

    def _gradr(r_i, vals, og, *args):
        return (og * (_conv(vals*r_i) - r_i*_conv(vals)) +
                vals * (_conv(og*r_i) - r_i*_conv(og)))

    grad_r, _ = theano.scan(fn=_gradr, sequences=[ref],
                            non_sequences=[values, ograd] + hash_struct,
                            outputs_info=None)

    grad_r = grad_r.sum(axis=1, acc_dtype="float32")

    grads = [DisconnectedType()() for i in range(len(inputs))]
    grads[0] = grad_r
    grads[1] = grad_i

    return grads
def grad(self, inp, grads):
    img, ws, stride, pad = inp
    grad, = grads

    grad = gpu_contiguous(grad)

    out = self(img, ws, stride, pad)

    g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)

    return (g_out, theano.gradient.DisconnectedType()(),
            theano.gradient.DisconnectedType()(),
            theano.gradient.DisconnectedType()())
def L_op(self, inputs, outputs, output_grads):
    desc, w, x, hx = inputs[:4]
    cx = inputs[4] if len(inputs) == 5 else None
    reserve, y, hy = outputs[:3]
    _, dy, dhy = output_grads[:3]
    dcy = output_grads[3] if len(output_grads) == 4 else None
    # Since the op returns two outputs which contain essentially
    # the same information, the user will most likely only use one
    # of them. This leads to the situation that the other is
    # considered "disconnected" by theano in the gradient.
    # However we know that this isn't really the case so we fix it
    # here.

    # If all the ys are disconnected, then you get a boring
    # gradient instead of an error. But in that case you
    # shouldn't call this method anyway.
    if isinstance(dy.type, DisconnectedType):
        dy = as_gpuarray_variable(y.zeros_like(),
                                  context_name=y.type.context_name)
    if isinstance(dhy.type, DisconnectedType):
        dhy = None
    if dcy and isinstance(dcy.type, DisconnectedType):
        dcy = None
    dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode,
                                  grad_h=(dhy is not None),
                                  grad_c=(dcy is not None))(
        desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
    reserve2, dx, dhx = dinputs[:3]
    dw = GpuDnnRNNGradWeights()(
        desc, x, hx, y, reserve2, w)
    res = [DisconnectedType()(), dw, dx, dhx]
    if cx is not None:
        res.append(dinputs[3])  # dcx
    return res
def grad(self, inputs, output_grads):
    gout, = output_grads
    s = inputs[1]
    gf = curfft_op(gout, s)
    # Multiply the last dimension of the gradient by 2, they represent
    # both positive and negative frequencies, except the first
    # and last elements (for even transforms) which are unique.
    idx = [slice(None)] * (gf.ndim - 2) \
        + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
    gf = T.set_subtensor(gf[idx], gf[idx] * 2)
    return [gf, DisconnectedType()()]
def test_Rop_dot_bug_18Oct2013_Jeremiah(self):
    # This test refers to a bug reported by Jeremiah Lowin on 18th Oct
    # 2013. The bug occurred when, through a dot operation, there was only
    # one differentiable path (i.e. there is no gradient wrt one of
    # the inputs).
    x = tensor.arange(20.0).reshape([1, 20])
    v = theano.shared(numpy.ones([20]))
    d = tensor.dot(x, v).sum()
    tensor.Rop(tensor.grad(d, v), v, v)
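For context, a minimal sketch of the R-operator this test exercises: theano.tensor.Rop(y, x, v) builds the Jacobian-times-vector product (dy/dx)·v symbolically.

import numpy
import theano
import theano.tensor as T

x = T.dvector('x')
v = T.dvector('v')       # direction vector
y = T.sum(x ** 2)        # scalar function of x

# Directional derivative of y along v: (dy/dx) . v = 2 * sum(x * v)
jv = T.Rop(y, x, v)

f = theano.function([x, v], jv)
print(f(numpy.ones(3), numpy.ones(3)))  # 6.0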
def grad(self, inputs, g_outputs):
    x, ind1, ind2 = inputs
    gout, = g_outputs
    return [get_item_2lists_grad(x, ind1, ind2, gout),
            grad_undefined(self, 1, ind1, "No gradient for this input"),
            grad_undefined(self, 1, ind2, "No gradient for this input")]
def dot(x, y):
    """
    Operation for efficiently calculating the dot product when
    one or both operands are sparse. Supported formats are CSC and CSR.
    The output of the operation is dense.

    Parameters
    ----------
    x
        Sparse or dense matrix variable.
    y
        Sparse or dense matrix variable.

    Returns
    -------
    The dot product `x`.`y` in a dense format.

    Notes
    -----
    The grad implemented is regular, i.e. not structured.

    At least one of `x` or `y` must be a sparse matrix.

    When the operation has the form dot(csr_matrix, dense)
    the gradient of this operation can be performed inplace
    by UsmmCscDense. This leads to significant speed-ups.

    """
    if hasattr(x, 'getnnz'):
        x = as_sparse_variable(x)
    if hasattr(y, 'getnnz'):
        y = as_sparse_variable(y)

    x_is_sparse_variable = _is_sparse_variable(x)
    y_is_sparse_variable = _is_sparse_variable(y)

    if not x_is_sparse_variable and not y_is_sparse_variable:
        raise TypeError()

    return _dot(x, y)
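A hedged usage sketch for the sparse dot above, assuming the standard theano.sparse module and a CSR input:

import numpy
import scipy.sparse
import theano
import theano.sparse as sparse
import theano.tensor as T

x = sparse.csr_matrix('x', dtype='float64')  # symbolic sparse matrix
w = T.dmatrix('w')                           # dense matrix

# Sparse-by-dense product; the result is a dense tensor.
y = sparse.dot(x, w)
cost = y.sum()

# The gradient wrt the dense operand is a regular (non-structured) grad.
gw = theano.grad(cost, w)

f = theano.function([x, w], [y, gw])
xv = scipy.sparse.csr_matrix(numpy.eye(3))
wv = numpy.arange(9.0).reshape(3, 3)
print(f(xv, wv))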
def grad(self, inputs, g_outputs):
    r"""The gradient function should return

        .. math:: \sum_n\left(W_n\frac{\partial\,w_n}{\partial a_{ij}} +
                  \sum_k V_{nk}\frac{\partial\,v_{nk}}{\partial a_{ij}}\right),

    where [:math:`W`, :math:`V`] corresponds to ``g_outputs``,
    :math:`a` to ``inputs``, and :math:`(w, v)=\mbox{eig}(a)`.

    Analytic formulae for eigensystem gradients are well-known in
    perturbation theory:

        .. math:: \frac{\partial\,w_n}{\partial a_{ij}} = v_{in}\,v_{jn}

        .. math:: \frac{\partial\,v_{kn}}{\partial a_{ij}} =
                  \sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}

    """
    x, = inputs
    w, v = self(x)
    # Replace gradients wrt disconnected variables with
    # zeros. This is a work-around for issue #1063.
    gw, gv = _zero_disconnected([w, v], g_outputs)
    return [EighGrad(self.UPLO)(x, w, v, gw, gv)]
def perform(self, node, inputs, outputs):
    """
    Implements the "reverse-mode" gradient for the eigensystem of
    a square matrix.

    """
    x, w, v, W, V = inputs
    N = x.shape[0]
    outer = numpy.outer

    def G(n):
        return sum(v[:, m] * V.T[n].dot(v[:, m]) / (w[n] - w[m])
                   for m in xrange(N) if m != n)

    g = sum(outer(v[:, n], v[:, n] * W[n] + G(n))
            for n in xrange(N))

    # Numpy's eigh(a, 'L') (eigh(a, 'U')) is a function of tril(a)
    # (triu(a)) only.  This means that partial derivative of
    # eigh(a, 'L') (eigh(a, 'U')) with respect to a[i,j] is zero
    # for i < j (i > j).  At the same time, non-zero components of
    # the gradient must account for the fact that variation of the
    # opposite triangle contributes to variation of two elements
    # of Hermitian (symmetric) matrix. The following line
    # implements the necessary logic.
    out = self.tri0(g) + self.tri1(g).T

    # The call to self.tri0 in perform upcast from float32 to
    # float64 or from int* to int64 in numpy 1.6.1 but not in
    # 1.6.2. We do not want version dependent dtype in Theano.
    # We think it should be the same as the output.
    outputs[0][0] = numpy.asarray(out, dtype=node.outputs[0].dtype)
def grad(self, inputs, output_gradients):
    num_ins = len(inputs)
    if num_ins == 3:
        x, v, sorter = inputs
    else:
        x, v = inputs

    x_grad = gradient._float_zeros_like(x)
    v_grad = gradient._float_zeros_like(v)
    if num_ins == 3:
        return [x_grad, v_grad, disconnected_type()]
    else:
        return [x_grad, v_grad]
def grad(self, inp, cost_grad):
    """
    Notes
    -----
    The gradient is currently implemented for matrices only.

    """
    a, val, offset = inp
    grad = cost_grad[0]
    height, width = grad.shape

    if (a.dtype.startswith('complex')):
        return [None, None]

    # only valid for matrices
    wr_a = fill_diagonal_offset(grad, 0, offset)

    offset_abs = basic.abs_(offset)
    pos_offset_flag = basic.ge(offset, 0)
    neg_offset_flag = basic.lt(offset, 0)
    min_wh = basic.minimum(width, height)

    start = offset * pos_offset_flag + offset_abs * width * neg_offset_flag
    num_of_step = basic.minimum(min_wh,
                                width * pos_offset_flag +
                                height * neg_offset_flag - offset_abs)

    step = a.shape[1] + 1
    end = start + step * num_of_step

    # input of slice should be integer
    start = basic.cast(start, 'int32')
    step = basic.cast(step, 'int32')
    end = basic.cast(end, 'int32')

    wr_val = grad.flatten()[start:end:step].sum()

    wr_offset = theano.gradient.grad_undefined(
        self, 2, offset,
        "offset is not defined for non-integer offset so"
        " fill_diagonal_offset(a,val,offset+eps) is undefined")

    return [wr_a, wr_val, wr_offset]
def grad(self, inp, grads):
    s, = inp
    dt, = grads
    if s.type.dtype in float_dtypes:
        assert dt.type.dtype in float_dtypes
        return [scalar_from_tensor(dt)]

    # If the input dtype is an integer, then so is the output dtype,
    # and the "zero" gradient can be represented in that int dtype.
    # Currently, theano.grad insists that the dtype of the returned
    # gradient has a float dtype, so we use floatX.
    if s.type.dtype in discrete_dtypes:
        return [s.zeros_like().astype(theano.config.floatX)]

    raise NotImplementedError("grad not implemented for complex dtypes")
def clip(x, min, max):
    """
    Clip x to be between min and max.

    Notes
    -----
    When `x` is equal to the boundaries, the output is considered
    to be `x`, so at these points, the gradient of the cost wrt the output
    will be propagated to `x`, not to `min` nor `max`. In other words,
    on these points, the gradient wrt `x` will be equal to the gradient wrt
    the output, and the gradient wrt `min` and `max` will be zero.

    """
    # see decorator for function body
    # for grep: clamp, bound
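A small sketch illustrating the boundary behaviour described in the notes above (my own example):

import theano
import theano.tensor as T

x = T.dscalar('x')
lo = T.dscalar('lo')
hi = T.dscalar('hi')

y = T.clip(x, lo, hi)
gx, glo, ghi = theano.grad(y, [x, lo, hi])

f = theano.function([x, lo, hi], [gx, glo, ghi])
# At the boundary x == hi the gradient is routed to x, not to hi.
print(f(1.0, 0.0, 1.0))  # -> [1.0, 0.0, 0.0]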
def grad(self, axis_and_tensors, grads):
    """ The gradient wrt a join op is a `Split`, used to partition
    the gradient along the `axis` which was used for joining.
    """
    gz, = grads
    axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]

    rval = [grad_undefined(self, 0, axis)]

    dtypes = [as_tensor_variable(x).type.dtype for x in tensors]
    out_dtype = scal.upcast(*dtypes)

    if 'float' in out_dtype or 'complex' in out_dtype:
        # assume that this is differentiable
        split = Split(len(tensors))
        split_gz = split(gz, axis, stack([shape(x)[axis]
                                          for x in tensors]))
        # If there is only one split, it might not be in a list.
        if not isinstance(split_gz, list):
            split_gz = [split_gz]
        # Split.make_node isn't always able to infer the right
        # broadcast. As the grad needs to keep the information,
        # read it if needed.
        split_gz = [patternbroadcast(g, t.broadcastable)
                    for t, g in zip(tensors, split_gz)]
        rval = rval + split_gz
    else:
        # the output has integer type, so the gradient through it
        # is 0
        rval = rval + [tensor.zeros_like(dtype=config.floatX)
                       for tensor in tensors]

    return rval
def grad(self, inputs, output_gradients):
    # If the output is of an integer dtype, no gradient shall pass
    if 'int' in self.dtype:
        return [ipt.zeros_like().astype(theano.config.floatX)
                for ipt in inputs]

    grads = []
    for i, inp in enumerate(inputs):
        grads.append(output_gradients[0][i])
    return grads
def local_grad_clip(node):
    if isinstance(node.op, theano.gradient.GradClip):
        return node.inputs
def R_op(self, inputs, eval_points):
    outs = self(*inputs, **dict(return_list=True))
    rval = [None for x in outs]
    # For each output
    for idx, out in enumerate(outs):
        # make such that _bgrads computes only the gradients of the
        # current output on the inputs ( and not all outputs)
        ograds = [x.zeros_like() for x in outs]
        ograds[idx] = theano.tensor.ones_like(out)

        bgrads = self._bgrad(inputs, ograds)
        rop_out = None

        for jdx, (inp, eval_point) in enumerate(izip(inputs,
                                                     eval_points)):
            # if None, then we can just ignore this branch ..
            # what we do is to assume that for any non-differentiable
            # branch, the gradient is actually 0, which I think is not
            # the right thing to do .. have to talk to Ian and James
            # about it
            if bgrads[jdx] is None or \
                    isinstance(bgrads[jdx].type, DisconnectedType):
                pass
            elif eval_point is not None:
                if rop_out is None:
                    rop_out = bgrads[jdx] * eval_point
                else:
                    rop_out = rop_out + bgrads[jdx] * eval_point

        rval[idx] = rop_out

    return rval
def grad(self, inp, grads):
    dy, sm, y_idx = inp
    g_dx, = grads
    # TODO: currently we do not compute the gradient w.r.t. dy, because
    # advanced indexing is not working yet. When it works, do it to avoid
    # potentially misleading behavior in gradient computations! (although
    # typically we should not need the gradient w.r.t. dy).
    y_idx_range = tensor.arange(y_idx.shape[0])
    g_dy = tensor.sum(
        g_dx * subtensor.AdvancedIncSubtensor()(
            sm, tensor.fill(dy, -1), y_idx_range, y_idx),
        axis=1)
    g_sm = dy.dimshuffle(0, 'x') * g_dx
    g_y_idx = grad_not_implemented(self, 2, y_idx)
    return [g_dy, g_sm, g_y_idx]
def grad(self, inputs, gout):
    (x, y) = inputs
    (gz,) = gout
    if gz.type in complex_types:
        # min is currently defined for complex_types,
        # but the gradient for complex is not.
        raise NotImplementedError()

    output = minimum(x, y)
    if output.type in discrete_types:
        return [x.zeros_like().astype(theano.config.floatX),
                y.zeros_like().astype(theano.config.floatX)]
    gx = eq(output, x) * gz
    gy = eq(output, y) * gz
    return (gx, gy)
def grad(self, inputs, gout):
    (x, y) = inputs
    (gz,) = gout
    z = self(x, y)
    if z.type.dtype in discrete_types:
        # The gradient does not flow in if the output is discrete
        return [x.zeros_like(dtype=theano.config.floatX),
                y.zeros_like(dtype=theano.config.floatX)]
    return [gz,
            -(x // y) * gz]
def __str__(self):
    # args may have been inserted by e.g. makeTester
    args_msg = ", ".join(str(a) for a in self.args)
    return """\
GradientError: numeric gradient and analytic gradient exceed tolerance:
        At position %i of argument %i,
            abs. error = %f,  abs. tolerance = %f
            rel. error = %f,  rel. tolerance = %f
Exception args: %s""" % (self.err_pos, self.arg,
                         self.abs_err, self.abs_tol,
                         self.rel_err, self.rel_tol,
                         args_msg)
def grad_clip(x, lower_bound, upper_bound):
    """
    This op does a view in the forward pass, but clips the gradient.

    This is an elemwise operation.

    :param x: the variable whose gradient inputs we want clipped
    :param lower_bound: The lower bound of the gradient value
    :param upper_bound: The upper bound of the gradient value.

    :examples:

        x = theano.tensor.scalar()

        z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
        z2 = theano.tensor.grad(x**2, x)

        f = theano.function([x], outputs = [z, z2])

        print(f(2.0))  # output (1.0, 4.0)

    :note: We register an opt in tensor/opt.py that removes the GradClip,
       so it has 0 cost in the forward pass and only does work in the grad.

    """
    return GradClip(lower_bound, upper_bound)(x)
def structured_dot(x, y):
    """
    Structured Dot is like dot, except that only the
    gradient wrt non-zero elements of the sparse matrix
    `a` is calculated and propagated.

    The output is presumed to be a dense matrix, and is represented by a
    TensorType instance.

    Parameters
    ----------
    a
        A sparse matrix.
    b
        A sparse or dense matrix.

    Returns
    -------
    A sparse matrix
        The dot product of `a` and `b`.

    Notes
    -----
    The grad implemented is structured.

    """

    # @todo: Maybe the triple-transposition formulation (when x is dense)
    # is slow. See if there is a direct way to do this.
    # (JB 20090528: Transposing tensors and sparse matrices is constant-time,
    # inplace, and fast.)

    if hasattr(x, 'getnnz'):
        x = as_sparse_variable(x)
        assert x.format in ["csr", "csc"]
    if hasattr(y, 'getnnz'):
        y = as_sparse_variable(y)
        assert y.format in ["csr", "csc"]

    x_is_sparse_variable = _is_sparse_variable(x)
    y_is_sparse_variable = _is_sparse_variable(y)
    if not x_is_sparse_variable and not y_is_sparse_variable:
        raise TypeError('structured_dot requires at least one sparse argument')

    if x_is_sparse_variable:
        return _structured_dot(x, y)
    else:
        assert y_is_sparse_variable
        return _structured_dot(y.T, x.T).T
def grad(self, inp, grads):
    # The strict sense mathematical gradient of the maximum function is
    # not calculated here for it is not defined at every point where some
    # coordinates are identical. However, since the latter set has null
    # Lebesgue measure, the result may be interpreted as weak gradient.

    # @note: This function should work correctly for L{vector}s.
    # (x, y), (gz, gw)
    # gz*dz/dx + gw*dw/dx, gz*dz/dy + gw*dw/dy
    # gMax * dMax/dx + gArgMax * dArgMax/dx,
    # gMax * dMax/daxis + gArgMax * dArgMax/daxis
    # g_max has one less dimension than x, so you need to complete
    # g_max to x's shape when axis=0 the broadcasting mechanism
    # does it automatically
    x, axis = inp
    g_max, g_max_idx = grads

    g_max_disconnected = isinstance(g_max.type, DisconnectedType)
    g_max_idx_disconnected = isinstance(g_max_idx.type, DisconnectedType)

    # if the op is totally disconnected, so are its inputs
    if g_max_disconnected and g_max_idx_disconnected:
        return [DisconnectedType()(), DisconnectedType()()]

    axis_grad = grad_undefined(
        self, 1, axis,
        "argmax is not defined for non-integer axes so"
        " argmax(x, axis+eps) is undefined")

    # if the max is disconnected but the argmax is not,
    # the gradient on its inputs is zero
    if g_max_disconnected:
        return [x.zeros_like(), axis_grad]
    if NoneConst.equals(axis):
        axis_ = list(range(x.ndim))
    else:
        axis_ = axis
    xmax = max(x, axis_)

    # Raise the g_max and xmax to the same number of dim as the input.
    pattern = []
    out_dim = 0
    if NoneConst.equals(axis):
        # We are taking the max/argmax over all dimensions.
        axis = None
    for i in xrange(x.ndim):
        if axis is None or i in axis.data:
            pattern.append('x')
        else:
            pattern.append(out_dim)
            out_dim += 1
    g_max_pad = DimShuffle(g_max.broadcastable, pattern)(g_max)
    xmax_pad = DimShuffle(xmax.broadcastable, pattern)(xmax)

    # Set the grad to the correct position.
    g_x = eq(xmax_pad, x) * g_max_pad
    return g_x, axis_grad
def flatten(x, outdim=1):
    """
    Reshapes the variable x by keeping
    the first outdim-1 dimension size(s) of x the same,
    and making the last dimension size of x equal to
    the multiplication of its remaining dimension size(s).

    Parameters
    ----------
    x : theano.tensor.var.TensorVariable
        the variable that should be reshaped.

    outdim : int
        the number of dimensions of the returned variable

    Returns
    -------
    theano.tensor.var.TensorVariable
        the flattened variable with dimensionality of outdim
    """
    # Any input variable can be flattened to have outdim of 1,
    # even if it's a scalar. Otherwise, outdim must be positive
    # and smaller than x.ndim.
    if outdim < 1 or (outdim > 1 and outdim > x.ndim):
        raise ValueError('outdim %s out of bound [1, %d)'
                         % (outdim, x.ndim + 1))

    if outdim > 1:
        dims = tuple(x.shape[:outdim - 1]) + (-1,)
    else:
        dims = (-1,)
    x_reshaped = x.reshape(dims)
    bcast_kept_dims = x.broadcastable[:outdim - 1]
    bcast_new_dim = python_all(x.broadcastable[outdim - 1:])
    broadcastable = bcast_kept_dims + (bcast_new_dim,)
    x_reshaped = theano.tensor.addbroadcast(
        x_reshaped, *filter(lambda i: broadcastable[i], range(outdim)))
    return x_reshaped


# class TileGrad(Op):
#     """
#     Calculates the gradient of the Tile Op.
#     """
#     # this is so weird, I can't think of how to make this a general thing.
#     def make_node(self, x, reps, g_out):
#         return gof.Apply(self, [x, reps, g_out], [x.type()])
#
#     def perform(self, node, inp, out):
#         x, reps, g_out = inp
#         gx, = out
#         xsh = x.shape
#         if len(reps) == 2 and reps[1] == 1 and len(x.shape) == 1:
#             gx[0] = numpy.sum(g_out, axis=0)
#         else:
#             raise NotImplementedError('x.shape, reps combination not '
#                                       'supported', (x.shape, reps))
#
# tilegrad = TileGrad()
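A short usage sketch for flatten, assuming it is exposed as theano.tensor.flatten:

import numpy
import theano
import theano.tensor as T

x = T.dtensor3('x')

# Keep the first dimension, collapse the rest: (2, 3, 4) -> (2, 12)
y = T.flatten(x, outdim=2)

f = theano.function([x], y.shape)
print(f(numpy.zeros((2, 3, 4))))  # [ 2 12]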
def grad(self, inp, grads):
    x, y, inverse = inp
    gz, = grads
    # First, compute the gradient wrt the broadcasted x.
    # If 'inverse' is False (0), apply the inverse of y on gz.
    # Else, apply y on gz.
    gx = permute_row_elements(gz, y, eq(inverse, 0))

    # If x has been broadcasted along some axes, we need to sum
    # the gradient over these axes, but keep the dimension (as
    # broadcastable)
    broadcasted_dims = [dim for dim in xrange(gz.type.ndim)
                        if x.type.broadcastable[dim] and
                        not gz.type.broadcastable[dim]]
    gx = Sum(axis=broadcasted_dims)(gx)

    # Sum(...) removed the dimensions in broadcasted_dims,
    # so we need to put them back.
    newdims = []
    i = 0
    for dim in xrange(gz.type.ndim):
        if dim in broadcasted_dims:
            newdims.append('x')
        else:
            newdims.append(i)
            i += 1

    gx = DimShuffle(gx.type.broadcastable, newdims)(gx)
    assert gx.type.broadcastable == x.type.broadcastable

    # if x is an integer type, then so is the output.
    # this means f(x+eps) = f(x) so the gradient with respect
    # to x is zero
    if x.type.dtype.find('int') != -1:
        gx = x.zeros_like()

    # The elements of y and of inverse both affect the output,
    # so they are connected to the output,
    # and the transformation isn't defined if their values
    # are non-integer, so the gradient with respect to them is
    # undefined
    return [gx, grad_undefined(self, 1, y),
            grad_undefined(self, 1, inverse)]
def _bgrad(self, inputs, ograds):
    # returns grad, with respect to broadcasted versions of inputs

    prev_setting = theano.config.compute_test_value

    try:
        theano.config.compute_test_value = 'off'

        def as_scalar(t):
            if isinstance(t.type, (NullType, DisconnectedType)):
                return t
            return get_scalar_type(t.type.dtype)()

        scalar_inputs = list(map(as_scalar, inputs))
        scalar_ograds = list(map(as_scalar, ograds))
        scalar_igrads = self.scalar_op.grad(scalar_inputs, scalar_ograds)
        for igrad in scalar_igrads:
            assert igrad is not None, self.scalar_op

    finally:
        theano.config.compute_test_value = prev_setting

    if not isinstance(scalar_igrads, (list, tuple)):
        raise TypeError('%s.grad returned %s instead of list or tuple' %
                        (str(self.scalar_op), str(type(scalar_igrads))))

    nd = len(inputs[0].type.broadcastable)  # this is the same for everyone

    def transform(r):
        # From a graph of ScalarOps, make a graph of Broadcast ops.
        if isinstance(r.type, (NullType, DisconnectedType)):
            return r
        if r in scalar_inputs:
            return inputs[scalar_inputs.index(r)]
        if r in scalar_ograds:
            return ograds[scalar_ograds.index(r)]
        node = r.owner
        if node is None:
            # the gradient contains a constant, translate it as
            # an equivalent TensorType of size 1 and proper number of
            # dimensions
            res = theano.tensor.constant(numpy.asarray(r.data),
                                         dtype=r.type.dtype)
            return DimShuffle((), ['x'] * nd)(res)

        new_r = Elemwise(node.op, {})(
            *[transform(ipt) for ipt in node.inputs])
        return new_r

    ret = []
    for scalar_igrad, ipt in izip(scalar_igrads, inputs):
        if scalar_igrad is None:
            # undefined gradient
            ret.append(None)
            continue
        ret.append(transform(scalar_igrad))

    return ret
def grad(self, inputs, output_gradients):
    V, W, b, d = inputs
    dCdH, = output_gradients
    # make all of these ops support broadcasting of scalar b to vector b
    # and replace the zeros_like in all their grads
    # print dCdH.broadcastable
    # print "dCdH.broadcastable"
    # quit(-1)
    # dCdH = printing.Print("dCdH = ",["shape"])

    # Make sure the broadcasting pattern of the gradient is the same
    # as the initial variable
    dCdV = theano.tensor.nnet.convTransp3D(
        W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
    dCdV = T.patternbroadcast(dCdV, V.broadcastable)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
    dCdW = T.patternbroadcast(dCdW, W.broadcastable)
    dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
    dCdb = T.patternbroadcast(dCdb, b.broadcastable)
    dCdd = grad_undefined(
        self, 3, inputs[3],
        "The gradient of Conv3D with respect to the convolution"
        " stride is undefined because Conv3D is only defined for"
        " integer strides.")

    if 'name' in dir(dCdH) and dCdH.name is not None:
        dCdH_name = dCdH.name
    else:
        dCdH_name = 'anon_dCdH'

    if 'name' in dir(V) and V.name is not None:
        V_name = V.name
    else:
        V_name = 'anon_V'

    if 'name' in dir(W) and W.name is not None:
        W_name = W.name
    else:
        W_name = 'anon_W'

    if 'name' in dir(b) and b.name is not None:
        b_name = b.name
    else:
        b_name = 'anon_b'

    dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
    dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' +
                 V_name + ',W=' + W_name + ')')
    dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
                 ',W=' + W_name + ',b=' + b_name + ')')

    return [dCdV, dCdW, dCdb, dCdd]
def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
    """
    Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
    an `alloc` of a scalar variable or one that has either broadcastable or
    matching dimensions with the output variable, by one that skips the
    intermediate `alloc`.

    """
    if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
        dy, sm, y_idx = node.inputs

        # Those cases are directly handled by the internal broadcasting of the
        # `CrossentropySoftmax1HotWithBiasDx` op.
        if dy.ndim == 0:
            return False
        if dy.ndim == 1 and dy.broadcastable[0]:
            return False

        assert dy.ndim == 1

        if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc):
            # dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
            dz = dy.owner.inputs[0]

            try:
                shape_feature = node.fgraph.shape_feature
            except AttributeError:
                # The shape feature may not be available in some mode, but we
                # need it for this optimization, so don't continue.
                return False

            shape_of = shape_feature.shape_of
            same_shape = shape_feature.same_shape

            # Build `dz_broad` explicitly to include extra implicit dimensions.
            dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable

            # If we can infer statically that the shape of `sm` and
            # `dy` are the same in dimension `k` or the shape of `dy` is equal
            # to 1 (which triggers the internal broadcasting in
            # `CrossentropySoftmax1HotWithBiasDx`) we do not need to
            # check it at runtime.
            if (dz_broad[0] and
                    not same_shape(sm, dy, dim_x=0, dim_y=0) and
                    shape_of[dy][0] != 1):
                # If `dz` is broadcastable, we need to check whether the shapes
                # of `dy` and `sm` are the same or whether the shape of `dy` is
                # equal to 1.
                cond = tensor.or_(tensor.eq(dy.shape[0], 1),
                                  tensor.eq(dy.shape[0], sm.shape[0]))
                msg = '`sm` and `dy` do not have the same shape.'
                dz = opt.Assert(msg)(dz, cond)

            ret = node.op(dz, sm, y_idx)
            copy_stack_trace(node.outputs[0], ret)
            return [ret]
def relu(x, alpha=0):
    """
    Compute the element-wise rectified linear activation function.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x : symbolic tensor
        Tensor to compute the activation function for.
    alpha : scalar or tensor, optional
        Slope for negative input, usually between 0 and 1. The default value
        of 0 will lead to the standard rectifier, 1 will lead to
        a linear activation function, and any value in between will give a
        leaky rectifier. A shared variable (broadcastable against `x`) will
        result in a parameterized rectifier with learnable slope(s).

    Returns
    -------
    symbolic tensor
        Element-wise rectifier applied to `x`.

    Notes
    -----
    This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
    (or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
    formulation or an optimized Op, so we encourage the use of this function.

    """
    # This is probably the fastest implementation for GPUs. Both the forward
    # pass and the gradient get compiled into a single GpuElemwise call.
    # TODO: Check if it's optimal for CPU as well; add an "if" clause if not.
    # TODO: Check if there's a faster way for the gradient; create an Op if so.
    if alpha == 0:
        return 0.5 * (x + abs(x))
    else:
        # We can't use 0.5 and 1 for one and half. as if alpha is a
        # numpy dtype, they will be considered as float64, so would
        # cause upcast to float64.
        alpha = tensor.as_tensor_variable(alpha)
        f1 = 0.5 * (1 + alpha)
        f2 = 0.5 * (1 - alpha)
        return f1 * x + f2 * abs(x)
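Finally, a brief usage sketch, assuming the function is exposed as theano.tensor.nnet.relu:

import numpy
import theano
import theano.tensor as T

x = T.dvector('x')

y = T.nnet.relu(x)             # standard rectifier
y_leaky = T.nnet.relu(x, 0.1)  # leaky rectifier with slope 0.1 for x < 0

f = theano.function([x], [y, y_leaky])
print(f(numpy.array([-2.0, 3.0])))
# [array([ 0.,  3.]), array([-0.2,  3. ])]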