The following code examples, extracted from open-source Python projects, demonstrate how to use lasagne.updates.get_or_compute_grads().
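The snippets below are shown as they appear in their source projects, so the surrounding imports are omitted. A minimal preamble they all assume (a sketch; the exact import style may differ per project) looks like this:

from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

from lasagne.updates import get_or_compute_grads, total_norm_constraint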
def deepmind_rmsprop(loss_or_grads, params, learning_rate=0.00025, rho=0.95,
                     epsilon=0.01):
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        # Exponential moving average of the gradient.
        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        # Exponential moving average of the squared gradient.
        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        # Scale the step by the centered RMS, sqrt(E[g^2] - E[g]^2 + epsilon),
        # the variant used in DeepMind's DQN code.
        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new ** 2 + epsilon)))

    return updates
def careful_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                    epsilon=1e-6, grad_clipping=1.0e-2):
    """
    RMSProp with gradient clipping.

    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value, the gradient is rescaled.
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    grads = total_norm_constraint(grads, max_norm=grad_clipping, epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))

    return updates
def hard_rmsprop(loss_or_grads, params, learning_rate=1.0e-2, epsilon=1e-6):
    """
    Not an actual RMSProp: it just normalizes the gradient so that its norm
    equals the `learning_rate` parameter. Don't use unless you have to.

    :param loss_or_grads: loss to minimize
    :param params: params to optimize
    :param learning_rate: norm of the gradient step
    :param epsilon: small number for computational stability
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)

    gnorm = T.sqrt(sum(T.sum(g ** 2) for g in grads) + epsilon)
    grads = [g / gnorm for g in grads]

    updates = OrderedDict()

    for param, grad in zip(params, grads):
        updates[param] = param - learning_rate * grad

    return updates
def cruel_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                  epsilon=1e-6, grad_clipping=1.0e-2, param_clipping=1.0e-2):
    """
    A version of careful RMSProp for Wasserstein GAN.

    :param epsilon: small number for computational stability
    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value, the gradient is rescaled.
    :param param_clipping: after each update all params are clipped to
        [-`param_clipping`, `param_clipping`].
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    grads = total_norm_constraint(grads, max_norm=grad_clipping, epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new

        updated = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))

        if param_clipping is not None:
            updates[param] = T.clip(updated, -param_clipping, param_clipping)
        else:
            updates[param] = updated

    return updates
def deepmind_rmsprop(loss_or_grads, params, learning_rate, rho, epsilon):
    """RMSProp updates [1]_.

    Scale learning rates by dividing with the moving average of the root mean
    squared (RMS) gradients.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to generate update expressions for
    learning_rate : float or symbolic scalar
        The learning rate controlling the size of update steps
    rho : float or symbolic scalar
        Gradient moving average decay factor
    epsilon : float or symbolic scalar
        Small value added for numerical stability

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression

    Notes
    -----
    `rho` should be between 0 and 1. A value of `rho` close to 1 will decay
    the moving average slowly and a value close to 0 will decay the moving
    average fast.

    Using the step size :math:`\\eta` and a decay factor :math:`\\rho` the
    learning rate :math:`\\eta_t` is calculated as:

    .. math::
       r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\
       \\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}}

    References
    ----------
    .. [1] Tieleman, T. and Hinton, G. (2012):
           Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
           Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_grad_new = rho * acc_grad + (1 - rho) * grad

        acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2

        updates[acc_grad] = acc_grad_new
        updates[acc_rms] = acc_rms_new

        # Note: unlike plain RMSProp, the squared mean gradient is subtracted
        # in the denominator (the centered variant from DeepMind's DQN code).
        updates[param] = (param - learning_rate *
                          (grad /
                           T.sqrt(acc_rms_new - acc_grad_new ** 2 + epsilon)))

    return updates
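Each of these functions returns an OrderedDict of update expressions, the same contract as Lasagne's built-in optimizers, so the result can be passed directly to theano.function. The sketch below is a hypothetical usage example, not taken from the original projects; the toy network, variable names, and hyperparameters are illustrative only:

import lasagne

# Symbolic inputs for a toy classification problem (hypothetical example).
input_var = T.matrix('inputs')
target_var = T.ivector('targets')

# Any lasagne.layers stack would do here.
network = lasagne.layers.InputLayer((None, 100), input_var=input_var)
network = lasagne.layers.DenseLayer(network, num_units=10,
                                    nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
params = lasagne.layers.get_all_params(network, trainable=True)

# get_or_compute_grads() inside the update rule turns the scalar loss into
# per-parameter gradients; a precomputed list of gradients is also accepted.
updates = deepmind_rmsprop(loss, params, learning_rate=0.00025, rho=0.95,
                           epsilon=0.01)

train_fn = theano.function([input_var, target_var], loss, updates=updates)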