The following are 50 code examples, extracted from open source Python projects, that illustrate how to use tensorflow.gradients().
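Before the extracted examples, here is a minimal sketch of the basic call pattern (TF 1.x graph mode is assumed, matching the examples below; the tensor names are illustrative and do not come from any of the listed projects). tf.gradients(ys, xs) symbolically differentiates the sum of the tensors in ys with respect to each tensor in xs and returns a list with one gradient tensor (or None) per entry of xs:

import tensorflow as tf

# Minimal sketch: d(x^2 + 3x)/dx = 2x + 3.
x = tf.placeholder(tf.float32, shape=[])
y = x * x + 3.0 * x

# tf.gradients returns a list, one entry per tensor in xs.
dy_dx, = tf.gradients(ys=y, xs=[x])

with tf.Session() as sess:
    print(sess.run(dy_dx, feed_dict={x: 2.0}))  # prints 7.0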
def simulate_dynamics(initial_pos, initial_vel, stepsize, n_steps, energy_fn):
    def leapfrog(pos, vel, step, i):
        de_dp_ = tf.gradients(tf.reduce_sum(energy_fn(pos)), pos)[0]
        new_vel_ = vel - step * de_dp_
        new_pos_ = pos + step * new_vel_
        return [new_pos_, new_vel_, step, tf.add(i, 1)]

    def condition(pos, vel, step, i):
        return tf.less(i, n_steps)

    de_dp = tf.gradients(tf.reduce_sum(energy_fn(initial_pos)), initial_pos)[0]
    vel_half_step = initial_vel - 0.5 * stepsize * de_dp
    pos_full_step = initial_pos + stepsize * vel_half_step

    i = tf.constant(0)
    final_pos, new_vel, _, _ = tf.while_loop(
        condition, leapfrog, [pos_full_step, vel_half_step, stepsize, i])
    de_dp = tf.gradients(tf.reduce_sum(energy_fn(final_pos)), final_pos)[0]
    final_vel = new_vel - 0.5 * stepsize * de_dp
    return final_pos, final_vel
def __init__(self, channels=3, n_class=2, cost="cross_entropy", cost_kwargs={}, **kwargs):
    tf.reset_default_graph()

    self.n_class = n_class
    self.summaries = kwargs.get("summaries", True)

    self.x = tf.placeholder("float", shape=[None, None, None, channels])
    self.y = tf.placeholder("float", shape=[None, None, None, n_class])
    self.keep_prob = tf.placeholder(tf.float32)  # dropout (keep probability)

    logits, self.variables, self.offset = create_conv_net(self.x, self.keep_prob, channels, n_class, **kwargs)

    self.cost = self._get_cost(logits, cost, cost_kwargs)

    self.gradients_node = tf.gradients(self.cost, self.variables)
    self.cross_entropy = tf.reduce_mean(cross_entropy(tf.reshape(self.y, [-1, n_class]),
                                                      tf.reshape(pixel_wise_softmax_2(logits), [-1, n_class])))

    self.predicter = pixel_wise_softmax_2(logits)
    self.correct_pred = tf.equal(tf.argmax(self.predicter, 3), tf.argmax(self.y, 3))
    self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))
def Grad_Penalty(real_data, fake_data, Discriminator, config):
    '''
    Implementation from "Improved Training of Wasserstein GANs".
    Interpolation-based estimation of the gradient of the discriminator.
    Used to penalize the derivative rather than explicitly constrain the
    Lipschitz constant.
    '''
    batch_size = config.batch_size
    LAMBDA = config.lambda_W
    n_hidden = config.critic_hidden_size
    alpha = tf.random_uniform([batch_size, 1], 0., 1.)
    interpolates = alpha * real_data + ((1 - alpha) * fake_data)  # Could do more if not fixed batch_size
    disc_interpolates = Discriminator(interpolates, batch_size,
                                      n_hidden=n_hidden, config=config,
                                      reuse=True)[1]  # logits
    gradients = tf.gradients(disc_interpolates, [interpolates])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
    gradient_penalty = tf.reduce_mean((slopes - 1) ** 2)
    grad_cost = LAMBDA * gradient_penalty
    return grad_cost, slopes
def _get_opt(self):
    # build the self.opt_op for training
    self.set_train_var()
    tvars = self.var_list
    self.print_trainable()

    with tf.name_scope("Optimizer"):
        opt = self._get_optx()
        grads = tf.gradients(self.loss + self.l2loss, tvars)
        grads = list(zip(grads, tvars))
        # Op to update all variables according to their gradient
        self.opt_op = opt.apply_gradients(grads_and_vars=grads, global_step=self.global_step)

    if self.flags.visualize and "grad" in self.flags.visualize:
        for grad, var in grads:
            tf.summary.histogram(var.name + '/gradient', grad,
                                 collections=[tf.GraphKeys.GRADIENTS])
def pwlin_grid(r_, rvar_, theta_, dtheta=.75):
    """piecewise linear with noise-adaptive grid spacing.
    returns xhat, dxdr
    where
        q = r/dtheta/sqrt(rvar)
        xhat = r * interp(q, theta)

    all but the last dimensions of theta must broadcast to r_
    e.g. r.shape = (500, 1000) is compatible with theta.shape = (500, 1, 7)
    """
    ntheta = int(theta_.get_shape()[-1])
    scale_ = dtheta / tf.sqrt(rvar_)
    ars_ = tf.clip_by_value(tf.expand_dims(tf.abs(r_) * scale_, -1), 0.0, ntheta - 1.0)
    centers_ = tf.constant(np.arange(ntheta), dtype=tf.float32)
    outer_distance_ = tf.maximum(0., 1.0 - tf.abs(ars_ - centers_))  # new dimension for distance to closest bin centers (or center)
    gain_ = tf.reduce_sum(theta_ * outer_distance_, axis=-1)  # apply the gain (learnable)
    xhat_ = gain_ * r_
    dxdr_ = tf.gradients(xhat_, r_)[0]
    return (xhat_, dxdr_)
def attack_single_step(self, x, eta, y):
    """
    Given the original image and the perturbation computed so far, computes
    a new perturbation.

    :param x: A tensor with the original input.
    :param eta: A tensor the same shape as x that holds the perturbation.
    :param y: A tensor with the target labels or ground-truth labels.
    """
    import tensorflow as tf
    from cleverhans.utils_tf import model_loss, clip_eta

    adv_x = x + eta
    preds = self.model.get_probs(adv_x)
    loss = model_loss(y, preds)
    if self.targeted:
        loss = -loss
    grad, = tf.gradients(loss, adv_x)
    scaled_signed_grad = self.eps_iter * tf.sign(grad)
    adv_x = adv_x + scaled_signed_grad
    if self.clip_min is not None and self.clip_max is not None:
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
    eta = adv_x - x
    eta = clip_eta(eta, self.ord, self.eps)
    return x, eta
def jacobian_graph(predictions, x, nb_classes):
    """
    Create the Jacobian graph to be run later in a TF session
    :param predictions: the model's symbolic output (linear output, pre-softmax)
    :param x: the input placeholder
    :param nb_classes: the number of classes the model has
    :return:
    """
    # This function will return a list of TF gradients
    list_derivatives = []

    # Define the TF graph elements to compute our derivatives for each class
    for class_ind in xrange(nb_classes):
        derivatives, = tf.gradients(predictions[:, class_ind], x)
        list_derivatives.append(derivatives)

    return list_derivatives
def test_fgm_gradient_max():
    input_dim = 2
    num_classes = 3
    batch_size = 4
    rng = np.random.RandomState([2017, 8, 23])
    x = tf.placeholder(tf.float32, [batch_size, input_dim])
    weights = tf.placeholder(tf.float32, [input_dim, num_classes])
    logits = tf.matmul(x, weights)
    probs = tf.nn.softmax(logits)
    adv_x = fgm(x, probs)
    random_example = rng.randint(batch_size)
    random_feature = rng.randint(input_dim)
    output = tf.slice(adv_x, [random_example, random_feature], [1, 1])
    dx, = tf.gradients(output, x)
    # The following line catches GitHub issue #243
    assert dx is not None
    sess = tf.Session()
    dx = sess.run(dx, feed_dict=random_feed_dict(rng, [x, weights]))
    ground_truth = np.zeros((batch_size, input_dim))
    ground_truth[random_example, random_feature] = 1.
    assert np.allclose(dx, ground_truth), (dx, ground_truth)
def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
    import tensorflow as tf
    x_val = np.random.rand(1, 2)
    x_val = np.array(x_val, dtype=np.float32)

    self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                            clip_max=-5.0, clip_min=-5.0, xi=1e-6)

    old_grads = tf.gradients

    def fn(*x, **y):
        raise RuntimeError()

    tf.gradients = fn

    self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                            clip_max=-4.0, clip_min=-4.0, xi=1e-5)

    tf.gradients = old_grads
def test_gradient(self):
    x_var = tf.Variable(tf.zeros([3], dtype='float64'), name='x')
    shape = loom.TypeShape('float64', (3,))
    ops = {'add': BinaryLoomOp(shape, tf.add),
           'mul': BinaryLoomOp(shape, tf.multiply)}
    the_loom = loom.Loom(named_tensors={'x': x_var}, named_ops=ops)

    output_tensor = the_loom.output_tensor(shape)
    output = tf.reduce_sum(output_tensor)
    gradient = tf.gradients(output, [x_var])[0]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        weaver = the_loom.make_weaver()
        m = weaver(np.array([1, 2, 3], dtype='float64'))
        b = weaver(np.array([47, 9, -1], dtype='float64'))
        mx = weaver.mul(m, weaver.x)
        mx_plus_b = weaver.add(mx, b)
        result = gradient.eval(feed_dict=weaver.build_feed_dict([mx_plus_b]))
    self.assertTrue((result == np.array(
        [1.0, 2.0, 3.0], dtype='float64')).all())
def test_gradient_with_direct_feed_dict(self):
    x_var = tf.Variable(tf.zeros([3], dtype='float64'), name='x')
    shape = loom.TypeShape('float64', (3,))
    ops = {'add': BinaryLoomOp(shape, tf.add),
           'mul': BinaryLoomOp(shape, tf.multiply)}
    the_loom = loom.Loom(named_tensors={'x': x_var}, named_ops=ops,
                         direct_feed_dict=True)

    output_tensor = the_loom.output_tensor(shape)
    output = tf.reduce_sum(output_tensor)
    gradient = tf.gradients(output, [x_var])[0]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        weaver = the_loom.make_weaver()
        m = weaver(np.array([1, 2, 3], dtype='float64'))
        b = weaver(np.array([47, 9, -1], dtype='float64'))
        mx = weaver.mul(m, weaver.x)
        mx_plus_b = weaver.add(mx, b)
        result = gradient.eval(feed_dict=weaver.build_feed_dict([mx_plus_b]))
    self.assertTrue((result == np.array(
        [1.0, 2.0, 3.0], dtype='float64')).all())
def __init__(self, sess, state_size, action_size, batch_size, tau, learning_rate):
    """Init critic network."""
    self.sess = sess
    self.batch_size = batch_size
    self.tau = tau
    self.learning_rate = learning_rate
    self.action_size = action_size

    K.set_session(sess)

    self.model, self.action, self.state = \
        self.create_critic_network(state_size, action_size)
    self.target_model, self.target_action, self.target_state = \
        self.create_critic_network(state_size, action_size)
    self.action_grads = tf.gradients(self.model.output, self.action)
    self.sess.run(tf.initialize_all_variables())
def __init__(self, sess, state_size, action_size, BATCH_SIZE, TAU, LEARNING_RATE):
    self.sess = sess
    self.BATCH_SIZE = BATCH_SIZE
    self.TAU = TAU
    self.LEARNING_RATE = LEARNING_RATE

    K.set_session(sess)

    # Now create the model
    self.model, self.weights, self.state = self.create_actor_network(state_size, action_size)
    self.target_model, self.target_weights, self.target_state = self.create_actor_network(state_size, action_size)
    self.action_gradient = tf.placeholder(tf.float32, [None, action_size])
    self.params_grad = tf.gradients(self.model.output, self.weights, -self.action_gradient)
    grads = zip(self.params_grad, self.weights)
    self.optimize = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients(grads)
    self.sess.run(tf.global_variables_initializer())
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
    ''' Adam optimizer '''
    updates = []
    if type(cost_or_grads) is not list:
        grads = tf.gradients(cost_or_grads, params)
    else:
        grads = cost_or_grads
    t = tf.Variable(1., 'adam_t')
    for p, g in zip(params, grads):
        mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1 > 0:
            v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            v_t = mom1 * v + (1. - mom1) * g
            v_hat = v_t / (1. - tf.pow(mom1, t))
            updates.append(v.assign(v_t))
        else:
            v_hat = g
        mg_t = mom2 * mg + (1. - mom2) * tf.square(g)
        mg_hat = mg_t / (1. - tf.pow(mom2, t))
        g_t = v_hat / tf.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))
    updates.append(t.assign_add(1))
    return tf.group(*updates)
def get_update_op(self, loss, opts, global_step=None, max_gradient_norm=None, freeze_variables=None):
    if loss is None:
        return None

    freeze_variables = freeze_variables or []

    # compute gradient only for variables that are not frozen
    frozen_parameters = [var.name for var in tf.trainable_variables()
                         if any(re.match(var_, var.name) for var_ in freeze_variables)]
    params = [var for var in tf.trainable_variables() if var.name not in frozen_parameters]
    self.params = params

    gradients = tf.gradients(loss, params)
    if max_gradient_norm:
        gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)

    update_ops = []
    for opt in opts:
        with tf.variable_scope('gradients' if self.name is None else 'gradients_{}'.format(self.name)):
            update_op = opt.apply_gradients(list(zip(gradients, params)), global_step=global_step)
            update_ops.append(update_op)

    return update_ops
def gradient_penalty(self):
    config = self.config
    gan = self.gan
    gradient_penalty = config.gradient_penalty
    if hasattr(gan.inputs, 'gradient_penalty_label'):
        x = gan.inputs.gradient_penalty_label
    else:
        x = gan.inputs.x
    generator = self.generator or gan.generator
    g = generator.sample
    discriminator = self.discriminator or gan.discriminator
    shape = [1 for t in g.get_shape()]
    shape[0] = gan.batch_size()
    uniform_noise = tf.random_uniform(shape=shape, minval=0., maxval=1.)
    print("[gradient penalty] applying x:", x, "g:", g, "noise:", uniform_noise)
    interpolates = x + uniform_noise * (g - x)
    reused_d = discriminator.reuse(interpolates)
    gradients = tf.gradients(reused_d, [interpolates])[0]
    penalty = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=1))
    penalty = tf.reduce_mean(tf.square(penalty - 1.))
    return float(gradient_penalty) * penalty
def _create_optimizer(self):
    print('Create optimizer... ')
    with tf.variable_scope('training'):
        self.global_step = tf.Variable(
            0, dtype=tf.int32, trainable=False, name='global_step')

        if not self.fw_only:
            self.optimizer = tf.train.GradientDescentOptimizer(config.LR)
            trainable_vars = tf.trainable_variables()
            self.gradient_norms = []
            self.train_ops = []
            start = time.time()
            for bucket_id in range(len(config.BUCKETS)):
                clipped_grads, norm = tf.clip_by_global_norm(
                    tf.gradients(self.losses[bucket_id], trainable_vars),
                    config.MAX_GRAD_NORM)
                self.gradient_norms.append(norm)
                self.train_ops.append(self.optimizer.apply_gradients(
                    zip(clipped_grads, trainable_vars),
                    global_step=self.global_step))
                print('Creating opt for bucket {:d} took {:.2f} seconds.'.format(
                    bucket_id, time.time() - start))
                start = time.time()
def update_weights(self, f):
    """ Gradient-based update of current Critic parameters.

    Also return the action gradients for the Actor update later. This is
    the dQ/da in the paper, and Q is the current Q network, not the target
    Q network.
    """
    feed = {
        self.obs_t_BO: f['obs_t_BO'],
        self.act_t_BA: f['act_t_BA'],
        self.rew_t_B: f['rew_t_B'],
        self.obs_tp1_BO: f['obs_tp1_BO'],
        self.done_mask_B: f['done_mask_B']
    }
    action_grads_BA, _, l2_error = self.sess.run(
        [self.act_grads_BA, self.optimize_c, self.l2_error], feed)
    # We assume that the only item in the list has what we want.
    assert len(action_grads_BA) == 1
    return action_grads_BA[0], l2_error
def _flatgrad(self, loss, var_list):
    """ A Tensorflow version of John Schulman's `flatgrad` function. It
    computes the gradients but does NOT apply them (for now).

    This is only called during the `init` of the TRPO graph, so I think it's
    OK. Otherwise, wouldn't it be constantly rebuilding the computational
    graph? Or doing something else? Eh, for now I think it's OK.

    Params:
        loss: The loss function we're optimizing, which I assume is always
            scalar-valued.
        var_list: The list of variables (from `tf.trainable_variables()`) to
            take gradients. This should only be for the policynets.

    Returns:
        A single flat vector with all gradients concatenated.
    """
    grads = tf.gradients(loss, var_list)
    return tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
def test_linear_iaf(self):
    with self.test_session(use_gpu=True) as sess:
        z = []
        vz = [0.1, -1.2, 1.0, -0.3, 1.2, 2, 10.0, -23.2]
        for i in range(len(vz)):
            z.append(np.array([[vz[i]]]))
            z[i] = tf.constant(z[i], dtype=tf.float32)
        z_0 = tf.concat(z, axis=1)
        z_1, n_log_det_ja = inv_autoregressive_flow(
            z_0, None, [0.0], linear_ar, n_iters=1)
        n_log_det_ja = tf.reshape(n_log_det_ja, [])

        grad = []
        for i in range(len(vz)):
            z_1i = z_1[0, i]
            grad.append(tf.gradients(z_1i, z_0)[0])
        jacobian = tf.concat(grad, axis=0)
        log_det_jacobian = tf.log(tf.matrix_determinant(jacobian))

        sess.run(tf.global_variables_initializer())
        test_value, true_value = sess.run([-log_det_jacobian, n_log_det_ja])
        self.assertAllClose(test_value, true_value)
def __init__(self, sess, state_dim, action_dim, scope):
    self.state_dim = state_dim
    self.action_dim = action_dim

    # create actor network
    self.state_input, self.action_output, self.net = self.create_network(state_dim, action_dim, scope)

    # create target actor network
    self.target_state_input, self.target_action_output, self.target_update, self.target_net = \
        self.create_target_network(state_dim, action_dim, self.net, scope)

    # define training rules
    if scope != 'global/actor':
        self.q_gradient_input = tf.placeholder("float", [None, self.action_dim])
        self.parameters_gradients = tf.gradients(self.action_output, self.net, -self.q_gradient_input)
        global_vars_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global/actor')
        self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients(
            zip(self.parameters_gradients, global_vars_actor))

    sess.run(tf.global_variables_initializer())
    #self.update_target()
    #self.load_network()
def init_optimizer(self):
    print("setting optimizer..")
    # Gradients and SGD update operation for training the model
    trainable_params = tf.trainable_variables()
    if self.optimizer.lower() == 'adadelta':
        self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate)
    elif self.optimizer.lower() == 'adam':
        self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    elif self.optimizer.lower() == 'rmsprop':
        self.opt = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate)
    else:
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)

    # Compute gradients of loss w.r.t. all trainable variables
    gradients = tf.gradients(self.loss, trainable_params)

    # Clip gradients by a given maximum_gradient_norm
    clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)

    # Update the model
    self.updates = self.opt.apply_gradients(
        zip(clip_gradients, trainable_params), global_step=self.global_step)
def _deepfool2(model, x, epochs, eta, clip_min, clip_max, min_prob):
    y0 = tf.stop_gradient(tf.reshape(model(x), [-1])[0])
    y0 = tf.to_int32(tf.greater(y0, 0.5))

    def _cond(i, z):
        xadv = tf.clip_by_value(x + z*(1+eta), clip_min, clip_max)
        y = tf.stop_gradient(tf.reshape(model(xadv), [-1])[0])
        y = tf.to_int32(tf.greater(y, 0.5))
        return tf.logical_and(tf.less(i, epochs), tf.equal(y0, y))

    def _body(i, z):
        xadv = tf.clip_by_value(x + z*(1+eta), clip_min, clip_max)
        y = tf.reshape(model(xadv), [-1])[0]
        g = tf.gradients(y, xadv)[0]
        dx = - y * g / tf.norm(g)
        return i+1, z+dx

    _, noise = tf.while_loop(_cond, _body, [0, tf.zeros_like(x)],
                             name='_deepfool2_impl', back_prop=False)
    return noise
def __init__(self, x_op, y_op, sess, remove_bias=False):
    # Save parameters
    self.x_op = x_op
    self.y_op = y_op
    self.sess = sess
    self.remove_bias = remove_bias

    # Get dimensions and data types
    self.shape0 = x_op.get_shape()
    self.shape1 = y_op.get_shape()
    self.dtype0 = x_op.dtype
    self.dtype1 = y_op.dtype

    # Create the ops for the gradient. If the linear operator is y=F(x),
    # then z = y'*F(x). Therefore, dz/dx = F'(y).
    self.ytr_op = tf.placeholder(self.dtype1, self.shape1)
    self.z_op = tf.reduce_sum(tf.multiply(tf.conj(self.ytr_op), self.y_op))
    self.zgrad_op = tf.gradients(self.z_op, self.x_op)[0]

    # Compute output at zero to subtract
    if self.remove_bias:
        xzero = np.zeros(self.shape0)
        self.y_bias = self.sess.run(self.y_op, feed_dict={self.x_op: xzero})
    else:
        self.y_bias = 0
def _build_train_op(self):
    """Build training specific ops for the graph."""
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.summary.scalar('learning_rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    if self.hps.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
    elif self.hps.optimizer == 'mom':
        optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

    apply_op = optimizer.apply_gradients(
        zip(grads, trainable_variables),
        global_step=self.global_step, name='train_step')

    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)

    # TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
def _deconvolution(graph, sess, op_tensor, X, feed_dict):
    out = []
    with graph.as_default() as g:
        # get shape of tensor
        tensor_shape = op_tensor.get_shape().as_list()

        with sess.as_default() as sess:
            # creating placeholders to pass featuremaps and
            # creating gradient ops
            featuremap = [tf.placeholder(tf.int32) for i in range(config["N"])]
            reconstruct = [tf.gradients(tf.transpose(tf.transpose(op_tensor)[featuremap[i]]), X)[0]
                           for i in range(config["N"])]

            # Execute the gradient operations in batches of 'n'
            for i in range(0, tensor_shape[-1], config["N"]):
                c = 0
                for j in range(config["N"]):
                    if (i + j) < tensor_shape[-1]:
                        feed_dict[featuremap[j]] = i + j
                        c += 1
                if c > 0:
                    out.extend(sess.run(reconstruct[:c], feed_dict=feed_dict))
    return out
def create_summaries(self, verbose=2):
    """ Create summaries with `verbose` level """

    summ_collection = self.name + "_training_summaries"

    if verbose in [3]:
        # Summarize activations
        activations = tf.get_collection(tf.GraphKeys.ACTIVATIONS)
        summarize_activations(activations, summ_collection)
    if verbose in [2, 3]:
        # Summarize variable weights
        summarize_variables(self.train_vars, summ_collection)
    if verbose in [1, 2, 3]:
        # Summarize gradients
        summarize_gradients(self.grad, summ_collection)

    self.summ_op = merge_summary(tf.get_collection(summ_collection))
def classify(self, model_range, seg_range, feature_lr, classifier_lr):
    feat_opt = tf.train.AdamOptimizer(feature_lr)
    clas_opt = tf.train.AdamOptimizer(classifier_lr)
    for model in model_range:
        for seg in seg_range:
            with tf.variable_scope('classifier-{}-{}'.format(model, seg)):
                self.preds[(model, seg)] = slim.conv2d(self.feature, 500, [1, 1])
                self.clas_vars[(model, seg)] = slim.get_model_variables()[-2:]

            with tf.variable_scope('losses-{}-{}'.format(model, seg)):
                self.losses[(model, seg)] = self.loss(self.labels, self.preds[(model, seg)])
                grad = tf.gradients(self.losses[(model, seg)],
                                    self.feat_vars + self.clas_vars[(model, seg)])
                train_op_feat = feat_opt.apply_gradients(zip(grad[:-2], self.feat_vars))
                train_op_clas = clas_opt.apply_gradients(zip(grad[-2:], self.clas_vars[(model, seg)]))
                self.train_ops[(model, seg)] = tf.group(train_op_feat, train_op_clas)
    return self.losses, self.train_ops
def _GradMom(op, v, out_grad, batch_size, mom=2):
    """Wrapper function for the operation type-specific GradMom functions below.

    Inputs:
        :op: A tensorflow operation of type in VALID_TYPES.
        :v: The read-tensor of the trainable variable consumed by this operation.
        :out_grad: The tensor containing the gradient w.r.t. to the output of
            the op (as computed by ``tf.gradients``).
        :batch_size: Batch size ``m`` (constant integer or scalar int tf.Tensor)
        :mom: Integer moment desired (defaults to 2)."""

    with tf.name_scope(op.name + "_grad_mom"):
        if op.type == "MatMul":
            return _MatMulGradMom(op, v, out_grad, batch_size, mom)
        elif op.type == "Conv2D":
            return _Conv2DGradMom(op, v, out_grad, batch_size, mom)
        elif op.type == "Add":
            return _AddGradMom(op, v, out_grad, batch_size, mom)
        else:
            raise ValueError("Don't know how to compute gradient moment for "
                             "variable {}, consumed by operation of type {}"
                             .format(v.name, op.type))
def _MatMulGradMom(op, W, out_grad, batch_size, mom=2):
    """Computes gradient moment for a weight matrix through a MatMul operation.

    Assumes ``Z = tf.matmul(A, W)``, where ``W`` is a d1xd2 weight matrix, ``A``
    are the nxd1 activations of the previous layer (n being the batch size).
    ``out_grad`` is the gradient w.r.t. ``Z``, as computed by ``tf.gradients()``.
    No transposes in the MatMul operation allowed.

    Inputs:
        :op: The MatMul operation
        :W: The weight matrix (the tensor, not the variable)
        :out_grad: The tensor of gradient w.r.t. to the output of the op
        :batch_size: Batch size n (constant integer or scalar int tf.Tensor)
        :mom: Integer moment desired (defaults to 2)"""

    assert op.type == "MatMul"
    t_a, t_b = op.get_attr("transpose_a"), op.get_attr("transpose_b")
    assert W is op.inputs[1] and not t_a and not t_b

    A = op.inputs[0]
    out_grad_pow = tf.pow(out_grad, mom)
    A_pow = tf.pow(A, mom)
    return tf.mul(batch_size, tf.matmul(A_pow, out_grad_pow, transpose_a=True))
def testUsage(self):
    with tf.variable_scope("", custom_getter=snt.custom_getters.stop_gradient):
        lin1 = snt.Linear(10, name="linear1")
        x = tf.placeholder(tf.float32, [10, 10])
        y = lin1(x)

    variables = tf.trainable_variables()
    variable_names = [v.name for v in variables]

    self.assertEqual(2, len(variables))
    self.assertIn("linear1/w:0", variable_names)
    self.assertIn("linear1/b:0", variable_names)

    grads = tf.gradients(y, variables)
    names_to_grads = {var.name: grad for var, grad in zip(variables, grads)}

    self.assertEqual(None, names_to_grads["linear1/w:0"])
    self.assertEqual(None, names_to_grads["linear1/b:0"])
def testOpClip(self):
    x = tf.placeholder(tf.float32, shape=[2, 1])
    y = snt.clip_gradient(x, 2, 3)
    z = tf.reduce_sum(y * y)
    dzdy = tf.gradients(z, y)[0]
    dzdx = tf.gradients(z, x)[0]

    x_np = np.array([[0.5], [2]])
    with self.test_session() as sess:
        y_np, dzdy_np, dzdx_np = sess.run([y, dzdy, dzdx], feed_dict={x: x_np})

        self.assertAllEqual(y_np, x_np)
        # We do not expect the gradients with respect to the output to be clipped.
        self.assertAllEqual(dzdy_np, np.array([[1], [4]]))
        # We expect the gradients with respect to the input to be clipped [2, 3].
        self.assertAllEqual(dzdx_np, np.array([[2], [3]]))
def testOpScale(self, x_, scale):
    x = tf.placeholder(tf.float32, [1])
    y = x * x
    y = snt.scale_gradient(y, scale)
    dydx = tf.gradients([y], [x])[0]

    if scale == 0.0:
        self.assertEqual(y.op.type, "StopGradient")
        self.assertIs(dydx, None)
    else:
        if scale == 1.0:
            self.assertEqual(y.op.type, "Identity")
        else:
            self.assertEqual(y.op.type, "ScaleGradient_float32")

        with self.test_session() as sess:
            dydx_, y_ = sess.run([dydx, y], feed_dict={x: [x_]})

            self.assertAlmostEqual(dydx_[0], 2 * scale * x_, places=6)
            self.assertAlmostEqual(y_[0], x_ ** 2, places=6)
def testTwoOps(self):
    """Tests that the op can be instantiated twice with appropriate results.

    Implementations with inappropriate global registration of gradients will
    fail this test.
    """
    x = tf.placeholder(tf.float32, [1])
    y = x * x
    y = snt.scale_gradient(y, 0.1)
    y = snt.scale_gradient(y, 0.1)
    dydx = tf.gradients([y], [x])[0]

    with self.test_session() as sess:
        dydx_, y_ = sess.run([dydx, y], feed_dict={x: [3.0]})

        self.assertAlmostEqual(dydx_[0], 2 * 0.1**2 * 3.0, places=6)
        self.assertAlmostEqual(y_[0], 3.0 ** 2, places=6)
def build_graph(self, kl_first_fixed, weights):
    weight_list = list(utils.Utils.flatten(weights.node))
    gradients1 = tf.gradients(kl_first_fixed.node, weight_list)
    ph_tangent = graph.Placeholder(np.float32, shape=(None,))

    gvp = []
    start = 0
    for g in gradients1:
        size = np.prod(g.shape.as_list())
        gvp.append(tf.reduce_sum(tf.reshape(g, [-1]) * ph_tangent.node[start:start + size]))
        start += size

    gradients2 = tf.gradients(gvp, weight_list)
    fvp = tf.concat([tf.reshape(g, [-1]) for g in gradients2], axis=0)

    self.ph_tangent = ph_tangent
    return fvp
def build_graph(self, weights, loss=None, optimizer=None, norm=False, batch_size=None, grad_ys=None):
    if loss is not None:
        gradients = tf.gradients(loss.node, list(utils.Utils.flatten(weights.node)), grad_ys)
        gradients = [tf.check_numerics(g, 'gradient_%d' % i) for i, g in enumerate(gradients)]
        if batch_size is not None:
            gradients = [g / float(batch_size) for g in gradients]

        # store gradients global norm before clipping
        self.global_norm = tf.global_norm(gradients)

        # clip gradients after global norm has been stored
        if norm:
            gradients, _ = tf.clip_by_global_norm(gradients, norm)
        self.calculate = graph.TfNode(utils.Utils.reconstruct(gradients, weights.node))
    if optimizer is not None:
        self.ph_gradients = graph.Placeholders(weights)
        self.apply = graph.TfNode(optimizer.node.apply_gradients(
            utils.Utils.izip(self.ph_gradients.checked, weights.node)))
def hessian_vec_fw(ys, xs, vs, grads=None):
    """Implements Hessian vector product using forward on backward AD.

    Args:
        ys: Loss function.
        xs: Weights, list of tensors.
        vs: List of tensors to multiply, for each weight tensor.

    Returns:
        Hv: Hessian vector product, same size, same shape as xs.
    """
    # Validate the input
    if type(xs) == list:
        if len(vs) != len(xs):
            raise ValueError("xs and vs must have the same length.")

    if grads is None:
        grads = tf.gradients(ys, xs, gate_gradients=True)
    return forward_gradients(grads, xs, vs, gate_gradients=True)
def hessian_vec_bk(ys, xs, vs, grads=None):
    """Implements Hessian vector product using backward on backward AD.

    Args:
        ys: Loss function.
        xs: Weights, list of tensors.
        vs: List of tensors to multiply, for each weight tensor.

    Returns:
        Hv: Hessian vector product, same size, same shape as xs.
    """
    # Validate the input
    if type(xs) == list:
        if len(vs) != len(xs):
            raise ValueError("xs and vs must have the same length.")

    if grads is None:
        grads = tf.gradients(ys, xs, gate_gradients=True)
    return tf.gradients(grads, xs, vs, gate_gradients=True)
def fisher_vec_fw(ys, xs, vs):
    """Implements Fisher vector product using backward and forward AD.

    Args:
        ys: Loss function or output variables.
        xs: Weights, list of tensors.
        vs: List of tensors to multiply, for each weight tensor.

    Returns:
        J'Jv: Fisher vector product.
    """
    # Validate the input
    if type(xs) == list:
        if len(vs) != len(xs):
            raise ValueError("xs and vs must have the same length.")

    jv = forward_gradients(ys, xs, vs, gate_gradients=True)
    jjv = tf.gradients(ys, xs, jv, gate_gradients=True)
    return jjv
def gauss_newton_vec(ys, zs, xs, vs):
    """Implements Gauss-Newton vector product.

    Args:
        ys: Loss function.
        zs: Before output layer (input to softmax).
        xs: Weights, list of tensors.
        vs: List of perturbation vector for each weight tensor.

    Returns:
        J'HJv: Gauss-Newton vector product.
    """
    # Validate the input
    if type(xs) == list:
        if len(vs) != len(xs):
            raise ValueError("xs and vs must have the same length.")

    grads_z = tf.gradients(ys, zs, gate_gradients=True)
    hjv = forward_gradients(grads_z, xs, vs, gate_gradients=True)
    jhjv = tf.gradients(zs, xs, hjv, gate_gradients=True)
    return jhjv, hjv
def fisher_vec_z(ys, xs, vs):
    """Implements JJ'v, where v is on the output space.

    Args:
        ys: Loss function or output variables.
        xs: Weights, list of tensors.
        vs: List of tensors to multiply, for each weight tensor.

    Returns:
        JJ'v: Fisher vector product on the output space.
    """
    # Validate the input
    if type(ys) == list:
        if len(vs) != len(ys):
            raise ValueError("ys and vs must have the same length.")

    jv = tf.gradients(ys, xs, vs, gate_gradients=True)
    jjv = forward_gradients(ys, xs, jv, gate_gradients=True)
    return jjv
def gauss_newton_vec_z(ys, zs, xs, vs):
    """Implements HJJ'v, where v is on the output space.

    Args:
        ys: Loss function or output variables.
        zs: Before output layer (input to softmax).
        xs: Weights, list of tensors.
        vs: List of tensors to multiply, for each weight tensor.

    Returns:
        HJJ'v: Gauss-Newton vector product on the output space.
    """
    # Validate the input
    if type(zs) == list:
        if len(vs) != len(zs):
            raise ValueError("zs and vs must have the same length.")

    grads_z = tf.gradients(ys, zs, gate_gradients=True)
    jv = tf.gradients(zs, xs, vs, gate_gradients=True)
    hjjv = forward_gradients(grads_z, xs, jv, gate_gradients=True)
    return hjjv
def test_hessian_quadratic(self):
    rnd = np.random.RandomState(0)
    dtype = tf.float64
    with tf.Graph().as_default():
        r = tf.Variable(0.0, dtype=dtype)
        x = tf.constant(rnd.uniform(-1.0, 1.0, [2, 27]), dtype=dtype, name="x")
        w2 = tf.constant(rnd.uniform(-1.0, 1.0, [27, 1]), dtype=dtype, name="w2")
        v2 = tf.constant(rnd.uniform(-1.0, 1.0, [27, 1]), dtype=dtype, name="v2")
        w2v = tf.add(w2, tf.multiply(r, v2))
        h2 = tf.matmul(x, w2v)
        y2 = tf.reduce_sum(h2 * h2)

        grad_w = tf.gradients(y2, w2)
        hv_fw = hessian_vec_fw(y2, [w2v], [v2])
        hv_bk = hessian_vec_bk(y2, [w2], [v2])

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            grad_w = sess.run(grad_w)
            hv_fw_val = sess.run(hv_fw)
            hv_bk_val = sess.run(hv_bk)
            np.testing.assert_allclose(hv_fw_val, hv_bk_val, rtol=1e-5)
def main():
    sess = tf.Session()

    t_input = tf.placeholder(np.float32, name='input')  # define the input tensor
    image_mean = 117.0
    t_preprocessed = tf.expand_dims(t_input - image_mean, 0)

    # Build the inference graph
    nodes = tmp.vggface16.load('data/vgg_face.mat', t_preprocessed)

    img_noise = np.random.uniform(size=(224, 224, 3)) + 117.0

    # Picking some internal layer. Note that we use outputs before applying
    # the ReLU nonlinearity to have non-zero gradients for features with
    # negative initial activations.
    layer = 'conv5_3'
    channel = 140  # picking some feature channel to visualize
    img = render_naive(sess, t_input, nodes[layer][:, :, :, channel], img_noise)
    showarray(img)
def grad_supervised(self, prob, labels):
    """
    return:
        loss = 1 / M * sum_i_{1..M} cross_entropy_loss(groundtruth, a_T)
        grads = grad(loss, params)
    inputs:
        prob
        labels = (n_batch,) [tensor variable]
    """
    labels = tf.cast(labels, tf.int64)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        prob, labels, name='cross_entropy_per_example')
    loss = tf.reduce_mean(cross_entropy, name='cross_entropy')

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    for i in xrange(len(grads)):
        if grads[i] is None:
            grads[i] = tf.zeros(shape=tvars[i].get_shape())
    return loss, grads
def __init__(self, model):
    '''
    :param model: Keras model.
        This code makes a bunch of assumptions about the model:
        - Model has single input
        - Embedding is the first layer
        - Model output is a scalar (logistic regression)
    '''
    input_tensor = model.input
    embedding_tensor = model.layers[0](input_tensor)
    output_tensor = embedding_tensor
    for layer in model.layers[1:]:
        output_tensor = layer(output_tensor)

    grad_tensor, = tf.gradients(output_tensor, [embedding_tensor])
    grad_sum_tensor = tf.reduce_sum(grad_tensor, reduction_indices=2)

    self.model = model
    self.input_tensor = input_tensor
    self.grad_sum_tensor = grad_sum_tensor
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
def init_ops_for_training(self, critic):
    # the actor's gradients are the gradients of its output w.r.t. its vars, seeded
    # with the initial gradients provided by the critic. this requires that the critic
    # was init'd with an input_action = actor.output_action (which is natural anyway).
    # we wrap the optimiser in a namespace since we don't want this as part of the copy
    # to target networks.
    # note that we negate the gradients from the critic since we are trying to maximise
    # the q values (not minimise like a loss)
    with tf.variable_scope("optimiser"):
        gradients = tf.gradients(self.output_action,
                                 self.trainable_model_vars(),
                                 tf.neg(critic.q_gradients_wrt_actions()))
        gradients = zip(gradients, self.trainable_model_vars())
        # potentially clip and wrap with debugging
        gradients = util.clip_and_debug_gradients(gradients, opts)
        # apply
        optimiser = tf.train.GradientDescentOptimizer(opts.actor_learning_rate)
        self.train_op = optimiser.apply_gradients(gradients)