我们从 Python 开源项目中,提取了以下 42 个代码示例,用于说明如何使用 tensorflow.clip_by_global_norm()。
def add_training_op(self, loss):
    """Create the training op: resolve the configured optimizer, optionally
    clip gradients by global norm, and apply them.

    Args:
        loss: scalar loss tensor to minimize.

    Returns:
        The gradient-application op.
    """
    #optimizer = tf.train.AdamOptimizer(self.config.lr)
    #optimizer = tf.train.AdagradOptimizer(self.config.lr)
    # Resolve the optimizer class by name, e.g. 'Adam' -> tf.train.AdamOptimizer.
    optclass = getattr(tf.train, self.config.optimizer + 'Optimizer')
    assert issubclass(optclass, tf.train.Optimizer)
    optimizer = optclass(self.config.learning_rate)

    gradient_var_pairs = optimizer.compute_gradients(loss)
    # Renamed from `vars` to avoid shadowing the builtin.
    variables = [x[1] for x in gradient_var_pairs]
    gradients = [x[0] for x in gradient_var_pairs]
    if self.config.gradient_clip > 0:
        clipped, _ = tf.clip_by_global_norm(gradients, self.config.gradient_clip)
    else:
        clipped = gradients

    # NOTE(review): this records the norm of the *clipped* gradients,
    # not the raw ones — confirm that is the intended metric.
    self.grad_norm = tf.global_norm(clipped)
    train_op = optimizer.apply_gradients(zip(clipped, variables))
    return train_op
def get_training_tensors(self, learning_rate = 0.001, grad_clip = 5):
    """Build the cross-entropy loss and a clipped-Adam training op.

    Args:
        learning_rate: Adam learning rate.
        grad_clip: maximum global gradient norm.

    Returns:
        (loss tensor, optimizer op).
    """
    #-----------------------------------------------------------------------
    # Build a loss function
    #-----------------------------------------------------------------------
    with tf.name_scope('targets-encode'):
        # One-hot encode the integer targets to match the logits' shape.
        y_one_hot = tf.one_hot(self.targets, self.n_classes)
        y_reshaped = tf.reshape(y_one_hot, self.logits.get_shape())
    with tf.name_scope('loss'):
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=y_reshaped)
        loss = tf.reduce_mean(loss)
        tf.summary.scalar('loss', loss)
    #-----------------------------------------------------------------------
    # Build the optimizer
    #-----------------------------------------------------------------------
    with tf.name_scope('optimizer'):
        tvars = tf.trainable_variables()
        # Clip by global norm to keep updates stable.
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
        train_op = tf.train.AdamOptimizer(learning_rate)
        optimizer = train_op.apply_gradients(zip(grads, tvars))
    return loss, optimizer
def get_optimizer(self, learning_rate = 0.001, grad_clip = 5):
    """Build the MSE loss and a clipped-Adam optimizer op.

    Args:
        learning_rate: Adam learning rate.
        grad_clip: maximum global gradient norm.

    Returns:
        (optimizer op, loss tensor).
    """
    #-----------------------------------------------------------------------
    # Build a loss function
    #-----------------------------------------------------------------------
    with tf.variable_scope('loss'):
        loss = tf.losses.mean_squared_error(self.target, self.output)
    #-----------------------------------------------------------------------
    # Build the optimizer
    #-----------------------------------------------------------------------
    with tf.variable_scope('optimizer'):
        tvars = tf.trainable_variables()
        # Clip by global norm to keep updates stable.
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
        train_op = tf.train.AdamOptimizer(learning_rate)
        optimizer = train_op.apply_gradients(zip(grads, tvars))
    return optimizer, loss
def build_model(self):
    """Instantiate the configured model, an RMSProp optimizer with
    global-norm gradient clipping, the update op, and a saver."""
    self.model = classmap[FLAGS.model_type](hidden_size=FLAGS.hidden,
                                            vocab_size=self.vocab_size,
                                            encoder_in_size=self.data.feats.shape[-1],
                                            encoder_in_length=self.data.feats.shape[1],
                                            # decoder input drops the last token position
                                            decoder_in_length=self.data.decoder_in.shape[-1] - 1,
                                            word2vec_weight=self.w2v_W,
                                            embedding_size=FLAGS.embedding_dim,
                                            neg_sample_num=self.sample_num,
                                            start_id=self.vocab_processor._mapping['<BOS>'],
                                            end_id=self.vocab_processor._mapping['<EOS>'],
                                            Bk=FLAGS.K)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.RMSPropOptimizer(FLAGS.lr)
    tvars = tf.trainable_variables()
    # Clip gradients to a global norm of 5 before applying.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.model.cost, tvars), 5)
    self.updates = self.optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables())
def get_update_op(self, loss, opts, global_step=None, max_gradient_norm=None, freeze_variables=None):
    """Build one update op per optimizer for `loss`, skipping frozen variables.

    Args:
        loss: scalar loss tensor; None -> no update ops.
        opts: list of tf optimizers; one update op is created per optimizer.
        global_step: optional variable incremented by apply_gradients.
        max_gradient_norm: if truthy, clip gradients to this global norm.
        freeze_variables: list of regexes; matching variables get no updates.

    Returns:
        List of update ops, or None when loss is None.
    """
    if loss is None:
        return None
    freeze_variables = freeze_variables or []
    # compute gradient only for variables that are not frozen
    frozen_parameters = [var.name for var in tf.trainable_variables()
                         if any(re.match(var_, var.name) for var_ in freeze_variables)]
    params = [var for var in tf.trainable_variables() if var.name not in frozen_parameters]
    self.params = params
    gradients = tf.gradients(loss, params)
    if max_gradient_norm:
        gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    update_ops = []
    for opt in opts:
        # Scope is suffixed with the model name to keep slots distinct.
        with tf.variable_scope('gradients' if self.name is None else 'gradients_{}'.format(self.name)):
            update_op = opt.apply_gradients(list(zip(gradients, params)), global_step=global_step)
            update_ops.append(update_op)
    return update_ops
def _create_optimizer(self):
    """Create per-bucket SGD train ops with global-norm gradient clipping
    (skipped entirely in forward-only mode)."""
    print('Create optimizer... ')
    with tf.variable_scope('training'):
        self.global_step = tf.Variable(
            0, dtype=tf.int32, trainable=False, name='global_step')
        if not self.fw_only:
            self.optimizer = tf.train.GradientDescentOptimizer(config.LR)
            trainable_vars = tf.trainable_variables()
            self.gradient_norms = []
            self.train_ops = []
            start = time.time()
            for bucket_id in range(len(config.BUCKETS)):
                # One clipped train op per bucket, sharing the same variables.
                clipped_grads, norm = tf.clip_by_global_norm(
                    tf.gradients(self.losses[bucket_id], trainable_vars),
                    config.MAX_GRAD_NORM)
                self.gradient_norms.append(norm)
                self.train_ops.append(self.optimizer.apply_gradients(
                    zip(clipped_grads, trainable_vars),
                    global_step=self.global_step))
                print('Creating opt for bucket {:d} took {:.2f} seconds.'.format(
                    bucket_id, time.time() - start))
                start = time.time()
def init_optimizer(self): print("setting optimizer..") # Gradients and SGD update operation for training the model trainable_params = tf.trainable_variables() if self.optimizer.lower() == 'adadelta': self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate) elif self.optimizer.lower() == 'adam': self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate) elif self.optimizer.lower() == 'rmsprop': self.opt = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate) else: self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) # Compute gradients of loss w.r.t. all trainable variables gradients = tf.gradients(self.loss, trainable_params) # Clip gradients by a given maximum_gradient_norm clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm) # Update the model self.updates = self.opt.apply_gradients( zip(clip_gradients, trainable_params), global_step=self.global_step)
def setup_train_op(self):
    """Add `train_op` to self.

    Computes gradients with Adam, optionally clips them by global norm
    (skipped when config.max_gradient_norm == -1), records the global
    gradient norm in `self.global_grad`, and builds `self.train_op`.
    """
    with tf.variable_scope("train_step"):
        adam_optimizer = tf.train.AdamOptimizer()
        # Renamed from `vars` to avoid shadowing the builtin.
        grads, variables = zip(*adam_optimizer.compute_gradients(self.loss))
        clip_val = self.config.max_gradient_norm
        # if -1 then do not perform gradient clipping
        if clip_val != -1:
            clipped_grads, _ = tf.clip_by_global_norm(grads, self.config.max_gradient_norm)
            self.global_grad = tf.global_norm(clipped_grads)
            # BUG FIX: materialize as a list — a zip iterator would be
            # exhausted by apply_gradients, leaving self.gradients empty
            # for any later reader.
            self.gradients = list(zip(clipped_grads, variables))
        else:
            self.global_grad = tf.global_norm(grads)
            self.gradients = list(zip(grads, variables))
        self.train_op = adam_optimizer.apply_gradients(self.gradients)
        self.init = tf.global_variables_initializer()
def build_graph(self, weights, loss=None, optimizer=None, norm=False, batch_size=None, grad_ys=None):
    """Build gradient-computation and/or gradient-application subgraphs.

    Args:
        weights: graph node holding (possibly nested) weight tensors.
        loss: if given, gradients of loss w.r.t. weights are computed.
        optimizer: if given, an apply op fed via gradient placeholders is built.
        norm: falsy -> no clipping; otherwise the max global norm to clip to.
        batch_size: if given, gradients are divided by this factor.
        grad_ys: optional upstream gradients passed to tf.gradients.
    """
    if loss is not None:
        gradients = tf.gradients(loss.node, list(utils.Utils.flatten(weights.node)), grad_ys)
        # Fail fast at runtime on NaN/Inf gradients.
        gradients = [tf.check_numerics(g, 'gradient_%d' % i) for i, g in enumerate(gradients)]
        if batch_size is not None:
            gradients = [g / float(batch_size) for g in gradients]

        # store gradients global norm before clipping
        self.global_norm = tf.global_norm(gradients)

        # clip gradients after global norm has been stored
        if norm:
            gradients, _ = tf.clip_by_global_norm(gradients, norm)
        self.calculate = graph.TfNode(utils.Utils.reconstruct(gradients, weights.node))
    if optimizer is not None:
        self.ph_gradients = graph.Placeholders(weights)
        self.apply = graph.TfNode(optimizer.node.apply_gradients(
            utils.Utils.izip(self.ph_gradients.checked, weights.node)))
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    # Exponentially decayed learning rate, floored at hps.min_lr.
    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    # Gradient computation/clipping is placed on the last reserved GPU.
    with tf.device(self._get_gpu(self._num_gpus-1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    config = self._config

    tvars = tf.trainable_variables()
    # Gradient computation/clipping is placed on the last reserved GPU.
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), config.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)

    # Learning rate decays exponentially but never below config.min_lr.
    lr_rate = tf.maximum(
        config.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(config.lr, self.global_step,
                                   config.decay_steps, config.decay_rate))

    optimizer = tf.train.AdamOptimizer(lr_rate, epsilon=config.adam_epsilon)
    tf.summary.scalar('learning_rate', lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    config = self._config

    tvars = tf.trainable_variables()
    # Gradient computation/clipping is placed on the last reserved GPU.
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), config.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)

    # Learning rate decays exponentially but never below config.min_lr.
    lr_rate = tf.maximum(
        config.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(config.lr, self.global_step, 30000, 0.98))

    # Only 'adam' and 'gradient_descent' are supported optimizer names.
    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(lr_rate, epsilon=config.adam_epsilon)
    else:
        assert config.optimizer == 'gradient_descent', config.optimizer
        optimizer = tf.train.GradientDescentOptimizer(lr_rate)
    tf.summary.scalar('learning_rate', lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
def training_graph(loss, learning_rate=1.0, max_grad_norm=5.0):
    ''' Builds training graph.

    Args:
        loss: scalar loss tensor.
        learning_rate: initial SGD learning rate (stored in a variable so it
            can be reassigned externally).
        max_grad_norm: maximum global gradient norm.

    Returns:
        adict with learning_rate, global_step, global_norm and train_op.
    '''
    global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('SGD_Training'):
        # SGD learning parameter
        learning_rate = tf.Variable(learning_rate, trainable=False, name='learning_rate')

        # collect all trainable variables
        tvars = tf.trainable_variables()
        grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), max_grad_norm)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return adict(
        learning_rate=learning_rate,
        global_step=global_step,
        global_norm=global_norm,
        train_op=train_op)
def clip_grad_global_norms(tvars, loss, opt, global_norm=1, gate_gradients=1, gradient_noise_scale=4.0, GATE_GRAPH=2, grad_loss=None, agre_method=None, col_grad_ops=False):
    """Clips the gradients by the given value.

    Args:
        tvars: trainable variables used for gradient updates
        loss: total loss of the network
        opt: optimizer (unused in this body; kept for interface compatibility)
        global_norm: the maximum global norm

    Returns:
        A list of clipped gradient to variable pairs.
    """
    # NOTE(review): Variable.ref() is a pre-1.0 TF API; newer code uses read_value().
    var_refs = [v.ref() for v in tvars]
    grads = tf.gradients(loss, var_refs, grad_ys=grad_loss,
                         gate_gradients=(gate_gradients == 1),
                         aggregation_method=agre_method,
                         colocate_gradients_with_ops=col_grad_ops)
    if gradient_noise_scale > 1:
        # Regularize by adding scaled noise to the gradients.
        grads = add_scaled_noise_to_gradients(
            list(zip(grads, tvars)), gradient_noise_scale=gradient_noise_scale)
    if gate_gradients == GATE_GRAPH:
        # Gate all gradients together so none is applied before all are computed.
        grads = tf.tuple(grads)
    grads, _ = tf.clip_by_global_norm(grads, global_norm)
    grads_and_vars = list(zip(grads, tvars))
    return grads_and_vars
def _clip_grad_global_norms(self, tvars, loss, opt, global_norm=8, gate_gradients=1, gradient_noise_scale=None, GATE_GRAPH=2, grad_loss=None, agre_method=None, col_grad_ops=False):
    """Clips the gradients by the given value.

    Args:
        tvars: trainable variables used for gradient updates
        loss: total loss of the network
        opt: optimizer (unused in this body; kept for interface compatibility)
        global_norm: the maximum global norm

    Returns:
        A list of clipped gradient to variable pairs.
    """
    var_refs = [v.read_value() for v in tvars]
    grads = tf.gradients(loss, var_refs, grad_ys=grad_loss,
                         gate_gradients=(gate_gradients == 1),
                         aggregation_method=agre_method,
                         colocate_gradients_with_ops=col_grad_ops)
    if gradient_noise_scale is not None:
        # Optionally regularize by adding scaled noise to the gradients.
        grads = self._add_scaled_noise_to_gradients(
            list(zip(grads, tvars)), gradient_noise_scale=gradient_noise_scale)
    if gate_gradients == GATE_GRAPH:
        # Gate all gradients together so none is applied before all are computed.
        grads = tf.tuple(grads)
    grads, _ = tf.clip_by_global_norm(grads, global_norm)
    grads_and_vars = list(zip(grads, tvars))
    return grads_and_vars
def _build_optimizer(self):
    """Construct the RMSProp training operation for this model.

    Gradients of the loss w.r.t. every trainable variable are clipped to
    ``self.max_gradient_norm`` (global norm) before being applied, which
    guards against overly drastic parameter updates.
    See also tf.clip_by_global_norm.

    Returns:
        tf.Operation: An operation that updates the model's trainable parameters.
    """
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(self._loss, trainable)
    clipped, _ = tf.clip_by_global_norm(raw_grads, self.max_gradient_norm)

    rmsprop = tf.train.RMSPropOptimizer(self._learning_rate)
    return rmsprop.apply_gradients(zip(clipped, trainable))
def __init__(self, is_training, config):
    """Build an LSTM regression graph (pre-1.0 TF rnn_cell/rnn API).

    Args:
        is_training: enables dropout and the training op.
        config: provides batch_size, num_steps, hidden_size, keep_prob,
            num_layers and max_grad_norm.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size

    self._input_data = tf.placeholder(tf.float32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.float32, [batch_size, num_steps])

    lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
    if is_training and config.keep_prob < 1:
        lstm_cell = rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Project the scalar input at each time step up to the hidden size.
    iw = tf.get_variable("input_w", [1, size])
    ib = tf.get_variable("input_b", [size])
    inputs = [tf.nn.xw_plus_b(i_, iw, ib) for i_ in tf.split(1, num_steps, self._input_data)]
    if is_training and config.keep_prob < 1:
        inputs = [tf.nn.dropout(input_, config.keep_prob) for input_ in inputs]

    outputs, states = rnn.rnn(cell, inputs, initial_state=self._initial_state)
    rnn_output = tf.reshape(tf.concat(1, outputs), [-1, size])

    # Single scalar output per (batch, step).
    self._output = output = tf.nn.xw_plus_b(rnn_output,
                                            tf.get_variable("out_w", [size, 1]),
                                            tf.get_variable("out_b", [1]))
    # Mean squared error against the flattened targets.
    self._cost = cost = tf.reduce_mean(tf.square(output - tf.reshape(self._targets, [-1])))
    self._final_state = states[-1]

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm)
    #optimizer = tf.train.GradientDescentOptimizer(self.lr)
    # NOTE(review): self.lr is presumably a property exposing self._lr,
    # defined elsewhere in the class — verify.
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def build_train(self, losses):
    """Build per-bucket clipped-SGD update ops (Python 2: uses xrange).

    Returns:
        self.updates — one update op per bucket, or None in forward-only mode.
    """
    # TODO: modify total_loss to handle buckets
    self.updates = None
    with self.G.as_default():
        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not self.forward_only:
            self.gradient_norms = []
            self.updates = []
            self.opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(self.buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(self.opt.apply_gradients(
                    zip(clipped_gradients, params), global_step=self.global_step))
    return self.updates  # note: this is per-bucket
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    # Exponentially decayed learning rate, floored at hps.min_lr.
    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    # Gradient computation/clipping is placed on the last reserved GPU.
    with tf.device(self._get_gpu(self._num_gpus-1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    # NOTE(review): tf.scalar_summary is the pre-1.0 summary API
    # (tf.summary.scalar in TF >= 1.0).
    tf.scalar_summary('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.scalar_summary('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
def apply_gradients(self, grads_tvars, global_step=None, name=None):
    """Apply (grad, var) pairs with optional global-norm clipping, then run
    the meta-optimizer bookkeeping (after_apply, hyper-parameter update) and
    increment the internal global step.

    Args:
        grads_tvars: iterable of (gradient, variable) pairs.
        global_step: unused here; kept for tf.train.Optimizer interface compatibility.
        name: unused here; kept for interface compatibility.

    Returns:
        A grouped op running the whole update chain.
    """
    self._grads, self._tvars = zip(*grads_tvars)

    with tf.variable_scope("apply_updates"):
        if self._clip_thresh_var is not None:
            # Clip threshold is itself a variable, so it can be tuned online.
            self._grads_clip, self._grads_norm = tf.clip_by_global_norm(self._grads, self._clip_thresh_var)
            apply_grad_op = \
                self._optimizer.apply_gradients(zip(self._grads_clip, self._tvars))
        else:
            apply_grad_op = \
                self._optimizer.apply_gradients(zip(self._grads, self._tvars))

    with tf.variable_scope("after_apply"):
        after_apply_op = self.after_apply()

    with tf.variable_scope("update_hyper"):
        # Hyper-parameter update must observe the post-apply statistics.
        with tf.control_dependencies([after_apply_op]):
            update_hyper_op = self.update_hyper_param()

    with tf.control_dependencies([update_hyper_op]):
        self._increment_global_step_op = tf.assign(self._global_step, self._global_step + 1)

    return tf.group(apply_grad_op, after_apply_op, update_hyper_op, self._increment_global_step_op)
def build_optimizer(loss, learning_rate, grad_clip):
    """Create an Adam training op with global-norm gradient clipping.

    Args:
        loss: scalar loss tensor.
        learning_rate: Adam step size.
        grad_clip: maximum global gradient norm.

    Returns:
        The gradient-application op.
    """
    trainable = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(loss, trainable), grad_clip)

    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped, trainable))
def _create_optimizer(self, args):
    """Build the policy loss — negative log-likelihood of the true actions
    under a diagonal Gaussian policy — and a clipped-Adam training op."""
    # Find negative log-likelihood of true actions
    std_a = tf.exp(self.a_logstd)
    # Gaussian NLL split into its three terms: constant, log-det, quadratic.
    pl_1 = 0.5 * tf.to_float(args.action_dim) * np.log(2. * np.pi)
    pl_2 = tf.to_float(args.action_dim) * tf.reduce_sum(tf.log(std_a))
    pl_3 = 0.5 * \
        tf.reduce_mean(tf.reduce_sum(
            tf.square((self.targets - self.a_mean) / std_a), 1))
    policy_loss = pl_1 + pl_2 + pl_3

    # Find overall loss
    self.cost = policy_loss
    self.summary_policy = tf.scalar_summary(
        "Policy loss", tf.reduce_mean(policy_loss))

    # Perform parameter update
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train = optimizer.apply_gradients(zip(grads, tvars))
def _add_train_op(self):
    """Create self._train_op: Adam with decayed learning rate and
    global-norm gradient clipping, plus merged summaries."""
    params = self._params

    # Exponentially decayed learning rate, floored at params.min_lr.
    self._lr_rate = tf.maximum(
        params.min_lr,
        tf.train.exponential_decay(params.lr, self._global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    # use reserved gpu for gradient computation
    with tf.device(self._get_gpu(self._num_gpus-1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), params.max_grad_norm)
    tf.scalar_summary('global_norm', global_norm)
    optimizer = tf.train.AdamOptimizer(self._lr_rate)
    tf.scalar_summary('learning rate', self._lr_rate)
    with tf.device(self._next_device()):
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self._global_step, name='train_step')
    self._summaries = tf.merge_all_summaries()
    # Trailing comma: returns a (train_op, loss) tuple.
    return self._train_op, self._loss,
def set_optimizer(self, session, learning_rate=0.5, learning_rate_decay_factor=0.99, max_gradient_norm=5.0, load_if_exist=True):
    """Build per-bucket clipped-SGD update ops, initialize all variables, and
    optionally restore the latest checkpoint from self.train_dir.

    NOTE(review): assumes self.gradient_norms and self.updates are lists
    initialized elsewhere (e.g. in __init__) — verify.
    """
    self.global_step = tf.Variable(0, trainable=False)
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    # Running this op decays the learning rate by the given factor.
    self.learning_rate_opr = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)
    self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    self.outputs, self.losses = self.calc_loss()
    params = tf.trainable_variables()
    for b in range(len(self.buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(self.optimizer.apply_gradients(zip(clipped_gradients, params),
                                                           global_step=self.global_step))
    self.saver = tf.train.Saver(tf.all_variables())
    session.run(tf.initialize_all_variables())
    if load_if_exist and self.train_dir:
        saved = tf.train.get_checkpoint_state(self.train_dir)
        if saved and tf.gfile.Exists(saved.model_checkpoint_path):
            self.saver.restore(session, saved.model_checkpoint_path)
def set_optimizer(self, session, learning_rate=0.1, learning_rate_decay_factor=0.99, max_gradient_norm=5.0, load_if_exist=True):
    """Build per-bucket clipped-SGD update ops, initialize all variables, and
    optionally restore the latest checkpoint from self.train_dir.

    Identical to the 0.5-lr variant above except for the default learning rate.
    NOTE(review): assumes self.gradient_norms and self.updates are lists
    initialized elsewhere (e.g. in __init__) — verify.
    """
    self.global_step = tf.Variable(0, trainable=False)
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    # Running this op decays the learning rate by the given factor.
    self.learning_rate_opr = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)
    self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    self.outputs, self.losses = self.calc_loss()
    params = tf.trainable_variables()
    for b in range(len(self.buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(self.optimizer.apply_gradients(zip(clipped_gradients, params),
                                                           global_step=self.global_step))
    self.saver = tf.train.Saver(tf.all_variables())
    session.run(tf.initialize_all_variables())
    if load_if_exist and self.train_dir:
        saved = tf.train.get_checkpoint_state(self.train_dir)
        if saved and tf.gfile.Exists(saved.model_checkpoint_path):
            self.saver.restore(session, saved.model_checkpoint_path)
def build_shared_grad(self):
    """A3C-style worker: clip local-network gradients, build a sync op that
    copies parameter-server weights into the local model, and create an Adam
    train op with an exponentially decayed learning rate."""
    self.grads = tf.gradients(self.loss, self.local_network.var_list)
    clipped_grads, _ = tf.clip_by_global_norm(self.grads, self.config.max_grad_norm)

    # copy weights from the parameter server to the local model
    self.sync = tf.group(*[v1.assign(v2) for v1, v2 in
                           zip(self.local_network.var_list, self.network.var_list)])
    # Gradients from the local model are applied to the *shared* network.
    grads_and_vars = list(zip(clipped_grads, self.network.var_list))
    # Advance the global step by the number of observations in this batch.
    inc_step = self.global_step.assign_add(tf.shape(self.local_network.x)[0])

    # each worker has a different set of adam optimizer parameters
    self.lr = tf.train.exponential_decay(
        self.config.lr_start, self.global_step, self.config.lr_decay_step,
        self.config.lr_decay_rate, staircase=True, name='lr')
    opt = tf.train.AdamOptimizer(self.lr)
    self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
    self.summary_writer = None
    self.local_steps = 0

    self.build_summary()
def clip(grads_and_vars, max_global_norm):
    """
    Clip the gradients that are returned from a TensorFlow Optimizer.

    Note that the term "clipping" is often used in literature but here is
    actually the wrong term: if the norm of all gradients concatenated does
    not exceed `max_global_norm`, then don't modify them. If the norm does
    exceed `max_global_norm`, then rescale all gradients globally so that
    the new norm becomes `max_global_norm`.

    Args:
        grads_and_vars: A list of `(grad, var)` pairs.
        max_global_norm: A float.

    Returns:
        A list of `(grad, var)` pairs with clipped gradients.
    """
    # Renamed from `vars` to avoid shadowing the builtin.
    grads, variables = zip(*grads_and_vars)
    grads, _ = tf.clip_by_global_norm(grads, clip_norm=max_global_norm)
    grads_and_vars = list(zip(grads, variables))
    return grads_and_vars
def _training(self, loss, config):
    """Create the Adam training op with global-norm gradient clipping.

    The op returned from this is what is passed to session run.

    Args:
        loss: scalar loss tensor.
        config: object providing max_grad_norm.

    Returns:
        Op that applies clipped gradients to all trainable variables.
    """
    params = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(loss, params),
                                        config.max_grad_norm)
    adam = tf.train.AdamOptimizer()
    return adam.apply_gradients(zip(clipped, params))
def get_gradient_clipper(clipper, *args, **kwargs):
    """
    Simple helper to get Gradient Clipper
    E.g: clipper = get_gradient_clipper('value', value_min, value_max, name='ValueClip')
    :param clipper: a string denoting TF Gradient Clipper (e.g. "global_norm",
        denote tf.clip_by_global_norm) or a function of type f(tensor) -> clipped_tensor
    :param args: used to create the clipper
    :param kwargs: used to create the clipper
    :return: a function (tensor) -> (clipped tensor)
    """
    if callable(clipper):
        return clipper
    # Special-case global_norm: tf.clip_by_global_norm returns a pair
    # (clipped_list, norm scalar); only the list is wanted.
    if clipper == 'global_norm':
        def _clip_global(t_list):
            return tf.clip_by_global_norm(t_list, *args, **kwargs)[0]
        return _clip_global
    if clipper not in _str2clipper:
        raise ValueError('clipper should be a callable function or a given key in _str2clipper!')
    clipper_fn = _str2clipper[clipper]
    # Per-tensor clippers are applied element-wise over the list.
    return lambda t_list: [clipper_fn(t, *args, **kwargs) for t in t_list]
def _add_train_op(self):  # build the training op
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    # Exponentially decayed learning rate, floored at hps.min_lr.
    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    # Gradient computation/clipping is placed on the last reserved GPU.
    with tf.device(self._get_gpu(self._num_gpus-1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.scalar_summary('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.scalar_summary('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
def attach_cost(self, gen_model):
    """Attach the GAN discriminator loss (real-text term + generated-sample
    term) and an Adam train op restricted to DISC-prefixed variables."""
    # TODO: Shouldn't dynamic RNN be used here?
    # output_text, states_text = rnn.rnn(cell, inputs, initial_state=self.initial_state)
    predicted_classes_text = self.discriminate_text(self.input_data_text)
    # Real text should be classified as 1.
    self.loss_text = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(predicted_classes_text,
                                                                            np.ones((self.args.batch_size, 1), dtype=np.float32)))
    generated_wv = gen_model.generate()
    predicted_classes_wv = self.discriminate_wv(generated_wv)
    # Generated word vectors should be classified as 0.
    self.loss_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(predicted_classes_wv,
                                                                           np.zeros((self.args.batch_size, 1), dtype=np.float32)))
    self.loss = .5 * self.loss_gen + .5 * self.loss_text
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), self.args.grad_clip)
    # optimize only discriminator owned variables
    g_and_v = [(g, v) for g, v in zip(grads, tvars) if v.name.startswith('DISC')]
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(g_and_v)
def _create_training_tensors(self, optimizer_algorithm):
    """
    Create the tensors used for training.

    Args:
        optimizer_algorithm: one of 'adagrad', 'adam', 'adadelta'.

    Raises:
        ValueError: if optimizer_algorithm is not one of the supported names.
    """
    with tf.name_scope('training'):
        if optimizer_algorithm == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(self.learning_rate)
        elif optimizer_algorithm == 'adam':
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif optimizer_algorithm == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
        else:
            # BUG FIX: the original constructed ValueError(...) without
            # raising it, then fell through with `optimizer` unbound.
            raise ValueError('Unknown optimizer: %s' % optimizer_algorithm)

        gradients, v = zip(*optimizer.compute_gradients(self.loss))
        if self.clip_value is not None:
            gradients, _ = tf.clip_by_global_norm(gradients, self.clip_value)
        self.train_op = optimizer.apply_gradients(zip(gradients, v))
def add_train_op(self, loss):
    """Create the Adam training op; gradients are clipped by global norm
    only when self.config.cap_grads > 0.

    Returns:
        The gradient-application op (also advances self.global_step).
    """
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=self.lr)
    gradients, variables = zip(*opt.compute_gradients(loss))

    # save selected gradient summaries
    #for grad in gradients:
        #if 'BasicDecoder' in grad.name or 'gru_cell' in grad.name or 'highway_3' in grad.name:
            #tf.summary.scalar(grad.name, tf.reduce_sum(grad))

    # optionally cap and noise gradients to regularize
    if self.config.cap_grads > 0:
        with tf.variable_scope('cap_grads'):
            # Record the pre-clip norm for monitoring.
            tf.summary.scalar('global_gradient_norm', tf.global_norm(gradients))
            gradients, _ = tf.clip_by_global_norm(gradients, self.config.cap_grads)

    train_op = opt.apply_gradients(zip(gradients, variables), global_step=self.global_step)
    return train_op
def _build_train_op(self):
    """Build training specific ops for the graph."""
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.scalar_summary(self.mode + '/learning rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    # NOTE(review): optimizer names other than 'sgd'/'mom' leave `optimizer`
    # unbound and would raise NameError below — verify hps validation upstream.
    if self.hps.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
    elif self.hps.optimizer == 'mom':
        #optimizer = tf.train.AdamOptimizer(0.001)
        #ooptimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9, use_nesterov=True)
        optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

    # Clip gradients to a global norm of 1 before applying.
    clipped_grads, _ = tf.clip_by_global_norm(grads, 1)
    apply_op = optimizer.apply_gradients(
        zip(clipped_grads, trainable_variables),
        global_step=self.global_step, name='train_step')

    # Include batch-norm style update ops alongside the gradient step.
    train_ops = [apply_op] + self._extra_train_ops + tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    self.train_op = tf.group(*train_ops)
def train_neural_network():
    """Train the poetry RNN: clipped-Adam over per-example sequence loss,
    with per-epoch learning-rate decay and a checkpoint every 7 epochs."""
    logits, last_state, _, _, _ = neural_network()
    targets = tf.reshape(output_targets, [-1])
    loss = tf.nn.seq2seq.sequence_loss_by_example([logits], [targets],
                                                  [tf.ones_like(targets, dtype=tf.float32)], len(words))
    cost = tf.reduce_mean(loss)
    # Learning rate is a variable so it can be reassigned each epoch.
    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(tf.all_variables())
        for epoch in range(50):
            # Exponentially decay the learning rate each epoch.
            sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            n = 0
            for batche in range(n_chunk):
                train_loss, _ , _ = sess.run([cost, last_state, train_op],
                                             feed_dict={input_data: x_batches[n], output_targets: y_batches[n]})
                n += 1
                print(epoch, batche, train_loss)
            if epoch % 7 == 0:
                saver.save(sess, 'poetry.module', global_step=epoch)
def _clip_gradients_fn(self, grads_and_vars):
    """Clips gradients by global norm, with a separate per-tensor norm clip
    for embedding gradients (which arrive as IndexedSlices)."""
    gradients, variables = zip(*grads_and_vars)
    # Keep the unclipped pairs around for inspection/summaries.
    self._grads_and_vars = grads_and_vars
    if self._clip_gradients > 0.0:
        clipped_gradients, _ = tf.clip_by_global_norm(
            t_list=gradients, clip_norm=self._clip_gradients)
        grads_and_vars = list(zip(clipped_gradients, variables))
    if self._clip_embed_gradients > 0.0:
        clipped_gradients = []
        variables = []
        for gradient, variable in grads_and_vars:
            if "embedding" in variable.name or "Embedding" in variable.name:
                # Embedding gradients are sparse; clip only the values and
                # rebuild the IndexedSlices with the same indices/shape.
                tmp = tf.clip_by_norm(t=gradient.values, clip_norm=self._clip_embed_gradients)
                gradient = tf.IndexedSlices(tmp, gradient.indices, gradient.dense_shape)
            clipped_gradients.append(gradient)
            variables.append(variable)
        grads_and_vars = list(zip(clipped_gradients, variables))
    return grads_and_vars
def compute_gradients(loss, learning_rate, gradient_clipping=-1):
    """
    Create an Adam optimizer, compute gradients and (optionally) apply
    global-norm gradient clipping.

    Args:
        loss: scalar loss tensor.
        learning_rate: Adam learning rate.
        gradient_clipping: max global norm; <= 0 disables clipping.

    Returns:
        (optimizer, list of (grad, var) pairs).
    """
    optimizer = tf.train.AdamOptimizer(learning_rate)
    if gradient_clipping > 0:
        params = tf.trainable_variables()
        clipped, _ = tf.clip_by_global_norm(tf.gradients(loss, params),
                                            clip_norm=gradient_clipping)
        pairs = list(zip(clipped, params))
    else:
        pairs = optimizer.compute_gradients(loss)
    return optimizer, pairs
def _clip_gradients(self, grads_and_vars):
    """Clip the given (grad, var) pairs by global norm, using the
    threshold configured under "optimizer.clip_gradients"."""
    grad_list, var_list = zip(*grads_and_vars)
    clipped, _ = tf.clip_by_global_norm(grad_list,
                                        self.params["optimizer.clip_gradients"])
    return list(zip(clipped, var_list))
def TrainingOp(loss, dataSetSize, batch_size, max_grad_norm):
    """Build an RMSProp training op with global-norm gradient clipping and a
    learning rate that decays by 0.999 once per epoch (staircase).

    Args:
        loss: scalar loss tensor.
        dataSetSize: number of training examples (used to size an epoch).
        batch_size: examples per step.
        max_grad_norm: maximum global gradient norm.

    Returns:
        (train_op, learning_rate tensor).
    """
    trainable = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(loss, trainable), max_grad_norm)

    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0),
        trainable=False
    )
    steps_per_epoch = dataSetSize // batch_size
    learning_rate = tf.train.exponential_decay(
        1e-3, global_step, steps_per_epoch, 0.999, staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(clipped, trainable), global_step=global_step)
    return train_op, learning_rate
def clip_gradient(pair_list, max_norm):
    """Perform gradient clipping.
    If the gradients' global norm exceeds 'max_norm', then shrink it to 'max_norm'.

    :param pair_list: (grad, var) pair list.
    :param max_norm: The max global norm.
    :return: (grad, var) pair list, the original gradients' norm,
        the clipped gradients' norm
    """
    raw_grads = [g for g, _ in pair_list]
    # Second return of clip_by_global_norm is the *pre-clip* global norm.
    clipped_grads, raw_norm = tf.clip_by_global_norm(raw_grads, max_norm)
    clipped_norm = tf.global_norm(clipped_grads)
    new_pairs = [(g, pair[1]) for g, pair in zip(clipped_grads, pair_list)]
    return new_pairs, raw_norm, clipped_norm
def _backward(self, loss, summaries=False):
    """Compute gradients for the LM loss, rescale the embedding gradients,
    clip the LSTM gradients by global norm, and return (grad, var) pairs.

    Args:
        loss: scalar loss tensor (per-step average; rescaled below).
        summaries: if True, emit scalar summaries of the LSTM gradient
            norm/scale and weight norm.

    Returns:
        List of (gradient, variable) pairs covering embedding, LSTM and
        softmax variables, in that order.
    """
    hps = self.hps

    # Undo the per-step averaging so gradients match the summed loss.
    loss = loss * hps.num_steps

    # Variables are grouped by name prefix; the gradient list below is
    # sliced using the same group boundaries, so order matters.
    emb_vars = find_trainable_variables("emb")
    lstm_vars = find_trainable_variables("LSTM")
    softmax_vars = find_trainable_variables("softmax")

    all_vars = emb_vars + lstm_vars + softmax_vars
    grads = tf.gradients(loss, all_vars)
    orig_grads = grads[:]
    emb_grads = grads[:len(emb_vars)]
    grads = grads[len(emb_vars):]
    # Embedding gradients are sparse IndexedSlices; rescale their values
    # by batch_size (compensates for upstream normalization — the exact
    # rationale depends on how the loss is averaged; confirm with caller).
    for i in range(len(emb_grads)):
        assert isinstance(emb_grads[i], tf.IndexedSlices)
        emb_grads[i] = tf.IndexedSlices(emb_grads[i].values * hps.batch_size, emb_grads[i].indices,
                                        emb_grads[i].dense_shape)

    lstm_grads = grads[:len(lstm_vars)]
    softmax_grads = grads[len(lstm_vars):]

    # Only the LSTM gradients are clipped; embedding and softmax gradients
    # pass through unchanged.
    lstm_grads, lstm_norm = tf.clip_by_global_norm(lstm_grads, hps.max_grad_norm)
    clipped_grads = emb_grads + lstm_grads + softmax_grads
    assert len(clipped_grads) == len(orig_grads)

    if summaries:
        # NOTE: tf.scalar_summary is the pre-1.0 TF summary API.
        tf.scalar_summary("model/lstm_grad_norm", lstm_norm)
        tf.scalar_summary("model/lstm_grad_scale",
                          tf.minimum(hps.max_grad_norm / lstm_norm, 1.0))
        tf.scalar_summary("model/lstm_weight_norm", tf.global_norm(lstm_vars))
        # for v, g, cg in zip(all_vars, orig_grads, clipped_grads):
        #     name = v.name.lstrip("model/")
        #     tf.histogram_summary(name + "/var", v)
        #     tf.histogram_summary(name + "/grad", g)
        #     tf.histogram_summary(name + "/clipped_grad", cg)
    return list(zip(clipped_grads, all_vars))
def apply_gradients(self, grads):
    """Apply gradients: plain momentum SGD during a cold-start phase, then
    K-FAC once ``sgd_step`` exceeds ``_cold_iter``.

    Args:
        grads: list of (gradient, variable) pairs.

    Returns:
        (train_op, qr): ``train_op`` is a tf.cond selecting between the
        cold SGD update and the K-FAC update; ``qr`` is whatever
        ``apply_gradients_kfac()`` returns as its second value
        (presumably a queue runner — confirm against that method).
    """
    coldOptim = tf.train.MomentumOptimizer(
        self._cold_lr, self._momentum)

    def coldSGDstart():
        sgd_grads, sgd_var = zip(*grads)

        # Optionally clip by global norm before the momentum update.
        # (Fixed `!= None` to the idiomatic identity test; the unused
        # norm return value is discarded.)
        if self.max_grad_norm is not None:
            sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm)

        sgd_grads = list(zip(sgd_grads, sgd_var))

        # Count cold-start steps so tf.cond can switch to K-FAC later.
        sgd_step_op = tf.assign_add(self.sgd_step, 1)
        coldOptim_op = coldOptim.apply_gradients(sgd_grads)
        if KFAC_DEBUG:
            with tf.control_dependencies([sgd_step_op, coldOptim_op]):
                sgd_step_op = tf.Print(
                    sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
        return tf.group(*[sgd_step_op, coldOptim_op])

    # Built eagerly so both tf.cond branches exist in the graph.
    kfacOptim_op, qr = self.apply_gradients_kfac(grads)

    def warmKFACstart():
        return kfacOptim_op

    return tf.cond(tf.greater(self.sgd_step, self._cold_iter),
                   warmKFACstart, coldSGDstart), qr
def __call__(self, enc_input, dec_input_indices, valid_indices, left_indices, right_indices, values, valid_masks=None): batch_size = tf.shape(enc_input)[0] # forward computation graph with tf.variable_scope(self.scope): # encoder output enc_memory, enc_final_state_fw, _ = self.encoder(enc_input) # decoder dec_hiddens, dec_actions, dec_act_logps = self.decoder( enc_memory, dec_input_indices, valid_indices, left_indices, right_indices, valid_masks, init_state=enc_final_state_fw) # cost costs = [] update_ops = [] for step_idx, (act_logp, value, baseline) in enumerate(zip(dec_act_logps, values, self.baselines)): # costs.append(-tf.reduce_mean(act_logp * (value - baseline))) new_baseline = self.bl_ratio * baseline + (1-self.bl_ratio) * tf.reduce_mean(value) costs.append(-tf.reduce_mean(act_logp * value)) update_ops.append(tf.assign(baseline, new_baseline)) # gradient computation graph self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope) train_ops = [] for limit in self.buckets: print '0 ~ %d' % (limit-1) grad_params = tf.gradients(tf.reduce_sum(tf.pack(costs[:limit])), self.params) if self.max_grad_norm is not None: clipped_gradients, norm = tf.clip_by_global_norm(grad_params, self.max_grad_norm) else: clipped_gradients = grad_params train_op = self.optimizer.apply_gradients( zip(clipped_gradients, self.params)) with tf.control_dependencies([train_op] + update_ops[:limit]): # train_ops.append(tf.Print(tf.constant(1.), [norm])) train_ops.append(tf.constant(1.)) return dec_hiddens, dec_actions, train_ops #### test script
def define(self, char_num, rnn_dim, emb_dim, max_x, max_y, write_trans_model=True):
    """Build the attention-based seq2seq transliteration/translation graph.

    Creates per-timestep int32 placeholders for encoder input (max_x steps)
    and decoder targets (max_y steps), an embedding-attention seq2seq model,
    a sequence loss with padding masked out, and an Adagrad training op with
    gradients clipped to global norm 5.0. Optionally pickles the model's
    hyper-parameters next to the trained weights.

    Args:
        char_num: vocabulary size (shared by encoder and decoder).
        rnn_dim: LSTM hidden size.
        emb_dim: embedding size.
        max_x: encoder sequence length (number of input placeholders).
        max_y: decoder sequence length (number of target placeholders).
        write_trans_model: if True, dump the hyper-parameter dict to
            ``self.trained + '_model'``.
    """
    self.decode_step = max_y
    self.encode_step = max_x
    # One placeholder per timestep, as required by the legacy seq2seq API.
    self.en_vec = [tf.placeholder(tf.int32, [None], name='en_input' + str(i)) for i in range(max_x)]
    self.trans_labels = [tf.placeholder(tf.int32, [None], name='de_input' + str(i)) for i in range(max_y)]
    # Loss weights: 1.0 where the target id is non-zero, 0.0 on padding —
    # assumes id 0 is the padding symbol; confirm against the vocabulary.
    weights = [tf.cast(tf.sign(ot_t), tf.float32) for ot_t in self.trans_labels]
    # Decoder inputs are the targets shifted right by one, with a GO
    # position of zeros at the front.
    self.de_vec = [tf.zeros_like(self.trans_labels[0], tf.int32)] + self.trans_labels[:-1]
    # feed_previous=True switches the decoder to feed back its own
    # predictions (inference mode) instead of the gold labels.
    self.feed_previous = tf.placeholder(tf.bool)
    self.trans_l_rate = tf.placeholder(tf.float32, [], name='learning_rate')
    seq_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_dim, state_is_tuple=True)
    self.trans_output, states = seq2seq.embedding_attention_seq2seq(self.en_vec, self.de_vec, seq_cell, char_num, char_num, emb_dim, feed_previous=self.feed_previous)
    loss = seq2seq.sequence_loss(self.trans_output, self.trans_labels, weights)
    optimizer = tf.train.AdagradOptimizer(learning_rate=self.trans_l_rate)
    params = tf.trainable_variables()
    gradients = tf.gradients(loss, params)
    # Hard-coded clip norm of 5.0.
    clipped_gradients, norm = tf.clip_by_global_norm(gradients, 5.0)
    self.trans_train = optimizer.apply_gradients(zip(clipped_gradients, params))
    self.saver = tf.train.Saver()
    if write_trans_model:
        param_dic = {}
        param_dic['char_num'] = char_num
        param_dic['rnn_dim'] = rnn_dim
        param_dic['emb_dim'] = emb_dim
        param_dic['max_x'] = max_x
        param_dic['max_y'] = max_y
        # print param_dic
        # NOTE(review): pickling to a text-mode ('w') file — works on
        # Python 2 / protocol 0 only; confirm the intended interpreter.
        f_model = open(self.trained + '_model', 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()
def build(H, dat, sess):
    """Assemble the training graph from the hyper-parameter dict ``H``.

    Reads the fingerprint metadata file, sets up queued train/valid
    batches via FPTrunk, builds the model and sigmoid cross-entropy loss,
    and wires up the configured optimizer with global-norm gradient
    clipping.

    Returns:
        Tuple of (x, y, training, Xt, Yt, Xv, Yv, logits, loss, preds,
        opt, varst, gstep, train_opt, saver, fptrunk).
    """
    with open(META_DIR + 'fp.json') as fpj:
        meta = json.load(fpj)

    bsize = H['batch_size']
    x = tf.placeholder(tf.float32, shape=[64, 64, 1])
    y = tf.placeholder(tf.float32, shape=[1, ])
    training = tf.placeholder(tf.bool)

    fptrunk = FPTrunk(dat, x, y, bsize, sess)
    Xt, Yt = tf.train.batch(fptrunk.q['train'].dequeue(),
                            batch_size=bsize, capacity=bsize)
    Xv, Yv = tf.train.batch(fptrunk.q['valid'].dequeue(),
                            batch_size=bsize, capacity=bsize)

    logits, preds = model(H, Xt, training)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=tf.cast(Yt, tf.float32)))

    varst = tf.trainable_variables()
    gstep = tf.Variable(0, trainable=False)

    # Optimizer selected by name from the config.
    opts = {
        'RMS': tf.train.RMSPropOptimizer,
        'Adam': tf.train.AdamOptimizer,
        'SGD': tf.train.GradientDescentOptimizer,
        'Adagrad': tf.train.AdagradOptimizer
    }
    opt = opts[H['opt']](learning_rate=H['lr'])

    grads_vars = opt.compute_gradients(loss, varst)
    raw_grads, weight_vars = zip(*grads_vars)
    capped, _ = tf.clip_by_global_norm(list(raw_grads), H['norm_clip'])
    train_opt = opt.apply_gradients(list(zip(capped, weight_vars)),
                                    global_step=gstep)

    saver = tf.train.Saver(max_to_keep=None)
    return (x, y, training, Xt, Yt, Xv, Yv, logits, loss, preds, opt,
            varst, gstep, train_opt, saver, fptrunk)