The following are 50 code examples, extracted from open-source Python projects, that illustrate how to use tensorflow.clip_by_norm().
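Before the project excerpts, here is a minimal self-contained sketch of the typical pattern: compute gradients, clip each gradient tensor to a maximum L2 norm with tf.clip_by_norm, then apply the clipped gradients. This sketch is illustrative only and not taken from any of the projects below; the toy variable, loss, and clip value of 5.0 are made-up placeholders.

import tensorflow as tf

# Hypothetical toy setup: one weight vector and a squared-error loss.
w = tf.Variable(tf.random_normal([10]))
loss = tf.reduce_sum(tf.square(w - 1.0))

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
grads_and_vars = optimizer.compute_gradients(loss)

# Clip each gradient tensor so its L2 norm is at most 5.0.
# Gradients can be None for variables unrelated to the loss, so guard for that.
clipped = [(tf.clip_by_norm(g, 5.0), v) if g is not None else (g, v)
           for g, v in grads_and_vars]

train_op = optimizer.apply_gradients(clipped)

Most of the excerpts below follow this same compute-clip-apply structure, differing mainly in how they handle None gradients, tf.IndexedSlices, and optimizer choice.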
def get_train_op(self):
    """ define optimization operation """
    if self.args.optimizer == "SGD":
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.args.lr)
    elif self.args.optimizer == "ADAM":
        optimizer = tf.train.AdamOptimizer(learning_rate=self.args.lr)
    else:
        raise NotImplementedError("Other Optimizer Not Implemented.-_-||")

    # gradient clip
    grad_vars = optimizer.compute_gradients(self.loss)
    grad_vars = [
        (tf.clip_by_norm(grad, self.args.grad_clipping), var)
        if grad is not None else (grad, var)
        for grad, var in grad_vars]

    self.train_op = optimizer.apply_gradients(grad_vars, self.step)
    return

def build_model(self):
    self.build_memory()

    self.W = tf.Variable(tf.random_normal([self.edim, self.nwords], stddev=self.init_std))
    z = tf.matmul(self.hid[-1], self.W)

    self.loss = tf.nn.softmax_cross_entropy_with_logits(z, self.target)

    self.lr = tf.Variable(self.current_lr)
    self.opt = tf.train.GradientDescentOptimizer(self.lr)

    params = [self.A, self.B, self.C, self.T_A, self.T_B, self.W]
    grads_and_vars = self.opt.compute_gradients(self.loss, params)
    clipped_grads_and_vars = [(tf.clip_by_norm(gv[0], self.max_grad_norm), gv[1])
                              for gv in grads_and_vars]

    inc = self.global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        self.optim = self.opt.apply_gradients(clipped_grads_and_vars)

    tf.initialize_all_variables().run()
    self.saver = tf.train.Saver()

def clip_gradient_norms(gradients_to_variables, max_norm):
    """Clips the gradients by the given value.

    Args:
        gradients_to_variables: A list of gradient to variable pairs (tuples).
        max_norm: the maximum norm value.

    Returns:
        A list of clipped gradient to variable pairs.
    """
    clipped_grads_and_vars = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, tf.IndexedSlices):
                tmp = tf.clip_by_norm(grad.values, max_norm)
                grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad = tf.clip_by_norm(grad, max_norm)
        clipped_grads_and_vars.append((grad, var))
    return clipped_grads_and_vars

def _clip_grad_norms(self, gradients_to_variables, max_norm=5):
    """Clips the gradients by the given value.

    Args:
        gradients_to_variables: A list of gradient to variable pairs (tuples).
        max_norm: the maximum norm value.

    Returns:
        A list of clipped gradient to variable pairs.
    """
    grads_and_vars = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, tf.IndexedSlices):
                tmp = tf.clip_by_norm(grad.values, max_norm)
                grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad = tf.clip_by_norm(grad, max_norm)
        grads_and_vars.append((grad, var))
    return grads_and_vars

def set_train_op(loss, tvars):
    if FLAGS.optimizer_type == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate)
    elif FLAGS.optimizer_type == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(learning_rate=FLAGS.learning_rate)
    elif FLAGS.optimizer_type == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
    else:
        raise ValueError("Wrong optimizer_type.")

    gradients = optimizer.compute_gradients(loss, var_list=tvars)
    clipped_gradients = [(grad if grad is None else tf.clip_by_norm(grad, FLAGS.max_grads), var)
                         for grad, var in gradients]

    train_op = optimizer.apply_gradients(clipped_gradients)
    return train_op

def _clip_grad_norms(gradients_to_variables, max_norm=10):
    """Clips the gradients by the given value.

    Args:
        gradients_to_variables: A list of gradient to variable pairs (tuples).
        max_norm: the maximum norm value.

    Returns:
        A list of clipped gradient to variable pairs.
    """
    grads_and_vars = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, tf.IndexedSlices):
                tmp = tf.clip_by_norm(grad.values, max_norm)
                grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad = tf.clip_by_norm(grad, max_norm)
        grads_and_vars.append((grad, var))
    return grads_and_vars

def _clip_sparse(self, grad, var):
    assert isinstance(grad, tf.IndexedSlices)
    clip_dims = self._vars_to_clip_dims[var]
    if 0 in clip_dims:
        log.warn("Clipping norm across dims %s for %s is inefficient "
                 "when including sparse dimension 0.", clip_dims, var.op.name)
        return self._clip_dense(var)

    with tf.colocate_with(var):
        var_subset = tf.gather(var, grad.indices)
    with self._maybe_colocate_with(var):
        normalized_var_subset = tf.clip_by_norm(var_subset, self._max_norm, clip_dims)
        delta = tf.IndexedSlices(var_subset - normalized_var_subset,
                                 grad.indices, grad.dense_shape)
    with tf.colocate_with(var):
        return var.scatter_sub(delta, use_locking=self._use_locking)

def _clip_grad_norms(self, gradients_to_variables, max_norm=5):
    """Clips the gradients by the given value.

    Args:
        gradients_to_variables: A list of gradient to variable pairs (tuples).
        max_norm: the maximum norm value.

    Returns:
        A list of clipped gradient to variable pairs.
    """
    grads_and_vars = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, tf.IndexedSlices):
                tmp = tf.clip_by_norm(grad.values, max_norm)
                grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad = tf.clip_by_norm(grad, max_norm)
        grads_and_vars.append((grad, var))
    return grads_and_vars

def set_up_optimizer(loss, optimizer, params, clip_gradients):
    opt = {
        'adam': tf.train.AdamOptimizer,
        'sgd': tf.train.GradientDescentOptimizer,
        'momentum': tf.train.MomentumOptimizer,
        'adadelta': tf.train.AdadeltaOptimizer,
        'adagrad': tf.train.AdagradOptimizer,
        'rmsprop': tf.train.RMSPropOptimizer
    }[optimizer](**params)

    # optionally clip gradients by norm
    grads_and_vars = opt.compute_gradients(loss)
    if clip_gradients is not None:
        grads_and_vars = [(tf.clip_by_norm(grad, clip_gradients), var)
                          for grad, var in grads_and_vars]

    return opt, opt.apply_gradients(grads_and_vars)

def apply_gradients(self, var_list, accum_grad_list, name=None):
    update_ops = []

    with tf.device(self._device):
        with tf.control_dependencies(None):
            self._create_slots(var_list)

        with tf.name_scope(name, self._name, []) as name:
            self._prepare()
            for var, accum_grad in zip(var_list, accum_grad_list):
                with tf.name_scope("update_" + var.op.name), tf.device(var.device):
                    clipped_accum_grad = tf.clip_by_norm(accum_grad, self._clip_norm)
                    update_ops.append(self._apply_dense(clipped_accum_grad, var))
            return update_ops
            # return tf.group(*update_ops, name=name)

def build_train(self, total_loss):
    with self.G.as_default():
        self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        # can't use opt.minimize because we need to clip the gradients
        grads_and_vars = self.opt.compute_gradients(self.loss)
        grads_and_vars = [(tf.clip_by_norm(g, self.max_grad_norm), v) for g, v in grads_and_vars]
        grads_and_vars = [(add_gradient_noise(g), v) for g, v in grads_and_vars]
        nil_grads_and_vars = []
        for g, v in grads_and_vars:
            if v.name in self.nil_vars:
                nil_grads_and_vars.append((zero_nil_slot(g), v))
            else:
                nil_grads_and_vars.append((g, v))
        self.train_op = self.opt.apply_gradients(nil_grads_and_vars, name="train_op")
        return self.train_op

def minimize_and_clip(optimizer, objective, var_list=None, clip_val=10, exclude=None):
    """Minimize `objective` using `optimizer` w.r.t. variables in `var_list`,
    while ensuring the norm of the gradient for each variable is clipped to `clip_val`.
    """
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            # gradients[i] = (tf.clip_by_value(grad, -clip_val, clip_val), var)
            if (exclude is None) or (var not in exclude):
                gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)


############################
# Other NN Related

def __call__(self, inputs, center_state, module_state):
    """
    :return: output, new_center_features, new_module_state
    """
    with tf.variable_scope(self.name):
        reading_weights = tf.get_variable(
            'reading_weights',
            shape=[self.center_size, self.context_input_size],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        context_input = tf.matmul(center_state, tf.clip_by_norm(reading_weights, 1.0))
        inputs = tf.concat([inputs, context_input], axis=1) if self.input_size else context_input
        inputs = tf.contrib.layers.fully_connected(inputs, num_outputs=self.center_output_size)
        gru = tf.nn.rnn_cell.GRUCell(self.num_gru_units)
        gru_output, new_module_state = gru(inputs=inputs, state=module_state)
        output, center_feature_output = (
            tf.split(gru_output, [self.output_size, self.center_output_size], axis=1)
            if self.output_size else (None, gru_output))
        return output, center_feature_output, new_module_state

def train_step(x_batch, y_batch):
    """
    A single training step
    """
    feed_dict = {
        lstm.input_x: x_batch,
        lstm.input_y: y_batch,
        lstm.dropout_keep_prob: FLAGS.dropout_keep_prob,
        lstm.batch_size: FLAGS.batch_size,
        lstm.pad: np.zeros([FLAGS.batch_size, 1, FLAGS.embedding_dim, 1]),
        lstm.real_len: real_len(x_batch),
    }
    _, step, summaries, loss, accuracy = sess.run(
        [train_op, global_step, train_summary_op, lstm.loss, lstm.accuracy],
        feed_dict)
    # lstm.W = tf.clip_by_norm(lstm.W, 3)
    print("TRAIN step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
    train_summary_writer.add_summary(summaries, step)

def train_step(x_batch, y_batch):
    """
    A single training step
    """
    feed_dict = {
        lstm.input_x: x_batch,
        lstm.input_y: y_batch,
        lstm.dropout_keep_prob: FLAGS.dropout_keep_prob,
        lstm.batch_size: FLAGS.batch_size,
        lstm.real_len: real_len(x_batch)
    }
    _, step, summaries, loss, accuracy = sess.run(
        [train_op, global_step, train_summary_op, lstm.loss, lstm.accuracy],
        feed_dict)
    lstm.W = tf.clip_by_norm(lstm.W, 3)
    time_str = datetime.datetime.now().isoformat()
    print("TRAIN step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
    train_summary_writer.add_summary(summaries, step)

def train(self):
    learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step,
                                               self.decay_steps, self.decay_rate,
                                               staircase=True)
    # use grad_clip to handle exploding or vanishing gradients
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(self.loss_val)
    for idx, (grad, var) in enumerate(grads_and_vars):
        if grad is not None:
            grads_and_vars[idx] = (tf.clip_by_norm(grad, self.grad_clip), var)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
    return train_op

def train(self):
    learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step,
                                               self.decay_steps, self.decay_rate,
                                               staircase=True)
    # train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,
    #                                            learning_rate=learning_rate, optimizer='Adam')
    # use grad_clip to handle exploding or vanishing gradients
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(self.loss_val)
    for idx, (grad, var) in enumerate(grads_and_vars):
        if grad is not None:
            grads_and_vars[idx] = (tf.clip_by_norm(grad, self.grad_clip), var)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
    return train_op

# test started

def _clip_gradients_fn(self, grads_and_vars):
    """Clips gradients by global norm."""
    gradients, variables = zip(*grads_and_vars)
    self._grads_and_vars = grads_and_vars

    if self._clip_gradients > 0.0:
        clipped_gradients, _ = tf.clip_by_global_norm(
            t_list=gradients, clip_norm=self._clip_gradients)
        grads_and_vars = list(zip(clipped_gradients, variables))

    if self._clip_embed_gradients > 0.0:
        clipped_gradients = []
        variables = []
        for gradient, variable in grads_and_vars:
            if "embedding" in variable.name or "Embedding" in variable.name:
                tmp = tf.clip_by_norm(t=gradient.values, clip_norm=self._clip_embed_gradients)
                gradient = tf.IndexedSlices(tmp, gradient.indices, gradient.dense_shape)
            clipped_gradients.append(gradient)
            variables.append(variable)
        grads_and_vars = list(zip(clipped_gradients, variables))

    return grads_and_vars

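The example above combines two related ops that are easy to confuse: tf.clip_by_global_norm rescales a whole list of gradients jointly by their combined norm, while tf.clip_by_norm clips each tensor independently. The sketch below is illustrative only (not from any of the projects on this page) and uses made-up constant tensors to show the difference.

import tensorflow as tf

g1 = tf.constant([3.0, 4.0])   # L2 norm 5
g2 = tf.constant([6.0, 8.0])   # L2 norm 10

# Per-tensor clipping: each tensor is scaled on its own so its norm is <= 5.
per_tensor = [tf.clip_by_norm(g, 5.0) for g in (g1, g2)]

# Global clipping: both tensors are scaled by the same factor so that the
# combined norm sqrt(5^2 + 10^2) is reduced to 5; relative magnitudes are kept.
global_clipped, global_norm = tf.clip_by_global_norm([g1, g2], 5.0)

with tf.Session() as sess:
    print(sess.run(per_tensor))      # g1 unchanged, g2 halved: both come out as [3., 4.]
    print(sess.run(global_clipped))  # both scaled by the same factor 5 / sqrt(125)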
def _create_training_op(self, learning_rate, opt=tf.train.AdamOptimizer,
                        opt_config=dict(), var_list=None):
    loss_sy = tf.losses.get_total_loss()
    optimizer = opt(learning_rate, **opt_config)
    grads_and_vars = optimizer.compute_gradients(loss_sy, var_list=var_list)

    if self.grad_clip_norm is not None:
        with tf.variable_scope('gradient_clipping'):
            grads_and_vars = [(tf.clip_by_norm(grad, self.grad_clip_norm), var)
                              for grad, var in grads_and_vars if grad is not None]
            tf.summary.histogram('gradients', grads_and_vars[0])

    self.training_op = optimizer.apply_gradients(grads_and_vars)

def _add_train_graph(self):
    """Define the training operation."""
    mc = self.mc

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    lr = tf.train.exponential_decay(mc.LEARNING_RATE,
                                    self.global_step,
                                    mc.DECAY_STEPS,
                                    mc.LR_DECAY_FACTOR,
                                    staircase=True)

    tf.summary.scalar('learning_rate', lr)

    _add_loss_summaries(self.loss)

    opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=mc.MOMENTUM)
    grads_vars = opt.compute_gradients(self.loss, tf.trainable_variables())

    with tf.variable_scope('clip_gradient') as scope:
        for i, (grad, var) in enumerate(grads_vars):
            grads_vars[i] = (tf.clip_by_norm(grad, mc.MAX_GRAD_NORM), var)

    apply_gradient_op = opt.apply_gradients(grads_vars, global_step=self.global_step)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)

    for grad, var in grads_vars:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)

    with tf.control_dependencies([apply_gradient_op]):
        self.train_op = tf.no_op(name='train')

def clip_gradient_norms(gradients_to_variables, max_norm):
    clipped_grads_and_vars = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, tf.IndexedSlices):
                tmp = tf.clip_by_norm(grad.values, max_norm)
                grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad = tf.clip_by_norm(grad, max_norm)
        clipped_grads_and_vars.append((grad, var))
    return clipped_grads_and_vars

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` using `optimizer` w.r.t. variables in `var_list`,
    while ensuring the norm of the gradient for each variable is clipped to `clip_val`.
    """
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)

def _clip_gradients(self, grads_and_vars):
    """In addition to standard gradient clipping, also clips embedding
    gradients to a specified value."""
    grads_and_vars = super(Seq2SeqModel, self)._clip_gradients(grads_and_vars)

    clipped_gradients = []
    variables = []
    for gradient, variable in grads_and_vars:
        if "embedding" in variable.name:
            tmp = tf.clip_by_norm(
                gradient.values, self.params["optimizer.clip_embed_gradients"])
            gradient = tf.IndexedSlices(tmp, gradient.indices, gradient.dense_shape)
        clipped_gradients.append(gradient)
        variables.append(variable)
    return list(zip(clipped_gradients, variables))

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` using `optimizer` w.r.t. variables in `var_list`,
    while ensuring the norm of the gradient for each variable is clipped to `clip_val`.
    """
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return gradients, optimizer.apply_gradients(gradients)

def compute_gradients(self, loss, var_list=None, gate_gradients=1):
    grads_and_vars = self._optimizer.compute_gradients(
        loss, var_list=var_list, gate_gradients=gate_gradients)
    results = []
    for grad, var in grads_and_vars:
        # grad, var = pair[0], pair[1]
        if grad is not None:
            grad = tf.clip_by_norm(grad, self._clip)
        results.append((grad, var))
    return results

def _add_optimizer(self):
    self.optimizer = AdamOptimizer()
    self.final_train_loss = self.main_train_loss

    with tf.variable_scope('l2_regularization'):
        # Find variables to regularize by iterating over all variables and checking if in set.
        # Haven't found a way to directly get variables by absolute path.
        l2_regularized_names = {
            'encoder/bidirectional_rnn/fw/gru_cell/gates/weights:0'
            # If used, add additional complete variable names
        }
        l2_regularized = [variable for variable in tf.trainable_variables()
                          if variable.name in l2_regularized_names]
        l2_loss = 0.001 * tf.add_n([tf.nn.l2_loss(variable) for variable in l2_regularized])
        # self.train_loss += l2_loss

    gradients = self.optimizer.compute_gradients(self.final_train_loss)

    with tf.variable_scope('gradient_clipping'):
        def clip_gradient(gradient, variable):
            # Only clip normal tensors, IndexedSlices gives warning otherwise
            if isinstance(gradient, tf.Tensor):
                gradient = tf.clip_by_norm(gradient, 10)
            return gradient, variable

        gradients = [clip_gradient(gradient, variable) for gradient, variable in gradients]

    self.minimize_operation = self.optimizer.apply_gradients(gradients,
                                                             global_step=self.global_step)

def _setup_training(self, batch_size, clip, optimizer, training_set, summary_writer, l2,
                    clip_op, **kwargs):
    global_step = tf.train.create_global_step()
    if not self._is_setup:
        # First setup shared resources, e.g., vocabulary. This depends on the input module.
        logger.info("Setting up model...")
        self.setup_from_data(training_set, is_training=True)
    logger.info("Preparing training data...")
    batches = self.input_module.batch_generator(training_set, batch_size, is_eval=False)
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    loss = self.model_module.tensors[Ports.loss]
    summaries = None
    if summary_writer is not None:
        summaries = tf.summary.merge_all()
    if l2:
        loss += tf.add_n([tf.nn.l2_loss(v) for v in self.model_module.train_variables]) * l2
    if clip:
        gradients = optimizer.compute_gradients(loss)
        if clip_op == tf.clip_by_value:
            gradients = [(tf.clip_by_value(grad, clip[0], clip[1]), var)
                         for grad, var in gradients if grad is not None]
        elif clip_op == tf.clip_by_norm:
            gradients = [(tf.clip_by_norm(grad, clip), var)
                         for grad, var in gradients if grad is not None]
        min_op = optimizer.apply_gradients(gradients, global_step)
    else:
        min_op = optimizer.minimize(loss, global_step)

    variable_size = lambda v: reduce(lambda x, y: x * y, v.get_shape().as_list()) if v.get_shape() else 1
    num_params = sum(variable_size(v) for v in self.model_module.train_variables)
    logger.info("Number of parameters: %d" % num_params)

    # initialize non-model variables like learning rate, optimizer vars ...
    self.session.run([v.initializer for v in tf.global_variables()
                      if v not in self.model_module.variables])

    return batches, loss, min_op, summaries

def setup_models(self, hidden_layer_size, summary_file):
    # setup the separate core and target networks
    self.core_state, self.core_q_values = build_model("core", self.state_size,
                                                      self.num_actions, hidden_layer_size)
    self.target_state, self.target_q_values = build_model("target", self.state_size,
                                                           self.num_actions, hidden_layer_size)

    # build the global copy op that will copy core network onto target
    self.clobber_target_net_op = copy_all_vars(
        from_namespace="core", to_namespace="target",
        affine_coefficient=self.target_network_update_coeff)

    # left hand side of the bellman update; Q(s1, a)
    self.core_action_mask = tf.placeholder(dtype=tf.float32, shape=[None, self.num_actions],
                                           name="core_action_mask")
    self.core_q_value_for_action = tf.reduce_sum(self.core_q_values * self.core_action_mask)

    # right hand side of bellman update; reward + max_a Q(s2, a')
    self.reward = tf.placeholder(dtype=tf.float32, name="reward")
    self.discount_p = tf.placeholder(dtype=tf.float32, name="discount")
    self.max_target_q_value_plus_reward = self.reward + (
        self.discount_p * tf.stop_gradient(tf.reduce_max(self.target_q_values)))

    # for loss just use squared loss on the difference
    self.temporal_difference_loss = tf.reduce_mean(
        tf.pow(self.max_target_q_value_plus_reward - self.core_q_value_for_action, 2))

    self.learning_rate_p = tf.placeholder(dtype=tf.float32, name="learning_rate")
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_p)
    # optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, decay=0.9)
    gradients = optimizer.compute_gradients(self.temporal_difference_loss)
    for i, (gradient, variable) in enumerate(gradients):
        if gradient is None:  # eg stop gradient cases
            continue
        gradients[i] = (tf.clip_by_norm(gradient, self.gradient_clip), variable)
        tf.histogram_summary(variable.name, variable)
        tf.histogram_summary(variable.name + '/gradients', gradient)
    tf.scalar_summary("temporal_difference_loss", self.temporal_difference_loss)
    self.train_op = optimizer.apply_gradients(gradients)

    # build session
    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
    self.summaries = tf.merge_all_summaries()
    self.summary_writer = tf.train.SummaryWriter(summary_file, self.sess.graph_def)