我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用tensorflow.global_norm()。
def add_training_op(self, loss):
    """Build the training op, optionally clipping gradients by global norm.

    The optimizer class is looked up on ``tf.train`` under the name
    ``self.config.optimizer + 'Optimizer'`` and constructed with
    ``self.config.learning_rate``. The global norm of the gradients that
    are applied is stored in ``self.grad_norm``.

    Args:
        loss: Loss tensor to differentiate.

    Returns:
        The op returned by ``optimizer.apply_gradients``.
    """
    optimizer_cls = getattr(tf.train, self.config.optimizer + 'Optimizer')
    assert issubclass(optimizer_cls, tf.train.Optimizer)
    optimizer = optimizer_cls(self.config.learning_rate)

    grads_and_vars = optimizer.compute_gradients(loss)
    variables = [pair[1] for pair in grads_and_vars]
    gradients = [pair[0] for pair in grads_and_vars]

    # Clipping is enabled only for a strictly positive threshold.
    clipped = gradients
    if self.config.gradient_clip > 0:
        clipped, _ = tf.clip_by_global_norm(
            gradients, self.config.gradient_clip)

    self.grad_norm = tf.global_norm(clipped)
    return optimizer.apply_gradients(zip(clipped, variables))
def _add_gradients_summaries(grads_and_vars):
    """Create histogram summaries for each gradient and its norm.

    Variables whose gradient is ``None`` are logged via ``tf.logging.info``
    and skipped. For ``tf.IndexedSlices`` gradients only ``.values`` is
    summarized.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs.

    Returns:
        A list with two summaries (gradient, gradient norm) per variable
        that has a gradient.
    """
    summaries = []
    for grad, var in grads_and_vars:
        if grad is None:
            tf.logging.info('Var %s has no gradient', var.op.name)
            continue

        is_sparse = isinstance(grad, tf.IndexedSlices)
        grad_values = grad.values if is_sparse else grad

        summaries.append(
            tf.summary.histogram(var.op.name + ':gradient', grad_values))
        summaries.append(
            tf.summary.histogram(var.op.name + ':gradient_norm',
                                 tf.global_norm([grad_values])))
    return summaries
def setup_train_op(self):
    """Create the Adam training op and related attributes on ``self``.

    Sets:
        self.global_grad: global norm of the gradients that are applied
            (after clipping, when clipping is enabled).
        self.gradients: list of (gradient, variable) pairs that are applied.
        self.train_op: op returned by ``apply_gradients``.
        self.init: ``tf.global_variables_initializer()``.

    Clipping is skipped when ``self.config.max_gradient_norm`` equals -1.
    """
    with tf.variable_scope("train_step"):
        adam_optimizer = tf.train.AdamOptimizer()
        grads, variables = zip(
            *adam_optimizer.compute_gradients(self.loss))

        clip_val = self.config.max_gradient_norm
        # -1 is the sentinel for "do not clip".
        if clip_val != -1:
            grads, _ = tf.clip_by_global_norm(grads, clip_val)

        self.global_grad = tf.global_norm(grads)
        # Materialize the pairs: a bare zip() is a one-shot iterator on
        # Python 3 and would be empty after apply_gradients consumes it.
        self.gradients = list(zip(grads, variables))
        self.train_op = adam_optimizer.apply_gradients(self.gradients)

    self.init = tf.global_variables_initializer()
def build_graph(self, weights, loss=None, optimizer=None, norm=False,
                batch_size=None, grad_ys=None):
    """Build gradient-computation and/or gradient-application nodes.

    When ``loss`` is given, sets ``self.global_norm`` (global norm of the
    gradients before any clipping) and ``self.calculate``. When
    ``optimizer`` is given, sets ``self.ph_gradients`` and ``self.apply``.

    Args:
        weights: Object whose ``.node`` holds the (possibly nested) weights.
        loss: Optional object whose ``.node`` is the tensor to differentiate.
        optimizer: Optional object whose ``.node`` offers ``apply_gradients``.
        norm: Clip threshold passed to ``tf.clip_by_global_norm``; a falsy
            value disables clipping.
        batch_size: If not ``None``, every gradient is divided by it.
        grad_ys: Forwarded to ``tf.gradients``.
    """
    if loss is not None:
        raw_grads = tf.gradients(
            loss.node, list(utils.Utils.flatten(weights.node)), grad_ys)
        gradients = [
            tf.check_numerics(grad, 'gradient_%d' % idx)
            for idx, grad in enumerate(raw_grads)
        ]
        if batch_size is not None:
            gradients = [grad / float(batch_size) for grad in gradients]

        # The norm is recorded before clipping on purpose.
        self.global_norm = tf.global_norm(gradients)
        if norm:
            gradients, _ = tf.clip_by_global_norm(gradients, norm)

        restored = utils.Utils.reconstruct(gradients, weights.node)
        self.calculate = graph.TfNode(restored)

    if optimizer is not None:
        self.ph_gradients = graph.Placeholders(weights)
        grads_and_weights = utils.Utils.izip(
            self.ph_gradients.checked, weights.node)
        self.apply = graph.TfNode(
            optimizer.node.apply_gradients(grads_and_weights))
def _numerically_stable_global_norm(tensor_list):
    """Compute the global norm of a list of tensors with rescaling.

    Every non-``None`` tensor is divided by the largest absolute element
    found across the list before ``tf.global_norm`` is applied, and the
    result is multiplied back by that maximum.

    NOTE(review): if every element is zero the maximum is 0 and the
    division is 0/0 -- confirm whether callers can hit this case.

    Args:
        tensor_list: A list whose entries are tensors or ``None``.

    Returns:
        ``0.0`` when there is no non-``None`` entry, otherwise a scalar
        tensor holding the global norm.
    """
    present = [t for t in tensor_list if t is not None]
    if not present:
        return 0.0

    per_tensor_max = [tf.reduce_max(tf.abs(t)) for t in present]
    list_max = tf.reduce_max(per_tensor_max)
    rescaled = [t / list_max for t in present]
    return list_max * tf.global_norm(rescaled)
def summary_gradients(grad_vars, summary_types, collections=None):
    """Add summaries for every gradient plus one global-norm scalar.

    Args:
        grad_vars: Sequence of (gradient, variable) pairs.
        summary_types: Summary type names forwarded one by one to
            ``summary_param`` (e.g. 'scalar', 'histogram', 'norm').
        collections: Collections for the global-norm scalar summary.

    NOTE(review): the per-gradient ``summary_param`` calls pass
    ``collections=None`` rather than the ``collections`` argument; this is
    kept as in the original -- confirm whether that is intended.
    """
    with tf.name_scope('summary/gradient'):
        for grad, var in grad_vars:
            ndims = grad.get_shape().ndims
            for s_type in summary_types:
                summary_param(s_type, grad, ndims, var.op.name + '/grad',
                              collections=None)
        try:
            # Pass a real list: tf.global_norm rejects non-sequences, so a
            # Python 3 map() object raised TypeError here and the summary
            # was silently dropped by the handler below.
            all_grads = [grad_v[0] for grad_v in grad_vars]
            tf.summary.scalar('/global_norm', tf.global_norm(all_grads),
                              collections=collections)
        except Exception:
            # Best effort, as in the original: a failing global-norm
            # summary must not abort graph construction.
            return
def _add_gradients_summaries(grads_and_vars):
    """Create histogram summaries for gradients and their norms.

    Uses ``tf.histogram_summary``. Pairs whose gradient is ``None`` are
    reported through ``tf.logging.info`` and produce no summaries. For
    ``tf.IndexedSlices`` gradients the ``.values`` tensor is summarized.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs.

    Returns:
        The list of created summaries, two per variable with a gradient.
    """
    created = []
    for grad, var in grads_and_vars:
        if grad is None:
            tf.logging.info('Var %s has no gradient', var.op.name)
            continue

        if isinstance(grad, tf.IndexedSlices):
            dense_part = grad.values
        else:
            dense_part = grad

        hist = tf.histogram_summary(var.op.name + ':gradient', dense_part)
        created.append(hist)
        norm_hist = tf.histogram_summary(
            var.op.name + ':gradient_norm', tf.global_norm([dense_part]))
        created.append(norm_hist)
    return created
def add_train_op(self, loss):
    """Create ``self.global_step`` and return an Adam training op.

    When ``self.config.cap_grads`` is positive, the pre-clip global
    gradient norm is logged as a scalar summary and the gradients are
    clipped to that global norm.

    Args:
        loss: Loss tensor to minimize.

    Returns:
        The op returned by ``apply_gradients``; it also increments
        ``self.global_step``.
    """
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=self.lr)

    grads_and_vars = opt.compute_gradients(loss)
    gradients, variables = zip(*grads_and_vars)

    if self.config.cap_grads > 0:
        with tf.variable_scope('cap_grads'):
            # Log the norm before capping so the summary shows raw values.
            tf.summary.scalar('global_gradient_norm',
                              tf.global_norm(gradients))
            gradients, _ = tf.clip_by_global_norm(
                gradients, self.config.cap_grads)

    return opt.apply_gradients(zip(gradients, variables),
                               global_step=self.global_step)
def _build_gradient(self, target):
    """Build ``self.train_op``: local gradients applied to ``target`` vars.

    Gradients of ``self.total_loss`` w.r.t. this object's trainable weights
    are clipped by ``self.clip_grad_norm`` and applied, through a new Adam
    optimizer, to the trainable weights of ``target``. The gradient and
    variable global norms are appended to ``self.for_summary_scalar``.

    Args:
        target: Object exposing ``get_trainable_weights()``; its variables
            receive the updates.
    """
    local_grad = tf.gradients(self.total_loss, self.get_trainable_weights())

    grad_norm = tf.global_norm(local_grad, name='grad_norm')
    vars_norm = tf.global_norm(self.get_trainable_weights(),
                               name='vars_norm')
    self.for_summary_scalar += [grad_norm, vars_norm]

    local_grad, _ = tf.clip_by_global_norm(local_grad, self.clip_grad_norm)

    # Pair each local gradient with the matching remote variable.
    remote_vars = target.get_trainable_weights()
    assert len(local_grad) == len(remote_vars)
    grads_and_remote_vars = list(zip(local_grad, remote_vars))

    # A fresh optimizer is created on every call.
    optimizer = tf.train.AdamOptimizer(self.lr)
    apply_grad = optimizer.apply_gradients(grads_and_remote_vars)

    # The step counter advances by the first dimension of self.x.
    inc_step = self.global_step.assign_add(tf.shape(self.x)[0])
    self.train_op = tf.group(apply_grad, inc_step)
def gradient_summaries(gvs, norm=True, ratio=True, histogram=True):
    """Register gradient summaries.

    Logs the global norm of the gradients, per-variable ratios via
    ``log_ratio`` and per-variable gradient histograms. Variables whose
    gradient is ``None`` are reported on stdout and skipped.

    :param gvs: list of (gradient, variable) tuples
    :param norm: boolean, logs the global gradient norm if True
    :param ratio: boolean, logs ratios if True
    :param histogram: boolean, logs gradient histograms if True
    """
    with tf.name_scope('grad_summary'):
        if norm:
            grad_norm = tf.global_norm([gv[0] for gv in gvs])
            tf.summary.scalar('grad_norm', grad_norm)

        for g, v in gvs:
            # Drop the ':<output index>' suffix of the variable name.
            var_name = v.name.split(':')[0]
            if g is None:
                # Call form of print: valid on Python 2 and Python 3.
                print('Gradient for variable {} is None'.format(var_name))
                continue

            if ratio:
                log_ratio((g, v), '/'.join(('grad_ratio', var_name)))

            if histogram:
                tf.summary.histogram('/'.join(('grad_hist', var_name)), g)
def clip_gradient(pair_list, max_norm):
    """Clip gradients by their global norm.

    :param pair_list: (grad, var) pair list.
    :param max_norm: The maximum global norm passed to
        ``tf.clip_by_global_norm``.
    :return: the (clipped grad, var) pair list, the global norm reported by
        ``tf.clip_by_global_norm`` for the input gradients, and the global
        norm of the clipped gradients.
    """
    raw_grads = [grad for grad, _ in pair_list]
    clipped_grads, raw_norm = tf.clip_by_global_norm(raw_grads, max_norm)
    clipped_norm = tf.global_norm(clipped_grads)

    clipped_pairs = [
        (clipped, pair[1]) for clipped, pair in zip(clipped_grads, pair_list)
    ]
    return clipped_pairs, raw_norm, clipped_norm
def _backward(self, loss, summaries=False):
    """Compute (gradient, variable) pairs for emb, LSTM and softmax vars.

    The loss is multiplied by ``hps.num_steps``. Embedding gradients must
    be ``tf.IndexedSlices`` and have their values multiplied by
    ``hps.batch_size``; only the LSTM gradients are clipped by global norm
    (``hps.max_grad_norm``).

    Args:
        loss: Loss tensor.
        summaries: When true, scalar summaries for the LSTM gradient norm,
            clip scale and weight norm are registered via
            ``tf.scalar_summary``.

    Returns:
        List of (gradient, variable) pairs in emb, LSTM, softmax order.
    """
    hps = self.hps
    loss = loss * hps.num_steps

    emb_vars = find_trainable_variables("emb")
    lstm_vars = find_trainable_variables("LSTM")
    softmax_vars = find_trainable_variables("softmax")
    all_vars = emb_vars + lstm_vars + softmax_vars

    all_grads = tf.gradients(loss, all_vars)
    n_emb = len(emb_vars)
    n_lstm = len(lstm_vars)
    emb_grads = all_grads[:n_emb]
    lstm_grads = all_grads[n_emb:n_emb + n_lstm]
    softmax_grads = all_grads[n_emb + n_lstm:]

    for idx, emb_grad in enumerate(emb_grads):
        assert isinstance(emb_grad, tf.IndexedSlices)
        emb_grads[idx] = tf.IndexedSlices(
            emb_grad.values * hps.batch_size,
            emb_grad.indices,
            emb_grad.dense_shape)

    lstm_grads, lstm_norm = tf.clip_by_global_norm(
        lstm_grads, hps.max_grad_norm)
    clipped_grads = emb_grads + lstm_grads + softmax_grads
    assert len(clipped_grads) == len(all_grads)

    if summaries:
        tf.scalar_summary("model/lstm_grad_norm", lstm_norm)
        tf.scalar_summary("model/lstm_grad_scale",
                          tf.minimum(hps.max_grad_norm / lstm_norm, 1.0))
        tf.scalar_summary("model/lstm_weight_norm",
                          tf.global_norm(lstm_vars))

    return list(zip(clipped_grads, all_vars))
def build_graph(self, *layers):
    """Build assign, numeric-check and global-norm nodes for layer weights.

    Sets ``self.ph_weights``, ``self.assign``, ``self.check`` and
    ``self.global_norm``.

    Args:
        *layers: Objects exposing ``weight.node``.

    Returns:
        The list of ``layer.weight.node`` values, one per layer.
    """
    weights = [layer.weight.node for layer in layers]
    self.ph_weights = graph.Placeholders(variables=graph.TfNode(weights))

    # One assign op per (variable, placeholder) pair.
    assign_ops = [
        tf.assign(variable, value)
        for variable, value in utils.Utils.izip(weights,
                                                self.ph_weights.checked)
    ]
    self.assign = graph.TfNode(assign_ops)

    numeric_checks = [
        tf.check_numerics(weight, 'weight_%d' % idx)
        for idx, weight in enumerate(utils.Utils.flatten(weights))
    ]
    self.check = graph.TfNode(tf.group(*numeric_checks))

    self.global_norm = tf.global_norm(list(utils.Utils.flatten(weights)))
    return weights
def gradient_clip(gradients, max_gradient_norm):
    """Clip gradients by global norm and build two scalar summaries.

    Args:
        gradients: List of gradient tensors.
        max_gradient_norm: Clip threshold for ``tf.clip_by_global_norm``.

    Returns:
        A tuple ``(clipped_gradients, gradient_norm_summary, gradient_norm)``
        where the summary list holds "grad_norm" (pre-clip norm) and
        "clipped_gradient" (norm after clipping).
    """
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)

    pre_clip_summary = tf.summary.scalar("grad_norm", gradient_norm)
    post_clip_summary = tf.summary.scalar(
        "clipped_gradient", tf.global_norm(clipped_gradients))
    gradient_norm_summary = [pre_clip_summary, post_clip_summary]

    return clipped_gradients, gradient_norm_summary, gradient_norm
def summarize_gradients(model_name, gradients):
    """Add histograms for gradients and their norms.

    Pairs whose gradient is ``None`` are skipped. Summary names are
    prefixed with ``model_name + '/' + variable.name``.

    Args:
        model_name: Prefix for every summary name.
        gradients: Iterable of (gradient, variable) pairs.
    """
    for gradient, variable in gradients:
        if gradient is None:
            continue
        tf.summary.histogram(
            model_name + '/' + variable.name + "/gradients", gradient)
        tf.summary.histogram(
            model_name + '/' + variable.name + "/gradient_norm",
            tf.global_norm([gradient]))
def test_stable_global_norm_avoids_overflow(self):
    """Plain global norm is inf for 1e19-scaled input; the stable one is not."""
    huge = tf.ones([4, 4]) * 1e19
    tensors = [tf.ones([4]), huge, None]

    gnorm_is_inf = tf.is_inf(tf.global_norm(tensors))
    stable_norm = tfgan_losses._numerically_stable_global_norm(tensors)
    stable_gnorm_is_inf = tf.is_inf(stable_norm)

    with self.test_session(use_gpu=True):
        self.assertTrue(gnorm_is_inf.eval())
        self.assertFalse(stable_gnorm_is_inf.eval())
def test_stable_global_norm_unchanged(self):
    """Check the stable global norm matches ``tf.global_norm`` within 1e-5."""
    tf.set_random_seed(1234)
    # Shapes [], [3], [3, 3], ... up to rank 5.
    shapes = [[3] * rank for rank in range(6)]
    tensors = [tf.random_uniform(shape, -10.0, 10.0) for shape in shapes]

    gnorm = tf.global_norm(tensors)
    precond_gnorm = tfgan_losses._numerically_stable_global_norm(tensors)

    with self.test_session(use_gpu=True) as sess:
        # Compare on several random draws, not just one.
        for _ in range(10):
            gnorm_np, precond_gnorm_np = sess.run([gnorm, precond_gnorm])
            self.assertNear(gnorm_np, precond_gnorm_np, 1e-5)
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
    """Clip gradients by a global norm computed from a substituted list.

    For the norm computation, each gradient for which ``_is_all_zeros``
    holds is replaced (through ``tf.cond``) by a scalar zero of the same
    dtype; ``None`` gradients are passed through. The resulting norm is
    handed to ``tf.clip_by_global_norm`` via ``use_norm``.

    Args:
        gradients_variables: a list of pairs (gradient, variable).
        clip_norm: the global norm to clip on. Default is 20.0.

    Returns:
        list: pairs (clipped gradient, variable).
        fixed_global_norm: scalar tensor with the global norm used.
    """
    gradients, variables = six.moves.zip(*gradients_variables)

    def _scalar_if_all_zero(grad):
        # Missing gradients are handed to tf.global_norm unchanged.
        if grad is None:
            return grad

        def _zero_scalar():
            return tf.zeros([], dtype=tf.as_dtype(grad.dtype))

        def _unchanged():
            return grad

        return tf.cond(_is_all_zeros(grad), _zero_scalar, _unchanged)

    norm_inputs = [_scalar_if_all_zero(g) for g in gradients]
    fixed_global_norm = tf.global_norm(norm_inputs)
    clipped, _ = tf.clip_by_global_norm(
        gradients, clip_norm, use_norm=fixed_global_norm)
    return list(six.moves.zip(clipped, variables)), fixed_global_norm
def _adaptive_gradient_clipping(self, grads_and_vars, std_factor=2.,
                                decay=0.95, static_max_norm=None,
                                global_step=None, epsilon=1e-8, name=None):
    """Scale all gradients by one adaptive clipping factor.

    The factor is 1 while the global norm is below the maximum returned by
    ``self._adaptive_max_norm`` and ``exp(log_mean) / norm`` otherwise.
    When ``static_max_norm`` is given the factor is additionally capped at
    ``static_max_norm / norm``.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs.
        std_factor, decay, global_step, epsilon, name: Forwarded to
            ``self._adaptive_max_norm``.
        static_max_norm: Optional fixed upper bound on the norm.

    Returns:
        List of (scaled gradient, variable) pairs; ``None`` gradients stay
        ``None``.
    """
    grads, variables = zip(*grads_and_vars)
    norm = tf.global_norm(grads)

    max_norm, log_mean = self._adaptive_max_norm(
        norm, std_factor, decay, global_step, epsilon, name)

    factor = tf.where(norm < max_norm,
                      tf.ones_like(norm),
                      tf.exp(log_mean) / norm)
    if static_max_norm is not None:
        factor = tf.minimum(static_max_norm / norm, factor)

    def _scale(grad):
        if grad is None:
            return None
        if isinstance(grad, tf.IndexedSlices):
            # Only the values are scaled; indices and shape are reused.
            return tf.IndexedSlices(
                grad.values * factor, grad.indices, grad.dense_shape)
        return grad * factor

    clipped_grads = [_scale(grad) for grad in grads]
    return list(zip(clipped_grads, variables))
def _create_train(self):
    """Create placeholders, losses and the gradient-application op.

    Inside ``self.scope`` this defines the ``actions``, ``target_v`` and
    ``advantages`` placeholders, the policy/value losses, and
    ``self.apply_grads``, which applies the clipped local gradients
    (global norm 40.0) to the trainable variables of the 'global' scope.
    """
    with tf.variable_scope(self.scope):
        self.actions = tf.placeholder(
            shape=[None, self.action_size], dtype=tf.float32,
            name='actions')
        self.target_v = tf.placeholder(
            shape=[None], dtype=tf.float32, name='target_v')
        self.advantages = tf.placeholder(
            shape=[None], dtype=tf.float32, name='advantages')

        # Policy loss: log-probability weighted by the advantages, plus an
        # entropy term weighted by 0.01.
        log_prob = self.normal_dist.log_prob(self.actions)
        weighted = tf.multiply(tf.transpose(log_prob), self.advantages)
        exp_v = tf.transpose(weighted)
        entropy = self.normal_dist.entropy()
        exp_v = 0.01 * entropy + exp_v
        self.policy_loss = tf.reduce_sum(-exp_v)

        value_error = self.target_v - tf.reshape(self.value, [-1])
        self.value_loss = 0.5 * tf.reduce_sum(tf.square(value_error))
        self.loss = 0.5*self.value_loss + self.policy_loss

        local_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
        self.gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(
            self.gradients, 40.0)

        # Local gradients update the variables of the 'global' scope.
        global_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads = self.trainer.apply_gradients(
            zip(grads, global_vars))
def _add_gradients_summaries(grads_and_vars):
    """Build ``tf.histogram_summary`` ops for gradients and their norms.

    A pair with a ``None`` gradient only triggers a ``tf.logging.info``
    message. ``tf.IndexedSlices`` gradients are summarized through their
    ``.values`` tensor.

    Args:
        grads_and_vars: Iterable of (gradient, variable) pairs.

    Returns:
        List of the created summaries (gradient histogram followed by
        gradient-norm histogram for each variable that has a gradient).
    """
    summaries = []
    for grad, var in grads_and_vars:
        if grad is None:
            tf.logging.info('Var %s has no gradient', var.op.name)
            continue

        sparse = isinstance(grad, tf.IndexedSlices)
        grad_values = grad.values if sparse else grad

        summaries.append(
            tf.histogram_summary(var.op.name + ':gradient', grad_values))
        summaries.append(
            tf.histogram_summary(var.op.name + ':gradient_norm',
                                 tf.global_norm([grad_values])))
    return summaries
def _backward(self, loss, summaries=False):
    """Compute (gradient, variable) pairs for emb, LSTM and softmax vars.

    The loss is multiplied by ``hps.num_steps``. Embedding gradients must
    be ``tf.IndexedSlices`` and have their values multiplied by
    ``hps.batch_size``; only the LSTM gradients are clipped by global norm
    (``hps.max_grad_norm``).

    Args:
        loss: Loss tensor.
        summaries: When true, scalar summaries for the LSTM gradient norm,
            clip scale and weight norm are registered via
            ``tf.summary.scalar``.

    Returns:
        List of (gradient, variable) pairs in emb, LSTM, softmax order.
    """
    hps = self.hps
    loss = loss * hps.num_steps

    emb_vars = find_trainable_variables("emb")
    lstm_vars = find_trainable_variables("LSTM")
    softmax_vars = find_trainable_variables("softmax")
    all_vars = emb_vars + lstm_vars + softmax_vars

    all_grads = tf.gradients(loss, all_vars)
    emb_end = len(emb_vars)
    lstm_end = emb_end + len(lstm_vars)
    emb_grads = all_grads[:emb_end]
    lstm_grads = all_grads[emb_end:lstm_end]
    softmax_grads = all_grads[lstm_end:]

    for pos, sparse_grad in enumerate(emb_grads):
        assert isinstance(sparse_grad, tf.IndexedSlices)
        emb_grads[pos] = tf.IndexedSlices(
            sparse_grad.values * hps.batch_size,
            sparse_grad.indices,
            sparse_grad.dense_shape)

    lstm_grads, lstm_norm = tf.clip_by_global_norm(
        lstm_grads, hps.max_grad_norm)
    clipped_grads = emb_grads + lstm_grads + softmax_grads
    assert len(clipped_grads) == len(all_grads)

    if summaries:
        grad_scale = tf.minimum(hps.max_grad_norm / lstm_norm, 1.0)
        tf.summary.scalar("model/lstm_grad_norm", lstm_norm)
        tf.summary.scalar("model/lstm_grad_scale", grad_scale)
        tf.summary.scalar("model/lstm_weight_norm",
                          tf.global_norm(lstm_vars))

    return list(zip(clipped_grads, all_vars))
def gradient_clip(gradients, params, max_gradient_norm):
    """Clip gradients by global norm and build two scalar summaries.

    Args:
        gradients: List of gradient tensors.
        params: Not used by this function.
        max_gradient_norm: Clip threshold for ``tf.clip_by_global_norm``.

    Returns:
        A tuple ``(clipped_gradients, gradient_norm_summary)`` where the
        summary list holds "grad_norm" (pre-clip norm) and
        "clipped_gradient" (norm after clipping).
    """
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)

    gradient_norm_summary = [
        tf.summary.scalar("grad_norm", gradient_norm),
        tf.summary.scalar("clipped_gradient",
                          tf.global_norm(clipped_gradients)),
    ]
    return clipped_gradients, gradient_norm_summary
def get_gradients(self, loss_or_grads, params):
    """Compute, filter and clip gradients; record their global norm.

    Pairs whose gradient is ``None`` are dropped, so the returned list
    contains no ``None`` gradients. The global norm of the final gradients
    is stored in ``self._norm``.

    Args:
        loss_or_grads: Forwarded to ``self.algorithm.compute_gradients``.
        params: Variables passed as ``var_list``.

    Returns:
        List of (gradient, variable) pairs, in the order produced by
        ``compute_gradients``.

    Raises:
        RuntimeError: If ``self.algorithm`` is ``None`` or lacks
            ``compute_gradients`` / ``apply_gradients``.
    """
    # check valid algorithm
    if self.algorithm is None or \
            not hasattr(self.algorithm, 'compute_gradients') or \
            not hasattr(self.algorithm, 'apply_gradients'):
        raise RuntimeError("Optimizer is None, or doesn't has attributes: "
                           "compute_gradients and apply_gradients.")
    with tf.variable_scope(self.name):
        # Keep the pairs in a list rather than a dict keyed by gradient:
        # a dict would merge variables that share one gradient tensor and
        # requires the tensors to be hashable.
        pairs = [
            (g, v)
            for g, v in self.algorithm.compute_gradients(
                loss_or_grads, var_list=params)
            if g is not None
        ]
        grads = [g for g, _ in pairs]
        params = [v for _, v in pairs]

        # ====== clipnorm ====== #
        if self.clipnorm is not None:
            if self.clip_alg == 'norm':
                grads = [tf.clip_by_norm(g, self.clipnorm) for g in grads]
            elif self.clip_alg == 'total_norm':
                grads, _ = tf.clip_by_global_norm(grads, self.clipnorm)
            elif self.clip_alg == 'avg_norm':
                grads = [tf.clip_by_average_norm(g, self.clipnorm)
                         for g in grads]

        # ====== clipvalue ====== #
        if self.clipvalue is not None:
            grads = [tf.clip_by_value(g, -self.clipvalue, self.clipvalue)
                     for g in grads]

        # ====== get final norm value ====== #
        self._norm = add_role(tf.global_norm(grads, name="GradientNorm"),
                              GradientsNorm)
        return list(zip(grads, params))
def initialize(self):
    """Register optional summaries, create the session, init variables.

    When ``self.summarize`` is true, per-batch scalar summaries and the
    gradient/variable global norms are registered and merged into
    ``self.summary_op``. A session on ``self.g`` is then created and all
    global variables are initialized.
    """
    if self.summarize:
        # First dimension of self.x, used to normalize the losses.
        bs = tf.to_float(tf.shape(self.x)[0])
        scalars = (
            ("model/policy_loss", self.pi_loss / bs),
            ("model/value_loss", self.vf_loss / bs),
            ("model/entropy", self.entropy / bs),
            ("model/grad_gnorm", tf.global_norm(self.grads)),
            ("model/var_gnorm", tf.global_norm(self.var_list)),
        )
        for tag, value in scalars:
            tf.summary.scalar(tag, value)
        self.summary_op = tf.summary.merge_all()

    session_config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)
    self.sess = tf.Session(graph=self.g, config=session_config)
    self.variables = ray.experimental.TensorFlowVariables(
        self.loss, self.sess)
    self.sess.run(tf.global_variables_initializer())
def build_summary(self):
    """Register training summaries and merge them into ``self.summary_op``.

    Losses and entropy are divided by the first dimension of
    ``self.local_network.x`` before being logged.
    """
    net = self.local_network
    bs = tf.to_float(tf.shape(net.x)[0])

    policy_loss = self.pi_loss / bs
    value_loss = self.vf_loss / bs
    entropy = self.entropy / bs

    tf.summary.scalar("model/policy_loss", policy_loss)
    tf.summary.scalar("model/value_loss", value_loss)
    tf.summary.scalar("model/entropy", entropy)
    tf.summary.image("model/state", net.x)
    tf.summary.scalar("model/grad_global_norm", tf.global_norm(self.grads))
    tf.summary.scalar("model/var_global_norm",
                      tf.global_norm(net.var_list))
    tf.summary.scalar("model/lr", self.lr)

    self.summary_op = tf.summary.merge_all()
def _make_training_op(self):
    """Create the optimizer, clip gradients and return the training op.

    ``self.config.optimizer`` selects the optimizer:
      * 'sgd': gradient descent with a constant learning rate until
        ``start_decay_step``, then staircase exponential decay.
      * 'adam': Adam with a constant learning rate (asserted < 0.007).

    Gradients of ``self.loss`` w.r.t. all trainable variables are clipped
    by global norm (``self.config.max_gradient_norm``); the pre-clip norm,
    post-clip norm and learning rate are logged as scalar summaries.

    Returns:
        The op returned by ``apply_gradients``; it also increments
        ``self.global_step``.

    Raises:
        ValueError: If ``self.config.optimizer`` is neither 'sgd' nor
            'adam'.
    """
    if self.config.optimizer == 'sgd':
        self.learning_rate = tf.cond(
            self.global_step < self.config.start_decay_step,
            lambda: tf.constant(self.config.learning_rate),
            lambda: tf.train.exponential_decay(
                self.config.learning_rate,
                (self.global_step - self.config.start_decay_step),
                self.config.decay_steps,
                self.config.decay_factor,
                staircase=True),
            name='learning_rate')
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif self.config.optimizer == 'adam':
        assert self.config.learning_rate < 0.007
        self.learning_rate = tf.constant(self.config.learning_rate)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
    else:
        # Fail up front: otherwise `optimizer` and `self.learning_rate`
        # stay unset and the code below fails with a confusing error.
        raise ValueError(
            "Unsupported optimizer: %r (expected 'sgd' or 'adam')"
            % (self.config.optimizer,))

    params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, params)
    clipped_gradients, gradient_norm = tf.clip_by_global_norm(
        gradients, self.config.max_gradient_norm)

    tf.summary.scalar("grad_norm", gradient_norm)
    tf.summary.scalar("clipped_norm", tf.global_norm(clipped_gradients))
    tf.summary.scalar("learning_rate", self.learning_rate)

    train_op = optimizer.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)
    return train_op
def add_training_op(self, loss):
    """Set up the training op with optional global-norm gradient clipping.

    Creates a gradient-descent optimizer with ``self.config.lr``, computes
    the gradients of ``loss``, clips them to ``self.config.max_grad_norm``
    when ``self.config.clip_gradients`` is true, stores the global norm of
    the gradients that are applied in ``self.grad_norm`` and builds the
    op that applies them.

    See: https://www.tensorflow.org/api_docs/python/train/gradient_clipping

    Args:
        loss: Loss tensor.

    Returns:
        train_op: The op to pass to ``sess.run()`` to train the model.
    """
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=self.config.lr)

    grads_and_vars = optimizer.compute_gradients(loss)
    gradients = [grad for grad, _ in grads_and_vars]
    variables = [var for _, var in grads_and_vars]

    if self.config.clip_gradients:
        gradients, _ = tf.clip_by_global_norm(
            gradients, self.config.max_grad_norm)

    # Norm of what is actually applied (post-clip when clipping is on).
    self.grad_norm = tf.global_norm(gradients)
    train_op = optimizer.apply_gradients(list(zip(gradients, variables)))

    assert self.grad_norm is not None, "grad_norm was not set properly!"
    return train_op
def build_network(self):
    """Build a two-conv network with softmax and scalar output heads.

    The input placeholder has shape [None, 84, 84, 4]. A scalar summary
    with the global norm of each layer's variables is registered. Stores
    the input in ``self._tf_state`` and the two heads in
    ``self._tf_adv_probas`` and ``self._tf_value_state``.
    """
    prefix = self.name
    state = tf.placeholder(tf.float32, [None, 84, 84, 4])

    cnn_1 = slim.conv2d(state, 16, [8,8], stride=4,
                        scope=prefix + '/cnn_1', activation_fn=nn.relu)
    cnn_2 = slim.conv2d(cnn_1, 32, [4,4], stride=2,
                        scope=prefix + '/cnn_2', activation_fn=nn.relu)
    flatten = slim.flatten(cnn_2)
    fcc_1 = slim.fully_connected(flatten, 256,
                                 scope=prefix + '/fcc_1',
                                 activation_fn=nn.relu)
    adv_probas = slim.fully_connected(fcc_1, self.nb_actions,
                                      scope=prefix + '/adv_probas',
                                      activation_fn=nn.softmax)
    value_state = slim.fully_connected(fcc_1, 1,
                                       scope=prefix + '/value_state',
                                       activation_fn=None)

    # (summary tag stem, variable scope suffix), in registration order.
    layer_summaries = (
        ("cnn1", '/cnn_1'),
        ("cnn2", '/cnn_2'),
        ("fcc1", '/fcc_1'),
        ("adv_probas", '/adv_probas'),
        ("value_state", '/value_state'),
    )
    for stem, scope_suffix in layer_summaries:
        layer_vars = slim.get_variables(scope=self.name + scope_suffix)
        tf.summary.scalar("model/" + stem + "_global_norm",
                          tf.global_norm(layer_vars))

    # Input
    self._tf_state = state
    # Output
    self._tf_adv_probas = adv_probas
    self._tf_value_state = value_state
def __add_summaries(self, grads_and_vars, grad_noise_scale, dev_set_scope,
                    summaries=[]):
    """Add gradient-related summaries to the given collections.

    Does nothing when ``summaries`` equals the empty list. Otherwise adds,
    in order: the gradient-noise scale (if not ``None``), a norm scalar and
    a histogram per gradient, and one global-norm scalar over all
    gradients.

    Args:
        grads_and_vars: Sequence of (gradient, variable) pairs.
        grad_noise_scale: Noise scale to log, or ``None``.
        dev_set_scope: Name scope / prefix used for the summary names.
        summaries: Collections passed to every summary call.
    """
    if summaries == []:
        return

    # Noise on the gradient.
    if grad_noise_scale is not None:
        with tf.name_scope(dev_set_scope):
            tf.summary.scalar("NoiseGrad", grad_noise_scale, summaries)

    # Per-variable gradient norm and histogram.
    for grad, var in grads_and_vars:
        is_sparse = isinstance(grad, tf.IndexedSlices)
        grad_vals = grad.values if is_sparse else grad
        if grad_vals is None:
            continue

        # Strip the leading "<model_name>/" from the variable name.
        var_name = var.op.name.replace(self.cfg.model_name + '/', '')
        scope_str = dev_set_scope + '_%s'  # slot for the metric
        scope_str, var_name = squash_maybe(scope_str, var_name)
        scope_str += '_%s'  # slot for the variable name

        # name_scope(None) resets the current name scope.
        with tf.name_scope(None):
            tf.summary.scalar(
                scope_str % ('GradientNorm', var_name),
                tf.global_norm([grad_vals]),
                summaries)
            tf.summary.histogram(
                scope_str % ('GradientHist', var_name),
                grad_vals,
                summaries)

    # Global norm over all gradients.
    with tf.name_scope(dev_set_scope):
        if self.cfg.max_grad_norm:
            name = 'clipped_grad_norm'
        else:
            name = 'grad_norm'
        all_grads = list(zip(*grads_and_vars))[0]
        tf.summary.scalar('Global_norm/' + name,
                          tf.global_norm(all_grads),
                          summaries)
def createGraph(self):
    """Create the training graph over all length bins.

    For each (sequence length, item count) bin, placeholders are created
    and ``self.createLoss`` is called; variables are reused after the
    first bin. The total cost is the weighted mean bin cost plus a
    saturation term. Gradients get additive truncated-normal noise scaled
    by ``learning_rate * 1e-4`` and are applied with ``AdamaxOptimizer``.
    ``self.gnorm`` is the global norm of the optimizer's "m" slots.
    """
    self.base_cost = 0.0
    self.accuracy = 0
    num_sizes = len(self.bins)
    self.cost_list = []
    sum_weight = 0
    self.bin_losses = []
    saturation_loss = []

    # Create all bins and calculate losses for them.
    with vs.variable_scope("var_lengths"):
        for seq_len, item_count, _ in zip(self.bins, self.count_list,
                                          range(num_sizes)):
            x_in = tf.placeholder("int32", [item_count, seq_len])
            y_in = tf.placeholder("int64", [item_count, seq_len])
            self.x_input.append(x_in)
            self.y_input.append(y_in)

            # Reset before createLoss; the costs are summed right after.
            self.saturation_costs = []
            bin_cost, bin_acc, _, _, per_item_cost, _ = self.createLoss(
                x_in, y_in, seq_len)

            weight = 1.0  # every bin is weighted equally
            sat_cost = (tf.add_n(self.saturation_costs)
                        / ((seq_len ** 2) * item_count))
            saturation_loss.append(sat_cost*weight)
            self.bin_losses.append(per_item_cost)
            self.base_cost += bin_cost * weight
            sum_weight += weight
            self.accuracy += bin_acc
            self.cost_list.append(bin_cost)

            # Later bins share the variables created by the first one.
            tf.get_variable_scope().reuse_variables()

    # Total loss.
    self.base_cost /= sum_weight
    self.accuracy /= num_sizes
    self.sat_loss = (tf.reduce_sum(tf.stack(saturation_loss))
                     * self.saturation_weight / sum_weight)
    cost = self.base_cost + self.sat_loss

    # Gradient noise proportional to the learning rate.
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(cost, tvars)
    grads = [
        grad + tf.truncated_normal(tf.shape(grad)) * self.learning_rate*1e-4
        for grad in raw_grads
    ]

    optimizer = AdamaxOptimizer(self.learning_rate, beta1=0.9,
                                beta2=1.0-self.beta2_rate, epsilon=1e-8)
    self.optimizer = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step)

    # Values for printout.
    max_vals = [optimizer.get_slot(var, "m") for var in tvars]
    self.gnorm = tf.global_norm(max_vals)
    self.cost_list = tf.stack(self.cost_list)
def add_optimizer_op(self, scope):
    """Set ``self.train_op`` and ``self.grad_norm``.

    Steps:
      1. Create an Adam optimizer with ``self.lr``.
      2. Compute gradients of ``self.loss`` w.r.t. the trainable
         variables in ``scope``.
      3. If ``self.config.grad_clip`` is true, clip the gradients by
         global norm with ``self.config.clip_val``.
      4. Store the global norm of the resulting gradients in
         ``self.grad_norm``.
      5. Apply the gradients and store the op in ``self.train_op``.

    Args:
        scope: Variable scope whose trainable variables are optimized.
    """
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    scope_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=scope)

    grads_and_vars = optimizer.compute_gradients(self.loss, scope_vars)
    gradients, grad_vars = list(zip(*grads_and_vars))

    if self.config.grad_clip:
        gradients, _ = tf.clip_by_global_norm(gradients,
                                              self.config.clip_val)

    # The (possibly clipped) gradients are the ones applied and measured.
    self.grad_norm = tf.global_norm(gradients)
    self.train_op = optimizer.apply_gradients(
        list(zip(gradients, grad_vars)))
def _createModel(self):
    """Build the policy/value network and, for workers, the training ops.

    Inside ``self.scope`` a 64-unit hidden layer feeds a softmax policy
    head and a scalar value head. For any scope other than 'global', the
    loss and gradient ops are added as well; the clipped local gradients
    (global norm 40.0) are applied to the 'global' trainable variables.
    """
    with tf.variable_scope(self.scope):
        self.inputs = tf.placeholder('float', shape=[None,self.stateSize])
        hidden = slim.fully_connected(
            self.inputs, 64, scope='fc/fc_1', activation_fn=tf.nn.relu)

        self.policy = slim.fully_connected(
            hidden, self.actionSize,
            activation_fn=tf.nn.softmax,
            weights_initializer=Brian.normalized_columns_initializer(0.01),
            biases_initializer=None)
        self.value = slim.fully_connected(
            hidden, 1,
            activation_fn=None,
            weights_initializer=Brian.normalized_columns_initializer(1.0),
            biases_initializer=None)

        self.update_local_ops = Brian.update_target_graph(
            'global', self.scope)

        if self.scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(
                self.actions, self.actionSize, dtype=tf.float32)
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None],
                                             dtype=tf.float32)

            # Policy output selected by the one-hot action mask.
            self.responsible_outputs = tf.reduce_sum(
                self.policy * self.actions_onehot, [1])

            # Loss functions
            value_error = self.target_v - tf.reshape(self.value, [-1])
            self.value_loss = 0.5 * tf.reduce_sum(tf.square(value_error))
            self.entropy = - tf.reduce_sum(
                self.policy * tf.log(self.policy))
            self.policy_loss = -tf.reduce_sum(
                tf.log(self.responsible_outputs)*self.advantages)
            self.loss = (0.5 * self.value_loss + self.policy_loss
                         - self.entropy * 0.01)

            # Gradients of the local loss w.r.t. the local variables.
            local_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, 40.0)

            # Local gradients update the 'global' network's variables.
            global_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = self.trainer.apply_gradients(
                zip(grads, global_vars))