We extracted the following 35 code examples from open-source Python projects to illustrate how to use tensorflow.reduce_logsumexp().
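Before the extracted examples, a minimal sketch of what the op does may help (this snippet is not from any of the projects below; it assumes TensorFlow 1.x and the input values are purely illustrative): tf.reduce_logsumexp(x, axis) computes log(sum(exp(x), axis)), but subtracts the per-axis maximum internally, so very large or very negative inputs do not overflow or underflow the way a naive implementation does.

# Minimal sketch of tf.reduce_logsumexp (assumes TensorFlow 1.x; values are illustrative).
import tensorflow as tf

x = tf.constant([[1000.0, 1000.0],
                 [-1000.0, -1000.0]])

stable = tf.reduce_logsumexp(x, axis=1)           # [1000 + log(2), -1000 + log(2)]
naive = tf.log(tf.reduce_sum(tf.exp(x), axis=1))  # overflows / underflows for the same input

with tf.Session() as sess:
    print(sess.run(stable))  # ~[1000.693, -999.307]
    print(sess.run(naive))   # [inf, -inf]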
def softmax_loss(self, antecedent_scores, antecedent_labels):
    """ Computes the value of the loss function using antecedent_scores and antecedent_labels.
    Practically the standard softmax loss.

    Args:
        antecedent_scores: tf.float64, [num_mentions, max_ant + 1], output of the fully-connected
            network that computes antecedent scores.
        antecedent_labels: True labels for the antecedents.

    Returns:
        [num_mentions] The value of the loss function.
    """
    gold_scores = antecedent_scores + tf.log(tf.cast(antecedent_labels, tf.float64))  # [num_mentions, max_ant + 1]
    marginalized_gold_scores = tf.reduce_logsumexp(gold_scores, [1])  # [num_mentions]
    log_norm = tf.reduce_logsumexp(antecedent_scores, [1])  # [num_mentions]
    return log_norm - marginalized_gold_scores  # [num_mentions]
def log_sum_exp(x, axis=None, keep_dims=False):
    """
    Deprecated: Use tf.reduce_logsumexp().

    Tensorflow numerically stable log sum of exps across the `axis`.

    :param x: A Tensor or numpy array.
    :param axis: An int or list or tuple. The dimensions to reduce.
        If `None` (the default), reduces all dimensions.
    :param keep_dims: Bool. If true, retains reduced dimensions with length 1.
        Defaults to False.

    :return: A Tensor after the computation of log sum exp along given axes of x.
    """
    x = tf.cast(x, dtype=tf.float32)
    x_max = tf.reduce_max(x, axis=axis, keep_dims=True)
    ret = tf.log(tf.reduce_sum(tf.exp(x - x_max), axis=axis, keep_dims=True)) + x_max
    if not keep_dims:
        ret = tf.reduce_sum(ret, axis=axis)
    return ret
def _log_prob(self, given):
    logits, temperature = self.path_param(self.logits), \
        self.path_param(self.temperature)
    log_given = tf.log(given)
    log_temperature = tf.log(temperature)
    n = tf.cast(self.n_categories, self.dtype)

    if self._check_numerics:
        log_given = tf.check_numerics(log_given, "log(given)")
        log_temperature = tf.check_numerics(
            log_temperature, "log(temperature)")

    temp = logits - temperature * log_given

    return tf.lgamma(n) + (n - 1) * log_temperature + \
        tf.reduce_sum(temp - log_given, axis=-1) - \
        n * tf.reduce_logsumexp(temp, axis=-1)
def traditional_transition_loss_pred(self, i, j, combined_head, combined_dep):
    rel_trans_feat_ids = self.trans_feat_ids[i*self.args.beam_size+j] if not self.train else self.trans_feat_ids[i, j]
    rel_head = tf.reshape(tf.gather(combined_head, rel_trans_feat_ids[:4]), [4, self.args.rel_emb_dim])
    rel_dep = tf.reshape(tf.gather(combined_dep, rel_trans_feat_ids[:4]), [4, self.args.rel_emb_dim])
    mask = tf.cast(tf.reshape(tf.greater_equal(rel_trans_feat_ids[:4], 0), [4, 1]), tf.float32)
    rel_head = tf.multiply(mask, rel_head)
    rel_dep = tf.multiply(mask, rel_dep)

    rel_hid = self.rel_merge(rel_head, rel_dep)
    rel_logit = self.rel_dense(tf.reshape(rel_hid, [1, -1]))
    rel_logit = tf.reshape(rel_logit, [-1])

    log_partition = tf.reduce_logsumexp(rel_logit)
    if self.train:
        res = log_partition - rel_logit[self.trans_labels[i, j]]
        return res
    else:
        arc_pred = log_partition - rel_logit
        return arc_pred
def pos_loss_pred(self, i, pos_embeddings, pos_logit, NUM_POS, gold_pos, pos_trainables):
    if self.args.no_pos:
        pos_emb = tf.nn.embedding_lookup(pos_embeddings, gold_pos[i])
        if self.train:
            return 0, pos_emb
        else:
            return tf.gather(gold_pos[i], tf.range(1, self.sent_length)), pos_emb
    else:
        pos_logit = pos_logit[1:]
        log_partition = tf.reduce_logsumexp(pos_logit, [1])

        pos_pred = tf.exp(pos_logit - tf.reshape(log_partition, (-1, 1)))
        pos_emb = tf.concat([tf.reshape(tf.nn.embedding_lookup(pos_embeddings, NUM_POS), (1, -1)),
                             tf.matmul(pos_pred, pos_trainables)], 0)

        if self.train:
            loss = tf.reduce_sum(tf.gather(log_partition, tf.range(self.sent_lengths[i]-1))
                                 - tf.gather(tf.reshape(pos_logit, [-1]),
                                             tf.range(self.sent_lengths[i]-1) * NUM_POS
                                             + tf.gather(gold_pos[i], tf.range(1, self.sent_lengths[i]))))
            return loss, pos_emb
        else:
            return tf.cast(tf.argmax(pos_pred, 1), tf.int32), pos_emb
def optimized_loss(self, targets, logits):
    """
    Computes the loss of a mixture density network in a way that handles
    underflow and overflow and avoids unstable behavior.
    """
    # Obtain parameters
    mixings, sigma, mean = self.logits_to_params(logits)
    output_size = tf.cast(tf.shape(targets)[1], tf.float32)
    variance = tf.square(sigma)

    # Convert expressions into exponent-based terms
    mixings_exp = tf.log(mixings)
    # By properties of logarithm we can simplify the original expression
    # log(x/y) = log(x) - log(y), log(xy) = log(x) + log(y), log(1) = 0
    sqrt_exp = - output_size * (0.5 * tf.log(2*np.pi) + tf.log(sigma))
    gaussian_exp = -tf.divide(tf.square(targets - mean), 2 * variance)
    exponent = mixings_exp + sqrt_exp + gaussian_exp

    # Use optimized logsumexp function to control underflow/overflow
    return tf.reduce_logsumexp(exponent, axis=1)
def weighted_sum(components, weights, scope=""):
    # n: num_components
    # b: batch_size
    # c: component_size
    with tf.name_scope(scope):
        weight_is_batched = (weights.get_shape().ndims == 2)
        if weight_is_batched:
            set_batch_size = tf.shape(weights)[0]
        else:
            set_batch_size = None
        components, is_batched = make_batch_consistent(components, set_batch_size=set_batch_size)
        components = tf.pack(components)  # [n x b x c]

        weight_rank = weights.get_shape().ndims
        assert_rank_1_or_2(weight_rank)
        if weight_rank == 1:
            weights = tf.reshape(weights, [-1, 1, 1])  # [n x 1 x 1]
        elif weight_rank == 2:
            weights = tf.expand_dims(tf.transpose(weights, [1, 0]), 2)  # [n x b x 1]

        components += weights
        # TODO: change this to tf.reduce_logsumexp when it is released
        w_sum = logsumexp(components, reduction_indices=0)  # [b x c]
        if not is_batched:
            w_sum = tf.squeeze(w_sum)  # [c]

    return w_sum
def tf_parameterize(self, x):
    # Flat logits
    logits = self.logits.apply(x=x)

    # Reshape logits to action shape
    shape = (-1,) + self.shape + (self.num_actions,)
    logits = tf.reshape(tensor=logits, shape=shape)

    # !!!
    state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=logits, dim=-1)

    # Min epsilon probability for numerical stability
    probabilities = tf.maximum(x=probabilities, y=util.epsilon)

    # "Normalized" logits
    logits = tf.log(x=probabilities)

    return logits, probabilities, state_value
def get_probs_and_accuracy(preds, O):
    """
    helper function. we have a prediction for each MC sample of each observation
    in this batch. need to distill the multiple preds from each MC into a single
    pred for this observation. also get accuracy. use true probs to get ROC, PR
    curves in sklearn
    """
    all_probs = tf.exp(preds[:, 1] - tf.reduce_logsumexp(preds, axis=1))  # normalize; and drop a dim so only prob of positive case
    N = tf.cast(tf.shape(preds)[0]/n_mc_smps, tf.int32)  # actual number of observations in preds, collapsing MC samples

    # predicted probability per observation; collapse the MC samples
    probs = tf.zeros([0])  # store all samples in a list, then concat into tensor at end

    # setup tf while loop (have to use this bc loop size is variable)
    def cond(i, probs):
        return i < N

    def body(i, probs):
        probs = tf.concat([probs, [tf.reduce_mean(tf.slice(all_probs, [i*n_mc_smps], [n_mc_smps]))]], 0)
        return i+1, probs

    i = tf.constant(0)
    i, probs = tf.while_loop(cond, body, loop_vars=[i, probs],
                             shape_invariants=[i.get_shape(), tf.TensorShape([None])])

    # compare to truth; just use cutoff of 0.5 for right now to get accuracy
    correct_pred = tf.equal(tf.cast(tf.greater(probs, 0.5), tf.int32), O)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return probs, accuracy
def logsumexp(x, axis=None, keepdims=False):
    """Computes log(sum(exp(elements across dimensions of a tensor))).

    This function is more numerically stable than log(sum(exp(x))).
    It avoids overflows caused by taking the exp of large inputs and
    underflows caused by taking the log of small inputs.

    # Arguments
        x: A tensor or variable.
        axis: An integer, the axis to reduce over.
        keepdims: A boolean, whether to keep the dimensions or not.
            If `keepdims` is `False`, the rank of the tensor is reduced
            by 1. If `keepdims` is `True`, the reduced dimension is
            retained with length 1.

    # Returns
        The reduced tensor.
    """
    axis = _normalize_axis(axis, ndim(x))
    return tf.reduce_logsumexp(x, reduction_indices=axis, keep_dims=keepdims)
def predict_density(self, Xnew, Ynew, num_samples):
    Fmean, Fvar = self.build_predict(Xnew, full_cov=False, S=num_samples)
    S, N, D = shape_as_list(Fmean)
    Ynew = tile_over_samples(Ynew, num_samples)
    flat_arrays = [tf.reshape(a, [S*N, -1]) for a in [Fmean, Fvar, Ynew]]
    l_flat = self.likelihood.predict_density(*flat_arrays)
    l = tf.reshape(l_flat, [S, N, -1])
    log_num_samples = tf.log(tf.cast(num_samples, float_type))
    return tf.reduce_logsumexp(l - log_num_samples, axis=0)
def logsumexp(x, axis=None):
    '''Returns `log(sum(exp(x), axis=axis))` with improved numerical stability.
    '''
    # `axis=None` means reduce over all dimensions; wrapping None in a list would fail.
    return tf.reduce_logsumexp(x, axis=axis if axis is None else [axis])
def _log_prob(self, given):
    given = tf.cast(given, self.param_dtype)
    given, logits = maybe_explicit_broadcast(
        given, self.logits, 'given', 'logits')
    normalized_logits = logits - tf.reduce_logsumexp(
        logits, axis=-1, keep_dims=True)
    n = tf.cast(self.n_experiments, self.param_dtype)
    log_p = log_combination(n, given) + \
        tf.reduce_sum(given * normalized_logits, -1)
    return log_p
def _log_prob(self, given):
    logits, temperature = self.path_param(self.logits), \
        self.path_param(self.temperature)
    n = tf.cast(self.n_categories, self.dtype)
    log_temperature = tf.log(temperature)

    if self._check_numerics:
        log_temperature = tf.check_numerics(
            log_temperature, "log(temperature)")

    temp = logits - temperature * given

    return tf.lgamma(n) + (n - 1) * log_temperature + \
        tf.reduce_sum(temp, axis=-1) - \
        n * tf.reduce_logsumexp(temp, axis=-1)
def init_policy(self):
    output_vec = L.get_output(self._output_vec_layer, deterministic=True) / self._c
    prob = tf.nn.softmax(output_vec)
    max_qval = tf.reduce_logsumexp(output_vec, [1])

    self._f_prob = tensor_utils.compile_function([self._obs_layer.input_var], prob)
    self._f_max_qvals = tensor_utils.compile_function([self._obs_layer.input_var], max_qval)

    self._dist = Categorical(self._n)
def log_prob_from_logits(logits):
    return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
def build_vimco_loss(cfg, l, log_q_h):
    """Builds negative VIMCO loss as in the paper.

    Reference: Variational Inference for Monte Carlo Objectives, Algorithm 1
    https://arxiv.org/abs/1602.06725
    """
    k, b = l.get_shape().as_list()  # n_samples, batch_size
    kf = tf.cast(k, tf.float32)

    if cfg['optim/geometric_mean']:
        # implicit multi-sample objective (importance-sampled ELBO)
        l_logsumexp = tf.reduce_logsumexp(l, [0], keep_dims=True)
        L_hat = l_logsumexp - tf.log(kf)
    else:
        # standard ELBO
        L_hat = tf.reduce_mean(l, [0], keep_dims=True)

    s = tf.reduce_sum(l, 0, keep_dims=True)
    diag_mask = tf.expand_dims(tf.diag(tf.ones([k], dtype=tf.float32)), -1)
    off_diag_mask = 1. - diag_mask

    diff = tf.expand_dims(s - l, 0)  # expand for proper broadcasting
    l_i_diag = 1. / (kf - 1.) * diff * diag_mask
    l_i_off_diag = off_diag_mask * tf.stack([l] * k)
    l_i = l_i_diag + l_i_off_diag

    if cfg['optim/geometric_mean']:
        L_hat_minus_i = tf.reduce_logsumexp(l_i, [1]) - tf.log(kf)
        w = tf.stop_gradient(tf.exp((l - l_logsumexp)))
    else:
        L_hat_minus_i = tf.reduce_mean(l_i, [1])
        w = 1.

    local_l = tf.stop_gradient(L_hat - L_hat_minus_i)

    if not cfg['optim/geometric_mean']:
        # correction factor for multiplying by 1. / (kf - 1.) above
        # to verify this, work out 2x2 matrix of samples by hand
        local_l = local_l * k

    loss = local_l * log_q_h + w * l
    return loss / tf.to_float(b)
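For orientation, a hypothetical call sketch for build_vimco_loss (not from the original project; it assumes TensorFlow 1.x, and the cfg key, shapes, and random inputs are made up purely to show the expected [n_samples, batch_size] layout):

# Hypothetical usage of build_vimco_loss above (assumes TF 1.x; cfg contents,
# shapes and random inputs are illustrative only).
import tensorflow as tf

cfg = {'optim/geometric_mean': True}
k, b = 5, 2                               # n_samples, batch_size
l = tf.random_normal([k, b])              # per-sample ELBO terms, shape [k, b]
log_q_h = tf.random_normal([k, b])        # log q(h | x) for the same samples

loss = build_vimco_loss(cfg, l, log_q_h)  # shape [k, b]; the caller reduces it

with tf.Session() as sess:
    print(sess.run(loss))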
def softmax_loss(self, antecedent_scores, antecedent_labels):
    gold_scores = antecedent_scores + tf.log(tf.to_float(antecedent_labels))  # [num_mentions, max_ant + 1]
    marginalized_gold_scores = tf.reduce_logsumexp(gold_scores, [1])  # [num_mentions]
    log_norm = tf.reduce_logsumexp(antecedent_scores, [1])  # [num_mentions]
    return log_norm - marginalized_gold_scores  # [num_mentions]
def segment_logsumexp(xs, segments):
    """ Similar to tf.segment_sum, but computes logsumexp rather than sum """
    # Stop gradients following the implementation of tf.reduce_logsumexp
    maxs = tf.stop_gradient(tf.reduce_max(xs, axis=1))
    segment_maxes = tf.segment_max(maxs, segments)
    xs -= tf.expand_dims(tf.gather(segment_maxes, segments), 1)
    sums = tf.reduce_sum(tf.exp(xs), axis=1)
    return tf.log(tf.segment_sum(sums, segments)) + segment_maxes
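As a sanity check (not from the original project; assumes TensorFlow 1.x and uses made-up shapes and segment ids), each output of segment_logsumexp should equal tf.reduce_logsumexp taken over all elements of the rows assigned to that segment:

# Sanity check for segment_logsumexp above (assumes TF 1.x; data is illustrative).
import numpy as np
import tensorflow as tf

xs = tf.constant(np.random.randn(4, 3).astype(np.float32))  # 4 rows of 3 scores
segments = tf.constant([0, 0, 1, 1])                         # rows 0-1 -> segment 0, rows 2-3 -> segment 1

pooled = segment_logsumexp(xs, segments)                     # shape [2]
expected = tf.stack([tf.reduce_logsumexp(xs[:2]),            # all 6 elements of segment 0
                     tf.reduce_logsumexp(xs[2:])])           # all 6 elements of segment 1

with tf.Session() as sess:
    p, e = sess.run([pooled, expected])
    print(np.allclose(p, e, atol=1e-5))                      # True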
def predict(self, answer, start_logits, end_logits, mask) -> Prediction:
    masked_start_logits = exp_mask(start_logits, mask)
    masked_end_logits = exp_mask(end_logits, mask)

    if len(answer) == 1:
        # answer span is encoded in a sparse int array
        answer_spans = answer[0]
        losses1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=masked_start_logits, labels=answer_spans[:, 0])
        losses2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=masked_end_logits, labels=answer_spans[:, 1])
        loss = tf.add_n([tf.reduce_mean(losses1), tf.reduce_mean(losses2)], name="loss")
    elif len(answer) == 2 and all(x.dtype == tf.bool for x in answer):
        # all correct start/end bounds are marked in a dense bool array
        # In this case there might be multiple answer spans, so we need an aggregation strategy
        losses = []
        for answer_mask, logits in zip(answer, [masked_start_logits, masked_end_logits]):
            log_norm = tf.reduce_logsumexp(logits, axis=1)
            if self.aggregate == "sum":
                log_score = tf.reduce_logsumexp(
                    logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer_mask, tf.float32)), axis=1)
            elif self.aggregate == "max":
                log_score = tf.reduce_max(
                    logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer_mask, tf.float32)), axis=1)
            else:
                raise ValueError()
            losses.append(tf.reduce_mean(-(log_score - log_norm)))
        loss = tf.add_n(losses)
    else:
        raise NotImplementedError()

    tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
    return BoundaryPrediction(tf.nn.softmax(masked_start_logits),
                              tf.nn.softmax(masked_end_logits),
                              masked_start_logits, masked_end_logits, mask)
def predict(self, answer, start_logits, end_logits, mask) -> Prediction:
    masked_start_logits = exp_mask(start_logits, mask)
    masked_end_logits = exp_mask(end_logits, mask)
    batch_dim = tf.shape(start_logits)[0]

    if len(answer) == 2 and all(x.dtype == tf.bool for x in answer):
        none_logit = tf.get_variable("none-logit", initializer=self.non_init, dtype=tf.float32)
        none_logit = tf.tile(tf.expand_dims(none_logit, 0), [batch_dim])

        all_logits = tf.reshape(tf.expand_dims(masked_start_logits, 1) +
                                tf.expand_dims(masked_end_logits, 2),
                                (batch_dim, -1))

        # (batch, (l * l) + 1) logits including the none option
        all_logits = tf.concat([all_logits, tf.expand_dims(none_logit, 1)], axis=1)
        log_norms = tf.reduce_logsumexp(all_logits, axis=1)

        # Now build a "correctness" mask in the same format
        correct_mask = tf.logical_and(tf.expand_dims(answer[0], 1), tf.expand_dims(answer[1], 2))
        correct_mask = tf.reshape(correct_mask, (batch_dim, -1))
        correct_mask = tf.concat([correct_mask, tf.logical_not(
            tf.reduce_any(answer[0], axis=1, keep_dims=True))], axis=1)

        log_correct = tf.reduce_logsumexp(
            all_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(correct_mask, tf.float32)), axis=1)
        loss = tf.reduce_mean(-(log_correct - log_norms))
        probs = tf.nn.softmax(all_logits)
        tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
        return ConfidencePrediction(probs[:, :-1], masked_start_logits, masked_end_logits,
                                    probs[:, -1], none_logit)
    else:
        raise NotImplementedError()
def logsumexp(v, reduction_indices=None, keep_dims=False):
    if float(tf.__version__[:4]) > 0.10:  # reduce_logsumexp does not exist below tfv0.11
        if isinstance(reduction_indices, int):  # due to a bug in tfv0.11
            reduction_indices = [reduction_indices]
        return handle_inf(
            tf.reduce_logsumexp(v, reduction_indices,  # this is a bit fragile. reduction_indices got renamed to axis in tfv0.12
                                keep_dims=keep_dims)
        )
    else:
        m = tf.reduce_max(v, reduction_indices=reduction_indices, keep_dims=keep_dims)
        # Use SMALL_NUMBER to handle v = []
        return m + tf.log(tf.reduce_sum(tf.exp(v - m),
                                        reduction_indices=reduction_indices,
                                        keep_dims=keep_dims) + SMALL_NUMBER)
def log_prob_from_logits(logits):
    """Log-softmax: normalizes logits into log-probabilities."""
    return logits - tf.reduce_logsumexp(logits, keep_dims=True)
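The expression above is the standard log-softmax identity; for a 1-D logits vector it should agree with tf.nn.log_softmax. A minimal check (not from the original project; assumes TensorFlow 1.x with made-up values):

# Minimal check (assumes TF 1.x): for 1-D logits, subtracting the global
# logsumexp matches tf.nn.log_softmax.
import tensorflow as tf

logits = tf.constant([2.0, 1.0, 0.1])
a = log_prob_from_logits(logits)   # logits - reduce_logsumexp over all elements
b = tf.nn.log_softmax(logits)

with tf.Session() as sess:
    print(sess.run([a, b]))        # the two results should agree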
def gm_log_p(params_out, x_target, dim):
    """ computes log probability of target in Gaussian mixture with given parameters """
    mean_x, cov_x, pi_x_logit = params_out
    pi_x = tf.nn.softmax(pi_x_logit)
    mean_x = tf.transpose(mean_x, perm=[1, 0, 2])
    cov_x = tf.transpose(cov_x, perm=[1, 0, 2])
    pi_x = tf.transpose(pi_x, perm=[1, 0])
    x_diff = x_target - mean_x
    x_square = tf.reduce_sum((x_diff / cov_x) * x_diff, axis=[2])
    log_x_exp = -0.5 * x_square
    log_cov_x_det = tf.reduce_sum(tf.log(cov_x), axis=[2])
    log_x_norm = -0.5 * (dim * tf.log(2 * np.pi) + log_cov_x_det) + pi_x
    log_p = tf.reduce_logsumexp(log_x_norm + log_x_exp, axis=[0])
    return log_p, log_x_norm, log_x_exp, tf.abs(x_diff)
def lookup(self, symbol):
    if symbol == None:
        return None

    if type(symbol) == type([]):
        return [self.lookup(k) for k in symbol]

    if type(symbol) == type({}) or type(symbol) == hc.Config:
        return hc.Config({k: self.lookup(symbol[k]) for k in symbol.keys()})

    if type(symbol) != type(""):
        return symbol

    if symbol.startswith('function:'):
        return self.lookup_function(symbol)

    if symbol.startswith('class:'):
        return self.lookup_class(symbol)

    if symbol == 'tanh':
        return tf.nn.tanh
    if symbol == 'sigmoid':
        return tf.nn.sigmoid
    if symbol == 'batch_norm':
        return layer_regularizers.batch_norm_1
    if symbol == 'layer_norm':
        return layer_regularizers.layer_norm_1
    if symbol == "crelu":
        return tf.nn.crelu
    if symbol == "prelu":
        return self.prelu()
    if symbol == "selu":
        return selu
    if symbol == "lrelu":
        return lrelu
    if symbol == "relu":
        return tf.nn.relu
    if symbol == 'square':
        return tf.square
    if symbol == 'reduce_mean':
        return tf.reduce_mean
    if symbol == 'reduce_min':
        return tf.reduce_min
    if symbol == 'reduce_sum':
        return tf.reduce_sum
    if symbol == 'reduce_logsumexp':
        return tf.reduce_logsumexp
    if symbol == 'reduce_linear':
        return self.reduce_linear()
    if symbol == 'l1_distance':
        return l1_distance
    if symbol == 'l2_distance':
        return l2_distance

    return symbol
def ASw_transition_loss_pred(self, i, j, combined_head, combined_dep, transition_logit, SHIFT):
    # extract relevant portions of params
    rel_trans_feat_ids = self.trans_feat_ids[i*self.args.beam_size+j] if not self.train else self.trans_feat_ids[i, j]
    rel_trans_feat_size = self.trans_feat_sizes[i*self.args.beam_size+j] if not self.train else self.trans_feat_sizes[i, j]

    # core computations
    has_shift = tf.cond(tf.equal(rel_trans_feat_ids[0, 0], SHIFT), lambda: tf.constant(1), lambda: tf.constant(0))
    arc_trans_count = rel_trans_feat_size - has_shift

    arc_trans_feat_ids = tf.gather(rel_trans_feat_ids, tf.range(has_shift, rel_trans_feat_size))
    rel_head = tf.reshape(tf.gather(combined_head, arc_trans_feat_ids[:, 1]), [arc_trans_count, self.args.rel_emb_dim])
    rel_dep = tf.reshape(tf.gather(combined_dep, arc_trans_feat_ids[:, 2]), [arc_trans_count, self.args.rel_emb_dim])
    rel_hid = self.rel_merge(rel_head, rel_dep)
    rel_logit = self.rel_dense(rel_hid)
    arc_logit = tf.reshape(rel_logit, [-1])

    def logaddexp(a, b):
        mx = tf.maximum(a, b)
        return tf.log(tf.exp(a-mx) + tf.exp(b-mx)) + mx

    if self.train:
        # compute a loss and return it
        log_partition = tf.reduce_logsumexp(arc_logit)
        log_partition = tf.cond(tf.greater(has_shift, 0),
                                lambda: logaddexp(log_partition, transition_logit[rel_trans_feat_ids[0, 3]]),
                                lambda: log_partition)
        arc_logit = log_partition - arc_logit
        res = tf.cond(tf.greater(has_shift, 0),
                      lambda: tf.cond(tf.greater(self.trans_labels[i, j], 0),
                                      lambda: arc_logit[self.trans_labels[i, j]-1],
                                      lambda: log_partition - transition_logit[rel_trans_feat_ids[0, 3]]),
                      lambda: arc_logit[self.trans_labels[i, j]])
        return res
    else:
        # just return predictions
        arc_logit = tf.reshape(rel_logit, [-1])
        log_partition = tf.reduce_logsumexp(arc_logit)
        log_partition = tf.cond(tf.greater(has_shift, 0),
                                lambda: logaddexp(log_partition, transition_logit[rel_trans_feat_ids[0, 3]]),
                                lambda: log_partition)
        arc_logit = log_partition - arc_logit
        arc_pred = tf.cond(tf.greater(has_shift, 0),
                           lambda: tf.concat([tf.reshape(log_partition - transition_logit[rel_trans_feat_ids[0, 3]], (-1, 1)),
                                              tf.reshape(arc_logit, (-1, 1))], 0),
                           lambda: tf.reshape(arc_logit, (-1, 1)))

        # correct shape
        current_output_shape = has_shift + arc_trans_count * rel_logit.get_shape()[1]
        arc_pred = tf.concat([arc_pred,
                              1e20 * tf.ones((tf.subtract(self.pred_output_size, current_output_shape), 1),
                                             dtype=tf.float32)], 0)
        arc_pred = tf.reshape(arc_pred, [-1])
        return arc_pred
def bow_loss_by_example(logits, targets, weights,
                        average_across_timesteps=False):
    """Loss for a bow of logits (per example).

    As opposed to sequence loss this is supposed to ignore the order.
    Does not seem to work yet.

    Args:
        logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
        targets: List of 1D batch-sized int32 Tensors of the same length as logits.
        weights: List of 1D batch-sized float-Tensors of the same length as logits.
        average_across_timesteps: If set, divide the returned cost by the total
            label weight.

    Returns:
        1D batch-sized float Tensor: The loss for each bow.

    Raises:
        ValueError: If len(logits) is different from len(targets) or len(weights).
    """
    if len(targets) != len(logits) or len(weights) != len(logits):
        raise ValueError('Lengths of logits, weights, and targets must be the same '
                         '%d, %d, %d.' % (len(logits), len(weights), len(targets)))
    batch_size = logits[0].shape[0]
    vocab_size = logits[0].shape[1]
    logitssum = tf.zeros((batch_size, vocab_size), tf.float32)
    targetset = tf.zeros((batch_size, vocab_size), tf.float32)
    for target, weight in zip(targets, weights):
        targetset += (tf.one_hot(target, vocab_size) * weight[:, None])
    weight = tf.ones((batch_size), tf.float32)
    for logit in logits:
        softmax = tf.nn.softmax(logit)
        logitssum += (logitssum * weight[:, None])
        weight = tf.maximum(0.0, weight - softmax[:, 3])
    # logitssum = tf.minimum(logitssum, 1.0)
    # targetset = tf.minimum(targetset, 1.0)
    # loss = tf.nn.sigmoid_cross_entropy_with_logits(
    #     labels=targetset, logits=logitssum)
    loss = tf.reduce_sum(tf.squared_difference(logitssum, targetset), axis=1)
    # crossent = tf.maximum(logitssum, 0.0) - (
    #     logitssum * targetset) + tf.log(1.0 + tf.exp(-1.0 * tf.abs(logitssum)))
    # log_perps = tf.reduce_logsumexp(crossent, axis=1)
    if average_across_timesteps:
        total_size = tf.add_n(weights)
        total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
        loss /= total_size
    return loss
def blurred_cross_entropy(output, target, filter_size=11, sampling_range=3.5, pixel_weights=None):
    """
    Apply a Gaussian smoothing filter to the target probabilities (i.e. the one-hot
    representation of target) and compute the cross entropy loss between softmax(output)
    and the blurred target probabilities.

    :param output: A rank-4 or rank-5 tensor with shape=(samples, [sequence_position,] x, y, num_classes)
        representing the network input of the output layer (not activated)
    :param target: A rank-3 or rank-4 tensor with shape=(samples, [sequence_position,] x, y)
        representing the target labels. It must contain int values in 0..num_classes-1.
    :param filter_size: A length-2 list of int specifying the size of the Gaussian filter
        that will be applied to the target probabilities.
    :param pixel_weights: A rank-3 or rank-4 tensor with shape=(samples, [sequence_position,] x, y)
        representing factors, that will be applied to the loss of the corresponding pixel.
        This can be e.g. used to void certain pixels by weighting them to 0, i.e. suppress
        their error induction.
    :return: A scalar operation representing the blurred cross entropy loss.
    """
    # convert target to one-hot
    output_shape = output.shape.as_list()
    one_hot = tf.one_hot(target, output_shape[-1], dtype=tf.float32)
    if (len(output_shape) > 4):
        one_hot = tf.reshape(one_hot, [np.prod(output_shape[:-3])] + output_shape[-3:])

    # blur target probabilities
    # gauss_filter = weight_gauss_conv2d(filter_size + [output_shape[-1], 1])
    # blurred_target = tf.nn.depthwise_conv2d(one_hot, gauss_filter, [1, 1, 1, 1], 'SAME')
    blurred_target = gaussian_blur(one_hot, filter_size, sampling_range)

    if (len(output_shape) > 4):
        blurred_target = tf.reshape(blurred_target, output_shape)

    # compute log softmax predictions and cross entropy
    log_pred = output - tf.reduce_logsumexp(output, axis=[len(output_shape) - 1], keep_dims=True)

    # Apply pixel-wise weighting
    if pixel_weights is not None:
        log_pred *= pixel_weights

    cross_entropy = -tf.reduce_sum(blurred_target * log_pred, axis=[len(output_shape)-1])

    if pixel_weights is not None:
        loss = tf.reduce_sum(cross_entropy) / tf.reduce_sum(pixel_weights)
    else:
        loss = tf.reduce_mean(cross_entropy)

    return loss
def build_elbo(self, n_samples, training=False):
    cfg = self.config
    reuse = False
    if training:
        reuse = True
    z = self.variational.sample(self.data, n_samples=n_samples, reuse=reuse)
    log_q_z = self.variational.log_prob(z, reuse=reuse)
    self.log_q_z = log_q_z
    log_p_x_z = self.model.log_prob(self.data, z, reuse=reuse)
    if cfg['optim/deterministic_annealing'] and training:
        self.build_magnitude()
        tf.summary.scalar('c/magnitude', self.magnitude)
        magnitude = tf.maximum(1., self.magnitude)
        elbo = log_p_x_z - magnitude * log_q_z
    else:
        elbo = log_p_x_z - log_q_z
    if training:
        self.elbo_loss = elbo
        _, variance = tf.nn.moments(elbo, [0])
        self.elbo_variance = tf.reduce_mean(variance)
        self.log_q_z_loss = log_q_z
        self.variational.build_entropy(z)
        self.q_z_sample = z
        slim.summarize_collection('variational')
        slim.summarize_collection('model')
        slim.summarize_activations('variational')
        slim.summarize_activations('model')
    else:
        self.elbo = elbo
        self.log_q_z = log_q_z
        self.log_p_x_hat = (tf.reduce_logsumexp(elbo, [0], keep_dims=True) -
                            tf.log(float(cfg['q/n_samples_stats'])))
        tf.summary.scalar('o/log_p_x_hat', tf.reduce_mean(self.log_p_x_hat))

    def sum_mean(x):
        return tf.reduce_sum(tf.reduce_mean(x, 0))

    self.elbo_sum = sum_mean(elbo)
    self.q_entropy = -sum_mean(log_q_z)
    self.E_log_lik = sum_mean(log_p_x_z)
    tf.summary.scalar('o/elbo_sum', sum_mean(elbo))
    tf.summary.scalar('o/elbo_mean', sum_mean(elbo) / cfg['batch_size'])
    tf.summary.scalar('o/E_log_q_z', sum_mean(log_q_z))
    tf.summary.scalar('o/E_log_p_x_z', self.E_log_lik)
def predict(self, answer, start_logits, end_logits, mask) -> Prediction:
    bound = self.bound
    f1_weight = self.f1_weight
    aggregate = self.aggregate

    masked_logits1 = exp_mask(start_logits, mask)
    masked_logits2 = exp_mask(end_logits, mask)

    span_logits = []
    for i in range(self.bound):
        if i == 0:
            span_logits.append(masked_logits1 + masked_logits2)
        else:
            span_logits.append(masked_logits1[:, :-i] + masked_logits2[:, i:])
    span_logits = tf.concat(span_logits, axis=1)
    l = tf.shape(start_logits)[1]

    if len(answer) == 1:
        answer = answer[0]
        if answer.dtype == tf.int32:
            if f1_weight == 0:
                answer_ix = to_packed_coordinates(answer, l, bound)
                loss = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=span_logits, labels=answer_ix))
            else:
                f1_mask = packed_span_f1_mask(answer, l, bound)
                if f1_weight < 1:
                    f1_mask *= f1_weight
                    f1_mask += (1 - f1_weight) * tf.one_hot(to_packed_coordinates(answer, l, bound), l)

                # TODO can we stay in log space? (actually its tricky since f1_mask can have zeros...)
                probs = tf.nn.softmax(span_logits)
                loss = -tf.reduce_mean(tf.log(tf.reduce_sum(probs * f1_mask, axis=1)))
        else:
            log_norm = tf.reduce_logsumexp(span_logits, axis=1)
            if aggregate == "sum":
                log_score = tf.reduce_logsumexp(
                    span_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer, tf.float32)), axis=1)
            elif aggregate == "max":
                log_score = tf.reduce_max(
                    span_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer, tf.float32)), axis=1)
            else:
                raise NotImplementedError()
            loss = tf.reduce_mean(-(log_score - log_norm))
    else:
        raise NotImplementedError()

    tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
    return PackedSpanPrediction(span_logits, l, bound)