We extracted the following 50 code examples from open-source Python projects to illustrate how to use tensorflow.pow().
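Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the basic call: tf.pow() raises one tensor to the power of another, element-wise, with the usual broadcasting rules. It assumes the TF 1.x graph/session style that most of the examples on this page use, and the tensor values are purely illustrative.

import tensorflow as tf

x = tf.constant([1.0, 2.0, 3.0])
cubed = tf.pow(x, 3.0)              # element-wise cube: [1., 8., 27.]
mixed = tf.pow(x, [1.0, 2.0, 0.5])  # a different exponent per element

with tf.Session() as sess:
    print(sess.run([cubed, mixed]))  # => [1., 8., 27.] and [1., 4., ~1.732]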
def ae(x):
    if nonlinearity_name == 'relu':
        f = tf.nn.relu
    elif nonlinearity_name == 'elu':
        f = tf.nn.elu
    elif nonlinearity_name == 'gelu':
        # def gelu(x):
        #     return tf.mul(x, tf.erfc(-x / tf.sqrt(2.)) / 2.)
        # f = gelu
        def gelu_fast(_x):
            return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
        f = gelu_fast
    elif nonlinearity_name == 'silu':
        def silu(_x):
            return _x * tf.sigmoid(_x)
        f = silu
    # elif nonlinearity_name == 'soi':
    #     def soi_map(x):
    #         u = tf.random_uniform(tf.shape(x))
    #         mask = tf.to_float(tf.less(u, (1 + tf.erf(x / tf.sqrt(2.))) / 2.))
    #         return tf.cond(is_training, lambda: tf.mul(mask, x),
    #                        lambda: tf.mul(x, tf.erfc(-x / tf.sqrt(2.)) / 2.))
    #     f = soi_map
    else:
        raise NameError("Need 'relu', 'elu', 'gelu', or 'silu' for nonlinearity_name")

    h1 = f(tf.matmul(x, W['1']) + b['1'])
    h2 = f(tf.matmul(h1, W['2']) + b['2'])
    h3 = f(tf.matmul(h2, W['3']) + b['3'])
    h4 = f(tf.matmul(h3, W['4']) + b['4'])
    h5 = f(tf.matmul(h4, W['5']) + b['5'])
    h6 = f(tf.matmul(h5, W['6']) + b['6'])
    h7 = f(tf.matmul(h6, W['7']) + b['7'])
    return tf.matmul(h7, W['8']) + b['8']
def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights,
                    sigma=1.0, dim=[1]):
    sigma_2 = sigma ** 2
    box_diff = bbox_pred - bbox_targets
    in_box_diff = bbox_inside_weights * box_diff
    abs_in_box_diff = tf.abs(in_box_diff)
    smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))
    in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
                  + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
    out_loss_box = bbox_outside_weights * in_loss_box
    loss_box = tf.reduce_mean(tf.reduce_sum(
        out_loss_box,
        axis=dim
    ))
    return loss_box
def _anneal_weight(init_val, final_val, anneal_type, global_step, anneal_steps, hold_for=0.,
                   steps_div=1., dtype=tf.float64):
    val, final, step, hold_for, anneal_steps, steps_div = (tf.cast(i, dtype) for i in
        (init_val, final_val, global_step, hold_for, anneal_steps, steps_div))
    step = tf.maximum(step - hold_for, 0.)

    if anneal_type == 'exp':
        decay_rate = tf.pow(final / val, steps_div / anneal_steps)
        val = tf.train.exponential_decay(val, step, steps_div, decay_rate)
    elif anneal_type == 'linear':
        val = final + (val - final) * (1. - step / anneal_steps)
    else:
        raise NotImplementedError

    anneal_weight = tf.maximum(final, val)
    return anneal_weight
def _embed_sentences(self):
    """Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
    # Get word features
    word_embeddings = self._get_embedding()
    word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
    # Get marginal estimates and scaling term
    batch_size = tf.shape(word_feats)[0]
    a = tf.pow(10.0, self._get_a_exp())
    p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
    q = tf.reshape(
        a / (a + tf.nn.embedding_lookup(p, self.input)),
        (batch_size, self.mx_len, 1)
    )
    # Compute initial sentence embedding
    z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
    S = z * tf.reduce_sum(q * word_feats, axis=1)
    # Compute common component
    S_centered = S - tf.reduce_mean(S, axis=0)
    _, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
    self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
    # Common component removal
    ccx = tf.reshape(self._get_common_component(), (1, self.d))
    sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
    return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
    ''' Adam optimizer '''
    updates = []
    if type(cost_or_grads) is not list:
        grads = tf.gradients(cost_or_grads, params)
    else:
        grads = cost_or_grads
    t = tf.Variable(1., 'adam_t')
    for p, g in zip(params, grads):
        mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1 > 0:
            v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            v_t = mom1 * v + (1. - mom1) * g
            v_hat = v_t / (1. - tf.pow(mom1, t))
            updates.append(v.assign(v_t))
        else:
            v_hat = g
        mg_t = mom2 * mg + (1. - mom2) * tf.square(g)
        mg_hat = mg_t / (1. - tf.pow(mom2, t))
        g_t = v_hat / tf.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))
    updates.append(t.assign_add(1))
    return tf.group(*updates)
def tune(self, acceptance_rate, fresh_start):
    def adapt_stepsize():
        new_step = tf.assign(self.step, (1 - fresh_start) * self.step + 1)
        rate1 = tf.div(1.0, new_step + self.t0)
        new_h_bar = tf.assign(
            self.h_bar, (1 - fresh_start) * (1 - rate1) * self.h_bar +
            rate1 * (self.delta - acceptance_rate))
        log_epsilon = self.mu - tf.sqrt(new_step) / self.gamma * new_h_bar
        rate = tf.pow(new_step, -self.kappa)
        new_log_epsilon_bar = tf.assign(
            self.log_epsilon_bar, rate * log_epsilon +
            (1 - fresh_start) * (1 - rate) * self.log_epsilon_bar)
        with tf.control_dependencies([new_log_epsilon_bar]):
            new_log_epsilon = tf.identity(log_epsilon)

        return tf.exp(new_log_epsilon)

    c = tf.cond(self.adapt_step_size,
                adapt_stepsize,
                lambda: tf.exp(self.log_epsilon_bar))

    return c
def update(self, x):
    # x: (chain_dims data_dims)
    new_t = tf.assign(self.t, self.t + 1)
    weight = (1 - self.decay) / (1 - tf.pow(self.decay, new_t))
    # incr: (chain_dims data_dims)
    incr = [weight * (q - mean) for q, mean in zip(x, self.mean)]
    # mean: (1,...,1 data_dims)
    update_mean = [mean.assign_add(
        tf.reduce_mean(i, axis=self.chain_axes, keep_dims=True))
        for mean, i in zip(self.mean, incr)]
    # var: (1,...,1 data_dims)
    new_var = [
        (1 - weight) * var +
        tf.reduce_mean(i * (q - mean), axis=self.chain_axes, keep_dims=True)
        for var, i, q, mean in zip(self.var, incr, x, update_mean)]
    update_var = [tf.assign(var, n_var) for var, n_var in zip(self.var, new_var)]
    return update_var
def __init__(self, n_features, lenscale=1.0, p=1, variational=False,
             lenscale_posterior=None):
    """Create an instance of an arc cosine kernel layer."""
    # Setup random weights
    if variational:
        kern = RBFVariational(lenscale=lenscale,
                              lenscale_posterior=lenscale_posterior)
    else:
        kern = RBF(lenscale=lenscale)
    super().__init__(n_features=n_features, kernel=kern)

    # Kernel order
    assert isinstance(p, int) and p >= 0
    if p == 0:
        self.pfunc = tf.sign
    elif p == 1:
        self.pfunc = lambda x: x
    else:
        self.pfunc = lambda x: tf.pow(x, p)
def _MatMulGradMom(op, W, out_grad, batch_size, mom=2):
    """Computes gradient moment for a weight matrix through a MatMul operation.

    Assumes ``Z=tf.matmul(A, W)``, where ``W`` is a d1xd2 weight matrix, ``A``
    are the nxd1 activations of the previous layer (n being the batch size).
    ``out_grad`` is the gradient w.r.t. ``Z``, as computed by ``tf.gradients()``.
    No transposes in the MatMul operation allowed.

    Inputs:
        :op: The MatMul operation
        :W: The weight matrix (the tensor, not the variable)
        :out_grad: The tensor of gradient w.r.t. to the output of the op
        :batch_size: Batch size n (constant integer or scalar int tf.Tensor)
        :mom: Integer moment desired (defaults to 2)"""

    assert op.type == "MatMul"
    t_a, t_b = op.get_attr("transpose_a"), op.get_attr("transpose_b")
    assert W is op.inputs[1] and not t_a and not t_b

    A = op.inputs[0]
    out_grad_pow = tf.pow(out_grad, mom)
    A_pow = tf.pow(A, mom)
    return tf.mul(batch_size, tf.matmul(A_pow, out_grad_pow, transpose_a=True))
def get_cubic_root(self):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # We substitute x, which is sqrt(mu), with x = y + 1.
    # It gives y^3 + py = q
    # where p = (D^2 h_min^2)/(2*C) and q = -p.
    # We use the Vieta's substution to compute the root.
    # There is only one real solution y (which is in [0, 1] ).
    # http://mathworld.wolfram.com/VietasSubstitution.html
    # assert_array = \
    #     [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
    #      tf.Assert(tf.logical_not(tf.is_nan(self._h_min) ), [self._h_min,]),
    #      tf.Assert(tf.logical_not(tf.is_nan(self._grad_var) ), [self._grad_var,]),
    #      tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
    #      tf.Assert(tf.logical_not(tf.is_inf(self._h_min) ), [self._h_min,]),
    #      tf.Assert(tf.logical_not(tf.is_inf(self._grad_var) ), [self._grad_var,])]
    # with tf.control_dependencies(assert_array):
    # EPS in the numerator to prevent momentum being exactly one in case of 0 gradient
    p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
    w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
    w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
    y = w - p / 3.0 / (w + EPS)
    x = y + 1
    return x
def lppool(inpOp, pnorm, kH, kW, dH, dW, padding, name):
    with tf.variable_scope(name):
        if pnorm == 2:
            pwr = tf.square(inpOp)
        else:
            pwr = tf.pow(inpOp, pnorm)

        subsamp = tf.nn.avg_pool(pwr,
                                 ksize=[1, kH, kW, 1],
                                 strides=[1, dH, dW, 1],
                                 padding=padding)
        subsamp_sum = tf.multiply(subsamp, kH*kW)

        if pnorm == 2:
            out = tf.sqrt(subsamp_sum)
        else:
            out = tf.pow(subsamp_sum, 1/pnorm)

    return out
def update_target_network(source_network, target_network, update_rate):
    target_network_update = []
    for v in source_network.variables():
        # this is equivalent to target = (1-alpha) * target + alpha * source
        # print ("source: " + v.name + " : " + str(v.get_shape()))
        pass
    for v in target_network.variables():
        # this is equivalent to target = (1-alpha) * target + alpha * source
        # print ("target: " + v.name + " : " + str(v.get_shape()))
        pass
    for v_source, v_target in zip(source_network.variables(), target_network.variables()):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(update_rate * (v_target - v_source))
        target_network_update.append(update_op)
    return tf.group(*target_network_update)

# def concat_nn_input(self, input1, input2):
#     return tf.concat(1, [input1, input2])

# def add_pow_values(self, values):
#     return self.concat_nn_input(values, 0.01 * tf.pow(values, [2 for i in range(self.action_size)]))
def loss_with_spring(self):
    margin = 5.0
    labels_t = self.y_
    labels_f = tf.subtract(1.0, self.y_, name="1-yi")  # labels_ = !labels;
    eucd2 = tf.pow(tf.subtract(self.o1, self.o2), 2)
    eucd2 = tf.reduce_sum(eucd2, 1)
    eucd = tf.sqrt(eucd2+1e-6, name="eucd")
    C = tf.constant(margin, name="C")
    # yi*||CNN(p1i)-CNN(p2i)||^2 + (1-yi)*max(0, C-||CNN(p1i)-CNN(p2i)||^2)
    pos = tf.multiply(labels_t, eucd2, name="yi_x_eucd2")
    # neg = tf.multiply(labels_f, tf.subtract(0.0,eucd2), name="yi_x_eucd2")
    # neg = tf.multiply(labels_f, tf.maximum(0.0, tf.subtract(C,eucd2)), name="Nyi_x_C-eucd_xx_2")
    neg = tf.multiply(labels_f, tf.pow(tf.maximum(tf.subtract(C, eucd), 0), 2), name="Nyi_x_C-eucd_xx_2")
    losses = tf.add(pos, neg, name="losses")
    loss = tf.reduce_mean(losses, name="loss")
    return loss
def lppool(inpOp, pnorm, kH, kW, dH, dW, padding):
    global pool_counter
    global parameters
    name = 'pool' + str(pool_counter)
    pool_counter += 1

    with tf.name_scope('lppool'):
        if pnorm == 2:
            pwr = tf.square(inpOp)
        else:
            pwr = tf.pow(inpOp, pnorm)

        subsamp = tf.nn.avg_pool(pwr,
                                 ksize=[1, kH, kW, 1],
                                 strides=[1, dH, dW, 1],
                                 padding=padding,
                                 name=name)
        subsamp_sum = tf.mul(subsamp, kH*kW)

        if pnorm == 2:
            out = tf.sqrt(subsamp_sum)
        else:
            out = tf.pow(subsamp_sum, 1/pnorm)

    return out
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
    ''' Adam optimizer '''
    updates = []
    if type(cost_or_grads) is not list:
        grads = tf.gradients(cost_or_grads, params)
    else:
        grads = cost_or_grads
    t = tf.Variable(1., 'adam_t')
    for p, g in zip(params, grads):
        mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1>0:
            v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            v_t = mom1*v + (1. - mom1)*g
            v_hat = v_t / (1. - tf.pow(mom1,t))
            updates.append(v.assign(v_t))
        else:
            v_hat = g
        mg_t = mom2*mg + (1. - mom2)*tf.square(g)
        mg_hat = mg_t / (1. - tf.pow(mom2,t))
        g_t = v_hat / tf.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))
    updates.append(t.assign_add(1))
    return tf.group(*updates)
def weighted_loss(y_true, y_softmax_conv, weight):
    """Compute weighted loss function per pixel.

    Loss = (1 - softmax(logits)) * targets * weight + softmax(logits) * (1 - targets) * weight

    Argument:
        y_true: [batch_size, depth, height, width, 1]
        weight_map: [batch_size, depth, height, width, 1]
        y_softmax_conv: [batch_size, depth, height, width, 2]
    """
    y_true = tf.to_float(tf.reshape(y_true[..., 0], [-1]))
    weight = tf.to_float(tf.reshape(weight[..., 0], [-1]))
    y_conv = tf.to_float(tf.reshape(y_softmax_conv[..., 1], [-1]))
    loss_pos = 1 / 2 * tf.pow((1 - y_conv), 2) * y_true * weight
    loss_neg = 1 / 2 * tf.pow(y_conv, 2) * (1 - y_true) * weight
    return tf.reduce_mean(loss_pos + loss_neg)
def apply_update(self, optimizer, grads_and_vars):
    (grads, vars) = zip(*grads_and_vars)

    # Gradient clipping
    if CustomTrainer.GRADIENT_CLIP in self.train_hypers:
        grads, global_norm = clip_ops.clip_by_global_norm(grads,
                                                          self.train_hypers[CustomTrainer.GRADIENT_CLIP])

    # Gradient noise
    if CustomTrainer.GRADIENT_NOISE in self.train_hypers:
        sigma_sqr = self.train_hypers[CustomTrainer.GRADIENT_NOISE]
        if CustomTrainer.GRADIENT_NOISE_DECAY in self.train_hypers:
            sigma_sqr /= tf.pow(1.0 + tf.to_float(self.global_step),
                                self.train_hypers[CustomTrainer.GRADIENT_NOISE_DECAY])
        grads_tmp = []
        for g in grads:
            if g is not None:
                noisy_grad = g + tf.sqrt(sigma_sqr)*tf.random_normal(tf.shape(g))
                grads_tmp.append(noisy_grad)
            else:
                grads_tmp.append(g)
        grads = grads_tmp

    train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.global_step)
    return train_op
def __init__(self, lin, lout, iniRange, graph=None):
    if graph != None:
        with graph.as_default():
            self.v = tf.Variable(tf.random_uniform([lin, lout], iniRange[0], iniRange[1]))
            self.g = tf.Variable(tf.random_uniform([lout], -1.0, 1.0))
            self.pow2 = tf.fill([lin, lout], 2.0)
            self.v_norm = tf.sqrt(tf.reduce_sum(tf.pow(self.v, self.pow2), 0))
            self.tile_div = tf.tile(tf.expand_dims(tf.div(self.g, self.v_norm), 0), [lin, 1])
            self.w = tf.mul(self.tile_div, self.v)
    else:
        self.v = tf.Variable(tf.random_uniform([lin, lout], -1/math.sqrt(lin), 1/math.sqrt(lin)))
        self.g = tf.Variable(tf.random_uniform([lout], -1.0, 1.0))
        self.pow2 = tf.fill([lin, lout], 2.0)
        self.v_norm = tf.sqrt(tf.reduce_sum(tf.pow(self.v, self.pow2), 0))
        self.tile_div = tf.tile(tf.expand_dims(tf.div(self.g, self.v_norm), 0), [lin, 1])
        self.w = tf.mul(self.tile_div, self.v)
def gauss(mean, stddev, ksize):
    """Use Tensorflow to compute a Gaussian Kernel.

    Parameters
    ----------
    mean : float
        Mean of the Gaussian (e.g. 0.0).
    stddev : float
        Standard Deviation of the Gaussian (e.g. 1.0).
    ksize : int
        Size of kernel (e.g. 16).

    Returns
    -------
    kernel : np.ndarray
        Computed Gaussian Kernel using Tensorflow.
    """
    g = tf.Graph()
    with tf.Session(graph=g):
        x = tf.linspace(-3.0, 3.0, ksize)
        z = (tf.exp(tf.neg(tf.pow(x - mean, 2.0) /
                           (2.0 * tf.pow(stddev, 2.0)))) *
             (1.0 / (stddev * tf.sqrt(2.0 * 3.1415))))
        return z.eval()
def adam_updates(params, cost_or_grads, lr=0.001, B1=0.9, B2=0.999):
    ''' Adam optimizer '''
    updates = []
    if type(cost_or_grads) is not list:
        grads = tf.gradients(cost_or_grads, params)
    else:
        grads = cost_or_grads
    t = tf.Variable(1., 'adam_t')
    for p, g in zip(params, grads):
        v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
        if B1>0:
            m = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_m')
            m_t = B1*m + (1. - B1)*g
            m_hat = m_t / (1. - tf.pow(B1,t))
            updates.append(m.assign(m_t))
        else:
            m_hat = g
        v_t = B2*v + (1. - B2)*tf.square(g)
        v_hat = v_t / (1. - tf.pow(B2,t))
        g_t = m_hat / tf.sqrt(v_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append(v.assign(v_t))
        updates.append(p.assign(p_t))
    updates.append(t.assign_add(1))
    return tf.group(*updates)
def address(M0, w0, head):
    # Content focusing
    # Compute cosine similarity
    key = tf.expand_dims(head["key"], 1)
    key_matches = tf.batch_matmul(key, tf.transpose(M0, [0, 2, 1]))
    key_matches = tf.squeeze(key_matches)
    key_mag = tf.expand_dims(NTMCell.magnitude(head["key"], 1), 1)
    M_col_mag = NTMCell.magnitude(M0, 2)
    cosine_sim = key_matches / (key_mag * M_col_mag)
    # Compute content weights
    wc = tf.nn.softmax(head["key_str"] * cosine_sim)

    # Location focusing
    wg = head["interp"] * wc + (1 - head["interp"]) * w0
    ws = rotate.ntm_rotate(wg, head["shift"])
    ws_pow = tf.pow(ws, head["sharp"])

    w1 = ws_pow / tf.reduce_sum(ws_pow, 1, keep_dims=True)
    return w1
def build_model(self):
    self.input_y = tf.placeholder(tf.float32, [None, self.num_class], name="input_y")  # 1*1, 1doc
    self.one_hot = tf.reshape(tf.cast(tf.one_hot(tf.cast(self.input_y, tf.int32), 2, 0, 1), tf.float32), [-1, 2])
    self.recon_loss = -tf.reduce_sum(tf.log(0.0001 + tf.gather(self.p_xi_h, self.x_id)))
    self.KL = -0.5 * tf.reduce_sum(1.0 + self.hlogvar - tf.pow(self.hmean, 2) \
                                   - tf.exp(self.hlogvar), reduction_indices=1)
    self.loss = tf.reduce_mean(0.0001 * self.KL + self.recon_loss)
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate, 0.9)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
    self.capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in self.grads_and_vars]
    self.train_op = self.optimizer.apply_gradients(self.capped_gvs)
    # self.optimizer = tf.train.AdamOptimizer(self.learning_rate, beta1=0.9).minimize(self.loss)
    self.init = tf.initialize_all_variables()
    self.sess.run(self.init)
def get_total_variation(x, shape):
    with tf.name_scope('get_total_variation'):
        # Get the dimensions of the variable image
        height = shape[1]
        width = shape[2]
        size = reduce(lambda a, b: a * b, shape) ** 2

        # Disjoin the variable image and evaluate the total variation
        x_cropped = x[:, :height - 1, :width - 1, :]
        left_term = tf.square(x[:, 1:, :width - 1, :] - x_cropped)
        right_term = tf.square(x[:, :height - 1, 1:, :] - x_cropped)
        smoothed_terms = tf.pow(left_term + right_term, TOTAL_VARIATION_SMOOTHING / 2.)
        return tf.reduce_sum(smoothed_terms) / size

# Parse arguments and assign them to their respective global variables
def lppool(inpOp, pnorm, kH, kW, dH, dW, padding, name):
    with tf.variable_scope(name):
        if pnorm == 2:
            pwr = tf.square(inpOp)
        else:
            pwr = tf.pow(inpOp, pnorm)

        subsamp = tf.nn.avg_pool(pwr,
                                 ksize=[1, kH, kW, 1],
                                 strides=[1, dH, dW, 1],
                                 padding=padding)
        subsamp_sum = tf.mul(subsamp, kH*kW)

        if pnorm == 2:
            out = tf.sqrt(subsamp_sum)
        else:
            out = tf.pow(subsamp_sum, 1/pnorm)

    return out
def chi2(exp, obs):
    """
    Compute CHI^2 statistics of non-zero expected elements
    """
    zero = tf.constant(0, dtype=tf.float32)
    mask = tf.not_equal(exp, zero)

    def masking(tensor, mask):
        return tf.boolean_mask(tensor, mask)

    stat = tf.reduce_sum(
        tf.div(
            tf.pow(
                tf.subtract(masking(obs, mask), masking(exp, mask)),
                2),
            masking(exp, mask)),
        name="chi2_statistics")

    return stat
def _apply_dense(self, grad, var):
    lr = (self._lr_t * math_ops.sqrt(1 - self._beta2_power) / (1 - self._beta1_power))
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - self._beta1_t)
    m_t = m * self._beta1_t
    m_t = m_t + m_scaled_g_values
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = tf.pow(grad, 2) * (1 - self._beta2_t)
    v_t = v * self._beta2_t
    v_t = v_t + v_scaled_g_values
    v_sqrt = tf.pow(v_t, self._pow_t)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (v_sqrt + self._epsilon_t),
                                      use_locking=self._use_locking)
    # regularization
    var_update = state_ops.assign_sub(var_update,
                                      self._dense_regularization * var,
                                      use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
def scaled_dot_product_attention_simple(q, k, v, bias, name=None):
    """scaled dot-product attention. One head. One spatial dimension.

    Args:
        q: a Tensor with shape [batch, length_q, depth_k]
        k: a Tensor with shape [batch, length_kv, depth_k]
        v: a Tensor with shape [batch, length_kv, depth_v]
        bias: optional Tensor broadcastable to [batch, length_q, length_kv]
        name: an optional string

    Returns:
        A Tensor.
    """
    with tf.variable_scope(
            name, default_name="scaled_dot_product_attention_simple"):
        scalar = tf.rsqrt(tf.to_float(common_layers.shape_list(q)[2]))
        logits = tf.matmul(q * scalar, k, transpose_b=True)
        if bias is not None:
            logits += bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        tf.summary.image(
            "attention", tf.expand_dims(tf.pow(weights, 0.2), 3), max_outputs=1)
        return tf.matmul(weights, v)
def _MatMulGradMom(op, W, out_grad, batch_size, mom=2):
    """Computes gradient moment for a weight matrix through a MatMul operation.

    Assumes ``Z=tf.matmul(A, W)``, where ``W`` is a d1xd2 weight matrix, ``A``
    are the nxd1 activations of the previous layer (n being the batch size).
    ``out_grad`` is the gradient w.r.t. ``Z``, as computed by ``tf.gradients()``.
    No transposes in the MatMul operation allowed.

    Inputs:
        :op: The MatMul operation
        :W: The weight matrix (the tensor, not the variable)
        :out_grad: The tensor of gradient w.r.t. to the output of the op
        :batch_size: Batch size n (constant integer or scalar int tf.Tensor)
        :mom: Integer moment desired (defaults to 2)"""

    assert op.type == "MatMul"
    t_a, t_b = op.get_attr("transpose_a"), op.get_attr("transpose_b")
    assert W is op.inputs[1] and not t_a and not t_b

    A = op.inputs[0]
    out_grad_pow = tf.pow(out_grad, mom)
    A_pow = tf.pow(A, mom)
    return tf.multiply(batch_size, tf.matmul(A_pow, out_grad_pow, transpose_a=True))
def connect_cores(input, output_dim, name):
    """Connect two cores given the inputs, synaptic weights, and output
    dimension. Inputs can be output from a previous core or spike inputs"""
    input_dim = int(input.get_shape()[1])
    s, axon_types, axon_weights = synapse_weight((input_dim, output_dim), name)
    b = leak_bias([output_dim], name)
    c = synapse_connection([input_dim, output_dim], name)
    xc = tf.reshape(input, (-1, input_dim, 1)) * c
    mu = b + tf.reduce_sum(xc * s, 1)
    sigma2 = tf.reduce_sum(xc * (1. - xc) * tf.pow(s, 2), 1)
    # Output is proba that each neuron fires
    x0 = tf.zeros_like(mu)
    output = normal_ccdf(x0, mu, sigma2)
    return output, b, c, axon_types, axon_weights, s
def buildTVNorm(model):
    adjustedImage = model.bgr

    yPlusOne = tf.slice(adjustedImage, [0,0,1,0], [1,imageShape[0],(imageShape[1]-1),imageShape[2]])
    xPlusOne = tf.slice(adjustedImage, [0,1,0,0], [1,(imageShape[0]-1),imageShape[1],imageShape[2]])

    inputNoiseYadj = tf.slice(adjustedImage, [0,0,0,0], [1,imageShape[0],(imageShape[1]-1),imageShape[2]])
    inputNoiseXadj = tf.slice(adjustedImage, [0,0,0,0], [1,(imageShape[0]-1),imageShape[1],imageShape[2]])

    lambdaBeta = (sigma**beta) / (imageShape[0]*imageShape[1]*((a*B)**beta))
    error1 = tf.slice(tf.square(yPlusOne-inputNoiseYadj), [0,0,0,0], [1,(imageShape[0]-1),(imageShape[1]-1), imageShape[2]])
    error2 = tf.slice(tf.square(xPlusOne-inputNoiseXadj), [0,0,0,0], [1,(imageShape[0]-1),(imageShape[1]-1), imageShape[2]])

    return lambdaBeta*tf.reduce_sum(tf.pow((error1+error2), (beta/2)))
def sharp_weights(self, after_conv_shift, sharp_gamma):
    """
    Sharpens the final weights

    Parameters:
    ----------
    after_conv_shift: Tensor (batch_size, memory_locations, number_of_keys)
        weights after circular Convolution
    sharp_gamma: Tensor (batch_size, number_of_keys)
        scalar to sharpen the final weights

    Returns: Tensor (batch_size, memory_locations, number_of_keys)
        final weights
    """
    sharp_gamma = tf.expand_dims(sharp_gamma, 1)
    powed_conv_w = tf.pow(after_conv_shift, sharp_gamma)
    return powed_conv_w / tf.expand_dims(tf.reduce_sum(powed_conv_w, 1), 1)
def kl_gaussian(mean_, logsigma,
                prior_mean=0., prior_logsigma=0.,
                regularizer_scale=1.):
    ''' KL-divergence between two gaussians.
    Useful for Variational AutoEncoders. Use this as an activation regularizer

    Parameters:
    -----------
    mean, logsigma: parameters of the input distributions
    prior_mean, prior_logsigma: paramaters of the desired distribution (note the
        log on logsigma)
    regularizer_scale: Rescales the regularization cost. Keep this 1 for most cases.

    Note
    ----
    origin implementation from seya:
    https://github.com/Philip-Bachman/ICML-2015/blob/master/LogPDFs.py
    Copyright (c) Philip Bachman
    '''
    gauss_klds = 0.5 * (prior_logsigma - logsigma +
                        ((tf.exp(logsigma) + pow((mean_ - prior_mean), 2.0)) / tf.exp(prior_logsigma)) - 1.0)
    return mean(gauss_klds)
def get_marginal_likelihood(yt, mean_yt, xt, s, alpha, beta, eta_mu, eta_sigma, eps, sigma_px,
                            epsilon=1e-8):
    yt_expand = tf.expand_dims(yt, 0)
    mean_yt = tf.reshape(mean_yt, [s, FLAGS.batch_size, 784])
    xt = tf.reshape(xt, [1, s, FLAGS.batch_size, FLAGS.hidden_size])
    # p_ygivenx = tf.reduce_prod(tf.pow(mean_yt, yt_expand) * tf.pow(1 - mean_yt, 1 - yt_expand), axis=2)
    v = alpha / (alpha + beta)
    pi = tf.concat(0, [v, [1.0]]) * tf.concat(0, [[1.0], tf.cumprod(1 - v)])
    p_x = gaussian_mixture_pdf(eta_mu, tf.square(eta_sigma) + tf.square(sigma_px), xt, pi)
    log_p_y_s = tf.reduce_sum(yt_expand * tf.log(mean_yt + epsilon) \
                              + (1.0 - yt_expand) * tf.log(1.0 - mean_yt + epsilon), 2) \
                + tf.log(p_x) \
                + 0.5 * tf.reduce_sum(tf.square(eps), 2)
    log_p_y_s_max = tf.reduce_max(log_p_y_s, reduction_indices=0)
    log_p_y = tf.log(tf.reduce_mean(tf.exp(log_p_y_s - log_p_y_s_max), 0)) + log_p_y_s_max
    return tf.reduce_mean(log_p_y)

# Taken from: https://github.com/tensorflow/tensorflow/issues/6322
def noisy_dense(inputs, units, bias_shape, c_names, w_i, b_i=None, activation=tf.nn.relu,
                noisy_distribution='factorised'):
    def f(e_list):
        return tf.multiply(tf.sign(e_list), tf.pow(tf.abs(e_list), 0.5))
    # dense1 = tf.layers.dense(tf.contrib.layers.flatten(relu5), activation=tf.nn.relu, units=50)
    if not isinstance(inputs, ops.Tensor):
        inputs = ops.convert_to_tensor(inputs, dtype='float')
        # dim_list = inputs.get_shape().as_list()
        # flatten_shape = dim_list[1] if len(dim_list) <= 2 else reduce(lambda x, y: x * y, dim_list[1:])
        # reshaped = tf.reshape(inputs, [dim_list[0], flatten_shape])
    if len(inputs.shape) > 2:
        inputs = tf.contrib.layers.flatten(inputs)
    flatten_shape = inputs.shape[1]
    weights = tf.get_variable('weights', shape=[flatten_shape, units], initializer=w_i)
    w_noise = tf.get_variable('w_noise', [flatten_shape, units], initializer=w_i, collections=c_names)
    if noisy_distribution == 'independent':
        weights += tf.multiply(tf.random_normal(shape=w_noise.shape), w_noise)
    elif noisy_distribution == 'factorised':
        noise_1 = f(tf.random_normal(tf.TensorShape([flatten_shape, 1]), dtype=tf.float32))
        noise_2 = f(tf.random_normal(tf.TensorShape([1, units]), dtype=tf.float32))
        weights += tf.multiply(noise_1 * noise_2, w_noise)
    dense = tf.matmul(inputs, weights)
    if bias_shape is not None:
        assert bias_shape[0] == units
        biases = tf.get_variable('biases', shape=bias_shape, initializer=b_i)
        b_noise = tf.get_variable('b_noise', [1, units], initializer=b_i, collections=c_names)
        if noisy_distribution == 'independent':
            biases += tf.multiply(tf.random_normal(shape=b_noise.shape), b_noise)
        elif noisy_distribution == 'factorised':
            biases += tf.multiply(noise_2, b_noise)
        return activation(dense + biases) if activation is not None else dense + biases
    return activation(dense) if activation is not None else dense
def gelu_fast(_x):
    return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
def _polynomial(tensor):
    size = int(tensor.get_shape()[1])
    pows = [tf.pow(tensor[:, n], n + 1) for n in range(size)]
    return tf.transpose(tf.pack(pows))
def GumbelSoftmaxLogDensity(y, p, tau):
    # EPS = tf.constant(1e-10)
    k = tf.shape(y)[-1]
    k = tf.cast(k, tf.float32)
    # y = y + EPS
    # y = tf.divide(y, tf.reduce_sum(y, -1, keep_dims=True))
    y = normalize_to_unit_sum(y)
    sum_p_over_y = tf.reduce_sum(tf.divide(p, tf.pow(y, tau)), -1)
    logp = tf.lgamma(k)
    logp = logp + (k - 1) * tf.log(tau)
    logp = logp - k * tf.log(sum_p_over_y)
    logp = logp + sum_p_over_y
    return logp
def perplexity(label, logit):
    words = tf.cast(tf.size(label), tf.float32)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logit)
    cross_entropy = tf.divide(tf.reduce_sum(cross_entropy), words)
    perplex = tf.pow(2.0, cross_entropy)
    return perplex
def weighted_binary_crossentropy(feature_weights):
    def loss(y_true, y_pred):
        # try:
        #     x = K.binary_crossentropy(y_pred, y_true)
        #     # y = tf.Variable(feature_weights.astype('float32'))
        #     # z = K.dot(x, y)
        #     y_true = tf.pow(y_true + 1e-5, .75)
        #     y2 = tf.div(y_true, tf.reshape(K.sum(y_true, 1), [-1, 1]))
        #     z = K.sum(tf.mul(x, y2), 1)
        # except Exception as e:
        #     print e
        #     import pdb;pdb.set_trace()
        # return z
        return K.dot(K.binary_crossentropy(y_pred, y_true),
                     K.variable(feature_weights.astype('float32')))
    return loss
def meanShift(n_updates=-1):
    X1 = tf.expand_dims(tf.transpose(input_X), 0)
    X2 = tf.expand_dims(input_X, 0)
    C = init_C
    sbs_C = tf.TensorArray(dtype=tf.float32, size=10000, infer_shape=False)
    sbs_C = sbs_C.write(0, init_C)

    def _mean_shift_step(C):
        C = tf.expand_dims(C, 2)
        Y = tf.reduce_sum(tf.pow((C - X1) / window_radius, 2), axis=1)
        gY = tf.exp(-Y)
        num = tf.reduce_sum(tf.expand_dims(gY, 2) * X2, axis=1)
        denom = tf.reduce_sum(gY, axis=1, keep_dims=True)
        C = num / denom
        return C

    if n_updates > 0:
        for i in range(n_updates):
            C = _mean_shift_step(C)
            sbs_C = sbs_C.write(i + 1, C)
    else:
        def _mean_shift(i, C, sbs_C, max_diff):
            new_C = _mean_shift_step(C)
            max_diff = tf.reshape(tf.reduce_max(tf.sqrt(tf.reduce_sum(tf.pow(new_C - C, 2), axis=1))), [])
            sbs_C = sbs_C.write(i + 1, new_C)
            return i + 1, new_C, sbs_C, max_diff

        def _cond(i, C, sbs_C, max_diff):
            return max_diff > 1e-5

        n_updates, C, sbs_C, _ = tf.while_loop(cond=_cond,
                                               body=_mean_shift,
                                               loop_vars=(tf.constant(0), C, sbs_C, tf.constant(1e10)))

        n_updates = tf.Print(n_updates, [n_updates])

    return C, sbs_C.gather(tf.range(n_updates + 1))
def __get_grad_noise_scale(self, gradients):
    if self.cfg.grad_noise_decay is None:
        grad_noise_scale = self.cfg.grad_noise_scale
    elif self.cfg.grad_noise_decay == 'annealing':
        """
        Adds annealed gaussian noise to the gradients at every time step,
        by decaying the variance at each time step
        g_t <- g_t + N(0, sigma_t^2)
        sigma_t^2 = eta / (1 + t)^gamma
        with eta selected from {0.01, 0.3, 1.0) and gamma = 0.55
        See: "Adding gradient noise improves learning for very deep networks",
        http://arxiv.org/pdf/1511.06807v1.pdf
        """
        eta = self.cfg.grad_noise_scale ** 0.5
        gamma = 0.55 / 2
        grad_noise_scale = eta * tf.pow(tf.cast(
            self.global_step + 1, self.cfg._FLOATX), -gamma)
    elif self.cfg.grad_noise_decay == 'neural_gpu':
        if self.prev_err is None:
            grad_noise_scale = self.cfg.grad_noise_scale
        else:
            eta = self.cfg.grad_noise_scale
            gamma = 0.55
            grad_noise_scale = eta * tf.sqrt(
                self.prev_err * tf.pow(tf.cast(
                    self.global_step + 1, self.cfg._FLOATX), -gamma))
    else:
        # Raise ValueError
        raise NotImplementedError('Unknown value of '
                                  'cfg.grad_noise_decay: %s' % self.cfg.grad_noise_decay)

    return grad_noise_scale
def pow(x, a):
    '''Element-wise exponentiation.
    '''
    return tf.pow(x, a)