We extracted the following 45 code examples from open-source Python projects to illustrate how to use theano.clone().

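Before the extracted examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic call pattern: theano.clone() copies a graph while substituting the variables given in the replace argument.

import numpy
import theano
import theano.tensor as T

x = T.vector('x')
y = T.vector('y')
cost = ((x - y) ** 2).sum()

# Build a copy of `cost` in which `y` is replaced by 2 * x; the remaining
# inputs stay shared with the original graph (the default share_inputs=True).
cost2 = theano.clone(cost, replace={y: 2 * x})

f = theano.function([x], cost2)
print(f(numpy.asarray([1.0, 2.0], dtype=theano.config.floatX)))  # sum((x - 2*x)**2) = 5.0
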
def test_gt_grad():
    """A user test that failed.

    Something about it made Elemwise.grad return something that was
    too complicated for get_scalar_constant_value to recognize as being 0, so
    gradient.grad reported that it was not a valid gradient of an integer.

    """
    floatX = config.floatX
    T = theano.tensor

    input_ = T.vector(dtype=floatX)
    random_values = numpy.random.RandomState(1234).uniform(
        low=-1, high=1, size=(2, 2))
    W_values = numpy.asarray(random_values, dtype=floatX)
    W = theano.shared(value=W_values, name='weights')
    correct_score = T.dot(input_, W)
    wrong_input = T.vector(dtype=floatX)
    wrong_score = theano.clone(correct_score, {input_: wrong_input})
    # Hinge loss
    scores = T.ones_like(correct_score) - correct_score + wrong_score
    cost = (scores * (scores > 0)).sum()
    T.grad(cost, input_)

def infer_shape(self, node, shapes):
    out_shp = theano.scan_module.scan_utils.infer_shape(self.new_outputs,
                                                        self.new_inputs,
                                                        shapes)

    # Clone the output shapes so that they are computed from the outer inputs.
    # Note: we could do this more simply, like
    #     ret = [theano.clone(shp, replace=repl) for shp in out_shp]
    # but cloning multiple times could duplicate common subgraphs between the
    # shape calls. The Theano optimizer would clean this up later, but it
    # would mean extra work for the optimizer.
    repl = dict(zip(self.new_inputs, node.inputs))
    cloned = theano.clone(reduce(tuple.__add__, out_shp), replace=repl)
    ret = []
    used = 0
    for i in range(len(out_shp)):
        nb = len(out_shp[i])
        ret.append(cloned[used: used + nb])
        used += nb

    return ret

def reconstruct_graph(inputs, outputs, tag=None):
    """
    Different interface to clone, that allows you to pass inputs.
    Compared to clone, this method always replaces the inputs with
    new variables of the same type, and returns those (in the same
    order as the original inputs).

    """
    if tag is None:
        tag = ''
    nw_inputs = [safe_new(x, tag) for x in inputs]
    givens = OrderedDict()
    for nw_x, x in izip(nw_inputs, inputs):
        givens[x] = nw_x
    allinputs = theano.gof.graph.inputs(outputs)
    for inp in allinputs:
        if isinstance(inp, theano.Constant):
            givens[inp] = inp.clone()

    nw_outputs = clone(outputs, replace=givens)
    return (nw_inputs, nw_outputs)

def test_cloning_no_replace_strict_copy_inputs(self):
    # This has nothing to do with scan, but it refers to the clone
    # function that scan uses internally and that pfunc uses now and
    # that users might want to use
    x = theano.tensor.vector('x')
    y = theano.tensor.vector('y')
    z = theano.shared(0.25)

    f1 = z * (x + y) ** 2 + 5
    f2 = theano.clone(f1, replace=None, strict=True, share_inputs=True)
    f2_inp = theano.gof.graph.inputs([f2])

    assert z in f2_inp
    assert x in f2_inp
    assert y in f2_inp

def test_cloning_no_replace_strict_not_copy_inputs(self):
    # This has nothing to do with scan, but it refers to the clone
    # function that scan uses internally and that pfunc uses now and
    # that users might want to use
    x = theano.tensor.vector('x')
    y = theano.tensor.vector('y')
    z = theano.shared(0.25)

    f1 = z * (x + y) ** 2 + 5
    f2 = theano.clone(f1, replace=None, strict=True, share_inputs=False)
    f2_inp = theano.gof.graph.inputs([f2])

    assert z not in f2_inp
    assert x not in f2_inp
    assert y not in f2_inp

def test_cloning_replace_not_strict_copy_inputs(self):
    # This has nothing to do with scan, but it refers to the clone
    # function that scan uses internally and that pfunc uses now and
    # that users might want to use
    x = theano.tensor.vector('x')
    y = theano.tensor.fvector('y')
    y2 = theano.tensor.dvector('y2')
    z = theano.shared(0.25)

    f1 = z * (x + y) ** 2 + 5
    f2 = theano.clone(f1, replace=OrderedDict([(y, y2)]),
                      strict=False, share_inputs=True)
    f2_inp = theano.gof.graph.inputs([f2])

    assert z in f2_inp
    assert x in f2_inp
    assert y2 in f2_inp

def test_cloning_replace_strict_not_copy_inputs(self):
    # This has nothing to do with scan, but it refers to the clone
    # function that scan uses internally and that pfunc uses now and
    # that users might want to use
    x = theano.tensor.vector('x')
    y = theano.tensor.vector('y')
    y2 = theano.tensor.vector('y2')
    z = theano.shared(0.25)

    f1 = z * (x + y) ** 2 + 5
    f2 = theano.clone(f1, replace=[(y, y2)], strict=True,
                      share_inputs=False)
    f2_inp = theano.gof.graph.inputs([f2])

    assert z not in f2_inp
    assert x not in f2_inp
    assert y2 not in f2_inp

def test_cloning_replace_not_strict_not_copy_inputs(self):
    # This has nothing to do with scan, but it refers to the clone
    # function that scan uses internally and that pfunc uses now and
    # that users might want to use
    x = theano.tensor.vector('x')
    y = theano.tensor.fvector('y')
    y2 = theano.tensor.dvector('y2')
    z = theano.shared(0.25)

    f1 = z * (x + y) ** 2 + 5
    f2 = theano.clone(f1, replace=[(y, y2)], strict=False,
                      share_inputs=False)
    f2_inp = theano.gof.graph.inputs([f2])

    assert z not in f2_inp
    assert x not in f2_inp
    assert y2 not in f2_inp

# TEST RE-ordering of inputs
# some rnn with multiple outputs and multiple inputs; other
# dimension instead of scalars/vectors

def clone(**new_inputs):
    # Note: `self` and `inputs` are presumably taken from the enclosing
    # scope; this is a nested helper rather than a standalone function.
    new_obj = utils.copy(self)
    # Reorder inputs
    assert len(new_obj.inputs) == len(new_inputs.items())
    pairs = [(x, new_inputs[x.name]) for x in inputs]
    new_obj.inputs = new_inputs.values()
    new_obj.out = theano.clone(new_obj.out, replace=pairs)
    if hasattr(new_obj, 'cost'):
        new_obj.cost = theano.clone(new_obj.cost, replace=pairs)
    if hasattr(new_obj, 'grads'):
        new_obj.grads = theano.clone(new_obj.grads, replace=pairs)
    if hasattr(new_obj, 'sample'):
        new_obj.sample = theano.clone(new_obj.sample, replace=pairs)
    return new_obj

def __call__(self, cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    for p, g in zip(params, grads):
        v = theano.shared(p.get_value() * 0.)
        # Nesterov-style step: clone the gradient graph with `p` replaced
        # by the look-ahead point `p - mu * v`.
        new_v = self.mu * v + self.lr * theano.clone(
            g, replace={p: p - self.mu * v})
        updates.append((v, new_v))
        updates.append((p, p - new_v))

    return updates

def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic or self.fixed:
        # use stored mean and std
        mean = self.mean
        std = self.std
    else:
        # use this batch's mean and std
        mean = input.mean(self.axes, keepdims=True)
        # std = input.std(self.axes, keepdims=True)
        std = (input.var(self.axes, keepdims=True) + self.epsilon).sqrt()
        # and update the stored mean and std:
        # we create (memory-aliased) clones of the stored mean and std
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * std)
        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        std += 0 * running_std
        # std += self.epsilon

    mean = T.addbroadcast(mean, *self.axes)
    std = T.addbroadcast(std, *self.axes)
    beta = T.addbroadcast(self.beta, *self.axes)
    gamma = T.addbroadcast(self.gamma, *self.axes)

    # normalized = (input - mean) * (gamma / std) + beta
    normalized = (input - mean) / std
    if self.rescale:
        normalized = normalized * gamma + beta
    return self.nonlinearity(normalized)

def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        # use stored mean and std
        mean = self.mean
        std = self.std
    else:
        # use this batch's mean and std
        mean = input.mean(self.axes, keepdims=True)
        std = input.std(self.axes, keepdims=True)
        # and update the stored mean and std:
        # we create (memory-aliased) clones of the stored mean and std
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * std)
        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        std += 0 * running_std

    std += self.epsilon
    mean = T.addbroadcast(mean, *self.axes)
    std = T.addbroadcast(std, *self.axes)
    beta = T.addbroadcast(self.beta, *self.axes)
    gamma = T.addbroadcast(self.gamma, *self.axes)

    normalized = (input - mean) * (gamma / std) + beta
    return self.nonlinearity(normalized)

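The two layers above (and several of the batch-normalization examples further down) rely on the same Lasagne-style trick: theano.clone(shared_var, share_inputs=False) yields a memory-aliased copy of the shared variable whose default_update can be set independently. Below is a minimal, self-contained sketch of just that mechanism, with made-up variable names; it is not taken from any of the projects quoted here.

import numpy
import theano
import theano.tensor as T

x = T.vector('x')
alpha = 0.1
running_mean = theano.shared(
    numpy.zeros((1,), dtype=theano.config.floatX), name='running_mean')

batch_mean = x.mean(keepdims=True)

# Memory-aliased clone of the stored statistic; attaching a default_update
# to the clone schedules the running-average update whenever the clone
# appears in a compiled graph.
rm = theano.clone(running_mean, share_inputs=False)
rm.default_update = (1 - alpha) * rm + alpha * batch_mean

# Include the clone in the output so its default update is collected;
# the `0 *` term itself is optimized away.
out = batch_mean + 0 * rm

f = theano.function([x], out)
f(numpy.ones(4, dtype=theano.config.floatX))
print(running_mean.get_value())  # moved alpha of the way toward 1.0 -> [0.1]
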
def convolve(self, input, deterministic=False, **kwargs):
    """
    Binary convolution. Both inputs and weights are binary (+1 or -1).
    This overrides the convolve operation from the Conv2DLayer implementation.
    """
    if(self.xnor):
        # compute the binary inputs H and the scaling matrix K
        input, K = binarize_conv_input(input, self.beta_filter)

        # compute the binarized filters and the scaling matrix
        self.Wb, alpha = binarize_conv_filters(self.W)
        if not deterministic:
            old_alpha = theano.clone(self.xalpha, share_inputs=False)
            old_alpha.default_update = alpha
            alpha += 0 * old_alpha
        else:
            alpha = self.xalpha

        # TODO: Use XNOR ops for the convolution. As of now using Lasagne's
        # convolution for functionality verification.

        # approx weight tensor
        # W_full_precision = self.Wb * alpha.dimshuffle(0, 'x', 'x', 'x')
        Wr = self.W
        self.W = self.Wb

        feat_maps = super(Conv2DLayer, self).convolve(input, **kwargs)

        # restore the approx full precision weight for gradient computation
        # self.W = W_full_precision
        self.W = Wr

        # scale by K and alpha
        # FIXME: We are actually scaling after adding the bias here. We need
        # to scale first and then add the bias, but the superclass method
        # adds the bias automatically. One workaround: subtract the bias,
        # scale by alpha and beta, and then add the bias back.
        feat_maps = feat_maps * K
        feat_maps = feat_maps * alpha.dimshuffle('x', 0, 'x', 'x')
    else:
        feat_maps = super(Conv2DLayer, self).convolve(input, **kwargs)

    return feat_maps

def get_output_for(self, input, deterministic=False, **kwargs): """ Binary dense layer dot product computation """ if(self.xnor): # binarize the input bin_input, beta = binarize_fc_input(input) # compute weight scaling factor. self.Wb, alpha = binarize_fc_weights(self.W) if not deterministic: old_alpha = theano.clone(self.xalpha, share_inputs=False) old_alpha.default_update = alpha alpha += 0*old_alpha else: alpha = self.xalpha #W_full_precision = self.Wb * alpha.dimshuffle('x', 0) Wr = self.W self.W = self.Wb fc_out = super(DenseLayer, self).get_output_for(bin_input, **kwargs) # scale the output by alpha and beta # FIXME: Actually we are scaling after adding bias here. Need to scale first and then add bias. # The super class method automatically adds bias. Somehow need to overcome this.. # may subtract the bias, scale by alpha and beta ans then add bias ? fc_out = fc_out * beta.dimshuffle(0, 'x') fc_out = fc_out * alpha.dimshuffle('x', 0) #self.W = W_full_precision self.W = Wr else: fc_out = super(DenseLayer, self).get_output_for(input, **kwargs) return fc_out # find the dot product # scale the output by alpha and beta
def get_dOmega_dWrec(self, loss, x):
    # Pascanu's trick
    scan_node = x.owner.inputs[0].owner
    assert isinstance(scan_node.op, theano.scan_module.scan_op.Scan)
    npos = scan_node.op.n_seqs + 1
    init_x = scan_node.inputs[npos]
    g_x = theano.grad(loss, init_x)

    # To force immediate derivatives
    d_xt = T.tensor3('d_xt')
    xt = T.tensor3('xt')

    # Vanishing-gradient regularization
    self.bound = 1e-20
    self.lambda_Omega = 2

    # Wrec
    Wrec = self.params['Wrec']

    # Numerator
    alpha = self.alpha
    num = (1 - alpha)*d_xt[1:] + T.dot(alpha*d_xt[1:], Wrec.T)*self.df_hidden(xt)
    num = (num**2).sum(axis=2)

    # Denominator
    denom = (d_xt[1:]**2).sum(axis=2)

    # Omega
    bound = self.bound
    Omega = (T.switch(T.ge(denom, bound), num/denom, 1) - 1)**2
    nelems = T.mean(T.ge(denom, bound), axis=1)
    Omega = Omega.mean(axis=1).sum()/nelems.sum()

    # Gradient w.r.t. Wrec
    g_Wrec = theano.grad(Omega, Wrec)
    g_Wrec = theano.clone(g_Wrec, replace=[(d_xt, g_x), (xt, x)])

    return self.lambda_Omega * g_Wrec

def forced_replace(out, x, y):
    """
    Check all internal values of the graph that compute the variable ``out``
    for occurrences of values identical with ``x``. If such occurrences are
    encountered then they are replaced with variable ``y``.

    Parameters
    ----------
    out : Theano Variable
    x : Theano Variable
    y : Theano Variable

    Examples
    --------
    out := sigmoid(wu)*(1-sigmoid(wu))
    x := sigmoid(wu)
    forced_replace(out, x, y) := y*(1-y)

    """
    if out is None:
        return None

    # ``visited`` is a set of nodes that are already known and don't need to
    # be checked again, speeding up the traversal of multiply-connected
    # graphs.
    visited = set()

    def local_traverse(graph, x):
        if graph in visited:
            return []
        visited.add(graph)
        if equal_computations([graph], [x]):
            return [graph]
        elif not graph.owner:
            return []
        else:
            rval = []
            for inp in graph.owner.inputs:
                rval += local_traverse(inp, x)
            return rval

    to_replace = local_traverse(out, x)
    return clone(out, replace=OrderedDict((v, y) for v in to_replace))

def test_inplace3(self):
    rng = numpy.random.RandomState(utt.fetch_seed())

    vx0 = asarrayX(rng.uniform())
    vx1 = asarrayX(rng.uniform())
    x0 = theano.shared(vx0)
    x1 = theano.shared(vx1)
    outputs, updates = theano.scan(lambda x, y: (x + asarrayX(1),
                                                 y + asarrayX(1)),
                                   [],
                                   [x0, x1],
                                   n_steps=3)
    x0 = asarrayX(numpy.zeros((3,)))
    x0[0] = vx0
    x0 = theano.tensor.constant(x0)
    to_replace = outputs[0].owner.inputs[0].owner.inputs[1]
    outputs = theano.clone(outputs,
                           replace=[(to_replace, x0)])
    mode = theano.compile.mode.get_mode(None).including('inplace')
    f9 = theano.function([],
                         outputs,
                         updates=updates,
                         mode=mode)
    scan_node = [x for x in f9.maker.fgraph.toposort()
                 if isinstance(x.op, theano.scan_module.scan_op.Scan)]
    assert 0 not in scan_node[0].op.destroy_map.keys()
    assert 1 in scan_node[0].op.destroy_map.keys()

    # Shared variable with updates

def test_clone(self):
    def test(x, y, mention_y):
        if mention_y:
            d = 0.1 + 0 * y
        else:
            d = 0.1
        out = theano.clone(y, replace={x: x + d})
        # theano.printing.debugprint(out)
        return theano.function([], out)()

    x = theano.shared(numpy.asarray(0., dtype=theano.config.floatX))
    utt.assert_allclose(test(x, tensor.sum((x + 1) ** 2), mention_y=False),
                        1.21000003815)
    utt.assert_allclose(test(x, tensor.sum((x + 1) ** 2), mention_y=True),
                        1.21000003815)

def pseudograd(loss, params, srng=None, temperature=1.0e-1,
               learning_rate=1.0e-2, rho2=0.95):

    one = T.constant(1.0)
    zero = T.constant(0.0)

    deltas = [make_normal(param, srng=srng) for param in params]
    momentum = [make_copy(param) for param in params]

    new_params = [
        param + learning_rate * delta
        for param, delta, m in zip(params, deltas, momentum)
    ]

    new_loss = theano.clone(
        loss, replace=dict(zip(params, new_params))
    )

    accepting_p = T.exp((loss - new_loss) / temperature)
    u = srng.uniform(size=(), dtype=loss.dtype)

    cond = T.or_(T.or_(u > accepting_p, T.isnan(new_loss)),
                 T.isinf(new_loss))
    step = T.switch(cond, zero, one)

    updates = OrderedDict()

    for m, delta in zip(momentum, deltas):
        updates[m] = m * rho2 + (one - rho2) * delta * step

    for param, m in zip(params, momentum):
        updates[param] = param + learning_rate * m

    return updates

def add_layer(self, new_layer):
    '''Adds the given layer to the network'''
    self.layers.append(new_layer)
    self.output = theano.clone(new_layer.output,
                               replace={new_layer.input: self.output})
    self.size += new_layer.size

def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_std = TT.sqrt(input.var(self.axes) + self.epsilon)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = kwargs.get('batch_norm_use_averages',
                              deterministic)
    if use_averages:
        mean = self.mean
        std = self.std
    else:
        mean = input_mean
        std = input_std

    # Decide whether to update the stored averages
    update_averages = kwargs.get('batch_norm_update_averages',
                                 not deterministic)
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * input_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        std += 0 * running_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * TT.inv(std)) + beta
    return normalized

def get_grads(self, state_below, target, mask=None, reg=None,
              scale=None, sum_over_time=True, use_noise=True,
              additional_inputs=None):
    """
    This function implements both the forward and backward pass of this
    layer. The reason we do this in a single function is that, for the
    factorized softmax layer, it is hard to rely on grad and still get an
    optimized graph. For uniformity this method is implemented for this
    layer as well (though one does not need to use it).

    :param state_below: theano variable representing the input to the
        softmax layer
    :param target: theano variable representing the target for this layer
    :return: cost, dC_dstate_below, param_grads, new_properties
        dC_dstate_below is a computational graph representing the gradient
        of the cost wrt state_below
        param_grads is a list containing the gradients wrt the different
        parameters of the layer
        new_properties is a dictionary containing additional properties of
        the model; properties are theano expressions that are evaluated and
        reported by the model
    """
    cost = self.get_cost(state_below,
                         target,
                         mask=mask,
                         reg=reg,
                         scale=scale,
                         sum_over_time=sum_over_time,
                         use_noise=use_noise,
                         additional_inputs=additional_inputs)
    grads = TT.grad(cost, self.params)
    if self.additional_gradients:
        for new_grads, to_replace, properties in self.additional_gradients:
            gparams, params = new_grads
            prop_expr = [x[1] for x in properties]
            replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
            rval = theano.clone(gparams + prop_expr,
                                replace=replace)
            gparams = rval[:len(gparams)]
            prop_expr = rval[len(gparams):]
            self.properties += [(x[0], y) for x, y in zip(properties,
                                                          prop_expr)]
            for gp, p in zip(gparams, params):
                grads[self.params.index(p)] += gp

    self.cost = cost
    self.grads = grads

    def Gvs_fn(*args):
        w = (1 - self.model_output) * self.model_output * \
            state_below.shape[1]
        Gvs = TT.Lop(self.model_output, self.params,
                     TT.Rop(self.model_output,
                            self.params,
                            args) / w)
        return Gvs

    self.Gvs = Gvs_fn
    return cost, grads

def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_var = input.var(self.axes)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = kwargs.get('batch_norm_use_averages',
                              deterministic)
    if use_averages:
        mean = self.mean
        var = self.var
    else:
        mean = input_mean
        var = input_var

    # Decide whether to update the stored averages
    update_averages = kwargs.get('batch_norm_update_averages',
                                 not deterministic)
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_var = theano.clone(self.var, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_var.default_update = ((1 - self.alpha) * running_var +
                                      self.alpha * input_var)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        var += 0 * running_var

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(self.beta.ndim))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = self.beta.dimshuffle(pattern)
    gamma = self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = T.sqrt(var + self.epsilon)
    std = std.dimshuffle(pattern)

    # normalize
    # normalized = (input - mean) * (gamma / std) + beta
    normalized = T.nnet.batch_normalization(input, gamma=gamma, beta=beta,
                                            mean=mean, std=std,
                                            mode=self.mode)
    return self.nonlinearity(normalized)

def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = kwargs.get('batch_norm_use_averages',
                              deterministic)
    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # Decide whether to update the stored averages
    update_averages = kwargs.get('batch_norm_update_averages',
                                 not deterministic)
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) *
                                          running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized

def call(self, x, mask=None):
    input_dim = self.input_dim
    input_type = 'real'
    out_every_t = False
    loss_function = 'MSE'
    output_type = 'real'
    flag_feed_forward = False
    flag_use_mask = False
    hidden_bias_mean = np.float32(0.0)
    hidden_bias_init = 'zero'
    Wimpl = self.unitary_impl
    if ('full' in Wimpl):
        Wimpl = 'full'
    elif (Wimpl == 'ASB2016'):
        Wimpl = 'adhoc'
        # hidden_bias_init = 'rand'
    elif (Wimpl == 'ASB2016_fast'):
        Wimpl = 'adhoc_fast'
    n_layers = 1
    seed = 1234
    x_spec = K.permute_dimensions(x, (1, 0, 2))
    inputs, parameters, costs = models.complex_RNN(
        input_dim, self.hidden_dim, self.output_dim,
        input_type=input_type, out_every_t=out_every_t,
        loss_function=loss_function, output_type=output_type,
        flag_feed_forward=flag_feed_forward, flag_return_lin_output=True,
        x_spec=x_spec, flag_use_mask=flag_use_mask,
        hidden_bias_mean=hidden_bias_mean, Wimpl=Wimpl,
        flag_return_hidden_states=True, n_layers=n_layers, seed=seed,
        hidden_bias_init=hidden_bias_init)

    lin_output = costs[2]
    # self.hidden_states = costs[3]

    if (self.unitary_impl == 'full'):
        # just use lrng for learning rate on this parameter
        parameters[-1].name += 'full_natGrad'
    elif (self.unitary_impl == 'full_natGrad'):
        # use fixed lrng with natural gradient update
        parameters[-1].name += '_natGrad_unitaryAug'
    elif (self.unitary_impl == 'full_natGradRMS'):
        # use fixed lrng with natural gradient update and RMSprop-style
        # gradient adjustment
        parameters[-1].name += '_natGradRMS_unitaryAug'
    elif (self.unitary_impl == 'full_enforceComplex'):
        # swap out 2Nx2N augmented unitary matrix for Nx2N, which ensures the
        # complex number constraint is satisfied
        parameters[-1].name += 'full_natGrad'

        Waug = parameters[-1]
        WReIm = K.variable(value=Waug[:Waug.shape[1] / 2, :].eval(),
                           name=Waug.name)
        WaugFull = K.concatenate(
            (WReIm,
             K.concatenate((-WReIm[:, WReIm.shape[1] / 2:],
                            WReIm[:, :WReIm.shape[1] / 2]), axis=1)),
            axis=0)
        lin_output_new = theano.clone(lin_output,
                                      replace={parameters[-1]: WaugFull})
        lin_output = lin_output_new
        parameters[-1] = WReIm

    self.trainable_weights = parameters

    return lin_output

def get_output_for(self, input, deterministic=False,
                   batch_norm_use_averages=None,
                   batch_norm_update_averages=None):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # decide whether to use the stored averages or mini-batch statistics
    if batch_norm_use_averages is None:
        batch_norm_use_averages = deterministic
    use_averages = batch_norm_use_averages

    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # decide whether to update the stored averages
    if batch_norm_update_averages is None:
        batch_norm_update_averages = not deterministic
    update_averages = batch_norm_update_averages

    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics.
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) *
                                          running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized

def check_mat_rop_lop(self, y, out_shape):
    """
    Test the Rop/Lop when input is a matrix and the output is a vector.

    :param y: the output variable of the op applied to self.mx
    :param out_shape: Used to generate a random tensor corresponding to the
        evaluation point of the Rop (i.e. the tensor with which you multiply
        the Jacobian). It should be a tuple of ints.

    If the Op has more than 1 input, one of them must be mx, while others
    must be shared variables / constants. We will test only against the
    input self.mx, so you must call check_mat_rop_lop/check_rop_lop for the
    other inputs.

    We expect all inputs/outputs to have dtype floatX.

    If you want to test an Op with an output matrix, add a sum after the Op
    you want to test.
    """
    vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    yv = tensor.Rop(y, self.mx, self.mv)
    rop_f = function([self.mx, self.mv], yv, on_unused_input='ignore')
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(theano.clone(y,
                                        replace={self.mx: break_op(self.mx)}))

    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)
    yv = tensor.Lop(y, self.mx, self.v)
    lop_f = function([self.mx, self.v], yv)
    sy = tensor.grad((self.v * y).sum(), self.mx)
    scan_f = function([self.mx, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x, which is a vector.
    The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)
    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    known_fail = False
    try:
        self.check_nondiff_rop(theano.clone(y,
                                            replace={self.x: break_op(self.x)}))
    except AssertionError:
        known_fail = True

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)
    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

    if known_fail:
        raise SkipTest('Rop does not handle non-differentiable inputs '
                       'correctly. Bug exposed by fixing Add.grad method.')

def clone(output, replace=None, strict=True, share_inputs=True,
          copy_inputs=DEPRECATED_ARG):
    """
    Function that allows replacing subgraphs of a computational graph.

    It returns a copy of the initial subgraph with the corresponding
    substitutions.

    Parameters
    ----------
    output : Theano Variables (or Theano expressions)
        Theano expression that represents the computational graph.
    replace : dict
        Dictionary describing which subgraphs should be replaced by what.
    share_inputs : bool
        If True, use the same inputs (and shared variables) as the original
        graph. If False, clone them. Note that cloned shared variables still
        use the same underlying storage, so they will always have the same
        value.
    copy_inputs
        Deprecated, use share_inputs.

    """
    if copy_inputs is not DEPRECATED_ARG:
        warnings.warn('In `clone()` function, the argument `copy_inputs` '
                      'has been deprecated and renamed into `share_inputs`')
        assert share_inputs  # since we used `copy_inputs` we should have default value for `share_inputs`
        share_inputs = copy_inputs

    if isinstance(replace, dict):
        items = list(replace.items())
    elif isinstance(replace, (list, tuple)):
        items = replace
    elif replace is None:
        items = []
    else:
        raise ValueError(("replace is neither a dictionary, list, "
                          "tuple or None ! The value provided is %s,"
                          "of type %s") % (str(replace), str(type(replace))))
    tmp_replace = [(x, x.type()) for x, y in items]
    new_replace = [(x, y) for ((_, x), (_, y)) in zip(tmp_replace, items)]
    _, _outs, _ = rebuild_collect_shared(output,
                                         [],
                                         tmp_replace,
                                         [],
                                         strict,
                                         share_inputs)

    # TODO Explain why we call it twice ?!
    _, outs, _ = rebuild_collect_shared(_outs,
                                        [],
                                        new_replace,
                                        [],
                                        strict,
                                        share_inputs)

    return outs

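As a small side note on the share_inputs semantics described in the docstring above, here is a hedged sketch (not from the Theano test suite; the variable names are made up): with share_inputs=False the cloned graph gets fresh copies of its inputs, but, as the docstring says, a cloned shared variable still reflects the value of the original.

import numpy
import theano
import theano.tensor as T

x = T.vector('x')
w = theano.shared(numpy.asarray(0.5, dtype=theano.config.floatX), name='w')
expr = w * x

# share_inputs=False: the inputs of the cloned graph are fresh copies,
# so the original `x` and `w` no longer appear among them...
expr_copy = theano.clone(expr, replace=None, strict=True, share_inputs=False)
cloned_inputs = theano.gof.graph.inputs([expr_copy])
assert x not in cloned_inputs and w not in cloned_inputs

# ...but the cloned shared variable still uses the same underlying
# storage as `w`, so it always sees the current value.
w_copy = [v for v in cloned_inputs if v.name == 'w'][0]
w.set_value(numpy.asarray(2.0, dtype=theano.config.floatX))
assert float(w_copy.get_value()) == 2.0
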
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)

    yv = tensor.Rop(y, mx, mv)
    rop_f = function([mx, mv], yv)

    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = numpy.random.RandomState(utt.fetch_seed())
    vx = numpy.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = numpy.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    raised = False
    try:
        tensor.Rop(
            theano.clone(y, replace={mx: break_op(mx)}),
            mx,
            mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception((
            'Op did not raise an error even though the function'
            ' is not differentiable'))

    vv = numpy.asarray(rng.uniform(size=(4,)), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

def get_output_for(self, input, deterministic=False,
                   batch_norm_use_averages=None,
                   batch_norm_update_averages=None, **kwargs):
    input_mean = input.mean(self.axes)
    input_std = T.sqrt(input.var(self.axes) + self.epsilon)

    # Decide whether to use the stored averages or mini-batch statistics
    if batch_norm_use_averages is None:
        batch_norm_use_averages = deterministic
    use_averages = batch_norm_use_averages

    if use_averages:
        mean = self.mean
        std = self.std
    else:
        mean = input_mean
        std = input_std

    # Decide whether to update the stored averages
    if batch_norm_update_averages is None:
        batch_norm_update_averages = not deterministic
    update_averages = batch_norm_update_averages

    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * input_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        std += 0 * running_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma / std) + beta
    return normalized

def get_output(self, input, **kwargs):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
    # input_inv_std = T.inv(T.sqrt(input.var(self.axes)) + 1E-6)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = self.deterministic
    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # Decide whether to update the stored averages
    update_averages = self.update_averages and not use_averages
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) *
                                          running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized

def get_output(self, input, **kwargs):
    input_mean = input.mean(self.axes)
    # input_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
    input_std = T.sqrt(input.var(self.axes) + self.epsilon)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = self.deterministic
    if use_averages:
        mean = self.mean
        std = self.std
    else:
        mean = input_mean
        std = input_std

    # Decide whether to update the stored averages
    update_averages = self.update_averages and not use_averages
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * input_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        std += 0 * running_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = std.dimshuffle(pattern)

    # normalize
    # normalized = (input - mean) * (gamma * std) + beta
    normalized = batch_normalization(
        input, gamma, beta, mean, std, mode='low_mem')
    return self.activation(normalized)

def __call__(self, training, **kwargs):
    batch_norm_use_averages = kwargs.get('batch_norm_use_averages',
                                         not training)
    batch_norm_update_averages = kwargs.get('batch_norm_update_averages',
                                            training)
    inputs = self.get_input(training, **kwargs)
    outputs = []
    for input in inputs:
        input_mean = T.mean(input, self.axes)
        input_inv_std = 1. / T.sqrt(T.var(input, self.axes) + self.epsilon)

        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        if batch_norm_update_averages:
            if config.backend() == 'theano':
                # this trick is really fast and efficient, so I want to keep it
                import theano
                # Trick: To update the stored statistics, we create
                # memory-aliased clones of the stored statistics:
                running_mean = theano.clone(self.mean, share_inputs=False)
                running_inv_std = theano.clone(self.inv_std,
                                               share_inputs=False)
                # set a default update for them:
                running_mean.default_update = ((1 - self.alpha) *
                                               running_mean +
                                               self.alpha * input_mean)
                running_inv_std.default_update = ((1 - self.alpha) *
                                                  running_inv_std +
                                                  self.alpha * input_inv_std)
                # and make sure they end up in the graph without
                # participating in the computation (this way their
                # default_update will be collected and applied, but the
                # computation will be optimized away):
                mean += 0 * running_mean
                inv_std += 0 * running_inv_std
            elif config.backend() == 'tensorflow':
                T.add_global_updates(self.mean,
                                     ((1 - self.alpha) * self.mean +
                                      self.alpha * input_mean))
                T.add_global_updates(self.inv_std,
                                     ((1 - self.alpha) * self.inv_std +
                                      self.alpha * input_inv_std))

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(T.ndim(input) - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(T.ndim(input))]

        # apply dimshuffle pattern to all parameters
        beta = 0. if self.beta is None else T.dimshuffle(self.beta, pattern)
        gamma = 1. if self.gamma is None else T.dimshuffle(self.gamma, pattern)
        mean = T.dimshuffle(mean, pattern)
        inv_std = T.dimshuffle(inv_std, pattern)

        # normalize
        normalized = (input - mean) * (gamma * inv_std) + beta
        outputs.append(normalized)
    # ====== foot_print ====== #
    self._log_footprint(training, inputs, outputs)
    return outputs