The following 50 code examples, extracted from open-source Python projects, illustrate how to use theano.scan().
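Before the project snippets, here is a minimal, self-contained sketch (not drawn from any of the projects below) of the basic pattern they all build on: scan calls a step function once per element of `sequences`, threading the previous result back in through `outputs_info`.

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
# Running sum over x: the step receives (current element, previous accumulator).
sums, updates = theano.scan(
    fn=lambda x_t, acc: acc + x_t,
    sequences=x,
    outputs_info=T.zeros_like(x[0]))
cumsum = theano.function([x], sums, updates=updates)
print(cumsum(np.arange(5, dtype=theano.config.floatX)))  # [0. 1. 3. 6. 10.]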
def connect(self, inputs, mask, is_train):
    """ is_train: A boolean tensor. """
    max_length = inputs.shape[0]
    batch_size = inputs.shape[1]
    outputs_info = [tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim),
                    tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim)]

    # Dropout mask sharing for variational dropout.
    self.is_train = is_train
    if self.recurrent_dropout_layer != None:
        self.recurrent_dropout_layer.generate_mask([batch_size, self.hidden_dim], is_train)

    inputs = tensor.dot(inputs, self.W) + self.b

    rval, _ = theano.scan(self._step,                        # Scan function
                          sequences=[inputs, mask],          # Input sequence
                          outputs_info=outputs_info,
                          name=_p(self.prefix, '_layers'),
                          n_steps=max_length)                # scan steps
    return rval[0]
def connect(self, inputs, mask, is_train):
    max_length = inputs.shape[0]
    batch_size = inputs.shape[1]
    outputs_info = [tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim),
                    tensor.alloc(numpy_floatX(0.), batch_size, self.hidden_dim)]

    # Dropout layers
    self.is_train = is_train
    if self.recurrent_dropout_layer != None:
        self.recurrent_dropout_layer.generate_mask([batch_size, self.hidden_dim], is_train)

    proj_inputs = tensor.dot(inputs, self.W) + self.b

    rval, _ = theano.scan(self._step,                              # Scan function
                          sequences=[inputs, proj_inputs, mask],   # Input sequence
                          outputs_info=outputs_info,
                          name=_p(self.prefix, '_layers'),
                          n_steps=max_length)                      # scan steps
    return rval[0]
def sample_scan(self, x, sigma, n_steps, samples):
    # Enable on-the-fly graph computations
    # theano.config.compute_test_value = "raise"
    in_val = T.fmatrix("input_values")
    # in_val.tag.test_value = np.asarray(
    #     np.random.rand(1, 784), dtype=theano.config.floatX)
    s_sigma = T.fscalar("sigma_values")
    # s_sigma = np.asarray(
    #     np.random.rand(1), dtype=theano.config.floatX)
    mode = "FAST_RUN"
    values, updates = theano.scan(fn=self.sample_one_step,
                                  outputs_info=in_val,
                                  non_sequences=s_sigma,
                                  n_steps=n_steps,
                                  mode=mode)
    ae_sampler = theano.function(inputs=[in_val, s_sigma],
                                 outputs=values[-1],
                                 updates=updates)
    samples = ae_sampler(x, sigma)
    return samples
def gru_layer(tparams, emb, options):
    hiddenDimSize = options['hiddenDimSize']
    timesteps = emb.shape[0]
    if emb.ndim == 3:
        n_samples = emb.shape[1]
    else:
        n_samples = 1

    def stepFn(wx, h, U_gru):
        uh = T.dot(h, U_gru)
        r = T.nnet.sigmoid(_slice(wx, 0, hiddenDimSize) + _slice(uh, 0, hiddenDimSize))
        z = T.nnet.sigmoid(_slice(wx, 1, hiddenDimSize) + _slice(uh, 1, hiddenDimSize))
        h_tilde = T.tanh(_slice(wx, 2, hiddenDimSize) + r * _slice(uh, 2, hiddenDimSize))
        h_new = z * h + ((1. - z) * h_tilde)
        return h_new

    Wx = T.dot(emb, tparams['W_gru']) + tparams['b_gru']
    results, updates = theano.scan(fn=stepFn,
                                   sequences=[Wx],
                                   outputs_info=T.alloc(numpy_floatX(0.0), n_samples, hiddenDimSize),
                                   non_sequences=[tparams['U_gru']],
                                   name='gru_layer',
                                   n_steps=timesteps)
    return results
def get_output_for(self, inputs, **kwargs):
    vals, ref = inputs

    def filt(V, R):
        if self.norm_type is not None:
            o = tt.ones((1, V.shape[1], V.shape[2]), np.float32)
            norm = gaussian_filter(R, o, self.kern_std, self.ref_dim)
            norm = tt.sqrt(norm) if self.norm_type == "sym" else norm
            norm += 1e-8

        V = V / norm if self.norm_type in ["pre", "sym"] else V
        F = gaussian_filter(R, V, self.kern_std)
        return F / norm if self.norm_type in ["post", "sym"] else F

    filtered = theano.scan(fn=filt, sequences=[vals, ref], outputs_info=None)[0]
    return filtered
def set_net_params(self):
    '''Returns MLP parameters for scan.'''
    super(GRU, self).set_net_params()

    if self.input_net_aux is None:
        self.input_net_aux = MLP(
            self.dim_in, 2 * self.dim_h, 2 * self.dim_hs[0], 1,
            rng=self.rng, trng=self.trng,
            h_act='T.nnet.sigmoid', out_act='T.tanh',
            name='input_net_aux')
    else:
        assert self.input_net_aux.dim_in == self.dim_in
        assert self.input_net_aux.dim_out == 2 * self.dim_hs[0]
    self.input_net_aux.name = self.name + '_input_net_aux'

    self.nets.append(self.input_net_aux)

    for i in xrange(self.n_layers - 1):
        n = MLP(self.dim_hs[i], 2 * self.dim_hs[i+1],
                rng=self.rng, trng=self.trng,
                distribution='centered_binomial',
                name='rnn_net_aux%d' % i)
        self.inter_nets.append(n)  # insert(2 * i + 1, n)
def step_call(self, x, h0, c0, condition_on, *params):
    n_steps = x.shape[0]
    n_samples = x.shape[1]

    seqs = self.call_seqs(x, condition_on, *params)
    outputs_info = [h0, c0]
    non_seqs = self.get_recurrent_args(*params)

    (h, c), updates = theano.scan(
        self._step,
        sequences=seqs,
        outputs_info=outputs_info,
        non_sequences=non_seqs,
        name=self.name + '_recurrent_steps',
        n_steps=n_steps,
        strict=True)

    o_params = self.get_output_args(*params)
    out_net_out = self.output_net.step_call(h, *o_params)
    preact = out_net_out['z']
    p = out_net_out['p']
    # y = self.output_net.sample(p=p)

    return OrderedDict(h=h, p=p, z=preact), updates
def call_seqs(self, x, condition_on, level, *params):
    '''Prepares the input for __call__

    Args:
        x (T.tensor): input
        condition_on (T.tensor or None): tensor to condition recurrence on.
        level (int): recurrent level.
        *params: list of theano.shared.

    Returns:
        list: list of scan inputs.

    '''
    if level == 0:
        i_params = self.get_input_args(*params)
        a = self.input_net.step_preact(x, *i_params)
    else:
        i_params = self.get_inter_args(level - 1, *params)
        a = self.inter_nets[level - 1].step_preact(x, *i_params)

    if condition_on is not None:
        a += condition_on

    return [a]
def shuffle_columns(x, srng):
    '''Shuffles a tensor along the second index.

    Args:
        x (T.tensor).
        srng (sharedRandomstream).

    '''
    def step_shuffle(m, perm):
        return m[perm]

    perm_mat = srng.permutation(n=x.shape[0], size=(x.shape[1],))
    y, _ = scan(step_shuffle, [x.transpose(1, 0, 2), perm_mat], [None], [],
                x.shape[1], name='shuffle', strict=False)
    return y.transpose(1, 0, 2)
def ctc_path_probs(predict, Y, alpha=1e-4):
    smoothed_predict = (1 - alpha) * predict[:, Y] + alpha * np.float32(1.) / Y.shape[0]
    L = T.log(smoothed_predict)
    zeros = T.zeros_like(L[0])
    log_first = zeros

    f_skip_idxs = ctc_create_skip_idxs(Y)
    b_skip_idxs = ctc_create_skip_idxs(Y[::-1])  # there should be a shortcut to calculating this

    def step(log_f_curr, log_b_curr, f_active, log_f_prev, b_active, log_b_prev):
        f_active_next, log_f_next = ctc_update_log_p(f_skip_idxs, zeros, f_active, log_f_curr, log_f_prev)
        b_active_next, log_b_next = ctc_update_log_p(b_skip_idxs, zeros, b_active, log_b_curr, log_b_prev)
        return f_active_next, log_f_next, b_active_next, log_b_next

    [f_active, log_f_probs, b_active, log_b_probs], _ = theano.scan(
        step,
        sequences=[L, L[::-1, ::-1]],
        outputs_info=[np.int32(1), log_first, np.int32(1), log_first])

    idxs = T.arange(L.shape[1]).dimshuffle('x', 0)
    mask = (idxs < f_active.dimshuffle(0, 'x')) & (idxs < b_active.dimshuffle(0, 'x'))[::-1, ::-1]
    log_probs = log_f_probs + log_b_probs[::-1, ::-1] - L
    return log_probs, mask
def gru_layer(tparams, emb, layerIndex, hiddenDimSize, mask=None):
    timesteps = emb.shape[0]
    if emb.ndim == 3:
        n_samples = emb.shape[1]
    else:
        n_samples = 1

    W_rx = T.dot(emb, tparams['W_r_'+layerIndex])
    W_zx = T.dot(emb, tparams['W_z_'+layerIndex])
    Wx = T.dot(emb, tparams['W_'+layerIndex])

    def stepFn(stepMask, wrx, wzx, wx, h):
        r = T.nnet.sigmoid(wrx + T.dot(h, tparams['U_r_'+layerIndex]) + tparams['b_r_'+layerIndex])
        z = T.nnet.sigmoid(wzx + T.dot(h, tparams['U_z_'+layerIndex]) + tparams['b_z_'+layerIndex])
        h_tilde = T.tanh(wx + T.dot(r*h, tparams['U_'+layerIndex]) + tparams['b_'+layerIndex])
        h_new = z * h + ((1. - z) * h_tilde)
        h_new = stepMask[:, None] * h_new + (1. - stepMask)[:, None] * h
        return h_new  # , output, time

    results, updates = theano.scan(fn=stepFn,
                                   sequences=[mask, W_rx, W_zx, Wx],
                                   outputs_info=T.alloc(numpy_floatX(0.0), n_samples, hiddenDimSize),
                                   name='gru_layer'+layerIndex,
                                   n_steps=timesteps)
    return results
def gru_layer(tparams, emb, layerIndex, hiddenDimSize, mask=None):
    timesteps = emb.shape[0]
    if emb.ndim == 3:
        n_samples = emb.shape[1]
    else:
        n_samples = 1

    W_rx = T.dot(emb, tparams['W_r_'+layerIndex])
    W_zx = T.dot(emb, tparams['W_z_'+layerIndex])
    Wx = T.dot(emb, tparams['W_'+layerIndex])

    def stepFn(stepMask, wrx, wzx, wx, h):
        r = T.nnet.sigmoid(wrx + T.dot(h, tparams['U_r_'+layerIndex]) + tparams['b_r_'+layerIndex])
        z = T.nnet.sigmoid(wzx + T.dot(h, tparams['U_z_'+layerIndex]) + tparams['b_z_'+layerIndex])
        h_tilde = T.tanh(wx + T.dot(r*h, tparams['U_'+layerIndex]) + tparams['b_'+layerIndex])
        h_new = z * h + ((1. - z) * h_tilde)
        h_new = stepMask[:, None] * h_new + (1. - stepMask)[:, None] * h
        return h_new

    results, updates = theano.scan(fn=stepFn,
                                   sequences=[mask, W_rx, W_zx, Wx],
                                   outputs_info=T.alloc(numpy_floatX(0.0), n_samples, hiddenDimSize),
                                   name='gru_layer'+layerIndex,
                                   n_steps=timesteps)
    return results
def model(inputs, _is_training, params, batch_size, hidden_size, drop_i, drop_s, init_scale, init_H_bias, _theano_rng):
    noise_i_for_H = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng)
    i_for_H = ifelse(_is_training, inputs * noise_i_for_H, inputs)
    i_for_H = linear.model(i_for_H, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)

    # Dropout noise for recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)

    def step(i_for_H_t, y_tm1, noise_s):
        s_lm1_for_H = ifelse(_is_training, y_tm1 * noise_s, y_tm1)
        return T.tanh(i_for_H_t + linear.model(s_lm1_for_H, params, hidden_size, hidden_size, init_scale))

    y_0 = shared_zeros((batch_size, hidden_size), name='h0')
    y, _ = theano.scan(step, sequences=i_for_H, outputs_info=[y_0], non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
def generative_sampling(self, seed, emb_data, sample_length):
    fruit = theano.shared(value=seed)

    def step(h_tm, y_tm):
        h_t = self.activation(T.dot(emb_data[y_tm], self.W) +
                              T.dot(h_tm, self.U) + self.bh)
        y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
        y = T.argmax(y_t, axis=1)
        return h_t, y[0]

    [_, samples], _ = theano.scan(fn=step,
                                  outputs_info=[self.h0, fruit],
                                  n_steps=sample_length)

    get_samples = theano.function(inputs=[], outputs=samples)
    return get_samples()
def _labeling_batch_to_class_batch(y, y_labeling, num_classes, y_hat_mask=None):
    # FIXME: y_hat_mask is currently not used
    batch_size = y.shape[1]
    N = y_labeling.shape[0]
    n_labels = y.shape[0]
    # sum over all repeated labels
    # from (T, B, L) to (T, C, B)
    out = T.zeros((num_classes, batch_size, N))
    y_labeling = y_labeling.dimshuffle((2, 1, 0))  # L, B, T
    y_ = y

    def scan_step(index, prev_res, y_labeling, y_):
        res_t = T.inc_subtensor(prev_res[y_[index, T.arange(batch_size)],
                                         T.arange(batch_size)],
                                y_labeling[index, T.arange(batch_size)])
        return res_t

    result, updates = theano.scan(scan_step,
                                  sequences=[T.arange(n_labels)],
                                  non_sequences=[y_labeling, y_],
                                  outputs_info=[out])
    # result will be (C, B, T) so we make it (T, B, C)
    return result[-1].dimshuffle(2, 1, 0)
def get_cost(self, X, Y, X_sizes):
    """
    Calculates the cost for each value in the mini batch, regularizes
    all the input parameters, and then returns the final cost function
    as a theano variable.
    """
    cost_fn, _ = theano.scan(
        fn=self.get_likelihood,
        sequences=[X, Y, X_sizes]
    )
    cost_fn = cost_fn.mean()

    cost_fn += self.reg_lambda * T.sqr(self.W_c_r).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.W_c_l).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.W_conv).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.W_output).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.b_output).sum() / 2.

    # Regularizing word embedding
    cost_fn += self.reg_lambda * T.sqr(self.vector_dict).sum() / 2

    return cost_fn
def get_output(self, train=False):
    input = self.get_input(train)
    proj_input = self.activation(T.tensordot(input, self.att_proj, axes=(3, 0)))
    if self.context == 'word':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
        def step(a_t, h_tm1, W_in, W, sc):
            h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2, 0)) + T.tensordot(h_tm1, W, axes=(2, 0)))
            s_t = T.tensordot(h_t, sc, axes=(2, 0))
            return h_t, s_t
        [_, scores], _ = theano.scan(step,
                                     sequences=[proj_input.dimshuffle(2, 0, 1, 3)],
                                     outputs_info=[T.zeros((proj_input.shape[0], self.td1, self.rec_hid_dim)), None],
                                     non_sequences=[self.rec_in_weights, self.rec_hid_weights, self.att_scorer])
        att_scores = scores.dimshuffle(1, 2, 0)
    elif self.context == 'para':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))

    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
        sample_att_inp, _ = theano.scan(fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i),
                                        sequences=[T.nnet.softmax(sample_att), sample_input])
        return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att, sequences=[input, att_scores])
    return att_input
def _ctc_normal(self, predict, labels):
    n = labels.shape[0]

    labels2 = T.concatenate((labels, [self.tpo["CTC_blank"], self.tpo["CTC_blank"]]))
    sec_diag = T.neq(labels2[:-2], labels2[2:]) * \
               T.eq(labels2[1:-1], self.tpo["CTC_blank"])

    recurrence_relation = \
        T.eye(n) + \
        T.eye(n, k=1) + \
        T.eye(n, k=2) * sec_diag.dimshuffle((0, 'x'))

    pred_y = predict[:, labels]

    probabilities, _ = theano.scan(
        lambda curr, accum: curr * T.dot(accum, recurrence_relation),
        sequences=[pred_y],
        outputs_info=[T.eye(n)[0]]
    )

    labels_probab = T.sum(probabilities[-1, -2:])
    return -T.log(labels_probab)
def sequence_iteration(self, in_seq, mask, use_dropout, dropout_value=1):
    in_seq_d = T.switch(use_dropout,
                        (in_seq *
                         self.trng.binomial(in_seq.shape,
                                            p=dropout_value, n=1,
                                            dtype=in_seq.dtype)),
                        in_seq)

    rz_in_seq = T.add(T.dot(in_seq_d, self.weights[0]), self.weights[1])

    out_seq, updates = theano.scan(
        fn=self.t_forward_step,
        sequences=[mask, rz_in_seq],  # in_seq_d],
        outputs_info=[self.t_ol_t00],
        non_sequences=[i for i in self.weights][2:] + [self.t_n_out],
        go_backwards=self.go_backwards,
        truncate_gradient=-1,
        # n_steps=50,
        strict=True,
        allow_gc=False,
    )
    return out_seq
def _semi_lagrangian_displacement(self, v_sampled, grid_points, dt):
    """
    Semi-Lagrangian scheme.
    Given a downsampled velocity field v (which will be linearly interpolated),
    we find "where the information came from", i.e. numerically invert its flow
    during a time-step dt on the 'grid_points'.

    To do so, we simply solve the fixed point equation
        a(y)/2 = (dt/2) * v( y - a(y)/2 )
    by a "Picard-like" iterative scheme, where y is a grid point,
    and -a(y) the corresponding "backward" vector.
    """
    def f(r):
        return .5 * dt * self._linear_interp_downsampledfield(v_sampled, grid_points - r)

    # Theano on GPU requires float32, i.e. explicit downcast from numpy float64 type:
    r_0 = np.zeros((np.prod(self.image_shape), self.image_dimension), dtype=config.floatX)
    result, updates = theano.scan(fn=f,                 # Iterated routine
                                  outputs_info=[r_0],   # Starting estimate for r
                                  n_steps=5)            # Number of iterations, sufficient in practice
    r_inf = result[-1]   # We only keep the end result
    return 2. * r_inf    # displacement "alpha"
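For readers unfamiliar with the fixed-point scheme described in the docstring above, a tiny NumPy sketch (independent of that class, with a made-up contractive map) of the same Picard-style iteration that the scan over `f` performs symbolically:

import numpy as np

def picard_fixed_point(f, r0, n_steps=5):
    # Repeatedly apply f; for a contractive map this converges to the fixed point r = f(r).
    r = r0
    for _ in range(n_steps):
        r = f(r)
    return r

# Solves r = 0.5 * cos(r); converges to roughly 0.450.
print(picard_fixed_point(lambda r: 0.5 * np.cos(r), 0.0, n_steps=20))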
def _HamiltonianShootingCarrying(self, q, p, i0):
    """
    Given initial control points/momentums q0 and p0 given as n-by-d matrices,
    and a "template" image i0, outputs the trajectories q_t, p_t, I_t = I0 \circ phi_{t->0}.
    """
    identity = T.as_tensor_variable(0. * self.dense_grid())  # We encode the identity as a null displacement field.

    # Here, we use the "scan" theano routine, which can be understood as a "for" loop
    result, updates = theano.scan(fn=lambda x, y, z: self._hamiltonian_step_carrying2(x, y, z),
                                  outputs_info=[q, p, identity],
                                  n_steps=int(np.round(1 / self.dt)))

    phi_inv_1 = result[2][-1]  # We do not store the intermediate results
    I1 = self._image_circ_diffeo(i0, self.dense_grid() + phi_inv_1)  # instead of interpolating the images It at all timesteps, we only do it in the end.

    return [result[0][-1], result[1][-1], I1]  # and only return the final state + momentum + image
def sym_logdensity(self, x):
    """ x is a matrix of column datapoints (VxB) V = n_visible, B = batch size """
    def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activations_factor, p_prev, a_prev, x_prev):
        a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
        h = self.nonlinearity(a * activations_factor)  # BxH

        Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
        Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
        Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
        p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma)
                                 - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi)) + T.log(Alpha))
        return (p, a, x)

    # First element is different (it is predicted from the bias only)
    a0 = T.zeros_like(T.dot(x.T, self.W))  # BxH
    p0 = T.zeros_like(x[0])
    x0 = T.ones_like(x[0])
    ([ps, _as, _xs], updates) = theano.scan(density_given_previous_a_and_x,
                                            sequences=[x, self.W, self.V_alpha, self.b_alpha, self.V_mu,
                                                       self.b_mu, self.V_sigma, self.b_sigma,
                                                       self.activation_rescaling],
                                            outputs_info=[p0, a0, x0])
    return (ps[-1], updates)
def sym_logdensity(self, x):
    """ x is a matrix of column datapoints (VxB) V = n_visible, B = batch size """
    def density_given_previous_a_and_x(x, w, v, b, activations_factor, p_prev, a_prev, x_prev):
        a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
        h = self.nonlinearity(a * activations_factor)  # BxH
        t = T.dot(h, v) + b
        # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
        p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)
        p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 - p_xi_is_one)
        return (p, a, x)

    # First element is different (it is predicted from the bias only)
    a0 = T.zeros_like(T.dot(x.T, self.W))  # BxH
    p0 = T.zeros_like(x[0])
    x0 = T.ones_like(x[0])
    ([ps, _, _], updates) = theano.scan(density_given_previous_a_and_x,
                                        sequences=[x, self.W, self.V, self.b, self.activation_rescaling],
                                        outputs_info=[p0, a0, x0])
    return (ps[-1], updates)
def get_y_prob(self, h, y):
    """
    :param h: 1D: n_words, 2D: Batch, 3D: n_y
    :param y: 1D: n_words, 2D: Batch
    :return: gradient of cross entropy: 1D: Batch
    """
    batch_index = T.arange(h.shape[1])
    z_score0 = self.BOS + h[0]              # 1D: batch, 2D: n_y
    y_score0 = z_score0[batch_index, y[0]]  # 1D: batch

    [_, y_scores, z_scores], _ = theano.scan(fn=self._forward_step,
                                             sequences=[h[1:], y[1:]],
                                             outputs_info=[y[0], y_score0, z_score0],
                                             non_sequences=[self.W_t, batch_index])

    y_score = y_scores[-1]
    z_score = logsumexp(z_scores[-1], axis=1).flatten()

    return y_score - z_score
def get_layer(self, x_in):
    assert x_in.ndim == 2
    n_steps = x_in.shape[0]

    def __slice(x_, n, dim):
        return x_[n * dim: (n + 1) * dim]

    def __step(x_, h_, c_):
        preact = T.dot(h_, self._params['U']) + x_ + self._params['b']
        i = T.nnet.sigmoid(__slice(preact, 0, self._ydim))
        f = T.nnet.sigmoid(__slice(preact, 1, self._ydim))
        o = T.nnet.sigmoid(__slice(preact, 2, self._ydim))
        c = T.tanh(__slice(preact, 3, self._ydim))
        c = f * c_ + i * c
        h = o * T.tanh(c)
        return h, c

    x_in = T.dot(x_in, self._params['W']) + self._params['b']
    rval, updates = theano.scan(__step,
                                sequences=x_in,
                                go_backwards=self.go_backwards,
                                outputs_info=[T.alloc(np_floatX(0.), self._ydim),
                                              T.alloc(np_floatX(0.), self._ydim)],
                                name='lstm_layers',
                                n_steps=n_steps)
    return reverse(rval[0]) if self.go_backwards else rval[0]
def get_layer(self, x_in, C_in, ty_i):
    # op,
    n_steps = C_in.shape[0]

    def __logsumexp(x, axis=None):
        xmax = x.max(axis=axis, keepdims=True)
        xmax_ = x.max(axis=axis)
        return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))

    def __step(_C, _x):
        # scores = T.dot( T.dot(_x, self._params['U']) + self._params['b'], self._params['v0'])
        scores = T.dot(T.nnet.sigmoid(T.dot(_x, self._params['U1']) + T.dot(_C, self._params['U2']) + self._params['b']),
                       self._params['v0'])
        return scores.flatten()

    y_out, _ = theano.scan(
        __step,
        sequences=C_in,
        non_sequences=x_in,
        name='classification_layer',
        n_steps=n_steps)

    norm_y = y_out.flatten() - __logsumexp(y_out)

    f_lc_debug = theano.function([x_in, C_in, ty_i], [y_out, norm_y, norm_y[ty_i]])
    return norm_y[ty_i], T.argmax(norm_y), f_lc_debug
def tagged_sequence_unnormalized_score_in_order_one_crf(input_tv, y, l):
    ''' Simply sum the log-scores along the path suggested by `y` in the `input_tv` tensor.

    Params
    ------
    input_tv : A 3D tensor of (token, prev_pos, cur_pos) log scores.
               the input_tv also contains scores of
    y        : The true sequence that was actually followed.
    l        : The score of (EOS | tag)
    '''
    def _score_step(o, y, p_, y_):
        return ((p_ + o[y_, y]), y)

    [rval, _], _ = theano.scan(_score_step,
                               sequences=[input_tv[1:, :-1], y[1:]],
                               # sequences=[input_tv, y],
                               outputs_info=[input_tv[0, -1, y[0]], y[0]],
                               # outputs_info=[0.0, numpy.int32(-1)],
                               name='OrderOnePathMax_scan_score_step',
                               strict=True)
    return rval[-1] + l[y[-1]]
def retrieve_path_from_backpointers(bp, starting_point):
    ''' Theano scan loop to follow backpointers, starting from a given spot.

    Params
    ------
    bp             : The trail of backpointers. Think of this as a list of lists
                     where we start from the back `bp = list[N][starting_point]`
                     and then go to list[N-1][bp] and so on.
    starting_point :
    '''
    vp_prefix = th_reverse(
        theano.scan(
            lambda p, y: p[y],
            sequences=bp,
            outputs_info=starting_point,
            go_backwards=True,
            name='OrderOnePathMax_scan__bkpntr',
            strict=True)[0])
    return theano.tensor.concatenate([vp_prefix, starting_point.dimshuffle('x')])
def inner_fn_sample_actions_given(oat_given, stm1):

    st0_condition = theano.shared(name='st0_condition',
                                  value=numpy.random.randn(n_s, n_samples).astype(dtype=theano.config.floatX),
                                  borrow=True)
    ot0_condition = theano.shared(name='ot0_condition',
                                  value=numpy.random.randn(n_o, n_samples).astype(dtype=theano.config.floatX),
                                  borrow=True)
    oht0_condition = theano.shared(name='oht0_condition',
                                   value=numpy.random.randn(n_oh, n_samples).astype(dtype=theano.config.floatX),
                                   borrow=True)
    oat0_condition = theano.shared(name='oat0_condition',
                                   value=numpy.random.randn(n_oa, n_samples).astype(dtype=theano.config.floatX),
                                   borrow=True)

    # Iterate MCMC sampler to approximate constrained probabilities
    # p(o,oh|oa) of observations, given a sequence of proprioceptive
    # inputs oa
    # c.f. https://arxiv.org/abs/1401.4082, Appendix F.
    ((st, ot, oht, oat), _) = theano.scan(fn=inner_fn_condition,
                                          outputs_info=[st0_condition, ot0_condition, oht0_condition, oat0_condition],
                                          non_sequences=[oat_given, stm1],
                                          n_steps=n_iterations_ag)

    st = st[-1]
    ot = ot[-1]
    oht = oht[-1]
    oat = oat[-1]

    return st, ot, oht, oat

# Define initial state and action
def renet_layer_lr_allscan(X, rnn1, rnn2, w, h, wp, hp):
    # list_of_images = []
    C = X.shape[0]
    X = X.dimshuffle((1, 0, 2)).reshape((h/hp, hp*C*w))  # split the rows for the first scan

    def rnn_pass(x):
        x = x.reshape((hp, C, w)).dimshuffle((2, 1, 0)).reshape((w/wp, C*wp*hp))
        h1 = rnn1.output(x)
        h2 = rnn2.output(x, go_backwards=True)
        img = T.concatenate([h1.T, h2.T])
        # list_of_images.append(img)
        return img

    results, _ = theano.scan(
        fn=rnn_pass,
        sequences=X,
        outputs_info=None,
        n_steps=h/hp,
    )
    return results.dimshuffle((1, 0, 2))
    # return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def forward_all(self, x, masks=None, h0=None, return_c=False, direction=None):
    if h0 is None:
        if x.ndim > 1:
            h0 = T.zeros((x.shape[1], self.n_out*(self.order+1)), dtype=theano.config.floatX)
        else:
            h0 = T.zeros((self.n_out*(self.order+1),), dtype=theano.config.floatX)

    if masks == None:
        masks = T.ones((x.shape[0], x.shape[1]), dtype=theano.config.floatX)

    h, _ = theano.scan(
        fn=self.forward,
        sequences=[x, masks],
        outputs_info=[h0]
    )
    if return_c:
        return h
    elif x.ndim > 1:
        return h[:, :, self.n_out*self.order:]
    else:
        return h[:, self.n_out*self.order:]
def for_loop(step_function, inputs, initial_hidden_states, go_backwards):
    """ inputs: time axis must be first """
    results = theano.scan(step_function,
                          sequences=inputs,
                          outputs_info=initial_hidden_states,
                          go_backwards=go_backwards)[0]  # screw the updates

    # when results has length 1, it is not returned as a list. wrap it
    if (isinstance(results, list) == False):
        results = [results]

    # put the batch axis back in front
    results = [dimshuffle(tensor, [1, 0] + [x for x in xrange(2, tensor.ndim)])
               for tensor in results]
    return results
def get_output(self, train=False):
    X = self.get_input(train)  # shape: (nb_samples, time (padded with zeros), input_dim)
    # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))
    x = T.dot(X, self.W) + self.b

    # scan = theano symbolic loop.
    # See: http://deeplearning.net/software/theano/library/scan.html
    # Iterate over the first dimension of the x array (=time).
    outputs, updates = theano.scan(
        self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
        sequences=[x, dict(input=padded_mask, taps=[-1])],  # tensors to iterate over, inputs to _step
        # initialization of the output. Input to _step with default tap=-1.
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=self.U,  # static inputs to _step
        truncate_gradient=self.truncate_gradient
    )

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    x_z = T.dot(X, self.W_z) + self.b_z
    x_r = T.dot(X, self.W_r) + self.b_r
    x_h = T.dot(X, self.W_h) + self.b_h
    outputs, updates = theano.scan(
        self._step,
        sequences=[x_z, x_r, x_h, padded_mask],
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=[self.U_z, self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient
    )

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    xi = T.dot(X, self.W_i) + self.b_i
    xf = T.dot(X, self.W_f) + self.b_f
    xc = T.dot(X, self.W_c) + self.b_c
    xo = T.dot(X, self.W_o) + self.b_o

    [outputs, memories], updates = theano.scan(
        self._step,
        sequences=[xi, xf, xo, xc, padded_mask],
        outputs_info=[
            T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
            T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
        ],
        non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient
    )

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    x_z = T.dot(X, self.W_z) + self.b_z
    x_r = T.dot(X, self.Pmat) + self.b_r
    x_h = T.dot(X, self.W_h) + self.b_h
    outputs, updates = theano.scan(
        self._step,
        sequences=[x_z, x_r, x_h, padded_mask],
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=[self.U_z, self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient
    )

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def __Recurrent(name, hidden_dims, step_fn, inputs, non_sequences=[], h0s=None):
    if not isinstance(inputs, list):
        inputs = [inputs]
    if not isinstance(hidden_dims, list):
        hidden_dims = [hidden_dims]

    if h0s is None:
        h0s = [None]*len(hidden_dims)

    for i in xrange(len(hidden_dims)):
        if h0s[i] is None:
            h0_unbatched = lib.param(
                name + '.h0_' + str(i),
                numpy.zeros((hidden_dims[i],), dtype=theano.config.floatX)
            )
            num_batches = inputs[0].shape[1]
            h0s[i] = T.alloc(h0_unbatched, num_batches, hidden_dims[i])

        h0s[i] = T.patternbroadcast(h0s[i], [False] * h0s[i].ndim)

    outputs, _ = theano.scan(
        step_fn,
        sequences=inputs,
        outputs_info=h0s,
        non_sequences=non_sequences
    )

    return outputs
def __init__(self, dropout_prob, fix_mask=False, fast_predict=False, prefix="dropout"):
    self.dropout_prob = dropout_prob
    self.fix_mask = fix_mask
    self.prefix = prefix
    self.fast_predict = fast_predict
    print (self.prefix, self.dropout_prob, self.fix_mask)
    assert (dropout_prob > 0)

    """ This one works for the scan function.
        (instead of theano.tensor.shared.randomstreams.RandomStreams)
        See discussion: https://groups.google.com/forum/#!topic/theano-users/DbvTgTqkT8o
    """
    self.rng = MRG_RandomStreams(seed=RANDOM_SEED, use_cuda=True)
def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
    """Approximation to the reconstruction error

    Note that this function requires the pre-sigmoid activation as input. To
    understand why this is so you need to understand a bit about how Theano
    works. Whenever you compile a Theano function, the computational graph
    that you pass as input gets optimized for speed and stability. This is
    done by changing several parts of the subgraphs with others. One such
    optimization expresses terms of the form log(sigmoid(x)) in terms of
    softplus. We need this optimization for the cross-entropy since sigmoid
    of numbers larger than 30. (or even less than that) turn to 1. and
    numbers smaller than -30. turn to 0, which in turn will force theano to
    compute log(0) and therefore we will get either -inf or NaN as cost. If
    the value is expressed in terms of softplus we do not get this
    undesirable behaviour. This optimization usually works fine, but here we
    have a special case. The sigmoid is applied inside the scan op, while
    the log is outside. Therefore Theano will only see log(scan(..)) instead
    of log(sigmoid(..)) and will not apply the wanted optimization. We can
    not go and replace the sigmoid in scan with something else also, because
    this only needs to be done on the last step. Therefore the easiest and
    more efficient way is to get also the pre-sigmoid activation as an
    output of scan, and apply both the log and sigmoid outside scan such
    that Theano can catch and optimize the expression.
    """
    cross_entropy = T.mean(
        T.sum(
            self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) +
            (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)),
            axis=1
        )
    )

    return cross_entropy
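The docstring above explains why the pre-sigmoid activation is returned from scan. A minimal NumPy sketch (not from the project above) of the instability it describes, using the identity log(1 - sigmoid(x)) = -softplus(x):

import numpy as np

x = np.float32(35.0)
sig = np.float32(1.0) / (np.float32(1.0) + np.exp(-x))   # rounds to exactly 1.0 in float32
naive = np.log(np.float32(1.0) - sig)                    # log(0) -> -inf (numpy warns about divide by zero)
softplus = np.log1p(np.exp(-np.abs(x))) + np.maximum(x, np.float32(0.0))
stable = -softplus                                       # -softplus(35) is about -35, and stays finite
print(naive, stable)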
def output(self, train=True):
    outputs_info = [self.h0, self.s0]
    ([outputs, states], updates) = theano.scan(
        fn=self.one_step,  # function
        sequences=self.X,
        # n_steps=600,
        outputs_info=outputs_info,
        go_backwards=self.go_backwards
    )
    return outputs
def output(self, train=True):
    outputs_info = [self.s0]
    (outputs, updates) = theano.scan(
        fn=self.one_step,
        sequences=self.X,
        outputs_info=outputs_info,
        go_backwards=self.go_backwards
    )
    return outputs
def get_output_for(self, input, **kwargs):
    def norm_fn(f, mask, label, previous, W_sim):
        # f: inst * class, mask: inst, previous: inst * class, W_sim: class * class
        next = previous.dimshuffle(0, 1, 'x') + f.dimshuffle(0, 'x', 1) + W_sim.dimshuffle('x', 0, 1)
        if COST:
            next = next + COST_CONST * (1.0 - T.extra_ops.to_one_hot(label, self.num_classes).dimshuffle(0, 'x', 1))
        # next: inst * prev * cur
        next = theano_logsumexp(next, axis=1)
        # next: inst * class
        mask = mask.dimshuffle(0, 'x')
        next = previous * (1.0 - mask) + next * mask
        return next

    f = T.dot(input, self.W)
    # f: inst * time * class

    initial = f[:, 0, :]
    if CRF_INIT:
        initial = initial + self.W_init[0].dimshuffle('x', 0)
    if COST:
        initial = initial + COST_CONST * (1.0 - T.extra_ops.to_one_hot(self.label_input[:, 0], self.num_classes))
    outputs, _ = theano.scan(fn=norm_fn,
                             sequences=[f.dimshuffle(1, 0, 2)[1:],
                                        self.mask_input.dimshuffle(1, 0)[1:],
                                        self.label_input.dimshuffle(1, 0)[1:]],
                             outputs_info=initial,
                             non_sequences=[self.W_sim],
                             strict=True)
    norm = T.sum(theano_logsumexp(outputs[-1], axis=1))

    f_pot = (f.reshape((-1, f.shape[-1]))[T.arange(f.shape[0] * f.shape[1]), self.label_input.flatten()]
             * self.mask_input.flatten()).sum()
    if CRF_INIT:
        f_pot += self.W_init[0][self.label_input[:, 0]].sum()

    labels = self.label_input
    # labels: inst * time
    shift_labels = T.roll(labels, -1, axis=1)
    mask = self.mask_input
    # mask : inst * time
    shift_mask = T.roll(mask, -1, axis=1)

    g_pot = (self.W_sim[labels.flatten(), shift_labels.flatten()]
             * mask.flatten() * shift_mask.flatten()).sum()

    return -(f_pot + g_pot - norm) / f.shape[0]
def get_output_for(self, input, **kwargs):
    def max_fn(f, mask, prev_score, prev_back, W_sim):
        next_score = prev_score.dimshuffle(0, 1, 'x') + f.dimshuffle(0, 'x', 1) + W_sim.dimshuffle('x', 0, 1)
        next_back = T.argmax(next_score, axis=1)
        next_score = T.max(next_score, axis=1)
        mask = mask.dimshuffle(0, 'x')
        next_score = next_score * mask + prev_score * (1.0 - mask)
        next_back = next_back * mask + prev_back * (1.0 - mask)
        next_back = T.cast(next_back, 'int32')
        return [next_score, next_back]

    def produce_fn(back, mask, prev_py):
        # back: inst * class, prev_py: inst, mask: inst
        next_py = back[T.arange(prev_py.shape[0]), prev_py]
        next_py = mask * next_py + (1.0 - mask) * prev_py
        next_py = T.cast(next_py, 'int32')
        return next_py

    f = T.dot(input, self.W)

    init_score, init_back = f[:, 0, :], T.zeros_like(f[:, 0, :], dtype='int32')
    if CRF_INIT:
        init_score = init_score + self.W_init[0].dimshuffle('x', 0)
    ([scores, backs], _) = theano.scan(fn=max_fn,
                                       sequences=[f.dimshuffle(1, 0, 2)[1:],
                                                  self.mask_input.dimshuffle(1, 0)[1:]],
                                       outputs_info=[init_score, init_back],
                                       non_sequences=[self.W_sim],
                                       strict=True)

    init_py = T.argmax(scores[-1], axis=1)
    init_py = T.cast(init_py, 'int32')
    # init_py: inst, backs: time * inst * class
    pys, _ = theano.scan(fn=produce_fn,
                         sequences=[backs, self.mask_input.dimshuffle(1, 0)[1:]],
                         outputs_info=[init_py],
                         go_backwards=True)
    # pys: (rev_time - 1) * inst
    pys = pys.dimshuffle(1, 0)[:, ::-1]
    # pys : inst * (time - 1)
    return T.concatenate([pys, init_py.dimshuffle(0, 'x')], axis=1)
def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, stage1_model_file_name, p=None):

    y_vocabulary_size = len(y_vocabulary)

    self.stage1_model_file_name = stage1_model_file_name
    self.stage1, _ = load(stage1_model_file_name, minibatch_size, x)

    self.n_hidden = n_hidden
    self.x_vocabulary = x_vocabulary
    self.y_vocabulary = y_vocabulary

    # output model
    self.GRU = GRULayer(rng=rng, n_in=self.stage1.n_hidden + 1, n_out=n_hidden, minibatch_size=minibatch_size)
    self.Wy = weights_const(n_hidden, y_vocabulary_size, 'Wy', 0)
    self.by = weights_const(1, y_vocabulary_size, 'by', 0)

    self.params = [self.Wy, self.by]
    self.params += self.GRU.params

    def recurrence(x_t, p_t, h_tm1, Wy, by):
        h_t = self.GRU.step(x_t=T.concatenate((x_t, p_t.dimshuffle((0, 'x'))), axis=1), h_tm1=h_tm1)
        z = T.dot(h_t, Wy) + by
        y_t = T.nnet.softmax(z)
        return [h_t, y_t]

    [_, self.y], _ = theano.scan(fn=recurrence,
                                 sequences=[self.stage1.last_hidden_states, p],
                                 non_sequences=[self.Wy, self.by],
                                 outputs_info=[self.GRU.h0, None])

    print "Number of parameters is %d" % sum(np.prod(p.shape.eval()) for p in self.params)
    print "Number of parameters with stage1 params is %d" % sum(np.prod(p.shape.eval()) for p in self.params + self.stage1.params)

    self.L1 = sum(abs(p).sum() for p in self.params)
    self.L2_sqr = sum((p**2).sum() for p in self.params)
def get_output_for(self, inputs, **kwargs):
    unary, ref = inputs

    N, _, H, W = ref.shape
    yx = tt.cast(tt.stack(tt.mgrid[0:H, 0:W]), "float32")
    grid = tt.alloc(yx[np.newaxis, :, :, :], N, 2, H, W)
    stacked = tt.concatenate([grid, ref], axis=1)

    def _bilateral(V, R):
        o = tt.ones((1, V.shape[1], V.shape[2]), "float32")
        norm = tt.sqrt(gaussian_filter(R, o, self.kstd_bf, self.ref_dim)) + 1e-8
        return gaussian_filter(R, V/norm, self.kstd_bf, self.ref_dim, self.val_dim) / norm

    def _step(prev_q, U, ref, normalize=True):
        qbf = _bilateral(prev_q, ref)
        qsf = tt.nnet.conv2d(prev_q[np.newaxis, :, :, :], self.W_spatial, border_mode="half")[0]

        q_hat = -self.compat_bf * qbf + -self.compat_spatial * qsf
        q_hat = U - q_hat

        return softmax(q_hat, axis=0) if normalize else q_hat

    def _inference(unary_i, ref_i):
        U = tt.log(tt.clip(unary_i, 1e-5, 1))
        prev_q = softmax(U, axis=0)

        # This is faster than using scan.
        for i in range(self.num_iter):
            normalize = self.normalize_final_iter or i < self.num_iter - 1
            prev_q = _step(prev_q, U, ref_i, normalize)
        return prev_q

    return theano.scan(fn=_inference, sequences=[unary, stacked], outputs_info=None)[0]
def grad(self, inputs, ograds):
    ref, values, ref_dim, val_dim = inputs[:4]
    hash_struct = inputs[4:]
    ograd = ograds[0]

    ref_dim = get_scalar_constant_value(ref_dim)
    val_dim = get_scalar_constant_value(val_dim)

    def _conv(x):
        return GaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)

    # Since the kernels are separable and symmetric, the gradient w.r.t.
    # input is just the same filtering applied to the output grads.
    grad_i = _conv(ograd)

    def _gradr(r_i, vals, og, *args):
        return (og * (_conv(vals*r_i) - r_i*_conv(vals)) +
                vals * (_conv(og*r_i) - r_i*_conv(og)))

    grad_r, _ = theano.scan(fn=_gradr, sequences=[ref],
                            non_sequences=[values, ograd] + hash_struct,
                            outputs_info=None)
    grad_r = grad_r.sum(axis=1, acc_dtype="float32")

    grads = [DisconnectedType()() for i in range(len(inputs))]
    grads[0] = grad_r
    grads[1] = grad_i
    return grads