The following code examples, extracted from open-source Python projects, illustrate how to use torch.tanh().
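As a baseline for the examples below: torch.tanh applies the hyperbolic tangent element-wise, squashing any real input into (-1, 1). A minimal standalone sketch:

import torch

x = torch.linspace(-3.0, 3.0, steps=7)
y = torch.tanh(x)  # element-wise, output strictly inside (-1, 1)
print(y.min().item(), y.max().item())

This bounded, zero-centered output range is why the snippets below use tanh for cell candidates in LSTMs, correlation coefficients, attention energies, and quantization.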
def _combine_last(self, r, h_t):
    '''
    inputs:
        r : batch x n_dim
        h_t : batch x n_dim (this is the output from the gru unit)
    params:
        W_x : n_dim x n_dim
        W_p : n_dim x n_dim
    out:
        h_star : batch x n_dim
    '''
    W_p_r = torch.mm(r, self.W_p)  # batch x n_dim
    W_x_h = torch.mm(h_t, self.W_x)  # batch x n_dim
    h_star = F.tanh(W_p_r + W_x_h)  # batch x n_dim
    return h_star
def forward(self, x, hidden):
    h, c = hidden
    h = h.view(h.size(1), -1)
    c = c.view(c.size(1), -1)
    x = x.view(x.size(1), -1)

    # Linear mappings
    i_t = th.mm(x, self.w_xi) + th.mm(h, self.w_hi) + self.b_i
    f_t = th.mm(x, self.w_xf) + th.mm(h, self.w_hf) + self.b_f
    o_t = th.mm(x, self.w_xo) + th.mm(h, self.w_ho) + self.b_o

    # activations
    i_t.sigmoid_()
    f_t.sigmoid_()
    o_t.sigmoid_()

    # cell computations
    c_t = th.mm(x, self.w_xc) + th.mm(h, self.w_hc) + self.b_c
    c_t.tanh_()
    c_t = th.mul(c, f_t) + th.mul(i_t, c_t)
    h_t = th.mul(o_t, th.tanh(c_t))

    # Reshape for compatibility
    h_t = h_t.view(1, h_t.size(0), -1)
    c_t = c_t.view(1, c_t.size(0), -1)
    if self.dropout > 0.0:
        F.dropout(h_t, p=self.dropout, training=self.training, inplace=True)
    return h_t, (h_t, c_t)
def _transform_decoder_init_state(self, hn):
    if isinstance(hn, tuple):
        hn, cn = hn
        # cn [2 * num_layers, batch, hidden_size]
        num_dir, batch, hidden_size = cn.size()
        # first convert cn to [batch, 2 * num_layers, hidden_size]
        cn = cn.transpose(0, 1).contiguous()
        # then view as [batch, num_layers, 2 * hidden_size] --> [num_layers, batch, 2 * hidden_size]
        # (integer division is required: view() needs an int dimension)
        cn = cn.view(batch, num_dir // 2, 2 * hidden_size).transpose(0, 1)
        # take hx_dense to [num_layers, batch, hidden_size]
        cn = self.hx_dense(cn)
        # hn is tanh(cn)
        hn = F.tanh(cn)
        hn = (hn, cn)
    else:
        # hn [2 * num_layers, batch, hidden_size]
        num_dir, batch, hidden_size = hn.size()
        # first convert hn to [batch, 2 * num_layers, hidden_size]
        hn = hn.transpose(0, 1).contiguous()
        # then view as [batch, num_layers, 2 * hidden_size] --> [num_layers, batch, 2 * hidden_size]
        hn = hn.view(batch, num_dir // 2, 2 * hidden_size).transpose(0, 1)
        # take hx_dense to [num_layers, batch, hidden_size]
        hn = F.tanh(self.hx_dense(hn))
    return hn
def _step(self, H_t, T_t, C_t, h0, h_mask, t_mask, c_mask):
    s_lm1, rnns = h0, [self.rnn_h, self.rnn_t, self.rnn_c]
    for l, (rnn_h, rnn_t, rnn_c) in enumerate(zip(*rnns)):
        s_lm1_H = h_mask.expand_as(s_lm1) * s_lm1
        s_lm1_T = t_mask.expand_as(s_lm1) * s_lm1
        s_lm1_C = c_mask.expand_as(s_lm1) * s_lm1
        if l == 0:
            H_t = F.tanh(H_t + rnn_h(s_lm1_H))
            T_t = F.sigmoid(T_t + rnn_t(s_lm1_T))
            # the carry gate uses its own rnn_c (the original applied rnn_t here)
            C_t = F.sigmoid(C_t + rnn_c(s_lm1_C))
        else:
            H_t = F.tanh(rnn_h(s_lm1_H))
            T_t = F.sigmoid(rnn_t(s_lm1_T))
            C_t = F.sigmoid(rnn_c(s_lm1_C))
        s_l = H_t * T_t + s_lm1 * C_t
        s_lm1 = s_l
    return s_l
def forward(self, x, hidden):
    h, c = hidden
    h = h.view(h.size(0), -1)
    c = c.view(c.size(0), -1)
    x = x.view(x.size(0), -1)

    # Linear mappings
    i_t = th.mm(x, self.w_xi) + th.mm(h, self.w_hi) + self.b_i
    f_t = th.mm(x, self.w_xf) + th.mm(h, self.w_hf) + self.b_f
    o_t = th.mm(x, self.w_xo) + th.mm(h, self.w_ho) + self.b_o

    # activations
    i_t.sigmoid_()
    f_t.sigmoid_()
    o_t.sigmoid_()

    # cell computations
    c_t = th.mm(x, self.w_xc) + th.mm(h, self.w_hc) + self.b_c
    c_t.tanh_()
    c_t = th.mul(c, f_t) + th.mul(i_t, c_t)
    h_t = th.mul(o_t, th.tanh(c_t))

    # Reshape for compatibility
    h_t = h_t.view(h_t.size(0), 1, -1)
    c_t = c_t.view(c_t.size(0), 1, -1)
    if self.dropout > 0.0:
        F.dropout(h_t, p=self.dropout, training=self.training, inplace=True)
    return h_t, (h_t, c_t)
def attention(self, hidden, W1xe, hidden_encoder):
    # train
    W2xdn = torch.mm(hidden, self.W2)
    W2xdn = W2xdn.unsqueeze(1).expand(self.batch_size, self.n + 1, self.hidden_size)
    u = (torch.bmm(torch.tanh(W1xe + W2xdn), self.v.unsqueeze(0)
         .expand(self.batch_size, self.hidden_size, 1)))
    u = u.squeeze()
    # test
    # W2xdn = torch.mm(hidden, self.W2)
    # u = Variable(torch.zeros(self.batch_size, self.n + 1)).type(dtype)
    # for n in xrange(self.n + 1):
    #     aux = torch.tanh(W1xe[:, n].squeeze() + W2xdn)  # size bs x hidd
    #     aux2 = (torch.bmm(aux.unsqueeze(1), self.v.unsqueeze(0)
    #             .expand(self.batch_size, self.hidden_size, 1)))
    #     u[:, n] = aux2.squeeze()
    return u
def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    s = None
    bias_dim = bias.size()
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        _s_bias = _s + bias.expand(bias_dim[0], _s.size()[0]).transpose(0, 1)
        if nonlinearity == 'tanh':
            _s_bias = torch.tanh(_s_bias)
        _s_bias = _s_bias.unsqueeze(0)
        if s is None:
            s = _s_bias
        else:
            s = torch.cat((s, _s_bias), 0)
    return s.squeeze()
def batch_matmul(seq, weight, nonlinearity=''):
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if nonlinearity == 'tanh':
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        if s is None:
            s = _s
        else:
            s = torch.cat((s, _s), 0)
    return s.squeeze()
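A hedged usage sketch for the two helpers above; the (seq_len, batch, n_dim) layout of seq is an assumption inferred from the row-wise torch.mm loop:

import torch

seq = torch.randn(4, 2, 8)   # assumed (seq_len, batch, n_dim)
weight = torch.randn(8, 8)
out = batch_matmul(seq, weight, nonlinearity='tanh')
print(out.size())            # torch.Size([4, 2, 8])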
def forward(self, embed, state_word):
    # embeddings
    embedded = self.lookup(embed)
    # word level gru
    output_word, state_word = self.word_gru(embedded, state_word)
    word_squish = batch_matmul_bias(output_word, self.weight_W_word, self.bias_word, nonlinearity='tanh')
    word_attn = batch_matmul(word_squish, self.weight_proj_word)
    word_attn_norm = self.softmax_word(word_attn.transpose(1, 0))
    word_attn_vectors = attention_mul(output_word, word_attn_norm.transpose(1, 0))
    return word_attn_vectors, state_word, word_attn_norm
def forward(self, inputs, z, hidden_cell=None):
    if hidden_cell is None:
        # then we must init from z
        hidden, cell = torch.split(F.tanh(self.fc_hc(z)), hp.dec_hidden_size, 1)
        hidden_cell = (hidden.unsqueeze(0).contiguous(), cell.unsqueeze(0).contiguous())
    outputs, (hidden, cell) = self.lstm(inputs, hidden_cell)
    # in training we feed the lstm with the whole input in one shot
    # and use all outputs contained in 'outputs', while in generate
    # mode we just feed with the last generated sample:
    if self.training:
        y = self.fc_params(outputs.view(-1, hp.dec_hidden_size))
    else:
        y = self.fc_params(hidden.view(-1, hp.dec_hidden_size))
    # separate pen and mixture params:
    params = torch.split(y, 6, 1)
    params_mixture = torch.stack(params[:-1])  # trajectory
    params_pen = params[-1]  # pen up/down
    # identify mixture params:
    pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy = torch.split(params_mixture, 1, 2)
    # preprocess params:
    if self.training:
        len_out = Nmax + 1
    else:
        len_out = 1
    pi = F.softmax(pi.t().squeeze()).view(len_out, -1, hp.M)
    sigma_x = torch.exp(sigma_x.t().squeeze()).view(len_out, -1, hp.M)
    sigma_y = torch.exp(sigma_y.t().squeeze()).view(len_out, -1, hp.M)
    rho_xy = torch.tanh(rho_xy.t().squeeze()).view(len_out, -1, hp.M)
    mu_x = mu_x.t().squeeze().contiguous().view(len_out, -1, hp.M)
    mu_y = mu_y.t().squeeze().contiguous().view(len_out, -1, hp.M)
    q = F.softmax(params_pen).view(len_out, -1, 3)
    return pi, mu_x, mu_y, sigma_x, sigma_y, rho_xy, q, hidden, cell
def visualize():
    # initialise the model
    discriminator = ArcBinaryClassifier(num_glimpses=opt.numGlimpses,
                                        glimpse_h=opt.glimpseSize,
                                        glimpse_w=opt.glimpseSize,
                                        controller_out=opt.numStates)
    discriminator.load_state_dict(torch.load(os.path.join("saved_models", opt.name, opt.load)))

    arc = discriminator.arc
    sample = get_sample(discriminator)

    all_hidden = arc._forward(sample[None, :, :])[:, 0, :]  # (2*numGlimpses, controller_out)
    glimpse_params = torch.tanh(arc.glimpser(all_hidden))
    masks = arc.glimpse_window.get_attention_mask(glimpse_params, mask_h=opt.imageSize, mask_w=opt.imageSize)

    # separate the masks of each image.
    masks1 = []
    masks2 = []
    for i, mask in enumerate(masks):
        if i % 2 == 1:  # the first image outputs the hidden state for the next image
            masks1.append(mask)
        else:
            masks2.append(mask)

    for i, (mask1, mask2) in enumerate(zip(masks1, masks2)):
        display(sample[0], mask1, sample[1], mask2, "img_{}".format(i))
def forward(self, input_, hx):
    """
    Args:
        input_: A (batch, input_size) tensor containing input features.
        hx: A tuple (h_0, c_0), which contains the initial hidden
            and cell state, where the size of both states is
            (batch, hidden_size).

    Returns:
        h_1, c_1: Tensors containing the next hidden and cell state.
    """
    h_0, c_0 = hx
    batch_size = h_0.size(0)
    bias_batch = (self.bias.unsqueeze(0)
                  .expand(batch_size, *self.bias.size()))
    wh_b = torch.addmm(bias_batch, h_0, self.weight_hh)
    wi = torch.mm(input_, self.weight_ih)
    f, i, o, g = torch.split(wh_b + wi, split_size=self.hidden_size, dim=1)
    c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g)
    h_1 = torch.sigmoid(o) * torch.tanh(c_1)
    return h_1, c_1
def forward(self, input_, hx, time):
    """
    Args:
        input_: A (batch, input_size) tensor containing input features.
        hx: A tuple (h_0, c_0), which contains the initial hidden
            and cell state, where the size of both states is
            (batch, hidden_size).
        time: The current timestep value, which is used to
            get appropriate running statistics.

    Returns:
        h_1, c_1: Tensors containing the next hidden and cell state.
    """
    h_0, c_0 = hx
    batch_size = h_0.size(0)
    bias_batch = (self.bias.unsqueeze(0)
                  .expand(batch_size, *self.bias.size()))
    wh = torch.mm(h_0, self.weight_hh)
    wi = torch.mm(input_, self.weight_ih)
    bn_wh = self.bn_hh(wh, time=time)
    bn_wi = self.bn_ih(wi, time=time)
    f, i, o, g = torch.split(bn_wh + bn_wi + bias_batch,
                             split_size=self.hidden_size, dim=1)
    c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g)
    h_1 = torch.sigmoid(o) * torch.tanh(self.bn_c(c_1, time=time))
    return h_1, c_1
def tanh_rescale(x, x_min=-1., x_max=1.):
    return (torch.tanh(x) + 1) * 0.5 * (x_max - x_min) + x_min
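For example, the same affine map can push tanh outputs into an image range; the (0, 1) endpoints here are illustrative, not from the original project:

import torch

x = torch.randn(2, 3) * 5.0
pixels = tanh_rescale(x, x_min=0.0, x_max=1.0)  # all values now in (0, 1)
assert 0.0 <= pixels.min().item() and pixels.max().item() <= 1.0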
def _attention_forward(self, Y, mask_Y, h, r_tm1=None):
    '''
    Computes the Attention Weights over Y using h (and r_tm1 if given)
    Returns an attention weighted representation of Y, and the alphas
    inputs:
        Y : T x batch x n_dim
        mask_Y : T x batch
        h : batch x n_dim
        r_tm1 : batch x n_dim
    params:
        W_y : n_dim x n_dim
        W_h : n_dim x n_dim
        W_r : n_dim x n_dim
        W_alpha : n_dim x 1
    outputs:
        r : batch x n_dim
        alpha : batch x T
    '''
    Y = Y.transpose(1, 0)  # batch x T x n_dim
    mask_Y = mask_Y.transpose(1, 0)  # batch x T
    Wy = torch.bmm(Y, self.W_y.unsqueeze(0).expand(Y.size(0), *self.W_y.size()))  # batch x T x n_dim
    Wh = torch.mm(h, self.W_h)  # batch x n_dim
    if r_tm1 is not None:
        W_r_tm1 = torch.mm(r_tm1, self.W_r)
        Wh += W_r_tm1
    M = torch.tanh(Wy + Wh.unsqueeze(1).expand(Wh.size(0), Y.size(1), Wh.size(1)))  # batch x T x n_dim
    alpha = torch.bmm(M, self.W_alpha.unsqueeze(0).expand(Y.size(0), *self.W_alpha.size())).squeeze(-1)  # batch x T
    alpha = alpha + (-1000.0 * (1. - mask_Y))  # To ensure probability mass doesn't fall on non tokens
    alpha = F.softmax(alpha)
    return torch.bmm(alpha.unsqueeze(1), Y).squeeze(1), alpha
def _attention_forward(self, Y, mask_Y, h, r_tm1=None, index=None):
    '''
    Computes the Attention Weights over Y using h (and r_tm1 if given)
    Returns an attention weighted representation of Y, and the alphas
    inputs:
        Y : T x batch x n_dim
        mask_Y : T x batch
        h : batch x n_dim
        r_tm1 : batch x n_dim
        index : int : The timestep
    params:
        W_y : n_dim x n_dim
        W_h : n_dim x n_dim
        W_r : n_dim x n_dim
        W_alpha : n_dim x 1
    outputs:
        r : batch x n_dim
        alpha : batch x T
    '''
    Y = Y.transpose(1, 0)  # batch x T x n_dim
    mask_Y = mask_Y.transpose(1, 0)  # batch x T
    Wy = torch.bmm(Y, self.W_y.unsqueeze(0).expand(Y.size(0), *self.W_y.size()))  # batch x T x n_dim
    Wh = torch.mm(h, self.W_h)  # batch x n_dim
    if r_tm1 is not None:
        W_r_tm1 = self.batch_norm_r_r(torch.mm(r_tm1, self.W_r), index) if hasattr(self, 'batch_norm_r_r') else torch.mm(r_tm1, self.W_r)
        Wh = self.batch_norm_h_r(Wh, index) if hasattr(self, 'batch_norm_h_r') else Wh
        Wh += W_r_tm1
    M = torch.tanh(Wy + Wh.unsqueeze(1).expand(Wh.size(0), Y.size(1), Wh.size(1)))  # batch x T x n_dim
    alpha = torch.bmm(M, self.W_alpha.unsqueeze(0).expand(Y.size(0), *self.W_alpha.size())).squeeze(-1)  # batch x T
    alpha = alpha + (-1000.0 * (1. - mask_Y))  # To ensure probability mass doesn't fall on non tokens
    alpha = F.softmax(alpha)
    if r_tm1 is not None:
        r = torch.bmm(alpha.unsqueeze(1), Y).squeeze(1) + F.tanh(torch.mm(r_tm1, self.W_t))  # batch x n_dim
    else:
        r = torch.bmm(alpha.unsqueeze(1), Y).squeeze(1)  # batch x n_dim
    return r, alpha
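The core pattern in both _attention_forward variants is Bahdanau-style additive attention: energies are W_alpha^T tanh(Y W_y + h W_h), then masked and softmax-normalized. A minimal tensor-level sketch with made-up sizes (no masking or batch norm):

import torch
import torch.nn.functional as F

B, T, D = 2, 5, 8
Y = torch.randn(B, T, D)                              # encoder states, batch x T x n_dim
h = torch.randn(B, D)                                 # query, batch x n_dim
W_y, W_h, W_alpha = torch.randn(D, D), torch.randn(D, D), torch.randn(D, 1)
M = torch.tanh(Y @ W_y + (h @ W_h).unsqueeze(1))      # batch x T x n_dim
alpha = F.softmax((M @ W_alpha).squeeze(-1), dim=-1)  # batch x T
r = torch.bmm(alpha.unsqueeze(1), Y).squeeze(1)       # batch x n_dim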
def forward(self, input):
    return torch.tanh(input)
def forward(self, x, hidden):
    do_dropout = self.training and self.dropout > 0.0
    h, c = hidden
    h = h.view(h.size(1), -1)
    c = c.view(c.size(1), -1)
    x = x.view(x.size(1), -1)

    # Linear mappings
    preact = self.i2h(x) + self.h2h(h)

    # activations
    gates = preact[:, :3 * self.hidden_size].sigmoid()
    g_t = preact[:, 3 * self.hidden_size:].tanh()
    i_t = gates[:, :self.hidden_size]
    f_t = gates[:, self.hidden_size:2 * self.hidden_size]
    o_t = gates[:, -self.hidden_size:]

    # cell computations
    if do_dropout and self.dropout_method == 'semeniuta':
        g_t = F.dropout(g_t, p=self.dropout, training=self.training)
    c_t = th.mul(c, f_t) + th.mul(i_t, g_t)
    if do_dropout and self.dropout_method == 'moon':
        c_t.data.set_(th.mul(c_t, self.mask).data)
        c_t.data *= 1.0 / (1.0 - self.dropout)
    h_t = th.mul(o_t, c_t.tanh())

    # Reshape for compatibility
    if do_dropout:
        if self.dropout_method == 'pytorch':
            F.dropout(h_t, p=self.dropout, training=self.training, inplace=True)
        if self.dropout_method == 'gal':
            h_t.data.set_(th.mul(h_t, self.mask).data)
            h_t.data *= 1.0 / (1.0 - self.dropout)
    h_t = h_t.view(1, h_t.size(0), -1)
    c_t = c_t.view(1, c_t.size(0), -1)
    return h_t, (h_t, c_t)
def getCoef(outputs):
    '''
    Extracts the mean, standard deviation and correlation
    params:
        outputs : Output of the SRNN model
    '''
    mux, muy, sx, sy, corr = outputs[:, :, 0], outputs[:, :, 1], outputs[:, :, 2], outputs[:, :, 3], outputs[:, :, 4]
    # Exponential to get a positive value for std dev
    sx = torch.exp(sx)
    sy = torch.exp(sy)
    # tanh to get a value between [-1, 1] for correlation
    corr = torch.tanh(corr)
    return mux, muy, sx, sy, corr
def getCoef_train(outputs):
    mux, muy, sx, sy, corr = outputs[:, 0], outputs[:, 1], outputs[:, 2], outputs[:, 3], outputs[:, 4]
    sx = torch.exp(sx)
    sy = torch.exp(sy)
    corr = torch.tanh(corr)
    return mux, muy, sx, sy, corr
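In both getCoef variants, exp keeps the standard deviations of the bivariate Gaussian positive while tanh bounds the correlation. A quick sanity check on random data; the 5-column layout is the only assumption:

import torch

outputs = torch.randn(10, 5)                 # (batch, 5) as consumed by getCoef_train
mux, muy, sx, sy, corr = getCoef_train(outputs)
assert (sx > 0).all() and (sy > 0).all()     # exp guarantees positivity
assert (corr.abs() < 1).all()                # tanh keeps corr in (-1, 1)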
def test_simple(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    def f(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    trace, z = torch.jit.trace(f, (x, y), nderivs=0)
    torch._C._jit_pass_lint(trace)
    torch._C._jit_pass_onnx(trace)
    torch._C._jit_pass_lint(trace)
    self.assertExpected(str(trace))
def test_lstm_fusion(self):
    input = Variable(torch.randn(3, 10).cuda())
    hx = Variable(torch.randn(3, 20).cuda())
    cx = Variable(torch.randn(3, 20).cuda())
    module = nn.LSTMCell(10, 20).cuda()  # Just to allocate weights with correct sizes

    def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
        hx, cx = hidden
        gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
        ingate = F.sigmoid(ingate)
        forgetgate = F.sigmoid(forgetgate)
        cellgate = F.tanh(cellgate)
        outgate = F.sigmoid(outgate)
        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * F.tanh(cy)
        return hy, cy

    trace, _ = torch.jit.trace(LSTMCell, (input, (hx, cx)) + tuple(module.parameters()))
    torch._C._jit_pass_lint(trace)
    torch._C._jit_pass_onnx(trace)
    torch._C._jit_pass_lint(trace)
    torch._C._jit_pass_fuse(trace)
    torch._C._jit_pass_lint(trace)
    self.assertExpected(str(trace))
def test_cse(self):
    x = Variable(torch.Tensor([0.4, 0.3]), requires_grad=True)
    y = Variable(torch.Tensor([0.7, 0.5]), requires_grad=True)
    trace = torch._C._tracer_enter((x, y), 0)
    w = (x + y) * (x + y) * (x + y)
    t = torch.tanh(w) + torch.tanh(w)
    z = (x + y) * (x + y) * (x + y) + t
    torch._C._tracer_exit((z,))
    torch._C._jit_pass_lint(trace)
    torch._C._jit_pass_onnx(trace)
    torch._C._jit_pass_lint(trace)
    torch._C._jit_pass_cse(trace)
    self.assertExpected(str(trace))
def test_verify(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    @torch.jit.compile(verify=True, optimize=False)
    def doit(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    z = doit(x, y)
    z2 = doit(x, y)
    self.assertEqual(z, torch.sigmoid(torch.tanh(x * (x + y))))
    self.assertEqual(z, z2)
def test_disabled_traced_function(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    @torch.jit.compile(enabled=False)
    def doit(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    z = doit(x, y)
    z2 = doit(x, y)
    self.assertEqual(z, torch.sigmoid(torch.tanh(x * (x + y))))
    self.assertEqual(z, z2)
def test_python_ir(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    def doit(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    traced, _ = torch.jit.trace(doit, (x, y))
    g = torch._C._jit_get_graph(traced)
    g2 = torch._C.Graph()
    g_to_g2 = {}
    for node in g.inputs():
        g_to_g2[node] = g2.addInput()
    for node in g.nodes():
        if node.kind() == "PythonOp":
            n_ = g2.create(node.pyname(), [g_to_g2[i] for i in node.inputs()]) \
                   .setType(node.typeOption()) \
                   .s_("note", "from_pyop") \
                   .i_("some_value", len(node.scalar_args()))
            assert n_.i("some_value") == len(node.scalar_args())
        else:
            n_ = g2.createClone(node, lambda x: g_to_g2[x])
            assert n_.kindOf("Offset") == "i"
        g_to_g2[node] = g2.appendNode(n_)
    for node in g.outputs():
        g2.registerOutput(g_to_g2[node])
    t_node = g2.create("TensorTest").t_("a", torch.ones([2, 2]))
    assert t_node.attributeNames() == ["a"]
    g2.appendNode(t_node)
    assert torch.equal(torch.ones([2, 2]), t_node.t("a"))
    self.assertExpected(str(g2))
def update_buffer(self, S_tm1, c_t, o_tm1, ident):
    # concat previous output & context
    idt = torch.tanh(self.F_u(ident))
    o_tm1 = o_tm1.squeeze(0)
    z_t = torch.cat([c_t + idt, o_tm1 / 30], 1)
    z_t = z_t.unsqueeze(2)
    Sp = torch.cat([z_t, S_tm1[:, :, :-1]], 2)

    # update S
    u = self.N_u(Sp.view(Sp.size(0), -1))
    u[:, :idt.size(1)] = u[:, :idt.size(1)] + idt
    u = u.unsqueeze(2)
    S = torch.cat([u, S_tm1[:, :, :-1]], 2)
    return S
def getCoef(outputs):
    '''
    Extracts the mean, standard deviation and correlation
    params:
        outputs : Output of the SRNN model
    '''
    mux, muy, sx, sy, corr = outputs[:, :, 0], outputs[:, :, 1], outputs[:, :, 2], outputs[:, :, 3], outputs[:, :, 4]
    sx = torch.exp(sx)
    sy = torch.exp(sy)
    corr = torch.tanh(corr)
    return mux, muy, sx, sy, corr
def _get_rnn_output(self, input_word, input_char, mask=None, length=None, hx=None):
    # hack length from mask
    # we do not hack mask from length for special reasons.
    # Thus, always provide mask if it is necessary.
    if length is None and mask is not None:
        length = mask.data.sum(dim=1).long()

    # [batch, length, word_dim]
    word = self.word_embedd(input_word)
    # [batch, length, char_length, char_dim]
    char = self.char_embedd(input_char)
    char_size = char.size()
    # first transform to [batch * length, char_length, char_dim]
    # then transpose to [batch * length, char_dim, char_length]
    char = char.view(char_size[0] * char_size[1], char_size[2], char_size[3]).transpose(1, 2)
    # put into cnn [batch * length, char_filters, char_length]
    # then put into maxpooling [batch * length, char_filters]
    char, _ = self.conv1d(char).max(dim=2)
    # reshape to [batch, length, char_filters]
    char = torch.tanh(char).view(char_size[0], char_size[1], -1)

    # concatenate word and char [batch, length, word_dim + char_filter]
    input = torch.cat([word, char], dim=2)
    # apply dropout
    input = self.dropout_in(input)

    # prepare packed_sequence
    if length is not None:
        seq_input, hx, rev_order, mask = utils.prepare_rnn_seq(input, length, hx=hx, masks=mask, batch_first=True)
        seq_output, hn = self.rnn(seq_input, hx=hx)
        output, hn = utils.recover_rnn_seq(seq_output, rev_order, hx=hn, batch_first=True)
    else:
        # output from rnn [batch, length, hidden_size]
        output, hn = self.rnn(input, hx=hx)

    output = self.dropout_rnn(output)
    if self.dense is not None:
        # [batch, length, tag_space]
        output = F.elu(self.dense(output))
    return output, hn, mask, length
def _get_rnn_output(self, input_word, input_char, mask=None, length=None, hx=None):
    # [batch, length, word_dim]
    word = self.word_embedd(input_word)
    # [batch, length, char_length, char_dim]
    char = self.char_embedd(input_char)
    char_size = char.size()
    # first transform to [batch * length, char_length, char_dim]
    # then transpose to [batch * length, char_dim, char_length]
    char = char.view(char_size[0] * char_size[1], char_size[2], char_size[3]).transpose(1, 2)
    # put into cnn [batch * length, char_filters, char_length]
    # then put into maxpooling [batch * length, char_filters]
    char, _ = self.conv1d(char).max(dim=2)
    # reshape to [batch, length, char_filters]
    char = torch.tanh(char).view(char_size[0], char_size[1], -1)

    # concatenate word and char [batch, length, word_dim + char_filter]
    input = torch.cat([word, char], dim=2)
    # output from rnn [batch, length, hidden_size]
    output, hn = self.rnn(input, mask, hx=hx)
    # apply dropout for the output of rnn
    output = self.dropout_rnn(output.transpose(1, 2)).transpose(1, 2)
    if self.dense is not None:
        # [batch, length, tag_space]
        output = F.elu(self.dense(output))
    return output, hn, mask, length
def _get_encoder_output(self, input_word, input_char, input_pos, mask_e=None, length_e=None, hx=None):
    # [batch, length, word_dim]
    word = self.word_embedd(input_word)
    # [batch, length, pos_dim]
    pos = self.pos_embedd(input_pos)
    # [batch, length, char_length, char_dim]
    char = self.char_embedd(input_char)
    char_size = char.size()
    # first transform to [batch * length, char_length, char_dim]
    # then transpose to [batch * length, char_dim, char_length]
    char = char.view(char_size[0] * char_size[1], char_size[2], char_size[3]).transpose(1, 2)
    # put into cnn [batch * length, char_filters, char_length]
    # then put into maxpooling [batch * length, char_filters]
    char, _ = self.conv1d(char).max(dim=2)
    # reshape to [batch, length, char_filters]
    char = torch.tanh(char).view(char_size[0], char_size[1], -1)

    # apply dropout on input
    word = self.dropout_in(word)
    pos = self.dropout_in(pos)
    char = self.dropout_in(char)

    # concatenate word and char [batch, length, word_dim + char_filter]
    src_encoding = torch.cat([word, char, pos], dim=2)
    # output from rnn [batch, length, hidden_size]
    output, hn = self.encoder(src_encoding, mask_e, hx=hx)
    # apply dropout
    # [batch, length, hidden_size] --> [batch, hidden_size, length] --> [batch, length, hidden_size]
    output = self.dropout_out(output.transpose(1, 2)).transpose(1, 2)
    return src_encoding, output, hn, mask_e, length_e
def _step(self, H_t, T_t, h0, h_mask, t_mask):
    s_lm1 = h0
    for l, (rnn_h, rnn_t) in enumerate(zip(self.rnn_h, self.rnn_t)):
        s_lm1_H = h_mask.expand_as(s_lm1) * s_lm1
        s_lm1_T = t_mask.expand_as(s_lm1) * s_lm1
        if l == 0:
            H_t = F.tanh(H_t + rnn_h(s_lm1_H))
            T_t = F.sigmoid(T_t + rnn_t(s_lm1_T))
        else:
            H_t = F.tanh(rnn_h(s_lm1_H))
            T_t = F.sigmoid(rnn_t(s_lm1_T))
        s_l = (H_t - s_lm1) * T_t + s_lm1
        s_lm1 = s_l
    return s_l
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    hx, cx = hidden
    gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    ingate = F.sigmoid(ingate)
    forgetgate = F.sigmoid(forgetgate)
    cellgate = F.tanh(cellgate)
    outgate = F.sigmoid(outgate)
    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * F.tanh(cy)
    return hy, cy
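Because this functional LSTMCell takes its weights as arguments, it can be driven directly with the parameters of an nn.LSTMCell of matching size. A sketch with illustrative sizes (it assumes torch.nn.functional is imported as F, as the definition above requires):

import torch
import torch.nn as nn

cell = nn.LSTMCell(10, 20)  # used only as a container of correctly-sized weights
x = torch.randn(3, 10)
hx, cx = torch.randn(3, 20), torch.randn(3, 20)
hy, cy = LSTMCell(x, (hx, cx), cell.weight_ih, cell.weight_hh,
                  cell.bias_ih, cell.bias_hh)
print(hy.size(), cy.size())  # torch.Size([3, 20]) torch.Size([3, 20])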
def test_simple(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    def f(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    trace, z = torch.jit.trace(f, (x, y), nderivs=0)
    self.assertExpectedTrace(trace)

# matmul is currently implemented as a native function, which
# exercises different codepaths in the JIT. The following two
# tests ensure that (1) matmul indeed traces into an atomic,
# native operation, and (2) the JIT knows how to run it
def test_scopes(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    def f(x, y):
        out = x + y
        with torch.jit.scope('Foo', out):
            out = x * out
            with torch.jit.scope('Bar', out):
                out = torch.tanh(out)
            out = torch.sigmoid(out)
        return out

    trace, z = torch.jit.trace(f, (x, y), nderivs=0)
    self.assertExpectedTrace(trace)
def test_cse(self):
    x = Variable(torch.Tensor([0.4, 0.3]), requires_grad=True)
    y = Variable(torch.Tensor([0.7, 0.5]), requires_grad=True)
    trace = torch._C._tracer_enter((x, y), 0)
    w = (x + y) * (x + y) * (x + y)
    t = torch.tanh(w) + torch.tanh(w)
    z = (x + y) * (x + y) * (x + y) + t
    torch._C._tracer_exit((z,))
    torch._C._jit_pass_lint(trace)
    torch._C._jit_pass_cse(trace)
    self.assertExpectedTrace(trace)
def test_compile_addc(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True).float().cuda()
    y = Variable(torch.Tensor([0.7]), requires_grad=True).float().cuda()

    @torch.jit.compile(nderivs=0)
    def doit(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y) + 1))

    z = doit(x, y)
    with self.assertCompiled(doit):
        z2 = doit(x, y)
    self.assertEqual(z, torch.sigmoid(torch.tanh(x * (x + y) + 1)))
    self.assertEqual(z, z2)
def test_traced_function(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    @torch.jit.compile(nderivs=0)
    def doit(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    z = doit(x, y)
    with self.assertCompiled(doit):
        z2 = doit(x, y)
    self.assertEqual(z, torch.sigmoid(torch.tanh(x * (x + y))))
    self.assertEqual(z, z2)
def test_python_ir(self):
    x = Variable(torch.Tensor([0.4]), requires_grad=True)
    y = Variable(torch.Tensor([0.7]), requires_grad=True)

    def doit(x, y):
        return torch.sigmoid(torch.tanh(x * (x + y)))

    traced, _ = torch.jit.trace(doit, (x, y))
    g = torch._C._jit_get_graph(traced)
    g2 = torch._C.Graph()
    g_to_g2 = {}
    for node in g.inputs():
        g_to_g2[node] = g2.addInput()
    for node in g.nodes():
        n_ = g2.createClone(node, lambda x: g_to_g2[x])
        g2.appendNode(n_)
        for o, no in zip(node.outputs(), n_.outputs()):
            g_to_g2[o] = no
    for node in g.outputs():
        g2.registerOutput(g_to_g2[node])
    t_node = g2.create("TensorTest").t_("a", torch.ones([2, 2]))
    assert t_node.attributeNames() == ["a"]
    g2.appendNode(t_node)
    assert torch.equal(torch.ones([2, 2]), t_node.t("a"))
    self.assertExpected(str(g2))
def tanh_quantize(input, bits):
    assert bits >= 1, bits
    if bits == 1:
        return torch.sign(input)
    input = torch.tanh(input)  # [-1, 1]
    input_rescale = (input + 1.0) / 2  # [0, 1]
    n = math.pow(2.0, bits) - 1
    v = torch.floor(input_rescale * n + 0.5) / n
    v = 2 * v - 1  # [-1, 1]
    v = 0.5 * torch.log((1 + v) / (1 - v))  # arctanh
    return v
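A round-trip check of tanh_quantize (the bit-width is illustrative): the arctanh at the end approximately inverts the initial tanh, so with enough bits the residual is just the quantization step:

import math
import torch

x = torch.randn(4).clamp(-2.5, 2.5)  # avoid the saturated +/-1 bins where arctanh diverges
xq = tanh_quantize(x, bits=8)
print((x - xq).abs().max().item())   # small for moderate inputs
print(tanh_quantize(x, bits=1))      # degenerates to torch.sign(x)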
def duplicate_model_with_quant(model, bits, overflow_rate=0.0, counter=10, type='linear'):
    """assume that original model has at least a nn.Sequential"""
    assert type in ['linear', 'minmax', 'log', 'tanh']
    if isinstance(model, nn.Sequential):
        l = OrderedDict()
        for k, v in model._modules.items():
            if isinstance(v, (nn.Conv2d, nn.Linear, nn.BatchNorm1d, nn.BatchNorm2d, nn.AvgPool2d)):
                l[k] = v
                if type == 'linear':
                    quant_layer = LinearQuant('{}_quant'.format(k), bits=bits, overflow_rate=overflow_rate, counter=counter)
                elif type == 'log':
                    # quant_layer = LogQuant('{}_quant'.format(k), bits=bits, overflow_rate=overflow_rate, counter=counter)
                    quant_layer = NormalQuant('{}_quant'.format(k), bits=bits, quant_func=log_minmax_quantize)
                elif type == 'minmax':
                    quant_layer = NormalQuant('{}_quant'.format(k), bits=bits, quant_func=min_max_quantize)
                else:
                    quant_layer = NormalQuant('{}_quant'.format(k), bits=bits, quant_func=tanh_quantize)
                l['{}_{}_quant'.format(k, type)] = quant_layer
            else:
                l[k] = duplicate_model_with_quant(v, bits, overflow_rate, counter, type)
        m = nn.Sequential(l)
        return m
    else:
        for k, v in model._modules.items():
            model._modules[k] = duplicate_model_with_quant(v, bits, overflow_rate, counter, type)
        return model