The following 38 code examples, extracted from open-source Python projects, illustrate how to use torch.mv().
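Before the project examples, here is a minimal sketch of what torch.mv() computes: a matrix-vector product between a 2-D tensor and a 1-D tensor. The manual-sum check mirrors how the test functions below verify the result (this snippet is illustrative only and assumes a PyTorch version that provides torch.allclose).

import torch

# torch.mv(mat, vec): mat has shape (n, m), vec has shape (m,), result has shape (n,).
mat = torch.randn(3, 4)
vec = torch.randn(4)

out = torch.mv(mat, vec)  # shape (3,)

# The same result as an explicit sum over columns.
manual = torch.zeros(3)
for i in range(3):
    for j in range(4):
        manual[i] += mat[i, j] * vec[j]

assert torch.allclose(out, manual, atol=1e-6)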
def backward(self, grad_output):
    tensors = self.saved_tensors
    if len(tensors) == 2:
        input, weight = tensors
        bias = None
    else:
        input, weight, bias = tensors

    grad_input = grad_weight = grad_bias = None
    if self.needs_input_grad[0]:
        grad_input = torch.mm(grad_output, weight)
    if self.needs_input_grad[1]:
        grad_weight = torch.mm(grad_output.t(), input)
    if bias is not None and self.needs_input_grad[2]:
        grad_bias = torch.mv(grad_output.t(), self.add_buffer)

    if bias is not None:
        return grad_input, grad_weight, grad_bias
    else:
        return grad_input, grad_weight
def backward(self, grad_output):
    matrix, vector = self.saved_tensors
    grad_add_vector = grad_matrix = grad_vector = None

    if self.needs_input_grad[0]:
        grad_add_vector = grad_output
        if self.alpha != 1:
            grad_add_vector = grad_add_vector.mul(self.alpha)

    if self.needs_input_grad[1]:
        grad_matrix = torch.ger(grad_output, vector)
        if self.beta != 1:
            grad_matrix *= self.beta

    if self.needs_input_grad[2]:
        grad_vector = torch.mv(matrix.t(), grad_output)
        if self.beta != 1:
            grad_vector *= self.beta

    return grad_add_vector, grad_matrix, grad_vector
def backward(self, grad_output):
    vector1, vector2 = self.saved_tensors
    grad_add_matrix = grad_vector1 = grad_vector2 = None

    if self.needs_input_grad[0]:
        grad_add_matrix = grad_output
        if self.alpha != 1:
            grad_add_matrix = grad_add_matrix.mul(self.alpha)

    if self.needs_input_grad[1]:
        grad_vector1 = torch.mv(grad_output, vector2)
        if self.beta != 1:
            grad_vector1 *= self.beta

    if self.needs_input_grad[2]:
        # TODO: maybe it's better to do transpose + mv + transpose
        grad_vector2 = torch.mm(vector1.unsqueeze(0), grad_output)
        if self.beta != 1:
            grad_vector2 *= self.beta

    return grad_add_matrix, grad_vector1, grad_vector2
def backward(ctx, grad_output):
    matrix, vector = ctx.saved_variables
    grad_add_vector = grad_matrix = grad_vector = None

    if ctx.needs_input_grad[0]:
        grad_add_vector = grad_output
        if ctx.alpha != 1:
            grad_add_vector = grad_add_vector.mul(ctx.alpha)

    if ctx.needs_input_grad[1]:
        grad_matrix = torch.ger(grad_output, vector)
        if ctx.beta != 1:
            grad_matrix *= ctx.beta

    if ctx.needs_input_grad[2]:
        grad_vector = torch.mv(matrix.t(), grad_output)
        if ctx.beta != 1:
            grad_vector *= ctx.beta

    return grad_add_vector, grad_matrix, grad_vector, None, None, None
def backward(ctx, grad_output):
    vector1, vector2 = ctx.saved_variables
    grad_add_matrix = grad_vector1 = grad_vector2 = None

    if ctx.needs_input_grad[0]:
        grad_add_matrix = grad_output
        if ctx.alpha != 1:
            grad_add_matrix = grad_add_matrix.mul(ctx.alpha)

    if ctx.needs_input_grad[1]:
        grad_vector1 = torch.mv(grad_output, vector2)
        if ctx.beta != 1:
            grad_vector1 *= ctx.beta

    if ctx.needs_input_grad[2]:
        # TODO: maybe it's better to do transpose + mv + transpose
        grad_vector2 = torch.mm(vector1.unsqueeze(0), grad_output).squeeze(0)
        if ctx.beta != 1:
            grad_vector2 *= ctx.beta

    return grad_add_matrix, grad_vector1, grad_vector2, None, None, None
def updateOutput(self, input):
    M, v = input
    assert M.ndimension() == 2 or M.ndimension() == 3

    if M.ndimension() == 2:
        assert v.ndimension() == 1
        if self.trans:
            M = M.transpose(0, 1)
        self.output.resize_(M.size(0))
        torch.mv(M, v, out=self.output)
    else:
        assert v.ndimension() == 2
        if self.trans:
            M = M.transpose(1, 2)
        self.output.resize_(M.size(0), M.size(1), 1)
        torch.bmm(M, v.view(v.size(0), v.size(1), 1), out=self.output).resize_(M.size(0), M.size(1))

    return self.output
def dot_nd(query, candidates):
    """
    Perform a dot product between a query and n-dimensional candidates.

    Args:
        query (Variable): A vector to query, whose size is
            (query_dim,)
        candidates (Variable): A n-dimensional tensor to be multiplied
            by query, whose size is (d0, d1, ..., dn, query_dim)

    Returns:
        output: The result of the dot product, whose size is
            (d0, d1, ..., dn)
    """
    cands_size = candidates.size()
    cands_flat = candidates.view(-1, cands_size[-1])
    output_flat = torch.mv(cands_flat, query)
    output = output_flat.view(*cands_size[:-1])
    return output
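A hypothetical usage sketch of dot_nd above, with shapes chosen for illustration (the sizes follow directly from the docstring): flattening the candidates to 2-D lets a single torch.mv compute every dot product at once.

import torch
from torch.autograd import Variable

# candidates: (d0, d1, query_dim) = (2, 3, 4); query: (4,)
candidates = Variable(torch.randn(2, 3, 4))
query = Variable(torch.randn(4))

output = dot_nd(query, candidates)
print(output.size())  # torch.Size([2, 3])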
def backward(ctx, grad_output):
    matrix, vector = ctx.saved_variables
    grad_add_vector = grad_matrix = grad_vector = None

    if ctx.needs_input_grad[0]:
        grad_add_vector = maybe_unexpand(grad_output, ctx.add_vector_size)
        if ctx.alpha != 1:
            grad_add_vector = grad_add_vector.mul(ctx.alpha)

    if ctx.needs_input_grad[1]:
        grad_matrix = torch.ger(grad_output, vector)
        if ctx.beta != 1:
            grad_matrix *= ctx.beta

    if ctx.needs_input_grad[2]:
        grad_vector = torch.mv(matrix.t(), grad_output)
        if ctx.beta != 1:
            grad_vector *= ctx.beta

    return grad_add_vector, grad_matrix, grad_vector, None, None, None
def backward(ctx, grad_output):
    vector1, vector2 = ctx.saved_variables
    grad_add_matrix = grad_vector1 = grad_vector2 = None

    if ctx.needs_input_grad[0]:
        grad_add_matrix = maybe_unexpand(grad_output, ctx.add_matrix_size)
        if ctx.alpha != 1:
            grad_add_matrix = grad_add_matrix.mul(ctx.alpha)

    if ctx.needs_input_grad[1]:
        grad_vector1 = torch.mv(grad_output, vector2)
        if ctx.beta != 1:
            grad_vector1 *= ctx.beta

    if ctx.needs_input_grad[2]:
        # TODO: maybe it's better to do transpose + mv + transpose
        grad_vector2 = torch.mm(vector1.unsqueeze(0), grad_output).squeeze(0)
        if ctx.beta != 1:
            grad_vector2 *= ctx.beta

    return grad_add_matrix, grad_vector1, grad_vector2, None, None, None
def backward(self, grad_output):
    input, weight, bias = self.saved_tensors
    grad_input = grad_weight = grad_bias = None

    if self.needs_input_grad[0]:
        grad_output = grad_output.squeeze()
        grad_input = torch.mm(grad_output, weight)
    if self.needs_input_grad[1]:
        grad_weight = torch.mm(grad_output.t(), input)
    if bias is not None and self.needs_input_grad[2]:
        grad_bias = torch.mv(grad_output.t(), self.add_buffer)

    if bias is not None:
        return grad_input, grad_weight, grad_bias
    else:
        return grad_input, grad_weight
def __matmul__(self, other):
    dim_self = self.dim()
    dim_other = other.dim()
    # TODO: should this really be dot product?
    # if dim_self == 1 and dim_other == 1:
    #     return self.dot(other)
    if dim_self == 2 and dim_other == 1:
        return torch.mv(self, other)
    elif dim_self == 2 and dim_other == 2:
        return torch.mm(self, other)
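In current PyTorch releases the @ operator already dispatches this way, so for a 2-D matrix and a 1-D vector it is equivalent to calling torch.mv directly. A small sketch (not part of the snippet above):

import torch

mat = torch.randn(2, 10)
vec = torch.randn(10)

# mat @ vec and torch.mv(mat, vec) compute the same matrix-vector product.
assert torch.allclose(mat @ vec, torch.mv(mat, vec))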
def accGradParameters(self, input, gradOutput, scale=1):
    self.network.accGradParameters([input, self.partition], gradOutput, scale)
    if self.bias:
        self.buffer = self.buffer or input.new()
        self.buffer.resize_(gradOutput.size(1))
        torch.mv(self.buffer, gradOutput.t(), self.addBuffer).mul_(scale)
        self.gradBias.index_add_(
            1, self.partition, self.buffer.view(1, self.buffer.nelement())
        )
def test_mv(self):
    m1 = torch.randn(100, 100)
    v1 = torch.randn(100)

    res1 = torch.mv(m1, v1)
    res2 = res1.clone().zero_()
    for i, j in iter_indices(m1):
        res2[i] += m1[i][j] * v1[j]

    self.assertEqual(res1, res2)
def gmm_batch_model(data):
    p = pyro.param("p", Variable(torch.Tensor([0.3]), requires_grad=True))
    p = torch.cat([p, 1 - p])
    sigma = pyro.param("sigma", Variable(torch.Tensor([1.0]), requires_grad=True))
    mus = Variable(torch.Tensor([-1, 1]))
    with pyro.iarange("data", len(data)) as batch:
        n = len(batch)
        z = pyro.sample("z", dist.Categorical(p.unsqueeze(0).expand(n, 2)))
        assert z.size() == (n, 2)
        mu = torch.mv(z, mus)
        pyro.observe("x", dist.Normal(mu, sigma.expand(n)), data[batch])
def test_ip_forward():
    p_t, Q_t, G_t, A_t, z0_t, s0_t = [torch.Tensor(x) for x in [p, Q, G, A, z0, s0]]
    b = torch.mv(A_t, z0_t) if neq > 0 else None
    h = torch.mv(G_t, z0_t) + s0_t
    L_Q, L_S, R = aip.pre_factor_kkt(Q_t, G_t, A_t)
    zhat_ip, nu_ip, lam_ip = aip.forward_single(p_t, Q_t, G_t, A_t, b, h, L_Q, L_S, R)

    # Unnecessary clones here because of a pytorch bug when calling numpy
    # on a tensor with a non-zero offset.
    npt.assert_allclose(zhat, zhat_ip.clone().numpy(), rtol=RTOL, atol=ATOL)
    if neq > 0:
        npt.assert_allclose(nu, nu_ip.clone().numpy(), rtol=RTOL, atol=ATOL)
    npt.assert_allclose(lam, lam_ip.clone().numpy(), rtol=RTOL, atol=ATOL)
def prof_instance(nz, neq, nineq, nIter, cuda):
    L = np.tril(npr.uniform(0, 1, (nz, nz))) + np.eye(nz, nz)
    G = npr.randn(nineq, nz)
    A = npr.randn(neq, nz)
    z0 = npr.randn(nz)
    s0 = np.ones(nineq)
    p = npr.randn(nz)

    p, L, G, A, z0, s0 = [torch.Tensor(x) for x in [p, L, G, A, z0, s0]]
    Q = torch.mm(L, L.t()) + 0.001 * torch.eye(nz).type_as(L)
    if cuda:
        p, L, Q, G, A, z0, s0 = [x.cuda() for x in [p, L, Q, G, A, z0, s0]]

    af = adact.AdactFunction()

    start = time.time()
    # One-time cost for numpy conversion.
    p_np, L_np, G_np, A_np, z0_np, s0_np = [adact.toNp(v) for v in [p, L, G, A, z0, s0]]
    cp = time.time() - start
    for i in range(nIter):
        start = time.time()
        zhat, nu, lam = af.forward_single_np(p_np, L_np, G_np, A_np, z0_np, s0_np)
        cp += time.time() - start

    b = torch.mv(A, z0) if neq > 0 else None
    h = torch.mv(G, z0) + s0
    L_Q, L_S, R = aip.pre_factor_kkt(Q, G, A, nineq, neq)
    pdipm = []
    for i in range(nIter):
        start = time.time()
        zhat_ip, nu_ip, lam_ip = aip.forward_single(p, Q, G, A, b, h, L_Q, L_S, R)
        pdipm.append(time.time() - start)

    return cp, np.sum(pdipm)
def backward(ctx, grad_output):
    input, weight, bias = ctx.saved_variables
    grad_input = grad_weight = grad_bias = None

    if ctx.needs_input_grad[0]:
        grad_input = torch.mm(grad_output, weight)
    if ctx.needs_input_grad[1]:
        grad_weight = torch.mm(grad_output.t(), input)
    if bias is not None and ctx.needs_input_grad[2]:
        grad_bias = torch.mv(grad_output.t(), Variable(ctx.add_buffer))

    if bias is not None:
        return grad_input, grad_weight, grad_bias
    else:
        return grad_input, grad_weight
def accGradParameters(self, input, gradOutput, scale=1):
    self.network.accGradParameters([input, self.partition], gradOutput, scale)
    if self.bias is not None:
        if self.buffer is None:
            self.buffer = input.new()
        self.buffer.resize_(gradOutput.size(1))
        torch.mv(gradOutput.t(), self.addBuffer, out=self.buffer).mul_(scale)
        self.gradBias.index_add_(
            1, self.partition, self.buffer.view(1, self.buffer.nelement())
        )
def test_mv(self):
    m1 = torch.randn(100, 100)
    v1 = torch.randn(100)

    res1 = torch.mv(m1, v1)
    res2 = res1.clone().zero_()
    for i, j in iter_indices(m1):
        res2[i] += m1[i][j] * v1[j]

    self.assertEqual(res1, res2)
def test_functional_blas(self):
    def compare(fn, *args):
        unpacked_args = tuple(arg.data if isinstance(arg, Variable) else arg
                              for arg in args)
        self.assertEqual(fn(*args).data, fn(*unpacked_args))

    def test_blas_add(fn, x, y, z):
        # Checks all signatures
        compare(fn, x, y, z)
        compare(fn, 0.5, x, y, z)
        compare(fn, 0.5, x, 0.25, y, z)

    def test_blas(fn, x, y):
        compare(fn, x, y)

    test_blas(torch.mm, Variable(torch.randn(2, 10)),
              Variable(torch.randn(10, 4)))
    test_blas_add(torch.addmm, Variable(torch.randn(2, 4)),
                  Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4)))
    test_blas(torch.bmm, Variable(torch.randn(4, 2, 10)),
              Variable(torch.randn(4, 10, 4)))
    test_blas_add(torch.addbmm, Variable(torch.randn(2, 4)),
                  Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
    test_blas_add(torch.baddbmm, Variable(torch.randn(4, 2, 4)),
                  Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
    test_blas(torch.mv, Variable(torch.randn(2, 10)),
              Variable(torch.randn(10)))
    test_blas_add(torch.addmv, Variable(torch.randn(2)),
                  Variable(torch.randn(2, 10)), Variable(torch.randn(10)))
    test_blas(torch.ger, Variable(torch.randn(5)),
              Variable(torch.randn(6)))
    test_blas_add(torch.addr, Variable(torch.randn(5, 6)),
                  Variable(torch.randn(5)), Variable(torch.randn(6)))
def forward(self, h_temporal, h_spatials):
    '''
    Forward pass for the model
    params:
    h_temporal : Hidden state of the temporal edgeRNN
    h_spatials : Hidden states of all spatial edgeRNNs connected to the node.
    '''
    # Number of spatial edges
    num_edges = h_spatials.size()[0]

    # Embed the temporal edgeRNN hidden state
    temporal_embed = self.temporal_edge_layer(h_temporal)
    temporal_embed = temporal_embed.squeeze(0)

    # Embed the spatial edgeRNN hidden states
    spatial_embed = self.spatial_edge_layer(h_spatials)

    # Dot based attention
    attn = torch.mv(spatial_embed, temporal_embed)

    # Variable length
    temperature = num_edges / np.sqrt(self.attention_size)
    attn = torch.mul(attn, temperature)

    # Softmax
    attn = torch.nn.functional.softmax(attn)

    # Compute weighted value
    weighted_value = torch.mv(torch.t(h_spatials), attn)

    return weighted_value, attn
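The two torch.mv calls above implement dot-product attention over the spatial hidden states: the first computes one score per edge, the second takes the attention-weighted sum. A stripped-down, self-contained sketch of the same pattern, with made-up shapes and without the learned embedding layers:

import numpy as np
import torch
import torch.nn.functional as F

num_edges, hidden_size = 5, 16
h_spatials = torch.randn(num_edges, hidden_size)  # one row per spatial edge
h_temporal = torch.randn(hidden_size)             # query vector

# Dot-based attention scores: one score per spatial edge.
scores = torch.mv(h_spatials, h_temporal)
scores = scores * (num_edges / np.sqrt(hidden_size))
attn = F.softmax(scores, dim=0)

# Attention-weighted sum of the spatial hidden states, again via torch.mv.
weighted_value = torch.mv(h_spatials.t(), attn)   # shape (hidden_size,)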
def solve_kkt(U_Q, d, G, A, U_S, rx, rs, rz, ry, dbg=False):
    """ Solve KKT equations for the affine step"""
    nineq, nz, neq, _ = get_sizes(G, A)

    invQ_rx = torch.potrs(rx.view(-1, 1), U_Q).view(-1)
    if neq > 0:
        h = torch.cat([torch.mv(A, invQ_rx) - ry,
                       torch.mv(G, invQ_rx) + rs / d - rz], 0)
    else:
        h = torch.mv(G, invQ_rx) + rs / d - rz

    w = -torch.potrs(h.view(-1, 1), U_S).view(-1)

    g1 = -rx - torch.mv(G.t(), w[neq:])
    if neq > 0:
        g1 -= torch.mv(A.t(), w[:neq])
    g2 = -rs - w[neq:]

    dx = torch.potrs(g1.view(-1, 1), U_Q).view(-1)
    ds = g2 / d
    dz = w[neq:]
    dy = w[:neq] if neq > 0 else None

    # if np.all(np.array([x.norm() for x in [rx, rs, rz, ry]]) != 0):
    if dbg:
        import IPython
        import sys
        IPython.embed()
        sys.exit(-1)
    # if rs.norm() > 0: import IPython, sys; IPython.embed(); sys.exit(-1)

    return dx, ds, dz, dy
def factor_solve_kkt(Q, D, G, A, rx, rs, rz, ry):
    nineq, nz, neq, _ = get_sizes(G, A)

    if neq > 0:
        H_ = torch.cat([torch.cat([Q, torch.zeros(nz, nineq).type_as(Q)], 1),
                        torch.cat([torch.zeros(nineq, nz).type_as(Q), D], 1)], 0)
        A_ = torch.cat([torch.cat([G, torch.eye(nineq).type_as(Q)], 1),
                        torch.cat([A, torch.zeros(neq, nineq).type_as(Q)], 1)], 0)
        g_ = torch.cat([rx, rs], 0)
        h_ = torch.cat([rz, ry], 0)
    else:
        H_ = torch.cat([torch.cat([Q, torch.zeros(nz, nineq).type_as(Q)], 1),
                        torch.cat([torch.zeros(nineq, nz).type_as(Q), D], 1)], 0)
        A_ = torch.cat([G, torch.eye(nineq).type_as(Q)], 1)
        g_ = torch.cat([rx, rs], 0)
        h_ = rz

    U_H_ = torch.potrf(H_)

    invH_A_ = torch.potrs(A_.t(), U_H_)
    invH_g_ = torch.potrs(g_.view(-1, 1), U_H_).view(-1)

    S_ = torch.mm(A_, invH_A_)
    U_S_ = torch.potrf(S_)
    t_ = torch.mv(A_, invH_g_).view(-1, 1) - h_
    w_ = -torch.potrs(t_, U_S_).view(-1)
    v_ = torch.potrs(-g_.view(-1, 1) - torch.mv(A_.t(), w_), U_H_).view(-1)

    return v_[:nz], v_[nz:], w_[:nineq], w_[nineq:] if neq > 0 else None
def dot(x, y):
    def _dot(X):
        x, y = X
        x_ndim = ndim(x)
        y_ndim = ndim(y)
        if x_ndim == 2 and y_ndim == 2:
            return torch.mm(x, y)
        if x_ndim == 2 and y_ndim == 1:
            return torch.mv(x, y)
        if x_ndim == 1 and y_ndim == 2:
            return torch.mv(y, x)
        if x_ndim == 1 and y_ndim == 1:
            return torch.dot(x, y)
        else:
            raise Exception('Unsupported tensor ranks for dot operation : '
                            + str(x_ndim) + ' and ' + str(y_ndim) + '.')

    def _compute_output_shape(X):
        x, y = _get_shape(X[0]), _get_shape(X[1])
        x_ndim = len(x)
        y_ndim = len(y)
        if x_ndim == 2 and y_ndim == 2:
            return (x[0], y[1])
        if x_ndim == 2 and y_ndim == 1:
            return (x[0],)
        if x_ndim == 1 and y_ndim == 2:
            return (y[0],)
        if x_ndim == 1 and y_ndim == 1:
            return (0,)

    return get_op(_dot, output_shape=_compute_output_shape)([x, y])
def test_back():
    npr.seed(1)
    nBatch, nz, neq, nineq = 1, 10, 1, 3
    # nz, neq, nineq = 3,3,3
    L = np.tril(np.random.randn(nz, nz)) + 2. * np.eye(nz, nz)
    Q = L.dot(L.T) + 1e-4 * np.eye(nz)
    G = 100. * npr.randn(nineq, nz)
    A = 100. * npr.randn(neq, nz)
    z0 = 1. * npr.randn(nz)
    s0 = 100. * np.ones(nineq)
    s0[:nineq // 2] = 1e-6
    # print(np.linalg.norm(L))
    # print(np.linalg.norm(G))
    # print(np.linalg.norm(A))
    # print(np.linalg.norm(z0))
    # print(np.linalg.norm(s0))

    p = npr.randn(nBatch, nz)
    # print(np.linalg.norm(p))
    truez = npr.randn(nBatch, nz)

    af = adact.AdactFunction()

    zhat_0, nu_0, lam_0 = af.forward_single_np(p[0], L, G, A, z0, s0)
    dl_dzhat_0 = zhat_0 - truez[0]
    S = Solver(L, A, G, z0, s0, 1e-8)
    S.reinit(lam_0, zhat_0)
    dp_0, dL_0, dG_0, dA_0, dz0_0, ds0_0 = af.backward_single_np_solver(
        S, zhat_0, nu_0, lam_0, dl_dzhat_0, L, G, A, z0, s0)

    # zhat_1, nu_1, lam_1 = af.forward_single_np(p[1], L, G, A, z0, s0)
    # dl_dzhat_1 = zhat_1-truez[1]
    # S.reinit(lam_1, zhat_1)
    # dp_1, dL_1, dG_1, dA_1, dz0_1, ds0_1 = af.backward_single_np_solver(
    #     S, zhat_1, nu_1, lam_1, dl_dzhat_1, L, G, A, z0, s0)

    p, L, G, A, z0, s0, truez = [torch.DoubleTensor(x) for x in [p, L, G, A, z0, s0, truez]]
    Q = torch.mm(L, L.t()) + 0.001 * torch.eye(nz).type_as(L)
    if cuda:
        p, L, Q, G, A, z0, s0, truez = [x.cuda() for x in [p, L, Q, G, A, z0, s0, truez]]
    p, L, G, A, z0, s0 = [Variable(x) for x in [p, L, G, A, z0, s0]]
    for x in [p, L, G, A, z0, s0]:
        x.requires_grad = True

    # Q_LU, S_LU, R = aip.pre_factor_kkt_batch(Q, G, A, nBatch)
    # b = torch.mv(A, z0) if neq > 0 else None
    # h = torch.mv(G, z0)+s0
    # zhat_b, nu_b, lam_b = aip.forward_batch(p, Q, G, A, b, h, Q_LU, S_LU, R)

    zhats = af(p, L, G, A, z0, s0)
    dl_dzhat = zhats.data - truez
    zhats.backward(dl_dzhat)
    dp, dL, dG, dA, dz0, ds0 = [x.grad.clone() for x in [p, L, G, A, z0, s0]]
def prof_instance(nz, neq, nineq, nBatch, cuda):
    L = np.tril(npr.uniform(0, 1, (nz, nz))) + np.eye(nz, nz)
    G = npr.randn(nineq, nz)
    A = npr.randn(neq, nz)
    z0 = npr.randn(nz)
    s0 = np.ones(nineq)
    p = npr.randn(nBatch, nz)

    p, L, G, A, z0, s0 = [torch.Tensor(x) for x in [p, L, G, A, z0, s0]]
    Q = torch.mm(L, L.t()) + 0.001 * torch.eye(nz).type_as(L)
    if cuda:
        p, L, Q, G, A, z0, s0 = [x.cuda() for x in [p, L, Q, G, A, z0, s0]]
    b = torch.mv(A, z0) if neq > 0 else None
    h = torch.mv(G, z0) + s0

    af = adact.AdactFunction()

    single_results = []
    start = time.time()
    U_Q, U_S, R = aip.pre_factor_kkt(Q, G, A)
    for i in range(nBatch):
        single_results.append(aip.forward_single(p[i], Q, G, A, b, h, U_Q, U_S, R))
    single_time = time.time() - start

    start = time.time()
    Q_LU, S_LU, R = aip.pre_factor_kkt_batch(Q, G, A, nBatch)
    zhat_b, nu_b, lam_b = aip.forward_batch(p, Q, G, A, b, h, Q_LU, S_LU, R)
    batched_time = time.time() - start

    zhat_diff = (single_results[0][0] - zhat_b[0]).norm()
    lam_diff = (single_results[0][2] - lam_b[0]).norm()
    eps = 0.1  # Pretty relaxed.
    if zhat_diff > eps or lam_diff > eps:
        print('===========')
        print("Warning: Single and batched solutions might not match.")
        print("  + zhat_diff: {}".format(zhat_diff))
        print("  + lam_diff: {}".format(lam_diff))
        print("  + (nz, neq, nineq, nBatch) = ({}, {}, {}, {})".format(
            nz, neq, nineq, nBatch))
        print('===========')

    return single_time, batched_time
def test_functional_blas(self):
    def compare(fn, *args):
        unpacked_args = tuple(arg.data if isinstance(arg, Variable) else arg
                              for arg in args)
        unpacked_result = fn(*unpacked_args)
        packed_result = fn(*args).data
        # if non-Variable torch function returns a scalar, compare to scalar
        if not torch.is_tensor(unpacked_result):
            assert packed_result.dim() == 1
            assert packed_result.nelement() == 1
            packed_result = packed_result[0]
        self.assertEqual(packed_result, unpacked_result)

    def test_blas_add(fn, x, y, z):
        # Checks all signatures
        compare(fn, x, y, z)
        compare(fn, 0.5, x, y, z)
        compare(fn, 0.5, x, 0.25, y, z)

    def test_blas(fn, x, y):
        compare(fn, x, y)

    test_blas(torch.mm, Variable(torch.randn(2, 10)),
              Variable(torch.randn(10, 4)))
    test_blas_add(torch.addmm, Variable(torch.randn(2, 4)),
                  Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4)))
    test_blas(torch.bmm, Variable(torch.randn(4, 2, 10)),
              Variable(torch.randn(4, 10, 4)))
    test_blas_add(torch.addbmm, Variable(torch.randn(2, 4)),
                  Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
    test_blas_add(torch.baddbmm, Variable(torch.randn(4, 2, 4)),
                  Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
    test_blas(torch.mv, Variable(torch.randn(2, 10)),
              Variable(torch.randn(10)))
    test_blas_add(torch.addmv, Variable(torch.randn(2)),
                  Variable(torch.randn(2, 10)), Variable(torch.randn(10)))
    test_blas(torch.ger, Variable(torch.randn(5)),
              Variable(torch.randn(6)))
    test_blas_add(torch.addr, Variable(torch.randn(5, 6)),
                  Variable(torch.randn(5)), Variable(torch.randn(6)))
    test_blas(torch.matmul, Variable(torch.randn(6)), Variable(torch.randn(6)))
    test_blas(torch.matmul, Variable(torch.randn(10, 4)), Variable(torch.randn(4)))
    test_blas(torch.matmul, Variable(torch.randn(5)), Variable(torch.randn(5, 6)))
    test_blas(torch.matmul, Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4)))
    test_blas(torch.matmul, Variable(torch.randn(5, 2, 10)), Variable(torch.randn(5, 10, 4)))
    test_blas(torch.matmul, Variable(torch.randn(3, 5, 2, 10)), Variable(torch.randn(3, 5, 10, 4)))
    test_blas(torch.matmul, Variable(torch.randn(3, 5, 2, 10)), Variable(torch.randn(10)))
    test_blas(torch.matmul, Variable(torch.randn(10)), Variable(torch.randn(3, 5, 10, 4)))
def evaluate(data_source, batch_size=10, window=args.window):
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN':
        model.reset()
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    next_word_history = None
    pointer_history = None
    for i in range(0, data_source.size(0) - 1, args.bptt):
        if i > 0:
            print(i, len(data_source), math.exp(total_loss / i))
        data, targets = get_batch(data_source, i, evaluation=True, args=args)
        output, hidden, rnn_outs, _ = model(data, hidden, return_h=True)
        rnn_out = rnn_outs[-1].squeeze()
        output_flat = output.view(-1, ntokens)
        ###
        # Fill pointer history
        start_idx = len(next_word_history) if next_word_history is not None else 0
        next_word_history = torch.cat([one_hot(t.data[0], ntokens) for t in targets]) if next_word_history is None else torch.cat([next_word_history, torch.cat([one_hot(t.data[0], ntokens) for t in targets])])
        #print(next_word_history)
        pointer_history = Variable(rnn_out.data) if pointer_history is None else torch.cat([pointer_history, Variable(rnn_out.data)], dim=0)
        #print(pointer_history)
        ###
        # Built-in cross entropy
        # total_loss += len(data) * criterion(output_flat, targets).data[0]
        ###
        # Manual cross entropy
        # softmax_output_flat = torch.nn.functional.softmax(output_flat)
        # soft = torch.gather(softmax_output_flat, dim=1, index=targets.view(-1, 1))
        # entropy = -torch.log(soft)
        # total_loss += len(data) * entropy.mean().data[0]
        ###
        # Pointer manual cross entropy
        loss = 0
        softmax_output_flat = torch.nn.functional.softmax(output_flat)
        for idx, vocab_loss in enumerate(softmax_output_flat):
            p = vocab_loss
            if start_idx + idx > window:
                valid_next_word = next_word_history[start_idx + idx - window:start_idx + idx]
                valid_pointer_history = pointer_history[start_idx + idx - window:start_idx + idx]
                logits = torch.mv(valid_pointer_history, rnn_out[idx])
                theta = args.theta
                ptr_attn = torch.nn.functional.softmax(theta * logits).view(-1, 1)
                ptr_dist = (ptr_attn.expand_as(valid_next_word) * valid_next_word).sum(0).squeeze()
                lambdah = args.lambdasm
                p = lambdah * ptr_dist + (1 - lambdah) * vocab_loss
            ###
            target_loss = p[targets[idx].data]
            loss += (-torch.log(target_loss)).data[0]
        total_loss += loss / batch_size
        ###

        hidden = repackage_hidden(hidden)
        next_word_history = next_word_history[-window:]
        pointer_history = pointer_history[-window:]
    return total_loss / len(data_source)

# Load the best saved model.