def __init__(self, model_source="model", cuda=False):
    """Restore a trained ``BiLSTM_Cut`` model from a checkpoint, in eval mode.

    Args:
        model_source: Path or file object handed to ``torch.load``. The
            loaded checkpoint must provide the keys ``"src_dict"``,
            ``"trains_score"``, ``"settings"`` and ``"model"``.
        cuda: When true the model is moved to the GPU; otherwise the
            checkpoint is loaded with an identity ``map_location`` and the
            model is kept on the CPU.
    """
    self.cuda = cuda
    self.torch = torch if not cuda else torch.cuda

    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    if not self.cuda:
        # map_location returns every storage unchanged instead of moving it
        # back to the device it was saved from.
        checkpoint = torch.load(model_source, map_location=lambda storage, loc: storage)
    else:
        checkpoint = torch.load(model_source)

    self.src_dict = checkpoint["src_dict"]
    self.trains_score = checkpoint["trains_score"]
    settings = checkpoint["settings"]
    self.args = settings

    network = BiLSTM_Cut(settings)
    network.load_state_dict(checkpoint['model'])

    # The softmax projection is (re)attached after the weights are loaded.
    if self.cuda:
        network = network.cuda()
        network.prob_projection = nn.Softmax().cuda()
    else:
        network = network.cpu()
        network.prob_projection = nn.Softmax().cpu()

    self.model = network.eval()
def evaluate_performance(ladder, valid_loader, e, agg_cost_scaled, agg_supervised_cost_scaled,
                         agg_unsupervised_cost_scaled, args):
    """Print the epoch's aggregated costs together with the validation accuracy.

    Args:
        ladder: Model exposing ``forward_encoders_clean(data)``; its output is
            arg-maxed along axis 1 to obtain predictions.
        valid_loader: Iterable of ``(data, target)`` batches.
        e: Zero-based epoch index (printed as ``e + 1``).
        agg_cost_scaled: Total cost to report.
        agg_supervised_cost_scaled: Supervised cost to report.
        agg_unsupervised_cost_scaled: Unsupervised cost to report.
        args: Settings object; only ``args.cuda`` is read.

    Returns:
        The validation accuracy, or ``nan`` when ``valid_loader`` yields no
        samples (previously this case raised ``ZeroDivisionError``).
    """
    correct = 0.
    total = 0.
    for data, target in valid_loader:
        if args.cuda:
            data = data.cuda()
        data, target = Variable(data), Variable(target)
        output = ladder.forward_encoders_clean(data)
        # TODO: Do away with the below hack for GPU tensors.
        if args.cuda:
            output = output.cpu()
            target = target.cpu()
        output = output.data.numpy()
        preds = np.argmax(output, axis=1)
        target = target.data.numpy()
        correct += np.sum(target == preds)
        total += target.shape[0]

    # An empty loader leaves total at zero; report nan instead of crashing.
    accuracy = correct / total if total else float("nan")

    print("Epoch:", e + 1, "\t",
          "Total Cost:", "{:.4f}".format(agg_cost_scaled), "\t",
          "Supervised Cost:", "{:.4f}".format(agg_supervised_cost_scaled), "\t",
          "Unsupervised Cost:", "{:.4f}".format(agg_unsupervised_cost_scaled), "\t",
          "Validation Accuracy:", accuracy)
    return accuracy
def _generate_typedefs():
    """Return C ``typedef`` lines for every TH / THCuda tensor and storage struct.

    As a side effect, the module-level tables ``_cffi_to_torch`` and
    ``_torch_to_cffi`` are filled so that each ``struct <name>`` string maps
    to the matching ``torch`` / ``torch.cuda`` class and back.

    Returns:
        The typedef declarations joined by newlines, with a trailing newline.
    """
    declarations = []
    for scalar in ('Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte'):
        for backend in ('TH', 'THCuda'):
            owner = torch.cuda if backend != 'TH' else torch
            for suffix in ('Tensor', 'Storage'):
                if backend == 'THCuda' and scalar == 'Float':
                    # The CUDA float types carry no scalar name in their C identifier.
                    c_name = 'THCuda' + suffix
                else:
                    c_name = backend + scalar + suffix
                c_struct = 'struct ' + c_name
                declarations.append('typedef {} {};'.format(c_struct, c_name))

                torch_cls = getattr(owner, scalar + suffix)
                _cffi_to_torch[c_struct] = torch_cls
                _torch_to_cffi[torch_cls] = c_struct
    return '\n'.join(declarations) + '\n'
def _setup_wrapper(with_cuda):
    """Return the C wrapper preamble and include directories for an extension.

    Args:
        with_cuda: When true, the THC header and the CUDA include
            directories found on disk are added as well.

    Returns:
        A ``(wrapper_source, include_dirs)`` tuple: the ``#include`` lines as
        one string and the list of include directories, rooted at
        ``../../lib`` relative to this module's directory.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    lib_dir = os.path.join(module_dir, '..', '..', 'lib')
    include_dirs = [
        os.path.join(lib_dir, 'include'),
        os.path.join(lib_dir, 'include', 'TH'),
    ]
    wrapper_source = '#include <TH/TH.h>\n'
    if not with_cuda:
        return wrapper_source, include_dirs

    # The imported name is not used below; presumably the import is wanted
    # for its side effects -- TODO confirm.
    import torch.cuda
    wrapper_source = wrapper_source + '#include <THC/THC.h>\n'
    include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
    # Well-known CUDA toolkit locations; patterns that match nothing add nothing.
    for pattern in ('/usr/local/cuda/include', '/Developer/NVIDIA/CUDA-*/include'):
        include_dirs.extend(glob.glob(pattern))
    return wrapper_source, include_dirs
def test_gpu(self):
    """Build the ``gpulib`` extension with CUDA enabled and call into it.

    ``good_func`` is called with a CPU tensor and ``cuda_func`` with a CUDA
    tensor; both are expected to leave ``2 + 1.5`` in every element.
    ``cuda_func`` must raise ``TypeError`` for a CPU tensor and for a CUDA
    storage.
    """
    compile_extension(
        name='gpulib',
        header=test_dir + '/ffi/src/cuda/cudalib.h',
        sources=[test_dir + '/ffi/src/cuda/cudalib.c'],
        with_cuda=True,
        verbose=False,
    )
    # Importable only once the extension has been compiled above.
    import gpulib

    cpu_tensor = torch.ones(2, 2).float()
    gpulib.good_func(cpu_tensor, 2, 1.5)
    self.assertEqual(cpu_tensor, torch.ones(2, 2) * 2 + 1.5)

    gpu_tensor = cpu_tensor.cuda().fill_(1)
    gpulib.cuda_func(gpu_tensor, 2, 1.5)
    self.assertEqual(gpu_tensor, torch.ones(2, 2) * 2 + 1.5)

    with self.assertRaises(TypeError):
        gpulib.cuda_func(cpu_tensor, 2, 1.5)
    with self.assertRaises(TypeError):
        gpulib.cuda_func(gpu_tensor.storage(), 2, 1.5)
def test_serialization(self):
    """Round-trip CUDA tensors and a CUDA storage through save / load.

    Verifies that the loaded list equals the original, that the two entries
    which were the same tensor still share memory, that the CUDA types are
    preserved, and that the loaded storage reflects writes made through the
    loaded tensor it was saved next to.
    """
    x = torch.randn(5, 5).cuda()
    y = torch.IntTensor(2, 5).fill_(0).cuda()
    # x is listed twice and y's storage is listed next to y.
    q = [x, y, x, y.storage()]
    with tempfile.NamedTemporaryFile() as f:
        torch.save(q, f)
        f.seek(0)
        q_copy = torch.load(f)
    self.assertEqual(q_copy, q, 0)

    # A write through the first copy must be visible through the third.
    q_copy[0].fill_(5)
    self.assertEqual(q_copy[0], q_copy[2], 0)

    self.assertIsInstance(q_copy[0], torch.cuda.DoubleTensor)
    self.assertIsInstance(q_copy[1], torch.cuda.IntTensor)
    self.assertIsInstance(q_copy[2], torch.cuda.DoubleTensor)
    self.assertIsInstance(q_copy[3], torch.cuda.IntStorage)

    q_copy[1].fill_(10)
    # Was assertTrue(a, b), which only treats b as the failure message and
    # therefore compared nothing.
    self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
def _test_gather(self, dim):
    """Gather one tensor from each of two GPUs along ``dim`` and verify the result.

    Args:
        dim: Dimension (0 or 1) along which the two ``2 x 5`` tensors are
            concatenated.

    Raises:
        unittest.SkipTest: If fewer than two CUDA devices are present.
    """
    if torch.cuda.device_count() < 2:
        raise unittest.SkipTest("only one GPU detected")

    first = torch.randn(2, 5).cuda(0)
    second = torch.randn(2, 5).cuda(1)
    gathered = comm.gather((first, second), dim)

    want_size = list(first.size())
    want_size[dim] += second.size(dim)
    want_size = torch.Size(want_size)
    self.assertEqual(gathered.get_device(), 0)
    self.assertEqual(gathered.size(), want_size)

    # The leading slice along ``dim`` must be the first input, the rest the second.
    selector = [slice(None, None), slice(None, None)]
    selector[dim] = slice(0, first.size(dim))
    self.assertEqual(gathered[tuple(selector)], first)
    selector[dim] = slice(first.size(dim), first.size(dim) + second.size(dim))
    self.assertEqual(gathered[tuple(selector)], second)
def test_cuda(self, test_case):
    """Compare the criterion's forward and backward results on CPU and GPU.

    Args:
        test_case: Test case providing ``_forward_criterion``,
            ``_backward_criterion`` and ``assertEqual``.

    Raises:
        unittest.SkipTest: If CUDA testing is disabled globally or for this
            test. A ``NotImplementedError`` raised anywhere in the comparison
            is swallowed and the check silently ends.
    """
    if not (TEST_CUDA and self.should_test_cuda):
        raise unittest.SkipTest('Excluded from CUDA tests')
    try:
        cpu_input = self._get_input()
        # Presumably tells to_gpu to turn double tensors into CUDA float
        # tensors -- confirm against to_gpu.
        type_map = {torch.DoubleTensor: torch.cuda.FloatTensor}
        gpu_input = to_gpu(cpu_input, type_map=type_map)

        cpu_target = self.target
        gpu_target = to_gpu(self.target, type_map=type_map)

        cpu_module = self.constructor(*self.constructor_args)
        gpu_module = self.constructor(*self.constructor_args).float().cuda()

        # Forward pass first, then backward; each pair must agree to 2e-4.
        for step in ('_forward_criterion', '_backward_criterion'):
            run = getattr(test_case, step)
            on_cpu = run(cpu_module, cpu_input, cpu_target)
            on_gpu = run(gpu_module, gpu_input, gpu_target)
            test_case.assertEqual(on_cpu, on_gpu, 2e-4)
    except NotImplementedError:
        pass
def test_print(self):
    """Smoke-test ``repr`` and ``str`` for every tensor and storage class.

    CUDA classes are skipped when CUDA is unavailable. A tensor holding
    ``inf``, ``-inf`` and ``nan`` is formatted as well. Nothing is asserted;
    the test passes if no call raises.
    """
    for tensor_cls in torch._tensor_classes:
        if not tensor_cls.is_cuda or torch.cuda.is_available():
            filled = tensor_cls(100, 100).fill_(1)
            repr(filled)
            str(filled)

    for storage_cls in torch._storage_classes:
        if not storage_cls.is_cuda or torch.cuda.is_available():
            filled = storage_cls(100).fill_(1)
            repr(filled)
            str(filled)

    special = torch.Tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
    repr(special)
    str(special)
def test_reduce_scatter(self):
    """Check ``nccl.reduce_scatter`` against a sum computed on the CPU.

    Every GPU contributes ``32 * nGPUs`` random floats; GPU ``i`` must end up
    with the ``i``-th 32-element slice of the element-wise sum.
    """
    out_size = 32
    in_size = 32 * nGPUs

    host_inputs = [torch.FloatTensor(in_size).uniform_() for _ in range(nGPUs)]
    total = torch.FloatTensor(in_size).zero_()
    for chunk in host_inputs:
        total.add_(chunk)
    expected = total.view(nGPUs, 32)

    inputs = [chunk.cuda(gpu) for gpu, chunk in enumerate(host_inputs)]
    outputs = [torch.cuda.FloatTensor(out_size, device=gpu) for gpu in range(nGPUs)]
    nccl.reduce_scatter(inputs, outputs)

    for gpu in range(nGPUs):
        self.assertEqual(outputs[gpu], expected[gpu])
def trainepoch(self, X, y, nepoches=1):
    """Train ``self.model`` for ``nepoches`` passes over ``(X, y)``.

    Each pass draws a fresh random permutation and walks it in chunks of
    ``self.batch_size``. ``self.nepoch`` is advanced by ``nepoches`` at the end.

    Args:
        X: Inputs, indexed along dimension 0 with ``index_select``.
        y: Targets, indexed the same way as ``X``.
        nepoches: Number of passes to run.
    """
    self.model.train()
    first_epoch = self.nepoch
    for _ in range(first_epoch, first_epoch + nepoches):
        order = np.random.permutation(len(X))
        all_costs = []
        for start in range(0, len(X), self.batch_size):
            # forward
            # NOTE(review): the index tensor is always moved to the GPU, so
            # X and y are presumably CUDA tensors -- confirm.
            batch_ids = order[start:start + self.batch_size]
            idx = torch.from_numpy(batch_ids).long().cuda()
            Xbatch = Variable(X.index_select(0, idx))
            ybatch = Variable(y.index_select(0, idx))
            output = self.model(Xbatch)
            # loss
            loss = self.loss_fn(output, ybatch)
            all_costs.append(loss.data[0])
            # backward
            self.optimizer.zero_grad()
            loss.backward()
            # Update parameters
            self.optimizer.step()
    self.nepoch += nepoches
def __init__(self, model_source, cuda=False, beam_size=3):
    """Restore a trained ``Transformer`` from a checkpoint, in eval mode.

    Args:
        model_source: Path or file object handed to ``torch.load``. The
            loaded checkpoint must provide the keys ``"src_dict"``,
            ``"tgt_dict"``, ``"settings"`` and ``"model"``.
        cuda: When true the model is moved to the GPU; otherwise the
            checkpoint is loaded with an identity ``map_location`` and the
            model is kept on the CPU.
        beam_size: Stored on the instance as ``self.beam_size``.
    """
    self.cuda = cuda
    self.torch = torch if not cuda else torch.cuda
    self.beam_size = beam_size

    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    if not self.cuda:
        checkpoint = torch.load(model_source, map_location=lambda storage, loc: storage)
    else:
        checkpoint = torch.load(model_source)

    self.src_dict = checkpoint["src_dict"]
    self.tgt_dict = checkpoint["tgt_dict"]
    # NOTE(review): despite the "src" in its name, this inverse table is
    # built from the *target* dictionary -- confirm that this is intended.
    self.src_idx2word = {index: word for word, index in checkpoint["tgt_dict"].items()}

    settings = checkpoint["settings"]
    self.args = settings

    network = Transformer(settings)
    network.load_state_dict(checkpoint['model'])
    network = network.cuda() if self.cuda else network.cpu()
    self.model = network.eval()
def sent2tenosr(self, sentence):
    """Convert a raw sentence into index and position tensors of batch size 1.

    The sentence is normalised, split on whitespace, truncated to
    ``self.args.max_word_len - 2`` words and wrapped in the BOS / EOS
    markers. Words missing from ``self.src_dict`` become ``UNK``.

    Args:
        sentence: The raw input string.

    Returns:
        ``(idx_data_tensor, idx_position_tensor)``: two volatile ``Variable``
        objects of shape ``(1, len)``, moved to the GPU when ``self.cuda`` is
        set. Positions count from 1; ``PAD`` entries get position 0.
    """
    limit = self.args.max_word_len - 2
    tokens = normalizeString(sentence).strip().split()
    if len(tokens) > limit:
        tokens = tokens[:limit]
    tokens = [WORD[BOS]] + tokens + [WORD[EOS]]

    indices = [self.src_dict[tok] if tok in self.src_dict else UNK for tok in tokens]
    positions = [pos if index != PAD else 0 for pos, index in enumerate(indices, 1)]

    idx_data_tensor = Variable(torch.LongTensor(indices).unsqueeze(0), volatile=True)
    idx_position_tensor = Variable(torch.LongTensor(positions).unsqueeze(0), volatile=True)

    if not self.cuda:
        return idx_data_tensor, idx_position_tensor
    return idx_data_tensor.cuda(), idx_position_tensor.cuda()
def __init__(self, model_source, cuda=False, beam_size=3):
    """Restore a trained ``CNN_Ranking`` model from a checkpoint, in eval mode.

    Also sets up the ``Jieba`` segmenter and the stop-word filter from the
    ``./segmenter_dicts`` directory.

    Args:
        model_source: Path or file object handed to ``torch.load``. The
            loaded checkpoint must provide the keys ``"src_dict"``,
            ``"tgt_dict"``, ``"settings"`` and ``"model"``.
        cuda: When true the model is moved to the GPU; otherwise the
            checkpoint is loaded onto the CPU and the model stays there.
        beam_size: Accepted but not used by this constructor.
    """
    self.torch = torch.cuda if cuda else torch
    self.cuda = cuda

    self.jb = Jieba("./segmenter_dicts", useSynonym=True, HMM=False)
    self.swf = StopwordFilter("./segmenter_dicts/stopwords.txt")

    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    if self.cuda:
        model_source = torch.load(model_source)
    else:
        # Without map_location, a checkpoint saved from the GPU cannot be
        # loaded on a CPU-only host; keep every storage on the CPU instead.
        model_source = torch.load(model_source, map_location=lambda storage, loc: storage)

    self.src_dict = model_source["src_dict"]
    self.tgt_dict = model_source["tgt_dict"]
    # NOTE(review): despite the "src" in its name, this inverse table is
    # built from the *target* dictionary -- confirm that this is intended.
    self.src_idx2ind = {v: k for k, v in model_source["tgt_dict"].items()}

    self.args = args = model_source["settings"]
    model = CNN_Ranking(args)
    model.load_state_dict(model_source['model'])

    if self.cuda:
        model = model.cuda()
    else:
        model = model.cpu()
    self.model = model.eval()
def __init__(self, model_source, cuda=False):
    """Restore a trained ``BiLSTM_CRF_Size`` model from a checkpoint, in eval mode.

    Args:
        model_source: Path or file object handed to ``torch.load``. The
            loaded checkpoint must provide the keys ``"src_dict"``,
            ``"trains_score"``, ``"settings"`` and ``"model"``.
        cuda: When true the model is moved to the GPU; otherwise the
            checkpoint is loaded with an identity ``map_location`` and the
            model is kept on the CPU.
    """
    self.cuda = cuda
    self.torch = torch if not cuda else torch.cuda

    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    if not self.cuda:
        checkpoint = torch.load(model_source, map_location=lambda storage, loc: storage)
    else:
        checkpoint = torch.load(model_source)

    self.src_dict = checkpoint["src_dict"]
    self.trains_score = checkpoint["trains_score"]
    settings = checkpoint["settings"]
    self.args = settings

    network = BiLSTM_CRF_Size(settings)
    network.load_state_dict(checkpoint['model'])

    # The softmax projection is (re)attached after the weights are loaded.
    if self.cuda:
        network = network.cuda()
        network.prob_projection = nn.Softmax().cuda()
    else:
        network = network.cpu()
        network.prob_projection = nn.Softmax().cpu()

    self.model = network.eval()
def __init__(self, size, cuda=False):
    """Create an empty beam holding ``size`` hypotheses.

    Args:
        size: Number of hypotheses kept on the beam.
        cuda: When true, tensors are allocated through ``torch.cuda``.
    """
    self.size = size
    self.done = False
    # Namespace the tensor constructors are taken from.
    self.tt = torch if not cuda else torch.cuda

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()
    self.all_scores = []

    # The backpointers at each time-step.
    self.prev_ks = []

    # The outputs at each time-step: all PAD, except BOS in the first slot.
    first_step = self.tt.LongTensor(size).fill_(Constants.PAD)
    first_step[0] = Constants.BOS
    self.next_ys = [first_step]
def __init__(self, args, model, criterion, device_ids=None, multiprocessing_method='spawn'):
    """Set up one replica per device and initialise each asynchronously.

    Args:
        args: Settings forwarded to every replica's ``_async_init``.
        model: Model to train; moved to shared memory before it is handed
            to the replicas.
        criterion: Loss criterion, kept on ``self`` and forwarded to the
            replicas.
        device_ids: Devices to use; defaults to every visible CUDA device.
        multiprocessing_method: Forwarded to the base class constructor.

    Raises:
        NotImplementedError: If CUDA is not available.
    """
    # Fail fast: previously this check only ran after the base class had
    # already been initialised with the device ids.
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')

    if device_ids is None:
        device_ids = tuple(range(torch.cuda.device_count()))
    super().__init__(device_ids, multiprocessing_method)

    model = model.share_memory()
    nccl_uid = nccl.get_unique_id()
    self.criterion = criterion

    # Kick off the per-replica initialisation and collect the results.
    Future.gen_list([
        self.call_async(rank, '_async_init', args=args, model=model,
                        criterion=criterion, nccl_uid=nccl_uid)
        for rank in range(self.num_replicas)
    ])

    self._grads_initialized = False
def _async_init(self, rank, device_id, args, model, criterion, nccl_uid):
    """Initialize child processes.

    Selects the CUDA device, initialises NCCL, moves the model and criterion
    to the device and builds the optimizer and LR scheduler.

    Note:
        ``args.lr`` is replaced in place: the comma-separated string becomes
        a list of floats.
    """
    self.args = args

    # set CUDA device
    torch.cuda.set_device(device_id)

    # initialize NCCL
    nccl.initialize(self.num_replicas, nccl_uid, device_id)

    # copy model and criterion to current device
    self.model = model.cuda()
    self.criterion = criterion.cuda()

    # initialize optimizer and LR scheduler
    self.args.lr = [float(rate) for rate in self.args.lr.split(',')]
    self.optimizer = self._build_optimizer()
    self.lr_scheduler = self._build_lr_scheduler()

    self.loss = None
    self._max_bsz_seen = 0
def _async_forward(self, rank, device_id, eval=False):
    """Run the criterion on ``self._sample`` and report the outcome.

    Args:
        rank: Unused here.
        device_id: Only used in the out-of-memory warning.
        eval: When true the model is put in eval mode; otherwise it is put
            in train mode and the optimizer's gradients are zeroed.

    Returns:
        ``(sample_size, logging_output, oom)``. Without a sample this is
        ``(0, {}, False)``; ``oom`` is true when a training batch was
        skipped because of an out-of-memory ``RuntimeError``.

    Raises:
        RuntimeError: Any runtime error other than out-of-memory during
            training is re-raised.
    """
    if not eval:
        self.model.train()
        self.optimizer.zero_grad()
    else:
        self.model.eval()

    sample_size, logging_output, oom = 0, {}, False
    if self._sample is None:
        return sample_size, logging_output, oom

    try:
        # calculate loss and sample size
        self.loss, sample_size, logging_output = self.criterion(self.model, self._sample)
    except RuntimeError as e:
        if eval or 'out of memory' not in str(e):
            raise e
        print('| WARNING: ran out of memory on GPU #{}, skipping batch'.format(device_id))
        oom = True
        self.loss = None
        # empty_cache is only called when this torch build provides it.
        if hasattr(torch.cuda, 'empty_cache'):
            torch.cuda.empty_cache()
    return sample_size, logging_output, oom
def __init__(self, size, cuda=False):
    """Create an empty beam holding ``size`` hypotheses.

    Args:
        size: Number of hypotheses kept on the beam.
        cuda: When true, tensors are allocated through ``torch.cuda``.
    """
    self.size = size
    self.done = False
    # Namespace the tensor constructors are taken from.
    self.tt = torch if not cuda else torch.cuda

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()
    self.allScores = []

    # The backpointers at each time-step.
    self.prevKs = []

    # The outputs at each time-step: all PAD, except BOS in the first slot.
    first_step = self.tt.LongTensor(size).fill_(onmt.Constants.PAD)
    first_step[0] = onmt.Constants.BOS
    self.nextYs = [first_step]

    # The attentions (matrix) for each time.
    self.attn = []
def buildData(self, srcBatch, goldBatch):
    """Wrap a source batch (and optional gold targets) in an ``onmt.Dataset``.

    Args:
        srcBatch: Source items. For ``"text"`` data each item is converted
            to indices with the source dictionary; for ``"img"`` data the
            first element of each item is a file name relative to
            ``self.opt.src_img_dir``.
        goldBatch: Gold target sequences, or a falsy value when there are none.

    Returns:
        An ``onmt.Dataset`` built with ``volatile=True``.

    Raises:
        ValueError: If ``self._type`` is neither ``"text"`` nor ``"img"``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # This needs to be the same as preprocess.py.
    if self._type == "text":
        srcData = [self.src_dict.convertToIdx(b, onmt.Constants.UNK_WORD)
                   for b in srcBatch]
    elif self._type == "img":
        to_tensor = transforms.ToTensor()
        srcData = []
        for b in srcBatch:
            # Close each image file once its pixels are copied into a tensor.
            with Image.open(self.opt.src_img_dir + "/" + b[0]) as image:
                srcData.append(to_tensor(image))
    else:
        raise ValueError("unsupported data type: {!r}".format(self._type))

    tgtData = None
    if goldBatch:
        tgtData = [self.tgt_dict.convertToIdx(b,
                                              onmt.Constants.UNK_WORD,
                                              onmt.Constants.BOS_WORD,
                                              onmt.Constants.EOS_WORD)
                   for b in goldBatch]

    return onmt.Dataset(srcData, tgtData, self.opt.batch_size,
                        self.opt.cuda, volatile=True,
                        data_type=self._type)
def __init__(self, X, Y, hidden_layer_sizes):
    """Build a linear layer fitted by least squares plus a non-linear network.

    Args:
        X: 2-D NumPy array of inputs, one row per sample.
        Y: 2-D NumPy array of targets, one row per sample.
        hidden_layer_sizes: Widths of the hidden layers. Any sequence is
            accepted; an empty one yields a network made of the final
            linear layer only (this used to raise ``TypeError``).
    """
    super(Net, self).__init__()

    # Initialize linear layer with least squares solution
    # (normal equations on X extended by a column of ones for the bias).
    X_ = np.hstack([X, np.ones((X.shape[0], 1))])
    Theta = np.linalg.solve(X_.T.dot(X_), X_.T.dot(Y))

    self.lin = nn.Linear(X.shape[1], Y.shape[1])
    W, b = self.lin.parameters()
    # The last row of Theta belongs to the ones column, i.e. the bias.
    W.data = torch.Tensor(Theta[:-1, :].T)
    b.data = torch.Tensor(Theta[-1, :])

    # Set up non-linear network of
    # Linear -> BatchNorm -> ReLU -> Dropout layers
    layer_sizes = [X.shape[1]] + list(hidden_layer_sizes)
    layers = []
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        layers += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out),
                   nn.ReLU(), nn.Dropout(p=0.2)]
    layers.append(nn.Linear(layer_sizes[-1], Y.shape[1]))
    self.net = nn.Sequential(*layers)

    # NOTE(review): this parameter is always created on the GPU.
    self.sig = Parameter(torch.ones(1, Y.shape[1]).cuda())
def __init__(self, params, eps=1e-2):
    """Precompute the constant matrices and vectors, all placed on the GPU.

    Args:
        params: Mapping with the scalar entries ``c_quad``, ``b_quad``,
            ``h_quad``, ``c_lin``, ``b_lin``, ``h_lin`` and the NumPy vector
            ``d``, whose length ``k`` fixes the problem size ``1 + 2*k``.
        eps: Scale of the identity matrix stored in ``self.eps_eye``.
    """
    super(SolveNewsvendor, self).__init__()
    k = len(params['d'])
    n = 1 + 2*k

    # Diagonal matrix of quadratic coefficients: one c, then k b's, then k h's.
    quad_coeffs = [params['c_quad']] + [params['b_quad']]*k + [params['h_quad']]*k
    self.Q = Variable(torch.diag(torch.Tensor(quad_coeffs)).cuda())

    # Vector of linear coefficients, laid out the same way.
    lin_coeffs = [params['c_lin']] + [params['b_lin']]*k + [params['h_lin']]*k
    self.p = Variable(torch.Tensor(lin_coeffs).cuda())

    # G stacks three row blocks: two k x n blocks and a negated n x n identity.
    top_rows = torch.cat([-torch.ones(k,1), -torch.eye(k), torch.zeros(k,k)], 1)
    mid_rows = torch.cat([torch.ones(k,1), torch.zeros(k,k), -torch.eye(k)], 1)
    bottom_rows = -torch.eye(n)
    self.G = Variable(torch.cat([top_rows, mid_rows, bottom_rows], 0).cuda())

    # h lines up with the rows of G: -d, d, then n zeros.
    rhs = np.concatenate([-params['d'], params['d'], np.zeros(n)])
    self.h = Variable(torch.Tensor(rhs).cuda())

    self.one = Variable(torch.Tensor([1])).cuda()
    self.eps_eye = eps * Variable(torch.eye(n).cuda()).unsqueeze(0)
def forward(self, y):
    """Solve one QP per row of ``y`` and return the first solution component.

    Args:
        y: 2-D tensor; each row scales the stored ``Q`` and ``p`` for one
            batch element.

    Returns:
        The first column of the ``QPFunction`` output, as a float tensor
        with one row per batch element.
    """
    nBatch, _ = y.size()

    # Per-sample diagonal scaling: 1 for the first entry, then y twice.
    diagonals = []
    for i in range(nBatch):
        scale_vec = torch.cat([self.one, y[i], y[i]])
        diagonals.append(torch.diag(scale_vec).unsqueeze(0))
    Q_scale = torch.cat(diagonals, 0)
    Q = self.Q.unsqueeze(0).expand_as(Q_scale).mul(Q_scale)

    ones_col = Variable(torch.ones(nBatch,1).cuda())
    p_scale = torch.cat([ones_col, y, y], 1)
    p = self.p.unsqueeze(0).expand_as(p_scale).mul(p_scale)

    # G and h are the same for every batch element.
    G = self.G.unsqueeze(0).expand(nBatch, self.G.size(0), self.G.size(1))
    h = self.h.unsqueeze(0).expand(nBatch, self.h.size(0))

    # An empty tensor is passed for the last two QPFunction arguments.
    e = Variable(torch.Tensor().cuda()).double()

    solver = QPFunction(verbose=False)
    out = solver(Q.double(), p.double(), G.double(), h.double(), e, e).float()
    return out[:,:1]
def __init__(self, size, cuda=False):
    """Create an empty beam holding ``size`` hypotheses.

    Args:
        size: Number of hypotheses kept on the beam.
        cuda: When true, tensors are allocated through ``torch.cuda``.
    """
    self.size = size
    self.done = False
    # Namespace the tensor constructors are taken from.
    self.tt = torch if not cuda else torch.cuda

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()

    # The backpointers at each time-step.
    self.prevKs = []

    # The outputs at each time-step: all PAD, except BOS in the first slot.
    first_step = self.tt.LongTensor(size).fill_(onmt.Constants.PAD)
    first_step[0] = onmt.Constants.BOS
    self.nextYs = [first_step]

    # The attentions (matrix) for each time.
    self.attn = []

# Get the outputs for the current timestep.
def test_cuda_small_tensors(self):
    """Send five small CUDA tensors to a spawned process and check its report.

    For every tensor the child (``sum_tensors``) is expected to report the
    sum, device 0, a tensor size of 5 and a storage size of 5.
    """
    # Check multiple small tensors which will likely use the same
    # underlying cached allocation
    ctx = mp.get_context('spawn')
    tensors = [torch.arange(i * 5, (i + 1) * 5).cuda() for i in range(5)]

    inq = ctx.Queue()
    outq = ctx.Queue()
    inq.put(tensors)
    worker = ctx.Process(target=sum_tensors, args=(inq, outq))
    worker.start()

    # One result per tensor, read before the child is joined.
    results = [outq.get() for _ in range(5)]
    worker.join()

    for i, (v, device, tensor_size, storage_size) in enumerate(results):
        self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
        self.assertEqual(device, 0)
        self.assertEqual(tensor_size, 5)
        self.assertEqual(storage_size, 5)
def test_event(self):
    """Share a CUDA event and tensor with a spawned process and check ordering.

    The parent adds 1 to a tensor of ones on a side stream, records an
    interprocess event and sends both to the child (``cuda_multiply_two``).
    The final value ``[4, 4, 4, 4]`` is only reached if the child's work
    runs after the parent's add, i.e. ``(1 + 1) * 2``.
    """
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    ready = ctx.Event()
    done = ctx.Event()
    p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
    p.start()

    # Block until the child signals that it is ready.
    ready.wait()
    with torch.cuda.stream(torch.cuda.Stream()):
        tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
        # Use a sleep kernel to test events. Without the event, the
        # multiply happens before the add.
        event = torch.cuda.Event(interprocess=True)
        torch.cuda._sleep(20000000)  # about 30 ms
        tensor.add_(1)
        event.record()
        # Hand the recorded event and the tensor to the child.
        queue.put((event, tensor))
        done.wait()  # must wait until subprocess records event
        event.synchronize()
        self.assertEqual(list(tensor), [4, 4, 4, 4])
    p.join()
def test_copy_device(self):
    """Check which device ``.cuda()`` copies land on, and when no copy is made.

    Run once for a tensor that starts on the GPU and once for a CPU tensor.
    """
    def check(source):
        # Inside the context, device 1 is the current device.
        with torch.cuda.device(1):
            on_current = source.cuda()
            self.assertEqual(on_current.get_device(), 1)
            # Already on the requested device: the same object comes back.
            self.assertIs(on_current.cuda(), on_current)
            on_zero = on_current.cuda(0)
            self.assertEqual(on_zero.get_device(), 0)
            self.assertIs(on_zero.cuda(0), on_zero)

    check(torch.randn(5, 5).cuda())
    check(torch.randn(5, 5))
def test_broadcast_coalesced(self):
    """Check that coalesced broadcast matches broadcasting tensor by tensor.

    A mix of long, float and int CUDA tensors is broadcast to devices 0 and
    1, first one at a time and then with ``comm.broadcast_coalesced``; both
    must agree in value, device and type.
    """
    numel = 5
    num_bytes = numel * 8
    tensors = [
        torch.randn(numel).long().cuda(),
        torch.randn(numel).cuda(),
        torch.randn(numel).long().cuda(),
        torch.randn(numel).long().cuda(),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        torch.randn(numel).cuda(),
    ]

    # Reference: broadcast every tensor on its own.
    b_tensors = [comm.broadcast(source, (0, 1)) for source in tensors]
    for (_, on_dev1), source in zip(b_tensors, tensors):
        self.assertEqual(on_dev1.get_device(), 1)
        self.assertEqual(on_dev1, source)
        self.assertIsInstance(on_dev1, type(source))

    buffer_size = num_bytes * 5 // 2
    bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=buffer_size)
    # Regroup the per-device result lists into per-tensor tuples.
    bc_tensors_t = list(zip(*bc_tensors))
    self.assertEqual(b_tensors, bc_tensors_t)
    for (_, plain), (_, coalesced) in zip(b_tensors, bc_tensors_t):
        self.assertEqual(plain.get_device(), coalesced.get_device())
        self.assertIsInstance(coalesced, type(plain))
def test_streams(self):
    """Check current-stream tracking, stream switching and stream queries."""
    default_stream = torch.cuda.current_stream()
    user_stream = torch.cuda.Stream()
    self.assertEqual(torch.cuda.current_stream(), default_stream)
    self.assertNotEqual(default_stream, user_stream)
    self.assertEqual(default_stream.cuda_stream, 0)
    self.assertNotEqual(user_stream.cuda_stream, 0)
    with torch.cuda.stream(user_stream):
        self.assertEqual(torch.cuda.current_stream(), user_stream)
    # Nothing was queued on the user stream, so it reports as complete.
    self.assertTrue(user_stream.query())
    # copy 10 MB tensor from CPU-GPU which should take some time
    tensor1 = torch.ByteTensor(10000000).pin_memory()
    # ``async`` is a reserved word since Python 3.7, so the old ``async=True``
    # spelling no longer parses; ``non_blocking`` replaces it.
    # The name keeps the destination tensor referenced for the rest of the test.
    tensor2 = tensor1.cuda(non_blocking=True)
    self.assertFalse(default_stream.query())
    default_stream.synchronize()
    self.assertTrue(default_stream.query())
def test_caching_pinned_memory(self):
    """Check when the pinned-memory allocator re-uses a freed allocation."""
    cycles_per_ms = get_cycles_per_ms()

    # check that allocations are re-used after deletion
    t = torch.FloatTensor([1]).pin_memory()
    ptr = t.data_ptr()
    del t
    t = torch.FloatTensor([1]).pin_memory()
    self.assertEqual(t.data_ptr(), ptr, 'allocation not reused')

    # check that the allocation is not re-used if it's in-use by a copy
    gpu_tensor = torch.cuda.FloatTensor([0])
    torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
    # ``async`` is a reserved word since Python 3.7, so the old ``async=True``
    # spelling no longer parses; ``non_blocking`` replaces it.
    gpu_tensor.copy_(t, non_blocking=True)
    del t
    t = torch.FloatTensor([1]).pin_memory()
    self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
    self.assertEqual(list(gpu_tensor), [1])
def test_caching_pinned_memory_multi_gpu(self):
    """Check pinned-memory re-use while a copy is pending on a second GPU."""
    # checks that the events preventing pinned memory from being re-used
    # too early are recorded on the correct GPU
    cycles_per_ms = get_cycles_per_ms()

    t = torch.FloatTensor([1]).pin_memory()
    ptr = t.data_ptr()
    gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
    gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)

    with torch.cuda.device(1):
        torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
        # ``async`` is a reserved word since Python 3.7, so the old
        # ``async=True`` spelling no longer parses; ``non_blocking`` replaces it.
        gpu_tensor1.copy_(t, non_blocking=True)

    del t
    t = torch.FloatTensor([2]).pin_memory()
    self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')

    with torch.cuda.device(0):
        gpu_tensor0.copy_(t, non_blocking=True)

    self.assertEqual(gpu_tensor1[0], 1)
    self.assertEqual(gpu_tensor0[0], 2)
def test_serialization_map_location(self):
    """Load a GPU-saved tensor onto the CPU with both ``map_location`` forms.

    The fixture is downloaded into the ``data`` directory next to this file.
    If the download fails, a ``RuntimeWarning`` is issued and the test
    returns without checking anything.
    """
    DATA_URL = 'https://download.pytorch.org/test_data/gpu_tensors.pt'
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    test_file_path = os.path.join(data_dir, 'gpu_tensors.pt')
    if not download_file(DATA_URL, test_file_path):
        warnings.warn(
            "Couldn't download the test file for map_location! "
            "Tests will be incomplete!", RuntimeWarning)
        return

    def map_location(storage, loc):
        return storage

    # First the callable form, then the dict form; both must give the same
    # CPU float tensor.
    for location in (map_location, {'cuda:0': 'cpu'}):
        loaded = torch.load(test_file_path, map_location=location)
        self.assertEqual(type(loaded), torch.FloatTensor)
        self.assertEqual(loaded, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))
def test_print(self):
    """Smoke-test ``repr`` and ``str`` for every dense tensor and storage class.

    Sparse tensor classes are skipped, as are CUDA classes when CUDA is
    unavailable. A tensor holding ``inf``, ``-inf`` and ``nan`` is formatted
    as well. Nothing is asserted; the test passes if no call raises.
    """
    for tensor_cls in torch._tensor_classes:
        if tensor_cls in torch.sparse._sparse_tensor_classes:
            continue
        if not tensor_cls.is_cuda or torch.cuda.is_available():
            filled = tensor_cls(100, 100).fill_(1)
            repr(filled)
            str(filled)

    for storage_cls in torch._storage_classes:
        if not storage_cls.is_cuda or torch.cuda.is_available():
            filled = storage_cls(100).fill_(1)
            repr(filled)
            str(filled)

    special = torch.Tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
    repr(special)
    str(special)
def __init__(self, size, vocab, cuda=False):
    """Initialize params.

    Args:
        size: Number of hypotheses kept on the beam.
        vocab: Mapping that provides the indices of ``'<pad>'``, ``'<s>'``
            and ``'</s>'``.
        cuda: When true, tensors are allocated through ``torch.cuda``.
    """
    self.size = size
    self.done = False
    self.pad, self.bos, self.eos = vocab['<pad>'], vocab['<s>'], vocab['</s>']
    # Namespace the tensor constructors are taken from.
    self.tt = torch if not cuda else torch.cuda

    # The score for each translation on the beam.
    self.scores = self.tt.FloatTensor(size).zero_()

    # The backpointers at each time-step.
    self.prevKs = []

    # The outputs at each time-step: all pad, except bos in the first slot.
    first_step = self.tt.LongTensor(size).fill_(self.pad)
    first_step[0] = self.bos
    self.nextYs = [first_step]

    # The attentions (matrix) for each time.
    self.attn = []

# Get the outputs for the current timestep.