The following 29 code examples, extracted from open-source Python projects, illustrate how to use chainer.cuda.Device().
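Before the project code below, here is a minimal orientation sketch of the two ways cuda.Device appears in these examples: as a context manager that temporarily makes a given GPU current, and as a handle to whichever device is currently active. It assumes Chainer is installed with CuPy and that a GPU with id 0 exists; the id is only an example.

import numpy
from chainer import cuda

# Temporarily make GPU 0 the current device.
with cuda.Device(0):
    x = cuda.cupy.arange(5, dtype=numpy.float32)  # allocated on GPU 0

# Without an argument, Device() wraps the currently active device;
# .id gives its integer index, and .use() makes a device current for
# the rest of the process (the pattern used by the workers below).
print(cuda.Device().id)
cuda.Device(0).use()
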
def setup_workers(self):
    # work only once
    if self._initialized:
        return
    self._initialized = True

    self.model.cleargrads()
    for i in six.moves.range(1, len(self.gpus)):
        pipe, worker_end = multiprocessing.Pipe()
        worker = _Worker(i, worker_end, self.model, self.gpus, self.da,
                         int(float(self.batch) / len(self.gpus) /
                             self.train_batch_divide),
                         self)
        worker.start()
        self._workers.append(worker)
        self._pipes.append(pipe)

    with cuda.Device(self.gpus[0]):
        self.model.to_gpu(self.gpus[0])
        if len(self.gpus) > 1:
            communication_id = nccl.get_unique_id()
            self._send_message(("set comm_id", communication_id))
            self.communication = nccl.NcclCommunicator(
                len(self.gpus), communication_id, 0)

def setup_workers(self):
    # work only once
    if self._initialized:
        return
    self._initialized = True

    self.model.zerograds()
    for i in six.moves.range(1, len(self.gpus)):
        pipe, worker_end = multiprocessing.Pipe()
        worker = _Worker(i, worker_end, self.model, self.gpus, self.da,
                         int(self.batch / len(self.gpus) /
                             self.train_batch_divide),
                         self)
        worker.start()
        self._workers.append(worker)
        self._pipes.append(pipe)

    with cuda.Device(self.gpus[0]):
        self.model.to_gpu(self.gpus[0])
        if len(self.gpus) > 1:
            communication_id = nccl.get_unique_id()
            self._send_message(("set comm_id", communication_id))
            self.communication = nccl.NcclCommunicator(
                len(self.gpus), communication_id, 0)

def _inv_gpu(b):
    # We do a batched LU decomposition on the GPU to compute the inverse.
    # Change the shape of the array to be size=1 minibatch if necessary.
    # Also copy the matrix as the elements will be modified in-place.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.empty((n, n_matrices), dtype=numpy.int32)
    # Output array
    c = cuda.cupy.empty_like(a)
    # These arrays hold information on the execution success
    # or if the matrix was singular
    info = cuda.cupy.empty(n_matrices, dtype=numpy.int32)
    ap = matmul._mat_ptrs(a)
    cp = matmul._mat_ptrs(c)
    _, lda = matmul._get_ld(a)
    _, ldc = matmul._get_ld(c)
    handle = cuda.Device().cublas_handle
    cuda.cublas.sgetrfBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, info.data.ptr, n_matrices)
    cuda.cublas.sgetriBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr, ldc,
        info.data.ptr, n_matrices)
    return c, info

def forward_gpu(self, inputs):
    x = inputs[0]
    W = inputs[1]

    # Prepare BLAS call
    handle = cuda.Device().cublas_handle
    k, m = W.shape
    n, l = x.shape[0] * x.shape[1], x.shape[2]
    lda = max(1, x.shape[-1])
    ldb = max(1, W.strides[0] // W.dtype.itemsize)
    ldc = max(1, m)

    Wx = cupy.empty((x.shape[0], x.shape[1], W.shape[1]),
                    dtype=numpy.float32)
    sgemm(handle, False, False, m, n, k, 1, W.data.ptr, ldb,
          x.data.ptr, lda, 0, Wx.data.ptr, ldc)

    if len(inputs) > 2:
        b = inputs[2]
        Wx += b
    return Wx,

def forward(self, inputs):
    xp = cuda.get_array_module(*inputs)
    x0, x1 = inputs
    self.diff = self.inside_weights * (x0 - x1)
    abs_diff = xp.abs(self.diff)
    flag = abs_diff < 1.0 / self.sigma2
    y = (flag * 0.5 * xp.square(self.diff) * self.sigma2 +
         (~flag) * (abs_diff - 0.5 / self.sigma2))
    if xp == cuda.cupy:
        with cuda.Device(cuda.get_device(y)):
            num = xp.prod(xp.asarray(y.shape))
    else:
        num = xp.prod(y.shape)
    return xp.array(y.sum() / num).astype(numpy.float32),

def bbox_transform_inv(boxes, deltas, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _bbox_transform_inv(boxes, deltas)
    else:
        return _bbox_transform_inv(boxes, deltas)

def clip_boxes(boxes, im_shape, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _clip_boxes(boxes, im_shape)
    else:
        return _clip_boxes(boxes, im_shape)

def _batch_matmul_gpu(a, b, out, transa=False, transb=False, transout=False):
    a = _as_batch_mat(cuda.cupy.ascontiguousarray(a))
    b = _as_batch_mat(cuda.cupy.ascontiguousarray(b))
    trans_axis = (0, 2, 1)
    if transout:
        out = out.transpose(trans_axis)
    needtrans, _ = _get_ld(out)
    if needtrans == 1:
        # (A B)^T = B^T A^T
        a, b = b, a
        transa, transb = not transb, not transa
        out = out.transpose(trans_axis)
    if transa:
        a = a.transpose(trans_axis)
    if transb:
        b = b.transpose(trans_axis)
    transa, lda = _get_ld(a)
    transb, ldb = _get_ld(b)
    transout, ldout = _get_ld(out)
    la, n, ka = a.shape
    lb, kb, m = b.shape
    assert ka == kb
    assert transout == 0 or ldout == 1
    assert out.shape == (la, n, m)
    ap = _mat_ptrs(a)
    bp = _mat_ptrs(b)
    outp = _mat_ptrs(out)
    cuda.cublas.sgemmBatched(
        cuda.Device().cublas_handle,
        transa, transb, n, m, ka, 1.0,
        ap.data.ptr, lda, bp.data.ptr, ldb,
        0.0, outp.data.ptr, ldout, la)

def _det_gpu(b):
    # We do a batched LU decomposition on the GPU and compute the
    # determinant as the product of the diagonal entries.
    # Change the shape of the array to be size=1 minibatch if necessary.
    # Also copy the matrix as the elements will be modified in-place.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.zeros((n_matrices, n), dtype='int32')
    # Output array
    # These arrays hold information on the execution success
    # or if the matrix was singular.
    info = cuda.cupy.zeros(n_matrices, dtype=numpy.intp)
    ap = matmul._mat_ptrs(a)
    _, lda = matmul._get_ld(a)
    cuda.cublas.sgetrfBatched(cuda.Device().cublas_handle, n, ap.data.ptr,
                              lda, p.data.ptr, info.data.ptr, n_matrices)
    det = cuda.cupy.prod(a.diagonal(axis1=1, axis2=2), axis=1)
    # The determinant is equal to the product of the diagonal entries
    # of `a` where the sign of `a` is flipped depending on whether
    # the pivot array is equal to its index.
    rng = cuda.cupy.arange(1, n + 1, dtype='int32')
    parity = cuda.cupy.sum(p != rng, axis=1) % 2
    sign = 1. - 2. * parity.astype('float32')
    return det * sign, info

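Several of the examples above and below (_inv_gpu, forward_gpu, _batch_matmul_gpu, _det_gpu, backward_gpu) fetch the cuBLAS handle of the current device through cuda.Device().cublas_handle before issuing raw cuda.cublas calls. A minimal sketch of that lookup, assuming a CuPy version that still exposes Device.cublas_handle:

from chainer import cuda

# The handle is per-device: entering a device context changes which
# handle the argument-less cuda.Device() refers to.
with cuda.Device(0):
    handle = cuda.Device().cublas_handle  # cuBLAS handle bound to GPU 0
# The low-level wrappers used above (sgetrfBatched, sgemmBatched, ...)
# take this raw handle as their first argument.
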
def test_linear_model_multi_gpu(self):
    with cuda.Device(0):
        self.assertGreater(
            cuda.to_cpu(self.model.accuracy_gpu(1).data), 0.9)

def test_model_setup_multi_gpu(self):
    with cuda.Device(0):
        model = self.model.model
        optimizer = self.model.optimizer
        model.to_gpu(1)
        optimizer.setup(model)
        for name, param in optimizer.target.namedparams():
            for v in six.itervalues(optimizer._states[name]):
                self.assertEqual(int(param.data.device), int(v.device))

def check_accumulate_grads_from_gpu(self, src_id):
    with cuda.Device(src_id):
        self.optimizer.accumulate_grads([cuda.cupy.arange(3)])

    grad = self.target.param.grad
    self.assertTrue((cuda.to_cpu(grad) == np.arange(3) * 2).all())

def test_accumulate_grads_gpu_to_cpu(self):
    self.setup_cpu()
    self.check_accumulate_grads_from_gpu(cuda.Device().id)

def test_accumulate_grads_gpu_to_gpu(self):
    device_id = cuda.Device().id
    self.setup_gpu(device_id)
    self.check_accumulate_grads_from_gpu(device_id)

def test_copy_parameters_from_cpu_to_gpu(self):
    self.check_copy_parameters_from(-1, cuda.Device().id)

def test_copy_parameters_from_gpu_to_cpu(self):
    self.check_copy_parameters_from(cuda.Device().id, -1)

def test_forward_gpu(self):
    device_id = cuda.Device().id
    self.check_forward(device_id, device_id)

def test_check_backward_gpu(self):
    device_id = cuda.Device().id
    self.check_backward(device_id, device_id)

def test_forward_cpu_to_gpu(self):
    device_id = cuda.Device().id
    self.check_forward(-1, device_id)

def test_backward_cpu_to_gpu(self):
    device_id = cuda.Device().id
    self.check_backward(-1, device_id)

def test_forward_gpu_to_cpu(self):
    device_id = cuda.Device().id
    self.check_forward(device_id, -1)

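The tests above use cuda.Device().id for the id of the currently active GPU and reserve -1 for the CPU. A small sketch of that convention; the helper name to_device is illustrative and not part of the test code:

from chainer import cuda

def to_device(array, device_id):
    # -1 selects the CPU path, a non-negative id selects that GPU,
    # mirroring the (src, dst) arguments of the checks above.
    if device_id < 0:
        return cuda.to_cpu(array)
    with cuda.Device(device_id):
        return cuda.to_gpu(array, device_id)

gpu_id = cuda.Device().id  # id of the currently active GPU
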
def backward_gpu(self, inputs, gy):
    x = inputs[0]
    W = inputs[1]

    # Backprop weight
    gW = cuda.cupy.empty_like(W)
    handle = cuda.Device().cublas_handle
    k, n = gy[0].shape[0] * gy[0].shape[1], W.shape[0]
    m = W.shape[1]
    lda = max(1, x.shape[-1])
    ldb = max(1, gy[0].shape[-1])
    ldc = max(1, m)
    sgemm(handle, False, True, m, n, k, 1, gy[0].data.ptr, ldb,
          x.data.ptr, lda, 1, gW.data.ptr, ldc)

    # Backprop input
    m, k = W.shape
    n, l = x.shape[0] * x.shape[1], gy[0].shape[2]
    lda = max(1, gy[0].shape[-1])
    ldb = max(1, W.shape[1])
    ldc = max(1, m)
    gx = cuda.cupy.empty_like(x)
    sgemm(handle, True, False, m, n, k, 1, W.data.ptr, ldb,
          gy[0].data.ptr, lda, 0, gx.data.ptr, ldc)

    # Backprop bias
    if len(inputs) > 2:
        gy_2d = _as_mat(gy[0])
        gb = gy_2d.sum(0)
        return gx, gW, gb
    else:
        return gx, gW

def run(self):
    dev = cuda.Device(self.device)
    dev.use()
    # build communication via nccl
    self.setup()
    gp = None
    p = multiprocessing.Pool(self.parallel_train)
    args_da = [self.da() for _ in six.moves.range(self.batch)]
    while True:
        job, data = self.pipe.recv()
        if job == 'finalize':
            dev.synchronize()
            break
        if job == 'update':
            # for reducing memory
            self.model.cleargrads()
            indices = list(self.sampling.yield_random_batch_from_category(
                1, self.picture_number_at_each_categories, self.batch,
                shuffle=True))[0]
            x = self.train_x[indices]
            t = self.train_y[indices]
            args = list(zip(x, t, args_da))
            processed = p.starmap(process_train, args)
            tmp_x, tmp_t = list(zip(*processed))
            train = True
            x = self.model.prepare_input(tmp_x, dtype=np.float32,
                                         volatile=not train, gpu=self.device)
            t = self.model.prepare_input(tmp_t, dtype=np.int32,
                                         volatile=not train, gpu=self.device)
            y = self.model(x, train=train)
            loss = (self.model.calc_loss(y, t) /
                    self.number_of_devices / self.train_batch_divide)
            loss.backward()
            del x
            del t
            del y
            del loss

            # send gradients of self.model
            gg = gather_grads(self.model)
            null_stream = cuda.Stream.null
            self.communication.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                                      nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                      null_stream.ptr)
            del gg
            self.model.cleargrads()

            # send parameters of self.model
            gp = gather_params(self.model)
            self.communication.bcast(gp.data.ptr, gp.size, nccl.NCCL_FLOAT,
                                     0, null_stream.ptr)
            scatter_params(self.model, gp)
            gp = None

def update_core(self, x, t, p, args_da):
    self._send_message(('update', None))
    with cuda.Device(self.gpus[0]):
        self.model.cleargrads()
        args = list(zip(x, t, args_da))
        processed = p.starmap(process_train, args)
        tmp_x, tmp_t = list(zip(*processed))
        data_length = len(tmp_x)
        train = True
        x = self.model.prepare_input(tmp_x, dtype=np.float32,
                                     volatile=not train, gpu=self.gpus[0])
        t = self.model.prepare_input(tmp_t, dtype=np.int32,
                                     volatile=not train, gpu=self.gpus[0])
        y = self.model(x, train=train)
        loss = self.model.calc_loss(y, t) / len(self.gpus)
        loss.backward()
        loss.to_cpu()
        loss = float(loss.data) * data_length
        del x
        del t
        del y

        # NCCL: reduce grads
        null_stream = cuda.Stream.null
        if self.communication is not None:
            # send grads
            gg = gather_grads(self.model)
            self.communication.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                                      nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                      null_stream.ptr)
            # copy grads, gg, to self.model
            scatter_grads(self.model, gg)
            del gg
        self.optimizer.update()
        if self.communication is not None:
            gp = gather_params(self.model)
            self.communication.bcast(gp.data.ptr, gp.size, nccl.NCCL_FLOAT,
                                     0, null_stream.ptr)
    return loss

def run(self):
    dev = cuda.Device(self.device)
    dev.use()
    # build communication via nccl
    self.setup()
    gp = None
    da_args = [self.da() for _ in six.moves.range(self.batch)]
    p = multiprocessing.Pool(self.parallel)
    batch_of_batch = int(float(self.batch) / self.train_batch_divide)
    while True:
        job, data = self.pipe.recv()
        if job == 'finalize':
            dev.synchronize()
            break
        if job == 'update':
            # for reducing memory
            self.model.zerograds()
            indices = list(self.sampling.yield_random_batch_samples(
                1, self.batch, len(self.train_x), sort=False))[0]
            for ii in six.moves.range(0, len(indices), batch_of_batch):
                x = self.train_x[indices[ii:ii + batch_of_batch]]
                t = self.train_y[indices[ii:ii + batch_of_batch]]
                args = list(six.moves.zip(x, t, da_args))
                processed = p.starmap(process_train, args)
                tmp_x, tmp_t = list(zip(*processed))
                train = True
                x = self.model.prepare_input(tmp_x, dtype=np.float32,
                                             volatile=not train,
                                             gpu=self.device)
                t = self.model.prepare_input(tmp_t, dtype=np.int32,
                                             volatile=not train,
                                             gpu=self.device)
                y = self.model(x, train=train)
                loss = (self.model.calc_loss(y, t) /
                        self.number_of_devices / self.train_batch_divide)
                loss.backward()
                del x
                del t
                del y
                del loss

            # send gradients of self.model
            gg = gather_grads(self.model)
            null_stream = cuda.Stream.null
            self.communication.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                                      nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                      null_stream.ptr)
            del gg
            self.model.zerograds()

            # send parameters of self.model
            gp = gather_params(self.model)
            self.communication.bcast(gp.data.ptr, gp.size, nccl.NCCL_FLOAT,
                                     0, null_stream.ptr)
            scatter_params(self.model, gp)
            gp = None

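The two worker run() methods above share the same device-handling skeleton: the process calls dev.use() once so that all later allocations and NCCL calls land on its assigned GPU, and dev.synchronize() before exiting. A stripped-down sketch of just that skeleton, with the job protocol reduced to a placeholder:

from chainer import cuda

class Worker(object):
    def __init__(self, device_id, pipe):
        self.device = device_id
        self.pipe = pipe

    def run(self):
        dev = cuda.Device(self.device)
        dev.use()                  # make this GPU current for the whole process
        while True:
            job, data = self.pipe.recv()
            if job == 'finalize':
                dev.synchronize()  # drain pending work before shutting down
                break
            # handle 'update' jobs on self.device, as in the examples above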