The following 9 code examples, extracted from open-source Python projects, illustrate how to use the theano.gpuarray module.
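Before the extracted examples, here is a minimal, self-contained sketch of the pattern most of them share: bind a device through theano.gpuarray.use() before compiling, then inspect the compiled graph to confirm the ops landed on the GPU. The device string "cuda0" and the graph check mirror the examples below; treat this as an illustrative sketch for the gpuarray (libgpuarray/pygpu) backend, not canonical usage.

import numpy as np
import theano
import theano.gpuarray
import theano.tensor as T

# Claim the first CUDA device through the gpuarray backend (the same call
# the examples below make, one device per process).
theano.gpuarray.use("cuda0")

x = theano.shared(np.random.rand(1024).astype(theano.config.floatX))
f = theano.function([], T.exp(x))

# As in init_device() below: if the elemwise nodes compiled to Gpu* ops,
# the graph is running on the GPU.
on_gpu = all("Gpu" in type(node.op).__name__
             for node in f.maker.fgraph.toposort()
             if isinstance(node.op, T.Elemwise))
print("Used the gpu" if on_gpu else "Used the cpu")
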
def test_multi_process_simultaneous(n_gpu=2, worker_func_maker=unpickle_func,
                                    bar_loop=False):
    barrier = mp.Barrier(n_gpu)
    if PROFILE:
        target = sim_profiling_worker
    else:
        target = simultaneous_worker
    procs = [mp.Process(target=target,
                        args=(rank, worker_func_maker, barrier, bar_loop))
             for rank in range(1, n_gpu)]
    for p in procs:
        p.start()
    theano.gpuarray.use("cuda0")
    f_train, name = build_train_func()
    barrier.wait()  # workers build or unpickle
    time.sleep(1)
    barrier.wait()  # workers are ready.
    test_the_function(f_train, name=name, barrier=barrier, bar_loop=bar_loop)
    for p in procs:
        p.join()

def init_gpus(rank, n_parallel=None):
    import theano
    import theano.gpuarray
    try:
        theano.gpuarray.use("cuda" + str(rank))
    except Exception as exc:
        if n_parallel is not None:
            raise RuntimeError("Master unable to use GPU.") from exc
        else:
            sync.workers_OK.value = False
            raise RuntimeError(
                "Worker rank {} unable to use GPU.".format(rank)) from exc
    finally:
        sync.barrier_out.wait()
    if n_parallel is not None:
        if sync.workers_OK.value:
            print("Synkhronos: {} GPUs initialized, master rank: {}".format(
                n_parallel, rank))
        else:
            raise RuntimeError("Workers did not initialize GPUs.")

def test_one_process(gpu=0):
    theano.gpuarray.use("cuda" + str(gpu))
    f_train, train_name = build_train_func()
    pickle_func(f_train)
    f_unpkl, unpkl_name = unpickle_func()
    test_the_function(f_train, train_name)
    test_the_function(f_unpkl, unpkl_name)

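test_one_process() relies on two helpers, pickle_func() and unpickle_func(), that are not shown in the excerpt. A compiled theano.function can be serialized with the standard pickle module, so a hypothetical fill-in (names and the file path are illustrative, not the project's actual implementation) could look like this:

import pickle

def pickle_func(f, path="f_train.pkl"):  # illustrative path, not the project's
    with open(path, "wb") as fh:
        pickle.dump(f, fh, protocol=pickle.HIGHEST_PROTOCOL)

def unpickle_func(path="f_train.pkl", rank=None):
    # rank is accepted (and ignored here) to match how the workers call it
    with open(path, "rb") as fh:
        f = pickle.load(fh)
    return f, "unpickled_f_train"

Note the ordering the examples enforce: every process calls theano.gpuarray.use() before unpickling, so the function's shared variables are reconstructed on that process's own GPU.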
def test_multi_process_sequence(n_gpu=2, worker_func_maker=unpickle_func):
    barrier = mp.Barrier(n_gpu)
    if PROFILE:
        target = seq_profiling_worker
    else:
        target = sequence_worker
    procs = [mp.Process(target=target,
                        args=(rank, n_gpu, barrier, worker_func_maker))
             for rank in range(1, n_gpu)]
    for p in procs:
        p.start()
    theano.gpuarray.use("cuda0")
    f_train, name = build_train_func()
    pickle_func(f_train)
    barrier.wait()  # workers make function (maybe unpickle).
    barrier.wait()
    for i in range(n_gpu):
        time.sleep(1)
        barrier.wait()
        if i == 0:
            test_the_function(f_train, name)
    for p in procs:
        p.join()

def sequence_worker(rank, n_gpu, barrier, function_maker):
    theano.gpuarray.use("cuda" + str(rank))
    # maybe master makes the function
    barrier.wait()
    f_train, name = function_maker(rank=rank)  # maybe unpickle
    barrier.wait()
    for i in range(n_gpu):
        time.sleep(1)
        barrier.wait()
        if i == rank:
            test_the_function(f_train, name=name, rank=rank)

def simultaneous_worker(rank, function_maker, barrier, bar_loop):
    theano.gpuarray.use("cuda" + str(rank))
    # maybe master makes the function
    barrier.wait()
    f_train, name = function_maker(rank)
    barrier.wait()
    test_the_function(f_train, name=name, rank=rank, barrier=barrier,
                      bar_loop=bar_loop)

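For context, the multi-process tests above (test_multi_process_simultaneous and test_multi_process_sequence) are plain module-level functions; a hypothetical driver, not part of the excerpt, just calls them. The ordering that matters is visible in the code: no process calls theano.gpuarray.use() until after the workers have been started, so each process can claim its own device.

# Hypothetical driver, assuming the functions above live in this module
# and that two GPUs ("cuda0", "cuda1") are available.
if __name__ == "__main__":
    # Run one test per interpreter: once a process has initialized a CUDA
    # device, forking new workers from it is generally not safe.
    test_multi_process_simultaneous(n_gpu=2)
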
def __init__(self, n_gpu, rank, master_rank):
    gpu_ctx = theano.gpuarray.get_context(None)
    clique_id = gpu_coll.GpuCommCliqueId(gpu_ctx)
    if rank == master_rank:
        sync.dict["gpu_comm_id"] = clique_id.comm_id
        sync.barrier.wait()
    else:
        sync.barrier.wait()
        clique_id.comm_id = sync.dict["gpu_comm_id"]
    self.comm = gpu_coll.GpuComm(clique_id, n_gpu, rank)
    self.n_gpu = n_gpu
    self.avg_fac = 1. / n_gpu
    self.master_rank = master_rank

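The __init__ above only establishes the collective clique: the master publishes its comm_id through the shared sync.dict, the other ranks copy it, and every rank constructs a GpuComm. A hedged sketch of how such a comm object is typically used afterwards follows; the method names and the "sum" op string follow pygpu.collectives as I understand them, so treat the exact signatures as assumptions rather than the project's real code.

def broadcast_from_master(self, gpu_array):
    # Hypothetical helper: push the master's copy of a GpuArray to every rank.
    self.comm.broadcast(gpu_array, root=self.master_rank)

def all_reduce_sum(self, gpu_array):
    # Hypothetical helper: element-wise sum across all ranks. With dest
    # omitted, pygpu is expected to allocate and return the result; callers
    # can scale by self.avg_fac afterwards if they want the mean.
    return self.comm.all_reduce(gpu_array, op="sum")
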
def init_device(device='gpu0'):
    if device.startswith('cuda'):
        import os
        if 'THEANO_FLAGS' in os.environ:
            raise ValueError('Use theanorc to set the theano config')
        os.environ['THEANO_FLAGS'] = 'device={0}'.format(device)
        import theano.gpuarray
        # This is a bit of black magic that may stop working in future
        # theano releases
        ctx = theano.gpuarray.type.get_context(None)
        drv = None
    elif device.startswith('gpu'):
        gpuid = int(device[-1])
        import pycuda.driver as drv
        drv.init()
        dev = drv.Device(gpuid)
        ctx = dev.make_context()
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
    else:
        drv = None
        ctx = None
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
    from theano import function, config, shared, sandbox, tensor
    vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
    iters = 1000
    rng = np.random.RandomState(22)
    arr = rng.rand(vlen)
    shared_x = theano.shared(np.asarray(arr, config.floatX))
    shared_xx = theano.shared(np.asarray(arr, config.floatX))
    x = tensor.fvector("x")
    # compile a function so that shared_x will be set to part of a computing
    # graph on GPU (CudaNdarray)
    f = function([], tensor.exp(x), givens=[(x, shared_x)])
    if np.any([isinstance(x.op, tensor.Elemwise) and
               ('Gpu' not in type(x.op).__name__)
               for x in f.maker.fgraph.toposort()]):
        print('Used the cpu')
    else:
        print('Used the gpu')
    # if np.any([isinstance(x.op, tensor.Elemwise) for x in
    #            f.maker.fgraph.toposort()]) and device != 'cpu':
    #     raise TypeError('graph not compiled on GPU')
    return drv, ctx, arr, shared_x, shared_xx

def traverse(out, x, x_copy, d, visited=None):
    """
    Function used by scan to parse the tree and figure out which nodes
    it needs to replace.

    There are two options:
        1) x and x_copy are on the host; then you would replace x with x_copy.
        2) x is on the gpu, x_copy on the host; then you need to replace
           host_from_gpu(x) with x_copy.

    This happens because initially shared variables are on GPU, which is
    fine for the main computational graph but confuses things a bit for the
    inner graph of scan.
    """
    # ``visited`` is a set of nodes that are already known and don't need to
    # be checked again, speeding up the traversal of multiply-connected
    # graphs. If a ``visited`` set is given, it will be updated in-place so
    # the callee knows which nodes we have seen.
    if visited is None:
        visited = set()
    if out in visited:
        return d
    visited.add(out)
    from theano.sandbox import cuda
    from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
    from theano.gpuarray import pygpu_activated
    from theano.gpuarray.type import GpuArrayType
    if out == x:
        if isinstance(x.type, cuda.CudaNdarrayType):
            d[out] = cuda.gpu_from_host(x_copy)
        else:
            assert isinstance(x.type, GpuArrayType)
            d[out] = gpu_from_host(x.type.context_name)(x_copy)
        return d
    elif out.owner is None:
        return d
    elif (cuda.cuda_available and
          out.owner.op == cuda.host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    elif (pygpu_activated and
          out.owner.op == host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    else:
        for inp in out.owner.inputs:
            d = traverse(inp, x, x_copy, d, visited)
        return d


# Hashing a dictionary/list/tuple by xoring the hash of each element
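The trailing comment above belongs to a helper that was cut off by the extraction; its body is not shown here. A plausible reading of the comment, purely illustrative and not Theano's implementation, would be:

def hash_by_xor(items):
    # Combine the hash of each element with xor, so the result does not
    # depend on the order in which the elements are visited.
    h = 0
    for item in items:
        h ^= hash(item)
    return h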