We extracted the following 7 code examples from open source Python projects to illustrate how to use numba.int32().
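Before the extracted examples, here is a minimal, illustrative sketch (not taken from any of the projects below) of the two common ways numba.int32 shows up: as a type in an eager @njit signature, and called as a function to cast a value inside compiled code.

import numpy as np
from numba import int32, njit

# int32(int32, int32) builds an eager signature: return type first,
# then the argument types.
@njit(int32(int32, int32))
def add_i32(a, b):
    total = a + b
    return int32(total)   # numba.int32() also works as a cast in compiled code

print(add_i32(np.int32(2), np.int32(3)))   # 5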
def make_complete_graph(num_vertices):
    """Constructs a complete graph.

    The pairing function is: k = v1 + v2 * (v2 - 1) // 2

    Args:
        num_vertices: Number of vertices.

    Returns:
        grid: a 3 x K grid of (edge, vertex, vertex) triples,
            where K = V * (V - 1) // 2 is the number of edges.
    """
    V = num_vertices
    K = V * (V - 1) // 2
    grid = np.zeros([3, K], np.int32)
    k = 0
    for v2 in range(V):
        for v1 in range(v2):
            grid[:, k] = [k, v1, v2]
            k += 1
    return grid
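A quick hypothetical call (assuming numpy is imported as np) shows the column layout of the returned int32 grid for 4 vertices:

grid = make_complete_graph(4)
print(grid.shape)  # (3, 6): one column per edge
print(grid)
# [[0 1 2 3 4 5]    <- edge index k
#  [0 0 1 0 1 2]    <- v1
#  [1 2 2 3 3 3]]   <- v2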
def make_tree(edges):
    """Constructs a tree graph from a set of (vertex, vertex) pairs.

    Args:
        edges: A list or set of unordered (vertex, vertex) pairs.

    Returns:
        grid: a 3 x E grid of (edge, vertex, vertex) triples,
            where E is the number of edges.
    """
    assert all(isinstance(edge, tuple) for edge in edges)
    edges = [tuple(sorted(edge)) for edge in edges]
    edges.sort()
    E = len(edges)
    grid = np.zeros([3, E], np.int32)
    for e, (v1, v2) in enumerate(edges):
        grid[:, e] = [e, v1, v2]
    return grid
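Similarly, a hypothetical call with three unordered edges (numpy imported as np) shows that the pairs are canonicalised, sorted, and stored column-wise as int32:

grid = make_tree({(2, 0), (1, 2), (3, 2)})
print(grid.dtype)  # int32
print(grid)
# [[0 1 2]    <- edge index e
#  [0 1 2]    <- v1 (smaller vertex)
#  [2 2 3]]   <- v2 (larger vertex)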
def __init__(self, num_vertices):
    logger.debug('TreeStructure with %d vertices', num_vertices)
    self._num_vertices = num_vertices
    self._num_edges = num_vertices - 1
    self.set_edges([(v, v + 1) for v in range(num_vertices - 1)])
    self._complete_grid = None  # Lazily constructed.
    self._vertices = np.arange(num_vertices, dtype=np.int32)
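The constructor above relies on a set_edges method and a logger defined elsewhere in that project; as a self-contained, purely illustrative sketch, the same default of a linear-chain edge list plus an int32 vertex array looks like this:

import numpy as np

# Illustrative stand-alone version of the defaults used above:
# a linear chain 0-1-2-...-(V-1) and the vertex ids stored as int32.
num_vertices = 5
edges = [(v, v + 1) for v in range(num_vertices - 1)]
vertices = np.arange(num_vertices, dtype=np.int32)

print(edges)           # [(0, 1), (1, 2), (2, 3), (3, 4)]
print(vertices.dtype)  # int32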
def numba_csgraph(csr, node_props=None):
    if node_props is None:
        node_props = np.broadcast_to(1., csr.shape[0])
        node_props.flags.writeable = True
    return CSGraph(csr.indptr, csr.indices, csr.data,
                   np.array(csr.shape, dtype=np.int32), node_props)
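A hypothetical call could look like the sketch below; the CSGraph container is defined elsewhere in that project, so a namedtuple stands in for it here just to make the sketch run, and node_props is passed explicitly:

import numpy as np
from collections import namedtuple
from scipy.sparse import csr_matrix

# Stand-in for the project's CSGraph container (assumption, illustration only).
CSGraph = namedtuple('CSGraph', ['indptr', 'indices', 'data', 'shape', 'node_props'])

# A tiny 3-node directed graph: 0 -> 1 -> 2.
csr = csr_matrix(np.array([[0., 1., 0.],
                           [0., 0., 1.],
                           [0., 0., 0.]]))
g = numba_csgraph(csr, node_props=np.ones(csr.shape[0]))
print(g.shape, g.shape.dtype)  # [3 3] int32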
def cudatest_hist():
    # src1 = np.arange(n, dtype=np.float32)
    src1 = np.random.randint(BIN_COUNT, size=n).astype(np.float32)
    histogram = np.zeros(BIN_COUNT, dtype=np.int32)
    print(src1)
    stream = cuda.stream()  # use stream to trigger async memory transfer
    ts = timer()
    # Control the iteration count
    count = 1
    for i in range(count):
        with stream.auto_synchronize():
            # ts = timer()
            d_src1 = cuda.to_device(src1, stream=stream)
            d_hist = cuda.to_device(histogram, stream=stream)
            # gpu_1d_stencil[bpg, tpb, stream](d_src1)
            gpu_histogram[bpg, tpb, stream](d_src1, d_hist)
            d_src1.copy_to_host(src1, stream=stream)
            d_hist.copy_to_host(histogram, stream=stream)
    te = timer()
    print('pinned ', count, " : ", te - ts)
    print(histogram)

    # Take a histogram of the original data.
    # This histogram will contain a few extra counts due to the padding
    # we add to the original data in the kernel code.
    hist = src1.astype(np.int64)
    x = itemfreq(hist.ravel())
    hist = x  # [:, 1]/sum(x[:, 1])
    print(hist)

# cudatest_stencil()
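The driver above depends on gpu_histogram, bpg, tpb, BIN_COUNT and n, which are defined elsewhere in the original project and not part of the extracted snippet. A minimal sketch of a compatible kernel and launch configuration might look like this; all names and values below are assumptions, and the real kernel apparently also pads its input, which this sketch ignores:

import numpy as np
from numba import cuda, int32

BIN_COUNT = 256              # assumed number of histogram bins
n = 1 << 20                  # assumed input size
tpb = 256                    # threads per block (assumption)
bpg = (n + tpb - 1) // tpb   # enough blocks to cover the input

@cuda.jit
def gpu_histogram(src, hist):
    i = cuda.grid(1)
    if i < src.size:
        # numba.int32() casts the float sample to an integer bin index.
        ind = int32(src[i]) % BIN_COUNT
        cuda.atomic.add(hist, ind, 1)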
def thresholding(arry, hist):
    # We have 10*10 threads per block
    A = cuda.shared.array(shape=(32, 32), dtype=int32)

    x, y = cuda.grid(2)
    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y

    A[ty, tx] = arry[x, y]
    cuda.syncthreads()

    threadCountX = A.shape[0] - 1
    threadCountY = A.shape[1] - 1

    # If within the x range and y range, calculate the LBP descriptor along
    # with the histogram value for the specific bin.
    # Otherwise ignore the value.
    if (ty > 0 and (threadCountX - ty) > 0) and (tx > 0 and (threadCountY - tx) > 0):
        # You can do the processing here. ^_^
        code = 0
        # We need to make sure that each value is accessible to each thread.
        # TODO: make them atomic
        center = A[ty, tx]
        code = 0 if center > 150 else 255
        code = (code - center)
        A[ty, tx] = code

        # Wait for all threads to sync here.
        cuda.syncthreads()
        val = A[ty, tx]
        cuda.atomic.add(arry, (x, y), val)
        cuda.syncthreads()

        # This atomic operation is equivalent to hist[code % 256] += 1
        ind = code % BIN_COUNT
        cuda.atomic.add(hist, ind, 1)
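The kernel above is presumably compiled with @cuda.jit (the decorator is not part of the extracted snippet) and launched on a 2D grid. A hypothetical launch, using the 10x10 block size mentioned in its first comment and an image whose sides are multiples of 10, could look like this:

import numpy as np

BIN_COUNT = 256   # assumed bin count, defined elsewhere in the project
img = np.random.randint(0, 256, size=(500, 500)).astype(np.int32)
hist = np.zeros(BIN_COUNT, dtype=np.int32)

threads_per_block = (10, 10)                              # matches the kernel's comment
blocks_per_grid = (img.shape[0] // 10, img.shape[1] // 10)
thresholding[blocks_per_grid, threads_per_block](img, hist)
print(hist.sum())   # counts contributed by the interior threads of each block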
def unsharp_masking(arry, hist):
    # We have 10*10 threads per block
    A = cuda.shared.array(shape=(32, 32), dtype=int32)
    # H = cuda.shared.array(BIN_COUNT, dtype=int32)

    x, y = cuda.grid(2)
    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y

    A[ty, tx] = arry[x, y]
    cuda.syncthreads()

    threadCountX = A.shape[0] - 1
    threadCountY = A.shape[1] - 1

    # If within the x range and y range, calculate the LBP descriptor along
    # with the histogram value for the specific bin.
    # Otherwise ignore the value.
    if (ty > 0 and (threadCountX - ty) > 0) and (tx > 0 and (threadCountY - tx) > 0):
        # You can do the processing here. ^_^
        code = 0
        # We need to make sure that each value is accessible to each thread.
        # TODO: make them atomic
        center = A[ty, tx]
        # Weighted sum of the 8 neighbours: corners * -1, edge neighbours * -2.
        code += A[ty - 1, tx - 1] * -1
        code += A[ty, tx - 1] * -2
        code += A[ty + 1, tx - 1] * -1
        code += A[ty + 1, tx] * -2
        code += A[ty + 1, tx + 1] * -1
        code += A[ty, tx + 1] * -2
        code += A[ty - 1, tx + 1] * -1
        code += A[ty - 1, tx] * -2
        # Integer division keeps code usable as an int32 histogram index below.
        code = code // 16
        code = (code - center)
        A[ty, tx] = code
        # cuda.atomic.add(A, (ty, tx), code)

        cuda.syncthreads()
        val = A[ty, tx]
        cuda.atomic.add(arry, (x, y), val)
        cuda.syncthreads()

        # This atomic operation is equivalent to hist[code % 256] += 1
        ind = code % BIN_COUNT
        cuda.atomic.add(hist, ind, 1)
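For reference, the weighted neighbour sum inside unsharp_masking (corners times -1, edge neighbours times -2, centre ignored) corresponds to the 3x3 stencil below. This is a hypothetical NumPy/SciPy check of the per-pixel value, ignoring the integer truncation of the int32 shared array and the behaviour at the tile borders:

import numpy as np
from scipy.ndimage import convolve

# 3x3 weights matching the neighbour sum in the kernel above.
weights = np.array([[-1, -2, -1],
                    [-2,  0, -2],
                    [-1, -2, -1]], dtype=np.float64)

img = np.random.randint(0, 256, size=(64, 64)).astype(np.float64)
neighbour_sum = convolve(img, weights, mode='constant')
cpu_code = neighbour_sum // 16 - img   # code // 16 - center, as in the kernel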