#! /usr/bin/env python # def cuda_loop ( blocks, threads, n ): #*****************************************************************************80 # ## CUDA_LOOP simulates the behavior of a CUDA loop. # # Discussion: # # A CUDA kernel "kernel()" is invoked by a command of the form # # kernel << blocks, threads >> ( args ) # # where blocks and threads are each vectors of up to 3 values, # listing the number of blocks and number of threads to be used. # # If a problem involves N tasks, then tasks are allotted to # specific CUDA processes in an organized fashion. Some processes # may get no tasks, one task, or multiple tasks. # # Each process is given variables that can be used to determine # the tasks to be performed: # # gridDim.x, gridDim.y, gridDim.z: the block dimensions as # given by the user in "blocks" # # blockDim.x, blockDim.y, blockDim.z: the thread dimensions as # given by the user in "threads" # # blockIdx.x, blockIdx.y, blockId.z: the block indices for this process. # # threadIdx.x, threadIdx.y, threadIdx.z: the thread indices for this process. # # Essentially, a process can determine its linear index K by: # # K = threadIdx.x # + blockdim.x * threadIdx.y # + blockDim.x * blockDim.y * threadIdx.z # + blockDim.x * blockDim.y * blockDim.z * blockIdx.x # + blockDim.x * blockDim.y * blockDim.z * gridDim.x * blockIdx.y # + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * blockIdx.z # # Set task T = K. # # while ( T < N ) # carry out task T # T = T + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z. # # Licensing: # # This code is distributed under the GNU LGPL license. # # Modified: # # 27 March 2018 # # Author: # # John Burkardt # # Parameters: # # Input, integer BLOCKS[3], the CUDA block values. These should be nonnegative. # Typically, the third entry is 1. Generally, the first two values cannot # be greater than 35,535. # # Input, integer THREADS[3], the CUDA thread values. These should be nonnegative. # Typically, there is a maximum value imposed on these quantities, which # depends on the GPU model. # # Input, integer N, the number of tasks to be carried out. # print ( "" ) print ( "CUDA_LOOP:" ) print ( " Simulate the assignment of N tasks to the blocks" ) print ( " and threads of a GPU using CUDA." ) print ( "" ) print ( " Number of tasks is %d" % ( n ) ) print ( " BLOCKS: { %d, %d, %d }" % ( blocks[0], blocks[1], blocks[2] ) ) print ( " THREADS: { %d, %d, %d }" % ( threads[0], threads[1], threads[2] ) ) k1 = 0 blockDimx = threads[0] blockDimy = threads[1] blockDimz = threads[2] gridDimx = blocks[0] gridDimy = blocks[1] gridDimz = blocks[2] chunk = blocks[1] * blocks[0] * threads[2] * threads[1] * threads[0] print ( " Total threads = %d" % ( chunk ) ) print ( "" ) print ( " Process Process (bx,by,bz) (tx,ty,tz) Tasks..." ) print ( " Increment Formula" ) print ( "" ) for blockIdz in range ( 0, gridDimz ): for blockIdy in range ( 0, gridDimy ): for blockIdx in range ( 0, gridDimx ): for threadIdz in range ( 0, blockDimz ): for threadIdy in range ( 0, blockDimy ): for threadIdx in range ( 0, blockDimx ): t = k1 k2 = \ threadIdx \ + blockDimx * threadIdy \ + blockDimx * blockDimy * threadIdz \ + blockDimx * blockDimy * blockDimz * blockIdx \ + blockDimx * blockDimy * blockDimz * gridDimx * blockIdy \ + blockDimx * blockDimy * blockDimz * gridDimx * gridDimy * blockIdz print ( " %7d %7d: (%2d,%2d,%2d) (%2d,%2d,%2d)" % ( k1, k2, blockIdx, blockIdy, blockIdz, threadIdx, threadIdy, threadIdz ), end = '' ) while ( t < n ): print ( "%3d" % ( t ) ), t = t + chunk print ( "" ) k1 = k1 + 1 return def cuda_loop_test ( ): #*****************************************************************************80 # ## CUDA_LOOP_TEST demonstrates CUDA_LOOP. # # Discussion: # # A CUDA kernel "kernel()" is invoked by a command of the form # # kernel << blocks, threads >> ( args ) # # where blocks and threads are each vectors of up to 3 values, # listing the number of blocks and number of threads to be used. # # If a problem involves N tasks, then tasks are allotted to # specific CUDA processes in an organized fashion. Some processes # may get no tasks, one task, or multiple tasks. # # Each process is given variables that can be used to determine # the tasks to be performed: # # gridDim.x, gridDim.y, gridDim.z: the block dimensions as # given by the user in "blocks" # # blockDim.x, blockDim.y, blockDim.z: the thread dimensions as # given by the user in "threads" # # blockIdx.x, blockIdx.y, blockId.z: the block indices for this process. # # threadIdx.x, threadIdx.y, threadIdx.z: the thread indices for this process. # # Essentially, a process can determine its linear index K by: # # K = threadIdx.x # + blockdim.x * threadIdx.y # + blockDim.x * blockDim.y * threadIdx.z # + blockDim.x * blockDim.y * blockDim.z * blockIdx.x # + blockDim.x * blockDim.y * blockDim.z * gridDim.x * blockIdx.y # + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * blockIdx.z # # Set task T = K. # # while ( T < N ) # carry out task T # T = T + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z. # # This program suggests how a specific set of block and thread parameters # would determine the assignment of individual tasks to CUDA processes. # # Licensing: # # This code is distributed under the GNU LGPL license. # # Modified: # # 27 March 2018 # # Author: # # John Burkardt # # Parameters: # # Local, integer BLOCKS[3], the CUDA block values. These should be nonnegative. # Typically, the third entry is 1. Generally, the first two values cannot # be greater than 35,535. # # Local, integer THREADS[3], the CUDA thread values. These should be nonnegative. # Typically, there is a maximum value imposed on these quantities, which # depends on the GPU model. # # Local, integer N, the number of tasks to be carried out. # import numpy as np print ( "" ) print ( "CUDA_LOOP_TEST:" ) print ( " Python version" ) print ( " Simulate the way CUDA breaks up an iterative task, using" ) print ( " blocks and threads." ) # # Linear array of blocks and threads. # Essentially, blocks = your hands and threads = your fingers. # Now count up to 23. # blocks = np.array ( [ 2, 1, 1 ] ) threads = np.array ( [ 5, 1, 1 ] ) n = 23 cuda_loop ( blocks, threads, n ) # # Unit arrays of blocks and threads. # Waste your GPU by having a single block and thread do everything. # blocks = np.array ( [ 1, 1, 1 ] ) threads = np.array ( [ 1, 1, 1 ] ) n = 23 cuda_loop ( blocks, threads, n ) # # 2D block array, 3D thread array. # More processes than tasks. # blocks = np.array ( [ 2, 3, 1 ] ) threads = np.array ( [ 2, 1, 4 ] ) n = 40 cuda_loop ( blocks, threads, n ) # # One block, 8 threads. # blocks = np.array ( [ 1, 1, 1 ] ) threads = np.array ( [ 2, 2, 2 ] ) n = 23 cuda_loop ( blocks, threads, n ) # # Terminate. # print ( "" ) print ( "CUDA_LOOP_TEST:" ) print ( " Normal end of execution." ) return def timestamp ( ): #*****************************************************************************80 # ## TIMESTAMP prints the date as a timestamp. # # Licensing: # # This code is distributed under the GNU LGPL license. # # Modified: # # 06 April 2013 # # Author: # # John Burkardt # # Parameters: # # None # import time t = time.time ( ) print ( time.ctime ( t ) ) return None if ( __name__ == '__main__' ): timestamp ( ) cuda_loop_test ( ) timestamp ( )