本章节旨在帮助用户了解 CUDA 内部线程块划分的规则,理解线程号的计算逻辑。

1. 1 维 block 和 1 维 thread

样例中设置了两个 block(gridDim.x = 2),每个 block 中 64 个线程(blockDim.x = 64),其中:

blockIdx.x 代表当前线程所在第几个 block;

threadIdx.x 代表当前线程在当前 block 中是第几个 thread;

warp_idx 代表当前线程在当前 block 中属于第几个 warp(warp 由线程号相邻的 32 个线程组成,相当于 warp_idx = threadIdx.x / warpSize,因此线程 0~31 属于 warp 0,线程 32~63 属于 warp 1);

calc_idx(即下面代码中的 thread_idx,对应运行结果中的 cac_thread 列)代表当前线程是全局的第几个 thread;

全局索引的计算方式为:block 的索引 * 每个 block 的 thread 个数 + block 内的 thread 索引。

   const unsigned int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;

编译命令

nvcc my_id.cu -o my_id

执行命令

./my_id

运行结果

cac_thread   0 - block  0 - warp   0 - thread  0
cac_thread   1 - block  0 - warp   0 - thread  1
cac_thread   2 - block  0 - warp   0 - thread  2
cac_thread   3 - block  0 - warp   0 - thread  3
cac_thread   4 - block  0 - warp   0 - thread  4
cac_thread   5 - block  0 - warp   0 - thread  5
cac_thread   6 - block  0 - warp   0 - thread  6
cac_thread   7 - block  0 - warp   0 - thread  7
cac_thread   8 - block  0 - warp   0 - thread  8
cac_thread   9 - block  0 - warp   0 - thread  9
cac_thread  10 - block  0 - warp   0 - thread 10
cac_thread  11 - block  0 - warp   0 - thread 11
cac_thread  12 - block  0 - warp   0 - thread 12
cac_thread  13 - block  0 - warp   0 - thread 13
cac_thread  14 - block  0 - warp   0 - thread 14
cac_thread  15 - block  0 - warp   0 - thread 15
cac_thread  16 - block  0 - warp   0 - thread 16
cac_thread  17 - block  0 - warp   0 - thread 17
cac_thread  18 - block  0 - warp   0 - thread 18
cac_thread  19 - block  0 - warp   0 - thread 19
cac_thread  20 - block  0 - warp   0 - thread 20
cac_thread  21 - block  0 - warp   0 - thread 21
cac_thread  22 - block  0 - warp   0 - thread 22
cac_thread  23 - block  0 - warp   0 - thread 23
cac_thread  24 - block  0 - warp   0 - thread 24
cac_thread  25 - block  0 - warp   0 - thread 25
cac_thread  26 - block  0 - warp   0 - thread 26
cac_thread  27 - block  0 - warp   0 - thread 27
cac_thread  28 - block  0 - warp   0 - thread 28
cac_thread  29 - block  0 - warp   0 - thread 29
cac_thread  30 - block  0 - warp   0 - thread 30
cac_thread  31 - block  0 - warp   0 - thread 31
cac_thread  32 - block  0 - warp   1 - thread 32
cac_thread  33 - block  0 - warp   1 - thread 33
cac_thread  34 - block  0 - warp   1 - thread 34
cac_thread  35 - block  0 - warp   1 - thread 35
cac_thread  36 - block  0 - warp   1 - thread 36
cac_thread  37 - block  0 - warp   1 - thread 37
cac_thread  38 - block  0 - warp   1 - thread 38
cac_thread  39 - block  0 - warp   1 - thread 39
cac_thread  40 - block  0 - warp   1 - thread 40
cac_thread  41 - block  0 - warp   1 - thread 41
cac_thread  42 - block  0 - warp   1 - thread 42
cac_thread  43 - block  0 - warp   1 - thread 43
cac_thread  44 - block  0 - warp   1 - thread 44
cac_thread  45 - block  0 - warp   1 - thread 45
cac_thread  46 - block  0 - warp   1 - thread 46
cac_thread  47 - block  0 - warp   1 - thread 47
cac_thread  48 - block  0 - warp   1 - thread 48
cac_thread  49 - block  0 - warp   1 - thread 49
cac_thread  50 - block  0 - warp   1 - thread 50
cac_thread  51 - block  0 - warp   1 - thread 51
cac_thread  52 - block  0 - warp   1 - thread 52
cac_thread  53 - block  0 - warp   1 - thread 53
cac_thread  54 - block  0 - warp   1 - thread 54
cac_thread  55 - block  0 - warp   1 - thread 55
cac_thread  56 - block  0 - warp   1 - thread 56
cac_thread  57 - block  0 - warp   1 - thread 57
cac_thread  58 - block  0 - warp   1 - thread 58
cac_thread  59 - block  0 - warp   1 - thread 59
cac_thread  60 - block  0 - warp   1 - thread 60
cac_thread  61 - block  0 - warp   1 - thread 61
cac_thread  62 - block  0 - warp   1 - thread 62
cac_thread  63 - block  0 - warp   1 - thread 63
cac_thread  64 - block  1 - warp   0 - thread  0
cac_thread  65 - block  1 - warp   0 - thread  1
cac_thread  66 - block  1 - warp   0 - thread  2
cac_thread  67 - block  1 - warp   0 - thread  3
cac_thread  68 - block  1 - warp   0 - thread  4
cac_thread  69 - block  1 - warp   0 - thread  5
cac_thread  70 - block  1 - warp   0 - thread  6
cac_thread  71 - block  1 - warp   0 - thread  7
cac_thread  72 - block  1 - warp   0 - thread  8
cac_thread  73 - block  1 - warp   0 - thread  9
cac_thread  74 - block  1 - warp   0 - thread 10
cac_thread  75 - block  1 - warp   0 - thread 11
cac_thread  76 - block  1 - warp   0 - thread 12
cac_thread  77 - block  1 - warp   0 - thread 13
cac_thread  78 - block  1 - warp   0 - thread 14
cac_thread  79 - block  1 - warp   0 - thread 15
cac_thread  80 - block  1 - warp   0 - thread 16
cac_thread  81 - block  1 - warp   0 - thread 17
cac_thread  82 - block  1 - warp   0 - thread 18
cac_thread  83 - block  1 - warp   0 - thread 19
cac_thread  84 - block  1 - warp   0 - thread 20
cac_thread  85 - block  1 - warp   0 - thread 21
cac_thread  86 - block  1 - warp   0 - thread 22
cac_thread  87 - block  1 - warp   0 - thread 23
cac_thread  88 - block  1 - warp   0 - thread 24
cac_thread  89 - block  1 - warp   0 - thread 25
cac_thread  90 - block  1 - warp   0 - thread 26
cac_thread  91 - block  1 - warp   0 - thread 27
cac_thread  92 - block  1 - warp   0 - thread 28
cac_thread  93 - block  1 - warp   0 - thread 29
cac_thread  94 - block  1 - warp   0 - thread 30
cac_thread  95 - block  1 - warp   0 - thread 31
cac_thread  96 - block  1 - warp   1 - thread 32
cac_thread  97 - block  1 - warp   1 - thread 33
cac_thread  98 - block  1 - warp   1 - thread 34
cac_thread  99 - block  1 - warp   1 - thread 35
cac_thread 100 - block  1 - warp   1 - thread 36
cac_thread 101 - block  1 - warp   1 - thread 37
cac_thread 102 - block  1 - warp   1 - thread 38
cac_thread 103 - block  1 - warp   1 - thread 39
cac_thread 104 - block  1 - warp   1 - thread 40
cac_thread 105 - block  1 - warp   1 - thread 41
cac_thread 106 - block  1 - warp   1 - thread 42
cac_thread 107 - block  1 - warp   1 - thread 43
cac_thread 108 - block  1 - warp   1 - thread 44
cac_thread 109 - block  1 - warp   1 - thread 45
cac_thread 110 - block  1 - warp   1 - thread 46
cac_thread 111 - block  1 - warp   1 - thread 47
cac_thread 112 - block  1 - warp   1 - thread 48
cac_thread 113 - block  1 - warp   1 - thread 49
cac_thread 114 - block  1 - warp   1 - thread 50
cac_thread 115 - block  1 - warp   1 - thread 51
cac_thread 116 - block  1 - warp   1 - thread 52
cac_thread 117 - block  1 - warp   1 - thread 53
cac_thread 118 - block  1 - warp   1 - thread 54
cac_thread 119 - block  1 - warp   1 - thread 55
cac_thread 120 - block  1 - warp   1 - thread 56
cac_thread 121 - block  1 - warp   1 - thread 57
cac_thread 122 - block  1 - warp   1 - thread 58
cac_thread 123 - block  1 - warp   1 - thread 59
cac_thread 124 - block  1 - warp   1 - thread 60
cac_thread 125 - block  1 - warp   1 - thread 61
cac_thread 126 - block  1 - warp   1 - thread 62
cac_thread 127 - block  1 - warp   1 - thread 63

2. 2 维 block 和 2 维 thread

第一、二列是用户调用 kernel 时设置的 block 个数 num_blocks = (1, 4),x 维是 1,y 维是 4;

第三、四列是用户调用 kernel 时设置的每个 block 中的 thread 个数 num_threads = (32, 4),x 维是 32,y 维是 4;

总的线程数计算为 (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y) = (1 * 4) * (32 * 4),共计 512 个线程。

    griddim_x[thread_idx] = gridDim.x; // 1
    griddim_y[thread_idx] = gridDim.y; // 4
    blockdim_x[thread_idx] = blockDim.x; // 32
    blockdim_y[thread_idx] = blockDim.y; // 4