本章节旨在帮助用户了解 cuda 内部线程块划分的规则,理解线程号的计算逻辑。
样例中设置了两个 block,每个 block 中有 64 个线程,即 blockDim.x = 64。其中:
blockIdx.x 代表当前线程所在第几个 block;
threadIdx.x 代表当前线程在当前 block 中是第几个 thread;
warp_idx 代表当前线程在当前 block 中属于第几个 warp(线程号相邻的 32 个线程组成一个 warp,如运行结果中 thread 0~31 属于 warp 0,thread 32~63 属于 warp 1);
calc_idx(即下文代码中的 thread_idx,对应运行结果中的 cac_thread 列)代表当前线程是全局的第几个 thread;
全局索引的计算方式为:block 的索引 * 每个 block 的 thread 个数 + block 内的 thread 索引,对应代码如下:
const unsigned int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
编译命令
nvcc my_id.cu -o my_id
执行命令
./my_id
运行结果
cac_thread 0 - block 0 - warp 0 - thread 0
cac_thread 1 - block 0 - warp 0 - thread 1
cac_thread 2 - block 0 - warp 0 - thread 2
cac_thread 3 - block 0 - warp 0 - thread 3
cac_thread 4 - block 0 - warp 0 - thread 4
cac_thread 5 - block 0 - warp 0 - thread 5
cac_thread 6 - block 0 - warp 0 - thread 6
cac_thread 7 - block 0 - warp 0 - thread 7
cac_thread 8 - block 0 - warp 0 - thread 8
cac_thread 9 - block 0 - warp 0 - thread 9
cac_thread 10 - block 0 - warp 0 - thread 10
cac_thread 11 - block 0 - warp 0 - thread 11
cac_thread 12 - block 0 - warp 0 - thread 12
cac_thread 13 - block 0 - warp 0 - thread 13
cac_thread 14 - block 0 - warp 0 - thread 14
cac_thread 15 - block 0 - warp 0 - thread 15
cac_thread 16 - block 0 - warp 0 - thread 16
cac_thread 17 - block 0 - warp 0 - thread 17
cac_thread 18 - block 0 - warp 0 - thread 18
cac_thread 19 - block 0 - warp 0 - thread 19
cac_thread 20 - block 0 - warp 0 - thread 20
cac_thread 21 - block 0 - warp 0 - thread 21
cac_thread 22 - block 0 - warp 0 - thread 22
cac_thread 23 - block 0 - warp 0 - thread 23
cac_thread 24 - block 0 - warp 0 - thread 24
cac_thread 25 - block 0 - warp 0 - thread 25
cac_thread 26 - block 0 - warp 0 - thread 26
cac_thread 27 - block 0 - warp 0 - thread 27
cac_thread 28 - block 0 - warp 0 - thread 28
cac_thread 29 - block 0 - warp 0 - thread 29
cac_thread 30 - block 0 - warp 0 - thread 30
cac_thread 31 - block 0 - warp 0 - thread 31
cac_thread 32 - block 0 - warp 1 - thread 32
cac_thread 33 - block 0 - warp 1 - thread 33
cac_thread 34 - block 0 - warp 1 - thread 34
cac_thread 35 - block 0 - warp 1 - thread 35
cac_thread 36 - block 0 - warp 1 - thread 36
cac_thread 37 - block 0 - warp 1 - thread 37
cac_thread 38 - block 0 - warp 1 - thread 38
cac_thread 39 - block 0 - warp 1 - thread 39
cac_thread 40 - block 0 - warp 1 - thread 40
cac_thread 41 - block 0 - warp 1 - thread 41
cac_thread 42 - block 0 - warp 1 - thread 42
cac_thread 43 - block 0 - warp 1 - thread 43
cac_thread 44 - block 0 - warp 1 - thread 44
cac_thread 45 - block 0 - warp 1 - thread 45
cac_thread 46 - block 0 - warp 1 - thread 46
cac_thread 47 - block 0 - warp 1 - thread 47
cac_thread 48 - block 0 - warp 1 - thread 48
cac_thread 49 - block 0 - warp 1 - thread 49
cac_thread 50 - block 0 - warp 1 - thread 50
cac_thread 51 - block 0 - warp 1 - thread 51
cac_thread 52 - block 0 - warp 1 - thread 52
cac_thread 53 - block 0 - warp 1 - thread 53
cac_thread 54 - block 0 - warp 1 - thread 54
cac_thread 55 - block 0 - warp 1 - thread 55
cac_thread 56 - block 0 - warp 1 - thread 56
cac_thread 57 - block 0 - warp 1 - thread 57
cac_thread 58 - block 0 - warp 1 - thread 58
cac_thread 59 - block 0 - warp 1 - thread 59
cac_thread 60 - block 0 - warp 1 - thread 60
cac_thread 61 - block 0 - warp 1 - thread 61
cac_thread 62 - block 0 - warp 1 - thread 62
cac_thread 63 - block 0 - warp 1 - thread 63
cac_thread 64 - block 1 - warp 0 - thread 0
cac_thread 65 - block 1 - warp 0 - thread 1
cac_thread 66 - block 1 - warp 0 - thread 2
cac_thread 67 - block 1 - warp 0 - thread 3
cac_thread 68 - block 1 - warp 0 - thread 4
cac_thread 69 - block 1 - warp 0 - thread 5
cac_thread 70 - block 1 - warp 0 - thread 6
cac_thread 71 - block 1 - warp 0 - thread 7
cac_thread 72 - block 1 - warp 0 - thread 8
cac_thread 73 - block 1 - warp 0 - thread 9
cac_thread 74 - block 1 - warp 0 - thread 10
cac_thread 75 - block 1 - warp 0 - thread 11
cac_thread 76 - block 1 - warp 0 - thread 12
cac_thread 77 - block 1 - warp 0 - thread 13
cac_thread 78 - block 1 - warp 0 - thread 14
cac_thread 79 - block 1 - warp 0 - thread 15
cac_thread 80 - block 1 - warp 0 - thread 16
cac_thread 81 - block 1 - warp 0 - thread 17
cac_thread 82 - block 1 - warp 0 - thread 18
cac_thread 83 - block 1 - warp 0 - thread 19
cac_thread 84 - block 1 - warp 0 - thread 20
cac_thread 85 - block 1 - warp 0 - thread 21
cac_thread 86 - block 1 - warp 0 - thread 22
cac_thread 87 - block 1 - warp 0 - thread 23
cac_thread 88 - block 1 - warp 0 - thread 24
cac_thread 89 - block 1 - warp 0 - thread 25
cac_thread 90 - block 1 - warp 0 - thread 26
cac_thread 91 - block 1 - warp 0 - thread 27
cac_thread 92 - block 1 - warp 0 - thread 28
cac_thread 93 - block 1 - warp 0 - thread 29
cac_thread 94 - block 1 - warp 0 - thread 30
cac_thread 95 - block 1 - warp 0 - thread 31
cac_thread 96 - block 1 - warp 1 - thread 32
cac_thread 97 - block 1 - warp 1 - thread 33
cac_thread 98 - block 1 - warp 1 - thread 34
cac_thread 99 - block 1 - warp 1 - thread 35
cac_thread 100 - block 1 - warp 1 - thread 36
cac_thread 101 - block 1 - warp 1 - thread 37
cac_thread 102 - block 1 - warp 1 - thread 38
cac_thread 103 - block 1 - warp 1 - thread 39
cac_thread 104 - block 1 - warp 1 - thread 40
cac_thread 105 - block 1 - warp 1 - thread 41
cac_thread 106 - block 1 - warp 1 - thread 42
cac_thread 107 - block 1 - warp 1 - thread 43
cac_thread 108 - block 1 - warp 1 - thread 44
cac_thread 109 - block 1 - warp 1 - thread 45
cac_thread 110 - block 1 - warp 1 - thread 46
cac_thread 111 - block 1 - warp 1 - thread 47
cac_thread 112 - block 1 - warp 1 - thread 48
cac_thread 113 - block 1 - warp 1 - thread 49
cac_thread 114 - block 1 - warp 1 - thread 50
cac_thread 115 - block 1 - warp 1 - thread 51
cac_thread 116 - block 1 - warp 1 - thread 52
cac_thread 117 - block 1 - warp 1 - thread 53
cac_thread 118 - block 1 - warp 1 - thread 54
cac_thread 119 - block 1 - warp 1 - thread 55
cac_thread 120 - block 1 - warp 1 - thread 56
cac_thread 121 - block 1 - warp 1 - thread 57
cac_thread 122 - block 1 - warp 1 - thread 58
cac_thread 123 - block 1 - warp 1 - thread 59
cac_thread 124 - block 1 - warp 1 - thread 60
cac_thread 125 - block 1 - warp 1 - thread 61
cac_thread 126 - block 1 - warp 1 - thread 62
cac_thread 127 - block 1 - warp 1 - thread 63
第一、二列是用户调用 kernel 时设置的 block 个数 num_blocks = (1, 4),即 x 维是 1,y 维是 4;
第三、四列是用户调用 kernel 时设置的每个 block 中的 thread 个数 num_threads = (32, 4),即 x 维是 32,y 维是 4;
总的线程数计算为 (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y) = (1 * 4) * (32 * 4),共计 512 个线程。
griddim_x[thread_idx] = gridDim.x;   // 1
griddim_y[thread_idx] = gridDim.y;   // 4
blockdim_x[thread_idx] = blockDim.x; // 32
blockdim_y[thread_idx] = blockDim.y; // 4