This document is aimed at newcomers to CUDA programming. Progressing from the basics to more advanced material, it explains the CUDA parallel programming model, its core concepts, and the characteristics of the language, and hopes to help new readers in the following ways:
Broadly speaking, CUDA C++ extends C++ by allowing the programmer to define C++ functions called kernels. When a kernel is invoked, it is executed N times in parallel by N different CUDA threads, rather than only once like a regular C++ function. A kernel is defined with the __global__ declaration specifier, and the number of CUDA threads that execute it for a given call is specified with the new <<<...>>> execution configuration syntax (see "C++ Language Extensions" in the CUDA documentation).
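To make the kernel and launch syntax concrete before turning to Matmul, here is a minimal, self-contained sketch (not part of the Matmul example; all names and sizes are illustrative): a vector-add kernel declared with __global__ and launched with the <<<blocks, threads>>> execution configuration.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Kernel definition: each thread adds one pair of elements.
__global__ void vec_add(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    const int n = 1 << 20;
    const size_t bytes = n * sizeof(float);

    // host buffers
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c = (float *)malloc(bytes);
    for (int i = 0; i < n; i++) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }

    // device buffers
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    // execution configuration: <<<number of blocks, threads per block>>>
    const int threads_per_block = 256;
    const int blocks = (n + threads_per_block - 1) / threads_per_block;
    vec_add<<<blocks, threads_per_block>>>(d_a, d_b, d_c, n);

    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
    printf("c[0] = %f\n", h_c[0]); // expect 3.0

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(h_a); free(h_b); free(h_c);
    return 0;
}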
Let us look at a naive Matmul implementation from both the CPU and the CUDA programming perspectives; the full code can be found in 手写实现矩阵乘 Matmul (a hand-written Matmul implementation).
Below is the CPU implementation, from which we can observe the following:
void sgemm_naive_cpu(float *A, float *B, float *C, int M, int N, int K) {
    for (int x = 0; x < M; x++) {
        for (int y = 0; y < N; y++) {
            float sum = 0.0f;
            for (int i = 0; i < K; i++) {
                sum += A[x * K + i] * B[i * N + y];
            }
            C[x * N + y] = sum;
        }
    }
}
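As a quick sanity check, a hypothetical driver such as the one below (the main function and the 2x3 / 3x2 matrices are made up for illustration) exercises sgemm_naive_cpu and shows the row-major layout assumed by the indexing A[x * K + i] and B[i * N + y].

#include <cstdio>

void sgemm_naive_cpu(float *A, float *B, float *C, int M, int N, int K); // defined above

int main() {
    float A[2 * 3] = {1, 2, 3, 4, 5, 6}; // M x K, row-major
    float B[3 * 2] = {1, 0, 0, 1, 1, 1}; // K x N, row-major
    float C[2 * 2] = {0};                // M x N
    sgemm_naive_cpu(A, B, C, /*M=*/2, /*N=*/2, /*K=*/3);
    printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]); // expect: 4 5 / 10 11
    return 0;
}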
Below is the CUDA implementation, from which we can observe the following:
__global__ void sgemm_naive(int M, int N, int K, float alpha, const float *A,
                            const float *B, float beta, float *C) {
    // compute position in C that this thread is responsible for
    const uint x = blockIdx.x * blockDim.x + threadIdx.x;
    const uint y = blockIdx.y * blockDim.y + threadIdx.y;

    // `if` condition is necessary for when M or N aren't multiples of 32.
    if (x < M && y < N) {
        float tmp = 0.0;
        for (int i = 0; i < K; ++i) {
            tmp += A[x * K + i] * B[i * N + y];
        }
        // C = α*(A@B)+β*C
        C[x * N + y] = alpha * tmp + beta * C[x * N + y];
    }
}
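This kernel computes one element of C per thread, so the host-side launch needs a 2D grid that covers the whole M x N output. The helper below is a hypothetical sketch (the function name and the device pointers d_A/d_B/d_C are assumptions); it only shows how the execution configuration is derived from M and N.

// One thread per element of C: x indexes rows (M), y indexes columns (N).
void launch_sgemm_naive(const float *d_A, const float *d_B, float *d_C,
                        int M, int N, int K, float alpha, float beta) {
    dim3 block(32, 32);
    dim3 grid((M + 31) / 32, (N + 31) / 32); // ceil(M / 32) x ceil(N / 32) blocks
    sgemm_naive<<<grid, block>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
}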
Below is the Matmul kernel after targeted optimization:
template <const int BM, const int BN, const int BK, const int TM>
__global__ void sgemm_blocktiling_1d_kernel(float *A, float *B, float *C, int M, int N, int K) {
    // the output block that we want to compute in this threadblock
    const uint c_row = blockIdx.y;
    const uint c_col = blockIdx.x;

    // allocate shared memory for the input and output submatrices
    __shared__ float A_shared[BM * BK];
    __shared__ float B_shared[BK * BN];

    // the inner row & col that we're accessing in this thread
    const uint thread_row = threadIdx.x / BN;
    const uint thread_col = threadIdx.x % BN;

    // advance pointers to the starting positions
    A += c_row * BM * K;
    B += c_col * BN;
    C += c_row * BM * N + c_col * BN;

    // use to avoid out-of-bounds accesses
    int global_m_pos = c_row * BM * K;
    int global_n_pos = c_col * BN;
    const uint m_size = M * K;
    const uint n_size = N * K;

    assert(BM * BK == blockDim.x);
    assert(BN * BK == blockDim.x);

    const uint A_inner_row = threadIdx.x / BK; // warp-level GMEM coalescing
    const uint A_inner_col = threadIdx.x % BK;
    const uint B_inner_row = threadIdx.x / BN; // warp-level GMEM coalescing
    const uint B_inner_col = threadIdx.x % BN;

    // allocate thread-local cache for results in registerfile
    float thread_results[TM] = {0.0};

    // outer loop over block tiles
    for (uint bk_idx = 0; bk_idx < K; bk_idx += BK) {
        // load the next block of the input matrices into shared memory
        A_shared[A_inner_row * BK + A_inner_col] =
            (global_m_pos + A_inner_row * K + A_inner_col < m_size)
                ? A[A_inner_row * K + A_inner_col]
                : 0.0f;
        B_shared[B_inner_row * BN + B_inner_col] =
            (global_n_pos + B_inner_row * N + B_inner_col < n_size)
                ? B[B_inner_row * N + B_inner_col]
                : 0.0f;

        // wait for all threads to finish loading
        __syncthreads();

        // advance the pointers
        A += BK;
        B += BK * N;
        global_m_pos += BK;
        global_n_pos += BK * N;

        // compute the partial sum
        for (uint dot_idx = 0; dot_idx < BK; dot_idx++) {
            // we make the dotproduct loop the outside loop, which facilitates
            // reuse of the Bs entry, which we can cache in a tmp var.
            float tmp_b = B_shared[dot_idx * BN + thread_col];
            for (uint res_idx = 0; res_idx < TM; res_idx++) {
                thread_results[res_idx] +=
                    A_shared[(thread_row * TM + res_idx) * BK + dot_idx] * tmp_b;
            }
        }

        // wait for all threads to finish computing
        __syncthreads();
    }

    // write the per-thread results back to C, guarding against out-of-bounds rows/cols
    for (uint res_idx = 0; res_idx < TM; res_idx++) {
        if (c_row * BM + thread_row * TM + res_idx < M && c_col * BN + thread_col < N) {
            C[(thread_row * TM + res_idx) * N + thread_col] = thread_results[res_idx];
        }
    }
}
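The template parameters couple the launch configuration to the kernel: the asserts require BM * BK == BN * BK == blockDim.x, and each thread accumulates TM results, so BM must equal (blockDim.x / BN) * TM. The launcher below is a hypothetical sketch; BM = BN = 64, BK = 8, TM = 8 are assumed values chosen to satisfy those constraints, not values prescribed by the text above.

void launch_sgemm_blocktiling_1d(float *d_A, float *d_B, float *d_C,
                                 int M, int N, int K) {
    // assumed tile sizes; they satisfy BM * BK == BN * BK == 512 == blockDim.x
    // and BM == (blockDim.x / BN) * TM, as required by the kernel.
    constexpr int BM = 64, BN = 64, BK = 8, TM = 8;
    dim3 grid((N + BN - 1) / BN, (M + BM - 1) / BM); // blockIdx.x -> column tiles, blockIdx.y -> row tiles
    dim3 block(BM * BK);                             // 512 threads, 1-D block
    sgemm_blocktiling_1d_kernel<BM, BN, BK, TM>
        <<<grid, block>>>(d_A, d_B, d_C, M, N, K);
}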
In the optimized code above, some new code constructs and concept keywords appear:
- the __shared__ keyword
- __syncthreads()

This gives you an overview of the full optimization workflow, but it is only the tip of the iceberg of CUDA programming. Next, we will try to lift a corner of the curtain together, covering the following dimensions: