CUDA 算子优化:Reduce 类
一、Sum 优化
Sum 是最基础的规约操作,将数组中所有元素累加求和。许多其他规约操作(如 max、min、mean)都可以复用 sum 优化的核心技巧。
本节源代码见 sum.cu。
1. Naive 实现
最直观的做法是让所有线程直接使用 atomicAdd 更新全局输出变量。这种方式会导致严重的串行化——所有线程必须排队依次访问同一内存地址,性能极差。
// Naive reduction: every thread issues one atomicAdd on the same global
// address, so all updates to *output serialize behind a single location.
__global__ void sum_naive(const float* input, float* output, int N) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < N) {
        atomicAdd(output, input[gid]);
    }
}
在 NVIDIA GeForce RTX 4090 D 上的测试结果如下,内存带宽利用率不足 1%:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil | Samples | Batch GPU |
|---|---|---|---|---|---|---|---|---|---|---|
| 1024 | 87402x | 26.369 us | 106.63% | 4.208 us | 7.23% | 243.348M | 974.342 MB/s | 0.10% | 202066x | 2.474 us |
| 4096 | 61504x | 28.746 us | 4.75% | 8.130 us | 3.52% | 503.784M | 2.016 GB/s | 0.20% | 77923x | 6.417 us |
| 16384 | 20768x | 44.552 us | 2.14% | 24.077 us | 2.05% | 680.496M | 2.722 GB/s | 0.27% | 22272x | 22.451 us |
| 65536 | 5665x | 108.819 us | 1.02% | 88.277 us | 0.41% | 742.394M | 2.970 GB/s | 0.29% | 5797x | 86.253 us |
| 262144 | 1458x | 363.942 us | 0.29% | 343.147 us | 0.08% | 763.942M | 3.056 GB/s | 0.30% | 1507x | 341.661 us |
| 1048576 | 367x | 1.388 ms | 0.16% | 1.366 ms | 0.03% | 767.625M | 3.071 GB/s | 0.30% | 383x | 1.363 ms |
| 4194304 | 92x | 5.474 ms | 0.03% | 5.452 ms | 0.01% | 769.374M | 3.077 GB/s | 0.31% | 96x | 5.449 ms |
| 16777216 | 23x | 21.820 ms | 0.02% | 21.796 ms | 0.00% | 769.750M | 3.079 GB/s | 0.31% | 24x | 21.792 ms |
2. 并行折半规约
假设 block 包含 256 个线程,数据规模恰好是 256 的倍数。我们可以分两步完成:先计算每个 block 内的局部和,再将各 block 的结果汇总。

block 内的求和采用折半规约策略。以 8 个线程为例:
- 第一轮:线程 0 与线程 4 求和,线程 1 与线程 5 求和,依此类推
- 第二轮:线程 0 与线程 2 求和,线程 1 与线程 3 求和
- 第三轮:线程 0 与线程 1 求和
过程如下图所示:

实现代码如下(block 间汇总在 CPU 端完成):
// Per-block tree reduction performed in place on global memory.
// Preconditions: blockDim.x is a power of 2 and N is a multiple of blockDim.x.
// Writes one partial sum per block to output[blockIdx.x]; the final
// accumulation across blocks is done on the host.
__global__ void sum_v0(float* input, float* output) {
    int tid = threadIdx.x;
    // Each block owns a contiguous slice of the input and reduces it in place.
    float* block_data = &input[blockIdx.x * blockDim.x];

    // Halve the number of active threads every round.
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            block_data[tid] += block_data[tid + stride];
        }
        // Barrier is outside the if: all threads of the block reach it,
        // and the next round only starts once this round's writes are visible.
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (tid == 0) {
        output[blockIdx.x] = block_data[0];
    }
}
在 NVIDIA GeForce RTX 4090 D 上的测试结果如下,内存带宽利用率显著提升:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 49920x | 20.523 us | 5.21% | 10.019 us | 8.40% | 102.210M | 410.438 MB/s | 0.04% |
| 4096 | 50592x | 20.390 us | 4.98% | 9.883 us | 8.89% | 414.430M | 1.664 GB/s | 0.17% |
| 16384 | 50464x | 20.447 us | 4.79% | 9.910 us | 8.25% | 1.653G | 6.639 GB/s | 0.66% |
| 65536 | 45312x | 21.551 us | 12.66% | 11.035 us | 24.67% | 5.939G | 23.849 GB/s | 2.37% |
| 262144 | 26528x | 29.450 us | 3.44% | 18.853 us | 4.90% | 13.905G | 55.836 GB/s | 5.54% |
| 1048576 | 22960x | 32.363 us | 4.56% | 21.783 us | 6.20% | 48.137G | 193.299 GB/s | 19.17% |
| 4194304 | 9072x | 66.174 us | 5.16% | 55.161 us | 6.10% | 76.038G | 305.340 GB/s | 30.29% |
| 16777216 | 4016x | 189.159 us | 7.82% | 178.278 us | 8.32% | 94.107G | 377.899 GB/s | 37.49% |
3. 折半规约 + 共享内存
前一版本直接在全局内存上操作。我们可以将数据先加载到共享内存,再进行规约,减少全局内存访问开销。
// Per-block tree reduction staged through shared memory.
// Assumes blockDim.x == 256 (size of the static shared array) and a power
// of 2. Out-of-range threads contribute 0, so N need not be a multiple of
// the block size. One partial sum per block is written to output.
__global__ void sum_v1(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // One slot per thread; 256 must match the launch's blockDim.x.
    __shared__ float sdata[256];

    // Stage one element per thread into shared memory (0 beyond the end).
    sdata[tid] = (gid < N) ? input[gid] : 0.0f;
    __syncthreads();

    // Tree reduction entirely within shared memory.
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
相较于前一版本,性能提升并不明显。主要原因是前一版本对同一块数据的反复读写大多已命中 L1/L2 缓存,改用共享内存并未显著减少实际的全局内存事务。NVIDIA GeForce RTX 4090 D 测试结果如下:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 49376x | 20.676 us | 4.30% | 10.127 us | 7.49% | 101.111M | 406.025 MB/s | 0.04% |
| 4096 | 50864x | 20.301 us | 4.35% | 9.832 us | 7.89% | 416.600M | 1.673 GB/s | 0.17% |
| 16384 | 50128x | 20.466 us | 4.46% | 9.976 us | 8.08% | 1.642G | 6.595 GB/s | 0.65% |
| 65536 | 48448x | 20.813 us | 8.80% | 10.322 us | 16.81% | 6.349G | 25.495 GB/s | 2.53% |
| 262144 | 26048x | 29.748 us | 3.78% | 19.204 us | 5.10% | 13.650G | 54.815 GB/s | 5.44% |
| 1048576 | 23744x | 31.959 us | 2.78% | 21.058 us | 3.67% | 49.794G | 199.952 GB/s | 19.83% |
| 4194304 | 9200x | 65.352 us | 6.79% | 54.413 us | 8.19% | 77.082G | 309.534 GB/s | 30.70% |
| 16777216 | 2704x | 196.522 us | 0.90% | 185.420 us | 0.98% | 90.482G | 363.344 GB/s | 36.04% |
可以使用 atomicAdd 在 kernel 内直接汇总各 block 的结果,避免 CPU-GPU 数据往返,进一步提升效率。
4. Warp Shuffle 优化
GPU 以 warp(32 个线程)为单位调度执行。同一 warp 内的线程以 SIMT(单指令多线程)方式同步执行,天然保持同步,无需 __syncthreads() 即可安全交换寄存器数据。
使用 __shfl_down_sync 可以在 warp 内高效地进行规约:
// Warp-wide sum using register shuffles: after the loop, lane 0 holds the
// total of all 32 lanes' inputs (other lanes hold partial values).
// The full mask 0xffffffff assumes all 32 lanes participate.
__inline__ __device__ float warp_reduce(float val) {
#pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, delta);
    }
    return val;
}
完整的 block 级实现如下(假设 blockDim ≤ 1024):
// Block-level sum built from warp shuffles, with a single atomicAdd per
// block into *output. Assumes blockDim.x is a multiple of 32 and at most
// 1024, and that *output was zero-initialized by the caller.
__global__ void sum_v4(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int warp = tid / 32;
    int lane = tid % 32;
    int warps_per_block = blockDim.x / 32;

    // One partial sum per warp; 32 slots cover blockDim.x <= 1024.
    __shared__ float partial[32];

    // Each thread contributes one element (0 beyond the end of the array).
    float v = (gid < N) ? input[gid] : 0.0f;

    // Step 1: reduce within each warp using register shuffles.
    v = warp_reduce(v);

    // Step 2: lane 0 of every warp parks its warp's sum in shared memory.
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();

    // Step 3: the first warp reduces the per-warp sums; a single atomicAdd
    // per block then folds the result into the global output.
    if (warp == 0) {
        v = (lane < warps_per_block) ? partial[lane] : 0.0f;
        v = warp_reduce(v);
        if (lane == 0) {
            atomicAdd(output, v);
        }
    }
}
NVIDIA GeForce RTX 4090 D 测试结果显示性能显著提升:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 51584x | 20.277 us | 4.45% | 9.693 us | 7.65% | 105.642M | 422.980 MB/s | 0.04% |
| 4096 | 51888x | 20.214 us | 4.32% | 9.638 us | 7.50% | 424.995M | 1.700 GB/s | 0.17% |
| 16384 | 51392x | 20.615 us | 5.03% | 9.732 us | 8.04% | 1.684G | 6.735 GB/s | 0.67% |
| 65536 | 50992x | 20.625 us | 4.16% | 9.808 us | 7.41% | 6.682G | 26.727 GB/s | 2.65% |
| 262144 | 36832x | 24.206 us | 14.72% | 13.577 us | 26.98% | 19.308G | 77.233 GB/s | 7.66% |
| 1048576 | 25952x | 29.795 us | 2.96% | 19.266 us | 3.44% | 54.425G | 217.700 GB/s | 21.60% |
| 4194304 | 11328x | 55.008 us | 3.34% | 44.178 us | 3.96% | 94.940G | 379.762 GB/s | 37.67% |
| 16777216 | 4256x | 128.793 us | 0.78% | 117.861 us | 0.65% | 142.348G | 569.391 GB/s | 56.48% |
5. Warp Shuffle + float4 向量化
进一步优化:每个线程使用 float4 加载 4 个元素,减少线程总数,提高内存访问效率。
// Block-level sum with float4 vectorized loads: each thread reads 4
// consecutive floats, reducing the thread count 4x. Assumes blockDim.x is a
// multiple of 32 and at most 1024, input is 16-byte aligned (guaranteed by
// cudaMalloc), and *output was zero-initialized by the caller.
//
// Fix vs. the previous version: the old guard `if (idx < N)` still issued a
// full 16-byte float4 load, reading past the end of the array whenever N is
// not a multiple of 4. The vector path now requires all 4 elements to be in
// range, and a scalar tail loop handles the final partial group.
__global__ void sum_v5(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    int num_warps = blockDim.x / 32;

    __shared__ float sdata[32];

    float val = 0.0f;
    if (idx + 4 <= N) {
        // Fast path: one 16-byte load covering 4 in-range elements.
        float4 tmp = *reinterpret_cast<const float4*>(&input[idx]);
        val += tmp.x;
        val += tmp.y;
        val += tmp.z;
        val += tmp.w;
    } else {
        // Scalar tail: handles the last 1-3 elements when N % 4 != 0
        // (loop body does not execute at all when idx >= N).
        for (int i = idx; i < N; ++i) {
            val += input[i];
        }
    }

    // Warp-level reduction in registers.
    val = warp_reduce(val);

    // First lane of each warp stashes its warp's sum.
    if (lane_id == 0) {
        sdata[warp_id] = val;
    }
    __syncthreads();

    // First warp reduces the per-warp sums; one atomicAdd per block.
    if (warp_id == 0) {
        val = (lane_id < num_warps) ? sdata[lane_id] : 0.0f;
        val = warp_reduce(val);
        if (lane_id == 0) {
            atomicAdd(output, val);
        }
    }
}
NVIDIA GeForce RTX 4090 D 测试结果如下,大数据量时带宽利用率接近 58%:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 51360x | 20.376 us | 4.60% | 9.738 us | 7.51% | 105.152M | 421.018 MB/s | 0.04% |
| 4096 | 50576x | 20.609 us | 5.78% | 9.889 us | 9.06% | 414.197M | 1.657 GB/s | 0.16% |
| 16384 | 52256x | 20.293 us | 4.51% | 9.569 us | 7.86% | 1.712G | 6.849 GB/s | 0.68% |
| 65536 | 51824x | 20.405 us | 5.49% | 9.649 us | 10.23% | 6.792G | 27.167 GB/s | 2.69% |
| 262144 | 39792x | 23.123 us | 13.14% | 12.568 us | 24.40% | 20.858G | 83.431 GB/s | 8.28% |
| 1048576 | 22528x | 32.624 us | 3.86% | 22.202 us | 5.06% | 47.228G | 188.913 GB/s | 18.74% |
| 4194304 | 11680x | 53.560 us | 3.51% | 42.849 us | 4.24% | 97.886G | 391.546 GB/s | 38.84% |
| 16777216 | 4336x | 126.296 us | 1.77% | 115.382 us | 1.89% | 145.406G | 581.623 GB/s | 57.70% |
二、Softmax 优化
Softmax 是深度学习中常用的激活函数,将向量转换为概率分布。其数学定义为:$\mathrm{softmax}(x_i) = \dfrac{e^{x_i}}{\sum_{j=1}^{N} e^{x_j}}$
计算 Softmax 需要先求最大值(用于数值稳定性)、再求指数和、最后逐元素计算。这涉及多次规约操作。
数值稳定性问题:直接计算 $e^{x_i}$ 可能导致数值溢出。例如当 $x_i = 100$ 时,$e^{100} \approx 2.7 \times 10^{43}$,超出 float 表示范围(最大约 $3.4 \times 10^{38}$)。解决方案是减去最大值:$\mathrm{softmax}(x_i) = \dfrac{e^{x_i - \max_j x_j}}{\sum_{j} e^{x_j - \max_j x_j}}$
减去 $\max_j x_j$ 后,所有指数的输入满足 $x_i - \max_j x_j \le 0$,因此 $e^{x_i - \max_j x_j} \le 1$,不会溢出。同时分子分母同除以 $e^{\max_j x_j}$,结果不变。
本节源代码见 softmax.cu。
1. CPU 实现
void softmax_cpu(const float* input, float* output, int N) { // 找最大值(防止指数溢出) float max_val = input[0]; for (int i = 1; i < N; i++) { max_val = std::max(max_val, input[i]); }
// 计算指数和 float sum = 0.0f; for (int i = 0; i < N; i++) { output[i] = expf(input[i] - max_val); sum += output[i]; }
// 归一化 for (int i = 0; i < N; i++) { output[i] /= sum; }}2. GPU 基础实现
将 CPU 逻辑直接移植到 GPU,使用三次独立的 kernel 调用:
// Warp-wide maximum using register shuffles: after the loop, lane 0 holds
// the max of all 32 lanes' inputs. Full mask assumes all lanes participate.
__inline__ __device__ float warp_reduce_max(float val) {
#pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val = fmax(val, __shfl_down_sync(0xffffffff, val, delta));
    }
    return val;
}
// Atomic max for float, built on integer atomicCAS (CUDA has no native
// float atomicMax). The CAS loop retries until the stored value was not
// changed by another thread between the read and the swap. Returns the
// value held at *address just before this thread's successful update.
__device__ static float atomicMax(float* address, float val) {
    int* addr_as_int = (int*)address;
    int observed = *addr_as_int;
    int expected;
    do {
        expected = observed;
        float proposed = fmax(val, __int_as_float(expected));
        observed = atomicCAS(addr_as_int, expected, __float_as_int(proposed));
    } while (expected != observed);
    return __int_as_float(observed);
}
// Kernel 1: global maximum. Each block computes its local max via warp
// shuffles, then folds it into *max_val with the float atomicMax above.
// Assumes blockDim.x is a multiple of 32 and at most 1024, and that the
// caller initialized *max_val to -FLT_MAX (or lower).
__global__ void reduce_max(const float* input, float* max_val, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;
    int warp = tid / 32;
    int lane = tid % 32;
    int warps_per_block = blockDim.x / 32;

    // One per-warp maximum; 32 slots cover blockDim.x <= 1024.
    __shared__ float partial[32];

    // Out-of-range threads contribute the identity for max.
    float v = (gid < N) ? input[gid] : (-FLT_MAX);

    // Step 1: max within each warp.
    v = warp_reduce_max(v);

    // Step 2: lane 0 of each warp publishes its warp's max.
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();

    // Step 3: first warp reduces the per-warp maxima; one atomic per block.
    if (warp == 0) {
        v = (lane < warps_per_block) ? partial[lane] : (-FLT_MAX);
        v = warp_reduce_max(v);
        if (lane == 0) {
            atomicMax(max_val, v);
        }
    }
}
// Warp-wide sum via shuffles; lane 0 ends up with the total of all 32
// lanes' inputs. Full mask assumes every lane of the warp participates.
__inline__ __device__ float warp_reduce_sum(float val) {
#pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, delta);
    }
    return val;
}
// Kernel 2: computes exp(x - max) per element and accumulates the global
// sum of exponentials into *sum_val. Assumes blockDim.x is a multiple of 32
// and at most 1024, *max_val was produced by reduce_max, and the caller
// zero-initialized *sum_val.
__global__ void softmax_exp_sum(
    const float* input, const float* max_val, float* sum_val, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;
    int warp = tid / 32;
    int lane = tid % 32;
    int warps_per_block = blockDim.x / 32;

    // One per-warp partial sum; 32 slots cover blockDim.x <= 1024.
    __shared__ float partial[32];

    // Shifted exponential; out-of-range threads contribute 0.
    float v = (gid < N) ? expf(input[gid] - *max_val) : 0.0f;

    // Step 1: sum within each warp.
    v = warp_reduce_sum(v);

    // Step 2: lane 0 of each warp publishes its warp's sum.
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();

    // Step 3: first warp reduces the per-warp sums; one atomic per block.
    if (warp == 0) {
        v = (lane < warps_per_block) ? partial[lane] : 0.0f;
        v = warp_reduce_sum(v);
        if (lane == 0) {
            atomicAdd(sum_val, v);
        }
    }
}
// Kernel 3: normalization. Recomputes exp(x - max) per element and divides
// by the global sum produced by the previous kernel.
__global__ void softmax_normalize(
    const float* input, const float* max_val, const float* sum_val,
    float* output, int N) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < N) {
        output[gid] = expf(input[gid] - *max_val) / (*sum_val);
    }
}
能不能进一步优化?
支持与分享
如果这篇文章对你有帮助,欢迎分享给更多人或赞助支持!