CUDA Operator Optimization: Reduce Operators

2026-04-17

I. Sum Optimization

Sum is the most basic reduction: it accumulates all elements of an array into a single value. Many other reductions (max, min, mean, and so on) reuse the same core optimization techniques.

Important

The source code for this section is in sum.cu.

1. Naive Implementation

The most straightforward approach is to have every thread call atomicAdd on a single global output variable. This serializes heavily: all threads queue up to update the same memory address, so performance is very poor.

__global__ void sum_naive(const float* input, float* output, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) atomicAdd(output, input[idx]);
}
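For reference, a minimal host-side launch might look like the sketch below. The wrapper name and buffer names are made up for illustration and are not from sum.cu; the key point is that the output scalar must be zeroed before the kernel accumulates into it.

// Host-side sketch (hypothetical helper, not from sum.cu).
// d_in holds N floats on the device, d_out is a single device float.
void launch_sum_naive(const float* d_in, float* d_out, int N) {
    cudaMemset(d_out, 0, sizeof(float));   // atomicAdd accumulates, so start from 0
    int block = 256;
    int grid  = (N + block - 1) / block;   // one thread per element
    sum_naive<<<grid, block>>>(d_in, d_out, N);
}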

Measured on an NVIDIA GeForce RTX 4090 D, memory-bandwidth utilization stays below 1%:

| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil | Samples | Batch GPU |
|---|---|---|---|---|---|---|---|---|---|---|
| 1024 | 87402x | 26.369 us | 106.63% | 4.208 us | 7.23% | 243.348M | 974.342 MB/s | 0.10% | 202066x | 2.474 us |
| 4096 | 61504x | 28.746 us | 4.75% | 8.130 us | 3.52% | 503.784M | 2.016 GB/s | 0.20% | 77923x | 6.417 us |
| 16384 | 20768x | 44.552 us | 2.14% | 24.077 us | 2.05% | 680.496M | 2.722 GB/s | 0.27% | 22272x | 22.451 us |
| 65536 | 5665x | 108.819 us | 1.02% | 88.277 us | 0.41% | 742.394M | 2.970 GB/s | 0.29% | 5797x | 86.253 us |
| 262144 | 1458x | 363.942 us | 0.29% | 343.147 us | 0.08% | 763.942M | 3.056 GB/s | 0.30% | 1507x | 341.661 us |
| 1048576 | 367x | 1.388 ms | 0.16% | 1.366 ms | 0.03% | 767.625M | 3.071 GB/s | 0.30% | 383x | 1.363 ms |
| 4194304 | 92x | 5.474 ms | 0.03% | 5.452 ms | 0.01% | 769.374M | 3.077 GB/s | 0.31% | 96x | 5.449 ms |
| 16777216 | 23x | 21.820 ms | 0.02% | 21.796 ms | 0.00% | 769.750M | 3.079 GB/s | 0.31% | 24x | 21.792 ms |

2. Parallel Halving Reduction

Assume each block has 256 threads and the data size is an exact multiple of 256. The computation proceeds in two steps: first compute a partial sum within each block, then combine the per-block results.

(Figure: the two-step parallel reduction scheme)

Within a block, the sum is computed with a halving (tree) reduction. Taking 8 threads as an example:

  • Round 1: thread 0 adds thread 4's value, thread 1 adds thread 5's value, and so on
  • Round 2: thread 0 adds thread 2's value, thread 1 adds thread 3's value
  • Round 3: thread 0 adds thread 1's value

The process is illustrated below:

(Figure: halving reduction within a block)

The kernel is shown below (the per-block results are combined on the CPU; a host-side sketch follows the kernel):

// assume blockDim.x is a power of 2 and N is a multiple of blockDim.x
__global__ void sum_v0(float* input, float* output) {
    int tid = threadIdx.x;
    float* x = &input[blockIdx.x * blockDim.x];
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
        if (tid < offset) {
            x[tid] += x[tid + offset];
        }
        __syncthreads();
    }
    if (tid == 0) output[blockIdx.x] = x[0];
}
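A minimal host-side sketch of the two-step scheme might look like the following. The helper name and buffers are assumptions for illustration (not from sum.cu): launch one block per 256 elements, copy the per-block partial sums back, and finish the reduction on the CPU.

#include <vector>

// Host-side sketch (assumes N is a multiple of 256; d_in/d_partial are device buffers,
// d_partial holds at least N/256 floats; note sum_v0 reduces d_in in place)
float sum_two_step(float* d_in, float* d_partial, int N) {
    const int block = 256;
    const int grid  = N / block;                      // one partial sum per block
    sum_v0<<<grid, block>>>(d_in, d_partial);

    std::vector<float> h_partial(grid);
    cudaMemcpy(h_partial.data(), d_partial, grid * sizeof(float), cudaMemcpyDeviceToHost);

    float total = 0.0f;
    for (float p : h_partial) total += p;             // combine block results on the CPU
    return total;
}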

Measured on the NVIDIA GeForce RTX 4090 D, bandwidth utilization improves substantially:

| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 49920x | 20.523 us | 5.21% | 10.019 us | 8.40% | 102.210M | 410.438 MB/s | 0.04% |
| 4096 | 50592x | 20.390 us | 4.98% | 9.883 us | 8.89% | 414.430M | 1.664 GB/s | 0.17% |
| 16384 | 50464x | 20.447 us | 4.79% | 9.910 us | 8.25% | 1.653G | 6.639 GB/s | 0.66% |
| 65536 | 45312x | 21.551 us | 12.66% | 11.035 us | 24.67% | 5.939G | 23.849 GB/s | 2.37% |
| 262144 | 26528x | 29.450 us | 3.44% | 18.853 us | 4.90% | 13.905G | 55.836 GB/s | 5.54% |
| 1048576 | 22960x | 32.363 us | 4.56% | 21.783 us | 6.20% | 48.137G | 193.299 GB/s | 19.17% |
| 4194304 | 9072x | 66.174 us | 5.16% | 55.161 us | 6.10% | 76.038G | 305.340 GB/s | 30.29% |
| 16777216 | 4016x | 189.159 us | 7.82% | 178.278 us | 8.32% | 94.107G | 377.899 GB/s | 37.49% |

3. Halving Reduction + Shared Memory

The previous version operates directly on global memory. We can instead stage the data in shared memory first and reduce there, cutting the cost of global-memory accesses.

__global__ void sum_v1(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float sdata[256];
    // Load data to shared memory with boundary check
    sdata[tid] = (idx < N) ? input[idx] : 0.0f;
    __syncthreads();
    // Reduction in shared memory
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
        if (tid < offset) {
            sdata[tid] += sdata[tid + offset];
        }
        __syncthreads();
    }
    if (tid == 0) output[blockIdx.x] = sdata[0];
}

Compared with the previous version, the gain is small. The likely reason is that the repeated global-memory accesses in sum_v0 are already served largely from the GPU's L1/L2 caches, so explicitly staging the data in shared memory saves little. Results on the NVIDIA GeForce RTX 4090 D:

| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 49376x | 20.676 us | 4.30% | 10.127 us | 7.49% | 101.111M | 406.025 MB/s | 0.04% |
| 4096 | 50864x | 20.301 us | 4.35% | 9.832 us | 7.89% | 416.600M | 1.673 GB/s | 0.17% |
| 16384 | 50128x | 20.466 us | 4.46% | 9.976 us | 8.08% | 1.642G | 6.595 GB/s | 0.65% |
| 65536 | 48448x | 20.813 us | 8.80% | 10.322 us | 16.81% | 6.349G | 25.495 GB/s | 2.53% |
| 262144 | 26048x | 29.748 us | 3.78% | 19.204 us | 5.10% | 13.650G | 54.815 GB/s | 5.44% |
| 1048576 | 23744x | 31.959 us | 2.78% | 21.058 us | 3.67% | 49.794G | 199.952 GB/s | 19.83% |
| 4194304 | 9200x | 65.352 us | 6.79% | 54.413 us | 8.19% | 77.082G | 309.534 GB/s | 30.70% |
| 16777216 | 2704x | 196.522 us | 0.90% | 185.420 us | 0.98% | 90.482G | 363.344 GB/s | 36.04% |

Important

The per-block results can also be combined inside the kernel with atomicAdd, avoiding the CPU-GPU round trip and improving efficiency further; a sketch follows.
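A minimal sketch of that variant, assuming the same 256-thread blocks as sum_v1 (the kernel name sum_v1_atomic is made up here, and output must be zeroed before launch):

__global__ void sum_v1_atomic(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float sdata[256];
    sdata[tid] = (idx < N) ? input[idx] : 0.0f;
    __syncthreads();
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
        if (tid < offset) sdata[tid] += sdata[tid + offset];
        __syncthreads();
    }
    // Each block adds its partial sum directly into the single output scalar
    if (tid == 0) atomicAdd(output, sdata[0]);
}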

4. Warp Shuffle Optimization

The GPU schedules and executes threads in units of warps (32 threads). Threads within a warp execute in SIMT (single instruction, multiple threads) fashion and stay in step with one another, so they can safely exchange register data without __syncthreads().

Using __shfl_down_sync, the reduction within a warp can be done efficiently:

__inline__ __device__ float warp_reduce(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
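At each step, __shfl_down_sync(0xffffffff, val, offset) gives lane i the value of val held by lane i + offset (the upper offset lanes simply keep their own value). After the five steps with offsets 16, 8, 4, 2, 1, lane 0 holds the sum of all 32 lanes.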

The complete block-level implementation follows (assuming blockDim.x ≤ 1024, i.e. at most 32 warps per block):

__global__ void sum_v4(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    int num_warps = blockDim.x / 32;
    __shared__ float sdata[32];
    // Load data with boundary check
    float val = (idx < N) ? input[idx] : 0.0f;
    // Warp-level reduction
    val = warp_reduce(val);
    // Only first lane of each warp writes to shared memory
    if (lane_id == 0) {
        sdata[warp_id] = val;
    }
    __syncthreads();
    // First warp reduces the warp results
    if (warp_id == 0) {
        val = (lane_id < num_warps) ? sdata[lane_id] : 0.0f;
        val = warp_reduce(val);
        if (lane_id == 0) {
            atomicAdd(output, val);
        }
    }
}
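As with the naive version, output must be set to zero (e.g. with cudaMemset) before each launch, since every block accumulates its partial sum into it with atomicAdd.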

Results on the NVIDIA GeForce RTX 4090 D show a substantial improvement:

| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 51584x | 20.277 us | 4.45% | 9.693 us | 7.65% | 105.642M | 422.980 MB/s | 0.04% |
| 4096 | 51888x | 20.214 us | 4.32% | 9.638 us | 7.50% | 424.995M | 1.700 GB/s | 0.17% |
| 16384 | 51392x | 20.615 us | 5.03% | 9.732 us | 8.04% | 1.684G | 6.735 GB/s | 0.67% |
| 65536 | 50992x | 20.625 us | 4.16% | 9.808 us | 7.41% | 6.682G | 26.727 GB/s | 2.65% |
| 262144 | 36832x | 24.206 us | 14.72% | 13.577 us | 26.98% | 19.308G | 77.233 GB/s | 7.66% |
| 1048576 | 25952x | 29.795 us | 2.96% | 19.266 us | 3.44% | 54.425G | 217.700 GB/s | 21.60% |
| 4194304 | 11328x | 55.008 us | 3.34% | 44.178 us | 3.96% | 94.940G | 379.762 GB/s | 37.67% |
| 16777216 | 4256x | 128.793 us | 0.78% | 117.861 us | 0.65% | 142.348G | 569.391 GB/s | 56.48% |

5. Warp Shuffle + float4 Vectorization

A further optimization: each thread loads 4 elements at once via float4, reducing the total number of threads and improving memory-access efficiency.

__global__ void sum_v5(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    int num_warps = blockDim.x / 32;
    __shared__ float sdata[32];
    // Load 4 elements per thread with boundary check
    float val = 0.0f;
    if (idx + 3 < N) {
        float4 tmp = *reinterpret_cast<const float4 *>(&input[idx]);
        val = tmp.x + tmp.y + tmp.z + tmp.w;
    } else {
        // Tail: fewer than 4 elements remain for this thread
        for (int i = idx; i < N; i++) val += input[i];
    }
    // Warp-level reduction
    val = warp_reduce(val);
    // Only first lane of each warp writes to shared memory
    if (lane_id == 0) {
        sdata[warp_id] = val;
    }
    __syncthreads();
    // First warp reduces the warp results
    if (warp_id == 0) {
        val = (lane_id < num_warps) ? sdata[lane_id] : 0.0f;
        val = warp_reduce(val);
        if (lane_id == 0) {
            atomicAdd(output, val);
        }
    }
}
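Because every thread now covers 4 elements, the grid must shrink accordingly. A minimal launch sketch under that assumption (the wrapper name is hypothetical, not from sum.cu):

// Host-side sketch: each thread handles 4 elements, so size the grid over N/4
void launch_sum_v5(const float* d_in, float* d_out, int N) {
    cudaMemset(d_out, 0, sizeof(float));           // blocks accumulate via atomicAdd
    const int block = 256;
    const int elems_per_block = block * 4;
    const int grid = (N + elems_per_block - 1) / elems_per_block;
    sum_v5<<<grid, block>>>(d_in, d_out, N);
}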

Results on the NVIDIA GeForce RTX 4090 D; at the largest sizes, bandwidth utilization approaches 58%:

| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 51360x | 20.376 us | 4.60% | 9.738 us | 7.51% | 105.152M | 421.018 MB/s | 0.04% |
| 4096 | 50576x | 20.609 us | 5.78% | 9.889 us | 9.06% | 414.197M | 1.657 GB/s | 0.16% |
| 16384 | 52256x | 20.293 us | 4.51% | 9.569 us | 7.86% | 1.712G | 6.849 GB/s | 0.68% |
| 65536 | 51824x | 20.405 us | 5.49% | 9.649 us | 10.23% | 6.792G | 27.167 GB/s | 2.69% |
| 262144 | 39792x | 23.123 us | 13.14% | 12.568 us | 24.40% | 20.858G | 83.431 GB/s | 8.28% |
| 1048576 | 22528x | 32.624 us | 3.86% | 22.202 us | 5.06% | 47.228G | 188.913 GB/s | 18.74% |
| 4194304 | 11680x | 53.560 us | 3.51% | 42.849 us | 4.24% | 97.886G | 391.546 GB/s | 38.84% |
| 16777216 | 4336x | 126.296 us | 1.77% | 115.382 us | 1.89% | 145.406G | 581.623 GB/s | 57.70% |

II. Softmax Optimization

Softmax is a widely used activation function in deep learning that converts a vector into a probability distribution. It is defined as:

$$\text{Softmax}(x_i) = \frac{e^{x_i}}{\sum_{j} e^{x_j}}$$

Computing Softmax requires three passes: find the maximum (for numerical stability), compute the sum of exponentials, and finally compute each output element. This involves multiple reduction operations.

Numerical stability: computing $e^{x_i}$ directly can overflow. For example, with $x_i = 100$, $e^{100} \approx 2.7 \times 10^{43}$, which exceeds the range of a float. The fix is to subtract the maximum first:

$$\text{Softmax}(x_i) = \frac{e^{x_i}}{\sum_{j} e^{x_j}} = \frac{e^{x_i - \max(x)}}{\sum_{j} e^{x_j - \max(x)}}$$

After subtracting $\max(x)$, every exponent input satisfies $x_i - \max(x) \leq 0$, so $e^{x_i - \max(x)} \in (0, 1]$ and cannot overflow. Since numerator and denominator are both divided by $e^{\max(x)}$, the result is unchanged.

Important

The source code for this section is in softmax.cu.

1. CPU Implementation

void softmax_cpu(const float* input, float* output, int N) {
    // Find the maximum (guards against exponent overflow)
    float max_val = input[0];
    for (int i = 1; i < N; i++) {
        max_val = std::max(max_val, input[i]);
    }
    // Sum of exponentials
    float sum = 0.0f;
    for (int i = 0; i < N; i++) {
        output[i] = expf(input[i] - max_val);
        sum += output[i];
    }
    // Normalize
    for (int i = 0; i < N; i++) {
        output[i] /= sum;
    }
}
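A quick host-side check of this reference implementation, using made-up sample values (the resulting probabilities should sum to 1):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};  // arbitrary example input
    float y[4];
    softmax_cpu(x, y, 4);
    float total = 0.0f;
    for (int i = 0; i < 4; i++) total += y[i];
    printf("sum of probabilities = %f\n", total);  // expect ~1.0
    return 0;
}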

2. Baseline GPU Implementation

Port the CPU logic directly to the GPU, using three separate kernel launches:

__inline__ __device__ float warp_reduce_max(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val = fmax(val, __shfl_down_sync(0xffffffff, val, offset));
    }
    return val;
}

__device__ static float atomicMax(float* address, float val) {
    int* address_as_i = (int*)address;
    int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_i, assumed,
                        __float_as_int(fmax(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}

// Kernel 1: find the maximum
__global__ void reduce_max(const float* input, float* max_val, int N) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    int num_warps = blockDim.x / 32;
    __shared__ float sdata[32];
    float val = (idx < N) ? input[idx] : (-FLT_MAX);
    val = warp_reduce_max(val);
    if (lane_id == 0) {
        sdata[warp_id] = val;
    }
    __syncthreads();
    if (warp_id == 0) {
        val = (lane_id < num_warps) ? sdata[lane_id] : (-FLT_MAX);
        val = warp_reduce_max(val);
        if (lane_id == 0) {
            atomicMax(max_val, val);
        }
    }
}

__inline__ __device__ float warp_reduce_sum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}

// Kernel 2: compute exp(x - max) and accumulate the sum
__global__ void softmax_exp_sum(
    const float* input, const float* max_val, float* sum_val, int N
) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    int num_warps = blockDim.x / 32;
    __shared__ float sdata[32];
    float val = (idx < N) ? expf(input[idx] - *max_val) : 0.0f;
    val = warp_reduce_sum(val);
    if (lane_id == 0) {
        sdata[warp_id] = val;
    }
    __syncthreads();
    if (warp_id == 0) {
        val = (lane_id < num_warps) ? sdata[lane_id] : 0.0f;
        val = warp_reduce_sum(val);
        if (lane_id == 0) {
            atomicAdd(sum_val, val);
        }
    }
}

// Kernel 3: normalize
__global__ void softmax_normalize(
    const float* input, const float* max_val,
    const float* sum_val, float* output, int N
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) output[idx] = expf(input[idx] - *max_val) / (*sum_val);
}
Note

Can this be optimized further?
