CUDA 算子优化:Reduce 类
一、Sum 优化
Sum 是最基础的规约操作,将数组中所有元素累加求和。许多其他规约操作(如 max、min、mean)都可以复用 sum 优化的核心技巧。
本节源代码见 sum.cu。
1. Naive 实现
最直观的做法是让所有线程直接使用 atomicAdd 更新全局输出变量。这种方式会导致严重的串行化——所有线程必须排队依次访问同一内存地址,性能极差。
// Naive reduction: every thread issues one atomicAdd on the same global
// address, so all updates to *output serialize behind a single location.
__global__ void sum_naive(const float* input, float* output, int N) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < N) {
        atomicAdd(output, input[gid]);
    }
}
在 NVIDIA GeForce RTX 4090 D 上的测试结果如下,内存带宽利用率不足 1%:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil | Samples | Batch GPU |
|---|---|---|---|---|---|---|---|---|---|---|
| 1024 | 87402x | 26.369 us | 106.63% | 4.208 us | 7.23% | 243.348M | 974.342 MB/s | 0.10% | 202066x | 2.474 us |
| 4096 | 61504x | 28.746 us | 4.75% | 8.130 us | 3.52% | 503.784M | 2.016 GB/s | 0.20% | 77923x | 6.417 us |
| 16384 | 20768x | 44.552 us | 2.14% | 24.077 us | 2.05% | 680.496M | 2.722 GB/s | 0.27% | 22272x | 22.451 us |
| 65536 | 5665x | 108.819 us | 1.02% | 88.277 us | 0.41% | 742.394M | 2.970 GB/s | 0.29% | 5797x | 86.253 us |
| 262144 | 1458x | 363.942 us | 0.29% | 343.147 us | 0.08% | 763.942M | 3.056 GB/s | 0.30% | 1507x | 341.661 us |
| 1048576 | 367x | 1.388 ms | 0.16% | 1.366 ms | 0.03% | 767.625M | 3.071 GB/s | 0.30% | 383x | 1.363 ms |
| 4194304 | 92x | 5.474 ms | 0.03% | 5.452 ms | 0.01% | 769.374M | 3.077 GB/s | 0.31% | 96x | 5.449 ms |
| 16777216 | 23x | 21.820 ms | 0.02% | 21.796 ms | 0.00% | 769.750M | 3.079 GB/s | 0.31% | 24x | 21.792 ms |
2. 并行折半规约
假设 block 包含 256 个线程,数据规模恰好是 256 的倍数。我们可以分两步完成:先计算每个 block 内的局部和,再将各 block 的结果汇总。

block 内的求和采用折半规约策略。以 8 个线程为例:
- 第一轮:线程 0 与线程 4 求和,线程 1 与线程 5 求和,依此类推
- 第二轮:线程 0 与线程 2 求和,线程 1 与线程 3 求和
- 第三轮:线程 0 与线程 1 求和
过程如下图所示:

实现代码如下(block 间汇总在 CPU 端完成):
// Per-block tree reduction performed in place on global memory.
// Preconditions: blockDim.x is a power of 2 and N is a multiple of blockDim.x.
// Writes one partial sum per block to output[blockIdx.x]; the final
// accumulation across blocks is done on the host.
__global__ void sum_v0(float* input, float* output) {
    int tid = threadIdx.x;
    // Each block owns a contiguous slice of the input and reduces it in place.
    float* block_data = &input[blockIdx.x * blockDim.x];

    // Halve the number of active threads every round.
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            block_data[tid] += block_data[tid + stride];
        }
        // Barrier is outside the if: all threads of the block reach it,
        // and the next round only starts once this round's writes are visible.
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (tid == 0) {
        output[blockIdx.x] = block_data[0];
    }
}
在 NVIDIA GeForce RTX 4090 D 上的测试结果如下,内存带宽利用率显著提升:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 49920x | 20.523 us | 5.21% | 10.019 us | 8.40% | 102.210M | 410.438 MB/s | 0.04% |
| 4096 | 50592x | 20.390 us | 4.98% | 9.883 us | 8.89% | 414.430M | 1.664 GB/s | 0.17% |
| 16384 | 50464x | 20.447 us | 4.79% | 9.910 us | 8.25% | 1.653G | 6.639 GB/s | 0.66% |
| 65536 | 45312x | 21.551 us | 12.66% | 11.035 us | 24.67% | 5.939G | 23.849 GB/s | 2.37% |
| 262144 | 26528x | 29.450 us | 3.44% | 18.853 us | 4.90% | 13.905G | 55.836 GB/s | 5.54% |
| 1048576 | 22960x | 32.363 us | 4.56% | 21.783 us | 6.20% | 48.137G | 193.299 GB/s | 19.17% |
| 4194304 | 9072x | 66.174 us | 5.16% | 55.161 us | 6.10% | 76.038G | 305.340 GB/s | 30.29% |
| 16777216 | 4016x | 189.159 us | 7.82% | 178.278 us | 8.32% | 94.107G | 377.899 GB/s | 37.49% |
3. 折半规约 + 共享内存
前一版本直接在全局内存上操作。我们可以将数据先加载到共享内存,再进行规约,减少全局内存访问开销。
// Per-block tree reduction staged through shared memory.
// Assumes blockDim.x == 256 (size of the static shared array) and a power
// of 2. Out-of-range threads contribute 0, so N need not be a multiple of
// the block size. One partial sum per block is written to output.
__global__ void sum_v1(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // One slot per thread; 256 must match the launch's blockDim.x.
    __shared__ float sdata[256];

    // Stage one element per thread into shared memory (0 beyond the end).
    sdata[tid] = (gid < N) ? input[gid] : 0.0f;
    __syncthreads();

    // Tree reduction entirely within shared memory.
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
相较于前一版本,性能提升并不明显。主要原因是前一版本对同一块数据的反复读写大多已命中 L1/L2 缓存,改用共享内存并未显著减少实际的全局内存事务。NVIDIA GeForce RTX 4090 D 测试结果如下:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 49376x | 20.676 us | 4.30% | 10.127 us | 7.49% | 101.111M | 406.025 MB/s | 0.04% |
| 4096 | 50864x | 20.301 us | 4.35% | 9.832 us | 7.89% | 416.600M | 1.673 GB/s | 0.17% |
| 16384 | 50128x | 20.466 us | 4.46% | 9.976 us | 8.08% | 1.642G | 6.595 GB/s | 0.65% |
| 65536 | 48448x | 20.813 us | 8.80% | 10.322 us | 16.81% | 6.349G | 25.495 GB/s | 2.53% |
| 262144 | 26048x | 29.748 us | 3.78% | 19.204 us | 5.10% | 13.650G | 54.815 GB/s | 5.44% |
| 1048576 | 23744x | 31.959 us | 2.78% | 21.058 us | 3.67% | 49.794G | 199.952 GB/s | 19.83% |
| 4194304 | 9200x | 65.352 us | 6.79% | 54.413 us | 8.19% | 77.082G | 309.534 GB/s | 30.70% |
| 16777216 | 2704x | 196.522 us | 0.90% | 185.420 us | 0.98% | 90.482G | 363.344 GB/s | 36.04% |
可以使用 atomicAdd 在 kernel 内直接汇总各 block 的结果,避免 CPU-GPU 数据往返,进一步提升效率。
4. Warp Shuffle 优化
GPU 以 warp(32 个线程)为单位调度执行。同一 warp 内的线程以 SIMT(单指令多线程)方式同步执行,天然保持同步,无需 __syncthreads() 即可安全交换寄存器数据。
使用 __shfl_down_sync 可以在 warp 内高效地进行规约:
// Warp-wide sum using register shuffles: after the loop, lane 0 holds the
// total of all 32 lanes' inputs (other lanes hold partial values).
// The full mask 0xffffffff assumes all 32 lanes participate.
__inline__ __device__ float warp_reduce(float val) {
#pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, delta);
    }
    return val;
}
完整的 block 级实现如下(假设 blockDim ≤ 1024):
// Block-level sum built from warp shuffles, with a single atomicAdd per
// block into *output. Assumes blockDim.x is a multiple of 32 and at most
// 1024, and that *output was zero-initialized by the caller.
__global__ void sum_v4(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int warp = tid / 32;
    int lane = tid % 32;
    int warps_per_block = blockDim.x / 32;

    // One partial sum per warp; 32 slots cover blockDim.x <= 1024.
    __shared__ float partial[32];

    // Each thread contributes one element (0 beyond the end of the array).
    float v = (gid < N) ? input[gid] : 0.0f;

    // Step 1: reduce within each warp using register shuffles.
    v = warp_reduce(v);

    // Step 2: lane 0 of every warp parks its warp's sum in shared memory.
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();

    // Step 3: the first warp reduces the per-warp sums; a single atomicAdd
    // per block then folds the result into the global output.
    if (warp == 0) {
        v = (lane < warps_per_block) ? partial[lane] : 0.0f;
        v = warp_reduce(v);
        if (lane == 0) {
            atomicAdd(output, v);
        }
    }
}
NVIDIA GeForce RTX 4090 D 测试结果显示性能显著提升:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 51584x | 20.277 us | 4.45% | 9.693 us | 7.65% | 105.642M | 422.980 MB/s | 0.04% |
| 4096 | 51888x | 20.214 us | 4.32% | 9.638 us | 7.50% | 424.995M | 1.700 GB/s | 0.17% |
| 16384 | 51392x | 20.615 us | 5.03% | 9.732 us | 8.04% | 1.684G | 6.735 GB/s | 0.67% |
| 65536 | 50992x | 20.625 us | 4.16% | 9.808 us | 7.41% | 6.682G | 26.727 GB/s | 2.65% |
| 262144 | 36832x | 24.206 us | 14.72% | 13.577 us | 26.98% | 19.308G | 77.233 GB/s | 7.66% |
| 1048576 | 25952x | 29.795 us | 2.96% | 19.266 us | 3.44% | 54.425G | 217.700 GB/s | 21.60% |
| 4194304 | 11328x | 55.008 us | 3.34% | 44.178 us | 3.96% | 94.940G | 379.762 GB/s | 37.67% |
| 16777216 | 4256x | 128.793 us | 0.78% | 117.861 us | 0.65% | 142.348G | 569.391 GB/s | 56.48% |
5. Warp Shuffle + float4 向量化
进一步优化:每个线程使用 float4 加载 4 个元素,减少线程总数,提高内存访问效率。
// Block-level sum with float4 vectorized loads: each thread reads 4
// consecutive floats, reducing the thread count 4x. Assumes blockDim.x is a
// multiple of 32 and at most 1024, input is 16-byte aligned (guaranteed by
// cudaMalloc), and *output was zero-initialized by the caller.
//
// Fix vs. the previous version: the old guard `if (idx < N)` still issued a
// full 16-byte float4 load, reading past the end of the array whenever N is
// not a multiple of 4. The vector path now requires all 4 elements to be in
// range, and a scalar tail loop handles the final partial group.
__global__ void sum_v5(const float* input, float* output, int N) {
    int tid = threadIdx.x;
    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    int warp_id = tid / 32;
    int lane_id = tid % 32;
    int num_warps = blockDim.x / 32;

    __shared__ float sdata[32];

    float val = 0.0f;
    if (idx + 4 <= N) {
        // Fast path: one 16-byte load covering 4 in-range elements.
        float4 tmp = *reinterpret_cast<const float4*>(&input[idx]);
        val += tmp.x;
        val += tmp.y;
        val += tmp.z;
        val += tmp.w;
    } else {
        // Scalar tail: handles the last 1-3 elements when N % 4 != 0
        // (loop body does not execute at all when idx >= N).
        for (int i = idx; i < N; ++i) {
            val += input[i];
        }
    }

    // Warp-level reduction in registers.
    val = warp_reduce(val);

    // First lane of each warp stashes its warp's sum.
    if (lane_id == 0) {
        sdata[warp_id] = val;
    }
    __syncthreads();

    // First warp reduces the per-warp sums; one atomicAdd per block.
    if (warp_id == 0) {
        val = (lane_id < num_warps) ? sdata[lane_id] : 0.0f;
        val = warp_reduce(val);
        if (lane_id == 0) {
            atomicAdd(output, val);
        }
    }
}
NVIDIA GeForce RTX 4090 D 测试结果如下,大数据量时带宽利用率接近 58%:
| N | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWUtil |
|---|---|---|---|---|---|---|---|---|
| 1024 | 51360x | 20.376 us | 4.60% | 9.738 us | 7.51% | 105.152M | 421.018 MB/s | 0.04% |
| 4096 | 50576x | 20.609 us | 5.78% | 9.889 us | 9.06% | 414.197M | 1.657 GB/s | 0.16% |
| 16384 | 52256x | 20.293 us | 4.51% | 9.569 us | 7.86% | 1.712G | 6.849 GB/s | 0.68% |
| 65536 | 51824x | 20.405 us | 5.49% | 9.649 us | 10.23% | 6.792G | 27.167 GB/s | 2.69% |
| 262144 | 39792x | 23.123 us | 13.14% | 12.568 us | 24.40% | 20.858G | 83.431 GB/s | 8.28% |
| 1048576 | 22528x | 32.624 us | 3.86% | 22.202 us | 5.06% | 47.228G | 188.913 GB/s | 18.74% |
| 4194304 | 11680x | 53.560 us | 3.51% | 42.849 us | 4.24% | 97.886G | 391.546 GB/s | 38.84% |
| 16777216 | 4336x | 126.296 us | 1.77% | 115.382 us | 1.89% | 145.406G | 581.623 GB/s | 57.70% |
二、Softmax 优化
Softmax 是深度学习中常用的激活函数,将向量转换为概率分布。其数学定义为:$\mathrm{softmax}(x_i) = \dfrac{e^{x_i}}{\sum_{j=1}^{N} e^{x_j}}$
计算 Softmax 需要先求最大值(用于数值稳定性)、再求指数和、最后逐元素计算。这涉及多次规约操作。
数值稳定性问题:直接计算 $e^{x_i}$ 可能导致数值溢出。例如当 $x_i = 100$ 时,$e^{100} \approx 2.7 \times 10^{43}$,超出 float 表示范围(最大约 $3.4 \times 10^{38}$)。解决方案是减去最大值:$\mathrm{softmax}(x_i) = \dfrac{e^{x_i - \max_j x_j}}{\sum_{j} e^{x_j - \max_j x_j}}$
减去 $\max_j x_j$ 后,所有指数的输入满足 $x_i - \max_j x_j \le 0$,因此 $e^{x_i - \max_j x_j} \le 1$,不会溢出。同时分子分母同除以 $e^{\max_j x_j}$,结果不变。
本节源代码见 softmax.cu。
1. CPU 实现
void softmax_cpu(const float* input, float* output, int N) { // 找最大值(防止指数溢出) float max_val = input[0]; for (int i = 1; i < N; i++) { max_val = std::max(max_val, input[i]); }
// 计算指数和 float sum = 0.0f; for (int i = 0; i < N; i++) { output[i] = expf(input[i] - max_val); sum += output[i]; }
// 归一化 for (int i = 0; i < N; i++) { output[i] /= sum; }}2. GPU 基础实现
将 CPU 逻辑直接移植到 GPU,使用三次独立的 kernel 调用:
// Warp-wide maximum using register shuffles: after the loop, lane 0 holds
// the max of all 32 lanes' inputs. Full mask assumes all lanes participate.
__inline__ __device__ float warp_reduce_max(float val) {
#pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val = fmax(val, __shfl_down_sync(0xffffffff, val, delta));
    }
    return val;
}
// Atomic max for float, built on integer atomicCAS (CUDA has no native
// float atomicMax). The CAS loop retries until the stored value was not
// changed by another thread between the read and the swap. Returns the
// value held at *address just before this thread's successful update.
__device__ static float atomicMax(float* address, float val) {
    int* addr_as_int = (int*)address;
    int observed = *addr_as_int;
    int expected;
    do {
        expected = observed;
        float proposed = fmax(val, __int_as_float(expected));
        observed = atomicCAS(addr_as_int, expected, __float_as_int(proposed));
    } while (expected != observed);
    return __int_as_float(observed);
}
// Kernel 1: global maximum. Each block computes its local max via warp
// shuffles, then folds it into *max_val with the float atomicMax above.
// Assumes blockDim.x is a multiple of 32 and at most 1024, and that the
// caller initialized *max_val to -FLT_MAX (or lower).
__global__ void reduce_max(const float* input, float* max_val, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;
    int warp = tid / 32;
    int lane = tid % 32;
    int warps_per_block = blockDim.x / 32;

    // One per-warp maximum; 32 slots cover blockDim.x <= 1024.
    __shared__ float partial[32];

    // Out-of-range threads contribute the identity for max.
    float v = (gid < N) ? input[gid] : (-FLT_MAX);

    // Step 1: max within each warp.
    v = warp_reduce_max(v);

    // Step 2: lane 0 of each warp publishes its warp's max.
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();

    // Step 3: first warp reduces the per-warp maxima; one atomic per block.
    if (warp == 0) {
        v = (lane < warps_per_block) ? partial[lane] : (-FLT_MAX);
        v = warp_reduce_max(v);
        if (lane == 0) {
            atomicMax(max_val, v);
        }
    }
}
// Warp-wide sum via shuffles; lane 0 ends up with the total of all 32
// lanes' inputs. Full mask assumes every lane of the warp participates.
__inline__ __device__ float warp_reduce_sum(float val) {
#pragma unroll
    for (int delta = 16; delta > 0; delta >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, delta);
    }
    return val;
}
// Kernel 2: computes exp(x - max) per element and accumulates the global
// sum of exponentials into *sum_val. Assumes blockDim.x is a multiple of 32
// and at most 1024, *max_val was produced by reduce_max, and the caller
// zero-initialized *sum_val.
__global__ void softmax_exp_sum(
    const float* input, const float* max_val, float* sum_val, int N) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;
    int warp = tid / 32;
    int lane = tid % 32;
    int warps_per_block = blockDim.x / 32;

    // One per-warp partial sum; 32 slots cover blockDim.x <= 1024.
    __shared__ float partial[32];

    // Shifted exponential; out-of-range threads contribute 0.
    float v = (gid < N) ? expf(input[gid] - *max_val) : 0.0f;

    // Step 1: sum within each warp.
    v = warp_reduce_sum(v);

    // Step 2: lane 0 of each warp publishes its warp's sum.
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();

    // Step 3: first warp reduces the per-warp sums; one atomic per block.
    if (warp == 0) {
        v = (lane < warps_per_block) ? partial[lane] : 0.0f;
        v = warp_reduce_sum(v);
        if (lane == 0) {
            atomicAdd(sum_val, v);
        }
    }
}
// Kernel 3: normalization. Recomputes exp(x - max) per element and divides
// by the global sum produced by the previous kernel.
__global__ void softmax_normalize(
    const float* input, const float* max_val, const float* sum_val,
    float* output, int N) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < N) {
        output[gid] = expf(input[gid] - *max_val) / (*sum_val);
    }
}
能不能进一步优化?
支持与分享
如果这篇文章对你有帮助,欢迎分享给更多人或赞助支持!