LeetGPU习题04:Reduce汇总
1988 字
10 分钟
LeetGPU习题04:Reduce汇总
习题列表
| 挑战名称 | 难度 | 规约特征说明 |
|---|---|---|
| Reduction | Medium | 直接的并行规约,对数组元素执行求和、最大值等归约为标量 |
| Dot Product | Medium | 两个向量逐元素相乘后再求和,规约得到一个标量 |
| Softmax | Medium | 计算 softmax 需先求最大值再求指数和 |
| Categorical Cross Entropy Loss | Medium | 对预测概率与真实标签计算交叉熵,涉及对数、乘法后的求和规约 |
| Mean Squared Error | Medium | 计算所有样本的平方误差之和并求平均,核心是求和规约 |
| Batch Normalization | Medium | 在批次维度上计算均值与方差,均依赖求和规约 |
| RMS Normalization | Medium | 计算均方根,需要先求平方和再归一化 |
| Histogramming | Medium | 统计数值落入各 bin 的数量,可视为对每个 bin 的计数规约(常通过原子操作或局部规约实现) |
| Count Array Element | Medium | 统计等于指定值的元素个数,本质是对布尔标记的求和规约 |
| Count 2D Array Element | Medium | 在二维数组中统计指定值的元素个数,同理为求和规约 |
| Count 3D Array Element | Medium | 在三维数组中统计指定值的元素个数,同理为求和规约 |
| Monte Carlo Integration | Medium | 通过对函数值求平均来估计积分,核心是求均值规约 |
| Subarray Sum | Medium | 计算给定区间内元素的总和,是对子区域进行求和规约 |
| 2D Subarray Sum | Medium | 计算二维子区域的总和,属于多维部分规约求和 |
| 3D Subarray Sum | Medium | 计算三维子区域的总和,属于多维部分规约求和 |
| Max Subarray Sum | Medium | 找出所有固定长度子数组和的最大值,内含多次求和与一次 max reduction |
Reduction
CUDA版本
仅供参考
#include <cuda_runtime.h>
// Sum-reduce `val` across the 32 lanes of the calling warp via shuffle-down.
// After the cascade, lane 0 holds the full warp total; other lanes hold
// partial sums and should be ignored by the caller.
__device__ __forceinline__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
// Sum-reduce `val` across all threads of the block in two stages:
// per-warp shuffle reduction, then a final reduction of the per-warp
// partial sums by the first warp. Thread 0 ends up with the block total.
// NOTE(review): assumes blockDim.x is a multiple of 32 — confirm at launch.
__device__ __forceinline__ float blockReduceSum(float val) {
    __shared__ float warpSums[32];        // one slot per warp (max 1024 threads)
    const int lane   = threadIdx.x % 32;  // lane index inside this thread's warp
    const int warpId = threadIdx.x / 32;  // which warp this thread belongs to

    // Stage 1: each warp reduces its own 32 values.
    val = warpReduceSum(val);
    if (lane == 0) {
        warpSums[warpId] = val;           // publish the per-warp partial sum
    }
    __syncthreads();

    // Stage 2: only the first warp folds the per-warp sums together.
    const int numWarps = blockDim.x / 32;
    val = (threadIdx.x < numWarps) ? warpSums[threadIdx.x] : 0.0f;
    if (warpId == 0) {
        val = warpReduceSum(val);
    }
    return val;
}
// Grid-stride sum reduction: accumulates all N elements of `in` into *out.
// Loads are vectorized as float4 (requires 16-byte alignment of `in`, which
// cudaMalloc guarantees); the up-to-3 leftover elements are handled as a
// scalar tail. *out must be zeroed before launch — blocks combine via atomicAdd.
__global__ void reduce(const float* __restrict__ in, float* __restrict__ out, int N) {
    const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    float sum = 0.0f;

    // Vectorized body: walk the array four floats at a time.
    const int vecN = N / 4;
    const float4* in4 = reinterpret_cast<const float4*>(in);
    for (int i = tid; i < vecN; i += stride) {
        const float4 v = in4[i];
        sum += v.x + v.y + v.z + v.w;
    }

    // Scalar tail: elements that do not fill a complete float4.
    for (int i = vecN * 4 + tid; i < N; i += stride) {
        sum += in[i];
    }

    // Combine per-thread sums within the block, then fold into the result.
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0) {
        atomicAdd(out, sum);
    }
}
// Host entry point: sums input[0..N) into *output (a single device float).
extern "C" void solve(const float* input, float* output, int N) {
    const int BLOCK = 256;
    // Each thread consumes several float4 loads, so a modest grid suffices.
    const int grid = min((N + BLOCK * 8 - 1) / (BLOCK * 8), 1024);
    // The kernel accumulates with atomicAdd, so the output must start at zero.
    cudaMemsetAsync(output, 0, sizeof(float));
    reduce<<<grid, BLOCK>>>(input, output, N);
}
Triton版本
import torch
import triton
import triton.language as tl


@triton.jit
def reduce_kernel(
    input_ptr,
    partial_ptr,
    N,
    BLOCK_SIZE: tl.constexpr,
):
    """Each program sums one BLOCK_SIZE tile of the input and folds it into *partial_ptr."""
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < N
    x = tl.load(input_ptr + offsets, mask=mask, other=0.0)
    tile_sum = tl.sum(x, axis=0)
    # Programs run concurrently, so the final accumulation must be atomic.
    tl.atomic_add(partial_ptr, tile_sum)


def solve(input: torch.Tensor, output: torch.Tensor, N: int):
    """Reduce input[0..N) into output[0] with a single atomic-add kernel pass."""
    BLOCK_SIZE = 1024
    grid = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE,)
    # Zero the accumulator first, because the kernel accumulates via atomic_add.
    output.zero_()
    reduce_kernel[grid](input, output, N, BLOCK_SIZE=BLOCK_SIZE)
Pytorch版本
import torch
def solve(input: torch.Tensor, output: torch.Tensor, N: int): output.copy_(input[:N].sum().unsqueeze(0))Dot Product
CUDA版本
#include "reduce.cuh"
// Sum-reduce `val` across the 32 lanes of the calling warp with shuffle-down.
// Lane 0 receives the warp total.
__device__ __forceinline__
float warp_reduce_sum(float val) {
    const unsigned mask = 0xffffffff;
    for (int delta = 16; delta > 0; delta >>= 1) {
        val += __shfl_down_sync(mask, val, delta);
    }
    return val;
}
// Naive dot product: grid-stride multiply-accumulate per thread, then a
// shared-memory tree reduction per block, combined globally with atomicAdd.
// NOTE(review): sdata is sized for exactly 256 threads and the tree loop
// assumes blockDim.x is a power of two — launch with blockDim.x == 256.
__global__ void dot_product_v0(const float* a, const float* b, float* out, int n) {
    __shared__ float sdata[256];

    const int tid    = threadIdx.x;
    const int gid    = blockIdx.x * blockDim.x + tid;
    const int stride = blockDim.x * gridDim.x;

    // Grid-stride loop over element-wise products.
    float sum = 0.0f;
    for (int i = gid; i < n; i += stride) {
        sum += a[i] * b[i];
    }
    sdata[tid] = sum;
    __syncthreads();

    // Binary tree reduction in shared memory; halve the active span each step.
    for (int span = blockDim.x / 2; span > 0; span /= 2) {
        if (tid < span) {
            sdata[tid] += sdata[tid + span];
        }
        __syncthreads();
    }

    // Thread 0 folds this block's partial sum into the global result.
    if (tid == 0) {
        atomicAdd(out, sdata[0]);
    }
}
// Optimized dot product: vectorized loads (VecType = float1/float2/float4),
// warp-shuffle reduction, and one atomicAdd per block. *out must be zeroed
// before launch.
template <typename VecType, int blockSize>
__global__ void dot_product_v1(const float* a, const float* b, float* out, int n) {
    // One shared slot per warp: blockSize / 32 warps, computed at compile time.
    constexpr int numWarps = blockSize / 32;
    __shared__ float sdata[numWarps];

    // tid & 31 extracts the lane (0..31) within the warp;
    // tid >> 5 is the warp index within the block.
    const int tid     = threadIdx.x;
    const int lane    = tid & 31;
    const int warp_id = tid >> 5;

    float sum = 0.0f;

    // Reinterpret the inputs as vectors (float2 / float4) for wide loads.
    const VecType* a_vec = reinterpret_cast<const VecType*>(a);
    const VecType* b_vec = reinterpret_cast<const VecType*>(b);

    constexpr int vecSize = sizeof(VecType) / sizeof(float);
    const int numVec = n / vecSize;
    const int stride = blockSize * gridDim.x;

    for (int i = blockIdx.x * blockSize + tid; i < numVec; i += stride) {
        VecType a_val = a_vec[i];
        VecType b_val = b_vec[i];
        if constexpr (vecSize == 1) {
            sum += a_val.x * b_val.x;
        } else if constexpr (vecSize == 2) {
            sum += a_val.x * b_val.x + a_val.y * b_val.y;
        } else {  // float4
            sum += a_val.x * b_val.x + a_val.y * b_val.y
                 + a_val.z * b_val.z + a_val.w * b_val.w;
        }
    }

    // Tail: elements not covered by vectorization (n not a multiple of vecSize).
    // BUG FIX: this used to run under `tid == 0` alone, so thread 0 of EVERY
    // block added the tail, counting it gridDim.x times. Restricting to
    // block 0 guarantees the tail is summed exactly once in the whole grid.
    if (blockIdx.x == 0 && tid == 0) {
        for (int i = numVec * vecSize; i < n; i++) {
            sum += a[i] * b[i];
        }
    }

    // Warp-level shuffle reduction; lane 0 publishes each warp's partial sum.
    sum = warp_reduce_sum(sum);
    if (lane == 0) {
        sdata[warp_id] = sum;
    }
    __syncthreads();

    // The first warp reduces the per-warp sums; thread 0 commits the result.
    if (warp_id == 0) {
        sum = (tid < numWarps) ? sdata[tid] : 0.0f;
        sum = warp_reduce_sum(sum);
        if (tid == 0) {
            atomicAdd(out, sum);
        }
    }
}
// Benchmark driver: compares the naive and optimized dot-product kernels
// on 2^25 elements of constant data.
int main() {
    const int N = 1 << 25;
    const int iter = 3;

    std::vector<float> h_a(N, 1.0f), h_b(N, 2.0f);

    // Baseline: one element per thread, shared-memory tree reduction.
    benchmark_dot_product("naive", N, iter, h_a, h_b,
        [&](const float* A, const float* B, float* out, int N) {
            constexpr int blockSize = 256;
            int gridSize = (N + blockSize - 1) / blockSize;
            cudaMemset(out, 0, sizeof(float));  // atomicAdd target must start at 0
            dot_product_v0<<<gridSize, blockSize>>>(A, B, out, N);
        });

    // Optimized: float4 loads + warp shuffle reduction, fixed grid of 1024 blocks.
    benchmark_dot_product("optimized", N, iter, h_a, h_b,
        [&](const float* A, const float* B, float* out, int N) {
            constexpr int blockSize = 256;
            int gridSize = 1024;
            cudaMemset(out, 0, sizeof(float));
            dot_product_v1<float4, blockSize><<<gridSize, blockSize>>>(A, B, out, N);
        });

    return 0;
}
Triton和Pytorch版本
import triton
import triton.language as tl
import torch


@triton.jit
def dot_product_atomic_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Single-stage dot product: each program atomically adds its tile's partial sum."""
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    # Out-of-range lanes load 0.0, contributing nothing to the sum.
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    y = tl.load(y_ptr + offsets, mask=mask, other=0.0)
    partial = tl.sum(x * y)
    tl.atomic_add(output_ptr, partial)
def dot_product_atomic(x: torch.Tensor, y: torch.Tensor, BLOCK_SIZE: int = 1024) -> torch.Tensor:
    """Dot product of x and y via the single-stage atomic kernel.

    Returns a 1-element float32 tensor on x's device.
    """
    n = x.numel()
    # The accumulator must start at zero because the kernel uses atomic_add.
    output = torch.zeros(1, device=x.device, dtype=torch.float32)
    grid = (triton.cdiv(n, BLOCK_SIZE),)
    dot_product_atomic_kernel[grid](x, y, output, n, BLOCK_SIZE=BLOCK_SIZE)
    return output
@triton.jit
def dot_product_partial_kernel(x_ptr, y_ptr, partial_out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Stage 1: compute this program's local dot product and store it in a per-block buffer."""
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    y = tl.load(y_ptr + offsets, mask=mask, other=0.0)
    partial = tl.sum(x * y)
    # No atomics needed: each program owns its own output slot.
    tl.store(partial_out_ptr + pid, partial)
@triton.jit
def reduce_final_kernel(partial_ptr, output_ptr, n_partials, BLOCK_SIZE: tl.constexpr):
    """Stage 2: reduce the partial-sum buffer down to the final scalar."""
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_partials

    vals = tl.load(partial_ptr + offsets, mask=mask, other=0.0)
    partial = tl.sum(vals)
    # Multiple reduce programs may run, so combine with an atomic add.
    tl.atomic_add(output_ptr, partial)
def dot_product_two_stage(x: torch.Tensor, y: torch.Tensor, BLOCK_SIZE: int = 1024, REDUCE_BLOCK: int = 1024) -> torch.Tensor:
    """Dot product via two kernel launches: per-block partial sums, then a final reduction.

    Returns a 1-element float32 tensor on x's device.
    """
    n = x.numel()
    num_blocks = triton.cdiv(n, BLOCK_SIZE)

    # Stage 1: each block writes its local dot product to its own slot.
    partial = torch.empty(num_blocks, device=x.device, dtype=torch.float32)
    dot_product_partial_kernel[(num_blocks,)](x, y, partial, n, BLOCK_SIZE=BLOCK_SIZE)

    # Stage 2: reduce the partial buffer into the (zero-initialized) output.
    output = torch.zeros(1, device=x.device, dtype=torch.float32)
    num_reduce = triton.cdiv(num_blocks, REDUCE_BLOCK)
    reduce_final_kernel[(num_reduce,)](partial, output, num_blocks, BLOCK_SIZE=REDUCE_BLOCK)

    return output
def dot_product_pytorch_dot(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Reference dot product using torch.dot (1-D tensors only); returns a 0-dim tensor."""
    return x.dot(y)
def dot_product_pytorch_sum(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Reference dot product as an explicit element-wise multiply followed by a sum."""
    return (x * y).sum()
def benchmark(fn, x, y, ref, name, iters=100, warmup=10):
    """Time fn(x, y) with CUDA events and verify its result against ref.

    Returns (mean time in ms, effective bandwidth in GB/s, pass flag).
    """
    # Warm up to exclude JIT/launch overhead, then drain pending work.
    for _ in range(warmup):
        fn(x, y)
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(iters):
        fn(x, y)
    end.record()
    torch.cuda.synchronize()

    elapsed_ms = start.elapsed_time(end) / iters

    # Correctness check against the (possibly tensor) reference value.
    result_val = fn(x, y).item()
    ref_val = ref.item() if isinstance(ref, torch.Tensor) else ref
    abs_err = abs(result_val - ref_val)
    rel_error = abs_err / (abs(ref_val) + 1e-10)
    passed = rel_error < 1e-4

    # Two full input reads dominate traffic; ms * 1e6 converts to GB/s.
    bytes_read = 2 * x.numel() * x.element_size()
    bw = bytes_read / (elapsed_ms * 1e6)

    tag = "PASS" if passed else "FAIL"
    print(f" [{name}]")
    print(f" Time: {elapsed_ms:.4f} ms | BW: {bw:.2f} GB/s | {tag}")
    if not passed:
        print(f" Debug: Ref={ref_val:.6f}, Result={result_val:.6f}, "
              f"RelError={rel_error:.6e}")

    return elapsed_ms, bw, passed
def main():
    """Run every dot-product variant at each configured problem size."""
    configs = [
        ("32M", 33554432),
    ]

    for name, N in configs:
        print(f"--- N = {name} ({N:,} elements) ---")

        x = torch.randn(N, device='cuda', dtype=torch.float32)
        y = torch.randn(N, device='cuda', dtype=torch.float32)
        # Reference is computed in float64 on the CPU to minimize rounding error.
        ref = torch.dot(x.cpu().double(), y.cpu().double())

        benchmark(dot_product_atomic, x, y, ref, "Atomic")
        benchmark(dot_product_two_stage, x, y, ref, "Two-Stage")
        benchmark(dot_product_pytorch_dot, x, y, ref, "PyTorch dot")
        benchmark(dot_product_pytorch_sum, x, y, ref, "PyTorch sum(a*b)")
        print()


if __name__ == "__main__":
    main()
支持与分享
如果这篇文章对你有帮助,欢迎分享给更多人或赞助支持!
LeetGPU习题04:Reduce汇总
https://dlog.com.cn/posts/leetgpu04/leet_reduce/