下边是OneFlow框架中element_wise算子的完整CUDA实现，仅约200行，却浓缩了大量高性能计算的设计智慧。

接下来我们就以架构师的角度来拆解这份代码，同时也要站在用户的角度来思考，如何使用这份代码。

1
/*
2
Copyright 2020 The OneFlow Authors. All rights reserved.
3

4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7

8
    http://www.apache.org/licenses/LICENSE-2.0
9

10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16

17

18

19
#ifndef ONEFLOW_CORE_CUDA_ELEMENTWISE_H_
20
#define ONEFLOW_CORE_CUDA_ELEMENTWISE_H_
21

22
#include <cuda_runtime.h>
23
#include <cstdint>
24
#include <algorithm>
25
#include <type_traits>
26

27
namespace oneflow {
28

29
namespace cuda {
30

31
namespace elementwise {
32

33
constexpr int kBlockSize = 256;
34
constexpr int kNumWaves = 32;
35

36
inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) {
37
  int dev;
38
  {
39
    cudaError_t err = cudaGetDevice(&dev);
40
    if (err != cudaSuccess) { return err; }
41
  }
42
  int sm_count;
43
  {
44
    cudaError_t err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
45
    if (err != cudaSuccess) { return err; }
46
  }
47
  int tpm;
48
  {
49
    cudaError_t err = cudaDeviceGetAttribute(&tpm, cudaDevAttrMaxThreadsPerMultiProcessor, dev);
50
    if (err != cudaSuccess) { return err; }
51
  }
52
  *num_blocks = std::max<int>(1, std::min<int64_t>((n + kBlockSize - 1) / kBlockSize,
53
                                                   sm_count * tpm / kBlockSize * kNumWaves));
54
  return cudaSuccess;
55
}
56

57
template<typename T, int pack_size>
58
struct GetPackType {
59
  using type = typename std::aligned_storage<pack_size * sizeof(T), pack_size * sizeof(T)>::type;
60
};
61

62
template<typename T, int pack_size>
63
using PackType = typename GetPackType<T, pack_size>::type;
64

65
template<typename T, int pack_size>
66
union Pack {
67
  static_assert(sizeof(PackType<T, pack_size>) == sizeof(T) * pack_size, "");
68
  __device__ Pack() {
69
    // do nothing
70
  }
71
  PackType<T, pack_size> storage;
72
  T elem[pack_size];
73
};
74

75
template<typename T, int pack_size>
76
struct alignas(sizeof(T) * pack_size) Packed {
77
  __device__ Packed() {
78
    // do nothing
79
  }
80
  union {
81
    T elem[pack_size];
82
  };
83
};
84

85
constexpr int kMaxPackBytes = 128 / 8;
86
constexpr int kMaxPackSize = 8;
87

88
constexpr int Min(int a, int b) { return a < b ? a : b; }
89

90
template<typename T>
91
constexpr int PackSize() {
92
  return Min(kMaxPackBytes / sizeof(T), kMaxPackSize);
93
}
94

95
template<typename T, typename U, typename... Args>
96
constexpr int PackSize() {
97
  return Min(PackSize<T>(), PackSize<U, Args...>());
98
}
99

100
template<typename T>
101
class HasApply2 {
102
  typedef char one;
103
  struct two {
104
    char x[2];
105
  };
106

107
  template<typename C>
108
  static one test(decltype(&C::Apply2));
109
  template<typename C>
110
  static two test(...);
111

112
 public:
113
  enum { value = sizeof(test<T>(0)) == sizeof(char) };
114
};
115

116
template<int pack_size, typename FunctorT, typename R, typename... IN>
117
__device__ typename std::enable_if<HasApply2<FunctorT>::value == true && pack_size % 2 == 0,
118
                                   Packed<R, pack_size>>::type
119
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
120
  Packed<R, pack_size> ret;
121
#pragma unroll
122
  for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in.elem + j)...); }
123
  return ret;
124
}
125

126
template<int pack_size, typename FunctorT, typename R, typename... IN>
127
__device__ typename std::enable_if<HasApply2<FunctorT>::value == false || pack_size % 2 != 0,
128
                                   Packed<R, pack_size>>::type
129
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
130
  Packed<R, pack_size> ret;
131
#pragma unroll
132
  for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in.elem[j])...); }
133
  return ret;
134
}
135

136
template<int pack_size, typename FactoryT, typename R, typename... IN>
137
__global__ void __launch_bounds__(kBlockSize)
138
    ApplyGeneric(FactoryT factory, int64_t n_pack, Packed<R, pack_size>* pack_r,
139
                 const Packed<IN, pack_size>*... pack_in, int64_t n_tail, R* tail_r,
140
                 const IN*... tail_in) {
141
  auto functor = factory();
142
  const int global_tid = blockIdx.x * kBlockSize + threadIdx.x;
143
  for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) {
144
    pack_r[i] = ApplyPack<pack_size, decltype(functor), R, IN...>(functor, (pack_in[i])...);
145
  }
146
  if (global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); }
147
}
148

149
template<typename FunctorT>
150
struct SimpleFactory {
151
  explicit SimpleFactory(FunctorT functor) : tpl(functor) {}
152
  __device__ FunctorT operator()() const { return tpl; }
153

154
 private:
155
  FunctorT tpl;
156
};
157

158
template<size_t pack_size>
159
bool IsAlignedForPack() {
160
  return true;
161
}
162

163
template<size_t pack_size, typename T, typename... Args>
164
bool IsAlignedForPack(const T* ptr, const Args*... others) {
165
  return reinterpret_cast<uintptr_t>(ptr) % sizeof(Pack<T, pack_size>) == 0
166
         && IsAlignedForPack<pack_size, Args...>(others...);
167
}
168

169
template<size_t pack_size, typename FactoryT, typename R, typename... IN>
170
cudaError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... in, cudaStream_t stream) {
171
  const int64_t n_pack = n / pack_size;
172
  const int64_t tail_offset = n_pack * pack_size;
173
  const int64_t n_tail = n - tail_offset;
174
  int num_blocks;
175
  {
176
    cudaError_t err = GetNumBlocks(n_pack, &num_blocks);
177
    if (err != cudaSuccess) { return err; }
178
  }
179
  ApplyGeneric<pack_size, FactoryT, R, IN...><<<num_blocks, kBlockSize, 0, stream>>>(
180
      factory, n_pack, reinterpret_cast<Packed<R, pack_size>*>(r),
181
      (reinterpret_cast<const Packed<IN, pack_size>*>(in))..., n_tail, r + tail_offset,
182
      (in + tail_offset)...);
183
  return cudaPeekAtLastError();
184
}
185

186
template<typename FactoryT, typename R, typename... IN>
187
struct GenericLauncher {
188
  static cudaError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in,
189
                            cudaStream_t stream) {
190
    constexpr int max_pack_size = PackSize<R, IN...>();
191
    if (IsAlignedForPack<max_pack_size, R, IN...>(r, in...)) {
192
      return LaunchKernel<max_pack_size, FactoryT, R, IN...>(factory, n, r, in..., stream);
193
    } else {
194
      return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in..., stream);
195
    }
196
  }
197
};
198

199
template<typename FactoryT, typename R, typename A>
200
inline cudaError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a,
201
                                    cudaStream_t stream) {
202
  return GenericLauncher<FactoryT, R, A>::Launch(factory, n, r, a, stream);
203
}
204

205
template<typename FunctorT, typename R, typename A>
206
inline cudaError_t Unary(FunctorT functor, int64_t n, R* r, const A* a, cudaStream_t stream) {
207
  return UnaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, stream);
208
}
209

210
template<typename FactoryT, typename R, typename A, typename B>
211
inline cudaError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
212
                                     cudaStream_t stream) {
213
  return GenericLauncher<FactoryT, R, A, B>::Launch(factory, n, r, a, b, stream);
214
}
215

216
template<typename FunctorT, typename R, typename A, typename B>
217
inline cudaError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b,
218
                          cudaStream_t stream) {
219
  return BinaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b, stream);
220
}
221

222
template<typename FactoryT, typename R, typename A, typename B, typename C>
223
inline cudaError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
224
                                      const C* c, cudaStream_t stream) {
225
  return GenericLauncher<FactoryT, R, A, B, C>::Launch(factory, n, r, a, b, c, stream);
226
}
227

228
template<typename FunctorT, typename R, typename A, typename B, typename C>
229
inline cudaError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c,
230
                           cudaStream_t stream) {
231
  return TernaryWithFactory(SimpleFactory<FunctorT>(functor), n, r, a, b, c, stream);
232
}
233

234
}  // namespace elementwise
235

236
}  // namespace cuda
237

238
}  // namespace oneflow
239

240
#endif  // ONEFLOW_CORE_CUDA_ELEMENTWISE_H_

Oneflow是什么？#

Oneflow是一款国产开源的深度学习框架，以极致的性能和分布式易用性著称。它的核心理念之一是：静态编译与运行时调度分离。这使得它能够在算子层面做出非常激进的编译期优化。

我们分析的Element Wise正是这种设计哲学的集中体现。

它在200行代码内实现了向量化访存、自适应Grid分配、编译器多态、模板特化等多项技术。

Element_wise的核心思想#

Element-wise 操作（如加法、ReLU、dropout 等）的计算密度很低。一次内存读取只伴随少量浮点运算。因此 GPU 的算力远不是瓶颈，瓶颈在于带宽，也就是如何用最快的方式把数据从全局内存搬运到寄存器里。

围绕这一核心痛点，这份代码确立了三大设计支柱：

Memory Packing：强制 128-bit 对齐读写，触发编译器和硬件使用向量化访存指令（如 LDG.128 / STG.128）。
SFINAE 编译期优化：当用户定义的算符支持 Apply2 接口时，自动切换到更高效的 half2 等 SIMD 指令路径。
Adaptive Grid Sizing：动态查询当前 GPU 的 SM 数量与最大线程数，计算出恰好能掩盖延迟、又不浪费资源的线程网格。

下面我们沿着调用栈从上到下，看看作者是如何把这些思想有机地组织在一起的。

架构师视角#

1. 视角？视角！#

在阅读其他人的代码的时候，我们都要有一种开发者视角，千万不能从上往下一口气读完，而是我们要思考，第一行代码要写什么？

以及整个架构是什么？

在这里我姑妄言之，给大家一点思路：

2. 用户接口层#

框架的第一要务是服务好算法工程师。用户在写 element-wise 算子时，不应该感知「线程块大小是多少」「指针是否对齐」这类底层细节。他们只想描述数学逻辑，然后扔给一个函数执行。 OneFlow 给出了最简洁的接口：Unary、Binary、Ternary。例如：

1
// 以 Binary (二元计算) 为例
2
// 用户自定义一个二元运算
3
struct AddFunctor {
4
    __device__ float operator()(float a, float b) {
5
        return a + b;
6
    }
7
};
8

9
// 调用 OneFlow 的 Binary 接口
10
float *d_r, *d_a, *d_b;
11
int64_t n = 1 << 20;
12
cudaStream_t stream = 0;
13
oneflow::cuda::elementwise::Binary(AddFunctor{}, n, d_r, d_a, d_b, stream);

三行核心代码，一个 operator()，用户就把 CUDA 核函数的烦恼全部外包给了 OneFlow。

有些算子的状态是在运行时才决定的，例如 Dropout 需要随机种子。这时可以使用「工厂模式」。即将一组参数打包成一个工厂对象，待到 GPU 端真正执行时再生成最终的算符。对应接口叫 UnaryWithFactory / BinaryWithFactory / TernaryWithFactory。

你说工厂模式具体什么？

知乎：设计模式精讲：什么是工厂模式请自行学习。

3. 分发层#

3.1. 编译期计算最大打包宽度#

接口函数内部会直接调用 GenericLauncher::Launch(...)。这是性能决策的枢纽，核心任务是选择 pack_size。这个地方的核心手段就是向量化访存。GPU提供了LDG.128和STG.128指令，一次可以搬运16个字节。但是向量化访存需要严格对齐内存。

PackSize 根据所有输入输出类型，在编译期算出安全的最大打包元素数。它受限于两个因素：

kMaxPackBytes = 16 字节（128 位），因为 GPU 的向量化加载指令一次最多搬运 128 位。
kMaxPackSize = 8，即最高一次打包 8 个元素。

以 float 为例，每个 4 字节，16 / 4 = 4，所以 float 的最大 pack 大小是 4。half（2 字节）则是 min(16/2, 8) = 8。这样，一条 128-bit 指令就能加载/存储 4 个 float 或 8 个 half，极大降低指令发射数。

3.2. 运行期检查内存对齐#

向量化访存要求数据地址与访问宽度对齐。IsAlignedForPack 在运行时检查所有输入输出指针是否满足 alignof(Packed<T, pack_size>)（即 sizeof(T) * pack_size）。

1
template<typename FactoryT, typename R, typename... IN>
2
struct GenericLauncher {
3
  static cudaError_t Launch(...) {
4
    // 1. 计算最大可能的打包大小 (最大 16 字节)
5
    constexpr int max_pack_size = PackSize<R, IN...>();
6

7
    // 2. 动态检查指针地址是否满足对齐要求
8
    if (IsAlignedForPack<max_pack_size, R, IN...>(r, in...)) {
9
      // 性能模式：按 max_pack_size (如 4 或 8) 启动 Kernel
10
      return LaunchKernel<max_pack_size, ...>(...);
11
    } else {
12
      // 安全回退模式：地址不对齐，按标量 (pack_size = 1) 启动 Kernel
13
      return LaunchKernel<1, ...>(...);
14
    }
15
  }
16
};

如果地址对齐，就享用向量化红利；如果不对齐（比如用户无意中传入了偏移过的指针），则安全回退到 pack_size = 1 的标量模式。这种防御性编程非常值得学习。

4. 网格优化层#

在确定了Packsize之后，就准备启动Kernel，这个时候需要决定要分配多少个线程块。

对于我们新手来说，一般来说就是：blocks = (n + blockSize - 1) / blockSize。

这在数据量较小时会「喂不饱」GPU，而数据量极大时又会淹没调度器。OneFlow 的做法是自适应，引入了GetNumBlocks算法。

1
constexpr int kBlockSize = 256;
2
constexpr int kNumWaves = 32;
3

4
inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) {
5
    int dev, sm_count, tpm;
6
    cudaGetDevice(&dev);
7
    cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
8
    cudaDeviceGetAttribute(&tpm, cudaDevAttrMaxThreadsPerMultiProcessor, dev);
9

10
    *num_blocks = std::max<int>(1, std::min<int64_t>(
11
        (n + kBlockSize - 1) / kBlockSize,
12
        sm_count * tpm / kBlockSize * kNumWaves
13
    ));
14
    return cudaSuccess;
15
}

这里 kNumWaves = 32 是一个经验值：让每个 SM 上常驻 kNumWaves 个线程块，以便在访存延迟时立即切换到另一波线程执行。上限 sm_count * tpm / kBlockSize * kNumWaves 确保不会因过度订阅空耗调度资源。无论你用的是 V100、H100 还是 4070，这段逻辑都能自动适配，得到最适合的网格大小。

5. 内核实现#

5.1. Grid-stride Loop#

最终就到了我们之前已经学过的Grid Loop。

1
template<int pack_size, ...>
2
__global__ void ApplyGeneric(..., int64_t n_pack, Packed<R, pack_size>* pack_r, ..., int64_t n_tail, R* tail_r, ...) {
3

4
  auto functor = factory();
5
  const int global_tid = blockIdx.x * kBlockSize + threadIdx.x;
6

7
  // 1. 主循环：处理 Pack 数据 (向量化读取 -> 计算 -> 向量化写入)
8
  for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) {
9
    pack_r[i] = ApplyPack<pack_size, ...>(functor, (pack_in[i])...);
10
  }
11

12
  // 2. 尾部处理：处理无法被 pack_size 整除的剩余标量
13
  if (global_tid < n_tail) {
14
    tail_r[global_tid] = functor((tail_in[global_tid])...);
15
  }
16
}

可以看出这个地方它也分离了打包区与尾部区，确保所有的数据都会被处理。

5.2. 架构师的黑魔法#

前面我们一直说到的打包打包，究竟是什么？

1
template<typename T, int pack_size>
2
struct alignas(sizeof(T) * pack_size) Packed {
3
  union {
4
    T elem[pack_size];
5
  };
6
};

alignas告诉编译器：这个结构体必须按 sizeof(T) * pack_size 对齐。当我们把输入输出指针 reinterpret_cast 成 Packed<R, pack_size>* 时，编译器会自动生成 128-bit 的加载/存储指令，而不是四次 32-bit 的标量指令。因此自然会调用最高效的宽总线访存指令。

5.3. SFINAE#

ApplyPack 有两个重载，通过 std::enable_if 在编译期选择：

默认路径：一个简单的 for 循环，逐个调用functor(elem...)。
优化路径：当检测到 FunctorT 拥有 Apply2 方法且 pack_size % 2 == 0 时，启用这个版本。

1
template<int pack_size, typename FunctorT, ...>
2
__device__ enable_if<HasApply2<FunctorT>::value && pack_size % 2 == 0, Packed<R, pack_size>>::type
3
ApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) {
4
    Packed<R, pack_size> ret;
5
    #pragma unroll
6
    for (int j = 0; j < pack_size; j += 2) {
7
        functor.Apply2(ret.elem + j, (in.elem + j)...);
8
    }
9
    return ret;
10
}

用户视角#

前面为了讲清原理，我已经零散地展示了一些调用方式。这里系统化地给出几个典型场景。

6. 基本运算#

1
#include <cuda_runtime.h>
2
#include "elementwise.h"
3

4
struct MulFunctor {
5
    __device__ float operator()(float a, float b) { return a * b; }
6
};
7

8
void launch_mul(float *d_out, const float *d_a, const float *d_b, int64_t n, cudaStream_t stream) {
9
    oneflow::cuda::elementwise::Binary(MulFunctor{}, n, d_out, d_a, d_b, stream);
10
}
11

12
struct ReLUFunctor {
13
    __device__ float operator()(float x) { return fmaxf(x, 0.f); }
14
};
15

16
void launch_relu(float *d_out, const float *d_in, int64_t n, cudaStream_t stream) {
17
    oneflow::cuda::elementwise::Unary(ReLUFunctor{}, n, d_out, d_in, stream);
18
}

7. 带运行时参数运算#

1
struct ScaleAddFunctor {
2
    float alpha;
3
    __device__ float operator()(float a, float b) const {
4
        return alpha * a + b;
5
    }
6
};
7

8
struct ScaleAddFactory {
9
    float alpha;
10
    __device__ ScaleAddFunctor operator()() const {
11
        return ScaleAddFunctor{alpha};
12
    }
13
};
14

15
void launch_scale_add(float *d_out, const float *d_a, const float *d_b,
16
                      int64_t n, float alpha, cudaStream_t stream) {
17
    oneflow::cuda::elementwise::BinaryWithFactory(
18
        ScaleAddFactory{alpha}, n, d_out, d_a, d_b, stream);
19
}

这里就是把运行时参数存进工厂，核函数内再生产出一个带状态的 functor。这也是许多复杂算子（比如带随机种子的 Dropout）的标准做法。

8. Apply2加速#

1
struct AddFunctor {
2
    template<typename T>
3
    __device__ T operator()(T a, T b) const { return a + b; }
4

5
    template<typename T>
6
    __device__ void Apply2(T* out, const T* a, const T* b) const {
7
        out[0] = a[0] + b[0];
8
        out[1] = a[1] + b[1];
9
    }
10
};

总结#

虽然这段代码只有200行，却体现了架构师极其深厚的功力，模板编程、设计模式、并行计算。希望这篇拆解能给你带来一些启发，之后在你写自己的 CUDA 算子时，不妨试试将其中一两个设计思想融入进去把这些交织在一起，这些综合性的知识并不是短期就能够被替代的，大家不要焦虑，慢慢学，相信你们！

音乐

Oneflow是什么？#

Element_wise的核心思想#

架构师视角#

1. 视角？视角！#

2. 用户接口层#

3. 分发层#

3.1. 编译期计算最大打包宽度#

3.2. 运行期检查内存对齐#

4. 网格优化层#

5. 内核实现#

5.1. Grid-stride Loop#

5.2. 架构师的黑魔法#

5.3. SFINAE#

用户视角#

6. 基本运算#

7. 带运行时参数运算#

8. Apply2加速#

总结#

支持与分享

音乐

目录

音乐

CUDA学习之路[7]：详解oneflow的element_wise代码

Oneflow是什么？#

Element_wise的核心思想#

架构师视角#

1. 视角？视角！#

2. 用户接口层#

3. 分发层#

3.1. 编译期计算最大打包宽度#

3.2. 运行期检查内存对齐#

4. 网格优化层#

5. 内核实现#

5.1. Grid-stride Loop#

5.2. 架构师的黑魔法#

5.3. SFINAE#

用户视角#

6. 基本运算#

7. 带运行时参数运算#

8. Apply2加速#

总结#

支持与分享

音乐

目录