CUDA学习之路[7]:详解oneflow的element_wise代码
990 字
5 分钟
CUDA学习之路[7]:详解oneflow的element_wise代码
/*Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.*/
#ifndef ONEFLOW_CORE_CUDA_ELEMENTWISE_H_
#define ONEFLOW_CORE_CUDA_ELEMENTWISE_H_
#include <cuda_runtime.h>
#include <cstdint>
#include <algorithm>
#include <type_traits>
namespace oneflow {
namespace cuda {
namespace elementwise {
constexpr int kBlockSize = 256;constexpr int kNumWaves = 32;
inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) { int dev; { cudaError_t err = cudaGetDevice(&dev); if (err != cudaSuccess) { return err; } } int sm_count; { cudaError_t err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev); if (err != cudaSuccess) { return err; } } int tpm; { cudaError_t err = cudaDeviceGetAttribute(&tpm, cudaDevAttrMaxThreadsPerMultiProcessor, dev); if (err != cudaSuccess) { return err; } } *num_blocks = std::max<int>(1, std::min<int64_t>((n + kBlockSize - 1) / kBlockSize, sm_count * tpm / kBlockSize * kNumWaves)); return cudaSuccess;}
// Maps (T, pack_size) to a trivially-copyable storage type whose size AND
// alignment both equal pack_size * sizeof(T), so a single load/store of this
// type moves pack_size elements of T at once (vectorized memory access).
template<typename T, int pack_size>
struct GetPackType {
  using type = typename std::aligned_storage<pack_size * sizeof(T), pack_size * sizeof(T)>::type;
};

// Convenience alias for GetPackType<T, pack_size>::type.
template<typename T, int pack_size>
using PackType = typename GetPackType<T, pack_size>::type;
template<typename T, int pack_size>union Pack { static_assert(sizeof(PackType<T, pack_size>) == sizeof(T) * pack_size, ""); __device__ Pack() { // do nothing } PackType<T, pack_size> storage; T elem[pack_size];};
template<typename T, int pack_size>struct alignas(sizeof(T) * pack_size) Packed { __device__ Packed() { // do nothing } union { T elem[pack_size]; };};
// Widest vectorized access is 128 bits (16 bytes) per thread.
constexpr int kMaxPackBytes = 128 / 8;
// Never pack more than 8 elements regardless of how small the type is.
constexpr int kMaxPackSize = 8;

// constexpr min for ints; single return expression keeps it valid in C++11
// constant expressions.
constexpr int Min(int a, int b) { return a < b ? a : b; }
// Largest pack size usable for a single element type T: limited by both the
// 16-byte vector width (kMaxPackBytes) and the kMaxPackSize element cap.
template<typename T>
constexpr int PackSize() {
  return Min(kMaxPackBytes / sizeof(T), kMaxPackSize);
}

// Largest pack size usable simultaneously for ALL listed types: the minimum
// of the per-type pack sizes, computed by recursive expansion of the pack
// (C++11-compatible alternative to a fold expression).
template<typename T, typename U, typename... Args>
constexpr int PackSize() {
  return Min(PackSize<T>(), PackSize<U, Args...>());
}
// SFINAE detector: HasApply2<T>::value is true iff T declares a member named
// Apply2 (i.e. &T::Apply2 is well-formed). Used below to select the pairwise
// ApplyPack overload, which lets functors exploit 2-element instructions
// (e.g. half2 math).
template<typename T>
class HasApply2 {
  typedef char one;
  struct two { char x[2]; };

  // Preferred overload: participates only when &C::Apply2 is well-formed.
  template<typename C> static one test(decltype(&C::Apply2));
  // Fallback overload: matches anything.
  template<typename C> static two test(...);

 public:
  // true when overload resolution picked the Apply2-detecting overload.
  enum { value = sizeof(test<T>(0)) == sizeof(char) };
};
template<int pack_size, typename FunctorT, typename R, typename... IN>__device__ typename std::enable_if<HasApply2<FunctorT>::value == true && pack_size % 2 == 0, Packed<R, pack_size>>::typeApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) { Packed<R, pack_size> ret;#pragma unroll for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in.elem + j)...); } return ret;}
template<int pack_size, typename FunctorT, typename R, typename... IN>__device__ typename std::enable_if<HasApply2<FunctorT>::value == false || pack_size % 2 != 0, Packed<R, pack_size>>::typeApplyPack(const FunctorT& functor, const Packed<IN, pack_size>... in) { Packed<R, pack_size> ret;#pragma unroll for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in.elem[j])...); } return ret;}
template<int pack_size, typename FactoryT, typename R, typename... IN>__global__ void __launch_bounds__(kBlockSize) ApplyGeneric(FactoryT factory, int64_t n_pack, Packed<R, pack_size>* pack_r, const Packed<IN, pack_size>*... pack_in, int64_t n_tail, R* tail_r, const IN*... tail_in) { auto functor = factory(); const int global_tid = blockIdx.x * kBlockSize + threadIdx.x; for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) { pack_r[i] = ApplyPack<pack_size, decltype(functor), R, IN...>(functor, (pack_in[i])...); } if (global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); }}
// Trivial factory adapter: captures a functor by value on the host and hands
// back a copy of it on the device. Lets plain functors reuse the
// factory-based launch path.
template<typename FunctorT>
struct SimpleFactory {
  explicit SimpleFactory(FunctorT functor) : functor_(functor) {}
  // Device-side call: returns the stored functor by value.
  __device__ FunctorT operator()() const { return functor_; }

 private:
  FunctorT functor_;
};
// Base case of the variadic recursion: an empty pointer list is trivially
// aligned.
template<size_t pack_size>
bool IsAlignedForPack() {
  return true;
}

// True iff every pointer in the list is aligned to the size of its pack type,
// i.e. a Packed load/store at each address would be naturally aligned.
template<size_t pack_size, typename T, typename... Args>
bool IsAlignedForPack(const T* ptr, const Args*... others) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  const bool head_aligned = addr % sizeof(Pack<T, pack_size>) == 0;
  return head_aligned && IsAlignedForPack<pack_size, Args...>(others...);
}
template<size_t pack_size, typename FactoryT, typename R, typename... IN>cudaError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... in, cudaStream_t stream) { const int64_t n_pack = n / pack_size; const int64_t tail_offset = n_pack * pack_size; const int64_t n_tail = n - tail_offset; int num_blocks; { cudaError_t err = GetNumBlocks(n_pack, &num_blocks); if (err != cudaSuccess) { return err; } } ApplyGeneric<pack_size, FactoryT, R, IN...><<<num_blocks, kBlockSize, 0, stream>>>( factory, n_pack, reinterpret_cast<Packed<R, pack_size>*>(r), (reinterpret_cast<const Packed<IN, pack_size>*>(in))..., n_tail, r + tail_offset, (in + tail_offset)...); return cudaPeekAtLastError();}
template<typename FactoryT, typename R, typename... IN>struct GenericLauncher { static cudaError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in, cudaStream_t stream) { constexpr int max_pack_size = PackSize<R, IN...>(); if (IsAlignedForPack<max_pack_size, R, IN...>(r, in...)) { return LaunchKernel<max_pack_size, FactoryT, R, IN...>(factory, n, r, in..., stream); } else { return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in..., stream); } }};
// Launches r[i] = functor(a[i]) for i in [0, n), where the functor is built
// on the device by the given factory.
template<typename FactoryT, typename R, typename A>
inline cudaError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a,
                                    cudaStream_t stream) {
  return GenericLauncher<FactoryT, R, A>::Launch(factory, n, r, a, stream);
}

// Same as UnaryWithFactory, but for a plain copyable functor.
template<typename FunctorT, typename R, typename A>
inline cudaError_t Unary(FunctorT functor, int64_t n, R* r, const A* a, cudaStream_t stream) {
  SimpleFactory<FunctorT> factory(functor);
  return UnaryWithFactory(factory, n, r, a, stream);
}
// Launches r[i] = functor(a[i], b[i]) for i in [0, n), where the functor is
// built on the device by the given factory.
template<typename FactoryT, typename R, typename A, typename B>
inline cudaError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
                                     cudaStream_t stream) {
  return GenericLauncher<FactoryT, R, A, B>::Launch(factory, n, r, a, b, stream);
}

// Same as BinaryWithFactory, but for a plain copyable functor.
template<typename FunctorT, typename R, typename A, typename B>
inline cudaError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b,
                          cudaStream_t stream) {
  SimpleFactory<FunctorT> factory(functor);
  return BinaryWithFactory(factory, n, r, a, b, stream);
}
// Launches r[i] = functor(a[i], b[i], c[i]) for i in [0, n), where the
// functor is built on the device by the given factory.
template<typename FactoryT, typename R, typename A, typename B, typename C>
inline cudaError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b,
                                      const C* c, cudaStream_t stream) {
  return GenericLauncher<FactoryT, R, A, B, C>::Launch(factory, n, r, a, b, c, stream);
}

// Same as TernaryWithFactory, but for a plain copyable functor.
template<typename FunctorT, typename R, typename A, typename B, typename C>
inline cudaError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c,
                           cudaStream_t stream) {
  SimpleFactory<FunctorT> factory(functor);
  return TernaryWithFactory(factory, n, r, a, b, c, stream);
}
} // namespace elementwise
} // namespace cuda
} // namespace oneflow
#endif  // ONEFLOW_CORE_CUDA_ELEMENTWISE_H_

Oneflow是什么?
Element_wise的核心思想
向量化+自适应网格+编译期多态。
支持与分享
如果这篇文章对你有帮助,欢迎分享给更多人或赞助支持!
CUDA学习之路[7]:详解oneflow的element_wise代码
https://dlog.com.cn/posts/cuda07/one_flow/ 相关文章 智能推荐
1
CUDA学习之路[2]——你需要哪些C/C++的知识呢?
CUDA学习之路 你学Java/Python忽视的知识点,反而是在CUDA编程中最需要的。
2
CUDA学习之路[6]:PyTorch CUDA 扩展完全指南
CUDA学习之路 从零掌握在 PyTorch 中调用 CUDA 代码的多种方式,理解 JIT 编译与预编译的权衡,并深入整合自动微分。
3
CUDA学习之路[0]——祛魅
CUDA学习之路 你在什么时候才会用到CUDA呢?
4
CUDA学习之路[4]——CUDA全局坐标计算
CUDA学习之路 从一维到三维,彻底理清CUDA线程索引的映射逻辑。
5
CUDA学习之路[5]——逐元素操作算子
CUDA学习之路 最基础的一系列算子:element-wise。
随机文章 随机推荐