diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu
index 97409e6cb1b17b8fc109e30dc78720b8d573f042..a29670b415d24749326adf0daaf13e5a1c40725f 100644
--- a/paddle/fluid/operators/abs_op.cu
+++ b/paddle/fluid/operators/abs_op.cu
@@ -52,8 +52,9 @@ class AbsKernel<platform::CUDADeviceContext, T>
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaAbsFunctor<T>();
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, math::Real<T>>(
-        dev_ctx, ins, &outs, functor);
+    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T,
+                                        math::Real<T>>(dev_ctx, ins, &outs,
+                                                       functor);
   }
 };
 
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 002fae601204ef2001b7f47c5573874df5a06589..87e65e8817798199c907f88692887a21da58673c 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1316,8 +1316,8 @@ class ActivationCudaKernel<platform::CUDADeviceContext, Functor>
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(dev_ctx, ins,
-                                                               &outs, functor);
+    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+        dev_ctx, ins, &outs, functor);
   }
 };
 
@@ -1346,16 +1346,16 @@ class ActivationGradCudaKernel<platform::CUDADeviceContext, Functor>
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
           dev_ctx, ins, &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
           dev_ctx, ins, &outs, functor);
     } else {
-      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
           dev_ctx, ins, &outs, functor);
     }
   }
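Note: with this PR, same-shape element-wise ops call LaunchSameDimsElementwiseCudaKernel with a small device functor. A hedged sketch of wiring a new unary functor through this entry point, mirroring AbsKernel above (CudaSquareFunctor is hypothetical and only for illustration):

template <typename T>
struct CudaSquareFunctor {
  // args holds the operands gathered for one output element, following the
  // CudaAddFunctor pattern visible in elementwise_add_op.cu below.
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] * args[0];
  }
};

// Inside a kernel's Compute(), analogous to AbsKernel:
//   std::vector<const framework::Tensor*> ins = {x};
//   std::vector<framework::Tensor*> outs = {out};
//   LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
//       dev_ctx, ins, &outs, CudaSquareFunctor<T>());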
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index 63f62347b81c6aee1ae9aea6397081885f8781da..b551629169deed66a1a79636287569995726c4be 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -69,15 +69,6 @@ struct SameDimsElemwiseAdd<
   }
 };
 
-template <typename T>
-struct BroadcastElemwiseAdd<platform::CPUDeviceContext, T> {
-  void operator()(const framework::ExecutionContext &ctx,
-                  const framework::Tensor *x, const framework::Tensor *y,
-                  framework::Tensor *z) {
-    default_elementwise_add<platform::CPUDeviceContext, T>(ctx, x, y, z);
-  }
-};
-
 class ElementwiseAddOpMaker : public ElementwiseOpMaker {
  protected:
   std::string GetName() const override { return "Add"; }
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 7b42803aa51ec6e34ee1484ee46486091a560a9f..a4b97301a2611b8ec17fadfcb49458f1bab5a32e 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"
@@ -40,29 +39,24 @@ struct CudaAddFunctor {
 };
 
 template <typename T>
-struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor* x, const framework::Tensor* y,
-                  framework::Tensor* z) {
+class ElementwiseAddKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto* z = ctx.Output<framework::LoDTensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;
+
     std::vector<const framework::Tensor*> ins = {x, y};
     std::vector<framework::Tensor*> outs = {z};
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-        ctx.template device_context<platform::CUDADeviceContext>(), ins, &outs,
-        CudaAddFunctor<T>());
-  }
-};
+    const auto& cuda_ctx =
+        ctx.template device_context<platform::CUDADeviceContext>();
 
-template <typename T>
-struct BroadcastElemwiseAdd<platform::CUDADeviceContext, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor* x, const framework::Tensor* y,
-                  framework::Tensor* out) {
-    std::vector<const framework::Tensor*> ins = {x, y};
-    int axis = ctx.Attr<int>("axis");
-    axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;
-    LaunchBroadcastElementwiseCudaKernel<ElementwiseType::kBinary, T>(
-        ctx.template device_context<platform::CUDADeviceContext>(), ins, out,
-        CudaAddFunctor<T>(), axis);
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+        cuda_ctx, ins, &outs, axis, CudaAddFunctor<T>());
   }
 };
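Note: ElementwiseAddKernel is now a single entry point for both the same-dims and broadcast paths, with axis defaulting to the rank difference of the inputs. A hedged sketch of how another binary op could adopt the same pattern (CudaMaxFunctor is hypothetical):

template <typename T>
struct CudaMaxFunctor {
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] > args[1] ? args[0] : args[1];
  }
};

// Inside Compute(), identical to the add kernel above except for the functor:
//   LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
//       cuda_ctx, ins, &outs, axis, CudaMaxFunctor<T>());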
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index 57f6629702214f15caeaf281a68e5e6eb2e61567..ec7d036a1a1e0295ec496960069335fb33d3d003 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include "cub/cub.cuh"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
+
 #endif
 #ifdef __HIPCC__
 #include <hip/hip_fp16.h>
@@ -40,9 +40,10 @@ namespace paddle {
 namespace operators {
 
 template <typename DeviceContext, typename T>
-void default_elementwise_add(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *x,
-                             const framework::Tensor *y, framework::Tensor *z) {
+void LaunchBroadcastElementwiseCpuKernel(const framework::ExecutionContext &ctx,
+                                         const framework::Tensor *x,
+                                         const framework::Tensor *y,
+                                         framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
   auto x_dims = x->dims();
   auto y_dims = y->dims();
@@ -62,13 +63,6 @@ struct SameDimsElemwiseAdd {
                   framework::Tensor *z);
 };
 
-template <typename DeviceContext, typename T>
-struct BroadcastElemwiseAdd {
-  void operator()(const framework::ExecutionContext &ctx,
-                  const framework::Tensor *x, const framework::Tensor *y,
-                  framework::Tensor *z);
-};
-
 template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
@@ -77,13 +71,13 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto *y = ctx.Input<framework::LoDTensor>("Y");
     auto *z = ctx.Output<framework::LoDTensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    auto dims_equal = x->dims() == y->dims();
-    if (dims_equal) {
-      SameDimsElemwiseAdd<DeviceContext, T> same_dims_add;
-      same_dims_add(ctx, x, y, z);
+    if (x->dims() == y->dims()) {
+      SameDimsElemwiseAdd<DeviceContext, T>
+          LaunchElementwiseCpuKernel;
+      LaunchElementwiseCpuKernel(ctx, x, y, z);
     } else {
-      BroadcastElemwiseAdd<DeviceContext, T> broadcast_add;
-      broadcast_add(ctx, x, y, z);
+      LaunchBroadcastElementwiseCpuKernel<DeviceContext, T>(ctx, x,
+                                                            y, z);
     }
   }
 };
@@ -469,8 +463,8 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel<T> {
       GetDoubleGradSafeTensor<DeviceContext, T>(ctx, y, ddy, &ddy_safe);
 
       ddout->mutable_data<T>(ctx.GetPlace());
-      default_elementwise_add<DeviceContext, T>(ctx, &ddx_safe, &ddy_safe,
-                                                ddout);
+      LaunchBroadcastElementwiseCpuKernel<DeviceContext, T>(ctx, &ddx_safe,
+                                                            &ddy_safe, ddout);
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
index c9657a1b9db04675b2ebe17ecfc347df60eb7e29..aeef6ee71446f7dec695e79504def1ae13ceddee 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 
 namespace paddle {
 namespace operators {
@@ -28,7 +28,8 @@ struct DimensionsTransform {
   std::vector<DimVector> in_dims;
 
 private:
-  // 1. To compensate the lackage of input_tensors` dimension;
+  // To compensate for the missing dimensions of the input tensors, extending
+  // them according to the input variable 'axis'
   void InputDimensionsExtend(int N, int axis) {
     for (auto &in_dim : in_dims) {
       int64_t in_idx = 0;
@@ -70,7 +71,7 @@ struct DimensionsTransform {
   }
 
   template <typename MergeFunctor>
-  __inline__ void DimensionsReorganise(MergeFunctor merge_func, int N) {
+  __inline__ void MergeDimensions(MergeFunctor merge_func, int N) {
     auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) {
       (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx,
                                           vec->begin() + m_idx, 1,
@@ -139,7 +140,7 @@ struct DimensionsTransform {
     // To Merge the dimensions of input_tensors while the consequtive
     // equal-dimensions appears.
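Note: both the CPU and CUDA paths interpret 'axis' the same way: a lower-rank input is padded with 1s to the output rank, starting at 'axis' (which defaults to the rank difference). A self-contained illustration of that padding rule; the real logic lives in DimensionsTransform::InputDimensionsExtend below:

#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch of the rank-padding rule.
std::vector<int64_t> ExtendDims(const std::vector<int64_t>& in, int out_rank,
                                int axis) {
  std::vector<int64_t> padded(out_rank, 1);
  for (size_t i = 0; i < in.size(); ++i) {
    padded[axis + i] = in[i];  // the original dims follow 'axis' leading 1s
  }
  return padded;
}

int main() {
  // x: [2, 3, 4, 5], y: [4, 5], default axis = |4 - 2| = 2 -> y acts as
  // [1, 1, 4, 5]; with an explicit axis = 1, y = [3, 4] acts as [1, 3, 4, 1].
  for (int64_t d : ExtendDims({4, 5}, 4, 2)) std::cout << d << ' ';  // 1 1 4 5
}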
     MergeFunctor merge_ptr = merge_sequential_dims;
-    DimensionsReorganise<MergeFunctor>(merge_ptr, N);
+    MergeDimensions<MergeFunctor>(merge_ptr, N);
 
     int min_idx = 0;
     int min_val = std::accumulate(in_dims[0].begin(), in_dims[0].end(), 1,
@@ -155,12 +156,12 @@ struct DimensionsTransform {
 
     // To Merge the dimension of input_tensors while the consequtive
     // 1-value-dimensions appears.
     merge_ptr = merge_sequential_one_dims;
-    DimensionsReorganise<MergeFunctor>(merge_ptr, N);
+    MergeDimensions<MergeFunctor>(merge_ptr, N);
     std::swap(in_dims[min_idx], in_dims[0]);
   }
 };
 
-struct CalculateInputStrides {
+struct StridesCalculation {
   std::vector<std::vector<uint32_t>> strides;
   std::vector<FastDivMod> divmoders;
@@ -181,9 +182,9 @@ struct CalculateInputStrides {
   }
 
  public:
-  explicit CalculateInputStrides(
-      const int64_t &dim_size, const std::vector<std::vector<int64_t>> &in_dims,
-      const std::vector<int64_t> &out_dims) {
+  explicit StridesCalculation(const int64_t &dim_size,
+                              const std::vector<std::vector<int64_t>> &in_dims,
+                              const std::vector<int64_t> &out_dims) {
     const auto N = in_dims.size();
     divmoders.resize(dim_size);
     strides.resize(N, std::vector<uint32_t>(dim_size, 1));
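Note: StridesCalculation records, per input, a stride for every merged output dimension (0 along broadcast dimensions) plus a FastDivMod per output dimension. A self-contained host-side sketch of the resulting index mapping, using plain % and / where the kernel uses FastDivMod:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Output shape [2, 3] (row-major), input shape [1, 3] broadcast over dim 0;
  // dims are listed innermost first, and the broadcast dim gets stride 0.
  std::vector<uint32_t> out_dims = {3, 2};
  std::vector<uint32_t> in_strides = {1, 0};
  for (uint32_t idx = 0; idx < 6; ++idx) {
    uint32_t rest = idx, offset = 0;
    for (size_t i = 0; i < out_dims.size(); ++i) {
      offset += (rest % out_dims[i]) * in_strides[i];  // digit * stride
      rest /= out_dims[i];                             // peel next digit
    }
    std::printf("out %u -> in %u\n", idx, offset);  // 0,1,2,0,1,2
  }
}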
@@ -195,34 +196,40 @@ struct CalculateInputStrides {
   }
 };
 
-template <typename T, ElementwiseType ET, int VecSize, int kDims>
+template <typename T, ElementwiseType ET, int VecSize, int kDims,
+          typename Functor>
 struct BroadcastArgsWarpper {
-  using DimsVec = CudaAlignedVector<T, VecSize>;
+  using VecType = CudaAlignedVector<T, VecSize>;
 
   T *out_data;
+  VecType *vec_out_data;
   const T *__restrict__ in_data[ET];
-  uint32_t strides[ET][framework::DDim::kMaxRank];
+  const VecType *__restrict__ vec_in_data[ET];
   bool no_broadcast[ET];
   FastDivMod divmoders[kDims];
-  uint32_t scalar_offset;
+  uint32_t strides[ET][framework::DDim::kMaxRank];
+  uint32_t scalar_cal_offset;
+  Functor func;
 
   HOSTDEVICE BroadcastArgsWarpper(
-      const std::vector<const framework::Tensor *> &ins,
-      const CalculateInputStrides &offset_calculator, framework::Tensor *out,
-      int scalar_offset)
-      : scalar_offset(scalar_offset) {
+      const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
+      int scalar_cal_offset, Functor func,
+      const StridesCalculation &offset_calculator)
+      : scalar_cal_offset(scalar_cal_offset), func(func) {
     for (int j = 0; j < ET; ++j) {
       in_data[j] = ins[j]->data<T>();
+      vec_in_data[j] = reinterpret_cast<const VecType *>(in_data[j]);
       no_broadcast[j] = ins[j]->dims() == out->dims() ? true : false;
       memcpy(strides[j], offset_calculator.strides[j].data(),
              kDims * sizeof(uint32_t));
     }
     out_data = out->data<T>();
+    vec_out_data = reinterpret_cast<VecType *>(out_data);
     memcpy(divmoders, offset_calculator.divmoders.data(),
            kDims * sizeof(FastDivMod));
   }
 
-  __device__ __forceinline__ uint32_t GetDivmodOffset(int idx, int in_idx) {
+  __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) {
     uint32_t offset = 0;
 
 #pragma unroll(kDims)
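Note: the wrapper now keeps pre-cast VecType pointers so the no-broadcast path can issue one vectorized load or store per thread. A self-contained sketch of that access pattern (AlignedVec mirrors CudaAlignedVector from this patch):

#include <cuda_runtime.h>

template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVec {
  T val[Size];
};

template <typename T, int VecSize>
__global__ void VecCopy(const T *in, T *out, int n_vec) {
  using VecType = AlignedVec<T, VecSize>;
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < n_vec) {
    const VecType *vec_in = reinterpret_cast<const VecType *>(in);
    VecType *vec_out = reinterpret_cast<VecType *>(out);
    vec_out[tid] = vec_in[tid];  // one 128-bit load/store for float, VecSize=4
  }
}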
@@ -234,120 +241,127 @@ struct BroadcastArgsWarpper {
     return offset;
   }
 
-  __device__ __forceinline__ void CommonVector(DimsVec args[], int tid,
-                                               int idx) {
-    const DimsVec *__restrict__ vec_data =
-        reinterpret_cast<const DimsVec *__restrict__>(in_data[idx]);
-    args[idx] = vec_data[tid];
+  __device__ __forceinline__ void LoadVectorizedDataCommon(VecType *vector_args,
+                                                           int tid, int idx) {
+    *vector_args = vec_in_data[idx][tid];
   }
 
-  __device__ __forceinline__ void DivmodVector(DimsVec args[], int tid,
-                                               int idx) {
+  __device__ __forceinline__ void LoadVectorizedDataByDivmod(T *scalar_args,
+                                                             int tid, int idx) {
     int index = tid * VecSize;
-
+#pragma unroll(VecSize)
     for (int i = 0; i < VecSize; ++i) {
-      uint32_t offset = GetDivmodOffset(index + i, idx);
-      args[idx].val[i] = in_data[idx][offset];
+      uint32_t offset = GetOffsetByDivmod(index + i, idx);
+      scalar_args[i] = in_data[idx][offset];
     }
   }
 
-  __device__ __forceinline__ void CommonScalar(T args[], int tid, int idx) {
-    args[idx] = in_data[idx][tid + scalar_offset];
+  __device__ __forceinline__ void LoadScalarizedDataCommon(T args[], int tid,
+                                                           int idx) {
+    args[idx] = in_data[idx][tid + scalar_cal_offset];
   }
 
-  __device__ __forceinline__ void DivmodScalar(T args[], int tid, int idx) {
-    auto offset = GetDivmodOffset(tid + scalar_offset, idx);
+  __device__ __forceinline__ void LoadScalarizedDataByDivmod(T args[], int tid,
+                                                             int idx) {
+    auto offset = GetOffsetByDivmod(tid + scalar_cal_offset, idx);
     args[idx] = in_data[idx][offset];
   }
 
-  __device__ __forceinline__ void LoadVector(DimsVec args[], int tid) {
+  __device__ __forceinline__ void LoadVectorizedData(T (*args)[VecSize],
+                                                     int tid) {
 #pragma unroll(ET)
     for (int j = 0; j < ET; ++j) {
       if (no_broadcast[j]) {
-        CommonVector(args, tid, j);
+        VecType *vector_args = reinterpret_cast<VecType *>(args[j]);
+        LoadVectorizedDataCommon(vector_args, tid, j);
       } else {
-        DivmodVector(args, tid, j);
+        LoadVectorizedDataByDivmod(args[j], tid, j);
       }
     }
   }
 
-  __device__ __forceinline__ void LoadScalar(T args[], int tid) {
+  __device__ __forceinline__ void LoadScalarizedData(T args[], int tid) {
 #pragma unroll(ET)
     for (int j = 0; j < ET; ++j) {
       if (no_broadcast[j]) {
-        CommonScalar(args, tid, j);
+        LoadScalarizedDataCommon(args, tid, j);
       } else {
-        DivmodScalar(args, tid, j);
+        LoadScalarizedDataByDivmod(args, tid, j);
       }
     }
   }
 
-  __device__ __forceinline__ void StoreVector(DimsVec args[], int tid) {
-    DimsVec *vec_out = reinterpret_cast<DimsVec *>(out_data);
-    vec_out[tid] = args[0];
+  __device__ __forceinline__ void StoreVectorizedData(T (*args)[VecSize],
+                                                      int tid) {
+    VecType *args_out = reinterpret_cast<VecType *>(args[0]);
+    vec_out_data[tid] = *args_out;
   }
 
-  __device__ __forceinline__ void StoreScalar(T args[], int tid) {
-    out_data[scalar_offset + tid] = args[0];
+  __device__ __forceinline__ void StoreScalarizedData(T args[], int tid) {
+    out_data[scalar_cal_offset + tid] = args[0];
   }
 };
 
 template <typename T, ElementwiseType ET, typename BroadcastArgsWarpper>
 __device__ inline void ScalarizedBroadcastKernelImpl(
-    BroadcastArgsWarpper data_transfer, int tid) {
+    BroadcastArgsWarpper broadcast_warpper, int tid) {
   T args[ET];
-  data_transfer.LoadScalar(args, tid);
+  broadcast_warpper.LoadScalarizedData(args, tid);
 
 #pragma unroll(ET)
   for (int j = 1; j < ET; ++j) {
-    args[0] += args[j];
+    args[0] = broadcast_warpper.func(args);
   }
-  data_transfer.StoreScalar(args, tid);
+  broadcast_warpper.StoreScalarizedData(args, tid);
 }
 
 template <typename T, ElementwiseType ET, int VecSize,
           typename BroadcastArgsWarpper>
 __device__ inline void VectorizedBroadcastKernelImpl(
-    BroadcastArgsWarpper data_transfer, int tid) {
-  using VecT = CudaAlignedVector<T, VecSize>;
-  VecT args[ET];
-  data_transfer.LoadVector(args, tid);
+    BroadcastArgsWarpper broadcast_warpper, int tid) {
+  T ins[ET];
+  T args[ET][VecSize];
+  broadcast_warpper.LoadVectorizedData(args, tid);
 
-#pragma unroll(ET)
-  for (int j = 1; j < ET; ++j) {
 #pragma unroll(VecSize)
-    for (int i = 0; i < VecSize; ++i) {
-      args[0].val[i] += args[j].val[i];
+  for (int i = 0; i < VecSize; ++i) {
+#pragma unroll(ET)
+    for (int j = 0; j < ET; ++j) {
+      ins[j] = args[j][i];
     }
+    args[0][i] = broadcast_warpper.func(ins);
   }
-  data_transfer.StoreVector(args, tid);
+  broadcast_warpper.StoreVectorizedData(args, tid);
 }
 
 template <typename T, ElementwiseType ET, int VecSize,
           typename BroadcastArgsWarpper>
-__global__ void ElementwiseBroadcastKernel(BroadcastArgsWarpper data_transfer,
-                                           int main_tid, int tail_tid) {
+__global__ void ElementwiseBroadcastKernel(
+    BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  // Aimming at vectorized calculation of major data whose length is max
-  // multipler of VecSize.
+  // Vectorized calculation of the major part of the data, whose length is the
+  // largest multiple of VecSize,
+  // e.g. computing the first 1024 elements of 1027 in total when VecSize is
+  // 4.
   if (tid < main_tid) {
     VectorizedBroadcastKernelImpl<T, ET, VecSize, BroadcastArgsWarpper>(
-        data_transfer, tid);
+        broadcast_warpper, tid);
   }
-  // Aimming at scalar calculation of rest data whose lenght cannot fulfill
-  // VecSize.
+  // Scalarized calculation of the remaining data, whose length cannot fill a
+  // whole VecSize,
+  // e.g. computing the last 3 elements of 1027 in total when VecSize is 4.
   if (tid < tail_tid) {
-    ScalarizedBroadcastKernelImpl<T, ET, BroadcastArgsWarpper>(data_transfer,
-                                                               tid);
+    ScalarizedBroadcastKernelImpl<T, ET, BroadcastArgsWarpper>(
+        broadcast_warpper, tid);
   }
 }
 
-template <typename T, ElementwiseType ET, int VecSize>
+template <typename T, ElementwiseType ET, int VecSize, typename Functor>
 void LaunchBroadcastKernelForDifferentDimSize(
     const platform::CUDADeviceContext &ctx,
     const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
-    int axis) {
+    int axis, Functor func) {
   int numel = out->numel();
   const int threads = 256;
   int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
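Note: main_tid, tail_tid, and vec_len (the scalar_cal_offset) are computed in unchanged code of this file, presumably as numel / VecSize, numel % VecSize, and main_tid * VecSize. A worked example of the split for numel = 1027, VecSize = 4:

#include <cstdio>

int main() {
  const int numel = 1027, vec_size = 4;
  int main_tid = numel / vec_size;    // 256 threads do 4-wide vector work
  int tail_tid = numel % vec_size;    // 3 threads do leftover scalar work
  int vec_len = main_tid * vec_size;  // scalar work starts at offset 1024
  std::printf("%d %d %d\n", main_tid, tail_tid, vec_len);
}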
@@ -357,72 +371,72 @@ void LaunchBroadcastKernelForDifferentDimSize(
   auto stream = ctx.stream();
 
   const auto merge_dims = DimensionsTransform(ins, out->dims(), axis);
-  const auto offset_calculator = CalculateInputStrides(
+  const auto offset_calculator = StridesCalculation(
       merge_dims.dim_size, merge_dims.in_dims, merge_dims.out_dims);
 
   switch (merge_dims.dim_size) {
     case 1: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 1>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 1, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
           blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
     case 2: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 2>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 2, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
    case 3: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 3>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 3, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
    case 4: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 4>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 4, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
    case 5: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 5>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 5, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
    case 6: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 6>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 6, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
    case 7: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 7>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 7, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
    case 8: {
-      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 8>(
-          ins, offset_calculator, out, vec_len);
-      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(data_transfer)><<<
+      auto broadcast_warpper = BroadcastArgsWarpper<T, ET, VecSize, 8, Functor>(
+          ins, out, vec_len, func, offset_calculator);
+      ElementwiseBroadcastKernel<T, ET, VecSize, decltype(broadcast_warpper)><<<
          blocks, threads, 0, stream>>>(
-          data_transfer, main_tid, tail_tid);
+          broadcast_warpper, main_tid, tail_tid);
       break;
     }
     default: {
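Note: the switch above exists to turn the runtime merge_dims.dim_size into the compile-time kDims template argument, so that "#pragma unroll(kDims)" in GetOffsetByDivmod can fully unroll. The generic pattern, as a self-contained sketch:

#include <cstdio>

// Dispatch-to-constant: convert a runtime rank into a template parameter.
template <int kDims>
void RunImpl() { std::printf("compiled for rank %d\n", kDims); }

void Run(int dims) {
  switch (dims) {
    case 1: RunImpl<1>(); break;
    case 2: RunImpl<2>(); break;
    case 3: RunImpl<3>(); break;
    default: std::printf("unsupported rank %d\n", dims); break;
  }
}

int main() { Run(2); }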
@@ -437,9 +451,11 @@ void LaunchBroadcastKernelForDifferentDimSize(
 template <ElementwiseType ET, typename T, typename Functor>
 void LaunchBroadcastElementwiseCudaKernel(
     const platform::CUDADeviceContext &ctx,
-    const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
-    Functor func, int axis) {
+    const std::vector<const framework::Tensor *> &ins,
+    std::vector<framework::Tensor *> *outs, int axis, Functor func) {
+  static_assert(ET == (ElementwiseType)2, "Only support binary calculation.");
   int in_vec_size = 4;
+  framework::Tensor *out = (*outs)[0];
   for (auto *in : ins) {
     auto temp_size = GetVectorizedSizeImpl(in->data<T>());
     in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size)
@@ -450,19 +466,46 @@ void LaunchBroadcastElementwiseCudaKernel(
 
   switch (vec_size) {
     case 4: {
-      LaunchBroadcastKernelForDifferentDimSize<T, ET, 4>(ctx, ins, out, axis);
+      LaunchBroadcastKernelForDifferentDimSize<T, ET, 4>(ctx, ins, out, axis,
+                                                         func);
       break;
     }
     case 2: {
-      LaunchBroadcastKernelForDifferentDimSize<T, ET, 2>(ctx, ins, out, axis);
+      LaunchBroadcastKernelForDifferentDimSize<T, ET, 2>(ctx, ins, out, axis,
+                                                         func);
+      break;
+    }
+    case 1: {
+      LaunchBroadcastKernelForDifferentDimSize<T, ET, 1>(ctx, ins, out, axis,
+                                                         func);
       break;
     }
     default: {
-      LaunchBroadcastKernelForDifferentDimSize<T, ET, 1>(ctx, ins, out, axis);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported vectorized size: %d!", vec_size));
      break;
    }
  }
 }
 
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+void LaunchElementwiseCudaKernel(
+    const platform::CUDADeviceContext &cuda_ctx,
+    const std::vector<const framework::Tensor *> &ins,
+    std::vector<framework::Tensor *> *outs, int axis, Functor func) {
+  bool no_broadcast_flag = true;
+  for (auto *in : ins) {
+    no_broadcast_flag = no_broadcast_flag && ins[0]->dims() == in->dims();
+  }
+
+  if (no_broadcast_flag) {
+    LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>(
+        cuda_ctx, ins, outs, func);
+  } else {
+    LaunchBroadcastElementwiseCudaKernel<ET, InT>(
+        cuda_ctx, ins, outs, axis, func);
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 449863f93f2a733e687d558b885248fe35be327c..33a2b7e182f0a06232015abf34b810bcc1ce39af 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -15,8 +15,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/fast_divmod.h"
 
 #ifdef __HIPCC__
 #define ELEMENTWISE_BLOCK_SIZE 256
@@ -29,11 +28,6 @@ namespace operators {
 
 enum ElementwiseType { kUnary = 1, kBinary = 2 };
 
-template <typename T, int Size>
-struct alignas(sizeof(T) * Size) CudaAlignedVector {
-  T val[Size];
-};
-
 template <typename T>
 int GetVectorizedSizeImpl(const T *pointer) {
   uint64_t address = reinterpret_cast<uint64_t>(pointer);
@@ -181,7 +175,7 @@ __global__ void ScalarKernel(const InT *__restrict__ in0,
 }
 
 template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
-void LaunchElementwiseCudaKernel(
+void LaunchSameDimsElementwiseCudaKernel(
     const platform::CUDADeviceContext &ctx,
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
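Note: GetVectorizedSizeImpl (unchanged above) picks the vector width from pointer alignment. A self-contained sketch of the presumed rule for float:

#include <cstdint>
#include <cstdio>

// Use 4-wide access only when the address allows it, else fall back to 2 or 1.
int GetVecSize(const float* p) {
  uint64_t address = reinterpret_cast<uint64_t>(p);
  if (address % (sizeof(float) * 4) == 0) return 4;
  if (address % (sizeof(float) * 2) == 0) return 2;
  return 1;
}

int main() {
  alignas(16) float buf[8];
  std::printf("%d %d\n", GetVecSize(buf), GetVecSize(buf + 1));  // 4 1
}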
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h b/paddle/fluid/platform/fast_divmod.h
similarity index 91%
rename from paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h
rename to paddle/fluid/platform/fast_divmod.h
index 083bc6a1378ae39ddda7b00f6514f2162fa9c914..5c5903d62cd277f6d86e542f04de066c96374704 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h
+++ b/paddle/fluid/platform/fast_divmod.h
@@ -14,13 +14,19 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
+#include <cstdint>
+#include "paddle/fluid/platform/hostdevice.h"
 
 #define INT_BITS 32
 
 namespace paddle {
 namespace operators {
 
+template <typename T, int Size>
+struct alignas(sizeof(T) * Size) CudaAlignedVector {
+  T val[Size];
+};
+
 struct FastDivMod {
   // 1st value represents the result of input number divides by recorded divisor
   // 2nd value represents the result of input number modulo by recorded divisor
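Note: FastDivMod replaces hardware integer division in the inner loop with a precomputed multiply-and-shift. A standalone demonstration of the underlying magic-number trick (the in-tree implementation differs in detail, e.g. it uses a per-divisor shift):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t divisor = 5;
  const int shift = 32;
  // ceil(2^32 / divisor), precomputed once per divisor on the host
  const uint64_t magic = ((1ull << shift) + divisor - 1) / divisor;
  uint32_t n = 1027;
  uint32_t q = static_cast<uint32_t>((n * magic) >> shift);  // n / 5 = 205
  uint32_t r = n - q * divisor;                               // n % 5 = 2
  std::printf("%u %u\n", q, r);
}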