diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index f3ddcd4e5c23c75f9715f6624cdbdf420309dd7e..adea0c04df59cd1715905e75dbbe9012add7d134 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -36,7 +36,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index e3d4607b7130c5fc6abeafb9a617589a8c2370cc..f568ee191239d205fcf5eef77cb4a53959ae1952 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -15,45 +15,13 @@ #pragma once #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" + +// only can include the headers in paddle/top/api dirs +#include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { namespace operators { -namespace kps = paddle::operators::kernel_primitives; - -template -void LaunchBroadcastElementwiseCudaKernel( - const KPDevice &ctx, const std::vector &ins, - std::vector *outs, int axis, Functor func) { - std::vector pt_inputs; - std::vector pt_outputs; - // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary - // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp - // can be deleted - // when DenseTensor support copy constructor. - std::vector> pt_inputs_tmp; - std::vector> pt_outputs_tmp; - for (auto in : ins) { - pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); - } - for (auto out : *outs) { - pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); - } - for (int i = 0; i < pt_inputs_tmp.size(); i++) { - pt_inputs.push_back(pt_inputs_tmp[i].get()); - } - for (int i = 0; i < pt_outputs_tmp.size(); i++) { - pt_outputs.push_back(pt_outputs_tmp[i].get()); - } - pten::LaunchBroadcastElementwiseCudaKernel( - ctx, pt_inputs, &pt_outputs, axis, func); -} - template void LaunchElementwiseCudaKernel( @@ -82,7 +50,7 @@ void LaunchElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::LaunchElementwiseCudaKernel( + pten::funcs::BroadcastKernel( ctx, pt_inputs, &pt_outputs, axis, func); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 233b0767ed624489c478b953e26ed43c91d5b4fc..88c4a266cd9a108cc597e81d281cf5b02299fce6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace paddle { namespace operators { @@ -53,8 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::funcs::LaunchSameDimsElementwiseCudaKernel( - ctx, pt_inputs, &pt_outputs, func); + pten::funcs::ElementwiseKernel(ctx, pt_inputs, + &pt_outputs, func); } } // namespace operators diff --git a/paddle/pten/kernels/funcs/broadcast_function.h b/paddle/pten/kernels/funcs/broadcast_function.h new file mode 100644 index 0000000000000000000000000000000000000000..0823fa32b775c500f984287e676d2ad4e6421b43 --- /dev/null +++ b/paddle/pten/kernels/funcs/broadcast_function.h @@ -0,0 +1,585 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +#if defined(__NVCC__) || defined(__HIPCC__) + +namespace kps = pten::kps; + +#endif + +namespace pten { +namespace funcs { + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)( + bool &, std::vector &, DimVector &, int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(pten::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + axis + 1, + out_dims[axis], + in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(pten::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + in_idx + 1, + out_dims[in_idx], + in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, + vec->begin() + m_idx, + 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; 
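For readers following the new DimensionsTransform in broadcast_function.h: InputDimensionsExtend above pads every input shape with 1s until it reaches the output rank (starting at `axis`), and MergeDimensions, whose loop continues below, then fuses runs of consecutive dimensions that share one broadcast pattern, so the kernel only has to index the smallest possible rank. The following standalone sketch shows the effect on concrete shapes; AlignToOutputRank and MergeConsecutive are hypothetical illustrative helpers, not Paddle APIs, and the real code additionally reverses the dims and routes the pattern test through a MergeFunctor.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

using Shape = std::vector<int64_t>;

// Pad a lower-rank input shape with 1s so it matches the output rank,
// placing its dims starting at `axis` (what InputDimensionsExtend does
// before the merge passes run). Assumes 0 <= axis and the shape fits.
Shape AlignToOutputRank(const Shape &in, std::size_t out_rank, int axis) {
  Shape aligned(out_rank, 1);
  for (std::size_t i = 0; i < in.size(); ++i) aligned[axis + i] = in[i];
  return aligned;
}

// Collapse runs of consecutive dimensions whose broadcast pattern is the
// same for every input (each input either matches the output or is 1 across
// the whole run), so the merged shapes have the lowest rank possible.
void MergeConsecutive(std::vector<Shape> *ins, Shape *out) {
  std::vector<Shape> m_ins(ins->size());
  Shape m_out;
  for (std::size_t d = 0; d < out->size(); ++d) {
    bool fuse = !m_out.empty();
    for (std::size_t j = 0; j < ins->size() && fuse; ++j) {
      bool was_bcast = m_ins[j].back() == 1 && m_out.back() != 1;
      bool is_bcast = (*ins)[j][d] == 1 && (*out)[d] != 1;
      fuse = (was_bcast == is_bcast);  // only fuse dims with equal patterns
    }
    if (fuse) {
      m_out.back() *= (*out)[d];
      for (std::size_t j = 0; j < ins->size(); ++j)
        m_ins[j].back() *= (*ins)[j][d];
    } else {
      m_out.push_back((*out)[d]);
      for (std::size_t j = 0; j < ins->size(); ++j)
        m_ins[j].push_back((*ins)[j][d]);
    }
  }
  *ins = m_ins;
  *out = m_out;
}

int main() {
  Shape out = {2, 3, 4, 5};
  std::vector<Shape> ins = {
      AlignToOutputRank({3, 4}, out.size(), 1),  // -> {1, 3, 4, 1}
      out};                                      // already full rank
  MergeConsecutive(&ins, &out);
  auto print = [](const char *name, const Shape &s) {
    std::printf("%s:", name);
    for (auto v : s) std::printf(" %lld", static_cast<long long>(v));
    std::printf("\n");
  };
  print("out", out);     // out: 2 12 5
  print("in0", ins[0]);  // in0: 1 12 1
  print("in1", ins[1]);  // in1: 2 12 5
  return 0;
}

With out = {2, 3, 4, 5} and an input of shape {3, 4} aligned at axis 1, the merged shapes drop from rank 4 to rank 3, which is the rank that the CALL_BROADCAST_FOR_DIM_SIZE switch later in this file dispatches on.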
+ + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform(const std::vector &ins, + const pten::DDim &dims, + int axis) { + const int N = max(static_cast(ins.size()), 2); + dim_size = dims.size(); + out_dims = pten::vectorize(dims); + in_dims.resize(N); + if (ins.size() == 1) { + // when ins.size() = 1, broadcast input to output + in_dims[0] = pten::vectorize(ins[0]->dims()); + in_dims[1] = out_dims; + // Add out_dims to in_dims to avoid errors in dims merging + } else { + for (int j = 0; j < N; ++j) { + in_dims[j] = pten::vectorize(ins[j]->dims()); + } + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + for (int j = 1; j < num; ++j) { + equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal &= in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. + MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate( + in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate( + in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. 
+ merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +__device__ __forceinline__ void LoadData( + T *dst, + const _ptr_ T *src, + uint32_t block_offset, + const kps::details::BroadcastConfig &config, + int numel, + int num, + int need_broadcast) { + // numel : whole num of output + // num: how many data will be deal with in this time + if (need_broadcast) { + kps::ReadDataBc( + dst, src, block_offset, config, numel); + } else { + kps::ReadData(dst, src + block_offset, num); + } +} + +template +__device__ void VectorizedBroadcastKernelImpl( + const pten::Array &ins, + pten::Array<_ptr_ OutT *, NumOuts> outs, + const pten::Array &use_broadcast, + uint32_t numel, + const pten::Array, Arity> &configs, + int num, + int block_offset, + Functor func) { + InT args[Arity][VecSize]; + ConditionalT result[VecSize]; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + LoadData(args[i], + ins[i], + block_offset, + configs[i], + numel, + num, + use_broadcast[i]); + } + constexpr bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + pten::funcs::ElementwisePrimitiveCaller, + VecSize, + Functor, + Arity, + kCallElementwiseAny>()( + func, args, result); + + pten::funcs::ElementwiseWriteDataCaller()( + outs, result, block_offset, num); +} + +template +__global__ void VectorizedBroadcastKernel( + pten::Array ins, + pten::Array<_ptr_ OutT *, NumOuts> outs, + pten::Array use_broadcast, + uint32_t numel, + pten::Array, Arity> configs, + int main_offset, + int tail_tid, + Functor func) { + int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + +#ifdef PADDLE_WITH_XPU2 + for (; block_offset < main_offset; block_offset += stride) { + VectorizedBroadcastKernelImpl(ins, + outs, + use_broadcast, + numel, + configs, + BLOCK_NUM_X * VecSize, + block_offset, + func); + } + int num = numel - block_offset; + if (num > 0) { + VectorizedBroadcastKernelImpl( + ins, outs, use_broadcast, numel, configs, num, block_offset, func); + } +#else + if (block_offset < main_offset) { + VectorizedBroadcastKernelImpl(ins, + outs, + use_broadcast, + numel, + configs, + BLOCK_NUM_X * VecSize, + block_offset, + func); + } else { + VectorizedBroadcastKernelImpl( + ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + } +#endif +} + +template +void LaunchBroadcastKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func, + DimensionsTransform merge_dims) { + int numel = (*outs)[0]->numel(); + pten::Array, Arity> configs; + pten::Array use_broadcast; + pten::Array ins_data; + pten::Array<_ptr_ OutT *, NumOuts> outs_data; + + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = ctx.Alloc((*outs)[i]); + } + + for (int i = 0; i < Arity; i++) { + use_broadcast[i] = (ins[i]->numel() != numel); + ins_data[i] = (_ptr_ InT *)(ins[i]->data()); + if (use_broadcast[i]) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + configs[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + } + } + +#ifdef PADDLE_WITH_XPU2 + const int threads = 64; + const int blocks = 8; + int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.x_context()->xpu_stream; + VectorizedBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + func); +#else + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.stream(); + VectorizedBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + func); +#endif +} + +template +void BroadcastKernelForDifferentDimSize( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); + +#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ + case rank: { \ + LaunchBroadcastKernel( \ + ctx, ins, outs, func, merge_dims); \ + } break; + + switch (merge_dims.dim_size) { + CALL_BROADCAST_FOR_DIM_SIZE(1); + CALL_BROADCAST_FOR_DIM_SIZE(2); + CALL_BROADCAST_FOR_DIM_SIZE(3); + CALL_BROADCAST_FOR_DIM_SIZE(4); + CALL_BROADCAST_FOR_DIM_SIZE(5); + CALL_BROADCAST_FOR_DIM_SIZE(6); + CALL_BROADCAST_FOR_DIM_SIZE(7); + CALL_BROADCAST_FOR_DIM_SIZE(8); + default: { + PADDLE_THROW(pten::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.", + merge_dims.dim_size, + pten::DDim::kMaxRank)); + } + } +#undef CALL_BROADCAST_FOR_DIM_SIZE +} + +template +void BroadcastKernelForDifferentVecSize( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + pten::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_LE(kArity, + 3, + pten::errors::InvalidArgument( + "Currently only broadcast of ternary is supported " + "and verified, but received %d.", + kArity)); + PADDLE_ENFORCE_EQ(outs->size(), + NumOuts, + pten::errors::InvalidArgument( + "Number of outputs shall equal to number of functions, " + "but number of outputs is %d, of functions is %d.", + outs->size(), + NumOuts)); + int in_vec_size = 4; + int out_vec_size = 4; + if (NumOuts > 1) { + for (int i = 0; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + pten::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%d-th output tensor`s shape is not.", + i)); + out_vec_size = std::min( + paddle::platform::GetVectorizedSize((*outs)[i]->data()), + out_vec_size); + } + } else { + out_vec_size = + paddle::platform::GetVectorizedSize((*outs)[0]->data()); + } + + for (auto *in : ins) { + auto temp_size = paddle::platform::GetVectorizedSize(in->data()); + in_vec_size = in->dims() == (*outs)[0]->dims() + ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + break; + } + case 2: { + BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + break; + } + case 1: { + BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + break; + } + default: { + PADDLE_THROW(pten::errors::Unimplemented( + "Unsupported vectorized size: %d!", vec_size)); + break; + } + } +} + +template +void BroadcastKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + std::vector dims_size; + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag &= ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); + } + + if (ins.size() > 0 && outs->size() > 0) { + no_broadcast_flag &= outs->at(0)->dims() == ins[0]->dims(); + } + + if (no_broadcast_flag) { + pten::funcs::ElementwiseKernel( + ctx, ins, outs, func); + } else { + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + BroadcastKernelForDifferentVecSize( + ctx, ins, outs, axis, func); + } +} + +#endif + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h deleted file mode 100644 index 483e58eedb4d13e9b45b9dd7065974bdd9606085..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/funcs/cuda_kernel_config.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/pten/backends/gpu/gpu_context.h" - -#ifdef __HIPCC__ -#define ELEMENTWISE_BLOCK_SIZE 256 -#else -#define ELEMENTWISE_BLOCK_SIZE 512 -#endif - -namespace pten { -namespace funcs { -/* -* According to NVIDIA, if number of threads per block is 64/128/256/512, -* cuda performs better. And number of blocks should be greater (at least -* 2x~4x) than number of SMs. Hence, SM count is took into account within -* this function to determine the right number of threads per block. -*/ -inline int GetThreadsConfig(const pten::GPUContext &ctx, - int64_t numel, - int vec_size) { - int threads = ELEMENTWISE_BLOCK_SIZE; - int sm_count = ctx.GetSMCount(); - int active_threads_num = numel / vec_size; - if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about twice of SM, to acquire better performance. 
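For context on the helper being deleted here: GetThreadsConfig picked the per-block thread count by aiming for roughly two (or, failing that, four) resident blocks per SM and rounding the result up to a power of two, never dropping below 64 threads. A minimal self-contained sketch of that heuristic follows; RoundUpToPowerOfTwo and ThreadsPerBlockSketch are illustrative names, not Paddle APIs, and the removed code used paddle::platform::RoundToPowerOfTwo with sm_count taken from the device context.

#include <algorithm>
#include <cstdint>

// Round n up to the next power of two (n <= 1 maps to 1).
static inline int RoundUpToPowerOfTwo(int n) {
  int p = 1;
  while (p < n) p <<= 1;
  return p;
}

// Choose threads per block so that about 2x (or 4x) sm_count blocks cover
// all numel / vec_size active threads; max_threads is 512 for CUDA and 256
// for HIP in the removed header, and the result never goes below 64.
inline int ThreadsPerBlockSketch(int64_t numel, int vec_size, int sm_count,
                                 int max_threads) {
  int threads = max_threads;
  int active = static_cast<int>(numel / vec_size);
  if (active / (sm_count << 1) < max_threads) {
    threads = RoundUpToPowerOfTwo(active / (sm_count << 1));
  } else if (active / (sm_count << 2) < max_threads) {
    threads = RoundUpToPowerOfTwo(active / (sm_count << 2));
  }
  return std::max(64, threads);
}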
- threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 1)); - } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about 4 times of SM, to acquire better performance. - threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 2)); - } - // Number of threads per block shall be larger than 64. - return std::max(64, threads); -} - -} // namespace funcs -} // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index fd89c55f68c59807ab1e0fde97f7196ea305fda3..438b6e8ffbe8e793a29baf13fd33c7283c28cdf0 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -746,11 +746,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx, } template -void LaunchSameDimsElementwiseCudaKernel( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { +void ElementwiseKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { using Traits = paddle::platform::FunctionTraits; const int kArity = Traits::arity; PADDLE_ENFORCE_EQ(ins.size(), diff --git a/paddle/pten/kernels/gpu/abs_kernel.cu b/paddle/pten/kernels/gpu/abs_kernel.cu index 06eff050674c3670a2aa07cb43d0baea82fe7202..d6fa5c8d34c990ee9264c62309b6e6b3b4327405 100644 --- a/paddle/pten/kernels/gpu/abs_kernel.cu +++ b/paddle/pten/kernels/gpu/abs_kernel.cu @@ -47,8 +47,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::LaunchSameDimsElementwiseCudaKernel>( - ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(ctx, ins, &outs, functor); } } // namespace pten diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 006b4f3687c44bcefe5f710a113abb9cd5a3ffc9..b47871aadcb790401faa193f1bb19664da8efdbb 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -44,7 +44,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); dev_ctx.Alloc(out); - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, CastFuctor()); } diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index d01102f2b331b77c0555deef0273a69dcec726ad..1b6df97d5e3d5f81346e8834076af109f77e17f1 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -15,9 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/funcs/broadcast_function.h" #include "paddle/pten/kernels/funcs/common_shape.h" -#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" -#include "paddle/pten/kernels/funcs/elementwise_base.h" #include "paddle/pten/kernels/gpu/reduce.h" #ifdef __HIPCC__ @@ -36,555 +35,8 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; } while (0) namespace pten { -// FORWARD CODE -struct DimensionsTransform { - using DimVector = std::vector; - typedef void (*MergeFunctor)( - bool &, std::vector &, DimVector &, int, int); - int64_t dim_size; - DimVector out_dims; - std::vector in_dims; - - private: - // To compensate the lackage of input_tensors` dimension with input variable - // 'axis' - void InputDimensionsExtend(int N, int axis) { - for (auto &in_dim : in_dims) { - int64_t in_idx = 0; - if (in_dim.size() < dim_size) { - DimVector tmp_dim(dim_size, 1); - do { - if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { - tmp_dim[axis] = in_dim[in_idx]; - in_idx++; - axis++; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, - axis + 1, - out_dims[axis], - in_dim[in_idx])); - } - } while (in_idx < in_dim.size()); - in_dim.resize(dim_size); - std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); - } else { - do { - if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { - in_idx++; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, - in_idx + 1, - out_dims[in_idx], - in_dim[in_idx])); - } - } while (in_idx < dim_size); - } - std::reverse(in_dim.begin(), in_dim.end()); - } - std::reverse(out_dims.begin(), out_dims.end()); - } - - template - __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { - auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { - (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, - vec->begin() + m_idx, - 1, - std::multiplies()); - vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); - }; - - int64_t i = 0; - while (i < dim_size) { - int cnt = 0; - int low_idx = i; - bool equal = true; - do { - merge_func(equal, in_dims, out_dims, i, N); - if (equal) { - i++; - cnt++; - } else { - break; - } - } while (i < dim_size); - - if (cnt > 1) { - for (auto &in_dim : in_dims) { - VectorReorganise(&in_dim, low_idx, i); - } - VectorReorganise(&out_dims, low_idx, i); - dim_size -= --cnt; - i -= cnt; - } else if (cnt < 1) { - i++; - } - } - } - - public: - explicit DimensionsTransform(const std::vector &ins, - const pten::DDim &dims, - int axis) { - const int N = max(static_cast(ins.size()), 2); - dim_size = dims.size(); - out_dims = pten::vectorize(dims); - in_dims.resize(N); - if (ins.size() == 1) { - // when ins.size() = 1, broadcast input to output - in_dims[0] = pten::vectorize(ins[0]->dims()); - in_dims[1] = out_dims; - // Add out_dims to in_dims to avoid errors in dims merging - } else { - for (int j = 0; j < N; ++j) { - in_dims[j] = pten::vectorize(ins[j]->dims()); - } - } - InputDimensionsExtend(N, axis); - - auto merge_sequential_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - for (int j = 1; j < num; ++j) { - equal &= (in_dims[0][i] == in_dims[j][i]) ? 
true : false; - } - }; - auto merge_sequential_one_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - equal = in_dims[0][i] == 1; - if (equal) { - for (int j = 1; j < num; ++j) { - equal &= in_dims[j][i] == out[i]; - } - } - }; - // To Merge the dimensions of input_tensors while the consequtive - // equal-dimensions appears. - MergeFunctor merge_ptr = merge_sequential_dims; - MergeDimensions(merge_ptr, N); - - int min_idx = 0; - int min_val = std::accumulate( - in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); - for (int j = 1; j < N; ++j) { - int temp = std::accumulate( - in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); - min_val = min_val > temp ? temp : min_val; - min_idx = min_val == temp ? j : min_idx; - } - std::swap(in_dims[0], in_dims[min_idx]); - - // To Merge the dimension of input_tensors while the consequtive - // 1-value-dimensions appears. - merge_ptr = merge_sequential_one_dims; - MergeDimensions(merge_ptr, N); - std::swap(in_dims[min_idx], in_dims[0]); - } -}; - -template -__device__ __forceinline__ void LoadData( - T *dst, - const _ptr_ T *src, - uint32_t block_offset, - const kps::details::BroadcastConfig &config, - int numel, - int num, - int need_broadcast) { - // numel : whole num of output - // num: how many data will be deal with in this time - if (need_broadcast) { - kps::ReadDataBc( - dst, src, block_offset, config, numel); - } else { - kps::ReadData(dst, src + block_offset, num); - } -} - -template -__device__ void ElementwiseBroadcastKernelImpl( - const pten::Array &ins, - pten::Array<_ptr_ OutT *, NumOuts> outs, - const pten::Array &use_broadcast, - uint32_t numel, - const pten::Array, Arity> &configs, - int num, - int block_offset, - Functor func) { - InT args[Arity][VecSize]; - ConditionalT result[VecSize]; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], - ins[i], - block_offset, - configs[i], - numel, - num, - use_broadcast[i]); - } - constexpr bool kCallElementwiseAny = - paddle::platform::FunctionTraits::has_pointer_args; - pten::funcs::ElementwisePrimitiveCaller, - VecSize, - Functor, - Arity, - kCallElementwiseAny>()( - func, args, result); - - pten::funcs::ElementwiseWriteDataCaller()( - outs, result, block_offset, num); -} - -template -__global__ void ElementwiseBroadcastKernel( - pten::Array ins, - pten::Array<_ptr_ OutT *, NumOuts> outs, - pten::Array use_broadcast, - uint32_t numel, - pten::Array, Arity> configs, - int main_offset, - int tail_tid, - Functor func) { - int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; - -#ifdef PADDLE_WITH_XPU2 - for (; block_offset < main_offset; block_offset += stride) { - ElementwiseBroadcastKernelImpl(ins, - outs, - use_broadcast, - numel, - configs, - BLOCK_NUM_X * VecSize, - block_offset, - func); - } - int num = numel - block_offset; - if (num > 0) { - ElementwiseBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, num, block_offset, func); - } -#else - if (block_offset < main_offset) { - ElementwiseBroadcastKernelImpl(ins, - outs, - use_broadcast, - numel, - configs, - BLOCK_NUM_X * VecSize, - block_offset, - func); - } else { - ElementwiseBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); - } -#endif -} - -template -void LaunchKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func, - DimensionsTransform merge_dims) { - int numel = 
(*outs)[0]->numel(); - pten::Array, Arity> configs; - pten::Array use_broadcast; - pten::Array ins_data; - pten::Array<_ptr_ OutT *, NumOuts> outs_data; - - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = ctx.Alloc((*outs)[i]); - } - - for (int i = 0; i < Arity; i++) { - use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = (_ptr_ InT *)(ins[i]->data()); - if (use_broadcast[i]) { - // get the broadcast config, - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); - } - } - -#ifdef PADDLE_WITH_XPU2 - const int threads = 64; - const int blocks = 8; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.x_context()->xpu_stream; - ElementwiseBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#else - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.stream(); - ElementwiseBroadcastKernel<<>>( - ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#endif -} - -template -void LaunchBroadcastKernelForDifferentVecSize( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); - -#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ - case rank: { \ - LaunchKernel( \ - ctx, ins, outs, func, merge_dims); \ - } break; - - switch (merge_dims.dim_size) { - CALL_BROADCAST_FOR_DIM_SIZE(1); - CALL_BROADCAST_FOR_DIM_SIZE(2); - CALL_BROADCAST_FOR_DIM_SIZE(3); - CALL_BROADCAST_FOR_DIM_SIZE(4); - CALL_BROADCAST_FOR_DIM_SIZE(5); - CALL_BROADCAST_FOR_DIM_SIZE(6); - CALL_BROADCAST_FOR_DIM_SIZE(7); - CALL_BROADCAST_FOR_DIM_SIZE(8); - default: { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.\n", - merge_dims.dim_size, - pten::DDim::kMaxRank)); - } - } -#undef CALL_BROADCAST_FOR_DIM_SIZE -} - -template -void LaunchBroadcastElementwiseCudaKernel( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - using Traits = paddle::platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - paddle::platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. 
But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - PADDLE_ENFORCE_LE(kArity, - 3, - paddle::platform::errors::InvalidArgument( - "Currently only broadcast of ternary is supported " - "and verified, but received %d.", - kArity)); - PADDLE_ENFORCE_EQ(outs->size(), - NumOuts, - paddle::platform::errors::InvalidArgument( - "Number of outputs shall equal to number of functions, " - "but number of outputs is %d, of functions is %d.", - outs->size(), - NumOuts)); - int in_vec_size = 4; - int out_vec_size = 4; - if (NumOuts > 1) { - for (int i = 0; i < NumOuts; ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - paddle::platform::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, but " - "%dth output tensor`s shape is not.", - i)); - out_vec_size = std::min( - paddle::platform::GetVectorizedSize((*outs)[i]->data()), - out_vec_size); - } - } else { - out_vec_size = - paddle::platform::GetVectorizedSize((*outs)[0]->data()); - } - - for (auto *in : ins) { - auto temp_size = paddle::platform::GetVectorizedSize(in->data()); - in_vec_size = in->dims() == (*outs)[0]->dims() - ? std::min(temp_size, in_vec_size) - : in_vec_size; - } - int vec_size = std::min(out_vec_size, in_vec_size); - - switch (vec_size) { - case 4: { - LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); - break; - } - case 2: { - LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); - break; - } - case 1: { - LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); - break; - } - default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } - } -} - -template -void LaunchElementwiseCudaKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - std::vector dims_size; - bool no_broadcast_flag = true; - for (auto *in : ins) { - no_broadcast_flag &= ins[0]->dims() == in->dims(); - dims_size.emplace_back(in->dims().size()); - } - if (no_broadcast_flag) { - pten::funcs::LaunchSameDimsElementwiseCudaKernel( - ctx, ins, outs, func); - } else { - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; - pten::LaunchBroadcastElementwiseCudaKernel( - ctx, ins, outs, axis, func); - } -} +// General binary elementwise comutaion with the support of broadcast. template void ElementwiseCompute(const GPUContext &dev_ctx, const DenseTensor &x, @@ -595,12 +47,10 @@ void ElementwiseCompute(const GPUContext &dev_ctx, std::vector ins = {&x, &y}; std::vector outs = {z}; z->mutable_data(dev_ctx.GetPlace()); - pten::LaunchElementwiseCudaKernel( + pten::funcs::BroadcastKernel( dev_ctx, ins, &outs, axis, func); } -// BACKWARD CODE - // Suppose only has contiguous dims static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { for (int i = 1; i < broadcast_pos.size(); ++i) { diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index 937d398be769b33215a6bce5c8531d0db17f5cb4..2a7ec387c8175229505c0d82ce4156c91d514344 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -49,7 +49,7 @@ void FullKernel(const Context& dev_ctx, // This function has no input, so the inputs.size() == 0. 
Use kUnary, but // the data will not be loaded in the kernel because the number of // parameters in the operator is 0 - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, FullFuctor(val.to())); } } @@ -91,7 +91,7 @@ void FullLikeKernel(const Context& dev_ctx, // the operator is 0 int numel = out->numel(); if (numel > 0) { - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, FullFuctor(value)); } } diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 3b7122ba1b9164c19b98eaf4b9707301234fb54b..a066866c431597f3e7aebbeefaa6c3a84b8bae29 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -48,7 +48,7 @@ namespace pten { inputs.emplace_back(&y); \ outputs.emplace_back(out); \ dev_ctx.template Alloc(out); \ - LaunchElementwiseCudaKernel( \ + funcs::BroadcastKernel( \ dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index f5296477ceb2a0f346821227e47338b2bfdfd0df..43c19176493d90f700807d9d6b2b5fd607727e48 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -1091,8 +1091,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx, if (config.reduce_num == 1) { std::vector inputs = {&x}; std::vector outputs = {y}; - funcs::LaunchSameDimsElementwiseCudaKernel( - dev_ctx, inputs, &outputs, transform); + funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); return; } diff --git a/paddle/pten/kernels/gpu/reduce_grad.h b/paddle/pten/kernels/gpu/reduce_grad.h index a626f2f70e7fb1ea0dc1d8ea419abf8022758459..517192f0f93a5cb8b11da7f00d85d24fa78c1e81 100644 --- a/paddle/pten/kernels/gpu/reduce_grad.h +++ b/paddle/pten/kernels/gpu/reduce_grad.h @@ -22,8 +22,10 @@ #include #include #include -#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/funcs/broadcast_function.h" + namespace pten { + template void ReduceGrad(const GPUContext& dev_ctx, DenseTensor* d_out, @@ -33,12 +35,11 @@ void ReduceGrad(const GPUContext& dev_ctx, std::vector inputs = {d_out}; std::vector outputs = {d_x}; PD_VISIT_ALL_TYPES( - out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] { - LaunchBroadcastElementwiseCudaKernel( + out_dtype, "BroadcastKernel", ([&] { + funcs::BroadcastKernel( dev_ctx, inputs, &outputs, 0, functor); })); } + } // namespace pten #endif diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index e1cf78224a19dcb8d40dd08b836d4e2b2fa85480..edb9a35efbde645811637f6a44018ddc412a8624 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); dev_ctx.template Alloc(out); - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs,
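Both of the surviving entry points, pten::funcs::ElementwiseKernel and pten::funcs::BroadcastKernel, choose their vectorization width through paddle::platform::GetVectorizedSize and then dispatch on widths 4, 2, or 1. Assuming that helper's rule is based on pointer alignment (a simplification; VectorizedSizeSketch below is illustrative and not Paddle code), the selection behaves roughly as follows.

#include <cstdint>
#include <cstdio>

// Widest of 4/2/1 elements for which the pointer is aligned to the
// corresponding vector type, so the kernel can issue vectorized loads
// and stores safely.
template <typename T>
int VectorizedSizeSketch(const T *ptr) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  if (addr % (sizeof(T) * 4) == 0) return 4;  // float4-style access
  if (addr % (sizeof(T) * 2) == 0) return 2;  // float2-style access
  return 1;                                   // scalar fallback
}

int main() {
  alignas(16) float buf[8] = {};
  std::printf("%d %d %d\n",
              VectorizedSizeSketch(buf),       // 4: 16-byte aligned
              VectorizedSizeSketch(buf + 2),   // 2: only 8-byte aligned
              VectorizedSizeSketch(buf + 1));  // 1: only 4-byte aligned
  return 0;
}

In the patch itself, the final vec_size is the minimum over every output and every input whose shape matches the output, so a single less-aligned tensor pulls the whole launch down to a narrower width before the 4/2/1 switch runs.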