From c13103430346d4c5f63e026bb4dbc826e059fb5d Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 12 Nov 2021 10:22:51 +0800 Subject: [PATCH] [Pten]Refactor the Elementwise_add Kernel (#37043) * elementwise_add kernel refactor * fix compile bugs in elementwise_add refactor * fix compile bugs when run in npu/xpu * fix bugs when run unit test * fix bugs when run ci-windows * modify code as recommended * code format adjust * fix bugs when run ci * fix compile bug when run in ci-windwos --- .../elementwise/elementwise_add_op.cu | 22 +- .../elementwise/elementwise_add_op.h | 21 +- .../operators/elementwise/elementwise_op.h | 11 + .../elementwise/elementwise_op_broadcast.cu.h | 239 ++-------- .../elementwise/elementwise_op_function.h | 423 ++---------------- .../elementwise/elementwise_op_impl.cu.h | 205 ++------- .../kernel_primitives/functor_primitives.h | 2 + paddle/pten/api/include/math.h | 2 + paddle/pten/api/lib/math.cc | 35 ++ paddle/pten/core/dense_tensor.cc | 4 +- paddle/pten/include/math.h | 14 + paddle/pten/infershape/binary.cc | 47 +- paddle/pten/infershape/binary.h | 3 + paddle/pten/kernels/cpu/CMakeLists.txt | 2 +- paddle/pten/kernels/cpu/math.cc | 37 ++ paddle/pten/kernels/cpu/math.h | 7 + paddle/pten/kernels/cuda/math.cu | 32 ++ paddle/pten/kernels/cuda/math.h | 7 + paddle/pten/kernels/functions/CMakeLists.txt | 2 + .../kernels/functions/blas/CMakeLists.txt | 0 .../pten/kernels/functions/blas/elementwise.h | 33 ++ .../pten/kernels/functions/cpu/CMakeLists.txt | 0 .../pten/kernels/functions/cpu/elementwise.h | 230 ++++++++++ .../kernels/functions/cuda/CMakeLists.txt | 0 .../functions/cuda/elementwise/elementwise.h | 48 ++ .../elementwise/elementwise_broadcast.cu.h | 412 +++++++++++++++++ .../cuda/elementwise/elementwise_common.cu.h | 79 ++++ .../elementwise/elementwise_no_broadcast.cu.h | 183 ++++++++ .../kernels/functions/eigen/elementwise.h | 36 ++ .../kernels/functions/general/CMakeLists.txt | 0 .../functions/general/elementwise_base.h | 384 ++++++++++++++++ .../functions/general/elementwise_functor.h | 74 +++ paddle/pten/tests/api/CMakeLists.txt | 1 + paddle/pten/tests/api/test_elementwise_api.cc | 85 ++++ paddle/pten/tests/kernels/CMakeLists.txt | 1 + .../tests/kernels/test_elementwise_dev_api.cc | 84 ++++ 36 files changed, 1997 insertions(+), 768 deletions(-) create mode 100644 paddle/pten/kernels/functions/blas/CMakeLists.txt create mode 100644 paddle/pten/kernels/functions/blas/elementwise.h create mode 100644 paddle/pten/kernels/functions/cpu/CMakeLists.txt create mode 100644 paddle/pten/kernels/functions/cpu/elementwise.h create mode 100644 paddle/pten/kernels/functions/cuda/CMakeLists.txt create mode 100644 paddle/pten/kernels/functions/cuda/elementwise/elementwise.h create mode 100644 paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h create mode 100644 paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h create mode 100644 paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h create mode 100644 paddle/pten/kernels/functions/eigen/elementwise.h create mode 100644 paddle/pten/kernels/functions/general/CMakeLists.txt create mode 100644 paddle/pten/kernels/functions/general/elementwise_base.h create mode 100644 paddle/pten/kernels/functions/general/elementwise_functor.h create mode 100644 paddle/pten/tests/api/test_elementwise_api.cc create mode 100644 paddle/pten/tests/kernels/test_elementwise_dev_api.cc diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu 
b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 0b78aa4a01a..65bcb2239e4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" @@ -19,28 +20,17 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" +// only can include the headers in paddle/top/api dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/include/math.h" + namespace ops = paddle::operators; namespace plat = paddle::platform; namespace paddle { namespace operators { -template -class ElementwiseAddKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, AddFunctor()); - } -}; - template static __global__ void SimpleElemwiseAddGradCUDAKernel( const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 0ce4ca665dd..f0292ffe178 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -20,6 +20,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/framework/pten_utils.h" + +// only can include the headers in paddle/pten/include dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/include/math.h" + namespace paddle { namespace operators { @@ -55,12 +62,14 @@ class ElementwiseAddKernel : public framework::OpKernel { auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - if (x->dims() == y->dims()) { - SameDimsElemwiseAdd LaunchElementwiseCpuKernel; - LaunchElementwiseCpuKernel(ctx, x, y, z); - } else { - LaunchBroadcastElementwiseCpuKernel(ctx, x, y, z); - } + + auto &dev_ctx = ctx.device_context(); + int axis = ctx.Attr("axis"); + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); + pten::ElementwiseAdd(dev_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 651f0e3dc80..90ad8ce682b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -139,6 +139,17 @@ class ElementwiseOp : public framework::OperatorWithKernel { tensor.place(), tensor.layout()); } } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + if (Type() == "elementwise_add") { + if (ctx.InputVar("X")->IsType()) { + return framework::KernelSignature("elementwise_add", {"X", "Y"}, + {"axis"}, {"Out"}); + } + } + return framework::KernelSignature("None", {"X"}, {}, {"Out"}); + } }; class ElementwiseOpInferVarType diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 549a6be0b45..e743d43e47e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -162,190 +162,36 @@ struct DimensionsTransform { } }; -template -__device__ __forceinline__ void LoadData( - T *dst, const T *__restrict__ src, uint32_t block_offset, - const kps::details::BroadcastConfig &config, int numel, int num, - bool need_broadcast) { - // numel : whole num of output - // num: how many data will be deal with in this time - if (need_broadcast) { - kps::ReadDataBc(dst, src, block_offset, - config, numel); - } else { - kps::ReadData(dst, src + block_offset, num); - } -} - -template -__device__ void DealSegment( - const framework::Array &ins, OutT *out, - const framework::Array &use_broadcast, uint32_t numel, - const framework::Array, Arity> &configs, - int num, Functor func) { - InT args[Arity][VecSize]; - OutT result[VecSize]; - - int block_offset = blockIdx.x * blockDim.x * VecSize; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], ins[i], block_offset, - configs[i], numel, num, - use_broadcast[i]); - } - - const bool kCallElementwiseAny = - platform::FunctionTraits::has_pointer_args; - ElementwisePrimitiveCaller()(func, args, result); - kps::WriteData(out + block_offset, result, - num); -} - -template -__global__ void BroadcastKernel( - framework::Array ins, OutT *out, - framework::Array use_broadcast, uint32_t numel, - framework::Array, 
Arity> configs, - int main_tid, int tail_tid, Functor func) { - int block_offset = blockIdx.x * blockDim.x * VecSize; - // data offset of this block - if (blockIdx.x < main_tid) { - int num = blockDim.x * VecSize; // blockIdx.x < main_tid - DealSegment( - ins, out, use_broadcast, numel, configs, num, func); - } else { // reminder - int num = tail_tid; - DealSegment( - ins, out, use_broadcast, numel, configs, num, func); - } -} - -template -void LaunchKernel(const platform::CUDADeviceContext &ctx, - const std::vector &ins, - framework::Tensor *out, Functor func, - DimensionsTransform merge_dims) { - int numel = out->numel(); - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - - int main_tid = numel / (VecSize * threads); - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.stream(); - OutT *out_data = out->data(); - - framework::Array, Arity> configs; - framework::Array use_broadcast; - framework::Array ins_data; - - for (int i = 0; i < Arity; i++) { - use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = ins[i]->data(); - if (use_broadcast[i]) { - // get the broadcast config, - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); - } - } - - BroadcastKernel<<>>( - ins_data, out_data, use_broadcast, numel, configs, main_tid, tail_tid, - func); -} - -template -void LaunchBroadcastKernelForDifferentVecSize( - const platform::CUDADeviceContext &ctx, - const std::vector &ins, framework::Tensor *out, - int axis, Functor func) { - const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); - -#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ - case rank: { \ - LaunchKernel(ctx, ins, out, \ - func, merge_dims); \ - } break; - - switch (merge_dims.dim_size) { - CALL_BROADCAST_FOR_DIM_SIZE(1); - CALL_BROADCAST_FOR_DIM_SIZE(2); - CALL_BROADCAST_FOR_DIM_SIZE(3); - CALL_BROADCAST_FOR_DIM_SIZE(4); - CALL_BROADCAST_FOR_DIM_SIZE(5); - CALL_BROADCAST_FOR_DIM_SIZE(6); - CALL_BROADCAST_FOR_DIM_SIZE(7); - CALL_BROADCAST_FOR_DIM_SIZE(8); - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.\n", - merge_dims.dim_size, framework::DDim::kMaxRank)); - } - } -#undef CALL_BROADCAST_FOR_DIM_SIZE -} - template void LaunchBroadcastElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, int axis, Functor func) { - using Traits = platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), kArity, - platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), kArity)); - PADDLE_ENFORCE_EQ(kArity, 2, - platform::errors::InvalidArgument( - "Currently only broadcast of binary is supported and " - "verified, but received %d.", - kArity)); - - int in_vec_size = 4; - framework::Tensor *out = (*outs)[0]; - for (auto *in : ins) { - auto temp_size = platform::GetVectorizedSize(in->data()); - in_vec_size = in->dims() == out->dims() ? 
std::min(temp_size, in_vec_size) - : in_vec_size; + std::vector pt_inputs; + std::vector pt_outputs; + // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary + // DenseTensor obj + // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // can be deleted + // when DenseTensor support copy constructor. + std::vector> pt_inputs_tmp; + std::vector> pt_outputs_tmp; + for (auto in : ins) { + pt_inputs_tmp.emplace_back( + std::move(paddle::experimental::MakePtenDenseTensor(*in))); } - int out_vec_size = platform::GetVectorizedSize(out->data()); - int vec_size = std::min(out_vec_size, in_vec_size); - - switch (vec_size) { - case 4: { - LaunchBroadcastKernelForDifferentVecSize( - ctx, ins, out, axis, func); - break; - } - case 2: { - LaunchBroadcastKernelForDifferentVecSize( - ctx, ins, out, axis, func); - break; - } - case 1: { - LaunchBroadcastKernelForDifferentVecSize( - ctx, ins, out, axis, func); - break; - } - default: { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } + for (auto out : *outs) { + pt_outputs_tmp.emplace_back( + std::move(paddle::experimental::MakePtenDenseTensor(*out))); + } + for (int i = 0; i < pt_inputs_tmp.size(); i++) { + pt_inputs.push_back(pt_inputs_tmp[i].get()); } + for (int i = 0; i < pt_outputs_tmp.size(); i++) { + pt_outputs.push_back(pt_outputs_tmp[i].get()); + } + pten::LaunchBroadcastElementwiseCudaKernel( + ctx, pt_inputs, &pt_outputs, axis, func); } template @@ -353,24 +199,31 @@ void LaunchElementwiseCudaKernel( const platform::CUDADeviceContext &cuda_ctx, const std::vector &ins, std::vector *outs, int axis, Functor func) { - std::vector dims_size; - bool no_broadcast_flag = true; - for (auto *in : ins) { - no_broadcast_flag = ins[0]->dims() == in->dims(); - dims_size.emplace_back(in->dims().size()); + std::vector pt_inputs; + std::vector pt_outputs; + // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary + // DenseTensor obj + // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // can be deleted + // when DenseTensor support copy constructor. + std::vector> pt_inputs_tmp; + std::vector> pt_outputs_tmp; + for (auto in : ins) { + pt_inputs_tmp.emplace_back( + std::move(paddle::experimental::MakePtenDenseTensor(*in))); } - - if (no_broadcast_flag) { - LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, outs, - func); - } else { - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; - LaunchBroadcastElementwiseCudaKernel(cuda_ctx, ins, outs, - axis, func); + for (auto out : *outs) { + pt_outputs_tmp.emplace_back( + std::move(paddle::experimental::MakePtenDenseTensor(*out))); + } + for (int i = 0; i < pt_inputs_tmp.size(); i++) { + pt_inputs.push_back(pt_inputs_tmp[i].get()); + } + for (int i = 0; i < pt_outputs_tmp.size(); i++) { + pt_outputs.push_back(pt_outputs_tmp[i].get()); } + pten::LaunchElementwiseCudaKernel(cuda_ctx, pt_inputs, + &pt_outputs, axis, func); } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 2df7dd06f2c..d3ab8ad9d69 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -29,6 +29,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/transform.h" +// only can include the headers in paddle/pten/include dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/kernels/functions/cpu/elementwise.h" +#include "paddle/pten/kernels/functions/general/elementwise_base.h" + #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ #include @@ -131,71 +136,14 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, return axis; } -/* - * Out = X ⊙ Y - * If Y's shape does not match X' shape, they will be reshaped. - * For example: - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) - * - * New parameter: *is_run_common_broadcast* is a flag to record whether to run - * common broadcast code. - */ -inline void get_mid_dims(const framework::DDim &x_dims, - const framework::DDim &y_dims, const int axis, - int *pre, int *n, int *post, - int *is_run_common_broadcast) { - *pre = 1; - *n = 1; - *post = 1; - *is_run_common_broadcast = 0; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - if (x_dims[i + axis] != y_dims[i]) { - PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, true, - platform::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands " - "could not be broadcast together with the shape of " - "X = [%s] and the shape of Y = [%s]. Received [%d] " - "in X is not equal to [%d] in Y.", - x_dims, y_dims, x_dims[i + axis], y_dims[i])); - *is_run_common_broadcast = 1; - return; - } - (*n) *= y_dims[i]; - } - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} - inline int GetElementwiseIndex(const int *x_dims_array, const int max_dim, const int *index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; + return pten::GetElementwiseIndex(x_dims_array, max_dim, index_array); } inline void UpdateElementwiseIndexArray(const int *out_dims_array, const int max_dim, int *index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } + pten::UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array); } inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, @@ -203,48 +151,9 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, int *x_dims_array, int *y_dims_array, int *out_dims_array, const int max_dim, const int axis) { - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, axis)); - if (x_dims.size() > y_dims.size()) { - std::fill(y_dims_array, y_dims_array + axis, 1); - if (axis + y_dims.size() < max_dim) { - std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1); - } - std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array); - std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array + axis); - } else { - std::fill(x_dims_array, 
x_dims_array + axis, 1); - if (axis + x_dims.size() < max_dim) { - std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1); - } - std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis); - std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array); - } - - for (int i = 0; i < max_dim; i++) { - PADDLE_ENFORCE_EQ( - x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || - y_dims_array[i] <= 1, - true, platform::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands could " - "not be broadcast together with the shape of X = [%s] and " - "the shape of Y = [%s]. Received [%d] in X is not equal to " - "[%d] in Y at i:%d.", - x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); - if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || - (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { - out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]); - } else { - out_dims_array[i] = -1; - } - } + pten::general::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, + y_dims_array, out_dims_array, max_dim, + axis); } template @@ -255,29 +164,9 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x, const platform::CPUDeviceContext &ctx, Functor func, const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T *x_data = x->data(); - const T *y_data = y->data(); - PADDLE_ENFORCE_NOT_NULL(x_data, platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, platform::errors::InvalidArgument( - "The input Y should not be empty.")); - OutType *out_data = z->mutable_data(ctx.GetPlace()); - - const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, - 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } + pten::CommonForwardBroadcastCPU(x, y, z, x_dims_array, y_dims_array, + out_dims_array, max_dim, ctx, func, + is_xsize_larger); } template @@ -1184,172 +1073,9 @@ void CommonGradBroadcastCUDA( inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims.size(); - for (; actual_dims_size != 0; --actual_dims_size) { - if (dims[actual_dims_size - 1] != 1) break; - } - if (actual_dims_size == dims.size()) return dims; - std::vector trim_dims; - trim_dims.resize(actual_dims_size); - for (int i = 0; i < actual_dims_size; ++i) { - trim_dims[i] = dims[i]; - } - if (trim_dims.size() == 0) { - return framework::DDim(framework::make_dim()); - } - framework::DDim actual_dims = framework::make_ddim(trim_dims); - return actual_dims; + return pten::general::trim_trailing_singular_dims(dims); } -template -class RowwiseTransformIterator; - -template -class MidWiseTransformIterator; - -// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17 -template -class RowwiseTransformIterator - : public std::iterator { - public: - RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} - - RowwiseTransformIterator &operator++() { - ++i_; - if (UNLIKELY(i_ == n_)) { - i_ = 0; - } - return *this; 
- } - - RowwiseTransformIterator &operator+(int n) { - while (n-- > 0) { - ++i_; - if (UNLIKELY(i_ == n_)) { - i_ = 0; - } - } - - return *this; - } - - bool operator==(const RowwiseTransformIterator - &rhs) const { - return (ptr_ + i_) == &(*rhs); - } - - bool operator!=(const RowwiseTransformIterator - &rhs) const { - return (ptr_ + i_) != &(*rhs); - } - - const T &operator*() { return ptr_[i_]; } - - private: - const T *ptr_; - int i_; - int64_t n_; -}; - -template -class MidWiseTransformIterator - : public std::iterator { - public: - MidWiseTransformIterator(const T *ptr, int n, int post) - : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - - MidWiseTransformIterator &operator++() { - ++j_; - if (UNLIKELY(j_ == post_)) { - ++i_; - j_ = 0; - if (UNLIKELY(i_ == n_)) { - i_ = 0; - } - } - return *this; - } - - MidWiseTransformIterator &operator+(int n) { - while (n-- > 0) { - ++j_; - if (UNLIKELY(j_ == post_)) { - ++i_; - j_ = 0; - if (UNLIKELY(i_ == n_)) { - i_ = 0; - } - } - } - return *this; - } - - bool operator==(const MidWiseTransformIterator - &rhs) const { - return (ptr_ + i_) == &(*rhs); - } - - bool operator!=(const MidWiseTransformIterator - &rhs) const { - return (ptr_ + i_) != &(*rhs); - } - - const T &operator*() { return ptr_[i_]; } - - private: - const T *ptr_; - int64_t i_; - int64_t j_; - int64_t n_; - int64_t post_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) -template -class RowwiseTransformIterator - : public thrust::iterator_adaptor< - RowwiseTransformIterator, const T *> { - public: - typedef thrust::iterator_adaptor< - RowwiseTransformIterator, const T *> - super_t; - HOSTDEVICE RowwiseTransformIterator(const T *x, int n) - : super_t(x), begin_(x), n_(n) {} - friend class thrust::iterator_core_access; - - private: - unsigned int n_; - const T *begin_; - HOSTDEVICE typename super_t::reference dereference() const { - return *(begin_ + (this->base() - begin_) % n_); - } -}; - -template -class MidWiseTransformIterator - : public thrust::iterator_adaptor< - MidWiseTransformIterator, const T *> { - public: - typedef thrust::iterator_adaptor< - MidWiseTransformIterator, const T *> - super_t; - HOSTDEVICE MidWiseTransformIterator(const T *x, int n, int post) - : super_t(x), begin_(x), n_(n), post_(post) {} - friend class thrust::iterator_core_access; - - private: - unsigned int post_; - unsigned int n_; - const T *begin_; - HOSTDEVICE typename super_t::reference dereference() const { - return *(begin_ + (((this->base() - begin_) / post_) % n_)); - } -}; -#endif - template class TransformFunctor { @@ -1378,10 +1104,12 @@ class TransformFunctor { platform::Transform trans; if (is_xsize_larger_) { trans(ctx_, x_, x_ + nx_, - RowwiseTransformIterator(y_, n), z_, func_); + pten::general::RowwiseTransformIterator(y_, n), + z_, func_); } else { trans(ctx_, y_, y_ + nx_, - RowwiseTransformIterator(x_, n), z_, func_); + pten::general::RowwiseTransformIterator(x_, n), + z_, func_); } } @@ -1389,10 +1117,14 @@ class TransformFunctor { platform::Transform trans; if (is_xsize_larger_) { trans(ctx_, x_, x_ + nx_, - MidWiseTransformIterator(y_, n, post), z_, func_); + pten::general::MidWiseTransformIterator(y_, n, + post), + z_, func_); } else { trans(ctx_, y_, y_ + nx_, - MidWiseTransformIterator(x_, n, post), z_, func_); + pten::general::MidWiseTransformIterator(x_, n, + post), + z_, func_); } } @@ -1721,13 +1453,13 @@ void ElemwiseGradComputeWithBroadcast( if (is_xsize_larger) { auto y_dims_trimed = trim_trailing_singular_dims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? 
x_dims.size() : axis; - get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); + pten::general::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, + &post, &is_run_common_broadcast); } else { auto x_dims_trimed = trim_trailing_singular_dims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); + pten::general::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, + &post, &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1779,28 +1511,14 @@ void CommonElementwiseBroadcastForward( const framework::Tensor *y, framework::Tensor *z, const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, int axis, const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - - CommonForwardBroadcastCPU( - x, y, z, x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), - max_dim, ctx.template device_context(), func, - is_xsize_larger); + z->mutable_data(ctx.GetPlace()); + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); + const auto &dev_ctx = ctx.template device_context(); + pten::CommonElementwiseBroadcastForward(dev_ctx, *pt_x.get(), *pt_y.get(), + pt_z.get(), x_dims, y_dims, func, + axis, is_xsize_larger); } template @@ -1859,6 +1577,10 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { + z->mutable_data(ctx.GetPlace()); + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); if (platform::is_gpu_place(ctx.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) std::vector ins = {x, y}; @@ -1873,61 +1595,10 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, return; } - auto x_dims = x->dims(); - auto y_dims = y->dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - TransformFunctor functor( - x, y, z, ctx.template device_context(), func, - is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } - // special case for common implementation. - // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } + const auto &dev_ctx = + ctx.template device_context(); + pten::ElementwiseCompute( + dev_ctx, *pt_x.get(), *pt_y.get(), axis, func, pt_z.get()); } // FusedElemwiseAndAct @@ -2185,7 +1856,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &is_run_common_broadcast); + pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -2732,7 +2404,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &is_run_common_broadcast); + pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index e4074cc7d7d..531a6ad224c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -14,11 +14,17 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" +// only can include the headers in paddle/top/api dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise.h" + #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 #else @@ -30,187 +36,38 @@ namespace operators { namespace kps = paddle::operators::kernel_primitives; -enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; - -/* -* According to NVIDIA, if number of threads per block is 64/128/256/512, -* cuda performs better. And number of blocks should be greater (at least -* 2x~4x) than number of SMs. 
Hence, SM count is took into account within -* this function to determine the right number of threads per block. -*/ -inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx, - int64_t numel, int vec_size) { - int threads = ELEMENTWISE_BLOCK_SIZE; - int sm_count = ctx.GetSMCount(); - int active_threads_num = numel / vec_size; - if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about twice of SM, to acquire better performance. - threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 1)); - } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about 4 times of SM, to acquire better performance. - threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 2)); - } - // Number of threads per block shall be larger than 64. - return std::max(64, threads); -} - -template -int GetVectorizedSizeForTensors( - const std::vector &ins, - const std::vector &outs) { - int vec_size = 4; - for (auto iter = ins.begin(); iter != ins.end(); ++iter) { - vec_size = std::min(vec_size, - platform::GetVectorizedSize((*iter)->data())); - } - for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = std::min( - vec_size, platform::GetVectorizedSize((*iter)->data())); - } - return vec_size; -} - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result); -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseAny(result, args, - func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseUnary(result, args[0], - func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseBinary(result, args[0], - args[1], func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseTernary( - result, args[0], args[1], args[2], func); - } -}; - -template -__device__ void DealSegment( - const framework::Array &in, OutT *out, - int num, Functor func) { - InT args[Arity][VecSize]; - OutT result[VecSize]; - - int data_offset = VecSize * blockIdx.x * blockDim.x; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - kps::ReadData(args[i], in[i] + data_offset, - num); - } - - const bool kCallElementwiseAny = - platform::FunctionTraits::has_pointer_args; - ElementwisePrimitiveCaller()(func, args, result); - kps::WriteData(out + data_offset, result, - num); -} - -template -__global__ void ElementVectorizeKernel( - framework::Array ins, OutT *out, int size, - Functor func) { - int data_offset = VecSize * blockIdx.x * blockDim.x; - int num = size - data_offset; - // the num this time have to deal with - if (VecSize * blockDim.x > num) { // reminder segment - DealSegment(ins, out, num, func); - } else { // complete segment - DealSegment(ins, out, num, func); - } -} - -template -void ElementwiseCudaKernel(const platform::CUDADeviceContext &ctx, - const std::vector &ins, - std::vector *outs, - Functor 
func) { - auto numel = ins[0]->numel(); - int block_size = GetThreadsConfig(ctx, numel, VecSize); - int grid_size = - ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; - - auto stream = ctx.stream(); - OutT *out_data = (*outs)[0]->data(); - framework::Array ins_data; - for (int i = 0; i < Arity; i++) { - ins_data[i] = ins[i]->data(); - } - ElementVectorizeKernel<<>>( - ins_data, out_data, numel, func); -} +using ElementwiseType = pten::ElementwiseType; template void LaunchSameDimsElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, Functor func) { - using Traits = platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), kArity, - platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), kArity)); - - // calculate the max vec_size for all ins and outs - int vec_size = GetVectorizedSizeForTensors(ins, *outs); - switch (vec_size) { - case 4: - ElementwiseCudaKernel(ctx, ins, outs, - func); - break; - case 2: - ElementwiseCudaKernel(ctx, ins, outs, - func); - break; - case 1: - ElementwiseCudaKernel(ctx, ins, outs, - func); - break; - default: { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } + std::vector pt_inputs; + std::vector pt_outputs; + // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary + // DenseTensor obj + // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // can be deleted + // when DenseTensor support copy constructor. + std::vector> pt_inputs_tmp; + std::vector> pt_outputs_tmp; + for (auto in : ins) { + pt_inputs_tmp.emplace_back( + std::move(paddle::experimental::MakePtenDenseTensor(*in))); + } + for (auto out : *outs) { + pt_outputs_tmp.emplace_back( + std::move(paddle::experimental::MakePtenDenseTensor(*out))); + } + for (int i = 0; i < pt_inputs_tmp.size(); i++) { + pt_inputs.push_back(pt_inputs_tmp[i].get()); + } + for (int i = 0; i < pt_outputs_tmp.size(); i++) { + pt_outputs.push_back(pt_outputs_tmp[i].get()); } + pten::LaunchSameDimsElementwiseCudaKernel(ctx, pt_inputs, + &pt_outputs, func); } } // namespace operators diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h index fcfcdc28b1f..3fce3b1c092 100644 --- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -14,6 +14,8 @@ #pragma once +#include "paddle/fluid/platform/eigen_ext.h" + namespace paddle { namespace operators { namespace kernel_primitives { diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h index 9fea515646d..01d5850a2b2 100644 --- a/paddle/pten/api/include/math.h +++ b/paddle/pten/api/include/math.h @@ -23,5 +23,7 @@ namespace experimental { // TODO(chenweihang): move mean API into stat.h/cc Tensor mean(const Tensor& x); +Tensor add(const Tensor& x, const Tensor& y); + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc index 8102bbaaa58..13c74394eb2 100644 --- a/paddle/pten/api/lib/math.cc +++ b/paddle/pten/api/lib/math.cc @@ -60,5 +60,40 @@ Tensor mean(const Tensor& x) { return out; } +Tensor add(const Tensor& x, const Tensor& y) { + // 1. 
Get kernel signature and kernel + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + "elementwise_add", kernel_key); + + // 2. Get Device Context + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + auto kernel_context = pten::KernelContext(dev_ctx); + + // 3. Auto data transform + auto dense_x = std::dynamic_pointer_cast(x.impl()); + kernel_context.EmplaceBackInput(dense_x); + auto dense_y = std::dynamic_pointer_cast(y.impl()); + kernel_context.EmplaceBackInput(dense_y); + kernel_context.EmplaceBackAttr(-1); + + // 4. InferShape + auto out_meta = ElementwiseInferShape(dense_x->meta(), dense_y->meta(), -1); + + // 5. Prepare outputs + Tensor out; + const auto allocator = std::make_shared( + pten::TransToFluidPlace(kernel_key.backend())); + auto dense_out = std::make_shared(allocator, out_meta); + kernel_context.EmplaceBackOutput(dense_out); + out.set_impl(dense_out); + + // 6. Call kernel + kernel(&kernel_context); + + return out; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 647ddea0b4e..471b24f09fd 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -76,7 +76,7 @@ template T* DenseTensor::mutable_data() { PADDLE_ENFORCE( (data_type() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::PreconditionNotMet( + paddle::platform::errors::InvalidArgument( "The type of data (%d) we are trying to retrieve does not match the " "type of data currently contained in the container (%d).", static_cast(paddle::experimental::CppTypeToDataType::Type()), @@ -88,7 +88,7 @@ template const T* DenseTensor::data() const { PADDLE_ENFORCE( (data_type() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::PreconditionNotMet( + paddle::platform::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index 0a61a94aa8d..ef7f038f10c 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -73,4 +73,18 @@ DenseTensor Scale(const ContextT& dev_ctx, ScaleHost(dev_ctx, x, scale, bias, bias_after_scale, &dense_out); return dense_out; } + +template +DenseTensor ElementwiseAdd(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferShape(x.meta(), y.meta(), axis); + const auto allocator = + std::make_shared( + dev_ctx.GetPlace()); + pten::DenseTensor dense_out(allocator, out_meta); + ElementwiseAdd(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/infershape/binary.cc b/paddle/pten/infershape/binary.cc index c17e0871581..671bac3165a 100644 --- a/paddle/pten/infershape/binary.cc +++ b/paddle/pten/infershape/binary.cc @@ -14,7 +14,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/pten/infershape/binary.h" - +#include "paddle/pten/kernels/functions/general/elementwise_base.h" namespace pten { DenseTensorMeta DotInferShape(const DenseTensorMeta& x_meta, @@ -129,4 +129,49 @@ DenseTensorMeta MatmulInferShape(const DenseTensorMeta& x_meta, return {x_meta.type, ddim_out, x_meta.layout}; } +DenseTensorMeta ElementwiseInferShape(const DenseTensorMeta& x_meta, + const DenseTensorMeta& y_meta, + int axis) { + DenseTensorMeta return_meta(x_meta.type, x_meta.dims, x_meta.layout); + if (x_meta.dims != y_meta.dims) { + auto x_dims = x_meta.dims; + auto y_dims = y_meta.dims; + int max_dim = std::max(x_dims.size(), y_dims.size()); + if (x_dims.size() == y_dims.size()) { + PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), + true, + paddle::platform::errors::InvalidArgument( + "axis should be -1 or 0 while the dimension of " + "tensor X (%s) is equal to the dimension of " + "tensor Y (%s), but received axis: %s", + x_dims.size(), + y_dims.size(), + axis)); + } + PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), + true, + paddle::platform::errors::InvalidArgument( + "The axis range must be [%s, %s), but axis is %s. " + "Please set the axis again.", + -1 * max_dim, + max_dim, + axis)); + axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + general::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + return_meta.dims = paddle::framework::make_ddim(out_dims_array); + } + return_meta.lod = x_meta.lod; + return return_meta; +} + } // namespace pten diff --git a/paddle/pten/infershape/binary.h b/paddle/pten/infershape/binary.h index f58e5503f22..35749ab61b3 100644 --- a/paddle/pten/infershape/binary.h +++ b/paddle/pten/infershape/binary.h @@ -41,4 +41,7 @@ DenseTensorMeta MatmulInferShape(const DenseTensorMeta& x_meta, bool trans_x, bool trans_y); +DenseTensorMeta ElementwiseInferShape(const DenseTensorMeta& x_meta, + const DenseTensorMeta& y_meta, + int axis); } // namespace pten diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt index 2c4a424e484..66c29dafbe2 100644 --- a/paddle/pten/kernels/cpu/CMakeLists.txt +++ b/paddle/pten/kernels/cpu/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function) +cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas) cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory) cc_library(creation_cpu SRCS creation.cc DEPS dense_tensor kernel_context kernel_factory eigen_function) cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils) diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index 0682479993f..25c4671baad 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -14,13 +14,16 @@ #include "paddle/pten/kernels/cpu/math.h" +#include "paddle/pten/kernels/functions/cpu/elementwise.h" #include "paddle/pten/kernels/functions/eigen/mean.h" #include "paddle/pten/kernels/functions/eigen/scale.h" #include "paddle/pten/kernels/functions/eigen/sign.h" +#include "paddle/pten/kernels/functions/general/elementwise_functor.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" namespace pten { @@ -61,11 +64,35 @@ void ScaleHost(const CPUContext& dev_ctx, out); } +template +void ElementwiseAdd(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + if (x.dims() == y.dims()) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, general::AddFunctor(), out); + } else { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, general::InverseAddFunctor(), out); + } + } +} } // namespace pten // TODO(chenweihang): replace by better impl PT_REGISTER_MODULE(MathCPU); +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; @@ -97,3 +124,13 @@ PT_REGISTER_KERNEL("scale.host", int64_t) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); } +PT_REGISTER_KERNEL("elementwise_add", + CPU, + ANY, + pten::ElementwiseAdd, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h index 3013ad9d04d..6d31f48e9d6 100644 --- a/paddle/pten/kernels/cpu/math.h +++ b/paddle/pten/kernels/cpu/math.h @@ -46,4 +46,11 @@ void ScaleHost(const CPUContext& dev_ctx, bool bias_after_scale, DenseTensor* out); +template +void ElementwiseAdd(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + } // namespace pten diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index cc1a7bef4f1..73a743d58e6 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -14,9 +14,11 @@ limitations under the License. */ #include "paddle/pten/kernels/cuda/math.h" +#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise.h" #include "paddle/pten/kernels/functions/eigen/mean.h" #include "paddle/pten/kernels/functions/eigen/scale.h" #include "paddle/pten/kernels/functions/eigen/sign.h" +#include "paddle/pten/kernels/functions/general/elementwise_functor.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -26,6 +28,7 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" @@ -121,12 +124,30 @@ void ScaleHost(const CUDAContext& dev_ctx, out); } +template +void ElementwiseAdd(const CUDAContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + std::vector inputs; + std::vector outputs; + inputs.emplace_back(&x); + inputs.emplace_back(&y); + outputs.emplace_back(out); + LaunchElementwiseCudaKernel( + dev_ctx, inputs, &outputs, axis, general::AddFunctor()); +} + } // namespace pten // TODO(chenweihang): replace by better impl PT_REGISTER_MODULE(MathCUDA); using float16 = paddle::platform::float16; +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {} PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, float16) {} PT_REGISTER_KERNEL("scale", @@ -155,3 +176,14 @@ PT_REGISTER_KERNEL("scale.host", int64_t) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); } +PT_REGISTER_KERNEL("elementwise_add", + CUDA, + ANY, + pten::ElementwiseAdd, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h index 65f4f412658..0227daae669 100644 --- a/paddle/pten/kernels/cuda/math.h +++ b/paddle/pten/kernels/cuda/math.h @@ -48,6 +48,13 @@ void ScaleHost(const CUDAContext& dev_ctx, bool bias_after_scale, DenseTensor* out); +template +void ElementwiseAdd(const CUDAContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + } // namespace pten #endif diff --git a/paddle/pten/kernels/functions/CMakeLists.txt b/paddle/pten/kernels/functions/CMakeLists.txt index a3b2bf314b4..a5b4bb3e8bd 100644 --- a/paddle/pten/kernels/functions/CMakeLists.txt +++ b/paddle/pten/kernels/functions/CMakeLists.txt @@ -1 +1,3 @@ add_subdirectory(eigen) +add_subdirectory(blas) +add_subdirectory(general) diff --git a/paddle/pten/kernels/functions/blas/CMakeLists.txt b/paddle/pten/kernels/functions/blas/CMakeLists.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/paddle/pten/kernels/functions/blas/elementwise.h b/paddle/pten/kernels/functions/blas/elementwise.h new file mode 100644 index 00000000000..a0f80a8fa6a --- /dev/null +++ b/paddle/pten/kernels/functions/blas/elementwise.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { +namespace blas { + +template +void ElementwiseAdd(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VADD(x.numel(), x.data(), y.data(), out->mutable_data()); +} + +} // namespace blas +} // namespace pten diff --git a/paddle/pten/kernels/functions/cpu/CMakeLists.txt b/paddle/pten/kernels/functions/cpu/CMakeLists.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/paddle/pten/kernels/functions/cpu/elementwise.h b/paddle/pten/kernels/functions/cpu/elementwise.h new file mode 100644 index 00000000000..b565b8403b9 --- /dev/null +++ b/paddle/pten/kernels/functions/cpu/elementwise.h @@ -0,0 +1,230 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/functions/general/elementwise_base.h" + +namespace pten { + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, + int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +inline int GetElementwiseIndex(const int *x_dims_array, + const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +template +void CommonForwardBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const paddle::platform::CPUDeviceContext &ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL(x_data, + paddle::platform::errors::InvalidArgument( + "The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL(y_data, + paddle::platform::errors::InvalidArgument( + "The input Y should not be empty.")); + OutType *out_data = z->mutable_data(); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +void CommonElementwiseBroadcastForward( + const paddle::platform::CPUDeviceContext 
&dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + const DDim &x_dims, + const DDim &y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + general::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common implementation to compute binary calculation with the support +// of broadcast, supporting both CPU and GPU. +// - CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like paddle/fluid/operators/elementwise/elementwise_add_op.h#L49 - L55. +// - GPU implementation supports all the broadcast cases, thus there is no need +// to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. + +template +void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + general:: + TransformFunctor + functor(x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = general::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + general::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = general::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + general::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. 
+ // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +template +struct SameDimsElementwiseCompute { + void operator()(const paddle::platform::CPUDeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z) { + Functor()(dev_ctx, x, y, z); + } +}; + +} // namespace pten diff --git a/paddle/pten/kernels/functions/cuda/CMakeLists.txt b/paddle/pten/kernels/functions/cuda/CMakeLists.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise.h b/paddle/pten/kernels/functions/cuda/elementwise/elementwise.h new file mode 100644 index 00000000000..b769936089b --- /dev/null +++ b/paddle/pten/kernels/functions/cuda/elementwise/elementwise.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h" +#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h" + +namespace pten { + +template +void LaunchElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &cuda_ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + std::vector dims_size; + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag = ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); + } + if (no_broadcast_flag) { + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, outs, func); + } else { + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + LaunchBroadcastElementwiseCudaKernel( + cuda_ctx, ins, outs, axis, func); + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h b/paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h new file mode 100644 index 00000000000..2e5ea5fa481 --- /dev/null +++ b/paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h @@ -0,0 +1,412 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h" + +namespace pten { + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)( + bool &, std::vector &, DimVector &, int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + axis + 1, + out_dims[axis], + in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + in_idx + 1, + out_dims[in_idx], + in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, + vec->begin() + m_idx, + 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; + + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform(const std::vector &ins, + const paddle::framework::DDim &dims, + int axis) { + const int N = ins.size(); + dim_size = dims.size(); + out_dims = paddle::framework::vectorize(dims); + in_dims.resize(N); + for (int j = 0; j < N; ++j) { + in_dims[j] = paddle::framework::vectorize(ins[j]->dims()); + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + for (int j = 1; j < num; ++j) { + equal = (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal = in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. 
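+    // For instance (illustrative only): with out_dims = [2, 3, 4, 5] and
+    // input dims [2, 3, 4, 5] / [2, 3, 1, 1], the leading run of equal
+    // dimensions (2, 3) can be folded, so the broadcast effectively works
+    // on [6, 4, 5] versus [6, 1, 1].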
+ MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate( + in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate( + in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. + merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +template +__device__ __forceinline__ void LoadData( + T *dst, + const T *__restrict__ src, + uint32_t block_offset, + const kps::details::BroadcastConfig &config, + int numel, + int num, + bool need_broadcast) { + // numel : whole num of output + // num: how many data will be deal with in this time + if (need_broadcast) { + kps::ReadDataBc( + dst, src, block_offset, config, numel); + } else { + kps::ReadData(dst, src + block_offset, num); + } +} + +template +__device__ void DealSegment( + const paddle::framework::Array &ins, + OutT *out, + const paddle::framework::Array &use_broadcast, + uint32_t numel, + const paddle::framework::Array, Arity> + &configs, + int num, + Functor func) { + InT args[Arity][VecSize]; + OutT result[VecSize]; + + int block_offset = blockIdx.x * blockDim.x * VecSize; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + LoadData(args[i], + ins[i], + block_offset, + configs[i], + numel, + num, + use_broadcast[i]); + } + + const bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller()(func, args, result); + kps::WriteData( + out + block_offset, result, num); +} + +template +__global__ void BroadcastKernel( + paddle::framework::Array ins, + OutT *out, + paddle::framework::Array use_broadcast, + uint32_t numel, + paddle::framework::Array, Arity> + configs, + int main_tid, + int tail_tid, + Functor func) { + int block_offset = blockIdx.x * blockDim.x * VecSize; + // data offset of this block + if (blockIdx.x < main_tid) { + int num = blockDim.x * VecSize; // blockIdx.x < main_tid + pten::DealSegment( + ins, out, use_broadcast, numel, configs, num, func); + } else { // reminder + int num = tail_tid; + pten::DealSegment( + ins, out, use_broadcast, numel, configs, num, func); + } +} + +template +void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + DenseTensor *out, + Functor func, + DimensionsTransform merge_dims) { + int numel = out->numel(); + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + + int main_tid = numel / (VecSize * threads); + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.stream(); + OutT *out_data = out->mutable_data(); + + paddle::framework::Array, Arity> configs; + paddle::framework::Array use_broadcast; + paddle::framework::Array ins_data; + + for (int i = 0; i < Arity; i++) { + use_broadcast[i] = (ins[i]->numel() != numel); + ins_data[i] = ins[i]->data(); + if (use_broadcast[i]) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + configs[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + } + } + + BroadcastKernel<<>>(ins_data, + out_data, + use_broadcast, + numel, + configs, + main_tid, + tail_tid, + func); +} + +template +void LaunchBroadcastKernelForDifferentVecSize( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + DenseTensor *out, + int axis, + Functor func) { + const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); + +#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ + case rank: { \ + LaunchKernel( \ + ctx, ins, out, func, merge_dims); \ + } break; + + switch (merge_dims.dim_size) { + CALL_BROADCAST_FOR_DIM_SIZE(1); + CALL_BROADCAST_FOR_DIM_SIZE(2); + CALL_BROADCAST_FOR_DIM_SIZE(3); + CALL_BROADCAST_FOR_DIM_SIZE(4); + CALL_BROADCAST_FOR_DIM_SIZE(5); + CALL_BROADCAST_FOR_DIM_SIZE(6); + CALL_BROADCAST_FOR_DIM_SIZE(7); + CALL_BROADCAST_FOR_DIM_SIZE(8); + default: { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.\n", + merge_dims.dim_size, + paddle::framework::DDim::kMaxRank)); + } + } +#undef CALL_BROADCAST_FOR_DIM_SIZE +} + +template +void LaunchBroadcastElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + paddle::platform::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_EQ(kArity, + 2, + paddle::platform::errors::InvalidArgument( + "Currently only broadcast of binary is supported and " + "verified, but received %d.", + kArity)); + + int in_vec_size = 4; + DenseTensor *out = (*outs)[0]; + for (auto *in : ins) { + auto temp_size = paddle::platform::GetVectorizedSize(in->data()); + in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size) + : in_vec_size; + } + int out_vec_size = + paddle::platform::GetVectorizedSize(out->data()); + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + LaunchBroadcastKernelForDifferentVecSize( + ctx, ins, out, axis, func); + break; + } + case 2: { + LaunchBroadcastKernelForDifferentVecSize( + ctx, ins, out, axis, func); + break; + } + case 1: { + LaunchBroadcastKernelForDifferentVecSize( + ctx, ins, out, axis, func); + break; + } + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h new file mode 100644 index 00000000000..1c0dd56254c --- /dev/null +++ b/paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/functions/general/elementwise_base.h" + +namespace pten { +namespace kps = paddle::operators::kernel_primitives; +enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result); +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseAny( + result, args, func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseUnary( + result, args[0], func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseBinary( + result, args[0], args[1], func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseTernary( + result, args[0], args[1], args[2], func); + } +}; + +} // namespace pten diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h b/paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h new file mode 100644 index 00000000000..10142ba0a37 --- /dev/null +++ b/paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h" + +#ifdef __HIPCC__ +#define ELEMENTWISE_BLOCK_SIZE 256 +#else +#define ELEMENTWISE_BLOCK_SIZE 512 +#endif + +namespace pten { + +/* +* According to NVIDIA, if number of threads per block is 64/128/256/512, +* cuda performs better. And number of blocks should be greater (at least +* 2x~4x) than number of SMs. Hence, SM count is took into account within +* this function to determine the right number of threads per block. 
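+* For example (illustrative numbers only): with numel = 65536, vec_size = 4
+* and a device with 80 SMs, active_threads_num is 16384; 16384 / (80 * 2) is
+* below the block size, so the thread count would be rounded up to a power of
+* two, roughly 128 threads per block.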
+*/ +inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, + int64_t numel, + int vec_size) { + int threads = ELEMENTWISE_BLOCK_SIZE; + int sm_count = ctx.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / + (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. + threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / + (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + return std::max(64, threads); +} + +template +__device__ void DealSegment( + const paddle::framework::Array &in, + OutT *out, + int num, + Functor func) { + InT args[Arity][VecSize]; + OutT result[VecSize]; + + int data_offset = VecSize * blockIdx.x * blockDim.x; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + kps::ReadData( + args[i], in[i] + data_offset, num); + } + + const bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller()(func, args, result); + kps::WriteData( + out + data_offset, result, num); +} + +template +__global__ void ElementVectorizeKernel( + paddle::framework::Array ins, + OutT *out, + int size, + Functor func) { + int data_offset = VecSize * blockIdx.x * blockDim.x; + int num = size - data_offset; + // the num this time have to deal with + if (VecSize * blockDim.x > num) { // reminder segment + DealSegment(ins, out, num, func); + } else { // complete segment + DealSegment(ins, out, num, func); + } +} + +template +int GetVectorizedSizeForTensors(const std::vector &ins, + const std::vector &outs) { + int vec_size = 4; + for (auto iter = ins.begin(); iter != ins.end(); ++iter) { + vec_size = std::min( + vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + } + for (auto iter = outs.begin(); iter != outs.end(); ++iter) { + vec_size = std::min( + vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + } + return vec_size; +} + +template +void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + auto numel = ins[0]->numel(); + int block_size = GetThreadsConfig(ctx, numel, VecSize); + int grid_size = + ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; + auto stream = ctx.stream(); + OutT *out_data = (*outs)[0]->mutable_data(); + paddle::framework::Array ins_data; + for (int i = 0; i < Arity; i++) { + ins_data[i] = ins[i]->data(); + } + ElementVectorizeKernel<<>>( + ins_data, out_data, numel, func); +} + +template +void LaunchSameDimsElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + paddle::platform::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. 
But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + // calculate the max vec_size for all ins and outs + int vec_size = GetVectorizedSizeForTensors(ins, *outs); + switch (vec_size) { + case 4: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + case 2: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + case 1: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/functions/eigen/elementwise.h b/paddle/pten/kernels/functions/eigen/elementwise.h new file mode 100644 index 00000000000..21a20562257 --- /dev/null +++ b/paddle/pten/kernels/functions/eigen/elementwise.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/functions/eigen/common.h" + +namespace pten { +namespace eigen { + +template +void ElementwiseAdd(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x + eigen_y; +} + +} // namespace eigen +} // namespace pten diff --git a/paddle/pten/kernels/functions/general/CMakeLists.txt b/paddle/pten/kernels/functions/general/CMakeLists.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/paddle/pten/kernels/functions/general/elementwise_base.h b/paddle/pten/kernels/functions/general/elementwise_base.h new file mode 100644 index 00000000000..8c8a0342f6a --- /dev/null +++ b/paddle/pten/kernels/functions/general/elementwise_base.h @@ -0,0 +1,384 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/platform/transform.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { +namespace general { + +using DDim = paddle::framework::DDim; +using CPUContext = paddle::platform::CPUDeviceContext; + +template +class RowwiseTransformIterator; + +template +class MidWiseTransformIterator; + +// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17 +template +class RowwiseTransformIterator + : public std::iterator { + public: + RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator &operator++() { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + RowwiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + + return *this; + } + + bool operator==(const RowwiseTransformIterator &rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const RowwiseTransformIterator &rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T &operator*() { return ptr_[i_]; } + + private: + const T *ptr_; + int i_; + int64_t n_; +}; + +template +class MidWiseTransformIterator + : public std::iterator { + public: + MidWiseTransformIterator(const T *ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator &operator++() { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + return *this; + } + + MidWiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + } + return *this; + } + + bool operator==(const MidWiseTransformIterator &rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const MidWiseTransformIterator &rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T &operator*() { return ptr_[i_]; } + + private: + const T *ptr_; + int64_t i_; + int64_t j_; + int64_t n_; + int64_t post_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +using CUDAContext = paddle::platform::CUDADeviceContext; +template +class RowwiseTransformIterator + : public thrust::iterator_adaptor, + const T *> { + public: + typedef thrust::iterator_adaptor, + const T *> + super_t; + HOSTDEVICE RowwiseTransformIterator(const T *x, int n) + : super_t(x), begin_(x), n_(n) {} + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T *begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +class MidWiseTransformIterator + : public thrust::iterator_adaptor, + const T *> { + public: + typedef thrust::iterator_adaptor, + const T *> + super_t; + HOSTDEVICE MidWiseTransformIterator(const T *x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post) {} + friend class thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T *begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif + +template +class TransformFunctor { + public: + TransformFunctor(const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + const DeviceContext &ctx, + Functor func, + const bool is_xsize_larger = true) + : x_(x.data()), + y_(y.data()), + z_(z->mutable_data()), + nx_(x.numel()), + ctx_(ctx), + func_(func), + is_xsize_larger_(is_xsize_larger) { + if (is_xsize_larger_ == false) { + nx_ = 
y.numel(); + } + } + + inline void Run() const { + paddle::platform::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + paddle::platform::Transform trans; + if (is_xsize_larger_) { + trans(ctx_, + x_, + x_ + nx_, + RowwiseTransformIterator(y_, n), + z_, + func_); + } else { + trans(ctx_, + y_, + y_ + nx_, + RowwiseTransformIterator(x_, n), + z_, + func_); + } + } + + inline void RunMidWise(int n, int pre, int post) const { + paddle::platform::Transform trans; + if (is_xsize_larger_) { + trans(ctx_, + x_, + x_ + nx_, + MidWiseTransformIterator(y_, n, post), + z_, + func_); + } else { + trans(ctx_, + y_, + y_ + nx_, + MidWiseTransformIterator(x_, n, post), + z_, + func_); + } + } + + private: + const T *x_; + const T *y_; + OutType *z_; + int64_t nx_; + const DeviceContext &ctx_; + Functor func_; + bool is_xsize_larger_; +}; + +inline DDim trim_trailing_singular_dims(const DDim &dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + if (actual_dims_size == dims.size()) return dims; + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(paddle::framework::make_dim()); + } + DDim actual_dims = paddle::framework::make_ddim(trim_dims); + return actual_dims; +} + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *is_run_common_broadcast* is a flag to record whether to run + * common broadcast code. + */ +inline void get_mid_dims(const DDim &x_dims, + const DDim &y_dims, + const int axis, + int *pre, + int *n, + int *post, + int *is_run_common_broadcast) { + *pre = 1; + *n = 1; + *post = 1; + *is_run_common_broadcast = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, + true, + paddle::platform::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands " + "could not be broadcast together with the shape of " + "X = [%s] and the shape of Y = [%s]. 
Received [%d] " + "in X is not equal to [%d] in Y.", + x_dims, + y_dims, + x_dims[i + axis], + y_dims[i])); + *is_run_common_broadcast = 1; + return; + } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } +} + +inline void GetBroadcastDimsArrays(const DDim &x_dims, + const DDim &y_dims, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + const int max_dim, + const int axis) { + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + if (x_dims.size() > y_dims.size()) { + std::fill(y_dims_array, y_dims_array + axis, 1); + if (axis + y_dims.size() < max_dim) { + std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1); + } + std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array); + std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array + axis); + } else { + std::fill(x_dims_array, x_dims_array + axis, 1); + if (axis + x_dims.size() < max_dim) { + std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1); + } + std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis); + std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array); + } + + for (int i = 0; i < max_dim; i++) { + PADDLE_ENFORCE_EQ( + x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || + y_dims_array[i] <= 1, + true, + paddle::platform::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands could " + "not be broadcast together with the shape of X = [%s] and " + "the shape of Y = [%s]. Received [%d] in X is not equal to " + "[%d] in Y at i:%d.", + x_dims, + y_dims, + x_dims_array[i], + y_dims_array[i], + i)); + if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || + (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { + out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]); + } else { + out_dims_array[i] = -1; + } + } +} + +} // namespace general +} // namespace pten diff --git a/paddle/pten/kernels/functions/general/elementwise_functor.h b/paddle/pten/kernels/functions/general/elementwise_functor.h new file mode 100644 index 00000000000..ab9960d1013 --- /dev/null +++ b/paddle/pten/kernels/functions/general/elementwise_functor.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/functions/blas/elementwise.h" +#include "paddle/pten/kernels/functions/eigen/elementwise.h" + +namespace pten { +namespace general { + +// Define the binary functors used in elementwise ops. 
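+// The SameDimsAddFunctor below is specialized on the element type: one
+// specialization is intended for floating-point element types and dispatches
+// to the blas path (VADD), while the other covers the remaining types through
+// the Eigen expression (the enable_if conditions are expected to follow the
+// usual std::is_floating_point split).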
+ +// Add +template +struct SameDimsAddFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + blas::ElementwiseAdd(dev_ctx, x, y, z); + } +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + eigen::ElementwiseAdd(dev_ctx, x, y, z); + } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } +}; +template +struct InverseAddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } +}; + +} // namespace general +} // namespace pten diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index 2c6bd9c45d1..00a9496d84b 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -5,3 +5,4 @@ cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_api pten_api_utils) cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_api pten_api_utils) cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils) cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils) +cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc new file mode 100644 index 00000000000..79f69343149 --- /dev/null +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/api/include/math.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +PT_DECLARE_MODULE(MathCPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_MODULE(MathCUDA); +#endif + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +// TODO(chenweihang): Remove this test after the API is used in the dygraph +TEST(API, add) { + // 1. 
create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + auto dense_x = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x->mutable_data(); + + auto dense_y = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({10}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y->mutable_data(); + + float sum[3][10] = {0.0}; + for (size_t i = 0; i < 3; ++i) { + for (size_t j = 0; j < 10; ++j) { + dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0; + sum[i][j] = (i * 10 + j) * 1.0 + j * 2.0; + } + } + for (size_t i = 0; i < 10; ++i) { + dense_y_data[i] = i * 2.0; + } + paddle::experimental::Tensor x(dense_x); + paddle::experimental::Tensor y(dense_y); + + // 2. test API + auto out = paddle::experimental::add(x, y); + + // 3. check result + ASSERT_EQ(out.shape().size(), 2); + ASSERT_EQ(out.shape()[0], 3); + ASSERT_EQ(out.numel(), 30); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::FLOAT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); + + auto expect_result = sum; + auto dense_out = std::dynamic_pointer_cast(out.impl()); + auto actual_result0 = dense_out->data()[0]; + auto actual_result1 = dense_out->data()[1]; + auto actual_result2 = dense_out->data()[10]; + ASSERT_NEAR(expect_result[0][0], actual_result0, 1e-6f); + ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f); + ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f); +} diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt index b0dc29de521..aa9050e3b4e 100644 --- a/paddle/pten/tests/kernels/CMakeLists.txt +++ b/paddle/pten/tests/kernels/CMakeLists.txt @@ -4,3 +4,4 @@ cc_test(test_fill_dev_api SRCS test_fill_dev_api.cc DEPS pten pten_api_utils) cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS pten pten_api_utils) cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS pten pten_api_utils) cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS pten pten_api_utils) +cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS pten pten_api_utils) diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc new file mode 100644 index 00000000000..a93b3ffb4bc --- /dev/null +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/pten/include/math.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +PT_DECLARE_MODULE(MathCPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_MODULE(MathCUDA); +#endif + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +TEST(DEV_API, elementwise_add) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + pten::DenseTensor dense_x(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x.mutable_data(); + + pten::DenseTensor dense_y(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({10}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y.mutable_data(); + + float sum[3][10] = {0.0}; + for (size_t i = 0; i < 3; ++i) { + for (size_t j = 0; j < 10; ++j) { + dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0; + sum[i][j] = (i * 10 + j) * 1.0 + j * 2.0; + } + } + for (size_t i = 0; i < 10; ++i) { + dense_y_data[i] = i * 2.0; + } + int axis = 1; + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + // 2. test API + auto dense_out = pten::ElementwiseAdd( + *(static_cast(dev_ctx)), + dense_x, + dense_y, + axis); + + // 3. check result + ASSERT_EQ(dense_out.dims().size(), 2); + ASSERT_EQ(dense_out.dims()[0], 3); + ASSERT_EQ(dense_out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(dense_out.meta().layout, pten::DataLayout::NCHW); + + auto expect_result = sum; + auto actual_result0 = dense_out.data()[0]; + auto actual_result1 = dense_out.data()[1]; + auto actual_result2 = dense_out.data()[10]; + ASSERT_NEAR(expect_result[0][0], actual_result0, 1e-6f); + ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f); + ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f); +} -- GitLab
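For reference, below is a minimal standalone sketch of the CPU common-broadcast indexing used by CommonForwardBroadcastCPU above. It is illustrative only: it assumes nothing beyond the C++ standard library, and the helper names (GetIndex, NextIndex) are local stand-ins for GetElementwiseIndex and UpdateElementwiseIndexArray rather than the actual Paddle functions.

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Local stand-in for GetElementwiseIndex: maps the current multi-dimensional
// output index to the flat index of an input, skipping broadcast (size-1) dims.
static int GetIndex(const std::vector<int> &dims, const std::vector<int> &idx) {
  int flat = 0;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] > 1) {
      flat = flat * dims[i] + idx[i];
    }
  }
  return flat;
}

// Local stand-in for UpdateElementwiseIndexArray: advances the
// multi-dimensional index over the output dims in row-major order.
static void NextIndex(const std::vector<int> &out_dims, std::vector<int> *idx) {
  for (int i = static_cast<int>(out_dims.size()) - 1; i >= 0; --i) {
    if (++(*idx)[i] < out_dims[i]) break;
    (*idx)[i] = 0;
  }
}

int main() {
  // x: shape [2, 3], y: shape [1, 3]; the broadcast output has shape [2, 3].
  std::vector<int> x_dims = {2, 3}, y_dims = {1, 3}, out_dims = {2, 3};
  std::vector<float> x = {0, 1, 2, 3, 4, 5};
  std::vector<float> y = {10, 20, 30};
  const int out_size = std::accumulate(
      out_dims.begin(), out_dims.end(), 1, std::multiplies<int>());
  std::vector<float> out(out_size, 0.f);
  std::vector<int> idx(out_dims.size(), 0);
  for (int i = 0; i < out_size; ++i) {
    out[i] = x[GetIndex(x_dims, idx)] + y[GetIndex(y_dims, idx)];
    NextIndex(out_dims, &idx);
  }
  // Prints: 10 21 32 13 24 35
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}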