From 7c020c71c4cadc2aae9e1895289e7d542f0b7617 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 4 Jan 2022 16:19:51 +0800 Subject: [PATCH] [Pten]Move CPU_implementation of elementwise kernel in new directory (#38651) * change 'math' to 'math_kernel' * fix compile bugs * merge develop * fix compile bugs * move cpu_impl of elementwise kernel to new directory --- .../framework/data_device_transform_test.cu | 12 +- .../elementwise/elementwise_functor.h | 52 +-- .../elementwise/elementwise_op_function.h | 89 +--- paddle/pten/api/lib/kernel_declare.h | 21 - paddle/pten/infermeta/binary.cc | 16 +- paddle/pten/kernels/cpu/elementwise_impl.h | 392 ++++++++++++++++++ paddle/pten/kernels/cpu/math_kernel.cc | 61 +-- .../general => funcs}/elementwise_base.h | 5 +- .../pten/kernels/funcs/elementwise_functor.h | 83 ++++ paddle/pten/kernels/gpu/math_kernel.cu | 32 +- paddle/pten/kernels/hybird/CMakeLists.txt | 1 - .../pten/kernels/hybird/blas/CMakeLists.txt | 0 paddle/pten/kernels/hybird/blas/elementwise.h | 59 --- paddle/pten/kernels/hybird/cpu/CMakeLists.txt | 0 paddle/pten/kernels/hybird/cpu/elementwise.h | 230 ---------- .../cuda/elementwise/elementwise_common.cu.h | 2 +- .../pten/kernels/hybird/eigen/elementwise.h | 61 --- .../hybird/general/elementwise_functor.h | 223 ---------- python/paddle/utils/code_gen/api_gen.py | 1 - 19 files changed, 568 insertions(+), 772 deletions(-) delete mode 100644 paddle/pten/api/lib/kernel_declare.h create mode 100644 paddle/pten/kernels/cpu/elementwise_impl.h rename paddle/pten/kernels/{hybird/general => funcs}/elementwise_base.h (99%) create mode 100644 paddle/pten/kernels/funcs/elementwise_functor.h delete mode 100644 paddle/pten/kernels/hybird/blas/CMakeLists.txt delete mode 100644 paddle/pten/kernels/hybird/blas/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/cpu/CMakeLists.txt delete mode 100644 paddle/pten/kernels/hybird/cpu/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/eigen/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/general/elementwise_functor.h diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 4e5be2e535..a81e4abd45 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/include/core.h" + namespace paddle { namespace framework { @@ -73,9 +76,12 @@ class TestKernel : public OpKernel { output->Resize(input->dims()); output->mutable_data(ctx.GetPlace()); - operators::TransformFunctor, T, DeviceContext> functor( - input, input, output, ctx.template device_context(), - AddFunctor()); + auto pt_input = paddle::experimental::MakePtenDenseTensor(*input); + auto pt_out = paddle::experimental::MakePtenDenseTensor(*output); + + pten::funcs::TransformFunctor, T, DeviceContext> functor( + *pt_input, *pt_input, pt_out.get(), + ctx.template device_context(), AddFunctor()); functor.Run(); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 6e53af41b6..7ff8e6a154 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -25,58 +26,31 @@ namespace operators { // Add template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; +using AddFunctor = pten::funcs::AddFunctor; + template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; +using InverseAddFunctor = pten::funcs::InverseAddFunctor; // Subtract template -struct SubFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; +using SubFunctor = pten::funcs::SubtractFunctor; + template -struct InverseSubFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; +using InverseSubFunctor = pten::funcs::InverseSubtractFunctor; // Multiply template -struct MulFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } -}; +using MulFunctor = pten::funcs::MultiplyFunctor; + template -struct InverseMulFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } -}; +using InverseMulFunctor = pten::funcs::InverseMultiplyFunctor; // Divide -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - template -struct DivFunctor::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. - PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; +using DivFunctor = pten::funcs::DivideFunctor; -template -struct InverseDivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; +template +using InverseDivFunctor = pten::funcs::InverseDivideFunctor; // Floor Divide template diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 9700ca3584..6f3e17ea4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,8 +31,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/cpu/elementwise_impl.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -151,9 +150,9 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, int *x_dims_array, int *y_dims_array, int *out_dims_array, const int max_dim, const int axis) { - pten::general::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, - y_dims_array, out_dims_array, max_dim, - axis); + pten::funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, + y_dims_array, out_dims_array, max_dim, + axis); } template @@ -1073,71 +1072,9 @@ void CommonGradBroadcastCUDA( inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return pten::general::trim_trailing_singular_dims(dims); + return pten::funcs::trim_trailing_singular_dims(dims); } -template -class TransformFunctor { - public: - TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z, const DeviceContext &ctx, Functor func, - const bool is_xsize_larger = true) - : x_(x->data()), - y_(y->data()), - z_(z->mutable_data(ctx.GetPlace())), - nx_(x->numel()), - ctx_(ctx), - func_(func), - is_xsize_larger_(is_xsize_larger) { - if (is_xsize_larger_ == false) { - nx_ = y->numel(); - } - } - - inline void Run() const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, y_, z_, func_); - } - - inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::RowwiseTransformIterator(y_, n), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::RowwiseTransformIterator(x_, n), - z_, func_); - } - } - - inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::MidWiseTransformIterator(y_, n, - post), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::MidWiseTransformIterator(x_, n, - post), - z_, func_); - } - } - - private: - const T *x_; - const T *y_; - OutType *z_; - int64_t nx_; - const DeviceContext &ctx_; - Functor func_; - bool is_xsize_larger_; -}; - template struct ElemwiseGradNoBroadcast { const T *x_; @@ -1457,13 +1394,13 @@ void ElemwiseGradComputeWithBroadcast( if (is_xsize_larger) { auto y_dims_trimed = trim_trailing_singular_dims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - pten::general::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } else { auto x_dims_trimed = trim_trailing_singular_dims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::general::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1861,8 +1798,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -2409,8 +2346,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h deleted file mode 100644 index 4d3143ef09..0000000000 --- a/paddle/pten/api/lib/kernel_declare.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/kernel_registry.h" - -// TODO(chenweihang) After the kernel is split into a single file, -// the kernel declare statement is automatically generated according to the -// file name of the kernel, and this header file will be removed diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index 5d3844a1de..944c64ecd7 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -14,7 +14,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { @@ -162,13 +162,13 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); return_meta.dims = paddle::framework::make_ddim(out_dims_array); } return_meta.lod = x_meta.lod; diff --git a/paddle/pten/kernels/cpu/elementwise_impl.h b/paddle/pten/kernels/cpu/elementwise_impl.h new file mode 100644 index 0000000000..d3687b22fb --- /dev/null +++ b/paddle/pten/kernels/cpu/elementwise_impl.h @@ -0,0 +1,392 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +namespace pten { + +// Add +template +struct SameDimsAddFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VADD(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + z->mutable_data(); + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x + eigen_y; + } +}; + +// Subtract +template +struct SameDimsSubtractFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VSUB(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x - eigen_y; + } +}; + +// Divide +template +struct SameDimsDivideFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + paddle::platform::errors::InvalidArgument( + "If use SameDimsDivideFunctor, template args(T) must be floating " + "point. 
"); + } +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VDIV(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +// Multiply +template +struct SameDimsMultiplyFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VMUL(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x * eigen_y; + } +}; + +inline void UpdateElementwiseIndexArray(const int* out_dims_array, + const int max_dim, + int* index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +inline int GetElementwiseIndex(const int* x_dims_array, + const int max_dim, + const int* index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +template +void CommonForwardBroadcastCPU(const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + int* x_dims_array, + int* y_dims_array, + int* out_dims_array, + int max_dim, + const paddle::platform::CPUDeviceContext& ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T* x_data = x.data(); + const T* y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL(x_data, + paddle::platform::errors::InvalidArgument( + "The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL(y_data, + paddle::platform::errors::InvalidArgument( + "The input Y should not be empty.")); + OutType* out_data = z->mutable_data(); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +void CommonElementwiseBroadcastForward( + const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + const DDim& x_dims, + const DDim& y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + Functor func, + DenseTensor* z) { + z->mutable_data(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + funcs:: + TransformFunctor + functor(x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. 
+ // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +template +struct SameDimsElementwiseCompute { + void operator()(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + Functor()(dev_ctx, x, y, z); + } +}; + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 152d945144..c022dd08bb 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -18,9 +18,11 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" + +#include "paddle/pten/kernels/cpu/elementwise_impl.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" + #include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" #include "paddle/pten/kernels/hybird/general/reduce_impl.h" // See Note [ Why still include the fluid headers? ] @@ -30,29 +32,28 @@ namespace pten { -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - out->mutable_data(); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute< \ - general::SameDims##name##Functor>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::name##Functor(), out); \ - } else { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::Inverse##name##Functor(), out); \ - } \ - } \ +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + out->mutable_data(); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ } template @@ -76,17 +77,17 @@ void DivideKernel(const Context& dev_ctx, // allocate memory for out out->mutable_data(); if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( + SameDimsElementwiseCompute>()( dev_ctx, x, y, out); } else { auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::DivideFunctor(), out); + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::InverseDivideFunctor(), out); + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); } } } diff --git a/paddle/pten/kernels/hybird/general/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h similarity index 99% 
rename from paddle/pten/kernels/hybird/general/elementwise_base.h rename to paddle/pten/kernels/funcs/elementwise_base.h index 20154a8744..a0c6d5ba57 100644 --- a/paddle/pten/kernels/hybird/general/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" namespace pten { -namespace general { +namespace funcs { using DDim = paddle::framework::DDim; @@ -378,6 +378,5 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, } } } - -} // namespace general +} // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h new file mode 100644 index 0000000000..9b2519b0fd --- /dev/null +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace pten { +namespace funcs { + +// Define the binary functors used in elementwise ops. + +// Add +template +struct AddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } +}; +template +struct InverseAddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } +}; + +// Subtract +template +struct SubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; +template +struct InverseSubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } +}; + +// Multiply +template +struct MultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; +template +struct InverseMultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } +}; + +// Divide +#define DIV_ERROR_INFO \ + "InvalidArgumentError: Integer division by zero encountered in " \ + "(floor) divide. Please check the input value." + +template +struct DivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivideFunctor< + T, + typename std::enable_if::value>::type> { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. + PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); + return a / b; + } +}; + +template +struct InverseDivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 636d0f16b0..760bebe687 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" #include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" #include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" #include "paddle/pten/kernels/hybird/general/reduce_impl.h" #ifdef __NVCC__ @@ -39,21 +39,21 @@ namespace kps = paddle::operators::kernel_primitives; namespace pten { -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - out->mutable_data(); \ - LaunchElementwiseCudaKernel( \ - dev_ctx, inputs, &outputs, axis, general::name##Functor()); \ +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + out->mutable_data(); \ + LaunchElementwiseCudaKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } /** diff --git a/paddle/pten/kernels/hybird/CMakeLists.txt b/paddle/pten/kernels/hybird/CMakeLists.txt index 1304aa1798..5d04bae2ea 100644 --- a/paddle/pten/kernels/hybird/CMakeLists.txt +++ b/paddle/pten/kernels/hybird/CMakeLists.txt @@ -1,5 +1,4 @@ add_subdirectory(eigen) -add_subdirectory(blas) add_subdirectory(general) cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) diff --git a/paddle/pten/kernels/hybird/blas/CMakeLists.txt b/paddle/pten/kernels/hybird/blas/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/paddle/pten/kernels/hybird/blas/elementwise.h b/paddle/pten/kernels/hybird/blas/elementwise.h deleted file mode 100644 index 1a530c9f8e..0000000000 --- a/paddle/pten/kernels/hybird/blas/elementwise.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { -namespace blas { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseDiv(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), x.data(), y.data(), out->mutable_data()); -} -} // namespace blas -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cpu/CMakeLists.txt b/paddle/pten/kernels/hybird/cpu/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/paddle/pten/kernels/hybird/cpu/elementwise.h b/paddle/pten/kernels/hybird/cpu/elementwise.h deleted file mode 100644 index d503957a76..0000000000 --- a/paddle/pten/kernels/hybird/cpu/elementwise.h +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" - -namespace pten { - -inline void UpdateElementwiseIndexArray(const int *out_dims_array, - const int max_dim, - int *index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int *x_dims_array, - const int max_dim, - const int *index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonForwardBroadcastCPU(const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - int *x_dims_array, - int *y_dims_array, - int *out_dims_array, - int max_dim, - const paddle::platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T *x_data = x.data(); - const T *y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); - OutType *out_data = z->mutable_data(); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward( - const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - const DDim &x_dims, - const DDim &y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. 
The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - z->mutable_data(); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - general:: - TransformFunctor - functor(x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = general::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - general::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = general::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - general::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. - // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - -template -struct SameDimsElementwiseCompute { - void operator()(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z) { - Functor()(dev_ctx, x, y, z); - } -}; - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h index 7c5f3a9778..ae38469324 100644 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { namespace kps = paddle::operators::kernel_primitives; diff --git a/paddle/pten/kernels/hybird/eigen/elementwise.h b/paddle/pten/kernels/hybird/eigen/elementwise.h deleted file mode 100644 index e67cce63d4..0000000000 --- a/paddle/pten/kernels/hybird/eigen/elementwise.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -namespace pten { -namespace eigen { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - out->mutable_data(); - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x - eigen_y; -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x * eigen_y; -} - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/elementwise_functor.h b/paddle/pten/kernels/hybird/general/elementwise_functor.h deleted file mode 100644 index 62b422f4ae..0000000000 --- a/paddle/pten/kernels/hybird/general/elementwise_functor.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/blas/elementwise.h" -#include "paddle/pten/kernels/hybird/eigen/elementwise.h" - -namespace pten { -namespace general { - -// Define the binary functors used in elementwise ops. 
- -// Add -template -struct SameDimsAddFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; -template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; - -// Subtract -template -struct SameDimsSubtractFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; -template -struct InverseSubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; - -// Divide -template -struct SameDimsDivideFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - paddle::platform::errors::InvalidArgument( - "If use SameDimsDivideFunctor, template args(T) must be floating " - "point. "); - } -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseDiv(dev_ctx, x, y, z); - } -}; - -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - -template -struct DivideFunctor< - T, - typename std::enable_if::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. 
- PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; - -template -struct InverseDivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; - -// Multiply -template -struct SameDimsMultiplyFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseMul(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseMul(dev_ctx, x, y, z); - } -}; -template -struct MultiplyFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } -}; -template -struct InverseMultiplyFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } -}; - -} // namespace general -} // namespace pten diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 72bf26c57d..35720ae32f 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -342,7 +342,6 @@ def source_include(header_file_path): #include "paddle/pten/api/include/kernel_signature.h" #include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" -- GitLab
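
For context on the forwarding pattern this patch applies in paddle/fluid/operators/elementwise/elementwise_functor.h: the fluid-side functor names are kept, but become thin alias templates over the relocated pten::funcs implementations, so existing operator code keeps compiling unchanged. The following is a minimal, standalone sketch of that pattern only; it re-declares trimmed stand-ins for the pten::funcs functors instead of including the real Paddle headers, and drops the HOSTDEVICE qualifier so it builds outside the Paddle tree.

#include <cassert>

namespace pten {
namespace funcs {

// Stand-ins for the functors defined in
// paddle/pten/kernels/funcs/elementwise_functor.h (HOSTDEVICE omitted here).
template <typename T>
struct AddFunctor {
  inline T operator()(const T& a, const T& b) const { return a + b; }
};

template <typename T>
struct InverseAddFunctor {
  inline T operator()(const T& a, const T& b) const { return b + a; }
};

}  // namespace funcs
}  // namespace pten

namespace paddle {
namespace operators {

// Fluid keeps its old names but forwards to the pten implementations,
// mirroring the aliases introduced by this patch.
template <typename T>
using AddFunctor = pten::funcs::AddFunctor<T>;

template <typename T>
using InverseAddFunctor = pten::funcs::InverseAddFunctor<T>;

}  // namespace operators
}  // namespace paddle

int main() {
  // Callers that used paddle::operators::AddFunctor continue to work.
  paddle::operators::AddFunctor<int> add;
  paddle::operators::InverseAddFunctor<int> inv_add;
  assert(add(2, 3) == 5);
  assert(inv_add(2, 3) == 5);
  return 0;
}

The actual patch applies the same aliasing for the Subtract, Multiply, and Divide functors and their Inverse variants, while the same-dims and broadcast CPU implementations move into paddle/pten/kernels/cpu/elementwise_impl.h under the pten namespace.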