modified reduce_max, reduce_min, reduce_prod to higher_performance implementation. (#32974)

480b284c · niuliling123 · GitHub · 20eafd79 · 480b284c · 480b284c
5 changed file
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -13,46 +13,98 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/device_context.h"
+#include <cmath>
+#include <limits>
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/fluid/platform/macros.h"
+#ifdef __HIPCC__
+#include <hip/hip_runtime.h>
+#endif

 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename Tx, typename Ty = Tx>
 struct CustomMin {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  using Transformer = detail::IdentityFunctor<Tx>;
+
+  inline Ty initial() {
+    return static_cast<Ty>(std::numeric_limits<Ty>::max());
+  }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
    return (b < a) ? b : a;
  }
 };

-template <typename T>
+template <typename Tx, typename Ty = Tx>
 struct CustomMax {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  using Transformer = detail::IdentityFunctor<Tx>;
+
+  inline Ty initial() {
+    return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
+  }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
    return (b > a) ? b : a;
  }
 };

-template <typename T>
+// for cub::Reduce
+template <typename Tx, typename Ty = Tx>
 struct CustomSum {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  using Transformer = detail::IdentityFunctor<Tx, Ty>;
+
+  inline Ty initial() { return static_cast<Ty>(0.0f); }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
    return b + a;
  }
 };

-template <typename T>
+template <typename Tx, typename Ty = Tx>
+struct CustomMean {
+  using Transformer = detail::DivideFunctor<Tx>;
+
+  inline Ty initial() { return static_cast<Ty>(0.0f); }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
+    return b + a;
+  }
+};
+
+template <typename Tx, typename Ty = Tx>
 struct CustomMul {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  using Transformer = detail::IdentityFunctor<Tx>;
+
+  inline Ty initial() { return static_cast<Ty>(1.0f); }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
    return b * a;
  }
 };

+template <typename Tx, typename Ty = Tx>
+struct CustomLogicalOr {
+  using Transformer = detail::IdentityFunctor<Tx>;
+
+  inline Ty initial() { return static_cast<Ty>(false); }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
+    return b || a;
+  }
+};
+
+template <typename Tx, typename Ty = Tx>
+struct CustomLogicalAnd {
+  using Transformer = detail::IdentityFunctor<Tx>;
+
+  inline Ty initial() { return static_cast<Ty>(true); }
+
+  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
+    return b && a;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
@@ -11,15 +11,13 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"

-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_max,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MaxFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MaxFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MaxFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MaxFunctor>);
+// reduce_max
+REGISTER_OP_CUDA_KERNEL(
+    reduce_max, ops::ReduceCudaKernel<float, paddle::operators::CustomMax>,
+    ops::ReduceCudaKernel<double, paddle::operators::CustomMax>,
+    ops::ReduceCudaKernel<int, paddle::operators::CustomMax>,
+    ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMax>);
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
@@ -11,15 +11,13 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"

-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(reduce_min,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MinFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MinFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MinFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MinFunctor>);
+// reduce_min
+REGISTER_OP_CUDA_KERNEL(
+    reduce_min, ops::ReduceCudaKernel<float, paddle::operators::CustomMin>,
+    ops::ReduceCudaKernel<double, paddle::operators::CustomMin>,
+    ops::ReduceCudaKernel<int, paddle::operators::CustomMin>,
+    ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMin>);
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
@@ -12,26 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"

+// reduce_prod
 #ifdef __HIPCC__
 // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
 // do not support double in HIPCC platform (Eigen3 to be fixed)
-REGISTER_OP_CUDA_KERNEL(reduce_prod,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::ProdFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod, ops::ReduceCudaKernel<float, paddle::operators::CustomMul>,
+    ops::ReduceCudaKernel<int, paddle::operators::CustomMul>,
+    ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMul>);
 #else
-REGISTER_OP_CUDA_KERNEL(reduce_prod,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::ProdFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::ProdFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod, ops::ReduceCudaKernel<float, paddle::operators::CustomMul>,
+    ops::ReduceCudaKernel<int, paddle::operators::CustomMul>,
+    ops::ReduceCudaKernel<double, paddle::operators::CustomMul>,
+    ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMul>);
 #endif