modified pool_op for higher performance (#33144)

b95c5ae0 · niuliling123 · GitHub · 0eb7c942 · b95c5ae0
隐藏空白更改
内联并排

Showing with 4 addition and 16 deletion

paddle/fluid/operators/pool_op.h paddle/fluid/operators/pool_op.h +4 -16

未找到文件。
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -23,23 +23,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
 #if defined(__HIPCC__) || defined(__NVCC__)
-#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #endif
 namespace paddle {
 namespace operators {
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-  template <typename U>
-  HOSTDEVICE inline U operator()(const U& x) const {
-    return x * static_cast<U>(n_inv);
-  }
- private:
-  T n_inv;
-};
 using Tensor = framework::Tensor;
@@ -219,9 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
              adaptive) {  // for adaptive_avg_pool2d && output_size == 1
 #if defined(__HIPCC__) || defined(__NVCC__)
            auto stream = dev_ctx.stream();
-            TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
+            TensorReduceFunctorImpl<T, T, CustomMean>(*in_x, out, reduce_dim,
-                *in_x, out, reduce_dim, static_cast<T>(0), cub::Sum(),
+                                                      stream);
-                DivideFunctor<T>(reduce_num), stream);
 #else  // for cpu
            paddle::operators::math::Pool2dFunctor<
                DeviceContext, paddle::operators::math::AvgPool<T>, T>