move for_range into phi (#39931)

94d8f392 · Chen Weihang · GitHub · 687902fc · 94d8f392 · 94d8f392
14 changed file
--- a/paddle/fluid/platform/for_range.h
+++ b/paddle/fluid/platform/for_range.h
@@ -13,136 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace paddle {
 namespace platform {

 template <typename DeviceContext>
-struct ForRange {
-  ForRange(const DeviceContext& dev_ctx, size_t limit);
-
-  template <typename Function>
-  void operator()(Function func) const;
-};
-
-// NOTE: After the pten kernel is migrated, it needs to be deleted.
-template <>
-struct ForRange<CPUDeviceContext> {
-  ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {}
-
-  template <typename Function>
-  void operator()(Function func) const {
-    for (size_t i = 0; i < limit_; ++i) {
-      func(i);
-    }
-  }
-
-  size_t limit_;
-};
-
-template <>
-struct ForRange<phi::CPUContext> {
-  ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {}
-
-  template <typename Function>
-  void operator()(Function func) const {
-    for (size_t i = 0; i < limit_; ++i) {
-      func(i);
-    }
-  }
-
-  size_t limit_;
-};
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-template <typename Function>
-__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
-  size_t idx = static_cast<size_t>(threadIdx.x);
-  func(idx);
-}
-
-template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
-  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
-  if (idx < limit) {
-    func(idx);
-  }
-}
-
-// NOTE: After the pten kernel is migrated, it needs to be deleted.
-template <>
-struct ForRange<CUDADeviceContext> {
-  ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-#ifdef __HIPCC__
-    // HIP will throw core dump when threads > 256
-    constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
-    // JETSON_NANO will throw core dump when threads > 128
-    int num_thread = 256;
-    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
-    const int num_threads = num_thread;
-#else
-    constexpr int num_threads = 1024;
-#endif
-    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
-    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
-    } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
-    }
-  }
-
-  const CUDADeviceContext& dev_ctx_;
-  size_t limit_;
-};
-
-template <>
-struct ForRange<phi::GPUContext> {
-  ForRange(const phi::GPUContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-#ifdef __HIPCC__
-    // HIP will throw core dump when threads > 256
-    constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
-    // JETSON_NANO will throw core dump when threads > 128
-    int num_thread = 256;
-    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
-    const int num_threads = num_thread;
-#else
-    constexpr int num_threads = 1024;
-#endif
-    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
-    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
-    } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
-    }
-  }
-
-  const phi::GPUContext& dev_ctx_;
-  size_t limit_;
-};
-
-#endif
+using ForRange = phi::funcs::ForRange<DeviceContext>;

 }  // namespace platform
 }  // namespace paddle
--- a/paddle/phi/kernels/cpu/abs_kernel.cc
+++ b/paddle/phi/kernels/cpu/abs_kernel.cc
@@ -13,11 +13,11 @@
 // limitations under the License.

 #include "paddle/phi/kernels/abs_kernel.h"
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -29,7 +29,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
      out, size_t(x.numel() * sizeof(phi::funcs::Real<T>)));
  auto* out_data = out->data<phi::funcs::Real<T>>();

-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  phi::funcs::AbsFunctor<T> functor(x_data, out_data, numel);
  for_range(functor);
 }

--- a/paddle/phi/kernels/funcs/diagonal.h
+++ b/paddle/phi/kernels/funcs/diagonal.h
@@ -22,8 +22,8 @@

 #include <algorithm>

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {
 namespace funcs {
@@ -118,7 +118,7 @@ DenseTensor Diagonal(const DeviceContext& context,
 #endif

    // auto& dev_ctx = context.template device_context<DeviceContext>();
-    paddle::platform::ForRange<DeviceContext> for_range(context, diag.numel());
+    phi::funcs::ForRange<DeviceContext> for_range(context, diag.numel());
    DiagonalFunctor<T> functor(
        input_data, diag_arr, ret_arr, pos, dim_size, diag_data);
    for_range(functor);

--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -14,11 +14,11 @@ limitations under the License. */

 #pragma once

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
@@ -418,7 +418,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx,
                                    DX_OP dx_op,
                                    DY_OP dy_op) {
  size_t N = static_cast<size_t>(phi::product(x_dim));
-  paddle::platform::ForRange<DeviceContext> for_range(dev_ctx, N);
+  phi::funcs::ForRange<DeviceContext> for_range(dev_ctx, N);
  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP, Tout>{
      x.data<T>(),
      y.data<T>(),

--- a/paddle/phi/kernels/funcs/for_range.h
+++ b/paddle/phi/kernels/funcs/for_range.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context>
+struct ForRange {
+  ForRange(const Context& dev_ctx, size_t limit);
+
+  template <typename Function>
+  void operator()(Function func) const;
+};
+
+template <>
+struct ForRange<phi::CPUContext> {
+  ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    for (size_t i = 0; i < limit_; ++i) {
+      func(i);
+    }
+  }
+
+  size_t limit_;
+};
+
+// NOTE: After the pten kernel is migrated, it needs to be deleted.
+template <>
+struct ForRange<paddle::platform::CPUDeviceContext> {
+  ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(limit) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    phi::funcs::ForRange<phi::CPUContext> for_range(dev_ctx_, limit_);
+    for_range(func);
+  }
+
+  const paddle::platform::CPUDeviceContext& dev_ctx_;
+  size_t limit_;
+};
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+
+template <typename Function>
+__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
+  size_t idx = static_cast<size_t>(threadIdx.x);
+  func(idx);
+}
+
+template <typename Function>
+__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx < limit) {
+    func(idx);
+  }
+}
+
+template <>
+struct ForRange<phi::GPUContext> {
+  ForRange(const phi::GPUContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(limit) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+#ifdef __HIPCC__
+    // HIP will throw core dump when threads > 256
+    constexpr int num_threads = 256;
+#elif WITH_NV_JETSON
+    // JETSON_NANO will throw core dump when threads > 128
+    int num_thread = 256;
+    backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
+    const int num_threads = num_thread;
+#else
+    constexpr int num_threads = 1024;
+#endif
+    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
+    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
+
+    if (grid_size == 1) {
+      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+          func);
+    } else {
+      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
+          func, limit_);
+    }
+  }
+
+  const phi::GPUContext& dev_ctx_;
+  size_t limit_;
+};
+
+// NOTE: After the pten kernel is migrated, it needs to be deleted.
+template <>
+struct ForRange<paddle::platform::CUDADeviceContext> {
+  ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(limit) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
+    for_range(func);
+  }
+
+  const paddle::platform::CUDADeviceContext& dev_ctx_;
+  size_t limit_;
+};
+
+#endif
+
+}  // namespace funcs
+}  // namespace phi
--- a/paddle/phi/kernels/gpu/poisson_kernel.cu
+++ b/paddle/phi/kernels/gpu/poisson_kernel.cu
@@ -19,9 +19,9 @@ limitations under the License. */
 #include <hiprand_kernel.h>
 #endif

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/poisson_kernel.h"

 namespace phi {
@@ -65,7 +65,7 @@ void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
  uint64_t seed = seed_offset.first;
  uint64_t offset = seed_offset.second;

-  paddle::platform::ForRange<Context> for_range(ctx, size);
+  phi::funcs::ForRange<Context> for_range(ctx, size);

  PoissonCudaFunctor<T> functor(x_data, out_data, seed, offset);
  for_range(functor);

--- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h
@@ -14,10 +14,10 @@

 #pragma once

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/abs_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -53,7 +53,7 @@ void AbsGradKernel(const Context& ctx,
  ctx.template Alloc<T>(dx, static_cast<size_t>(numel * sizeof(T)));
  auto* dx_data = dx->data<T>();

-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  phi::funcs::AbsGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
  for_range(functor);
 }
@@ -70,7 +70,7 @@ void AbsDoubleGradKernel(const Context& ctx,
  ctx.template Alloc<T>(ddout, static_cast<size_t>(numel * sizeof(T)));
  auto* ddout_data = ddout->data<T>();

-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  phi::funcs::AbsGradGradFunctor<T> functor(
      ddx_data, x_data, ddout_data, numel);
  for_range(functor);

--- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
@@ -14,9 +14,9 @@

 #pragma once

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/atan2_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -85,7 +85,7 @@ void Atan2GradKernel(const Context& ctx,
  auto* y_grad_data =
      ctx.template Alloc<T>(y_grad, size_t(y.numel() * sizeof(T)));

-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  phi::Atan2GradFunctor<T> functor(
      x_data, y_data, out_grad_data, x_grad_data, y_grad_data, numel);
  for_range(functor);

--- a/paddle/phi/kernels/impl/atan2_kernel_impl.h
+++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h
@@ -14,9 +14,9 @@

 #pragma once

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/atan2_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {
 template <typename T>
@@ -80,7 +80,7 @@ void Atan2Kernel(const Context& ctx,
  auto* out_data = ctx.template Alloc<typename Atan2Out<T>::type>(
      out, size_t(x.numel() * sizeof(typename Atan2Out<T>::type)));

-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  phi::Atan2Functor<T> functor(x_data, y_data, out_data, numel);
  for_range(functor);
 }

--- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h
@@ -14,8 +14,8 @@

 #pragma once

-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -28,7 +28,7 @@ void RealGradKernel(const Context& dev_ctx,
  auto* dx_data =
      dev_ctx.template Alloc<T>(dx, static_cast<size_t>(numel * sizeof(T)));

-  paddle::platform::ForRange<Context> for_range(dev_ctx, numel);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
  phi::funcs::RealToComplexFunctor<T> functor(dout_data, dx_data, numel);
  for_range(functor);
 }
@@ -42,7 +42,7 @@ void ImagGradKernel(const Context& dev_ctx,
  auto* dx_data =
      dev_ctx.template Alloc<T>(dx, static_cast<size_t>(numel * sizeof(T)));

-  paddle::platform::ForRange<Context> for_range(dev_ctx, numel);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
  phi::funcs::ImagToComplexFunctor<T> functor(dout_data, dx_data, numel);
  for_range(functor);
 }

--- a/paddle/phi/kernels/impl/complex_kernel_impl.h
+++ b/paddle/phi/kernels/impl/complex_kernel_impl.h
@@ -15,8 +15,8 @@
 #pragma once

 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -28,7 +28,7 @@ void ConjKernel(const Context& dev_ctx,
  auto* x_data = x.data<T>();
  auto* out_data = dev_ctx.template Alloc<T>(out);

-  paddle::platform::ForRange<Context> for_range(dev_ctx, numel);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
  phi::funcs::ConjFunctor<T> functor(x_data, numel, out_data);
  for_range(functor);
 }
@@ -42,7 +42,7 @@ void RealKernel(const Context& dev_ctx,
  auto* out_data = dev_ctx.template Alloc<phi::funcs::Real<T>>(
      out, static_cast<size_t>(numel * sizeof(phi::funcs::Real<T>)));

-  paddle::platform::ForRange<Context> for_range(dev_ctx, numel);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
  phi::funcs::RealFunctor<T> functor(x_data, out_data, numel);
  for_range(functor);
 }
@@ -56,7 +56,7 @@ void ImagKernel(const Context& dev_ctx,
  auto* out_data = dev_ctx.template Alloc<phi::funcs::Real<T>>(
      out, static_cast<size_t>(numel * sizeof(phi::funcs::Real<T>)));

-  paddle::platform::ForRange<Context> for_range(dev_ctx, numel);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
  phi::funcs::ImagFunctor<T> functor(x_data, out_data, numel);
  for_range(functor);
 }

--- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
@@ -15,8 +15,8 @@
 #pragma once

 #include <unsupported/Eigen/SpecialFunctions>
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -47,7 +47,7 @@ void DigammaGradKernel(const Context& ctx,
  auto* x_data = x.data<T>();
  auto* dx_data = x_grad->data<T>();
  auto numel = out_grad.numel();
-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  DigammaGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
  for_range(functor);
 }

--- a/paddle/phi/kernels/impl/digamma_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h
@@ -15,8 +15,8 @@
 #pragma once

 #include <unsupported/Eigen/SpecialFunctions>
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/for_range.h"

 namespace phi {

@@ -41,7 +41,7 @@ void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
  auto* x_data = x.data<T>();
  auto* out_data = out->data<T>();
  auto numel = x.numel();
-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
  DigammaFunctor<T> functor(x_data, out_data, numel);
  for_range(functor);
 }

--- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h
@@ -21,7 +21,7 @@

 #include <algorithm>

-#include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 namespace phi {
@@ -130,7 +130,7 @@ void TraceGradKernel(const Context& ctx,
    const auto* input_arr = input_stride.Get();
 #endif

-    paddle::platform::ForRange<Context> for_range(ctx, in_grad->numel());
+    phi::funcs::ForRange<Context> for_range(ctx, in_grad->numel());
    TraceGradFunctor<T> functor(out_data,
                                output_arr,
                                input_arr,