diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h
index f3f7064efeeb2e1121c09a29473a4a81a063f849..abc427a3ca8815ecf193e4f9213223aa79069ea5 100644
--- a/paddle/fluid/platform/for_range.h
+++ b/paddle/fluid/platform/for_range.h
@@ -13,136 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 
 namespace paddle {
 namespace platform {
 
 template <typename DeviceContext>
-struct ForRange {
-  ForRange(const DeviceContext& dev_ctx, size_t limit);
-
-  template <typename Function>
-  void operator()(Function func) const;
-};
-
-// NOTE: After the pten kernel is migrated, it needs to be deleted.
-template <>
-struct ForRange<CPUDeviceContext> {
-  ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {}
-
-  template <typename Function>
-  void operator()(Function func) const {
-    for (size_t i = 0; i < limit_; ++i) {
-      func(i);
-    }
-  }
-
-  size_t limit_;
-};
-
-template <>
-struct ForRange<phi::CPUContext> {
-  ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {}
-
-  template <typename Function>
-  void operator()(Function func) const {
-    for (size_t i = 0; i < limit_; ++i) {
-      func(i);
-    }
-  }
-
-  size_t limit_;
-};
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-template <typename Function>
-__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
-  size_t idx = static_cast<size_t>(threadIdx.x);
-  func(idx);
-}
-
-template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
-  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
-  if (idx < limit) {
-    func(idx);
-  }
-}
-
-// NOTE: After the pten kernel is migrated, it needs to be deleted.
-template <>
-struct ForRange<CUDADeviceContext> {
-  ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-#ifdef __HIPCC__
-    // HIP will throw core dump when threads > 256
-    constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
-    // JETSON_NANO will throw core dump when threads > 128
-    int num_thread = 256;
-    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
-    const int num_threads = num_thread;
-#else
-    constexpr int num_threads = 1024;
-#endif
-    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
-    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
-    } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
-    }
-  }
-
-  const CUDADeviceContext& dev_ctx_;
-  size_t limit_;
-};
-
-template <>
-struct ForRange<phi::GPUContext> {
-  ForRange(const phi::GPUContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-#ifdef __HIPCC__
-    // HIP will throw core dump when threads > 256
-    constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
-    // JETSON_NANO will throw core dump when threads > 128
-    int num_thread = 256;
-    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
-    const int num_threads = num_thread;
-#else
-    constexpr int num_threads = 1024;
-#endif
-    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
-    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
-    } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
-    }
-  }
-
-  const phi::GPUContext& dev_ctx_;
-  size_t limit_;
-};
-
-#endif
+using ForRange = phi::funcs::ForRange<DeviceContext>;
 
 }  // namespace platform
 }  // namespace paddle
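Note (not part of the diff): with the fluid header reduced to an alias template, every existing `paddle::platform::ForRange<DeviceContext>` spelling now names the phi type directly, so legacy fluid call sites keep compiling unchanged. A minimal sketch of that equivalence, assuming only the alias introduced above:

```cpp
#include <type_traits>

#include "paddle/fluid/platform/for_range.h"

// Both spellings denote the same specialization, because
// paddle::platform::ForRange is an alias template for phi::funcs::ForRange.
static_assert(
    std::is_same<paddle::platform::ForRange<phi::CPUContext>,
                 phi::funcs::ForRange<phi::CPUContext>>::value,
    "fluid ForRange must be the phi implementation");
```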
diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc
index 71d818c45e6f3f28697d3496cc9ae8a0d209ce6e..efe7d090405df72ce07b2b2bb7f045977d982eff 100644
--- a/paddle/phi/kernels/cpu/abs_kernel.cc
+++ b/paddle/phi/kernels/cpu/abs_kernel.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/abs_kernel.h"
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 
 namespace phi {
@@ -29,7 +29,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
       out, size_t(x.numel() * sizeof(phi::funcs::Real<T>)));
   auto* out_data = out->data<phi::funcs::Real<T>>();
 
-  paddle::platform::ForRange<Context> for_range(ctx, numel);
+  phi::funcs::ForRange<Context> for_range(ctx, numel);
   phi::funcs::AbsFunctor<T> functor(x_data, out_data, numel);
   for_range(functor);
 }
diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h
index a82c4f66d010273f0f09fa71a38c3081fd1bc2ee..19a93970d090af060b888f512782975b073fff72 100644
--- a/paddle/phi/kernels/funcs/diagonal.h
+++ b/paddle/phi/kernels/funcs/diagonal.h
@@ -22,8 +22,8 @@
 
 #include <algorithm>
 
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 
 namespace phi {
 namespace funcs {
@@ -118,7 +118,7 @@ DenseTensor Diagonal(const DeviceContext& context,
 #endif
 
   // auto& dev_ctx = context.template device_context<DeviceContext>();
-  paddle::platform::ForRange<DeviceContext> for_range(context, diag.numel());
+  phi::funcs::ForRange<DeviceContext> for_range(context, diag.numel());
   DiagonalFunctor<T> functor(
       input_data, diag_arr, ret_arr, pos, dim_size, diag_data);
   for_range(functor);
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 47f1593a11eb9e29cc90b7db36650826734ac27f..d369781f845eb0887817f83be761b1027fc0bab0 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -14,11 +14,11 @@
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
@@ -418,7 +418,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx,
                                     DX_OP dx_op,
                                     DY_OP dy_op) {
   size_t N = static_cast<size_t>(phi::product(x_dim));
-  paddle::platform::ForRange<DeviceContext> for_range(dev_ctx, N);
+  phi::funcs::ForRange<DeviceContext> for_range(dev_ctx, N);
   for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP, Tout>{
       x.data<T>(),
       y.data<T>(),
diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf0888c301fe739994089b8e05357bd810455756
--- /dev/null
+++ b/paddle/phi/kernels/funcs/for_range.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context>
+struct ForRange {
+  ForRange(const Context& dev_ctx, size_t limit);
+
+  template <typename Function>
+  void operator()(Function func) const;
+};
+
+template <>
+struct ForRange<phi::CPUContext> {
+  ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    for (size_t i = 0; i < limit_; ++i) {
+      func(i);
+    }
+  }
+
+  size_t limit_;
+};
+
+// NOTE: After the pten kernel is migrated, it needs to be deleted.
+template <>
+struct ForRange<paddle::platform::CPUDeviceContext> {
+  ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(limit) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    phi::funcs::ForRange<phi::CPUContext> for_range(dev_ctx_, limit_);
+    for_range(func);
+  }
+
+  const paddle::platform::CPUDeviceContext& dev_ctx_;
+  size_t limit_;
+};
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+
+template <typename Function>
+__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
+  size_t idx = static_cast<size_t>(threadIdx.x);
+  func(idx);
+}
+
+template <typename Function>
+__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx < limit) {
+    func(idx);
+  }
+}
+
+template <>
+struct ForRange<phi::GPUContext> {
+  ForRange(const phi::GPUContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(limit) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+#ifdef __HIPCC__
+    // HIP will throw core dump when threads > 256
+    constexpr int num_threads = 256;
+#elif WITH_NV_JETSON
+    // JETSON_NANO will throw core dump when threads > 128
+    int num_thread = 256;
+    backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
+    const int num_threads = num_thread;
+#else
+    constexpr int num_threads = 1024;
+#endif
+    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
+    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
+
+    if (grid_size == 1) {
+      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+          func);
+    } else {
+      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
+          func, limit_);
+    }
+  }
+
+  const phi::GPUContext& dev_ctx_;
+  size_t limit_;
+};
+
+// NOTE: After the pten kernel is migrated, it needs to be deleted.
+template <>
+struct ForRange<paddle::platform::CUDADeviceContext> {
+  ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(limit) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
+    for_range(func);
+  }
+
+  const paddle::platform::CUDADeviceContext& dev_ctx_;
+  size_t limit_;
+};
+
+#endif
+
+}  // namespace funcs
+}  // namespace phi
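Note (not part of the diff): the launch configuration in `ForRange<phi::GPUContext>::operator()` caps the block at `num_threads` and takes the ceiling of `limit / num_threads` for the grid; the single-block kernel skips the bounds check because every thread index is then in range. A standalone sketch of that arithmetic (the values are illustrative, and `num_threads` is the non-HIP, non-Jetson default):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  constexpr std::size_t num_threads = 1024;  // default CUDA path above
  const std::size_t limits[] = {300, 1024, 5000};
  for (std::size_t limit : limits) {
    // Same expressions as in ForRange<phi::GPUContext>::operator().
    std::size_t block_size = limit <= num_threads ? limit : num_threads;
    std::size_t grid_size = (limit + num_threads - 1) / num_threads;  // ceil
    std::printf("limit=%zu -> grid=%zu block=%zu\n", limit, grid_size, block_size);
  }
  // Prints:
  //   limit=300  -> grid=1 block=300   (ForRangeElemwiseOpGridIsOne, no bound check)
  //   limit=1024 -> grid=1 block=1024
  //   limit=5000 -> grid=5 block=1024  (ForRangeElemwiseOp, checks idx < limit)
  return 0;
}
```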
"paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -85,7 +85,7 @@ void Atan2GradKernel(const Context& ctx, auto* y_grad_data = ctx.template Alloc(y_grad, size_t(y.numel() * sizeof(T))); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::Atan2GradFunctor functor( x_data, y_data, out_grad_data, x_grad_data, y_grad_data, numel); for_range(functor); diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index c29449a27e0b5603c4e6f50c8ed676677c29796a..2cae914e2f61555377f7a41b3d89cdbb2b589247 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { template @@ -80,7 +80,7 @@ void Atan2Kernel(const Context& ctx, auto* out_data = ctx.template Alloc::type>( out, size_t(x.numel() * sizeof(typename Atan2Out::type))); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::Atan2Functor functor(x_data, y_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index febc464e6a1f5780ac6a25f0baa55449014a4f66..a10481284b17fbc21865ab8aa3b5ebad4e0a7d95 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -28,7 +28,7 @@ void RealGradKernel(const Context& dev_ctx, auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } @@ -42,7 +42,7 @@ void ImagGradKernel(const Context& dev_ctx, auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index 2f9b1ad04665378307b099f0fc3a0c75f487e41a..ff5cf86ed2ea240747f70f4410b339a135a49d3a 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -28,7 +28,7 @@ void ConjKernel(const Context& dev_ctx, auto* x_data = x.data(); auto* out_data = dev_ctx.template Alloc(out); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } @@ -42,7 +42,7 @@ void RealKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc>( out, static_cast(numel * sizeof(phi::funcs::Real))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); } @@ -56,7 +56,7 @@ void ImagKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc>( out, static_cast(numel * sizeof(phi::funcs::Real))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h index f94fe7168b2a5cb338f5fdc741d9be56b810f7c6..74ded1569eb5804950898bc1b824367b56480cda 100644 --- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -47,7 +47,7 @@ void DigammaGradKernel(const Context& ctx, auto* x_data = x.data(); auto* dx_data = x_grad->data(); auto numel = out_grad.numel(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); DigammaGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/digamma_kernel_impl.h b/paddle/phi/kernels/impl/digamma_kernel_impl.h index 5a924a322d6e9941475854dbc01bc4b1d0084bb5..8994979e64d70753ba7b0a6a4debc5e48a95f243 100644 --- a/paddle/phi/kernels/impl/digamma_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -41,7 +41,7 @@ void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto* x_data = x.data(); auto* out_data = out->data(); auto numel = x.numel(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); DigammaFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h index 5263f92cb578b9cda612e7bfa4edb2b425876b20..b0878d779462a9c351caa038af2ac017bbf4a14f 100644 --- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h @@ -21,7 +21,7 @@ #include -#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -130,7 +130,7 @@ void TraceGradKernel(const Context& ctx, const auto* input_arr = input_stride.Get(); #endif - paddle::platform::ForRange for_range(ctx, in_grad->numel()); + 
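Note (not part of the diff): every call-site hunk above follows the same three-step pattern: allocate the output, construct an element-wise functor, and hand it to `phi::funcs::ForRange<Context>`. A sketch of that pattern as a self-contained phi-style kernel; the functor and kernel names are invented for illustration and do not appear in this diff:

```cpp
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/for_range.h"

namespace phi {

// Hypothetical functor: writes 2 * in[idx] to out[idx] for each index.
template <typename T>
struct DoubleFunctor {
  const T* in;
  T* out;
  HOSTDEVICE void operator()(size_t idx) const { out[idx] = in[idx] + in[idx]; }
};

// Hypothetical kernel showing the migrated pattern shared by the hunks above.
template <typename T, typename Context>
void DoubleKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
  auto numel = x.numel();
  const T* x_data = x.data<T>();
  T* out_data = ctx.template Alloc<T>(out);

  // Serial loop on phi::CPUContext; a CUDA grid launch on phi::GPUContext.
  phi::funcs::ForRange<Context> for_range(ctx, numel);
  DoubleFunctor<T> functor{x_data, out_data};
  for_range(functor);
}

}  // namespace phi
```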
diff --git a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h
index 5263f92cb578b9cda612e7bfa4edb2b425876b20..b0878d779462a9c351caa038af2ac017bbf4a14f 100644
--- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h
@@ -21,7 +21,7 @@
 
 #include <array>
 
-#include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace phi {
@@ -130,7 +130,7 @@ void TraceGradKernel(const Context& ctx,
   const auto* input_arr = input_stride.Get();
 #endif
 
-  paddle::platform::ForRange<Context> for_range(ctx, in_grad->numel());
+  phi::funcs::ForRange<Context> for_range(ctx, in_grad->numel());
   TraceGradFunctor<T> functor(out_data,
                               output_arr,
                               input_arr,