From 7d727f366e1353c5d858f4b29fa50c2733d473ca Mon Sep 17 00:00:00 2001 From: Huang Jiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 14 Mar 2023 11:14:22 +0800 Subject: [PATCH] [phi decoupling] decouple dependency to device_context in phi (Part 3) (#51559) * remove device_context include * fix bug * fix bug --- paddle/fluid/imperative/reducer.cc | 3 +- paddle/fluid/pybind/tensor_py.h | 15 +-- paddle/phi/kernels/cpu/concat_kernel.cc | 15 +-- paddle/phi/kernels/funcs/math_function.cc | 1 - paddle/phi/kernels/funcs/math_function.cu | 1 - paddle/phi/kernels/funcs/strided_memcpy.h | 99 +++++++++---------- paddle/phi/kernels/gpu/concat_kernel.cu | 15 +-- .../kernels/impl/concat_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/split_kernel_impl.h | 3 +- 9 files changed, 76 insertions(+), 78 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index d0d985874fa..f9e60b91517 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -103,7 +103,8 @@ static void SplitTensorsForAllReduce( } // Sometimes direct copies will be faster if (p_dense_tensors->size() < 10) { - phi::funcs::StridedMemcpyWithAxis0(context, *in, shape_refer, &outs); + phi::funcs::StridedMemcpyWithAxis0( + context, *in, shape_refer, &outs); } else { operators::math::SplitFunctor split_functor_; split_functor_(context, *in, shape_refer, 0, &outs); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index f930fc32a47..607aa5a894c 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -727,13 +727,14 @@ void _concatCompute(const std::vector &ins, for (auto &in : ins) { auto in_stride = phi::stride_numel(in.dims()); auto out_stride = phi::stride_numel(out->dims()); - phi::funcs::StridedNumelCopyWithAxis(ctx, - axis, - out->data() + output_offset, - out_stride, - in.data(), - in_stride, - in_stride[axis]); + phi::funcs::StridedNumelCopyWithAxis( + ctx, + axis, +
out->data() + output_offset, + out_stride, + in.data(), + in_stride, + in_stride[axis]); output_offset += in_stride[axis]; } } else { diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index da5415d9e49..58cb1d95666 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -86,13 +86,14 @@ void ConcatKernel(const Context& dev_ctx, } auto in_stride = phi::stride_numel(in->dims()); auto out_stride = phi::stride_numel(out->dims()); - phi::funcs::StridedNumelCopyWithAxis(dev_ctx, - axis, - out->data() + output_offset, - out_stride, - in->data(), - in_stride, - in_stride[axis]); + phi::funcs::StridedNumelCopyWithAxis( + dev_ctx, + axis, + out->data() + output_offset, + out_stride, + in->data(), + in_stride, + in_stride[axis]); output_offset += in_stride[axis]; } } else { diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index e8bd17efc7d..d5e727c39fe 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -27,7 +27,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/data_type.h" diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index 801076f46c6..24d9248c371 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/data_type.h" diff --git a/paddle/phi/kernels/funcs/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h index 76252398b14..547b5853129 100644 --- a/paddle/phi/kernels/funcs/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/strided_memcpy.h @@ -14,9 +14,12 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/detail/strided_memcpy.h" -#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/dense_tensor.h" +namespace phi { +class CPUContext; +} // namespace phi + namespace phi { namespace funcs { @@ -46,6 +49,32 @@ inline void StridedMemcpy(const phi::DeviceContext& dev_ctx, dst_dim.apply_visitor(func); } +template +inline void CopyWithContext(const Context& ctx, + const Place& dst_place, + void* dst, + const Place& src_place, + const void* src, + size_t num) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) + memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream()); +#else + PADDLE_THROW( + phi::errors::PreconditionNotMet("Paddle is not compiled with GPU.")); +#endif +} + +template <> +inline void CopyWithContext(const phi::CPUContext& ctx, + const Place& dst_place, + void* dst, + const Place& src_place, + const void* src, + size_t num) { + memory_utils::Copy(dst_place, dst, src_place, src, num); +} + // Strided numel memory copy from src to dst by the specified axis // // For example, for a tensor dims [4, 20, 100], the strieded numel is @@ -53,8 +82,8 @@ inline void StridedMemcpy(const phi::DeviceContext& dev_ctx, // // NOTE: The src and dst tensor should have the same elements // except the specified axis. 
-template -inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx, +template +inline void StridedNumelCopyWithAxis(const Context& ctx, int64_t axis, T* dst, const phi::DDim& dst_stride_numel, @@ -102,52 +131,18 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx, } for (int64_t i = 0; i < before; ++i) { - if (place.GetType() == phi::AllocationType::CPU) { - auto& cpu_place = place; - memory_utils::Copy(cpu_place, - dst + i * dst_after, - cpu_place, - src + i * src_after, - sizeof(T) * size); - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto& gpu_place = place; - auto& cuda_ctx = reinterpret_cast(ctx); - memory_utils::Copy(gpu_place, - dst + i * dst_after, - gpu_place, - src + i * src_after, - sizeof(T) * size, - cuda_ctx.stream()); -#elif defined(PADDLE_WITH_ASCEND_CL) - auto& npu_place = place; - auto& npu_ctx = reinterpret_cast(ctx); - memory_utils::Copy(npu_place, - dst + i * dst_after, - npu_place, - src + i * src_after, - sizeof(T) * size, - npu_ctx.stream()); -#elif defined(PADDLE_WITH_MLU) - auto& mlu_place = place; - auto& mlu_ctx = reinterpret_cast(ctx); - memory_utils::Copy(mlu_place, - dst + i * dst_after, - mlu_place, - src + i * src_after, - sizeof(T) * size, - mlu_ctx.stream()); -#else - PADDLE_THROW( - phi::errors::PreconditionNotMet("Paddle is not compiled with GPU.")); -#endif - } + CopyWithContext(ctx, + place, + dst + i * dst_after, + place, + src + i * src_after, + sizeof(T) * size); } } -template +template inline void StridedMemcpyWithAxis0( - const phi::DeviceContext& dev_ctx, + const Context& dev_ctx, const phi::DenseTensor& input, const std::vector& shape_refer, std::vector* outputs) { @@ -159,13 +154,13 @@ inline void StridedMemcpyWithAxis0( auto out_stride = stride_numel(shape_refer[i]->dims()); auto out = outputs->at(i); if (out != nullptr && out->initialized() && out->numel() > 0) { - StridedNumelCopyWithAxis(dev_ctx, - axis, - out->data(), - out_stride, - input.data() + 
input_offset, - in_stride, - out_stride[axis]); + StridedNumelCopyWithAxis(dev_ctx, + axis, + out->data(), + out_stride, + input.data() + input_offset, + in_stride, + out_stride[axis]); } input_offset += out_stride[axis]; } diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index ac83cb3f829..47e5a220e66 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -85,13 +85,14 @@ void ConcatKernel(const Context& dev_ctx, } auto in_stride = phi::stride_numel(in->dims()); auto out_stride = phi::stride_numel(out->dims()); - phi::funcs::StridedNumelCopyWithAxis(dev_ctx, - axis, - out->data() + output_offset, - out_stride, - in->data(), - in_stride, - in_stride[axis]); + phi::funcs::StridedNumelCopyWithAxis( + dev_ctx, + axis, + out->data() + output_offset, + out_stride, + in->data(), + in_stride, + in_stride[axis]); output_offset += in_stride[axis]; } } else { diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h index b0b0e5728d4..a1f0a98463f 100644 --- a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h @@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx, if (axis == 0 && outs.size() < 10) { std::vector ref_shape; ref_shape.insert(ref_shape.begin(), x.begin(), x.end()); - phi::funcs::StridedMemcpyWithAxis0( + phi::funcs::StridedMemcpyWithAxis0( dev_ctx, out_grad, ref_shape, &outputs); } else { phi::funcs::SplitFunctor split_functor; diff --git a/paddle/phi/kernels/impl/split_kernel_impl.h b/paddle/phi/kernels/impl/split_kernel_impl.h index 77acf81cf4c..83968d913fe 100644 --- a/paddle/phi/kernels/impl/split_kernel_impl.h +++ b/paddle/phi/kernels/impl/split_kernel_impl.h @@ -37,7 +37,8 @@ void SplitKernel(const Context& dev_ctx, int axis = axis_scalar.to(); // Sometimes direct copies will be faster, this maybe need deeply analysis. 
if (axis == 0 && outs.size() < 10) { - phi::funcs::StridedMemcpyWithAxis0(dev_ctx, x, shape_refer, &outs); + phi::funcs::StridedMemcpyWithAxis0( + dev_ctx, x, shape_refer, &outs); } else { phi::funcs::SplitFunctor functor; functor(dev_ctx, x, shape_refer, axis, &outs); -- GitLab