未验证 提交 17318c1a 作者: Huang Jiyi 提交者: GitHub

[PHI decoupling] move strided_memcpy.h to phi (#50346)

* decouple strided_memcpy

* move strided_memcpy

* move strided_memcpy to phi

* fix namespace

* update

* fix gpu compile bugs
上级 90650534
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/imperative/parallel_context.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h"
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#endif #endif
...@@ -103,7 +103,7 @@ static void SplitTensorsForAllReduce( ...@@ -103,7 +103,7 @@ static void SplitTensorsForAllReduce(
} }
// Sometimes direct copies will be faster // Sometimes direct copies will be faster
if (p_dense_tensors->size() < 10) { if (p_dense_tensors->size() < 10) {
operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs); phi::funcs::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
} else { } else {
operators::math::SplitFunctor<DeviceContext, T> split_functor_; operators::math::SplitFunctor<DeviceContext, T> split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs); split_functor_(context, *in, shape_refer, 0, &outs);
......
...@@ -188,7 +188,6 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor) ...@@ -188,7 +188,6 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
if (WITH_GPU) if (WITH_GPU)
......
...@@ -20,10 +20,10 @@ limitations under the License. */ ...@@ -20,10 +20,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/concat_kernel.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { // Internal namespace operators { // Internal
......
...@@ -24,11 +24,11 @@ namespace cub = hipcub; ...@@ -24,11 +24,11 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -18,8 +18,8 @@ limitations under the License. */ ...@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -140,7 +140,7 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -140,7 +140,7 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
static_cast<int>(lod[0][i] + offset_data[i]), static_cast<int>(lod[0][i] + offset_data[i]),
static_cast<int>(lod[0][i] + offset_data[i] + length_data[i])); static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
StridedMemcpy<T>(ctx.device_context(), phi::funcs::StridedMemcpy<T>(ctx.device_context(),
in_t.data<T>(), in_t.data<T>(),
in_stride, in_stride,
in_t.dims(), in_t.dims(),
...@@ -201,7 +201,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> { ...@@ -201,7 +201,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
static_cast<int>(lod[0][i] + offset_data[i]), static_cast<int>(lod[0][i] + offset_data[i]),
static_cast<int>(lod[0][i] + offset_data[i] + length_data[i])); static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
StridedMemcpy<T>(ctx.device_context(), phi::funcs::StridedMemcpy<T>(ctx.device_context(),
out_grad_t.data<T>(), out_grad_t.data<T>(),
out_grad_stride, out_grad_stride,
out_grad_t.dims(), out_grad_t.dims(),
......
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -96,7 +96,8 @@ class SppKernel : public framework::OpKernel<T> { ...@@ -96,7 +96,8 @@ class SppKernel : public framework::OpKernel<T> {
out_level.Resize(output_flatten_shape); out_level.Resize(output_flatten_shape);
// concat // concat
auto out_level_stride = phi::stride(out_level.dims()); auto out_level_stride = phi::stride(out_level.dims());
StridedMemcpy<T>(context.template device_context<DeviceContext>(), phi::funcs::StridedMemcpy<T>(
context.template device_context<DeviceContext>(),
out_level.data<T>(), out_level.data<T>(),
out_level_stride, out_level_stride,
out_level.dims(), out_level.dims(),
...@@ -150,14 +151,16 @@ class SppGradKernel : public framework::OpKernel<T> { ...@@ -150,14 +151,16 @@ class SppGradKernel : public framework::OpKernel<T> {
outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace()); outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
auto flatten_stride = phi::stride(out_level.dims()); auto flatten_stride = phi::stride(out_level.dims());
// memcpy // memcpy
StridedMemcpy<T>(context.template device_context<DeviceContext>(), phi::funcs::StridedMemcpy<T>(
context.template device_context<DeviceContext>(),
out->data<T>() + out_offset, out->data<T>() + out_offset,
out_stride, out_stride,
out_level.dims(), out_level.dims(),
flatten_stride, flatten_stride,
out_level.data<T>()); out_level.data<T>());
StridedMemcpy<T>(context.template device_context<DeviceContext>(), phi::funcs::StridedMemcpy<T>(
context.template device_context<DeviceContext>(),
out_grad->data<T>() + out_offset, out_grad->data<T>() + out_offset,
out_stride, out_stride,
outgrad_level.dims(), outgrad_level.dims(),
......
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -28,9 +28,9 @@ limitations under the License. */ ...@@ -28,9 +28,9 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
...@@ -724,8 +724,7 @@ void _concatCompute(const std::vector<phi::DenseTensor> &ins, ...@@ -724,8 +724,7 @@ void _concatCompute(const std::vector<phi::DenseTensor> &ins,
for (auto &in : ins) { for (auto &in : ins) {
auto in_stride = phi::stride_numel(in.dims()); auto in_stride = phi::stride_numel(in.dims());
auto out_stride = phi::stride_numel(out->dims()); auto out_stride = phi::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>( phi::funcs::StridedNumelCopyWithAxis<T>(ctx,
ctx,
axis, axis,
out->data<T>() + output_offset, out->data<T>() + output_offset,
out_stride, out_stride,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/concat_kernel.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h" #include "paddle/phi/common/complex.h"
...@@ -24,6 +23,7 @@ ...@@ -24,6 +23,7 @@
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi { namespace phi {
...@@ -86,8 +86,7 @@ void ConcatKernel(const Context& dev_ctx, ...@@ -86,8 +86,7 @@ void ConcatKernel(const Context& dev_ctx,
} }
auto in_stride = phi::stride_numel(in->dims()); auto in_stride = phi::stride_numel(in->dims());
auto out_stride = phi::stride_numel(out->dims()); auto out_stride = phi::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>( phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
dev_ctx,
axis, axis,
out->data<T>() + output_offset, out->data<T>() + output_offset,
out_stride, out_stride,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -14,11 +14,15 @@ limitations under the License. */ ...@@ -14,11 +14,15 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/device_context.h"
namespace paddle { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
namespace operators { #include "paddle/phi/backends/gpu/gpu_context.h"
#endif
namespace phi {
namespace funcs {
namespace detail { namespace detail {
template <typename T, int Rank> template <typename T, int Rank>
...@@ -26,25 +30,25 @@ struct StridedMemcpyFunctor; ...@@ -26,25 +30,25 @@ struct StridedMemcpyFunctor;
template <typename T> template <typename T>
struct StridedMemcpyFunctor<T, 0> { struct StridedMemcpyFunctor<T, 0> {
void operator()(const platform::DeviceContext& dev_ctx, void operator()(const phi::DeviceContext& dev_ctx,
const T* src, const T* src,
const int64_t* src_stride, const int64_t* src_stride,
const int64_t* dst_dim, const int64_t* dst_dim,
const int64_t* dst_stride, const int64_t* dst_stride,
T* dst) const { T* dst) const {
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) { if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place; auto& cpu_place = place;
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place; auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx); auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
memory::Copy( paddle::memory::Copy(
gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream()); gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with GPU.")); phi::errors::Unavailable("Paddle is not compiled with GPU."));
#endif #endif
} }
} }
...@@ -52,21 +56,22 @@ struct StridedMemcpyFunctor<T, 0> { ...@@ -52,21 +56,22 @@ struct StridedMemcpyFunctor<T, 0> {
template <typename T> template <typename T>
struct StridedMemcpyFunctor<T, 1> { struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, void operator()(const phi::DeviceContext& dev_ctx,
const T* src, const T* src,
const int64_t* src_stride, const int64_t* src_stride,
const int64_t* dst_dim, const int64_t* dst_dim,
const int64_t* dst_stride, const int64_t* dst_stride,
T* dst) const { T* dst) const {
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) { if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place; auto& cpu_place = place;
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); paddle::memory::Copy(
cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place; auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx); auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
memory::Copy(gpu_place, paddle::memory::Copy(gpu_place,
dst, dst,
gpu_place, gpu_place,
src, src,
...@@ -74,7 +79,7 @@ struct StridedMemcpyFunctor<T, 1> { ...@@ -74,7 +79,7 @@ struct StridedMemcpyFunctor<T, 1> {
cuda_ctx.stream()); cuda_ctx.stream());
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with GPU.")); phi::errors::Unavailable("Paddle is not compiled with GPU."));
#endif #endif
} }
} }
...@@ -82,7 +87,7 @@ struct StridedMemcpyFunctor<T, 1> { ...@@ -82,7 +87,7 @@ struct StridedMemcpyFunctor<T, 1> {
template <typename T, int Rank> template <typename T, int Rank>
struct StridedMemcpyFunctor { struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx, void operator()(const phi::DeviceContext& dev_ctx,
const T* src, const T* src,
const int64_t* src_stride, const int64_t* src_stride,
const int64_t* dst_dim, const int64_t* dst_dim,
...@@ -99,10 +104,10 @@ struct StridedMemcpyFunctor { ...@@ -99,10 +104,10 @@ struct StridedMemcpyFunctor {
template <typename T> template <typename T>
struct StridedCopyDimVisitor { struct StridedCopyDimVisitor {
StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, StridedCopyDimVisitor(const phi::DeviceContext& dev_ctx,
const T* src, const T* src,
const framework::DDim& src_stride, const phi::DDim& src_stride,
const framework::DDim& dst_stride, const phi::DDim& dst_stride,
T* dst) T* dst)
: dev_ctx_(dev_ctx), : dev_ctx_(dev_ctx),
src_(src), src_(src),
...@@ -111,7 +116,7 @@ struct StridedCopyDimVisitor { ...@@ -111,7 +116,7 @@ struct StridedCopyDimVisitor {
dst_(dst) {} dst_(dst) {}
template <int D> template <int D>
void operator()(const framework::Dim<D>& dst_dim) const { void operator()(const phi::Dim<D>& dst_dim) const {
StridedMemcpyFunctor<T, D> functor; StridedMemcpyFunctor<T, D> functor;
functor(dev_ctx_, functor(dev_ctx_,
src_, src_,
...@@ -121,13 +126,13 @@ struct StridedCopyDimVisitor { ...@@ -121,13 +126,13 @@ struct StridedCopyDimVisitor {
dst_); dst_);
} }
const platform::DeviceContext& dev_ctx_; const phi::DeviceContext& dev_ctx_;
const T* src_; const T* src_;
const framework::DDim& src_stride_; const phi::DDim& src_stride_;
const framework::DDim& dst_stride_; const phi::DDim& dst_stride_;
T* dst_; T* dst_;
}; };
} // namespace detail } // namespace detail
} // namespace operators } // namespace funcs
} // namespace paddle } // namespace phi
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
...@@ -12,11 +12,13 @@ limitations under the License. */ ...@@ -12,11 +12,13 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor.h" #include "paddle/phi/kernels/funcs/detail/strided_memcpy.h"
#include "paddle/fluid/operators/detail/strided_memcpy.h"
namespace paddle { #include "paddle/fluid/platform/device_context.h"
namespace operators { #include "paddle/phi/core/dense_tensor.h"
namespace phi {
namespace funcs {
// Strided memory copy from src to dst. // Strided memory copy from src to dst.
// //
...@@ -33,13 +35,13 @@ namespace operators { ...@@ -33,13 +35,13 @@ namespace operators {
// NOTE: When using GPU, the memcpy is async. To sync memcpy, please invoke // NOTE: When using GPU, the memcpy is async. To sync memcpy, please invoke
// `dev_ctx.Wait()`. // `dev_ctx.Wait()`.
template <typename T> template <typename T>
inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, inline void StridedMemcpy(const phi::DeviceContext& dev_ctx,
const T* src, const T* src,
const framework::DDim& src_stride, const phi::DDim& src_stride,
const framework::DDim& dst_dim, const phi::DDim& dst_dim,
const framework::DDim& dst_stride, const phi::DDim& dst_stride,
T* dst) { T* dst) {
paddle::operators::detail::StridedCopyDimVisitor<T> func( detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst); dev_ctx, src, src_stride, dst_stride, dst);
dst_dim.apply_visitor(func); dst_dim.apply_visitor(func);
} }
...@@ -52,12 +54,12 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, ...@@ -52,12 +54,12 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
// NOTE: The src and dst tensor should have the same elements // NOTE: The src and dst tensor should have the same elements
// except the specified axis. // except the specified axis.
template <typename T> template <typename T>
inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
int64_t axis, int64_t axis,
T* dst, T* dst,
const framework::DDim& dst_stride_numel, const phi::DDim& dst_stride_numel,
const T* src, const T* src,
const framework::DDim& src_stride_numel, const phi::DDim& src_stride_numel,
int64_t size) { int64_t size) {
int64_t before = dst_stride_numel[0] / dst_stride_numel[axis]; int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
int64_t src_after = src_stride_numel[axis]; int64_t src_after = src_stride_numel[axis];
...@@ -66,7 +68,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -66,7 +68,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
PADDLE_ENFORCE_EQ(src_stride_numel.size(), PADDLE_ENFORCE_EQ(src_stride_numel.size(),
dst_stride_numel.size(), dst_stride_numel.size(),
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"Source and destination tensor should have the same " "Source and destination tensor should have the same "
"dimension size, but source tensor dimension size is " "dimension size, but source tensor dimension size is "
"%u, destination tensor size is %u.", "%u, destination tensor size is %u.",
...@@ -78,7 +80,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -78,7 +80,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
src_stride_numel[i] / src_stride_numel[axis], src_stride_numel[i] / src_stride_numel[axis],
dst_stride_numel[i] / dst_stride_numel[axis], dst_stride_numel[i] / dst_stride_numel[axis],
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"Source and destination tensor should have the same number of " "Source and destination tensor should have the same number of "
"elements except the specified axis, but the source elements " "elements except the specified axis, but the source elements "
"number is %d, destination elements number is %d.", "number is %d, destination elements number is %d.",
...@@ -90,7 +92,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -90,7 +92,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
src_stride_numel[i], src_stride_numel[i],
dst_stride_numel[i], dst_stride_numel[i],
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"Source and destination tensor should have the same number of " "Source and destination tensor should have the same number of "
"elements except the specified axis, but the source elements " "elements except the specified axis, but the source elements "
"number is %d, destination elements number is %d.", "number is %d, destination elements number is %d.",
...@@ -100,9 +102,9 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -100,9 +102,9 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
} }
for (int64_t i = 0; i < before; ++i) { for (int64_t i = 0; i < before; ++i) {
if (platform::is_cpu_place(place)) { if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place; auto& cpu_place = place;
memory::Copy(cpu_place, paddle::memory::Copy(cpu_place,
dst + i * dst_after, dst + i * dst_after,
cpu_place, cpu_place,
src + i * src_after, src + i * src_after,
...@@ -111,7 +113,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -111,7 +113,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place; auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx); auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
memory::Copy(gpu_place, paddle::memory::Copy(gpu_place,
dst + i * dst_after, dst + i * dst_after,
gpu_place, gpu_place,
src + i * src_after, src + i * src_after,
...@@ -120,7 +122,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -120,7 +122,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
#elif defined(PADDLE_WITH_ASCEND_CL) #elif defined(PADDLE_WITH_ASCEND_CL)
auto& npu_place = place; auto& npu_place = place;
auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx); auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
memory::Copy(npu_place, paddle::memory::Copy(npu_place,
dst + i * dst_after, dst + i * dst_after,
npu_place, npu_place,
src + i * src_after, src + i * src_after,
...@@ -129,15 +131,15 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -129,15 +131,15 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
auto& mlu_place = place; auto& mlu_place = place;
auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx); auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
memory::Copy(mlu_place, paddle::memory::Copy(mlu_place,
dst + i * dst_after, dst + i * dst_after,
mlu_place, mlu_place,
src + i * src_after, src + i * src_after,
sizeof(T) * size, sizeof(T) * size,
mlu_ctx.stream()); mlu_ctx.stream());
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(
"Paddle is not compiled with GPU.")); phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
#endif #endif
} }
} }
...@@ -145,11 +147,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -145,11 +147,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
template <typename T> template <typename T>
inline void StridedMemcpyWithAxis0( inline void StridedMemcpyWithAxis0(
const platform::DeviceContext& dev_ctx, const phi::DeviceContext& dev_ctx,
const phi::DenseTensor& input, const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& shape_refer, const std::vector<const phi::DenseTensor*>& shape_refer,
std::vector<phi::DenseTensor*>* outputs) { std::vector<phi::DenseTensor*>* outputs) {
const framework::DDim in_stride = stride_numel(input.dims()); const phi::DDim in_stride = stride_numel(input.dims());
const int axis = 0; const int axis = 0;
size_t input_offset = 0; size_t input_offset = 0;
...@@ -169,5 +171,5 @@ inline void StridedMemcpyWithAxis0( ...@@ -169,5 +171,5 @@ inline void StridedMemcpyWithAxis0(
} }
} }
} // namespace operators } // namespace funcs
} // namespace paddle } // namespace phi
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/concat_kernel.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h" #include "paddle/phi/common/complex.h"
...@@ -24,6 +23,7 @@ ...@@ -24,6 +23,7 @@
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi { namespace phi {
...@@ -85,8 +85,7 @@ void ConcatKernel(const Context& dev_ctx, ...@@ -85,8 +85,7 @@ void ConcatKernel(const Context& dev_ctx,
} }
auto in_stride = phi::stride_numel(in->dims()); auto in_stride = phi::stride_numel(in->dims());
auto out_stride = phi::stride_numel(out->dims()); auto out_stride = phi::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>( phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
dev_ctx,
axis, axis,
out->data<T>() + output_offset, out->data<T>() + output_offset,
out_stride, out_stride,
......
...@@ -13,10 +13,10 @@ ...@@ -13,10 +13,10 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/concat_grad_kernel.h" #include "paddle/phi/kernels/concat_grad_kernel.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h" #include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi { namespace phi {
...@@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx, ...@@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx,
if (axis == 0 && outs.size() < 10) { if (axis == 0 && outs.size() < 10) {
std::vector<const DenseTensor*> ref_shape; std::vector<const DenseTensor*> ref_shape;
ref_shape.insert(ref_shape.begin(), x.begin(), x.end()); ref_shape.insert(ref_shape.begin(), x.begin(), x.end());
paddle::operators::StridedMemcpyWithAxis0<T>( phi::funcs::StridedMemcpyWithAxis0<T>(
dev_ctx, out_grad, ref_shape, &outputs); dev_ctx, out_grad, ref_shape, &outputs);
} else { } else {
phi::funcs::SplitFunctor<Context, T> split_functor; phi::funcs::SplitFunctor<Context, T> split_functor;
......
...@@ -15,11 +15,11 @@ ...@@ -15,11 +15,11 @@
#pragma once #pragma once
#include "paddle/phi/kernels/split_kernel.h" #include "paddle/phi/kernels/split_kernel.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
...@@ -37,8 +37,7 @@ void SplitKernel(const Context& dev_ctx, ...@@ -37,8 +37,7 @@ void SplitKernel(const Context& dev_ctx,
int axis = axis_scalar.to<int>(); int axis = axis_scalar.to<int>();
// Sometimes direct copies will be faster; this may need deeper analysis. // Sometimes direct copies will be faster; this may need deeper analysis.
if (axis == 0 && outs.size() < 10) { if (axis == 0 && outs.size() < 10) {
paddle::operators::StridedMemcpyWithAxis0<T>( phi::funcs::StridedMemcpyWithAxis0<T>(dev_ctx, x, shape_refer, &outs);
dev_ctx, x, shape_refer, &outs);
} else { } else {
phi::funcs::SplitFunctor<Context, T> functor; phi::funcs::SplitFunctor<Context, T> functor;
functor(dev_ctx, x, shape_refer, axis, &outs); functor(dev_ctx, x, shape_refer, axis, &outs);
......
...@@ -95,3 +95,8 @@ cc_test( ...@@ -95,3 +95,8 @@ cc_test(
test_cache test_cache
SRCS test_cache.cc SRCS test_cache.cc
DEPS gtest cache) DEPS gtest cache)
# Registers the strided_memcpy_test target, built from strided_memcpy_test.cc,
# with device_context and memory listed as its dependencies.
cc_test(
strided_memcpy_test
SRCS strided_memcpy_test.cc
DEPS device_context memory)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle { namespace phi {
namespace operators { namespace tests {
TEST(StridedMemcpy, CPUCrop) { TEST(StridedMemcpy, CPUCrop) {
// clang-format off // clang-format off
...@@ -29,14 +29,15 @@ TEST(StridedMemcpy, CPUCrop) { ...@@ -29,14 +29,15 @@ TEST(StridedMemcpy, CPUCrop) {
}; };
// clang-format on // clang-format on
framework::DDim src_stride({5, 1}); phi::DDim src_stride({5, 1});
int dst[4]; int dst[4];
framework::DDim dst_dim({2, 2}); phi::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1}); phi::DDim dst_stride({2, 1});
phi::CPUContext ctx; phi::CPUContext ctx;
StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); phi::funcs::StridedMemcpy<int>(
ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
ASSERT_EQ(1, dst[0]); ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]); ASSERT_EQ(2, dst[1]);
...@@ -54,13 +55,15 @@ TEST(StridedMemcpy, CPUConcat) { ...@@ -54,13 +55,15 @@ TEST(StridedMemcpy, CPUConcat) {
int dst[8]; int dst[8];
framework::DDim src_stride({2, 1}); phi::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2}); phi::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1}); phi::DDim dst_stride({4, 1});
phi::CPUContext ctx; phi::CPUContext ctx;
StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst); phi::funcs::StridedMemcpy<int>(
StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2); ctx, src, src_stride, dst_dim, dst_stride, dst);
phi::funcs::StridedMemcpy<int>(
ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
// clang-format off // clang-format off
int expect_dst[] = { int expect_dst[] = {
...@@ -83,8 +86,8 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -83,8 +86,8 @@ TEST(StridedMemcpy, GPUCrop) {
}; };
// clang-format on // clang-format on
platform::CUDAPlace gpu0(0); phi::GPUPlace gpu0(0);
platform::CPUPlace cpu; phi::CPUPlace cpu;
phi::GPUContext ctx(gpu0); phi::GPUContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
...@@ -92,24 +95,24 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -92,24 +95,24 @@ TEST(StridedMemcpy, GPUCrop) {
.get()); .get());
ctx.PartialInitWithAllocator(); ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src)); auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1}); phi::DDim src_stride({5, 1});
int dst[4]; int dst[4];
auto dst_allocation = memory::Alloc(gpu0, sizeof(dst)); auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr()); int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
framework::DDim dst_dim({2, 2}); phi::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1}); phi::DDim dst_stride({2, 1});
StridedMemcpy<int>( phi::funcs::StridedMemcpy<int>(
ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait(); ctx.Wait();
ASSERT_EQ(1, dst[0]); ASSERT_EQ(1, dst[0]);
...@@ -126,30 +129,31 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -126,30 +129,31 @@ TEST(StridedMemcpy, GPUConcat) {
}; };
// clang-format on // clang-format on
platform::CUDAPlace gpu0(0); phi::GPUPlace gpu0(0);
platform::CPUPlace cpu; phi::CPUPlace cpu;
phi::GPUContext ctx(gpu0); phi::GPUContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream()) .GetAllocator(gpu0, ctx.stream())
.get()); .get());
ctx.PartialInitWithAllocator(); ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8]; int dst[8];
auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst)); auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr()); int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
framework::DDim src_stride({2, 1}); phi::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2}); phi::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1}); phi::DDim dst_stride({4, 1});
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); phi::funcs::StridedMemcpy<int>(
StridedMemcpy<int>( ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
phi::funcs::StridedMemcpy<int>(
ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2); ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait(); ctx.Wait();
// clang-format off // clang-format off
...@@ -164,5 +168,5 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -164,5 +168,5 @@ TEST(StridedMemcpy, GPUConcat) {
} }
#endif #endif
} // namespace operators } // namespace tests
} // namespace paddle } // namespace phi
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册