Unverified commit 17318c1a, authored by Huang Jiyi and committed via GitHub

[PHI decoupling] move strided_memcpy.h to phi (#50346)

* decouple strided_memcpy

* move strided_memcpy

* move strided_memcpy to phi

* fix namespace

* update

* fix gpu compile bugs
Parent 90650534
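The diff below changes every call site from paddle::operators::StridedMemcpy / StridedNumelCopyWithAxis / StridedMemcpyWithAxis0 to the phi::funcs namespace and moves the header to paddle/phi/kernels/funcs/strided_memcpy.h. The following minimal sketch (not part of the diff, modeled on the CPUCrop unit test that this commit relocates; the function name CropExample and the sample values are illustrative only) shows what a call site looks like after the move:

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"

void CropExample() {
  // 3 x 5 row-major source; copy the 2 x 2 block that starts at column 1.
  int src[] = {0, 1, 2, 0, 0,
               0, 3, 4, 0, 0,
               0, 0, 0, 0, 0};
  int dst[4];

  phi::CPUContext ctx;
  phi::DDim src_stride({5, 1});
  phi::DDim dst_dim({2, 2});
  phi::DDim dst_stride({2, 1});

  // Before this PR the call was paddle::operators::StridedMemcpy<int>(...);
  // after the move the same helper is reached through phi::funcs.
  phi::funcs::StridedMemcpy<int>(
      ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
  // dst now holds {1, 2, 3, 4}.
}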
......@@ -20,7 +20,7 @@
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/parallel_context.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#endif
......@@ -103,7 +103,7 @@ static void SplitTensorsForAllReduce(
}
// Sometimes direct copies will be faster
if (p_dense_tensors->size() < 10) {
operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
phi::funcs::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
} else {
operators::math::SplitFunctor<DeviceContext, T> split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
......
......@@ -188,7 +188,6 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
if (WITH_GPU)
......
......@@ -20,10 +20,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/concat_kernel.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators {
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators { // Internal
......
......@@ -24,11 +24,11 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators {
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators {
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators {
......@@ -140,12 +140,12 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
static_cast<int>(lod[0][i] + offset_data[i]),
static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
StridedMemcpy<T>(ctx.device_context(),
in_t.data<T>(),
in_stride,
in_t.dims(),
out_stride,
out->data<T>() + out_offset);
phi::funcs::StridedMemcpy<T>(ctx.device_context(),
in_t.data<T>(),
in_stride,
in_t.dims(),
out_stride,
out->data<T>() + out_offset);
out_offset += length_data[i] * in_stride[0];
}
}
......@@ -201,12 +201,12 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
static_cast<int>(lod[0][i] + offset_data[i]),
static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
StridedMemcpy<T>(ctx.device_context(),
out_grad_t.data<T>(),
out_grad_stride,
out_grad_t.dims(),
x_grad_stride,
x_grad_t.data<T>());
phi::funcs::StridedMemcpy<T>(ctx.device_context(),
out_grad_t.data<T>(),
out_grad_stride,
out_grad_t.dims(),
x_grad_stride,
x_grad_t.data<T>());
}
}
}
......
......@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators {
......@@ -96,12 +96,13 @@ class SppKernel : public framework::OpKernel<T> {
out_level.Resize(output_flatten_shape);
// concat
auto out_level_stride = phi::stride(out_level.dims());
StridedMemcpy<T>(context.template device_context<DeviceContext>(),
out_level.data<T>(),
out_level_stride,
out_level.dims(),
out_stride,
out->data<T>() + output_offset);
phi::funcs::StridedMemcpy<T>(
context.template device_context<DeviceContext>(),
out_level.data<T>(),
out_level_stride,
out_level.dims(),
out_stride,
out->data<T>() + output_offset);
output_offset += out_level.dims()[1] * out_level_stride[1];
}
}
......@@ -150,19 +151,21 @@ class SppGradKernel : public framework::OpKernel<T> {
outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
auto flatten_stride = phi::stride(out_level.dims());
// memcpy
StridedMemcpy<T>(context.template device_context<DeviceContext>(),
out->data<T>() + out_offset,
out_stride,
out_level.dims(),
flatten_stride,
out_level.data<T>());
phi::funcs::StridedMemcpy<T>(
context.template device_context<DeviceContext>(),
out->data<T>() + out_offset,
out_stride,
out_level.dims(),
flatten_stride,
out_level.data<T>());
StridedMemcpy<T>(context.template device_context<DeviceContext>(),
out_grad->data<T>() + out_offset,
out_stride,
outgrad_level.dims(),
flatten_stride,
outgrad_level.data<T>());
phi::funcs::StridedMemcpy<T>(
context.template device_context<DeviceContext>(),
out_grad->data<T>() + out_offset,
out_stride,
outgrad_level.dims(),
flatten_stride,
outgrad_level.data<T>());
out_offset += out_level.dims()[1] * out_stride[1];
// flatten backward to nchw
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace paddle {
namespace operators {
......
......@@ -28,9 +28,9 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -724,14 +724,13 @@ void _concatCompute(const std::vector<phi::DenseTensor> &ins,
for (auto &in : ins) {
auto in_stride = phi::stride_numel(in.dims());
auto out_stride = phi::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>(
ctx,
axis,
out->data<T>() + output_offset,
out_stride,
in.data<T>(),
in_stride,
in_stride[axis]);
phi::funcs::StridedNumelCopyWithAxis<T>(ctx,
axis,
out->data<T>() + output_offset,
out_stride,
in.data<T>(),
in_stride,
in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/concat_kernel.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
......@@ -24,6 +23,7 @@
#include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi {
......@@ -86,14 +86,13 @@ void ConcatKernel(const Context& dev_ctx,
}
auto in_stride = phi::stride_numel(in->dims());
auto out_stride = phi::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>(
dev_ctx,
axis,
out->data<T>() + output_offset,
out_stride,
in->data<T>(),
in_stride,
in_stride[axis]);
phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
axis,
out->data<T>() + output_offset,
out_stride,
in->data<T>(),
in_stride,
in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -14,11 +14,15 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/device_context.h"
namespace paddle {
namespace operators {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif
namespace phi {
namespace funcs {
namespace detail {
template <typename T, int Rank>
......@@ -26,25 +30,25 @@ struct StridedMemcpyFunctor;
template <typename T>
struct StridedMemcpyFunctor<T, 0> {
void operator()(const platform::DeviceContext& dev_ctx,
void operator()(const phi::DeviceContext& dev_ctx,
const T* src,
const int64_t* src_stride,
const int64_t* dst_dim,
const int64_t* dst_stride,
T* dst) const {
auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) {
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
memory::Copy(
paddle::memory::Copy(
gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
#else
PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with GPU."));
phi::errors::Unavailable("Paddle is not compiled with GPU."));
#endif
}
}
......@@ -52,29 +56,30 @@ struct StridedMemcpyFunctor<T, 0> {
template <typename T>
struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx,
void operator()(const phi::DeviceContext& dev_ctx,
const T* src,
const int64_t* src_stride,
const int64_t* dst_dim,
const int64_t* dst_stride,
T* dst) const {
auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) {
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
paddle::memory::Copy(
cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
memory::Copy(gpu_place,
dst,
gpu_place,
src,
sizeof(T) * dst_dim[0],
cuda_ctx.stream());
paddle::memory::Copy(gpu_place,
dst,
gpu_place,
src,
sizeof(T) * dst_dim[0],
cuda_ctx.stream());
#else
PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with GPU."));
phi::errors::Unavailable("Paddle is not compiled with GPU."));
#endif
}
}
......@@ -82,7 +87,7 @@ struct StridedMemcpyFunctor<T, 1> {
template <typename T, int Rank>
struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx,
void operator()(const phi::DeviceContext& dev_ctx,
const T* src,
const int64_t* src_stride,
const int64_t* dst_dim,
......@@ -99,10 +104,10 @@ struct StridedMemcpyFunctor {
template <typename T>
struct StridedCopyDimVisitor {
StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx,
StridedCopyDimVisitor(const phi::DeviceContext& dev_ctx,
const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_stride,
const phi::DDim& src_stride,
const phi::DDim& dst_stride,
T* dst)
: dev_ctx_(dev_ctx),
src_(src),
......@@ -111,7 +116,7 @@ struct StridedCopyDimVisitor {
dst_(dst) {}
template <int D>
void operator()(const framework::Dim<D>& dst_dim) const {
void operator()(const phi::Dim<D>& dst_dim) const {
StridedMemcpyFunctor<T, D> functor;
functor(dev_ctx_,
src_,
......@@ -121,13 +126,13 @@ struct StridedCopyDimVisitor {
dst_);
}
const platform::DeviceContext& dev_ctx_;
const phi::DeviceContext& dev_ctx_;
const T* src_;
const framework::DDim& src_stride_;
const framework::DDim& dst_stride_;
const phi::DDim& src_stride_;
const phi::DDim& dst_stride_;
T* dst_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -12,11 +12,13 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/detail/strided_memcpy.h"
namespace paddle {
namespace operators {
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
namespace funcs {
// Strided memory copy from src to dst.
//
......@@ -33,13 +35,13 @@ namespace operators {
// NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
// `dev_ctx.Wait()`.
template <typename T>
inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
inline void StridedMemcpy(const phi::DeviceContext& dev_ctx,
const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_dim,
const framework::DDim& dst_stride,
const phi::DDim& src_stride,
const phi::DDim& dst_dim,
const phi::DDim& dst_stride,
T* dst) {
paddle::operators::detail::StridedCopyDimVisitor<T> func(
detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst);
dst_dim.apply_visitor(func);
}
......@@ -52,12 +54,12 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
// NOTE: The src and dst tensor should have the same elements
// except the specified axis.
template <typename T>
inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
int64_t axis,
T* dst,
const framework::DDim& dst_stride_numel,
const phi::DDim& dst_stride_numel,
const T* src,
const framework::DDim& src_stride_numel,
const phi::DDim& src_stride_numel,
int64_t size) {
int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
int64_t src_after = src_stride_numel[axis];
......@@ -66,7 +68,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
PADDLE_ENFORCE_EQ(src_stride_numel.size(),
dst_stride_numel.size(),
platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Source and destination tensor should have the same "
"dimension size, but source tensor dimension size is "
"%u, destination tensor size is %u.",
......@@ -78,7 +80,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
PADDLE_ENFORCE_EQ(
src_stride_numel[i] / src_stride_numel[axis],
dst_stride_numel[i] / dst_stride_numel[axis],
platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Source and destination tensor should have the same number of "
"elements except the specified axis, but the source elements "
"number is %d, destination elements number is %d.",
......@@ -90,7 +92,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
PADDLE_ENFORCE_EQ(
src_stride_numel[i],
dst_stride_numel[i],
platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Source and destination tensor should have the same number of "
"elements except the specified axis, but the source elements "
"number is %d, destination elements number is %d.",
......@@ -100,44 +102,44 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
}
for (int64_t i = 0; i < before; ++i) {
if (platform::is_cpu_place(place)) {
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
memory::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
sizeof(T) * size);
paddle::memory::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
sizeof(T) * size);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
memory::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
sizeof(T) * size,
cuda_ctx.stream());
paddle::memory::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
sizeof(T) * size,
cuda_ctx.stream());
#elif defined(PADDLE_WITH_ASCEND_CL)
auto& npu_place = place;
auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
memory::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
sizeof(T) * size,
npu_ctx.stream());
paddle::memory::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
sizeof(T) * size,
npu_ctx.stream());
#elif defined(PADDLE_WITH_MLU)
auto& mlu_place = place;
auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
memory::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
sizeof(T) * size,
mlu_ctx.stream());
paddle::memory::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
sizeof(T) * size,
mlu_ctx.stream());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Paddle is not compiled with GPU."));
PADDLE_THROW(
phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
#endif
}
}
......@@ -145,11 +147,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
template <typename T>
inline void StridedMemcpyWithAxis0(
const platform::DeviceContext& dev_ctx,
const phi::DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& shape_refer,
std::vector<phi::DenseTensor*>* outputs) {
const framework::DDim in_stride = stride_numel(input.dims());
const phi::DDim in_stride = stride_numel(input.dims());
const int axis = 0;
size_t input_offset = 0;
......@@ -169,5 +171,5 @@ inline void StridedMemcpyWithAxis0(
}
}
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/concat_kernel.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
......@@ -24,6 +23,7 @@
#include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi {
......@@ -85,14 +85,13 @@ void ConcatKernel(const Context& dev_ctx,
}
auto in_stride = phi::stride_numel(in->dims());
auto out_stride = phi::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>(
dev_ctx,
axis,
out->data<T>() + output_offset,
out_stride,
in->data<T>(),
in_stride,
in_stride[axis]);
phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
axis,
out->data<T>() + output_offset,
out_stride,
in->data<T>(),
in_stride,
in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
......
......@@ -13,10 +13,10 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/concat_grad_kernel.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi {
......@@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx,
if (axis == 0 && outs.size() < 10) {
std::vector<const DenseTensor*> ref_shape;
ref_shape.insert(ref_shape.begin(), x.begin(), x.end());
paddle::operators::StridedMemcpyWithAxis0<T>(
phi::funcs::StridedMemcpyWithAxis0<T>(
dev_ctx, out_grad, ref_shape, &outputs);
} else {
phi::funcs::SplitFunctor<Context, T> split_functor;
......
......@@ -15,11 +15,11 @@
#pragma once
#include "paddle/phi/kernels/split_kernel.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
namespace phi {
template <typename T, typename Context>
......@@ -37,8 +37,7 @@ void SplitKernel(const Context& dev_ctx,
int axis = axis_scalar.to<int>();
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && outs.size() < 10) {
paddle::operators::StridedMemcpyWithAxis0<T>(
dev_ctx, x, shape_refer, &outs);
phi::funcs::StridedMemcpyWithAxis0<T>(dev_ctx, x, shape_refer, &outs);
} else {
phi::funcs::SplitFunctor<Context, T> functor;
functor(dev_ctx, x, shape_refer, axis, &outs);
......
......@@ -95,3 +95,8 @@ cc_test(
test_cache
SRCS test_cache.cc
DEPS gtest cache)
cc_test(
strided_memcpy_test
SRCS strided_memcpy_test.cc
DEPS device_context memory)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle {
namespace operators {
namespace phi {
namespace tests {
TEST(StridedMemcpy, CPUCrop) {
// clang-format off
......@@ -29,14 +29,15 @@ TEST(StridedMemcpy, CPUCrop) {
};
// clang-format on
framework::DDim src_stride({5, 1});
phi::DDim src_stride({5, 1});
int dst[4];
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1});
phi::DDim dst_dim({2, 2});
phi::DDim dst_stride({2, 1});
phi::CPUContext ctx;
StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
phi::funcs::StridedMemcpy<int>(
ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]);
......@@ -54,13 +55,15 @@ TEST(StridedMemcpy, CPUConcat) {
int dst[8];
framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1});
phi::DDim src_stride({2, 1});
phi::DDim dst_dim({2, 2});
phi::DDim dst_stride({4, 1});
phi::CPUContext ctx;
StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
phi::funcs::StridedMemcpy<int>(
ctx, src, src_stride, dst_dim, dst_stride, dst);
phi::funcs::StridedMemcpy<int>(
ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
// clang-format off
int expect_dst[] = {
......@@ -83,8 +86,8 @@ TEST(StridedMemcpy, GPUCrop) {
};
// clang-format on
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
phi::GPUPlace gpu0(0);
phi::CPUPlace cpu;
phi::GPUContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
......@@ -92,24 +95,24 @@ TEST(StridedMemcpy, GPUCrop) {
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src));
auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1});
phi::DDim src_stride({5, 1});
int dst[4];
auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1});
phi::DDim dst_dim({2, 2});
phi::DDim dst_stride({2, 1});
StridedMemcpy<int>(
phi::funcs::StridedMemcpy<int>(
ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait();
ASSERT_EQ(1, dst[0]);
......@@ -126,30 +129,31 @@ TEST(StridedMemcpy, GPUConcat) {
};
// clang-format on
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
phi::GPUPlace gpu0(0);
phi::CPUPlace cpu;
phi::GPUContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8];
auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1});
phi::DDim src_stride({2, 1});
phi::DDim dst_dim({2, 2});
phi::DDim dst_stride({4, 1});
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
StridedMemcpy<int>(
phi::funcs::StridedMemcpy<int>(
ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
phi::funcs::StridedMemcpy<int>(
ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait();
// clang-format off
......@@ -164,5 +168,5 @@ TEST(StridedMemcpy, GPUConcat) {
}
#endif
} // namespace operators
} // namespace paddle
} // namespace tests
} // namespace phi