From 0d8ddf9fce71a909fa88f4b6e20b6faf1037beb5 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Wed, 7 Dec 2022 10:41:52 +0800 Subject: [PATCH] modify d2d copy to xpu::copy in xpu kernel, test=kunlun (#48710) --- .../fluid/operators/reader/buffered_reader.cc | 7 ++++- paddle/phi/backends/xpu/xpu_info.cc | 19 ++++-------- paddle/phi/kernels/reshape_grad_kernel.cc | 22 ++++++++++++++ paddle/phi/kernels/reshape_kernel.cc | 30 +++++++++++++++++++ paddle/phi/kernels/xpu/gather_nd_kernel.cc | 12 ++++---- .../kernels/xpu/generate_proposals_kernel.cc | 20 ++++++------- paddle/phi/kernels/xpu/scatter_kernel.cc | 5 +++- paddle/phi/kernels/xpu/tile_kernel.cc | 6 +++- .../unittests/xpu/get_test_cover_info.py | 2 ++ .../unittests/xpu/test_reshape2_op_xpu.py | 1 + 10 files changed, 91 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 5bb8a29ce3..ddb85f3cfb 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -417,8 +417,13 @@ void BufferedReader::ReadAsync(size_t i) { // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe // KL3 if ((platform::is_xpu_place(cpu_place))) { - memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); platform::XPUStreamSync(stream_.get()); + char *tmp = new char[size]; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy( + tmp, cpu_ptr, size, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy( + xpu_ptr, tmp, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + delete[] tmp; } else { memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); } diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index d084afee22..89ebce438a 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -169,19 +169,12 @@ void MemcpySyncD2D(void* dst, const phi::XPUContext& dev_ctx) { int dev_id = GetXPUCurrentDeviceId(); if (dst_place.device == dev_id && src_place.device == dev_id) { - dev_ctx.Wait(); - char* tmp = new char[count]; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(tmp, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, tmp, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - delete[] tmp; - // PADDLE_ENFORCE_XDNN_SUCCESS( - // baidu::xpu::api::copy(dev_ctx.x_context(), - // static_cast(src), - // static_cast(dst), - // count), - // "copy "); + PADDLE_ENFORCE_XDNN_SUCCESS( + baidu::xpu::api::copy(dev_ctx.x_context(), + static_cast(src), + static_cast(dst), + count), + "copy "); } else { PADDLE_ENFORCE_XPU_SUCCESS( xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count)); diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index c4b92c4f76..ffd616054c 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -17,6 +17,9 @@ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#endif namespace phi { @@ -29,6 +32,25 @@ void ReshapeGradKernel(const Context& dev_ctx, x_grad->Resize(x_dims); } +#ifdef PADDLE_WITH_XPU +template <> +void ReshapeGradKernel(const XPUContext& dev_ctx, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto x_dims = x_grad->dims(); + dev_ctx.Alloc(x_grad, out_grad.dtype()); + auto* src_ptr = out_grad.data(); + auto* dst_ptr = x_grad->data(); + auto size = out_grad.numel() * paddle::experimental::SizeOf(out_grad.dtype()); + int ret = xpu::copy(dev_ctx.x_context(), + reinterpret_cast(src_ptr), + reinterpret_cast(dst_ptr), + size); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy"); + x_grad->Resize(x_dims); +} +#endif + template void ReshapeDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 632a63c9ab..a792322a44 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -19,6 +19,9 @@ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#endif namespace phi { @@ -42,6 +45,33 @@ void ReshapeKernel(const Context& dev_ctx, out->ResetLoD(x.lod()); } +#ifdef PADDLE_WITH_XPU +template <> +void ReshapeKernel(const XPUContext& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out) { + MetaTensor meta_out(out); + InferMetaFromVecValue(x, shape.GetData(), &meta_out); + if (x.initialized() && x.Holder() == out->Holder()) { + dev_ctx.Alloc(out, x.dtype()); + return; + } + dev_ctx.Alloc(out, x.dtype()); + auto dims = out->dims(); + auto* src_ptr = x.data(); + auto* dst_ptr = out->data(); + auto size = x.numel() * paddle::experimental::SizeOf(x.dtype()); + int ret = xpu::copy(dev_ctx.x_context(), + reinterpret_cast(src_ptr), + reinterpret_cast(dst_ptr), + size); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy"); + out->Resize(dims); + out->ResetLoD(x.lod()); +} +#endif + template void ReshapeWithXShape(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/xpu/gather_nd_kernel.cc b/paddle/phi/kernels/xpu/gather_nd_kernel.cc index d7d23fa17c..8241e5109d 100644 --- a/paddle/phi/kernels/xpu/gather_nd_kernel.cc +++ b/paddle/phi/kernels/xpu/gather_nd_kernel.cc @@ -30,7 +30,10 @@ void GatherNdKernel(const Context &ctx, if (x.numel() == 0) return; if (index.numel() == 0) { - phi::Copy(ctx, x, phi::XPUPlace(), true, out); + out->Resize(x.dims()); + ctx.template Alloc(out); + int r = xpu::copy(ctx.x_context(), x.data(), out->data(), x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); return; } @@ -69,12 +72,7 @@ void GatherNdKernel(const Context &ctx, x_vec, index_shape); } - PADDLE_ENFORCE_EQ( - ret, - XPU_SUCCESS, - phi::errors::External("XPU gather_nd kernel return wrong value[%d %s]", - ret, - XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather_nd"); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc index bf7f3e90bf..f19d19241e 100644 --- a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc +++ b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc @@ -372,16 +372,16 @@ void GenerateProposalsKernel(const Context& dev_ctx, DenseTensor& proposals = tensor_pair.first; DenseTensor& nscores = tensor_pair.second; - paddle::memory::Copy(place, - rpn_rois->data() + num_proposals * 4, - place, - proposals.data(), - sizeof(T) * proposals.numel()); - paddle::memory::Copy(place, - rpn_roi_probs->data() + num_proposals, - place, - nscores.data(), - sizeof(T) * scores.numel()); + r = xpu::copy(dev_ctx.x_context(), + proposals.data(), + rpn_rois->data() + num_proposals * 4, + proposals.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + r = xpu::copy(dev_ctx.x_context(), + nscores.data(), + rpn_roi_probs->data() + num_proposals, + nscores.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); if (dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); diff --git a/paddle/phi/kernels/xpu/scatter_kernel.cc b/paddle/phi/kernels/xpu/scatter_kernel.cc index 988b8a7156..18e4e03dd2 100644 --- a/paddle/phi/kernels/xpu/scatter_kernel.cc +++ b/paddle/phi/kernels/xpu/scatter_kernel.cc @@ -27,7 +27,10 @@ void ScatterKernel(const Context &ctx, const DenseTensor &updates, bool overwrite, DenseTensor *out) { - phi::Copy(ctx, x, ctx.GetPlace(), false, out); + out->Resize(x.dims()); + ctx.template Alloc(out); + int ret = xpu::copy(ctx.x_context(), x.data(), out->data(), x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy"); // Apply ScatterUpdate: Out[index] = Updates[:] const auto &index_type = index.dtype(); bool index_type_match = diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc index 022e355f4c..b9383f108e 100644 --- a/paddle/phi/kernels/xpu/tile_kernel.cc +++ b/paddle/phi/kernels/xpu/tile_kernel.cc @@ -102,7 +102,11 @@ void TileKernel(const Context& dev_ctx, std::vector temp(repeat_times.size(), 1); if (repeat_times == temp) { - phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(x.dims()); + dev_ctx.template Alloc(out); + int r = + xpu::copy(dev_ctx.x_context(), x.data(), out->data(), x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); return; } diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index 22131010d9..afaf3b2a52 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -94,6 +94,8 @@ xpu_test_op_type_white_list = [ "c_embedding_float32", # unittests of collective ops do not using xpu testing framework "c_sync_comm_stream_float32", "c_sync_calc_stream_float32", + "reshape2_bool", + "reshape2_grad_bool", ] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py index 01773e8a28..e85ccf0cc4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py @@ -41,6 +41,7 @@ class XPUTestReshapeOp(XPUOpTestWrapper): def setUp(self): self.init_data() self.op_type = "reshape2" + self.dtype = self.in_type self.init_test_input() self.init_test_output() self.init_attrs() -- GitLab