From bdf5834e259b9a2afc04749ff4f8347276074ad3 Mon Sep 17 00:00:00 2001
From: taixiurong
Date: Mon, 13 Dec 2021 16:49:41 +0800
Subject: [PATCH] update xpu_memcpy (#38049)

---
 .../framework/details/nan_inf_utils_detail.cc |  6 +++--
 paddle/fluid/memory/memcpy.cc                 | 14 ++++++++++--
 .../amp/check_finite_and_unscale_op_xpu.cc    |  6 +++--
 .../amp/update_loss_scaling_op_xpu.cc         | 22 ++++++++++++-------
 .../fluid/operators/masked_select_op_xpu.cc   | 13 +++--------
 paddle/fluid/operators/range_op_xpu.cc        | 10 ++++-----
 paddle/fluid/operators/where_index_op_xpu.cc  | 13 +++--------
 paddle/fluid/pybind/tensor_py.h               |  5 +++--
 8 files changed, 47 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 031c2a27660..a5787ac3966 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -353,8 +353,10 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     }
 
     float* cpu_data = new float[tensor->numel()];
-    xpu_memcpy(cpu_data, tensor->data<float>(), tensor->numel() * sizeof(float),
-               XPU_DEVICE_TO_HOST);
+    memory::Copy(platform::CPUPlace(), static_cast<void*>(cpu_data),
+                 BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
+                 static_cast<const void*>(tensor->data<float>()),
+                 tensor->numel() * sizeof(float));
     bool flag = false;
     for (int i = 0; i < tensor->numel(); i++) {
       if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index fe38200efa8..4de81435881 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -136,6 +136,11 @@ void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
                         "Baidu Kunlun Card is properly installed.",
                         ret));
   }
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.GetByPlace(src_place);
+  dev_ctx->Wait();
+
   ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
   PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                     platform::errors::External(
@@ -182,6 +187,11 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
                           "Baidu Kunlun Card is properly installed.",
                           ret));
     void* tmp = malloc(num);
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.GetByPlace(src_place);
+    dev_ctx->Wait();
+
     ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
     PADDLE_ENFORCE_EQ(
         ret, XPU_SUCCESS,
         platform::errors::External(
@@ -214,8 +224,8 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
   } else {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto* dev_ctx = pool.GetByPlace(src_place);
-    dev_ctx->Wait();
-    int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
+    int ret = xpu::copy(dev_ctx->x_context(), static_cast<const int8_t*>(src),
+                        static_cast<int8_t*>(dst), num);
     PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                       platform::errors::External(
                           "XPU API return wrong value[%d %s]", ret,
                           XPUAPIErrorMsg[ret]));
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
index ad726f2bf19..5d5e13e848a 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -40,8 +40,10 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
 
     MPDType cpu_scale_data;
     if (platform::is_xpu_place(scale->place())) {
-      xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType),
-                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+      memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data),
+                   BOOST_GET_CONST(platform::XPUPlace, scale->place()),
+                   static_cast<const void*>(scale_data), sizeof(MPDType));
+
     } else {
       cpu_scale_data = (*scale_data);
     }
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
index d9b3dcd6c15..fa7985e186d 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
@@ -42,8 +42,10 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
     const bool* found_inf_data = found_inf->data<bool>();
     bool cpu_found_inf_data = false;
     if (platform::is_xpu_place(found_inf->place())) {
-      xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool),
-                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+      memory::Copy(platform::CPUPlace(),
+                   static_cast<void*>(&cpu_found_inf_data),
+                   BOOST_GET_CONST(platform::XPUPlace, found_inf->place()),
+                   static_cast<const void*>(found_inf_data), sizeof(bool));
     } else {
       cpu_found_inf_data = (*found_inf_data);
     }
@@ -94,22 +96,26 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
     int cpu_good_in_data;
     MPDType cpu_pre_loss_scaling_data;
     if (platform::is_xpu_place(bad_in->place())) {
-      xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int),
-                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+      memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_bad_in_data),
+                   BOOST_GET_CONST(platform::XPUPlace, bad_in->place()),
+                   static_cast<const void*>(bad_in_data), sizeof(int));
     } else {
       cpu_bad_in_data = (*bad_in_data);
     }
     if (platform::is_xpu_place(good_in->place())) {
-      xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int),
-                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+      memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_good_in_data),
+                   BOOST_GET_CONST(platform::XPUPlace, good_in->place()),
+                   static_cast<const void*>(good_in_data), sizeof(int));
     } else {
       cpu_good_in_data = (*good_in_data);
     }
     if (platform::is_xpu_place(pre_loss_scaling->place())) {
-      xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data,
-                 sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+      memory::Copy(
+          platform::CPUPlace(), static_cast<void*>(&cpu_pre_loss_scaling_data),
+          BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()),
+          static_cast<const void*>(pre_loss_scaling_data), sizeof(MPDType));
     } else {
       cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
     }
diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc
index 665ac937fdc..d86ad8f89b9 100644
--- a/paddle/fluid/operators/masked_select_op_xpu.cc
+++ b/paddle/fluid/operators/masked_select_op_xpu.cc
@@ -48,16 +48,9 @@ class MaskedSelectXPUKernel : public framework::OpKernel<T> {
                           "XPU nonzero_count kernel return wrong value[%d %s]",
                           ret, XPUAPIErrorMsg[ret]));
 
-    if (dev_ctx.x_context()->xpu_stream) {
-      dev_ctx.Wait();
-    }
-    ret = xpu_memcpy(static_cast<void*>(&out_size_cpu),
-                     static_cast<void*>(out_size), sizeof(int32_t),
-                     XPU_DEVICE_TO_HOST);
-    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
-                      platform::errors::External("XPU xpu_memcpy return wrong "
-                                                 "value[%d %s]",
-                                                 ret, XPUAPIErrorMsg[ret]));
+    memory::Copy(platform::CPUPlace(), static_cast<void*>(&out_size_cpu),
+                 BOOST_GET_CONST(platform::XPUPlace, mask->place()),
+                 static_cast<void*>(out_size), sizeof(int32_t));
 
     framework::DDim out_dim{out_size_cpu};
     out->Resize(out_dim);
diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc
index b450ece4528..1d4de779781 100644
--- a/paddle/fluid/operators/range_op_xpu.cc
+++ b/paddle/fluid/operators/range_op_xpu.cc
@@ -50,12 +50,10 @@ class XPURangeKernel : public framework::OpKernel<T> {
       out_cpu_data_ptr[i] = value;
       value += step;
     }
-    int ret = xpu_memcpy(out_data, out_cpu_data_ptr, out->numel() * sizeof(T),
-                         XPUMemcpyKind::XPU_HOST_TO_DEVICE);
-    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
-                      platform::errors::External("XPU xpu_memcpy return wrong "
-                                                 "value[%d %s]",
-                                                 ret, XPUAPIErrorMsg[ret]));
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
+                 static_cast<void*>(out_data), platform::CPUPlace(),
+                 static_cast<void*>(out_cpu_data_ptr),
+                 out->numel() * sizeof(T));
   }
 };
 
diff --git a/paddle/fluid/operators/where_index_op_xpu.cc b/paddle/fluid/operators/where_index_op_xpu.cc
index 58f09e7381e..53ddefbbe0c 100644
--- a/paddle/fluid/operators/where_index_op_xpu.cc
+++ b/paddle/fluid/operators/where_index_op_xpu.cc
@@ -43,16 +43,9 @@ class WhereIndexXPUKernel : public framework::OpKernel<T> {
             "XPU nonzero_count kernel return wrong value[%d %s] in WhereIndex",
             ret, XPUAPIErrorMsg[ret]));
 
-    if (dev_ctx.x_context()->xpu_stream) {
-      dev_ctx.Wait();
-    }
-    ret = xpu_memcpy(static_cast<void*>(&true_num_cpu),
-                     static_cast<void*>(true_num), sizeof(int32_t),
-                     XPU_DEVICE_TO_HOST);
-    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
-                      platform::errors::External("XPU xpu_memcpy return wrong "
-                                                 "value[%d %s]",
-                                                 ret, XPUAPIErrorMsg[ret]));
+    memory::Copy(platform::CPUPlace(), static_cast<void*>(&true_num_cpu),
+                 BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
+                 static_cast<void*>(true_num), sizeof(int32_t));
 
     out->Resize(
         framework::make_ddim({static_cast<int64_t>(true_num_cpu), rank}));
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 935a6437338..df9ba02eadf 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -307,8 +307,9 @@ void SetTensorFromPyArrayT(
     platform::XPUDeviceGuard guard(
         BOOST_GET_CONST(platform::XPUPlace, tmp_place).device);
     auto dst = self->mutable_data<T>(place);
-    xpu_memcpy(dst, array.data(), array.nbytes(),
-               XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, tmp_place),
+                 static_cast<void*>(dst), platform::CPUPlace(),
+                 static_cast<const void*>(array.data()), array.nbytes());
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Cannot use XPUPlace in CPU/GPU version, "
-- 
GitLab