From 03533b0ca02aa46e5ab93abd54320ff1c0abccc2 Mon Sep 17 00:00:00 2001
From: fwenguang <95677191+fwenguang@users.noreply.github.com>
Date: Tue, 19 Apr 2022 14:36:50 +0800
Subject: [PATCH] [MLU] support add callback to stream (#41831)

---
 .../fluid/framework/data_device_transform.cc  |  8 --------
 paddle/fluid/operators/pool_op_mlu.cc         | 15 ++++++++++-----
 paddle/fluid/platform/device/mlu/mlu_stream.h |  1 -
 .../profiler/mlu/cnpapi_data_process.cc       |  1 +
 .../fluid/platform/stream_callback_manager.cc |  6 ++----
 paddle/phi/common/backend.h                   |  6 ++++++
 paddle/phi/core/compat/convert_utils.cc       |  2 ++
 .../fluid/dygraph/varbase_patch_methods.py    |  3 ++-
 tools/dockerfile/Dockerfile.mlu               | 18 +++++++++---------
 9 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 589d09bf81c..1a4f283f511 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
     return;
   }
 
-  // NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
-  if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
-    paddle::framework::TensorCopy(
-        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
-        out);
-    return;
-  }
-
   // NOTE(yy): TransDataDevice should wait for computation of input.
   if (!platform::is_cuda_pinned_place(in.place())) {
     platform::DeviceContextPool::Instance().Get(in.place())->Wait();
diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc
index fa88d128a9a..c1bcf82c332 100644
--- a/paddle/fluid/operators/pool_op_mlu.cc
+++ b/paddle/fluid/operators/pool_op_mlu.cc
@@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
       framework::Tensor extra_device_tensor =
           ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
              {static_cast<int64_t>(extra_input_size)}, dev_ctx);
-      // TODO(fwg): use Async copy, and add a callback to stream that free
-      // host
-      // memory.
-      framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(),
-                                &extra_device_tensor);
+      framework::TensorCopy(extra_host_tensor, ctx.GetPlace(),
+                            &extra_device_tensor);
+      // Increase extra_host_tensor holder_ reference count until the copy
+      // completes.
+      auto increase_ref_count = [extra_host_tensor]() {
+        VLOG(4) << "Finished copying extra_host_tensor["
+                << GetBasePtr(&extra_host_tensor)
+                << "] in mlu pooling kernel.";
+      };
+      dev_ctx.AddStreamCallback(increase_ref_count);
       MLUCnnl::PoolingForward(
           ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
           in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/,
diff --git a/paddle/fluid/platform/device/mlu/mlu_stream.h b/paddle/fluid/platform/device/mlu/mlu_stream.h
index 3f4b27e370f..b20949f3bfe 100644
--- a/paddle/fluid/platform/device/mlu/mlu_stream.h
+++ b/paddle/fluid/platform/device/mlu/mlu_stream.h
@@ -40,7 +40,6 @@ class MLUStream final {
 
   template <typename Callback>
   void AddCallback(Callback&& callback) const {
-    // TODO(mlu): mlu not support AddCallback
     callback_manager_->AddCallback(callback);
   }
 
diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
index eceb5fabe8d..36abf77279d 100644
--- a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
+++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
@@ -202,6 +202,7 @@ CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
   REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
   REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
   REGISTER_RUNTIME_CBID_STR(cnCtxSync);
+  REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
 #undef REGISTER_RUNTIME_CBID_STR
 }
 
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 7148afee273..6fa326d57bc 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -80,10 +80,8 @@ void StreamCallbackManager<Stream>::AddCallback(
 #endif
 
 #if PADDLE_WITH_MLU
-  VLOG(3) << "MLULaunchCallback at stream: " << stream_
-          << " Failed to call MLULaunchCallback, "
-          << "because mlu not support StreamAddCallback yet. "
" - << "function: " << func; + VLOG(3) << "MLULaunchCallback at stream: " << stream_; + cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func); #endif } diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index bfa45869f5f..3e1787cb12c 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -47,6 +47,7 @@ enum class Backend : uint8_t { GPU, XPU, // XPU currently does not exist at the same time as CUDA NPU, // NPU currently does not exist at the same time as CUDA + MLU, // MLU currently does not exist at the same time as CUDA // the third library backend MKLDNN, @@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::NPU: os << "NPU"; break; + case Backend::MLU: + os << "MLU"; + break; case Backend::MKLDNN: os << "MKLDNN"; break; @@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::XPU; } else if (s == std::string("NPU")) { return Backend::NPU; + } else if (s == std::string("MLU")) { + return Backend::MLU; } else if (s == std::string("MKLDNN")) { return Backend::MKLDNN; } else if (s == std::string("GPUDNN")) { diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 4fa11ac7860..3b49b40dff1 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) { return Backend::NPU; } else if (allocation_type == phi::AllocationType::IPU) { return Backend::IPU; + } else if (allocation_type == phi::AllocationType::MLU) { + return Backend::MLU; } else if (allocation_type == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 03045579e71..db6af87635c 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -271,7 +271,8 @@ def monkey_patch_varbase(): if _grad_scalar: # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. self = _grad_scalar.scale(self) - if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu( + ) or paddle.is_compiled_with_mlu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) if framework._in_eager_mode_: diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu index f7823738afc..07535a63743 100644 --- a/tools/dockerfile/Dockerfile.mlu +++ b/tools/dockerfile/Dockerfile.mlu @@ -2,9 +2,9 @@ # Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions # # Build: -# - CNTOOLKIT_VERSION 2.6.5-1 -# - CNNL_VERSION 1.8.3-1 -# - CNCL_VERSION 1.0.2-1 +# - CNTOOLKIT_VERSION 2.8.1-1 +# - CNNL_VERSION 1.9.3-1 +# - CNCL_VERSION 1.0.4-1 # # Download three packages from FTP (need to connect cambricon AE to get FTP url) # - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb @@ -21,9 +21,9 @@ # (get cncl pkg) # # docker build -f Dockerfile.mlu \ -# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \ -# --build-arg CNNL_VERSION=1.8.3-1 \ -# --build-arg CNCL_VERSION=1.0.2-1 \ +# --build-arg CNTOOLKIT_VERSION=2.8.1-1 \ +# --build-arg CNNL_VERSION=1.9.3-1 \ +# --build-arg CNCL_VERSION=1.0.4-1 \ # -t paddlepaddle/paddle:latest-dev-mlu . 
 #
 # without mlu device:
@@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors
 
 ENV WITH_GPU=OFF
 
-ARG CNTOOLKIT_VERSION=2.6.5-1
-ARG CNNL_VERSION=1.8.3-1
-ARG CNCL_VERSION=1.0.2-1
+ARG CNTOOLKIT_VERSION=2.8.1-1
+ARG CNNL_VERSION=1.9.3-1
+ARG CNCL_VERSION=1.0.4-1
 ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
 ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
 ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb
-- 
GitLab
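Note on the pattern used above: the pool_op_mlu.cc hunk replaces the blocking
TensorCopySync with an asynchronous TensorCopy, then keeps the host staging
tensor alive by capturing extra_host_tensor by value in a callback enqueued on
the same stream via AddStreamCallback; the capture holds a reference to the
tensor's holder_, so the host memory cannot be freed before the copy has run.
On the runtime side, AddCallback now reaches cnrtInvokeHostFunc, which by all
appearances is the CNRT analogue of cudaLaunchHostFunc: the host function runs
once earlier work on the stream has completed. A minimal self-contained sketch
of the same lifetime trick follows; it is illustrative only, with a toy
in-order queue standing in for the MLU stream and a shared_ptr standing in for
the tensor holder. FakeStream, AsyncCopyToDevice, and HostBuffer are made-up
names, not Paddle or CNRT APIs.

#include <cstdio>
#include <functional>
#include <memory>
#include <queue>
#include <vector>

// Toy in-order "stream": enqueued callbacks run when the stream is drained.
class FakeStream {
 public:
  void Enqueue(std::function<void()> fn) { work_.push(std::move(fn)); }
  void Synchronize() {  // plays the role of a real stream-sync call
    while (!work_.empty()) {
      work_.front()();
      work_.pop();
    }
  }

 private:
  std::queue<std::function<void()>> work_;
};

using HostBuffer = std::shared_ptr<std::vector<float>>;  // stands in for holder_

// "Async copy": the source buffer must stay valid until the stream runs it.
void AsyncCopyToDevice(FakeStream* stream, const HostBuffer& src) {
  // Capturing src by value bumps the shared use count, exactly like capturing
  // extra_host_tensor by value in the MLU pool kernel's stream callback.
  stream->Enqueue([src] {
    std::printf("copied %zu floats, use_count=%ld\n", src->size(),
                static_cast<long>(src.use_count()));
  });
}

int main() {
  FakeStream stream;
  {
    HostBuffer staging = std::make_shared<std::vector<float>>(1024, 1.0f);
    AsyncCopyToDevice(&stream, staging);
  }  // staging goes out of scope, but the queued callback still owns the buffer
  stream.Synchronize();  // buffer is released only after the "copy" has run
  return 0;
}

Capturing by reference instead (or skipping the callback) would free the host
staging buffer while the device copy may still be reading it; the old code
avoided that race with a blocking TensorCopySync, and this patch trades the
sync for the capture-by-value callback, as the removed TODO had proposed.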