[MLU] support add callback to stream (#41831)

03533b0c · fwenguang · GitHub · bb71d834 · 03533b0c · 03533b0c
9 changed files
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
    return;
  }

-  // NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
-  if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
-    paddle::framework::TensorCopy(
-        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
-        out);
-    return;
-  }
-
  // NOTE(yy): TransDataDevice should wait for computation of input.
  if (!platform::is_cuda_pinned_place(in.place())) {
    platform::DeviceContextPool::Instance().Get(in.place())->Wait();

--- a/paddle/fluid/operators/pool_op_mlu.cc
+++ b/paddle/fluid/operators/pool_op_mlu.cc
@@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
        framework::Tensor extra_device_tensor =
            ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
                {static_cast<int64_t>(extra_input_size)}, dev_ctx);
-        // TODO(fwg): use Async copy, and add a callback to stream that free
-        // host
-        // memory.
-        framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(),
-                                  &extra_device_tensor);
+        framework::TensorCopy(extra_host_tensor, ctx.GetPlace(),
+                              &extra_device_tensor);
+        // Increase extra_host_tensor holder_ reference count until the copy
+        // completes.
+        auto increase_ref_count = [extra_host_tensor]() {
+          VLOG(4) << "Finished copying extra_host_tensor["
+                  << GetBasePtr(&extra_host_tensor)
+                  << "] in mlu pooling kernel.";
+        };
+        dev_ctx.AddStreamCallback(increase_ref_count);
        MLUCnnl::PoolingForward(
            ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
            in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/,

--- a/paddle/fluid/platform/device/mlu/mlu_stream.h
+++ b/paddle/fluid/platform/device/mlu/mlu_stream.h
@@ -40,7 +40,6 @@ class MLUStream final {

  template <typename Callback>
  void AddCallback(Callback&& callback) const {
-    // TODO(mlu): mlu not support AddCallback
    callback_manager_->AddCallback(callback);
  }


--- a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
+++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
@@ -202,6 +202,7 @@ CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
  REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
  REGISTER_RUNTIME_CBID_STR(cnCtxSync);
+  REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
 #undef REGISTER_RUNTIME_CBID_STR
 }


--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -80,10 +80,8 @@ void StreamCallbackManager<Stream>::AddCallback(
 #endif

 #if PADDLE_WITH_MLU
-  VLOG(3) << "MLULaunchCallback at stream: " << stream_
-          << " Failed to call MLULaunchCallback, "
-          << "because mlu not support StreamAddCallback yet. "
-          << "function: " << func;
+  VLOG(3) << "MLULaunchCallback at stream: " << stream_;
+  cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
 #endif
 }


--- a/paddle/phi/common/backend.h
+++ b/paddle/phi/common/backend.h
@@ -47,6 +47,7 @@ enum class Backend : uint8_t {
  GPU,
  XPU,  // XPU currently does not exist at the same time as CUDA
  NPU,  // NPU currently does not exist at the same time as CUDA
+  MLU,  // MLU currently does not exist at the same time as CUDA

  // the third library backend
  MKLDNN,
@@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
    case Backend::NPU:
      os << "NPU";
      break;
+    case Backend::MLU:
+      os << "MLU";
+      break;
    case Backend::MKLDNN:
      os << "MKLDNN";
      break;
@@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
    return Backend::XPU;
  } else if (s == std::string("NPU")) {
    return Backend::NPU;
+  } else if (s == std::string("MLU")) {
+    return Backend::MLU;
  } else if (s == std::string("MKLDNN")) {
    return Backend::MKLDNN;
  } else if (s == std::string("GPUDNN")) {

--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
    return Backend::NPU;
  } else if (allocation_type == phi::AllocationType::IPU) {
    return Backend::IPU;
+  } else if (allocation_type == phi::AllocationType::MLU) {
+    return Backend::MLU;
  } else if (allocation_type == phi::AllocationType::CUSTOM) {
    return static_cast<Backend>(
        static_cast<size_t>(Backend::NUM_BACKENDS) +

--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -271,7 +271,8 @@ def monkey_patch_varbase():
            if _grad_scalar:
                # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
                self = _grad_scalar.scale(self)
-            if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
+            if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(
+            ) or paddle.is_compiled_with_mlu():
                # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
                scaled_loss = scale_loss(self)
                if framework._in_eager_mode_:

--- a/tools/dockerfile/Dockerfile.mlu
+++ b/tools/dockerfile/Dockerfile.mlu
@@ -2,9 +2,9 @@
 # Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions
 #
 # Build:
-# - CNTOOLKIT_VERSION 2.6.5-1
-# - CNNL_VERSION 1.8.3-1
-# - CNCL_VERSION 1.0.2-1
+# - CNTOOLKIT_VERSION 2.8.1-1
+# - CNNL_VERSION 1.9.3-1
+# - CNCL_VERSION 1.0.4-1
 #
 # Download three packages from FTP (need to connect cambricon AE to get FTP url)
 # - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb
@@ -21,9 +21,9 @@
 # (get cncl pkg)
 #
 # docker build -f Dockerfile.mlu  \
-# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \
-# --build-arg CNNL_VERSION=1.8.3-1 \
-# --build-arg CNCL_VERSION=1.0.2-1 \
+# --build-arg CNTOOLKIT_VERSION=2.8.1-1 \
+# --build-arg CNNL_VERSION=1.9.3-1 \
+# --build-arg CNCL_VERSION=1.0.4-1 \
 # -t paddlepaddle/paddle:latest-dev-mlu .
 #
 # without mlu device:
@@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

 ENV WITH_GPU=OFF

-ARG CNTOOLKIT_VERSION=2.6.5-1
-ARG CNNL_VERSION=1.8.3-1
-ARG CNCL_VERSION=1.0.2-1
+ARG CNTOOLKIT_VERSION=2.8.1-1
+ARG CNNL_VERSION=1.9.3-1
+ARG CNCL_VERSION=1.0.4-1
 ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
 ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
 ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb