diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 589d09bf81c1d95795cd80ed22581e52156ae417..1a4f283f511da4300d26e764907998ad647eeebf 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
     return;
   }
 
-  // NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
-  if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
-    paddle::framework::TensorCopy(
-        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
-        out);
-    return;
-  }
-
   // NOTE(yy): TransDataDevice should wait for computation of input.
   if (!platform::is_cuda_pinned_place(in.place())) {
     platform::DeviceContextPool::Instance().Get(in.place())->Wait();
diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc
index fa88d128a9a1d572414a6459933a8988cae1fda0..c1bcf82c332561deb1dbb3a648d3808ff9aae1ef 100644
--- a/paddle/fluid/operators/pool_op_mlu.cc
+++ b/paddle/fluid/operators/pool_op_mlu.cc
@@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
       framework::Tensor extra_device_tensor =
          ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
              {static_cast<int64_t>(extra_input_size)}, dev_ctx);
-      // TODO(fwg): use Async copy, and add a callback to stream that free
-      // host
-      // memory.
-      framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(),
-                                &extra_device_tensor);
+      framework::TensorCopy(extra_host_tensor, ctx.GetPlace(),
+                            &extra_device_tensor);
+      // Increase extra_host_tensor holder_ reference count until copy
+      // complete.
+      auto increase_ref_count = [extra_host_tensor]() {
+        VLOG(4) << "Finished copying extra_host_tensor["
+                << GetBasePtr(&extra_host_tensor)
+                << "] in mlu pooling kernel.";
+      };
+      dev_ctx.AddStreamCallback(increase_ref_count);
       MLUCnnl::PoolingForward(
           ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
           in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/,
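The pool_op_mlu.cc hunk above replaces a blocking TensorCopySync with an asynchronous TensorCopy, then keeps the host staging tensor alive by capturing it by value in a stream callback, so its holder_ reference count only drops after the copy has finished. Below is a minimal, self-contained sketch of that lifetime trick; FakeStream, its Synchronize(), and the shared_ptr standing in for the tensor holder are invented for illustration and are not Paddle APIs.

```cpp
// Sketch of the lifetime-extension idiom: capture a ref-counted host buffer
// by value in a stream callback so it cannot be freed before the async copy
// that reads it has completed.
#include <functional>
#include <iostream>
#include <memory>
#include <queue>
#include <vector>

// Stand-in for a device stream: callbacks run in FIFO order at Synchronize,
// i.e. only after all previously enqueued "work".
class FakeStream {
 public:
  void AddCallback(std::function<void()> cb) { pending_.push(std::move(cb)); }
  void Synchronize() {
    while (!pending_.empty()) {
      pending_.front()();
      pending_.pop();
    }
  }

 private:
  std::queue<std::function<void()>> pending_;
};

int main() {
  FakeStream stream;
  // shared_ptr plays the role of the tensor's ref-counted holder_.
  auto host_buffer = std::make_shared<std::vector<float>>(1024, 1.0f);

  // ... an async host-to-device copy of *host_buffer would be enqueued here ...

  // Capture by value: the callback holds a reference until it runs.
  stream.AddCallback([host_buffer] {
    std::cout << "copy finished, releasing host buffer ("
              << host_buffer->size() << " floats)\n";
  });

  host_buffer.reset();   // kernel scope ends; buffer still alive in callback
  stream.Synchronize();  // buffer is freed only after the callback fires
}
```

The real change relies on the same ordering guarantee: a callback enqueued after the copy cannot run before it, so the capture pins the memory for exactly as long as needed.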
diff --git a/paddle/fluid/platform/device/mlu/mlu_stream.h b/paddle/fluid/platform/device/mlu/mlu_stream.h
index 3f4b27e370f2e729c84cf8d5a9ccdefb6d1a4e1e..b20949f3bfe85cdc7368a9093b386a0a535292de 100644
--- a/paddle/fluid/platform/device/mlu/mlu_stream.h
+++ b/paddle/fluid/platform/device/mlu/mlu_stream.h
@@ -40,7 +40,6 @@ class MLUStream final {
 
   template <typename Callback>
   void AddCallback(Callback&& callback) const {
-    // TODO(mlu): mlu not support AddCallback
     callback_manager_->AddCallback(callback);
   }
 
diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
index eceb5fabe8dba0cd73613910ce65968c841779fb..36abf77279d061f72a47fae32d5cd54c4f03a160 100644
--- a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
+++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
@@ -202,6 +202,7 @@ CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
   REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
   REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
   REGISTER_RUNTIME_CBID_STR(cnCtxSync);
+  REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
 #undef REGISTER_RUNTIME_CBID_STR
 }
 
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 7148afee273fda61f991e386353fc323dd2f2ea2..6fa326d57bc676283a99905d269ee4e27277e5cc 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -80,10 +80,8 @@ void StreamCallbackManager<Stream>::AddCallback(
 #endif
 
 #if PADDLE_WITH_MLU
-  VLOG(3) << "MLULaunchCallback at stream: " << stream_
-          << " Failed to call MLULaunchCallback, "
-          << "because mlu not support StreamAddCallback yet. "
-          << "function: " << func;
+  VLOG(3) << "MLULaunchCallback at stream: " << stream_;
+  cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
 #endif
 }
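The stream_callback_manager.cc hunk turns the old "not supported" log into a real enqueue through cnrtInvokeHostFunc, and the mlu_stream.h and cnpapi changes expose and profile that same path. A C-style host-func API can only accept a plain function pointer plus a void*, so a std::function has to be passed through a trampoline. The sketch below shows that pattern under stated assumptions: FakeEnqueue stands in for the driver call (a real driver would defer it until prior stream work finished), and Paddle's actual StreamCallbackManager layers locking and a Wait() on top.

```cpp
// Trampoline pattern for smuggling a std::function through a C callback API.
#include <functional>
#include <iostream>

// Signature shape of a driver host-callback: plain function + user data.
using HostFunc = void (*)(void*);

// Stand-in for cnrtInvokeHostFunc: invokes immediately instead of deferring.
void FakeEnqueue(HostFunc fn, void* user_data) { fn(user_data); }

// Static trampoline: recover the std::function from the void*, run it, free it.
void StreamCallbackTrampoline(void* user_data) {
  auto* fn = reinterpret_cast<std::function<void()>*>(user_data);
  (*fn)();
  delete fn;  // the callback owns itself; freed exactly once, after running
}

void AddCallback(std::function<void()> cb) {
  // Heap-allocate so the closure outlives this scope until the stream
  // reaches the callback.
  auto* func = new std::function<void()>(std::move(cb));
  FakeEnqueue(StreamCallbackTrampoline, func);
}

int main() {
  AddCallback([] { std::cout << "host callback ran after stream work\n"; });
}
```

This is also why the profiler hunk registers cnInvokeHostFunc as a runtime callback id: once callbacks are actually enqueued, they show up in cnpapi traces.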
diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h
index bfa45869f5ff689ac4af16203c7e74cdc4f198ff..3e1787cb12cfaec5afeef004e63af08b3a46c27b 100644
--- a/paddle/phi/common/backend.h
+++ b/paddle/phi/common/backend.h
@@ -47,6 +47,7 @@ enum class Backend : uint8_t {
   GPU,
   XPU,  // XPU currently does not exist at the same time as CUDA
   NPU,  // NPU currently does not exist at the same time as CUDA
+  MLU,  // MLU currently does not exist at the same time as CUDA
 
   // the third library backend
   MKLDNN,
@@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
     case Backend::NPU:
       os << "NPU";
       break;
+    case Backend::MLU:
+      os << "MLU";
+      break;
     case Backend::MKLDNN:
       os << "MKLDNN";
       break;
@@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
     return Backend::XPU;
   } else if (s == std::string("NPU")) {
     return Backend::NPU;
+  } else if (s == std::string("MLU")) {
+    return Backend::MLU;
   } else if (s == std::string("MKLDNN")) {
     return Backend::MKLDNN;
   } else if (s == std::string("GPUDNN")) {
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index 4fa11ac7860ef534e2e48973723407c4e318dfd2..3b49b40dff124c279f218e6278bbb7cc40bccaac 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
     return Backend::NPU;
   } else if (allocation_type == phi::AllocationType::IPU) {
     return Backend::IPU;
+  } else if (allocation_type == phi::AllocationType::MLU) {
+    return Backend::MLU;
   } else if (allocation_type == phi::AllocationType::CUSTOM) {
     return static_cast<Backend>(
         static_cast<size_t>(Backend::NUM_BACKENDS) +
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 03045579e7198a54e446c97f3cdbe62df1b27772..db6af87635ccb12da3bd30118cce72d6dc066cd8 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -271,7 +271,8 @@ def monkey_patch_varbase():
             if _grad_scalar:
                 # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
                 self = _grad_scalar.scale(self)
-            if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
+            if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(
+            ) or paddle.is_compiled_with_mlu():
                 # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
                 scaled_loss = scale_loss(self)
                 if framework._in_eager_mode_:
diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu
index f7823738afc536bbba3cab78794be3d8417ee6eb..07535a637431e49ddef65a648b601697e05c1162 100644
--- a/tools/dockerfile/Dockerfile.mlu
+++ b/tools/dockerfile/Dockerfile.mlu
@@ -2,9 +2,9 @@
 # Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions
 #
 # Build:
-#   - CNTOOLKIT_VERSION 2.6.5-1
-#   - CNNL_VERSION 1.8.3-1
-#   - CNCL_VERSION 1.0.2-1
+#   - CNTOOLKIT_VERSION 2.8.1-1
+#   - CNNL_VERSION 1.9.3-1
+#   - CNCL_VERSION 1.0.4-1
 #
 # Download three packages from FTP (need to connect cambricon AE to get FTP url)
 # - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb
@@ -21,9 +21,9 @@
 # (get cncl pkg)
 #
 # docker build -f Dockerfile.mlu \
-#   --build-arg CNTOOLKIT_VERSION=2.6.5-1 \
-#   --build-arg CNNL_VERSION=1.8.3-1 \
-#   --build-arg CNCL_VERSION=1.0.2-1 \
+#   --build-arg CNTOOLKIT_VERSION=2.8.1-1 \
+#   --build-arg CNNL_VERSION=1.9.3-1 \
+#   --build-arg CNCL_VERSION=1.0.4-1 \
 #   -t paddlepaddle/paddle:latest-dev-mlu .
 #
 # without mlu device:
@@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors
 
 ENV WITH_GPU=OFF
 
-ARG CNTOOLKIT_VERSION=2.6.5-1
-ARG CNNL_VERSION=1.8.3-1
-ARG CNCL_VERSION=1.0.2-1
+ARG CNTOOLKIT_VERSION=2.8.1-1
+ARG CNNL_VERSION=1.9.3-1
+ARG CNCL_VERSION=1.0.4-1
 ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
 ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
 ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb
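The backend.h and convert_utils.cc hunks above wire MLU into three places that must stay in agreement: the Backend enumerator, its operator<<, and StringToBackend (plus the place-to-backend mapping). The toy model below, with a trimmed enum and an invented main(), is not the phi source; it only makes the print/parse round trip easy to check.

```cpp
// Toy model of the three MLU mappings the patch keeps in sync.
#include <cstdint>
#include <iostream>
#include <string>

enum class Backend : uint8_t { UNDEFINED = 0, CPU, GPU, XPU, NPU, MLU };

std::ostream& operator<<(std::ostream& os, Backend b) {
  switch (b) {
    case Backend::CPU: return os << "CPU";
    case Backend::GPU: return os << "GPU";
    case Backend::XPU: return os << "XPU";
    case Backend::NPU: return os << "NPU";
    case Backend::MLU: return os << "MLU";  // the case the patch adds
    default: return os << "UNDEFINED";
  }
}

Backend StringToBackend(const std::string& s) {
  if (s == "CPU") return Backend::CPU;
  if (s == "GPU") return Backend::GPU;
  if (s == "XPU") return Backend::XPU;
  if (s == "NPU") return Backend::NPU;
  if (s == "MLU") return Backend::MLU;  // and its parse counterpart
  return Backend::UNDEFINED;
}

int main() {
  // Round trip: "MLU" -> Backend::MLU -> "MLU".
  std::cout << StringToBackend("MLU") << "\n";
}
```

Missing any one of the three spots would compile but silently misreport or misparse the backend, which is why the patch touches them together.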