diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 3c62f3c5e43d7e1fdbce624d061783553bfb43b2..f0c41e6dc0fcfd8e98117a2c4563f5ba9555a10c 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -235,7 +235,7 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_MLU
-  if (platform::is_mlu_place(dst_place)) {
+  else if (platform::is_mlu_place(dst_place)) {  // NOLINT
     memory::Copy(
         dst_place, dst_ptr, src_place, src_ptr, size,
         reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index f804c2af53916ab0f04cd5a281684b597527a4d7..6d348ceb87c83de1bb201a6b57477d764b58a2ba 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -962,6 +962,23 @@ void Copy(pten::Place dst_place, void* dst,
        stream);
 }
 
+// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
+template <>
+void Copy(pten::CPUPlace dst_place, void* dst,
+          pten::Place src_place, const void* src,
+          size_t num, mluStream stream) {
+  Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream);
+}
+
+// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
+template <>
+void Copy(pten::Place dst_place, void* dst,
+          pten::CPUPlace src_place,
+          const void* src, size_t num,
+          mluStream stream) {
+  Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream);
+}
+
 #endif // PADDLE_WITH_MLU
 
 // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc
index 9862c2bd95256541ebeed20b202991f7bf6d3bc8..ca4f3dcc3f465fd2993e78763d0cdd66eddfa452 100644
--- a/paddle/fluid/operators/mean_op_mlu.cc
+++ b/paddle/fluid/operators/mean_op_mlu.cc
@@ -35,9 +35,7 @@ class MeanMLUKernel : public framework::OpKernel<T> {
     auto stream = context.template device_context<MLUDeviceContext>().stream();
 
     if (rank == 0) {  // scalar
-      auto mlu_place = BOOST_GET(platform::MLUPlace, place);
-      memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T),
-                   stream);
+      memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
       return;
     }
 
@@ -85,9 +83,7 @@ class MeanMLUGradKernel : public framework::OpKernel<T> {
     auto stream = context.template device_context<MLUDeviceContext>().stream();
 
     if (rank == 0) {  // scalar
-      auto mlu_place = BOOST_GET(platform::MLUPlace, place);
-      memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T),
-                   stream);
+      memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
       return;
     }
 
diff --git a/paddle/fluid/operators/mlu/CMakeLists.txt b/paddle/fluid/operators/mlu/CMakeLists.txt
index 3fc411d6d13fa60169f0df8ccf3fe8d95af7c76e..59fab48b271d556c84d4f502022d41ec6d830e0a 100644
--- a/paddle/fluid/operators/mlu/CMakeLists.txt
+++ b/paddle/fluid/operators/mlu/CMakeLists.txt
@@ -1,5 +1,5 @@
 IF(WITH_MLU)
-    cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib)
+    cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib framework_proto xxhash)
     cc_test(activation_op_mlu_test SRCS activation_op_mlu_test.cc DEPS op_registry activation_op scope device_context executor)
 ENDIF()
 
diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu
new file mode 100644
index 0000000000000000000000000000000000000000..f7823738afc536bbba3cab78794be3d8417ee6eb
--- /dev/null
+++ b/tools/dockerfile/Dockerfile.mlu
@@ -0,0 +1,73 @@
+# An image for building paddle binaries
+# Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions
+#
+# Build:
+# - CNTOOLKIT_VERSION 2.6.5-1
+# - CNNL_VERSION 1.8.3-1
+# - CNCL_VERSION 1.0.2-1
+#
+# Download three packages from FTP (need to connect cambricon AE to get FTP url)
+# - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb
+# - cnnl_1.8.3-1.ubuntu18.04_amd64.deb
+# - cncl_1.0.2-1.ubuntu18.04_amd64.deb
+# copy them to current directory first, then run build commands
+#
+# For example:
+#
+# cd Paddle/tools/dockerfile
+#
+# (get cntoolkit pkg)
+# (get cnnl pkg)
+# (get cncl pkg)
+#
+# docker build -f Dockerfile.mlu \
+#   --build-arg CNTOOLKIT_VERSION=2.6.5-1 \
+#   --build-arg CNNL_VERSION=1.8.3-1 \
+#   --build-arg CNCL_VERSION=1.0.2-1 \
+#   -t paddlepaddle/paddle:latest-dev-mlu .
+#
+# without mlu device:
+# docker run -it --network=host --pids-limit 409600 \
+#   paddlepaddle/paddle:latest-dev-mlu /bin/bash
+#
+# with mlu device:
+# docker run -it --network=host --pids-limit 409600 \
+#   --device=/dev/cambricon_ctl --device=/dev/cambricon_dev0 \
+#   paddlepaddle/paddle:latest-dev-mlu /bin/bash
+
+FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev
+MAINTAINER PaddlePaddle Authors
+
+ENV WITH_GPU=OFF
+
+ARG CNTOOLKIT_VERSION=2.6.5-1
+ARG CNNL_VERSION=1.8.3-1
+ARG CNCL_VERSION=1.0.2-1
+ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
+ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
+ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb
+
+# install cntoolkit
+COPY $CNTOOLKIT_PKG ./
+RUN dpkg -i $CNTOOLKIT_PKG && \
+    apt-get update && \
+    apt-get install -y cnrt cnperf cnpapi cnlicense cngdb cndrv cndev cncodec cncc cnas cnbin cnstudio cnrtc cnpx && \
+    rm -f $CNTOOLKIT_PKG
+
+ENV NEUWARE_HOME=/usr/local/neuware
+ENV LD_LIBRARY_PATH=$NEUWARE_HOME/lib64:$LD_LIBRARY_PATH
+
+# install cnnl
+COPY $CNNL_PKG ./
+RUN dpkg -i $CNNL_PKG && \
+    rm -f $CNNL_PKG
+
+# install cncl
+COPY $CNCL_PKG ./
+RUN dpkg -i $CNCL_PKG && \
+    rm -f $CNCL_PKG
+
+# Clean
+RUN apt-get clean -y
+
+EXPOSE 22
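
For reference, a minimal sketch of how the two memory::Copy specializations added in memcpy.cc could be exercised from host code in a PADDLE_WITH_MLU build. The helper name RoundTripThroughMlu and the way the MLU place, device buffer, and stream are obtained are illustrative assumptions, not part of this patch; only the Copy argument shapes mirror the diff above.

// Hypothetical illustration only -- not from this patch.
#include <cstddef>
#include <vector>

#include "paddle/fluid/memory/memcpy.h"

// Copies `num` bytes from host memory to an already-allocated MLU buffer and
// back, using the new (CPUPlace -> Place) and (Place -> CPUPlace) overloads
// that take an mluStream. `mlu_place` is a pten::Place of MLU type.
void RoundTripThroughMlu(void* mlu_buf, const pten::Place& mlu_place,
                         mluStream stream, size_t num) {
  std::vector<char> host_src(num, 0), host_dst(num);

  // Host -> MLU: dst is a generic pten::Place, src is a concrete CPUPlace.
  paddle::memory::Copy(mlu_place, mlu_buf, pten::CPUPlace(), host_src.data(),
                       num, stream);

  // MLU -> Host: dst is a concrete CPUPlace, src is a generic pten::Place.
  paddle::memory::Copy(pten::CPUPlace(), host_dst.data(), mlu_place, mlu_buf,
                       num, stream);
}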
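Likewise, a hedged sketch of the call path touched by the tensor_util.h hunk: filling a tensor allocated behind an MLU device context via TensorFromVector, which now reaches the is_mlu_place branch through the else-if chain. The includes and the helper name FillTensorOnMlu are assumptions for illustration.

// Hypothetical illustration only -- assumes `dev_ctx` is an MLU device
// context whose GetPlace() is an MLU place.
#include <vector>

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

void FillTensorOnMlu(const paddle::platform::DeviceContext& dev_ctx,
                     paddle::framework::Tensor* dst) {
  std::vector<float> src = {1.0f, 2.0f, 3.0f};
  // With the branch rewritten as `else if`, an MLU destination place reaches
  // the memory::Copy call that uses the stream of the MLU context.
  paddle::framework::TensorFromVector(src, dev_ctx, dst);
}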