diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 3aa47c9e092f7a9327ff25b818a002003396cae6..7b17899d3da2383d7aef0593cd7f40c529afbdaf 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -167,7 +167,7 @@ endif()
 
 if (WITH_ASCEND_CL)
   cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
-  cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op)
+  cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op)
 endif()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc
index 3814faa7662fc556bc84d61802772b7e7db7ad74..bb36eedb83238149cc38bdfc8e2033834140c7d1 100644
--- a/paddle/fluid/platform/npu_info.cc
+++ b/paddle/fluid/platform/npu_info.cc
@@ -190,6 +190,8 @@ void NPUMemcpySync(void *dst, const void *src, size_t count,
                    enum aclrtMemcpyKind kind, size_t dst_max_count) {
   // NOTE(zhiqiu):  The default max_count is count
   dst_max_count = dst_max_count ? dst_max_count : count;
+  VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " "
+          << kind;
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
 }
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 038bcc7f850990feaf4ce8f2df41f2b39dd247df..f315323ed592efc396ae83060dacdf34ce8ac73e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2178,9 +2178,7 @@ All parameter, weight, gradient are variables in Paddle.
 
 #ifdef PADDLE_WITH_ASCEND_CL
   m.def("get_npu_device_count", platform::GetNPUDeviceCount);
-  m.def("_npu_finalize", []() {
-    platform::AclInstance::Instance().Finalize();
-  });  // private interface
+  m.def("npu_finalize", []() { platform::AclInstance::Instance().Finalize(); });
 
   py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6dd1478dc1f45fb388fbc4ca978db30522e058d4..439b5c64615c06827f77cee1d9fd85ca566eebdd 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import os
 import sys
+import atexit
 
 # The legacy core need to be removed before "import core",
 # in case of users installing paddlepadde without -U option
@@ -255,3 +256,8 @@ def __bootstrap__():
 monkey_patch_variable()
 __bootstrap__()
 monkey_patch_varbase()
+
+# NOTE(zhiqiu): register npu_finalize on the exit of Python,
+# do some clean up manually.
+if core.is_compiled_with_npu():
+    atexit.register(core.npu_finalize)