diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index ecc75332d1f15e0ddcefbb521083d76111f7bb33..f46c5bf7ac09de7c7f74a5ae27c87e79335e0f25 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -65,11 +65,13 @@ if(WITH_ASCEND_CL)
   set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
   set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
   set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
-  set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
 
-  message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
+  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
   message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
+  INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
+  INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
 
   ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc
index 3c06008d00a8573bf9b1cab3d62633854a1bf0e7..bb3a6512d2c8ba3b5f0d643a5ae6d906a00717c3 100644
--- a/paddle/fluid/operators/expand_op_npu.cc
+++ b/paddle/fluid/operators/expand_op_npu.cc
@@ -58,12 +58,15 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
                           expand_times.size(), static_cast<int>(in_dims.size())));
     auto* out0 = context.Output<Tensor>("Out");
     framework::DDim out_dims(in_dims);
+
     for (size_t i = 0; i < expand_times.size(); ++i) {
       out_dims[i] *= expand_times[i];
     }
+
     out0->Resize(out_dims);
     out0->mutable_data<T>(context.device_context().GetPlace());
-    auto runner = NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
+    auto runner =
+        NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
     auto stream =
         context.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index e5031acb9b42e337807b06f8d663d0ab73868932..2a7519706cc5d48318f0ad4a5b143a2e5d96e105 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
-
 #include "glog/logging.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace memory {
@@ -253,6 +253,7 @@ NPUDeviceContext::~NPUDeviceContext() {
 }
 
 void NPUDeviceContext::Wait() const {
+  platform::RecordEvent record_event("NPUDeviceContext/wait");
   NPUDeviceGuard guard(place_.device);
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
 }
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index bbf8e4d5ca783b2f7e7b0842ef924378574a8a51..1469e0ecd06f63771ff88f0289abe2c6def05f52 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -598,6 +598,8 @@ class DeviceTracerImpl : public DeviceTracer {
           BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId());
     } else if (platform::is_cuda_pinned_place(r.place)) {
       event->set_place(proto::MemEvent::CUDAPinnedPlace);
+    } else if (platform::is_npu_place(r.place)) {
+      event->set_place(proto::MemEvent::NPUPlace);
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "The current place is not supported."));
diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..05325aaf9baa1363fead39d107b0de5a3922eea2
--- /dev/null
+++ b/paddle/fluid/platform/npu_profiler.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "acl/acl_prof.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+// For ACL 20.1
+// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
+// ACL_AICORE_PIPELINE = 1, record pipeline
+// ACL_AICORE_SYNCHRONIZATION = 2, record sync
+// ACL_AICORE_MEMORY = 3, record memory
+// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
+// ACL_AICORE_STALL = 5, record pipeline stall ratio
+constexpr aclprofAicoreMetrics default_metrics =
+    ACL_AICORE_ARITHMATIC_THROUGHPUT;
+
+// ACL_PROF_ACL_API, record ACL API stats
+// ACL_PROF_TASK_TIME, record AI core stats
+// ACL_PROF_AICORE_METRICS, must include
+// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
+constexpr uint64_t default_type =
+    ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;
+
+aclprofConfig *NPUProfilerCreateConfig(
+    std::vector<uint32_t> devices = {},
+    aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type,
+    aclprofAicoreEvents *events = nullptr) {
+  if (devices.size() == 0) {
+    int device_id = GetCurrentNPUDeviceId();
+    devices.emplace_back(device_id);
+  }
+  aclprofConfig *config =
+      aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
+  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External(
+                                      "Failed to create prof config for NPU"));
+  return config;
+}
+
+void NPUProfilerDestroyConfig(const aclprofConfig *config) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
+}
+
+void NPUProfilerInit(std::string output_path) {
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclprofInit(output_path.c_str(), output_path.size()));
+}
+
+void NPUProfilerStart(const aclprofConfig *config) {
+  if (config == nullptr) {
+    // NOTE(zhiqiu): support single device by default.
+    int device_id = GetCurrentNPUDeviceId();
+    std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
+    config = NPUProfilerCreateConfig(devices);
+  }
+  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
+}
+
+void NPUProfilerStop(const aclprofConfig *config) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
+  NPUProfilerDestroyConfig(config);
+}
+
+void NPUProfilerFinalize() { PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize()); }
+
+struct NPUProfConfigWrapper {
+  aclprofConfig *p_;
+  explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
+  aclprofConfig *ptr() { return p_; }
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index cfa3c6906f83f750c8d6dc654f29b8fe95ec17ac..31193534a00be03ed96c5ba01666614389830f71 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -21,6 +21,7 @@ message Event {
   enum EventType {
     CPU = 0;
     GPUKernel = 1;
+    NPUKernel = 2;
   }
   optional EventType type = 8;
   optional string name = 1;
@@ -39,6 +40,8 @@ message MemEvent {
     CUDAPlace = 0;
     CPUPlace = 1;
     CUDAPinnedPlace = 2;
+    XPUPlace = 3;
+    NPUPlace = 4;
   }
   optional uint64 start_ns = 1;
   optional uint64 end_ns = 2;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b3d9e22dba8d244f5e57267a527de0bbdc534996..857498e852fafd88d7830c9aa9f45dee0b45c3d2 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -104,6 +104,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/npu_info.h"
+#include "paddle/fluid/platform/npu_profiler.h"
 #endif
 
 #ifdef PADDLE_WITH_XPU
@@ -499,11 +500,6 @@ PYBIND11_MODULE(core_noavx, m) {
         make_ddim(x_dim), make_ddim(y_dim), -1));
   });
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  m.def("_npu_finalize",
-        []() { platform::AclInstance::Instance().Finalize(); });
-#endif
-
   m.def(
       "_append_python_callable_object_and_return_id",
       [](py::object py_obj) -> size_t {
@@ -2082,6 +2078,31 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+  m.def("get_npu_device_count", platform::GetNPUDeviceCount);
+  m.def("_npu_finalize", []() {
+    platform::AclInstance::Instance().Finalize();
+  });  // private interface
+
+  py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");
+
+  m.def("npu_prof_init", platform::NPUProfilerInit);
+  m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) {
+    platform::NPUProfilerStart(c.ptr());
+  });
+  m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) {
+    platform::NPUProfilerStop(c.ptr());
+  });
+  m.def("npu_prof_finalize", platform::NPUProfilerFinalize);
+  m.def("npu_prof_create_config", []() {
+    return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig());
+  });
+
+  m.def("npu_prof_destroy_config", [](platform::NPUProfConfigWrapper c) {
+    platform::NPUProfilerDestroyConfig(c.ptr());
+  });
+#endif
+
   py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
       .value("kDefault", platform::TracerOption::kDefault)
       .value("kOpDetail", platform::TracerOption::kOpDetail)
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index bc7a60af94617c8ea4102ae30ccf0d04330d199b..40b0862be0177ec9ce90088bf48ff7a068868bec 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None):
         os.remove(config_file)
 
 
+@signature_safe_contextmanager
+def npu_profiler(output_file, config=None):
+    """
+    The NPU profiler.
+
+    This function is used to profile NPU programs through the NPU runtime
+    application programming interface. The profiling result will be written into
+    `output_file`. Users can set the NPU profiling config by the `config` argument.
+
+    After getting the profiling result file, users can use
+    `tools provided by Ascend `_
+    to load this output file to visualize results.
+
+    Args:
+        output_file (str) : The output file name, the result will be
+            written into this file. It should be an absolute path.
+        config (list, optional) : NPU profile config. For more details, please
+            refer to `User Guide `_ .
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle.fluid.profiler as profiler
+            import numpy as np
+
+            epoc = 8
+            dshape = [4, 3, 28, 28]
+            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
+            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+            place = fluid.NPUPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            output_file = 'npu.txt'
+            with profiler.npu_profiler(output_file) as npu_prof:
+                for i in range(epoc):
+                    input = np.random.random(dshape).astype('float32')
+                    exe.run(fluid.default_main_program(), feed={'data': input})
+            # then use NPU profiler tools to load this output file
+            # to visualize results.
+    """
+    # TODO: support config in python.
+    if not config:
+        config = core.npu_prof_create_config()
+
+    core.npu_prof_init(output_file)
+    # Enables profiler collection by the active NPU profiling tool.
+    core.npu_prof_start(config)
+    try:
+        yield
+    # Disables profiler collection.
+    finally:
+        core.npu_prof_stop(config)
+        core.npu_prof_finalize()
+
+
 def reset_profiler():
     """
     Clear the previous time record. This interface does not work for
diff --git a/tools/timeline.py b/tools/timeline.py
index 119018380b551cd10e419f0083774af5e4ff27ac..2a399b71b778634e820a1e3d5cedaa378616c22d 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -186,6 +186,13 @@ class Timeline(object):
                     self._chrome_trace.emit_pid(
                         "memory usage on %s:cudapinnedplace:%d" %
                         (k, mevent.device_id), pid)
+                elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
+                    if (k, mevent.device_id, "NPU") not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id, "NPU")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:npu:%d" % (k, mevent.device_id),
+                            pid)
             if (k, 0, "CPU") not in self._mem_devices:
                 pid = self._allocate_pid()
                 self._mem_devices[(k, 0, "CPU")] = pid
@@ -201,6 +208,11 @@ class Timeline(object):
                 self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
                 self._chrome_trace.emit_pid(
                     "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
+            if (k, 0, "NPU") not in self._mem_devices:
+                pid = self._allocate_pid()
+                self._mem_devices[(k, 0, "NPU")] = pid
+                self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
+                                            (k, 0), pid)
 
     def _allocate_events(self):
         for k, profile_pb in six.iteritems(self._profile_dict):
@@ -227,7 +239,8 @@ class Timeline(object):
         place_to_str = {
             profiler_pb2.MemEvent.CPUPlace: "CPU",
            profiler_pb2.MemEvent.CUDAPlace: "GPU",
-            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
+            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
+            profiler_pb2.MemEvent.NPUPlace: "NPU"
         }
         for k, profile_pb in six.iteritems(self._profile_dict):
            mem_list = []
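For reference, the low-level npu_prof_* bindings registered in pybind.cc compose as in the minimal sketch below (illustrative only, not part of the diff); it assumes a Paddle build with PADDLE_WITH_ASCEND_CL, an available NPU device, and a placeholder output path. The npu_profiler context manager added in profiler.py wraps this same sequence.

    # Hypothetical usage sketch of the raw bindings added by this diff.
    from paddle.fluid import core

    core.npu_prof_init('/tmp/npu_prof_output')  # absolute output path (placeholder)
    config = core.npu_prof_create_config()      # default metrics/type from npu_profiler.h
    core.npu_prof_start(config)
    try:
        pass  # run the NPU program to be profiled here
    finally:
        core.npu_prof_stop(config)  # also destroys the config via NPUProfilerStop
        core.npu_prof_finalize()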