diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index f2a8309f00c67e7694f81ac95db3656ab0a37863..73add8ea06f060949bf6084408600a1d38c274fd 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -124,7 +124,7 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
 
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
-    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
     nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
     nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index a9382f2c8adcb18e320ef44086a312f89c03ad09..6edc141205a95252b2d1811b839d53da67a456c5 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include <cuda_profiler_api.h>
 
+#include "paddle/fluid/platform/dynload/nvtx.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -38,5 +39,13 @@ void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); }
 
 void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); }
 
+#ifndef _WIN32
+void CudaNvtxRangePush(std::string name) {
+  dynload::nvtxRangePushA(name.c_str());
+}
+
+void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
+#endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 647bff93122b13117ea83dd616f1c744264aeb1a..725b7fcf9dde9809fab9ead225206cf239a0df54 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
 #hip
 if (WITH_ROCM_PLATFORM)
   list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 4c39a35030b3f47627135bed7b7cc8e8b714480c..e71305446890501908ecf4431da81bb765fce4f8 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -416,6 +416,19 @@ void* GetOpDsoHandle(const std::string& dso_name) {
 #endif
 }
 
+void* GetNvtxDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
+#elif defined(_WIN32)
+  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
+#elif !defined(PADDLE_WITH_CUDA)
+  PADDLE_THROW(
+      platform::errors::Unimplemented("Nvtx do not support without CUDA."));
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 1136184ce1fc9a2d1506cc0a424b2c94d0da4ee9..c3f5953c7857913980bca07d9fcffe459e483cfa 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -37,6 +37,7 @@ void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetOpDsoHandle(const std::string& dso_name);
+void* GetNvtxDsoHandle();
 void SetPaddleLibPath(const std::string&);
 
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/nvtx.cc b/paddle/fluid/platform/dynload/nvtx.cc
new file mode 100644
index 0000000000000000000000000000000000000000..372f8500e54dda0ee43c4129f9697c7e48d529a0
--- /dev/null
+++ b/paddle/fluid/platform/dynload/nvtx.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef _WIN32
+#include "paddle/fluid/platform/dynload/nvtx.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag nvtx_dso_flag;
+void *nvtx_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NVTX_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
new file mode 100644
index 0000000000000000000000000000000000000000..b696bbf91816aa286f113cae70afe1f7683d24db
--- /dev/null
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifndef _WIN32
+#include <cuda.h>
+#include <nvToolsExt.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+extern std::once_flag nvtx_dso_flag;
+extern void *nvtx_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)                            \
+  struct DynLoad__##__name {                                              \
+    template <typename... Args>                                           \
+    int operator()(Args... args) {                                        \
+      using nvtxFunc = decltype(&::__name);                               \
+      std::call_once(nvtx_dso_flag, []() {                                \
+        nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle();  \
+      });                                                                 \
+      static void *p_##__name = dlsym(nvtx_dso_handle, #__name);          \
+      return reinterpret_cast<nvtxFunc>(p_##__name)(args...);             \
+    }                                                                     \
+  };                                                                      \
+  extern DynLoad__##__name __name
+
+#define NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePop);
+
+NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 56a6275b582d753c0fa579d624b7f50c16f579b8..c8e8e68dcda4c81a7dc13a70373200e370ed5a15 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler_helper.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/nvtx.h"
+#endif
 
 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 
@@ -51,6 +54,14 @@ double Event::CudaElapsedMs(const Event &e) const {
 }
 
 RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
+#ifndef _WIN32
+#ifdef PADDLE_WITH_CUDA
+  if (g_enable_nvprof_hook) {
+    dynload::nvtxRangePushA(name.c_str());
+    is_pushed_ = true;
+  }
+#endif
+#endif
   if (g_state == ProfilerState::kDisabled || name.empty()) return;
 
   // do some initialization
@@ -65,6 +76,13 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
 }
 
 RecordEvent::~RecordEvent() {
+#ifndef _WIN32
+#ifdef PADDLE_WITH_CUDA
+  if (g_enable_nvprof_hook && is_pushed_) {
+    dynload::nvtxRangePop();
+  }
+#endif
+#endif
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   // lock is not needed, the code below is thread-safe
   DeviceTracer *tracer = GetDeviceTracer();
@@ -299,5 +317,12 @@ void SetProfileListener() {
 
 int64_t ListenerId() { return profiler_lister_id; }
 
+void NvprofEnableRecordEvent() {
+  SynchronizeAllDevice();
+  g_enable_nvprof_hook = true;
+}
+
+void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 0185328ff32004a35283c1d2883c2834b49b473b..66a102a3d58636ee3e2913d1cd0d91533684d505 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -131,6 +131,7 @@ struct RecordEvent {
   ~RecordEvent();
 
   bool is_enabled_{false};
+  bool is_pushed_{false};
   uint64_t start_ns_;
   // Event name
   std::string name_;
@@ -227,5 +228,8 @@ void DummyKernelAndEvent();
 void SetProfileListener();
 int64_t ListenerId();
 
+void NvprofEnableRecordEvent();
+void NvprofDisableRecordEvent();
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 96296861322101ee166bd880d31903862e44e556..66595aa651a5450221c16e3e44982b3cdd16134b 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -42,6 +42,8 @@ std::mutex profiler_mu;
 static TracerOption g_tracer_option = TracerOption::kDefault;
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
+// To hook RecordEvent's events, use it to nvtx timeline
+static bool g_enable_nvprof_hook = false;
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 0f52d7344c87fac8397aa2fc84e7b19672bc1280..39e83ab12d56daebe4b2d095b0939274c664d1db 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -3,6 +3,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils
   heter_wrapper generator op_version_registry ps_gpu_wrapper)
 
+if (WITH_GPU)
+  set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
+endif()
+
 if (WITH_NCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 72b3c9645ba2d5571fd815e64b7af74e24e29948..03a21b29921de17799da580e31130c5ca9134729 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1951,6 +1951,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("nvprof_init", platform::CudaProfilerInit);
   m.def("nvprof_start", platform::CudaProfilerStart);
   m.def("nvprof_stop", platform::CudaProfilerStop);
+  m.def("nvprof_nvtx_push", platform::CudaNvtxRangePush);
+  m.def("nvprof_nvtx_pop", platform::CudaNvtxRangePop);
+  m.def("nvprof_enable_record_event", platform::NvprofEnableRecordEvent);
+  m.def("nvprof_disable_record_event", platform::NvprofDisableRecordEvent);
 #endif
 #endif
 
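
For review context, here is a minimal usage sketch (not part of the patch) showing how the pieces above fit together from C++. It assumes a CUDA build on a non-Windows platform, that `RecordEvent`'s `role` argument keeps a default value so the one-argument form compiles, and that the process runs under an NVTX-aware profiler such as Nsight Systems; `ProfiledStep` is a made-up function name used only for illustration.

```cpp
// Illustrative only: exercises the hooks introduced in this patch.
#include "paddle/fluid/platform/cuda_profiler.h"  // CudaNvtxRangePush / CudaNvtxRangePop
#include "paddle/fluid/platform/profiler.h"       // RecordEvent, Nvprof*RecordEvent

void ProfiledStep() {
  // Synchronizes all devices, then makes every subsequent RecordEvent also
  // emit an NVTX range (sets g_enable_nvprof_hook = true).
  paddle::platform::NvprofEnableRecordEvent();

  {
    // The constructor calls dynload::nvtxRangePushA("forward"); the
    // destructor pops the range via dynload::nvtxRangePop().
    paddle::platform::RecordEvent record("forward");
    // ... launch forward kernels here ...
  }

  // Explicit ranges are also available through the cuda_profiler.h helpers,
  // which pybind.cc exposes as nvprof_nvtx_push / nvprof_nvtx_pop.
  paddle::platform::CudaNvtxRangePush("custom_range");
  // ... work attributed to "custom_range" ...
  paddle::platform::CudaNvtxRangePop();

  paddle::platform::NvprofDisableRecordEvent();
}
```

From Python, the same flow is reachable through the bindings registered in pybind.cc (`nvprof_enable_record_event`, `nvprof_nvtx_push`, `nvprof_nvtx_pop`, `nvprof_disable_record_event`), so a training script can bracket only the iterations it wants captured in the NVTX timeline.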