diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
index a4a913cdff22db18e467670be9644ed90dca542e..21b2927b52eab653e20611e135a8c0f905057fcf 100644
--- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
+++ b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
@@ -44,7 +44,6 @@ class ThreadDataRegistry {
   template <typename Alias = T,
             typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
   void SetCurrentThreadData(const T& val) {
-    std::lock_guard<std::mutex> lock(lock_);
     CurrentThreadData() = val;
   }
 
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc
index 07c5298c2f22377e277939e11af6fa6c142f24bc..596ffb9bfc0c4f624aeaf5874bdf18563d96d14c 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc
@@ -8,6 +8,7 @@
 #include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 namespace paddle {
 namespace framework {
@@ -61,6 +62,8 @@ class WorkQueueImpl : public WorkQueue {
   }
 
   void AddTask(std::function<void()> fn) override {
+    platform::RecordEvent record_event("WorkQueue::AddTask",
+        platform::TracerEventType::UserDefined, 10 /*level*/);
     if (tracker_ != nullptr) {
       fn = [
         task = std::move(fn), raii = CounterGuard<TaskTracker>(tracker_)
@@ -156,6 +159,8 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() {
 }
 
 void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function<void()> fn) {
+  platform::RecordEvent record_event("WorkQueue::AddTask",
+      platform::TracerEventType::UserDefined, 10 /*level*/);
   assert(queue_idx < queues_.size());
   if (queues_options_.at(queue_idx).track_task) {
     fn = [
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index d86e5e35c08c0ef46ce86c0f372fc90f8df1811b..f5e4941d787097b5e349c0b668d6c95fad137873 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -18,6 +18,7 @@
 #include <mutex>  // NOLINT
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/platform/flags.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 PADDLE_DEFINE_EXPORTED_READONLY_bool(
     free_idle_chunk, false,
@@ -47,6 +48,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
 
 phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
     size_t unaligned_size) {
+  platform::RecordEvent record_event("AutoGrowthBestFitAllocator::Allocate",
+      platform::TracerEventType::UserDefined, 9 /*level*/);
   size_t size = AlignedSize(unaligned_size, alignment_);
   VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;
 
@@ -108,6 +111,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
 }
 
 void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
+  platform::RecordEvent record_event("AutoGrowthBestFitAllocator::Free",
+      platform::TracerEventType::UserDefined, 9 /*level*/);
   VLOG(10) << "Free " << allocation->size()
            << " bytes, ptr = " << allocation->ptr();
   std::lock_guard<SpinLock> guard(spinlock_);
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index e7b86d6ec19c06d4ee9086590763f1afe23f99a9..8627e3e6f8811e162ce3014c01145f331a03ee4b 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 namespace paddle {
 namespace memory {
@@ -117,6 +118,8 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
 
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
+  platform::RecordEvent record_event("StreamSafeCUDAAllocator::Allocate",
+      platform::TracerEventType::UserDefined, 9 /*level*/);
   ProcessUnfreedAllocations();
   VLOG(8) << "Try allocate " << size << " bytes";
   AllocationPtr underlying_allocation;
@@ -144,6 +147,8 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
 }
 
 void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
+  platform::RecordEvent record_event("StreamSafeCUDAAllocator::Free",
+      platform::TracerEventType::UserDefined, 9 /*level*/);
   StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
       dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
   PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,