Unverified commit 7ecefec3, authored by liutiexing, committed by GitHub

Profile Executor (#39641)

* add align for WorkQueue

* add spinlock

* merge develop

* merge

* Add EventsWaiter

* Revert "Add EventsWaiter"

This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2.

* add log for Executor

* Profile Allocators

* Profile Allocators

* adjust interface

* remove lock for set

* fix

Co-authored-by: liutiexing <liutiexing@google.com>
Parent bc99a76c
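The hunks below instrument WorkQueue::AddTask and the allocator fast paths with platform::RecordEvent, a scoped RAII trace event: constructing it opens a profiling region at the given level (10 for the work queue, 9 for the allocators) and its destructor closes the region. A minimal standalone sketch of that pattern, assuming nothing about Paddle's internals (ScopedTraceEvent and the chrono-based timing below are illustrative only):

```cpp
#include <chrono>
#include <iostream>
#include <string>

// Illustrative stand-in for a scoped trace event: the constructor marks the
// start of a profiled region and the destructor marks its end, so the event
// covers exactly the enclosing scope.
class ScopedTraceEvent {
 public:
  ScopedTraceEvent(std::string name, int level)
      : name_(std::move(name)),
        level_(level),
        start_(std::chrono::steady_clock::now()) {}

  ~ScopedTraceEvent() {
    auto end = std::chrono::steady_clock::now();
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - start_)
                  .count();
    // A real tracer would hand this record to a profiler backend instead.
    std::cout << name_ << " (level " << level_ << "): " << us << " us\n";
  }

 private:
  std::string name_;
  int level_;
  std::chrono::steady_clock::time_point start_;
};

void AddTaskLikeFunction() {
  ScopedTraceEvent event("WorkQueue::AddTask", 10 /*level*/);
  // ... enqueue work here; the event ends when this scope exits ...
}
```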
@@ -44,7 +44,6 @@ class ThreadDataRegistry {
template <typename Alias = T,
typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
void SetCurrentThreadData(const T& val) {
-    std::lock_guard<std::mutex> lock(lock_);
CurrentThreadData() = val;
}
......
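The hunk above drops the mutex from SetCurrentThreadData, presumably because CurrentThreadData() resolves to a slot owned by the calling thread, so the write never races with another thread's write; only cross-thread snapshots of the registry would still need locking. A hedged sketch of that idea (TinyThreadDataRegistry is an illustration, not Paddle's ThreadDataRegistry, and it omits the guarded map that cross-thread readers would still use):

```cpp
#include <string>

// Illustrative thread-local registry: each thread owns its slot, so writing
// the current thread's value needs no lock. A real registry would also keep
// a mutex-protected map of all threads' slots for cross-thread snapshots.
template <typename T>
class TinyThreadDataRegistry {
 public:
  // No mutex: only the calling thread ever writes its own slot.
  void SetCurrentThreadData(const T& val) { CurrentThreadData() = val; }

  const T& GetCurrentThreadData() { return CurrentThreadData(); }

 private:
  static T& CurrentThreadData() {
    // One instance per thread, created on that thread's first use.
    static thread_local T data{};
    return data;
  }
};

// Usage: every thread sees and mutates only its own value.
// TinyThreadDataRegistry<std::string> reg;
// reg.SetCurrentThreadData("worker-0");
```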
@@ -8,6 +8,7 @@
#include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle {
namespace framework {
@@ -61,6 +62,8 @@ class WorkQueueImpl : public WorkQueue {
}
void AddTask(std::function<void()> fn) override {
platform::RecordEvent("WorkQueue::AddTask",
platform::TracerEventType::UserDefined, 10 /*level*/);
if (tracker_ != nullptr) {
fn = [
task = std::move(fn), raii = CounterGuard<TaskTracker>(tracker_)
@@ -156,6 +159,8 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() {
}
void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function<void()> fn) {
platform::RecordEvent("WorkQueue::AddTask",
platform::TracerEventType::UserDefined, 10 /*level*/);
assert(queue_idx < queues_.size());
if (queues_options_.at(queue_idx).track_task) {
fn = [
......
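For reference, the truncated lambda in AddTask above moves the task into a closure together with a CounterGuard, so the tracker's pending count drops exactly when the stored task object is destroyed. A simplified sketch of that wrapping pattern (TaskTracker and CounterGuard here are stand-ins, not the types from workqueue_utils.h):

```cpp
#include <atomic>
#include <cstdint>
#include <utility>

// Simplified stand-ins for the tracker/guard pair used by the work queue.
struct TaskTracker {
  std::atomic<std::uint64_t> num_pending{0};
  void AddCounter() { num_pending.fetch_add(1, std::memory_order_relaxed); }
  void SubCounter() { num_pending.fetch_sub(1, std::memory_order_relaxed); }
};

// RAII guard: bumps the counter on construction, drops it on destruction.
class CounterGuard {
 public:
  explicit CounterGuard(TaskTracker* tracker) : tracker_(tracker) {
    tracker_->AddCounter();
  }
  CounterGuard(CounterGuard&& other) noexcept
      : tracker_(std::exchange(other.tracker_, nullptr)) {}
  ~CounterGuard() {
    if (tracker_ != nullptr) tracker_->SubCounter();
  }

 private:
  TaskTracker* tracker_;
};

// Wrap a task so that destroying the wrapped callable (after it has run and
// been released by the queue) automatically decrements the pending count.
template <typename Fn>
auto WrapWithTracker(Fn fn, TaskTracker* tracker) {
  return [task = std::move(fn), raii = CounterGuard(tracker)]() mutable {
    task();
  };
}
```

Running the wrapped task and then letting it go out of scope brings num_pending back to its previous value, which is how the queue can wait for all tracked tasks to drain.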
@@ -18,6 +18,7 @@
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
PADDLE_DEFINE_EXPORTED_READONLY_bool(
free_idle_chunk, false,
@@ -47,6 +48,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
size_t unaligned_size) {
platform::RecordEvent("AutoGrowthBestFitAllocator::Allocate",
platform::TracerEventType::UserDefined, 9 /*level*/);
size_t size = AlignedSize(unaligned_size, alignment_);
VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;
@@ -108,6 +111,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
}
void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
platform::RecordEvent("AutoGrowthBestFitAllocator::Free",
platform::TracerEventType::UserDefined, 9 /*level*/);
VLOG(10) << "Free " << allocation->size()
<< " bytes, ptr = " << allocation->ptr();
std::lock_guard<SpinLock> guard(spinlock_);
......
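The Allocate path above first rounds the request up with AlignedSize(unaligned_size, alignment_) before searching for a best-fit block. A hedged sketch of what that rounding amounts to (AlignedSizeSketch is illustrative, not the helper from aligned_allocator.h):

```cpp
#include <cstddef>
#include <cstdio>

// Round a requested size up to the next multiple of the alignment.
static std::size_t AlignedSizeSketch(std::size_t size, std::size_t alignment) {
  const std::size_t remainder = size % alignment;
  return remainder == 0 ? size : size + (alignment - remainder);
}

int main() {
  // A 1000-byte request with a 256-byte alignment becomes a 1024-byte block.
  std::printf("%zu\n", AlignedSizeSketch(1000, 256));  // prints 1024
  return 0;
}
```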
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle {
namespace memory {
@@ -117,6 +118,8 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
platform::TracerEventType::UserDefined, 9 /*level*/);
ProcessUnfreedAllocations();
VLOG(8) << "Try allocate " << size << " bytes";
AllocationPtr underlying_allocation;
@@ -144,6 +147,8 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
}
void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
platform::RecordEvent("StreamSafeCUDAAllocator::Free",
platform::TracerEventType::UserDefined, 9 /*level*/);
StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
......
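AllocateImpl above calls ProcessUnfreedAllocations() before trying to allocate: frees that arrived while their recording streams were still busy appear to be parked and then retired once those streams finish. A rough sketch of that deferred-free bookkeeping (PendingFree, the stream_done predicate, and DeferredFreeList are hypothetical stand-ins, not Paddle's data structures):

```cpp
#include <deque>
#include <functional>
#include <mutex>

// A free request whose owning streams are still running is parked in a queue,
// and each later Allocate/Free call retires the entries whose streams have
// completed.
struct PendingFree {
  std::function<bool()> stream_done;     // e.g. wraps a stream/event query
  std::function<void()> release_memory;  // actually returns the block
};

class DeferredFreeList {
 public:
  void Defer(PendingFree pending) {
    std::lock_guard<std::mutex> guard(mutex_);
    pending_.push_back(std::move(pending));
  }

  // Called at the start of Allocate/Free: retire everything now safe to free.
  void ProcessUnfreed() {
    std::lock_guard<std::mutex> guard(mutex_);
    for (auto it = pending_.begin(); it != pending_.end();) {
      if (it->stream_done()) {
        it->release_memory();
        it = pending_.erase(it);
      } else {
        ++it;
      }
    }
  }

 private:
  std::mutex mutex_;
  std::deque<PendingFree> pending_;
};
```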