未验证 提交 cbe7466f 编写于 作者: L liutiexing 提交者: GitHub

executor perf statistics (#41648)

* executor perf statistics

* fix ut

* fix ut

* fix ut

* add ut

* add ut
上级 d0f3296b
...@@ -20,6 +20,8 @@ endif() ...@@ -20,6 +20,8 @@ endif()
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore) cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
# skip win32 since wget is not installed by default on windows machine. # skip win32 since wget is not installed by default on windows machine.
if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/platform/profiler/event_node.h"
namespace paddle {
namespace framework {
// Computes and reports performance statistics for the static-graph executor
// from the profiler's collected event trees.
// NOTE(review): the report destination (log output vs. a file controlled by
// FLAGS_static_executor_perfstat_filepath) is implemented in
// executor_statistics.cc -- not visible from this header; confirm there.
void StaticGraphExecutorPerfStatistics(
    std::shared_ptr<const platform::NodeTrees> profiling_data);
} // namespace framework
} // namespace paddle
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -59,6 +60,9 @@ paddle::framework::FetchList StandaloneExecutor::Run( ...@@ -59,6 +60,9 @@ paddle::framework::FetchList StandaloneExecutor::Run(
const std::vector<std::string>& feed_names, const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors, const std::vector<framework::LoDTensor>& feed_tensors,
const std::vector<std::string>& fetch_names) { const std::vector<std::string>& fetch_names) {
platform::RecordEvent record_event("StandaloneExecutor::run",
platform::TracerEventType::UserDefined, 1);
auto core = GetInterpreterCore(feed_names, fetch_names, true); auto core = GetInterpreterCore(feed_names, fetch_names, true);
return core->Run(feed_names, feed_tensors); return core->Run(feed_names, feed_tensors);
...@@ -67,6 +71,9 @@ paddle::framework::FetchList StandaloneExecutor::Run( ...@@ -67,6 +71,9 @@ paddle::framework::FetchList StandaloneExecutor::Run(
paddle::framework::FetchList StandaloneExecutor::Run( paddle::framework::FetchList StandaloneExecutor::Run(
const std::vector<std::string>& feed_names, const std::vector<std::string>& feed_names,
const std::vector<std::string>& fetch_names) { const std::vector<std::string>& fetch_names) {
platform::RecordEvent record_event("StandaloneExecutor::run",
platform::TracerEventType::UserDefined, 1);
auto core = GetInterpreterCore(feed_names, fetch_names, false); auto core = GetInterpreterCore(feed_names, fetch_names, false);
VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core;
return core->Run(feed_names); return core->Run(feed_names);
......
cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog) cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog)
cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog) cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog os_info)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
...@@ -129,6 +129,7 @@ class ThreadPoolTempl { ...@@ -129,6 +129,7 @@ class ThreadPoolTempl {
// this. We expect that such scenario is prevented by program, that is, // this. We expect that such scenario is prevented by program, that is,
// this is kept alive while any threads can potentially be in Schedule. // this is kept alive while any threads can potentially be in Schedule.
if (!t.f) { if (!t.f) {
// Allow 'false positive' which makes a redundant notification.
if (num_tasks > num_threads_ - blocked_) { if (num_tasks > num_threads_ - blocked_) {
VLOG(6) << "Add task, Notify"; VLOG(6) << "Add task, Notify";
ec_.Notify(false); ec_.Notify(false);
...@@ -379,9 +380,8 @@ class ThreadPoolTempl { ...@@ -379,9 +380,8 @@ class ThreadPoolTempl {
return false; return false;
} }
// Number of blocked threads is used as termination condition. // Number of blocked threads is used as notification condition.
// If we are shutting down and all worker threads blocked without work, // We must increase the counter before the emptiness check.
// that's we are done.
blocked_++; blocked_++;
// Now do a reliable emptiness check. // Now do a reliable emptiness check.
...@@ -393,6 +393,9 @@ class ThreadPoolTempl { ...@@ -393,6 +393,9 @@ class ThreadPoolTempl {
return true; return true;
} }
// Number of blocked threads is used as termination condition.
// If we are shutting down and all worker threads blocked without work,
// that's we are done.
if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) { if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
ec_.CancelWait(); ec_.CancelWait();
// Almost done, but need to re-check queues. // Almost done, but need to re-check queues.
......
...@@ -350,7 +350,7 @@ if(WITH_PYTHON) ...@@ -350,7 +350,7 @@ if(WITH_PYTHON)
add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file})
endif() endif()
list(APPEND PYBIND_DEPS interpretercore standalone_executor) list(APPEND PYBIND_DEPS interpretercore standalone_executor staticgraph_executor_statistics)
cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS})
list(APPEND PYBIND_DEPS op_function_common) list(APPEND PYBIND_DEPS op_function_common)
......
...@@ -46,6 +46,7 @@ limitations under the License. */ ...@@ -46,6 +46,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -2903,9 +2904,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2903,9 +2904,6 @@ All parameter, weight, gradient are variables in Paddle.
.def("run", .def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names, [](StandaloneExecutor &self, std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) { std::vector<std::string> fetch_names) {
platform::RecordEvent record_event(
"StandaloneExecutor::run",
platform::TracerEventType::UserDefined, 1);
paddle::framework::FetchList ret; paddle::framework::FetchList ret;
{ {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
...@@ -3380,7 +3378,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3380,7 +3378,10 @@ All parameter, weight, gradient are variables in Paddle.
.def("stop", .def("stop",
[](paddle::platform::Profiler *profiler) { [](paddle::platform::Profiler *profiler) {
platform::DisableHostEventRecorder(); platform::DisableHostEventRecorder();
return profiler->Stop(); auto result = profiler->Stop();
framework::StaticGraphExecutorPerfStatistics(
result->GetNodeTrees());
return result;
}, },
py::return_value_policy::automatic_reference); py::return_value_policy::automatic_reference);
......
...@@ -2,7 +2,7 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") ...@@ -2,7 +2,7 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}")
foreach(target ${TEST_INTERP_CASES}) foreach(target ${TEST_INTERP_CASES})
py_test_modules(${target} MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) py_test_modules(${target} MODULES ${target} ENVS FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0)
py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001)
py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0) py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0)
py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001) py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001)
......
...@@ -15,10 +15,13 @@ ...@@ -15,10 +15,13 @@
import os import os
os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true" os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
import sys import sys
import shutil
import unittest import unittest
import paddle import paddle
import json
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.core import StandaloneExecutor from paddle.fluid.core import StandaloneExecutor
from paddle.profiler import profiler
import numpy as np import numpy as np
...@@ -116,6 +119,107 @@ def build_program(): ...@@ -116,6 +119,107 @@ def build_program():
return main_program, startup_program, [mean] return main_program, startup_program, [mean]
class ExecutorStatisticsTestCase(unittest.TestCase):
    """Verify that running a program under the host profiler produces a
    non-empty executor perf-stat JSON file.

    Each test is a no-op unless FLAGS_static_executor_perfstat_filepath is
    set in the environment (the CMake test harness sets it).
    """

    def setUp(self):
        self.iter_n = 3
        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else paddle.CPUPlace()

    def _run_with_profiler(self, run_one_step):
        # Drive `run_one_step` for iter_n iterations under a CPU host
        # profiler; stopping the profiler triggers the perf-stat export.
        helper_profiler = profiler.Profiler(
            targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2))
        helper_profiler.start()
        for i in range(self.iter_n):
            run_one_step()
            helper_profiler.step()
        helper_profiler.stop()

    def _check_perfstat_file(self):
        # The exported file must exist and contain non-empty JSON; clean up
        # both the stat file and the profiler log directory afterwards.
        perfstat_filepath = os.environ[
            'FLAGS_static_executor_perfstat_filepath']
        self.assertTrue(os.path.exists(perfstat_filepath))
        with open(perfstat_filepath, 'r') as load_f:
            stat_res = json.load(load_f)
            self.assertTrue(len(stat_res) > 0)
        os.remove(perfstat_filepath)
        shutil.rmtree('./profiler_log')

    def _run_old_executor(self, use_compiled_program):
        # Shared body for the parallel-executor and plain-executor tests:
        # they differ only in whether main_program is wrapped in a
        # CompiledProgram before being run.
        paddle.seed(2020)
        main_program, startup_program, fetch_list = build_program()
        fetch_list = [x.name for x in fetch_list]
        if use_compiled_program:
            main_program = paddle.fluid.compiler.CompiledProgram(main_program)

        # Create the executor with the old (non-standalone) path, but run it
        # with the standalone executor enabled, matching the original test.
        os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0'
        executor = paddle.static.Executor(self.place)
        os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
        executor.run(startup_program)

        self._run_with_profiler(
            lambda: executor.run(main_program, fetch_list=fetch_list))
        self._check_perfstat_file()

    def test_standalone_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return

        paddle.seed(2020)
        main_program, startup_program, fetch_list = build_program()
        fetch_list = [x.name for x in fetch_list]

        p = core.Place()
        p.set_place(self.place)
        executor = StandaloneExecutor(p, startup_program.desc,
                                      main_program.desc, core.Scope())

        self._run_with_profiler(lambda: executor.run({}, fetch_list))
        self._check_perfstat_file()

    def test_parallel_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return
        self._run_old_executor(use_compiled_program=True)

    def test_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return
        self._run_old_executor(use_compiled_program=False)
class MultiStreamModelTestCase(unittest.TestCase): class MultiStreamModelTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
self.iter_n = 2 self.iter_n = 2
...@@ -155,6 +259,7 @@ class MultiStreamModelTestCase(unittest.TestCase): ...@@ -155,6 +259,7 @@ class MultiStreamModelTestCase(unittest.TestCase):
p.set_place(self.place) p.set_place(self.place)
inter_core = StandaloneExecutor(p, startup_program.desc, inter_core = StandaloneExecutor(p, startup_program.desc,
main_program.desc, core.Scope()) main_program.desc, core.Scope())
outs = [] outs = []
for i in range(self.iter_n): for i in range(self.iter_n):
outs.append( outs.append(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册