未验证 提交 cbe7466f 编写于 作者: L liutiexing 提交者: GitHub

executor perf statistics (#41648)

* executor perf statistics

* fix ut

* fix ut

* fix ut

* add ut

* add ut
上级 d0f3296b
...@@ -20,6 +20,8 @@ endif() ...@@ -20,6 +20,8 @@ endif()
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore) cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
# skip win32 since wget is not installed by default on windows machine. # skip win32 since wget is not installed by default on windows machine.
if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/platform/profiler/event_node.h"
namespace paddle {
namespace framework {
// Computes and reports performance statistics for the static-graph executor
// from the profiler's collected event trees.
// NOTE(review): the report destination (log output vs. a file controlled by
// FLAGS_static_executor_perfstat_filepath) is implemented in
// executor_statistics.cc -- not visible from this header; confirm there.
void StaticGraphExecutorPerfStatistics(
    std::shared_ptr<const platform::NodeTrees> profiling_data);
} // namespace framework
} // namespace paddle
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -59,6 +60,9 @@ paddle::framework::FetchList StandaloneExecutor::Run( ...@@ -59,6 +60,9 @@ paddle::framework::FetchList StandaloneExecutor::Run(
const std::vector<std::string>& feed_names, const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors, const std::vector<framework::LoDTensor>& feed_tensors,
const std::vector<std::string>& fetch_names) { const std::vector<std::string>& fetch_names) {
platform::RecordEvent record_event("StandaloneExecutor::run",
platform::TracerEventType::UserDefined, 1);
auto core = GetInterpreterCore(feed_names, fetch_names, true); auto core = GetInterpreterCore(feed_names, fetch_names, true);
return core->Run(feed_names, feed_tensors); return core->Run(feed_names, feed_tensors);
...@@ -67,6 +71,9 @@ paddle::framework::FetchList StandaloneExecutor::Run( ...@@ -67,6 +71,9 @@ paddle::framework::FetchList StandaloneExecutor::Run(
paddle::framework::FetchList StandaloneExecutor::Run( paddle::framework::FetchList StandaloneExecutor::Run(
const std::vector<std::string>& feed_names, const std::vector<std::string>& feed_names,
const std::vector<std::string>& fetch_names) { const std::vector<std::string>& fetch_names) {
platform::RecordEvent record_event("StandaloneExecutor::run",
platform::TracerEventType::UserDefined, 1);
auto core = GetInterpreterCore(feed_names, fetch_names, false); auto core = GetInterpreterCore(feed_names, fetch_names, false);
VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core;
return core->Run(feed_names); return core->Run(feed_names);
......
cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog) cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog)
cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog) cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog os_info)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
...@@ -129,6 +129,7 @@ class ThreadPoolTempl { ...@@ -129,6 +129,7 @@ class ThreadPoolTempl {
// this. We expect that such scenario is prevented by program, that is, // this. We expect that such scenario is prevented by program, that is,
// this is kept alive while any threads can potentially be in Schedule. // this is kept alive while any threads can potentially be in Schedule.
if (!t.f) { if (!t.f) {
// Allow 'false positive' which makes a redundant notification.
if (num_tasks > num_threads_ - blocked_) { if (num_tasks > num_threads_ - blocked_) {
VLOG(6) << "Add task, Notify"; VLOG(6) << "Add task, Notify";
ec_.Notify(false); ec_.Notify(false);
...@@ -379,9 +380,8 @@ class ThreadPoolTempl { ...@@ -379,9 +380,8 @@ class ThreadPoolTempl {
return false; return false;
} }
// Number of blocked threads is used as termination condition. // Number of blocked threads is used as notification condition.
// If we are shutting down and all worker threads blocked without work, // We must increase the counter before the emptiness check.
// that's we are done.
blocked_++; blocked_++;
// Now do a reliable emptiness check. // Now do a reliable emptiness check.
...@@ -393,6 +393,9 @@ class ThreadPoolTempl { ...@@ -393,6 +393,9 @@ class ThreadPoolTempl {
return true; return true;
} }
// Number of blocked threads is used as termination condition.
// If we are shutting down and all worker threads blocked without work,
// that's we are done.
if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) { if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
ec_.CancelWait(); ec_.CancelWait();
// Almost done, but need to re-check queues. // Almost done, but need to re-check queues.
......
...@@ -350,7 +350,7 @@ if(WITH_PYTHON) ...@@ -350,7 +350,7 @@ if(WITH_PYTHON)
add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file})
endif() endif()
list(APPEND PYBIND_DEPS interpretercore standalone_executor) list(APPEND PYBIND_DEPS interpretercore standalone_executor staticgraph_executor_statistics)
cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS})
list(APPEND PYBIND_DEPS op_function_common) list(APPEND PYBIND_DEPS op_function_common)
......
...@@ -46,6 +46,7 @@ limitations under the License. */ ...@@ -46,6 +46,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -2903,9 +2904,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2903,9 +2904,6 @@ All parameter, weight, gradient are variables in Paddle.
.def("run", .def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names, [](StandaloneExecutor &self, std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) { std::vector<std::string> fetch_names) {
platform::RecordEvent record_event(
"StandaloneExecutor::run",
platform::TracerEventType::UserDefined, 1);
paddle::framework::FetchList ret; paddle::framework::FetchList ret;
{ {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
...@@ -3380,7 +3378,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3380,7 +3378,10 @@ All parameter, weight, gradient are variables in Paddle.
.def("stop", .def("stop",
[](paddle::platform::Profiler *profiler) { [](paddle::platform::Profiler *profiler) {
platform::DisableHostEventRecorder(); platform::DisableHostEventRecorder();
return profiler->Stop(); auto result = profiler->Stop();
framework::StaticGraphExecutorPerfStatistics(
result->GetNodeTrees());
return result;
}, },
py::return_value_policy::automatic_reference); py::return_value_policy::automatic_reference);
......
...@@ -2,7 +2,7 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") ...@@ -2,7 +2,7 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}")
foreach(target ${TEST_INTERP_CASES}) foreach(target ${TEST_INTERP_CASES})
py_test_modules(${target} MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) py_test_modules(${target} MODULES ${target} ENVS FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0)
py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001)
py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0) py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0)
py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001) py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001)
......
...@@ -15,10 +15,13 @@ ...@@ -15,10 +15,13 @@
import os import os
os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true" os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
import sys import sys
import shutil
import unittest import unittest
import paddle import paddle
import json
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.core import StandaloneExecutor from paddle.fluid.core import StandaloneExecutor
from paddle.profiler import profiler
import numpy as np import numpy as np
...@@ -116,6 +119,107 @@ def build_program(): ...@@ -116,6 +119,107 @@ def build_program():
return main_program, startup_program, [mean] return main_program, startup_program, [mean]
class ExecutorStatisticsTestCase(unittest.TestCase):
    """Verify that running a program under the host profiler produces a
    non-empty executor perf-stat JSON file.

    Each test is a no-op unless FLAGS_static_executor_perfstat_filepath is
    set in the environment (the CMake test harness sets it).
    """

    def setUp(self):
        self.iter_n = 3
        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else paddle.CPUPlace()

    def _run_with_profiler(self, run_one_step):
        # Drive `run_one_step` for iter_n iterations under a CPU host
        # profiler; stopping the profiler triggers the perf-stat export.
        helper_profiler = profiler.Profiler(
            targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2))
        helper_profiler.start()
        for i in range(self.iter_n):
            run_one_step()
            helper_profiler.step()
        helper_profiler.stop()

    def _check_perfstat_file(self):
        # The exported file must exist and contain non-empty JSON; clean up
        # both the stat file and the profiler log directory afterwards.
        perfstat_filepath = os.environ[
            'FLAGS_static_executor_perfstat_filepath']
        self.assertTrue(os.path.exists(perfstat_filepath))
        with open(perfstat_filepath, 'r') as load_f:
            stat_res = json.load(load_f)
            self.assertTrue(len(stat_res) > 0)
        os.remove(perfstat_filepath)
        shutil.rmtree('./profiler_log')

    def _run_old_executor(self, use_compiled_program):
        # Shared body for the parallel-executor and plain-executor tests:
        # they differ only in whether main_program is wrapped in a
        # CompiledProgram before being run.
        paddle.seed(2020)
        main_program, startup_program, fetch_list = build_program()
        fetch_list = [x.name for x in fetch_list]
        if use_compiled_program:
            main_program = paddle.fluid.compiler.CompiledProgram(main_program)

        # Create the executor with the old (non-standalone) path, but run it
        # with the standalone executor enabled, matching the original test.
        os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0'
        executor = paddle.static.Executor(self.place)
        os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
        executor.run(startup_program)

        self._run_with_profiler(
            lambda: executor.run(main_program, fetch_list=fetch_list))
        self._check_perfstat_file()

    def test_standalone_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return

        paddle.seed(2020)
        main_program, startup_program, fetch_list = build_program()
        fetch_list = [x.name for x in fetch_list]

        p = core.Place()
        p.set_place(self.place)
        executor = StandaloneExecutor(p, startup_program.desc,
                                      main_program.desc, core.Scope())

        self._run_with_profiler(lambda: executor.run({}, fetch_list))
        self._check_perfstat_file()

    def test_parallel_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return
        self._run_old_executor(use_compiled_program=True)

    def test_executor_statistics(self):
        if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
            return
        self._run_old_executor(use_compiled_program=False)
class MultiStreamModelTestCase(unittest.TestCase): class MultiStreamModelTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
self.iter_n = 2 self.iter_n = 2
...@@ -155,6 +259,7 @@ class MultiStreamModelTestCase(unittest.TestCase): ...@@ -155,6 +259,7 @@ class MultiStreamModelTestCase(unittest.TestCase):
p.set_place(self.place) p.set_place(self.place)
inter_core = StandaloneExecutor(p, startup_program.desc, inter_core = StandaloneExecutor(p, startup_program.desc,
main_program.desc, core.Scope()) main_program.desc, core.Scope())
outs = [] outs = []
for i in range(self.iter_n): for i in range(self.iter_n):
outs.append( outs.append(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册