Unverified commit c65c6ae5, authored by wangchaochaohu, committed by GitHub

add flag to control profile level in python API (#22319)

* add python flag to control profile level test=develop
Parent fce37bc5
@@ -167,7 +167,13 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
}
{
-platform::RecordEvent record_event(Type());
+// TODO(wangchaochaohu): refine the code to use only one RecordEvent.
+// To record both the cost of each op type and of each op name, we emit two events.
+platform::RecordEvent op_type_record_event(Type());
+auto op_name = platform::OpName(outputs_, Type());
+platform::RecordEvent op_name_record_event(
+op_name, platform::RecordRole::kUniqueOp);
RunImpl(scope, place);
}
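To illustrate the double-event pattern above: each op run is timed twice, once under its aggregate op-type key and once under its unique "type%output%" key. A minimal Python sketch of the idea (the record_event helper below is illustrative only, not Paddle's API):

import time
from contextlib import contextmanager

@contextmanager
def record_event(name):
    # Illustrative stand-in for platform::RecordEvent: time a named scope.
    start = time.perf_counter()
    try:
        yield
    finally:
        print("%s: %.3f ms" % (name, (time.perf_counter() - start) * 1e3))

# Two nested events per op run, as in OperatorBase::Run above:
with record_event("conv2d"), record_event("conv2d%conv2d_0.w_0%"):
    pass  # the op's RunImpl would execute here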
@@ -950,7 +956,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
std::vector<std::string> transfered_inplace_vars;
Scope* transfer_scope = nullptr;
{
platform::RecordEvent record_event("prepare_data_inner_op");
platform::RecordEvent record_event("prepare_data",
platform::RecordRole::kInnerOp);
transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
runtime_ctx);
}
@@ -963,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
if (!all_kernels_must_compute_runtime_shape_) {
platform::RecordEvent record_event("infer_shape_inner_op");
platform::RecordEvent record_event("infer_shape",
platform::RecordRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->InferShape(&infer_shape_ctx);
}
@@ -975,7 +983,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
{
platform::RecordEvent record_event("compute_inner_op");
platform::RecordEvent record_event("compute",
platform::RecordRole::kInnerOp);
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
kernel_configs));
}
......
@@ -42,6 +42,7 @@ static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
std::mutex profiler_mu;
static TracerOption g_tracer_option = TracerOption::kDefault;
// The profiler state; the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The thread-local event list can only be accessed by its owning thread
@@ -137,11 +138,15 @@ void PopEvent(const std::string &name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id);
}
-RecordEvent::RecordEvent(const std::string &name)
-: is_enabled_(false), start_ns_(PosixInNsec()) {
+RecordEvent::RecordEvent(const std::string &name, const RecordRole role)
+: is_enabled_(false), start_ns_(PosixInNsec()), role_(role) {
if (g_state == ProfilerState::kDisabled || name.empty()) return;
+if ((g_tracer_option == TracerOption::kOpDetail &&
+role_ != RecordRole::kInnerOp) ||
+(g_tracer_option == TracerOption::kDefault &&
+role != RecordRole::kOrdinary))
+return;
// lock is not needed, the code below is thread-safe
is_enabled_ = true;
Event *e = PushEvent(name);
// Maybe need the same push/pop behavior.
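The early return above is the whole level filter: which events survive depends on the active TracerOption. Modeled in Python for clarity (these enums mirror the C++ ones; should_record is an illustrative helper, not a Paddle API, and follows the early return exactly as written above):

from enum import Enum

class TracerOption(Enum):
    kDefault = 0
    kOpDetail = 1
    kAllOpDetail = 2

class RecordRole(Enum):
    kOrdinary = 0
    kInnerOp = 1
    kUniqueOp = 2

def should_record(option, role):
    # Mirrors the constructor's early return: kDefault keeps only per-op-type
    # (kOrdinary) events, kOpDetail keeps only inner-op phase (kInnerOp)
    # events, and kAllOpDetail keeps everything.
    if option is TracerOption.kDefault:
        return role is RecordRole.kOrdinary
    if option is TracerOption.kOpDetail:
        return role is RecordRole.kInnerOp
    return True

assert should_record(TracerOption.kDefault, RecordRole.kUniqueOp) is False
assert should_record(TracerOption.kAllOpDetail, RecordRole.kUniqueOp) is True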
@@ -363,30 +368,21 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
}
-if (print_depth >= 100) return;
-int print_depth_next = print_depth;
if (events_table.size() <= 0) return;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
auto event_item = events_table[i][j];
std::vector<std::vector<EventItem>> child_table;
std::vector<EventItem> table;
-bool do_next = false;
-std::string op_end_str = "inner_op";
for (auto it = child_map.begin(); it != child_map.end(); it++) {
if (it->first == event_item.name) {
table.push_back(it->second);
-if (!do_next)
-do_next = !(it->second.name.rfind(op_end_str) ==
-(it->second.name.length() - op_end_str.length()));
}
}
child_table.push_back(table);
-if (do_next)
-print_depth_next = print_depth + 1;
-else
-print_depth_next = print_depth + 100;
auto name_len = event_item.name.length();
std::string print_name = event_item.name.substr(remove_len, name_len);
std::string delimiter;
@@ -394,10 +390,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
delimiter = " " + delimiter;
}
print_name = delimiter + print_name;
-size_t pos = 0;
-while ((pos = print_name.find(op_end_str)) != std::string::npos) {
-print_name.erase(pos, op_end_str.length());
-}
std::cout << std::setw(name_width) << print_name << std::setw(data_width)
<< event_item.calls << std::setw(data_width)
<< event_item.total_time;
@@ -416,14 +409,14 @@ void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.ratio << std::endl;
PrintProfiler(child_table, child_map, sorted_domain, name_width,
-data_width, merge_thread, print_depth_next, 0);
+data_width, merge_thread, print_depth + 1, 0);
}
}
}
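With the do_next/print_depth_next bookkeeping gone, the traversal is a plain recursion: print an item, then recurse into its children one level deeper. A condensed Python model (assuming child_map is a dict mapping a parent event name to its list of child items):

def print_tree(items, child_map, depth=0):
    # Print each event, then recurse one level deeper, mirroring the
    # unconditional `print_depth + 1` recursion above.
    for item in items:
        print("%s%-40s calls=%-6d total=%.3f" %
              ("  " * depth, item["name"], item["calls"], item["total_time"]))
        print_tree(child_map.get(item["name"], []), child_map, depth + 1)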
std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
EventSortingKey sorted_by, std::string *domain) {
-auto sorted_domain = *domain;
+std::string sorted_domain;
std::function<bool(const EventItem &, const EventItem &)> sorted_func;
switch (sorted_by) {
case EventSortingKey::kCalls:
@@ -471,6 +464,7 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
default:
sorted_domain = "event first end time";
}
+*domain = sorted_domain;
return sorted_func;
}
@@ -709,6 +703,45 @@ void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
PrintMemProfiler(annotation_report, 55, 18);
}
void DealWithShowName() {
std::unordered_map<std::string, int> prefix_name;
std::vector<std::string> op_out_name;
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
for (auto &block : (*it)->event_blocks) {
for (auto &r : block) {
auto event_name = r.name();
size_t start = event_name.find('%', 0);
size_t end = event_name.find('%', start + 1);
std::string prefix_str = event_name.substr(0, start);
while (start != std::string::npos && end != std::string::npos) {
auto search_str = event_name.substr(start, end - start + 1);
auto it = find(op_out_name.begin(), op_out_name.end(), search_str);
std::string replace_str;
bool prefix_find = true;
if (prefix_name.find(prefix_str) == prefix_name.end()) {
prefix_find = false;
prefix_name[prefix_str] = 0;
}
if (it == op_out_name.end()) {
if (prefix_find)
prefix_name[prefix_str] = prefix_name[prefix_str] + 1;
op_out_name.push_back(search_str);
}
replace_str = std::to_string(prefix_name[prefix_str]);
event_name.replace(start, end - start + 1, replace_str);
start = start + 1;
start = event_name.find('%', start);
end = event_name.find('%', start + 1);
prefix_str = event_name.substr(0, start);
}
r.set_name(event_name);
}
}
}
}
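In effect, DealWithShowName rewrites every "%...%" span of an event name into a small per-prefix index, so op instances print as conv2d0, conv2d1, ... instead of carrying full output-variable names. A simplified Python model of the renaming (a hypothetical helper that assigns each distinct payload a stable index per prefix, which is what the loop above intends):

def shorten_show_names(event_names):
    index_of = {}  # (prefix, "%payload%") -> assigned index
    next_idx = {}  # prefix -> next unused index
    renamed = []
    for name in event_names:
        start = name.find('%')
        while start != -1:
            end = name.find('%', start + 1)
            if end == -1:
                break
            prefix, payload = name[:start], name[start:end + 1]
            if (prefix, payload) not in index_of:
                index_of[(prefix, payload)] = next_idx.get(prefix, 0)
                next_idx[prefix] = index_of[(prefix, payload)] + 1
            name = name[:start] + str(index_of[(prefix, payload)]) + name[end + 1:]
            start = name.find('%', start + 1)
        renamed.append(name)
    return renamed

# e.g. ["conv2d%conv2d_0.w_0%", "conv2d%conv2d_1.w_0%"] -> ["conv2d0", "conv2d1"]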
void DisableProfiler(EventSortingKey sorted_key,
const std::string &profile_path) {
SynchronizeAllDevice();
@@ -718,37 +751,17 @@ void DisableProfiler(EventSortingKey sorted_key,
if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop.
Mark("_stop_profiler_");
+DealWithShowName();
DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled()) {
tracer->Disable();
-tracer->GenProfile(profile_path);
tracer->GenEventKernelCudaElapsedTime();
+tracer->GenProfile(profile_path);
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
-std::vector<std::string> op_name;
-for (size_t i = 0; i < (all_events).size(); i++) {
-for (size_t j = 0; j < (all_events)[i].size(); j++) {
-std::string event_name = (all_events)[i][j].name();
-size_t start = event_name.find('%', 0);
-size_t end = event_name.rfind('%', event_name.length() - 1);
-if (start == std::string::npos || end == std::string::npos) continue;
-std::string search_str = event_name.substr(start, end - start + 1);
-auto it = find(op_name.begin(), op_name.end(), search_str);
-std::string replace_str;
-if (it != op_name.end()) {
-replace_str = std::to_string(std::distance(op_name.begin(), it));
-} else {
-replace_str = std::to_string(op_name.size());
-op_name.push_back(search_str);
-}
-event_name.replace(start, end - start + 1, replace_str);
-(all_events)[i][j].set_name(event_name);
-}
-}
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
if (VLOG_IS_ON(5)) {
@@ -773,5 +786,30 @@ void SetProfileListener() {
}
int64_t ListenerId() { return profiler_lister_id; }
std::string OpName(const framework::VariableNameMap &name_map,
const std::string &type_name) {
if (platform::GetTracerOption() != platform::TracerOption::kAllOpDetail)
return "";
std::string ret = type_name + "%";
for (auto it = name_map.begin(); it != name_map.end(); it++) {
auto name_outputs = it->second;
if (!name_outputs.empty() &&
type_name.length() < name_outputs[0].length()) {
ret = ret + name_outputs[0];
break;
}
}
ret = ret + "%";
return ret;
}
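OpName thus yields keys of the form "type%first_long_output%", and an empty string unless the kAllOpDetail level is active, which is why the unique-name RecordEvent in OperatorBase::Run becomes a no-op at lower trace levels. A rough Python equivalent, with outputs given as a dict mapping slot names to lists of variable names:

def op_name(outputs, type_name, all_op_detail):
    # Returns "" unless the kAllOpDetail level is active, as OpName does.
    if not all_op_detail:
        return ""
    ret = type_name + "%"
    for var_names in outputs.values():
        # Take the first output variable whose name is longer than the type name.
        if var_names and len(type_name) < len(var_names[0]):
            ret += var_names[0]
            break
    return ret + "%"

# e.g. op_name({"Out": ["conv2d_0.w_0"]}, "conv2d", True) == "conv2d%conv2d_0.w_0%"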
void SetTracerOption(TracerOption option) {
std::lock_guard<std::mutex> l(profiler_mu);
g_tracer_option = option;
}
platform::TracerOption GetTracerOption() { return g_tracer_option; }
} // namespace platform
} // namespace paddle
@@ -23,6 +23,7 @@ limitations under the License. */
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
@@ -39,6 +40,19 @@ enum class ProfilerState {
kAll, // Profile both CPU and GPU. (Currently experimental).
};
enum class RecordRole {
kOrdinary,   // record op time, keyed by op type
kInnerOp,    // record detailed op time, keyed by op type
kUniqueOp,   // record detailed op time, keyed by unique op name
};
// This flag controls how much detail the profiling result prints.
enum class TracerOption {
kDefault,      // print the profiling result of each op type
kOpDetail,     // print the detailed profiling result of each op type
kAllOpDetail,  // print the detailed profiling result of each op name
};
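These three levels are what the Python API below exposes as the strings 'Default', 'OpDetail' and 'AllOpDetail'. A minimal usage sketch (run_one_step is a placeholder for the user's executor call):

import paddle.fluid.profiler as profiler

# 'Default'     -> per-op-type summary rows only
# 'OpDetail'    -> plus per-phase rows (prepare_data / infer_shape / compute)
# 'AllOpDetail' -> plus per-op-instance rows keyed by unique name
with profiler.profiler('CPU', 'total', '/tmp/profile', tracer_option='OpDetail'):
    for _ in range(10):
        run_one_step()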
void Mark(const std::string& name);
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
@@ -79,7 +93,8 @@ Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name);
struct RecordEvent {
-explicit RecordEvent(const std::string& name);
+RecordEvent(const std::string& name,
+const RecordRole r_type = RecordRole::kOrdinary);
~RecordEvent();
@@ -90,6 +105,7 @@ struct RecordEvent {
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
std::string full_name_;
RecordRole role_{RecordRole::kOrdinary};
};
class RecordRPCEvent {
@@ -181,6 +197,10 @@ bool ShouldSendProfileState();
void SetProfileListener();
int64_t ListenerId();
std::string OpName(const framework::VariableNameMap& name_map,
const std::string& type_name);
void SetTracerOption(TracerOption option);
platform::TracerOption GetTracerOption();
#ifdef PADDLE_WITH_CUDA
void DummyKernelAndEvent();
#endif
......
@@ -1555,6 +1555,12 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
.value("kAllOpDetail", platform::TracerOption::kAllOpDetail)
.export_values();
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU)
@@ -1571,6 +1577,7 @@ All parameter, weight, gradient are variables in Paddle.
.value("kAve", platform::EventSortingKey::kAve)
.export_values();
m.def("set_tracer_option", platform::SetTracerOption);
m.def("enable_profiler", platform::EnableProfiler);
m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled);
......
@@ -126,7 +126,7 @@ def reset_profiler():
core.reset_profiler()
-def start_profiler(state):
+def start_profiler(state, tracer_option='Default'):
"""
Enable the profiler. Users can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to profile, which is equivalent to the usage
@@ -137,9 +137,15 @@ def start_profiler(state):
or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
both CPU and GPU; 'All' means profiling both CPU and GPU, and
generates timeline as well.
tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'].
It controls the profiling level and what the printed result contains. 'Default' prints the
profiling result of each op type; 'OpDetail' also prints the detailed profiling result of
each op type, such as compute and data transform; 'AllOpDetail' prints the same detail
as 'OpDetail', but keyed by each op name.
Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
ValueError: If `tracer_option` is not in ['Default', 'OpDetail', 'AllOpDetail'].
Examples:
@@ -154,6 +160,13 @@ def start_profiler(state):
profiler.reset_profiler()
# exclude the first two iterations from the result
profiler.stop_profiler('total', '/tmp/profile')
profiler.start_profiler('GPU', "OpDetail")
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
# exclude the first two iterations from the result
profiler.stop_profiler('total', '/tmp/profile')
"""
if core.is_profiler_enabled():
return
@@ -165,6 +178,18 @@ def start_profiler(state):
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
if tracer_option not in ['Default', 'OpDetail', 'AllOpDetail']:
raise ValueError(
"tracer_option must be one of 'Default', 'OpDetail', 'AllOpDetail'.")
if tracer_option == "Default":
prof_tracer_option = core.TracerOption.kDefault
elif tracer_option == "OpDetail":
prof_tracer_option = core.TracerOption.kOpDetail
else:
prof_tracer_option = core.TracerOption.kAllOpDetail
core.set_tracer_option(prof_tracer_option)
core.enable_profiler(prof_state)
@@ -184,8 +209,8 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time.
profile_path (str, optional) : If state == 'All', it will generate timeline,
and write it into `profile_path`. The default profile_path is `/tmp/profile`.
Raises:
ValueError: If `sorted_key` is not in
@@ -225,7 +250,10 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
@signature_safe_contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+def profiler(state,
+sorted_key=None,
+profile_path='/tmp/profile',
+tracer_option='Default'):
"""
The profiler interface. Different from `fluid.profiler.cuda_profiler`,
this profiler can be used to profile both CPU and GPU program.
@@ -246,6 +274,11 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
The `ave` means sorting by the average execution time.
profile_path (str, optional) : If state == 'All', it will generate timeline,
and write it into `profile_path`. The default profile_path is `/tmp/profile`.
tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'].
It controls the profiling level and what the printed result contains. 'Default' prints the
profiling result of each op type; 'OpDetail' also prints the detailed profiling result of
each op type, such as compute and data transform; 'AllOpDetail' prints the same detail
as 'OpDetail', but keyed by each op name.
Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
@@ -268,7 +301,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
-with profiler.profiler('CPU', 'total', '/tmp/profile') as prof:
+with profiler.profiler('CPU', 'total', '/tmp/profile', 'Default') as prof:
for i in range(epoc):
input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input})
@@ -311,6 +344,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
thread0::conv2d 8 7.93456 0.291385 5.63342 0.99182 0.795243
thread0::elementwise_add 8 1.96555 0.191884 0.518004 0.245693 0.196998
"""
-start_profiler(state)
+start_profiler(state, tracer_option)
yield
stop_profiler(sorted_key, profile_path)
@@ -22,6 +22,7 @@ import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.layers as layers
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard
import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
@@ -30,7 +31,11 @@ class TestProfiler(unittest.TestCase):
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
-def net_profiler(self, state, use_parallel_executor=False):
+def net_profiler(self,
+state,
+option,
+iter_range=None,
+use_parallel_executor=False):
profile_path = os.path.join(tempfile.gettempdir(), "profile")
open(profile_path, "w").write("")
startup_program = fluid.Program()
@@ -75,7 +80,7 @@ class TestProfiler(unittest.TestCase):
main_program=main_program)
pass_acc_calculator = fluid.average.WeightedAverage()
-with profiler.profiler(state, 'total', profile_path) as prof:
+with profiler.profiler(state, 'total', profile_path, option) as prof:
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
@@ -94,36 +99,37 @@ class TestProfiler(unittest.TestCase):
pass_acc_calculator.add(value=acc, weight=b_size)
pass_acc = pass_acc_calculator.eval()
data = open(profile_path, 'rb').read()
-self.assertGreater(len(data), 0)
-profile_pb = profiler_pb2.Profile()
-profile_pb.ParseFromString(data)
-self.assertGreater(len(profile_pb.events), 0)
-for event in profile_pb.events:
-if event.type == profiler_pb2.Event.GPUKernel:
-if not event.detail_info and not event.name.startswith("MEM"):
-raise Exception(
-"Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
-% event.name)
-elif event.type == profiler_pb2.Event.CPU and (
-event.name.startswith("Driver API") or
-event.name.startswith("Runtime API")):
-print("Warning: unregister", event.name)
+if (len(data) > 0):
+profile_pb = profiler_pb2.Profile()
+profile_pb.ParseFromString(data)
+self.assertGreater(len(profile_pb.events), 0)
+for event in profile_pb.events:
+if event.type == profiler_pb2.Event.GPUKernel:
+if not event.detail_info and not event.name.startswith(
+"MEM"):
+raise Exception(
+"Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
+% event.name)
+elif event.type == profiler_pb2.Event.CPU and (
+event.name.startswith("Driver API") or
+event.name.startswith("Runtime API")):
+print("Warning: unregister", event.name)
def test_cpu_profiler(self):
-self.net_profiler('CPU')
-self.net_profiler('CPU', use_parallel_executor=True)
+self.net_profiler('CPU', "Default")
+self.net_profiler('CPU', "Default", use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_cuda_profiler(self):
-self.net_profiler('GPU')
-self.net_profiler('GPU', use_parallel_executor=True)
+self.net_profiler('GPU', "OpDetail")
+self.net_profiler('GPU', "OpDetail", use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_all_profiler(self):
-self.net_profiler('All')
-self.net_profiler('All', use_parallel_executor=True)
+self.net_profiler('All', "AllOpDetail")
+self.net_profiler('All', "AllOpDetail", use_parallel_executor=True)
if __name__ == '__main__':
......