diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index bcd586e8c099c81fb58f3fcc3c221858ab1ce511..df773c044bf118e2b381c4281c8e5039360aeef9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -167,7 +167,13 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } { - platform::RecordEvent record_event(Type()); + // TODO(wangchaochaohu) : refine code to use only one RecordEvent) + // in order to record different op type cost time + // and different op name cost time,we set two event. + platform::RecordEvent op_type_record_event(Type()); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::RecordRole::kUniqueOp); RunImpl(scope, place); } @@ -950,7 +956,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, std::vector transfered_inplace_vars; Scope* transfer_scope = nullptr; { - platform::RecordEvent record_event("prepare_data_inner_op"); + platform::RecordEvent record_event("prepare_data", + platform::RecordRole::kInnerOp); transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } @@ -963,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } if (!all_kernels_must_compute_runtime_shape_) { - platform::RecordEvent record_event("infer_shape_inner_op"); + platform::RecordEvent record_event("infer_shape", + platform::RecordRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->InferShape(&infer_shape_ctx); } @@ -975,7 +983,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. { - platform::RecordEvent record_event("compute_inner_op"); + platform::RecordEvent record_event("compute", + platform::RecordRole::kInnerOp); (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx, kernel_configs)); } diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 2e89dbf8fdd153a2e16594ed27a3e9691193ee1e..07659e596e0fc426ff28c0b2bf5d49f4c8606028 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -42,6 +42,7 @@ static int64_t profiler_lister_id = 0; static bool should_send_profile_state = false; std::mutex profiler_mu; +static TracerOption g_tracer_option = TracerOption::kDefault; // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; // The thread local event list only can be accessed by the specific thread @@ -137,11 +138,15 @@ void PopEvent(const std::string &name) { GetEventList().Record(EventType::kPopRange, name, g_thread_id); } -RecordEvent::RecordEvent(const std::string &name) - : is_enabled_(false), start_ns_(PosixInNsec()) { +RecordEvent::RecordEvent(const std::string &name, const RecordRole role) + : is_enabled_(false), start_ns_(PosixInNsec()), role_(role) { if (g_state == ProfilerState::kDisabled || name.empty()) return; + if ((g_tracer_option == TracerOption::kOpDetail && + role_ != RecordRole::kInnerOp) || + (g_tracer_option == TracerOption::kDefault && + role != RecordRole::kOrdinary)) + return; // lock is not needed, the code below is thread-safe - is_enabled_ = true; Event *e = PushEvent(name); // Maybe need the same push/pop behavior. @@ -363,30 +368,21 @@ void PrintProfiler(const std::vector> &events_table, << "Max." << std::setw(data_width) << "Ave." << std::setw(data_width) << "Ratio." << std::endl; } - if (print_depth >= 100) return; - int print_depth_next = print_depth; + + if (events_table.size() <= 0) return; + for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { auto event_item = events_table[i][j]; std::vector> child_table; std::vector table; - bool do_next = false; - std::string op_end_str = "inner_op"; for (auto it = child_map.begin(); it != child_map.end(); it++) { if (it->first == event_item.name) { table.push_back(it->second); - if (!do_next) - do_next = !(it->second.name.rfind(op_end_str) == - (it->second.name.length() - op_end_str.length())); } } child_table.push_back(table); - if (do_next) - print_depth_next = print_depth + 1; - else - print_depth_next = print_depth + 100; - auto name_len = event_item.name.length(); std::string print_name = event_item.name.substr(remove_len, name_len); std::string delimiter; @@ -394,10 +390,7 @@ void PrintProfiler(const std::vector> &events_table, delimiter = " " + delimiter; } print_name = delimiter + print_name; - size_t pos = 0; - while ((pos = print_name.find(op_end_str)) != std::string::npos) { - print_name.erase(pos, op_end_str.length()); - } + std::cout << std::setw(name_width) << print_name << std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.total_time; @@ -416,14 +409,14 @@ void PrintProfiler(const std::vector> &events_table, << std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ratio << std::endl; PrintProfiler(child_table, child_map, sorted_domain, name_width, - data_width, merge_thread, print_depth_next, 0); + data_width, merge_thread, print_depth + 1, 0); } } } std::function SetSortedFunc( EventSortingKey sorted_by, std::string *domain) { - auto sorted_domain = *domain; + std::string sorted_domain; std::function sorted_func; switch (sorted_by) { case EventSortingKey::kCalls: @@ -471,6 +464,7 @@ std::function SetSortedFunc( default: sorted_domain = "event first end time"; } + *domain = sorted_domain; return sorted_func; } @@ -709,6 +703,45 @@ void ParseMemEvents(const std::vector> &events) { PrintMemProfiler(annotation_report, 55, 18); } +void DealWithShowName() { + std::unordered_map prefix_name; + std::vector op_out_name; + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + for (auto &block : (*it)->event_blocks) { + for (auto &r : block) { + auto event_name = r.name(); + size_t start = event_name.find('%', 0); + size_t end = event_name.find('%', start + 1); + std::string prefix_str = event_name.substr(0, start); + while (start != std::string::npos && end != std::string::npos) { + auto search_str = event_name.substr(start, end - start + 1); + auto it = find(op_out_name.begin(), op_out_name.end(), search_str); + std::string replace_str; + bool prefix_find = true; + if (prefix_name.find(prefix_str) == prefix_name.end()) { + prefix_find = false; + prefix_name[prefix_str] = 0; + } + + if (it == op_out_name.end()) { + if (prefix_find) + prefix_name[prefix_str] = prefix_name[prefix_str] + 1; + op_out_name.push_back(search_str); + } + replace_str = std::to_string(prefix_name[prefix_str]); + event_name.replace(start, end - start + 1, replace_str); + start = start + 1; + start = event_name.find('%', start); + end = event_name.find('%', start + 1); + prefix_str = event_name.substr(0, start); + } + r.set_name(event_name); + } + } + } +} + void DisableProfiler(EventSortingKey sorted_key, const std::string &profile_path) { SynchronizeAllDevice(); @@ -718,37 +751,17 @@ void DisableProfiler(EventSortingKey sorted_key, if (g_state == ProfilerState::kDisabled) return; // Mark the profiling stop. Mark("_stop_profiler_"); + DealWithShowName(); DeviceTracer *tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { tracer->Disable(); - tracer->GenProfile(profile_path); tracer->GenEventKernelCudaElapsedTime(); + tracer->GenProfile(profile_path); } std::vector> all_events = GetAllEvents(); - std::vector op_name; - for (size_t i = 0; i < (all_events).size(); i++) { - for (size_t j = 0; j < (all_events)[i].size(); j++) { - std::string event_name = (all_events)[i][j].name(); - size_t start = event_name.find('%', 0); - size_t end = event_name.rfind('%', event_name.length() - 1); - if (start == std::string::npos || end == std::string::npos) continue; - std::string search_str = event_name.substr(start, end - start + 1); - auto it = find(op_name.begin(), op_name.end(), search_str); - std::string replace_str; - if (it != op_name.end()) { - replace_str = std::to_string(std::distance(op_name.begin(), it)); - } else { - replace_str = std::to_string(op_name.size()); - op_name.push_back(search_str); - } - event_name.replace(start, end - start + 1, replace_str); - (all_events)[i][j].set_name(event_name); - } - } - ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, false, sorted_key); if (VLOG_IS_ON(5)) { @@ -773,5 +786,30 @@ void SetProfileListener() { } int64_t ListenerId() { return profiler_lister_id; } +std::string OpName(const framework::VariableNameMap &name_map, + const std::string &type_name) { + if (platform::GetTracerOption() != platform::TracerOption::kAllOpDetail) + return ""; + + std::string ret = type_name + "%"; + for (auto it = name_map.begin(); it != name_map.end(); it++) { + auto name_outputs = it->second; + if (!name_outputs.empty() && + type_name.length() < name_outputs[0].length()) { + ret = ret + name_outputs[0]; + break; + } + } + ret = ret + "%"; + + return ret; +} + +void SetTracerOption(TracerOption option) { + std::lock_guard l(profiler_mu); + g_tracer_option = option; +} + +platform::TracerOption GetTracerOption() { return g_tracer_option; } } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 4550b99a064a3950554ade662b768d1d147d729e..eb135475b5d1d34b9f814e81e25c8040c0905813 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -23,6 +23,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/place.h" @@ -39,6 +40,19 @@ enum class ProfilerState { kAll, // Profile both CPU and GPU. (Currently experimental). }; +enum class RecordRole { + kOrdinary, // only record op time with op type key + kInnerOp, // record op detail time with op type key + kUniqueOp, // record op detail time with op unique name key +}; + +// it is the flag to control to print the profiling result +enum class TracerOption { + kDefault, // print the different op type profiling result + kOpDetail, // print the detail profiling result of different op type + kAllOpDetail, // print the detail profiling result of different op name +}; + void Mark(const std::string& name); void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, @@ -79,7 +93,8 @@ Event* PushEvent(const std::string& name); void PopEvent(const std::string& name); struct RecordEvent { - explicit RecordEvent(const std::string& name); + RecordEvent(const std::string& name, + const RecordRole r_type = RecordRole::kOrdinary); ~RecordEvent(); @@ -90,6 +105,7 @@ struct RecordEvent { // Need to distinguish name by op type, block_id, program_id and perhaps // different kernel invocations within an op. std::string full_name_; + RecordRole role_{RecordRole::kOrdinary}; }; class RecordRPCEvent { @@ -181,6 +197,10 @@ bool ShouldSendProfileState(); void SetProfileListener(); int64_t ListenerId(); +std::string OpName(const framework::VariableNameMap& name_map, + const std::string& type_name); +void SetTracerOption(TracerOption option); +platform::TracerOption GetTracerOption(); #ifdef PADDLE_WITH_CUDA void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc35d887865c8490ab8a53bd60ce4e5ebd75c0a8..c080d9219fd6bed9edb7c3325adbffb27166e4bf 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1555,6 +1555,12 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif + py::enum_(m, "TracerOption", py::arithmetic()) + .value("kDefault", platform::TracerOption::kDefault) + .value("kOpDetail", platform::TracerOption::kOpDetail) + .value("kAllOpDetail", platform::TracerOption::kAllOpDetail) + .export_values(); + py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -1571,6 +1577,7 @@ All parameter, weight, gradient are variables in Paddle. .value("kAve", platform::EventSortingKey::kAve) .export_values(); + m.def("set_tracer_option", platform::SetTracerOption); m.def("enable_profiler", platform::EnableProfiler); m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 82b29e25fde11ed488440fb1486b954144ea7cc7..c0b0c86ecd2b30be82d46cf3af7bcd13045cff6e 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -126,7 +126,7 @@ def reset_profiler(): core.reset_profiler() -def start_profiler(state): +def start_profiler(state, tracer_option='Default'): """ Enable the profiler. Uers can use `fluid.profiler.start_profiler` and `fluid.profiler.stop_profiler` to profile, which is equal to the usage @@ -137,9 +137,15 @@ def start_profiler(state): or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling both CPU and GPU; 'All' means profiling both CPU and GPU, and generates timeline as well. + tracer_option (str) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it + can control the profile level and print the different level profile result. Default option print + the different Op type profiling result and the OpDetail option print the detail profiling + result of different op types such as compute and data transform, AllOpDetail option + print the detail profiling result of different op name same as OpDetail. Raises: ValueError: If `state` is not in ['CPU', 'GPU', 'All']. + ValueError: If `tracer_option` is not in ['Default', 'OpDetail', 'AllOpDetail'] Examples: @@ -154,6 +160,13 @@ def start_profiler(state): profiler.reset_profiler() # except each iteration profiler.stop_profiler('total', '/tmp/profile') + + profiler.start_profiler('GPU', "OpDetail") + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # except each iteration + profiler.stop_profiler('total', '/tmp/profile') """ if core.is_profiler_enabled(): return @@ -165,6 +178,18 @@ def start_profiler(state): prof_state = core.ProfilerState.kCPU else: prof_state = core.ProfilerState.kAll + + if tracer_option not in ['Default', 'OpDetail', 'AllOpDetail']: + raise ValueError( + "tracer option must be 'Default', 'OpDetail', 'AllOpDetail'.") + if tracer_option == "Default": + prof_tracer_option = core.TracerOption.kDefault + elif tracer_option == "OpDetail": + prof_tracer_option = core.TracerOption.kOpDetail + else: + prof_tracer_option = core.TracerOption.kAllOpDetail + + core.set_tracer_option(prof_tracer_option) core.enable_profiler(prof_state) @@ -184,8 +209,8 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): The `max` means sorting by the maximum execution time. The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. - profile_path (str, optional) : If state == 'All', it will generate timeline, and write it into `profile_path`. The default profile_path is `/tmp/profile`. + profile_path (str, optional) : If state == 'All', it will generate timeline, Raises: ValueError: If `sorted_key` is not in @@ -225,7 +250,10 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): @signature_safe_contextmanager -def profiler(state, sorted_key=None, profile_path='/tmp/profile'): +def profiler(state, + sorted_key=None, + profile_path='/tmp/profile', + tracer_option='Default'): """ The profiler interface. Different from `fluid.profiler.cuda_profiler`, this profiler can be used to profile both CPU and GPU program. @@ -246,6 +274,11 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): The `ave` means sorting by the average execution time. profile_path (str, optional) : If state == 'All', it will generate timeline, and write it into `profile_path`. The default profile_path is `/tmp/profile`. + tracer_option (str) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it + can control the profile level and print the different level profile result. Default option print + the different Op type profiling result and the OpDetail option print the detail profiling + result of different op types such as compute and data transform, AllOpDetail option + print the detail profiling result of different op name same as OpDetail. Raises: ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is @@ -268,7 +301,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - with profiler.profiler('CPU', 'total', '/tmp/profile') as prof: + with profiler.profiler('CPU', 'total', '/tmp/profile', 'Default') as prof: for i in range(epoc): input = np.random.random(dshape).astype('float32') exe.run(fluid.default_main_program(), feed={'data': input}) @@ -311,6 +344,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): thread0::conv2d 8 7.93456 0.291385 5.63342 0.99182 0.795243 thread0::elementwise_add 8 1.96555 0.191884 0.518004 0.245693 0.196998 """ - start_profiler(state) + start_profiler(state, tracer_option) yield stop_profiler(sorted_key, profile_path) diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 367b60831c5b1d0397b7729acf078513bb074299..2290f38e2f1ebcb064ecc9d4b0d86b7988949af3 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.profiler as profiler import paddle.fluid.layers as layers import paddle.fluid.core as core +from paddle.fluid import compiler, Program, program_guard import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 @@ -30,7 +31,11 @@ class TestProfiler(unittest.TestCase): def setUpClass(cls): os.environ['CPU_NUM'] = str(4) - def net_profiler(self, state, use_parallel_executor=False): + def net_profiler(self, + state, + option, + iter_range=None, + use_parallel_executor=False): profile_path = os.path.join(tempfile.gettempdir(), "profile") open(profile_path, "w").write("") startup_program = fluid.Program() @@ -75,7 +80,7 @@ class TestProfiler(unittest.TestCase): main_program=main_program) pass_acc_calculator = fluid.average.WeightedAverage() - with profiler.profiler(state, 'total', profile_path) as prof: + with profiler.profiler(state, 'total', profile_path, option) as prof: for iter in range(10): if iter == 2: profiler.reset_profiler() @@ -94,36 +99,37 @@ class TestProfiler(unittest.TestCase): pass_acc_calculator.add(value=acc, weight=b_size) pass_acc = pass_acc_calculator.eval() data = open(profile_path, 'rb').read() - self.assertGreater(len(data), 0) - profile_pb = profiler_pb2.Profile() - profile_pb.ParseFromString(data) - self.assertGreater(len(profile_pb.events), 0) - for event in profile_pb.events: - if event.type == profiler_pb2.Event.GPUKernel: - if not event.detail_info and not event.name.startswith("MEM"): - raise Exception( - "Kernel %s missing event. Has this kernel been recorded by RecordEvent?" - % event.name) - elif event.type == profiler_pb2.Event.CPU and ( - event.name.startswith("Driver API") or - event.name.startswith("Runtime API")): - print("Warning: unregister", event.name) + if (len(data) > 0): + profile_pb = profiler_pb2.Profile() + profile_pb.ParseFromString(data) + self.assertGreater(len(profile_pb.events), 0) + for event in profile_pb.events: + if event.type == profiler_pb2.Event.GPUKernel: + if not event.detail_info and not event.name.startswith( + "MEM"): + raise Exception( + "Kernel %s missing event. Has this kernel been recorded by RecordEvent?" + % event.name) + elif event.type == profiler_pb2.Event.CPU and ( + event.name.startswith("Driver API") or + event.name.startswith("Runtime API")): + print("Warning: unregister", event.name) def test_cpu_profiler(self): - self.net_profiler('CPU') - self.net_profiler('CPU', use_parallel_executor=True) + self.net_profiler('CPU', "Default") + self.net_profiler('CPU', "Default", use_parallel_executor=True) @unittest.skipIf(not core.is_compiled_with_cuda(), "profiler is enabled only with GPU") def test_cuda_profiler(self): - self.net_profiler('GPU') - self.net_profiler('GPU', use_parallel_executor=True) + self.net_profiler('GPU', "OpDetail") + self.net_profiler('GPU', "OpDetail", use_parallel_executor=True) @unittest.skipIf(not core.is_compiled_with_cuda(), "profiler is enabled only with GPU") def test_all_profiler(self): - self.net_profiler('All') - self.net_profiler('All', use_parallel_executor=True) + self.net_profiler('All', "AllOpDetail") + self.net_profiler('All', "AllOpDetail", use_parallel_executor=True) if __name__ == '__main__':