diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 5d0882c587dfa5cbea47ce648cb3dfe0820b83bf..eae94926b92254885b7640327c8500ba0a40173f 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/profiler.h"
 #include
 #include
 #include
@@ -22,7 +21,6 @@ limitations under the License. */
 #include
 #include
 #include
-
 #ifdef PADDLE_WITH_CUDA
 #include
 #endif  // PADDLE_WITH_CUDA
@@ -30,7 +28,11 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/platform/device_tracer.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/profiler_helper.h"
 #include "paddle/fluid/string/printf.h"

 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
@@ -38,40 +40,7 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 namespace paddle {
 namespace platform {

-static int64_t profiler_lister_id = 0;
-static bool should_send_profile_state = false;
-std::mutex profiler_mu;
-
-static TracerOption g_tracer_option = TracerOption::kDefault;
-// The profiler state, the initial value is ProfilerState::kDisabled
-static ProfilerState g_state = ProfilerState::kDisabled;
-// The thread local event list only can be accessed by the specific thread
-// The thread index of each thread
-static thread_local int32_t g_thread_id;
-// The g_next_thread_id is a global counter for threads, by the g_thread_id and
-// g_next_thread_id, we can know how many threads have created EventList.
-static uint32_t g_next_thread_id = 0; -// The global mutex -static std::mutex g_all_event_lists_mutex; -// The total event lists of all threads -static std::list>> g_all_event_lists; -// The thread local event list only can be accessed by the specific thread -static thread_local std::shared_ptr> g_event_list; - -static std::list>> g_all_mem_event_lists; -static thread_local std::shared_ptr> g_mem_event_list; -static std::mutex g_all_mem_event_lists_mutex; -static thread_local int32_t g_mem_thread_id; -static uint32_t g_mem_next_thread_id = 0; - -inline uint64_t GetTimeInNsec() { - using clock = std::conditional::type; - return std::chrono::duration_cast( - clock::now().time_since_epoch()) - .count(); -} +MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id) : type_(type), name_(name), thread_id_(thread_id) { @@ -93,51 +62,6 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -inline EventList &GetMemEventList() { - if (!g_mem_event_list) { - g_mem_event_list = std::make_shared>(); - std::lock_guard guard(g_all_mem_event_lists_mutex); - g_mem_thread_id = g_mem_next_thread_id++; - g_all_mem_event_lists.emplace_front(g_mem_event_list); - } - return *g_mem_event_list; -} - -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, - place, g_mem_thread_id, annotation); -} - -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, - g_mem_thread_id, annotation); -} - -inline EventList &GetEventList() { - if (!g_event_list) { - std::lock_guard guard(g_all_event_lists_mutex); - g_event_list = std::make_shared>(); - g_thread_id = g_next_thread_id++; - g_all_event_lists.emplace_front(g_event_list); - RecoreCurThreadId(g_thread_id); - } - return *g_event_list; -} - -void Mark(const std::string &name) { - GetEventList().Record(EventType::kMark, name, g_thread_id); -} - -Event *PushEvent(const std::string &name) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id); -} - -void PopEvent(const std::string &name) { - GetEventList().Record(EventType::kPopRange, name, g_thread_id); -} - RecordEvent::RecordEvent(const std::string &name, const RecordRole role) : is_enabled_(false), start_ns_(PosixInNsec()), role_(role) { if (g_state == ProfilerState::kDisabled || name.empty()) return; @@ -161,14 +85,15 @@ RecordEvent::~RecordEvent() { PopEvent(name_); } -MemEvenRecorder MemEvenRecorder::recorder; - void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, size_t size) { if (g_state == ProfilerState::kDisabled) return; std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE(events.count(ptr) == 0, ""); + PADDLE_ENFORCE_EQ( + events.count(ptr), 0, + platform::errors::InvalidArgument( + "The Place can't exist in the stage of PushMemRecord")); events.emplace(ptr, std::unique_ptr( new MemEvenRecorder::RecordMemEvent(place, size))); } @@ -238,20 +163,34 @@ RecordBlock::~RecordBlock() { ClearCurBlock(); } -void SynchronizeAllDevice() { -#ifdef PADDLE_WITH_CUDA - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; i++) { - SetDeviceId(i); - PADDLE_ENFORCE(cudaDeviceSynchronize()); - } -#endif +void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const 
std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, + place, g_mem_thread_id, annotation); +} + +void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, + g_mem_thread_id, annotation); +} + +void Mark(const std::string &name) { + GetEventList().Record(EventType::kMark, name, g_thread_id); +} + +Event *PushEvent(const std::string &name) { + return GetEventList().Record(EventType::kPushRange, name, g_thread_id); } +void PopEvent(const std::string &name) { + GetEventList().Record(EventType::kPopRange, name, g_thread_id); +} void EnableProfiler(ProfilerState state) { - PADDLE_ENFORCE(state != ProfilerState::kDisabled, - "Can't enable profiling, since the input state is ", - "ProfilerState::kDisabled"); + PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, + platform::errors::InvalidArgument( + "Can't enable profiling, since the input state is" + "ProfilerState::kDisabled")); SynchronizeAllDevice(); std::lock_guard l(profiler_mu); if (state == g_state) { @@ -287,574 +226,6 @@ void ResetProfiler() { } } -std::vector> GetAllEvents() { - std::lock_guard guard(g_all_event_lists_mutex); - std::vector> result; - for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); - ++it) { - result.emplace_back((*it)->Reduce()); - } - return result; -} - -std::vector> GetMemEvents() { - std::lock_guard guard(g_all_mem_event_lists_mutex); - std::vector> result; - for (auto &it : g_all_mem_event_lists) { - result.emplace_back((*it).Reduce()); - } - return result; -} - -// The information of each event given in the profiling report -struct EventItem { - std::string name; - int calls; - double total_time; - double max_time; - double ave_time; - double min_time; - double cpu_time; - double gpu_time; - float ratio; -}; - -struct OverHead { - bool print = false; - double total_time = 0.; - float compute_ratio = 0.0f; - float framework_ratio = 0.0f; - EventItem memcpy_item; - std::vector sub_memcpy_items; -}; - -// Print results -void PrintProfiler(const std::vector> &events_table, - const std::multimap &child_map, - const OverHead &overhead, const std::string &sorted_domain, - const size_t name_width, const size_t data_width, - bool merge_thread, int print_depth, int remove_len) { - if (print_depth == 0) { - // Output header information - std::cout << "\n------------------------->" - << " Profiling Report " - << "<-------------------------\n\n"; - std::string place; - if (g_state == ProfilerState::kCPU) { - place = "CPU"; - } else if (g_state == ProfilerState::kCUDA) { - place = "CUDA"; - } else if (g_state == ProfilerState::kAll) { - place = "All"; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but " - "received Invalid profiler state")); - } - - if (merge_thread) { - std::cout << "Note! This Report merge all thread info into one." 
- << std::endl; - } - std::cout << "Place: " << place << std::endl; - std::cout << "Time unit: ms" << std::endl; - std::cout << "Sorted by " << sorted_domain - << " in descending order in the same thread\n\n"; - - if (overhead.print) { - double compute_time = overhead.total_time * overhead.compute_ratio; - double framework_time = overhead.total_time * overhead.framework_ratio; - std::cout.setf(std::ios::left); - std::cout << "Total time: " << overhead.total_time << std::endl; - std::cout << std::setw(25) << " Computation time" - << "Total: " << std::setw(data_width) << compute_time - << "Ratio: " << overhead.compute_ratio * 100 << "%" - << std::endl; - std::cout << std::setw(25) << " Framework overhead" - << "Total: " << std::setw(data_width) << framework_time - << "Ratio: " << overhead.framework_ratio * 100 << "%" - << std::endl; - - std::cout << "\n-------------------------" - << " GpuMemCpy Summary " - << "-------------------------\n\n"; - std::cout << std::setw(25) << "GpuMemcpy" - << "Calls: " << std::setw(data_width) - << overhead.memcpy_item.calls - << "Total: " << std::setw(data_width) - << overhead.memcpy_item.total_time - << "Ratio: " << overhead.memcpy_item.ratio * 100 << "%" - << std::endl; - for (size_t i = 0; i < overhead.sub_memcpy_items.size(); ++i) { - EventItem item = overhead.sub_memcpy_items[i]; - if (item.calls != 0) { - std::cout << std::setw(25) << " " + item.name - << "Calls: " << std::setw(data_width) << item.calls - << "Total: " << std::setw(data_width) << item.total_time - << "Ratio: " << item.ratio * 100 << "%" << std::endl; - } - } - } - std::cout << "\n-------------------------" - << " Event Summary " - << "-------------------------\n\n"; - // Output events table - std::cout.setf(std::ios::left); - std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total"; - if (g_state == ProfilerState::kAll) { - std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" - << std::setw(data_width * 2) << "GPU Time (Ratio)"; - } - std::cout << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." - << std::setw(data_width) << "Ratio." 
<< std::endl; - } - - if (events_table.size() <= 0) return; - - for (size_t i = 0; i < events_table.size(); ++i) { - for (size_t j = 0; j < events_table[i].size(); ++j) { - auto event_item = events_table[i][j]; - std::vector> child_table; - std::vector table; - for (auto it = child_map.begin(); it != child_map.end(); it++) { - if (it->first == event_item.name) { - table.push_back(it->second); - } - } - child_table.push_back(table); - - auto name_len = event_item.name.length(); - std::string print_name = event_item.name.substr(remove_len, name_len); - std::string delimiter; - for (int i = 0; i < print_depth; i++) { - delimiter = " " + delimiter; - } - print_name = delimiter + print_name; - - std::cout << std::setw(name_width) << print_name << std::setw(data_width) - << event_item.calls << std::setw(data_width) - << event_item.total_time; - if (g_state == ProfilerState::kAll) { - std::cout << std::setw(data_width * 2) - << string::Sprintf( - "%f (%f)", event_item.cpu_time, - (event_item.cpu_time / event_item.total_time)) - << std::setw(data_width * 2) - << string::Sprintf( - "%f (%f)", event_item.gpu_time, - (event_item.gpu_time / event_item.total_time)); - } - std::cout << std::setw(data_width) << event_item.min_time - << std::setw(data_width) << event_item.max_time - << std::setw(data_width) << event_item.ave_time - << std::setw(data_width) << event_item.ratio << std::endl; - PrintProfiler(child_table, child_map, overhead, sorted_domain, name_width, - data_width, merge_thread, print_depth + 1, 0); - } - } -} - -std::function SetSortedFunc( - EventSortingKey sorted_by, std::string *domain) { - std::string sorted_domain; - std::function sorted_func; - switch (sorted_by) { - case EventSortingKey::kCalls: - sorted_domain = "number of calls"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.calls > b.calls; - }; - break; - case EventSortingKey::kTotal: - sorted_domain = "total time"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.total_time > b.total_time; - }; - break; - case EventSortingKey::kMin: - sorted_domain = "minimum time"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.min_time > b.min_time; - }; - break; - case EventSortingKey::kMax: - sorted_domain = "maximum time"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.max_time > b.max_time; - }; - break; - case EventSortingKey::kAve: - sorted_domain = "average time"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.ave_time > b.ave_time; - }; - break; - case EventSortingKey::kGPUTime: - sorted_domain = "average time"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.gpu_time > b.gpu_time; - }; - break; - case EventSortingKey::kCPUTime: - sorted_domain = "average time"; - sorted_func = [](const EventItem &a, const EventItem &b) { - return a.cpu_time > b.cpu_time; - }; - break; - default: - sorted_domain = "event first end time"; - } - *domain = sorted_domain; - return sorted_func; -} - -void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, - std::list *pushed_events, - std::vector *event_items, - std::unordered_map *event_idx) { - if (analyze_event.type() == EventType::kPushRange) { - pushed_events->push_back(analyze_event); - } else if (analyze_event.type() == EventType::kPopRange) { - std::list::reverse_iterator rit = pushed_events->rbegin(); - while (rit != pushed_events->rend() && - rit->name() != analyze_event.name()) { - ++rit; - } - // to find the father name event 
name - - if (rit != pushed_events->rend()) { - double event_time = 0; - double gpu_time = 0.0f; -#ifdef PADDLE_WITH_CUDA - gpu_time = rit->CudaElapsedMs(analyze_event); -#endif - double cpu_time = rit->CpuElapsedMs(analyze_event); - if (g_state == ProfilerState::kCUDA) { - event_time = gpu_time; - } else if (g_state == ProfilerState::kCPU) { - event_time = cpu_time; - } else { - event_time = gpu_time + cpu_time; - } - - std::string event_name; - if (merge_thread) { - event_name = rit->name(); - *max_name_width = std::max(*max_name_width, event_name.size()); - } else { - event_name = - "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); - *max_name_width = std::max(*max_name_width, event_name.size()); - } - - if (event_idx->find(event_name) == event_idx->end()) { - event_idx->insert({event_name, event_items->size()}); - EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time, - cpu_time, gpu_time, 0.}; - event_items->push_back(event_item); - } else { - int index = event_idx->at(event_name); - event_items->at(index).calls += 1; - // total time - event_items->at(index).total_time += event_time; - // min time - event_items->at(index).min_time = - std::min(event_time, event_items->at(index).min_time); - // max time - event_items->at(index).max_time = - std::max(event_time, event_items->at(index).max_time); - event_items->at(index).gpu_time += gpu_time; - event_items->at(index).cpu_time += cpu_time; - } - - // remove the push marker from the list - pushed_events->erase((++rit).base()); - } else { - LOG(WARNING) << "Cannot find the push marker of event \'" - << analyze_event.name() - << "\', which will be ignored in profiling report."; - } - } -} - -void ComputeOverhead(const std::multimap &sub_child_map, - OverHead *overhead) { - EventItem memcpy_async = {"GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; - EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; - for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { - if (it->second.name.find("compute") != std::string::npos) { - overhead->compute_ratio += it->second.ratio; - } - if (it->second.name.find("GpuMemcpyAsync") != std::string::npos) { - memcpy_async.calls += it->second.calls; - memcpy_async.total_time += it->second.total_time; - memcpy_async.ratio += it->second.ratio; - } else if (it->second.name.find("GpuMemcpySync") != std::string::npos) { - memcpy_sync.calls += it->second.calls; - memcpy_sync.total_time += it->second.total_time; - memcpy_sync.ratio += it->second.ratio; - } - } - overhead->framework_ratio = 1.0f - overhead->compute_ratio; - overhead->memcpy_item.calls = memcpy_async.calls + memcpy_sync.calls; - overhead->memcpy_item.total_time = - memcpy_async.total_time + memcpy_sync.total_time; - overhead->memcpy_item.ratio = memcpy_async.ratio + memcpy_sync.ratio; - overhead->sub_memcpy_items = {memcpy_async, memcpy_sync}; -} - -// When TracerOption is KDefault, OpDetail will be recorded but only default -// profile result will be printed. -// GpuMemcpy should be printed in kDefault setting, however it offten occurs -// during 'compute' or 'prepare data' process, so the elements of sub_child_map -// need to be changed before being inserted into child_map. for instance: -// it->first: OpType/compute => OpType -// it->second.name: OpType/compute/GpuMemcpyAsync => OpType/GpuMemcpyAsync. 
-void GetChildMap(const std::multimap &sub_child_map, - std::multimap *child_map) { - if (platform::GetTracerOption() != TracerOption::kDefault) { - for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { - child_map->insert( - std::pair(it->first, it->second)); - } - } else { - for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { - if (it->second.name.find("GpuMemcpy") != std::string::npos) { - std::string parent_name = it->first; - auto left_pos = it->first.find("/"); - if (left_pos != std::string::npos) { - parent_name = it->first.substr(0, left_pos); - } - auto item = it->second; - auto right_pos = item.name.rfind("/"); - if (right_pos != std::string::npos) { - std::string child_name = item.name.substr( - right_pos + 1, item.name.length() - right_pos - 1); - item.name = parent_name + "/" + child_name; - } - child_map->insert(std::pair(parent_name, item)); - } - } - } -} - -// Parse the event list and output the profiling report -void ParseEvents(const std::vector> &events, - bool merge_thread, - EventSortingKey sorted_by = EventSortingKey::kDefault) { - if (g_state == ProfilerState::kDisabled) return; - if (merge_thread && events.size() < 2) return; - - std::string sorted_domain; - std::function sorted_func; - sorted_func = SetSortedFunc(sorted_by, &sorted_domain); - - const std::vector> *analyze_events; - std::vector> merged_events_list; - if (merge_thread) { - std::vector merged_events; - for (size_t i = 0; i < events.size(); ++i) { - for (size_t j = 0; j < events[i].size(); ++j) { - merged_events.push_back(events[i][j]); - } - } - merged_events_list.push_back(merged_events); - analyze_events = &merged_events_list; - } else { - analyze_events = &events; - } - - std::vector> events_table; - std::multimap child_map; - size_t max_name_width = 0; - OverHead overhead; - - for (size_t i = 0; i < (*analyze_events).size(); i++) { - double total = 0.; // the total time in one thread - std::list pushed_events; - std::vector event_items; - std::vector main_event_items; - std::unordered_map event_idx; - std::multimap sub_child_map; - - for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { - Event analyze_event = (*analyze_events)[i][j]; - SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events, - &event_items, &event_idx); - } - - auto table_size = event_items.size(); - std::vector child_index(table_size, 0); - for (size_t j = 0; j < table_size; ++j) { - std::string fname = event_items[j].name; - std::string grad_name = event_items[j].name + "_grad"; - for (size_t k = 0; k < table_size; ++k) { - std::string cname = event_items[k].name; - bool condition = cname.length() > fname.length() && - cname.rfind(fname, 0) == 0 && - !cname.rfind(grad_name, 0) == 0 && - (cname[fname.length()] == '/' && - cname.rfind('/') == fname.length()); - if (condition) { - sub_child_map.insert( - std::pair(fname, event_items[k])); - child_index[k] = 1; - } - } - } - - for (size_t j = 0; j < table_size; ++j) { - if (child_index[j] == 0) { - main_event_items.push_back(event_items[j]); - total += event_items[j].total_time; - } - } - - // average time - for (auto &item : main_event_items) { - item.ave_time = item.total_time / item.calls; - item.ratio = item.total_time / total; - } - for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { - it->second.ratio = it->second.total_time / total; - it->second.ave_time = it->second.total_time / it->second.calls; - } - - // When multi-threaded, overhead are printed only if merge_thread is true - if 
((*analyze_events).size() == 1) { - overhead.total_time = total; - overhead.print = true; - ComputeOverhead(sub_child_map, &overhead); - } - - // sort - if (sorted_by != EventSortingKey::kDefault) { - std::sort(main_event_items.begin(), main_event_items.end(), sorted_func); - } - - events_table.push_back(main_event_items); - // log warning if there are events with `push` but without `pop` - std::list::reverse_iterator rit = pushed_events.rbegin(); - while (rit != pushed_events.rend()) { - LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() - << "\', which will be ignored in profiling report."; - ++rit; - } - - GetChildMap(sub_child_map, &child_map); - } - - // Print report - PrintProfiler(events_table, child_map, overhead, sorted_domain, - max_name_width + 8, 12, merge_thread, 0, 0); -} - -struct MemoryProfierReport { - size_t alloc_times{0}; - size_t alloc_size{0}; - size_t free_times{0}; - size_t free_size{0}; -}; - -// Print results -void PrintMemProfiler( - const std::map> - &annotation_report, - const size_t name_width, const size_t data_width) { - // Output header information - std::cout << "\n------------------------->" - << " Memory Profiling Report " - << "<-------------------------\n\n"; - - // Output events table - std::cout.setf(std::ios::left); - std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Alloc Calls" << std::setw(data_width) << "Size(MB)" - << std::setw(data_width) << "Free Calls" << std::setw(data_width) - << "Size(MB)" << std::endl; - - for (auto &tmp : annotation_report) { - for (auto &e : tmp.second) { - auto event_name = string::Sprintf("%s:%s", tmp.first, e.first); - std::cout << std::setw(name_width) << event_name; - std::cout << std::setw(data_width) << e.second.alloc_times; - std::cout << std::setw(data_width) - << e.second.alloc_size / (1024.0 * 1024.0); - std::cout << std::setw(data_width) << e.second.free_times; - std::cout << std::setw(data_width) - << e.second.free_size / (1024.0 * 1024.0) << std::endl; - } - } - std::cout << std::endl; -} - -// parse memory events -void ParseMemEvents(const std::vector> &events) { - if (g_state == ProfilerState::kDisabled) return; - // place, annotation, alloc times, alloc size - std::map> - annotation_report; - - for (auto &tmp : events) { - for (auto &e : tmp) { - if (e.type() == EventType::kPushRange) { - annotation_report[e.place()][e.annotation()].alloc_times += 1; - annotation_report[e.place()][e.annotation()].alloc_size += e.bytes(); - } else if (e.type() == EventType::kPopRange) { - annotation_report[e.place()][e.annotation()].free_times += 1; - annotation_report[e.place()][e.annotation()].free_size += e.bytes(); - } - } - } - PrintMemProfiler(annotation_report, 55, 18); -} - -void DealWithShowName() { - std::unordered_map> profiler_name_info; - for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); - ++it) { - for (auto &block : (*it)->event_blocks) { - for (auto &r : block) { - auto event_name = r.name(); - size_t start = event_name.find('%', 0); - size_t end = event_name.find('%', start + 1); - std::string prefix_str = event_name.substr(0, start); - while (start != std::string::npos && end != std::string::npos) { - auto search_str = event_name.substr(start, end - start + 1); - std::string replace_str = ""; - int replace_index = 0; - - auto it = profiler_name_info.find(prefix_str); - if (it == profiler_name_info.end()) { - std::vector op_name_vector{search_str}; - profiler_name_info[prefix_str] = op_name_vector; - } else { - auto op_name_vector = 
it->second; - auto iter = - find(op_name_vector.begin(), op_name_vector.end(), search_str); - if (iter == op_name_vector.end()) { - replace_index = it->second.size(); - it->second.push_back(search_str); - } else { - replace_index = it->second.size() - 1; - } - } - replace_str = std::to_string(replace_index); - event_name.replace(start, end - start + 1, replace_str); - start = start + 1; - start = event_name.find('%', start); - end = event_name.find('%', start + 1); - prefix_str = event_name.substr(0, start); - } - r.set_name(event_name); - } - } - } -} - void DisableProfiler(EventSortingKey sorted_key, const std::string &profile_path) { SynchronizeAllDevice(); @@ -887,17 +258,19 @@ void DisableProfiler(EventSortingKey sorted_key, should_send_profile_state = true; } +std::vector> GetAllEvents() { + std::lock_guard guard(g_all_event_lists_mutex); + std::vector> result; + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); + } + return result; +} + bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; } -bool ShouldSendProfileState() { return should_send_profile_state; } -void SetProfileListener() { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - profiler_lister_id = dist6(rng); -} -int64_t ListenerId() { return profiler_lister_id; } +bool ShouldSendProfileState() { return should_send_profile_state; } std::string OpName(const framework::VariableNameMap &name_map, const std::string &type_name) { @@ -923,5 +296,16 @@ void SetTracerOption(TracerOption option) { } platform::TracerOption GetTracerOption() { return g_tracer_option; } + +void SetProfileListener() { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist6( + 1, std::numeric_limits::max()); + profiler_lister_id = dist6(rng); +} + +int64_t ListenerId() { return profiler_lister_id; } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 3b821606472b8b0a7eaf87ee9e49afaba433b002..0a509a7afc4666d40ead4c99831109fc7f135014 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -33,6 +33,9 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +const int kEnableProfiler = 1; +const int kDisableProfiler = 2; + enum class ProfilerState { kDisabled, // disabled state kCPU, // CPU profiling state @@ -53,12 +56,46 @@ enum class TracerOption { kAllOpDetail, // print the detail profiling result of different op name }; -void Mark(const std::string& name); +// Candidate keys to sort the profiling report +enum class EventSortingKey { + kDefault, + kCalls, + kTotal, + kMin, + kMax, + kAve, + kCPUTime, + kGPUTime +}; -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place); -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place); +struct MemoryProfierReport { + size_t alloc_times{0}; + size_t alloc_size{0}; + size_t free_times{0}; + size_t free_size{0}; +}; + +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double max_time; + double ave_time; + double min_time; + double cpu_time; + double gpu_time; + float ratio; +}; + +struct OverHead { + bool print = false; + double total_time = 0.; + float compute_ratio = 0.0f; + float framework_ratio = 0.0f; + EventItem memcpy_item; + std::vector sub_memcpy_items; +}; struct MemEvenRecorder { public: @@ -89,9 +126,6 @@ struct MemEvenRecorder { DISABLE_COPY_AND_ASSIGN(MemEvenRecorder); }; -Event* PushEvent(const std::string& name); -void PopEvent(const std::string& name); - struct RecordEvent { RecordEvent(const std::string& name, const RecordRole role = RecordRole::kOrdinary); @@ -127,22 +161,6 @@ struct RecordBlock { uint64_t start_ns_; }; -// Return the event list of all threads. Assumed the returned value calls -// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. -std::vector> GetAllEvents(); - -// Candidate keys to sort the profiling report -enum class EventSortingKey { - kDefault, - kCalls, - kTotal, - kMin, - kMax, - kAve, - kCPUTime, - kGPUTime -}; - template struct EventList { constexpr static size_t kMB = 1024 * 1024; @@ -178,25 +196,27 @@ struct EventList { std::forward_list> event_blocks; }; +void Mark(const std::string& name); +void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place& place, const std::string& annotation); +void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place& place, const std::string& annotation); +Event* PushEvent(const std::string& name); +void PopEvent(const std::string& name); +// Return the event list of all threads. Assumed the returned value calls +// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. +std::vector> GetAllEvents(); + // Enable the profiling function. void EnableProfiler(ProfilerState state); - // Clear the g_all_event_lists, which is total event lists of all threads. void ResetProfiler(); - void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path); - -const int kEnableProfiler = 1; -const int kDisableProfiler = 2; // Test if the profiler is currently enabled. bool IsProfileEnabled(); // Whether the trainer should send profiling state to PS. bool ShouldSendProfileState(); -// Mark current process as PS by assigning a lister id. 
-void SetProfileListener(); -int64_t ListenerId(); - std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); @@ -205,5 +225,9 @@ platform::TracerOption GetTracerOption(); void DummyKernelAndEvent(); #endif +// Mark current process as PS by assigning a lister id. +void SetProfileListener(); +int64_t ListenerId(); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..44140d3e5faaf2169409bfd7b163cf6e01d237ee --- /dev/null +++ b/paddle/fluid/platform/profiler_helper.h @@ -0,0 +1,637 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA + +namespace paddle { +namespace platform { + +static int64_t profiler_lister_id = 0; +static bool should_send_profile_state = false; +std::mutex profiler_mu; + +static TracerOption g_tracer_option = TracerOption::kDefault; +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. 
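// NOTE(illustration): g_thread_id above, together with g_next_thread_id and
// the g_all_event_lists / g_event_list globals that follow, implement a
// thread-local-list-plus-global-registry idiom: the first time a thread
// records an event, GetEventList() allocates that thread's list, takes the
// next id from the shared counter, and links the list into the global
// registry so that GetAllEvents() can later walk every thread's events under
// the mutex. A minimal sketch of the same idiom, with hypothetical names that
// are not part of this patch:
//
//   static std::mutex registry_mutex;
//   static std::list<std::shared_ptr<std::vector<int>>> registry;
//   static thread_local std::shared_ptr<std::vector<int>> local_list;
//
//   std::vector<int> &LocalList() {
//     if (!local_list) {
//       std::lock_guard<std::mutex> guard(registry_mutex);
//       local_list = std::make_shared<std::vector<int>>();
//       registry.emplace_front(local_list);  // becomes visible to readers
//     }
//     return *local_list;  // later calls touch only this thread's list
//   }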
+static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list>> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr> g_event_list; + +static std::list>> g_all_mem_event_lists; +static thread_local std::shared_ptr> g_mem_event_list; +static std::mutex g_all_mem_event_lists_mutex; +static thread_local int32_t g_mem_thread_id; +static uint32_t g_mem_next_thread_id = 0; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +inline EventList &GetEventList() { + if (!g_event_list) { + std::lock_guard guard(g_all_event_lists_mutex); + g_event_list = std::make_shared>(); + g_thread_id = g_next_thread_id++; + g_all_event_lists.emplace_front(g_event_list); + RecoreCurThreadId(g_thread_id); + } + return *g_event_list; +} + +inline EventList &GetMemEventList() { + if (!g_mem_event_list) { + g_mem_event_list = std::make_shared>(); + std::lock_guard guard(g_all_mem_event_lists_mutex); + g_mem_thread_id = g_mem_next_thread_id++; + g_all_mem_event_lists.emplace_front(g_mem_event_list); + } + return *g_mem_event_list; +} + +std::vector> GetMemEvents() { + std::lock_guard guard(g_all_mem_event_lists_mutex); + std::vector> result; + for (auto &it : g_all_mem_event_lists) { + result.emplace_back((*it).Reduce()); + } + return result; +} + +void SynchronizeAllDevice() { +#ifdef PADDLE_WITH_CUDA + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaDeviceSynchronize(), + platform::errors::External( + "Device synchronize failed in cudaDeviceSynchronize()")); + } +#endif +} + +// Print results +void PrintMemProfiler( + const std::map> + &annotation_report, + const size_t name_width, const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Memory Profiling Report " + << "<-------------------------\n\n"; + + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Alloc Calls" << std::setw(data_width) << "Size(MB)" + << std::setw(data_width) << "Free Calls" << std::setw(data_width) + << "Size(MB)" << std::endl; + + for (auto &tmp : annotation_report) { + for (auto &e : tmp.second) { + auto event_name = string::Sprintf("%s:%s", tmp.first, e.first); + std::cout << std::setw(name_width) << event_name; + std::cout << std::setw(data_width) << e.second.alloc_times; + std::cout << std::setw(data_width) + << e.second.alloc_size / (1024.0 * 1024.0); + std::cout << std::setw(data_width) << e.second.free_times; + std::cout << std::setw(data_width) + << e.second.free_size / (1024.0 * 1024.0) << std::endl; + } + } + std::cout << std::endl; +} + +// parse memory events +void ParseMemEvents(const std::vector> &events) { + if (g_state == ProfilerState::kDisabled) return; + // place, annotation, alloc times, alloc size + std::map> + annotation_report; + + for (auto &tmp : events) { + for (auto &e : tmp) { + if (e.type() == EventType::kPushRange) { + annotation_report[e.place()][e.annotation()].alloc_times += 1; + annotation_report[e.place()][e.annotation()].alloc_size += e.bytes(); + } else if (e.type() == EventType::kPopRange) { + annotation_report[e.place()][e.annotation()].free_times += 1; + 
annotation_report[e.place()][e.annotation()].free_size += e.bytes(); + } + } + } + PrintMemProfiler(annotation_report, 55, 18); +} + +void DealWithShowName() { + std::unordered_map> profiler_name_info; + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + for (auto &block : (*it)->event_blocks) { + for (auto &r : block) { + auto event_name = r.name(); + size_t start = event_name.find('%', 0); + size_t end = event_name.find('%', start + 1); + std::string prefix_str = event_name.substr(0, start); + while (start != std::string::npos && end != std::string::npos) { + auto search_str = event_name.substr(start, end - start + 1); + std::string replace_str = ""; + int replace_index = 0; + + auto it = profiler_name_info.find(prefix_str); + if (it == profiler_name_info.end()) { + std::vector op_name_vector{search_str}; + profiler_name_info[prefix_str] = op_name_vector; + } else { + auto op_name_vector = it->second; + auto iter = + find(op_name_vector.begin(), op_name_vector.end(), search_str); + if (iter == op_name_vector.end()) { + replace_index = it->second.size(); + it->second.push_back(search_str); + } else { + replace_index = it->second.size() - 1; + } + } + replace_str = std::to_string(replace_index); + event_name.replace(start, end - start + 1, replace_str); + start = start + 1; + start = event_name.find('%', start); + end = event_name.find('%', start + 1); + prefix_str = event_name.substr(0, start); + } + r.set_name(event_name); + } + } + } +} + +std::function SetSortedFunc( + EventSortingKey sorted_by, std::string *domain) { + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.ave_time > b.ave_time; + }; + break; + case EventSortingKey::kGPUTime: + sorted_domain = "average time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.gpu_time > b.gpu_time; + }; + break; + case EventSortingKey::kCPUTime: + sorted_domain = "average time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.cpu_time > b.cpu_time; + }; + break; + default: + sorted_domain = "event first end time"; + } + *domain = sorted_domain; + return sorted_func; +} + +void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, + std::list *pushed_events, + std::vector *event_items, + std::unordered_map *event_idx) { + if (analyze_event.type() == EventType::kPushRange) { + pushed_events->push_back(analyze_event); + } else if (analyze_event.type() == EventType::kPopRange) { + std::list::reverse_iterator rit = pushed_events->rbegin(); + while (rit != pushed_events->rend() && + rit->name() != analyze_event.name()) { + ++rit; + } + + // to find the father name event name + if (rit != 
pushed_events->rend()) { + double event_time = 0; + double gpu_time = 0.0f; +#ifdef PADDLE_WITH_CUDA + gpu_time = rit->CudaElapsedMs(analyze_event); +#endif + double cpu_time = rit->CpuElapsedMs(analyze_event); + if (g_state == ProfilerState::kCUDA) { + event_time = gpu_time; + } else if (g_state == ProfilerState::kCPU) { + event_time = cpu_time; + } else { + event_time = gpu_time + cpu_time; + } + + std::string event_name; + if (merge_thread) { + event_name = rit->name(); + *max_name_width = std::max(*max_name_width, event_name.size()); + } else { + event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + *max_name_width = std::max(*max_name_width, event_name.size()); + } + + if (event_idx->find(event_name) == event_idx->end()) { + event_idx->insert({event_name, event_items->size()}); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time, + cpu_time, gpu_time, 0.}; + event_items->push_back(event_item); + } else { + int index = event_idx->at(event_name); + event_items->at(index).calls += 1; + // total time + event_items->at(index).total_time += event_time; + // min time + event_items->at(index).min_time = + std::min(event_time, event_items->at(index).min_time); + // max time + event_items->at(index).max_time = + std::max(event_time, event_items->at(index).max_time); + event_items->at(index).gpu_time += gpu_time; + event_items->at(index).cpu_time += cpu_time; + } + + // remove the push marker from the list + pushed_events->erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << analyze_event.name() + << "\', which will be ignored in profiling report."; + } + } +} + +void ComputeOverhead(const std::multimap &sub_child_map, + OverHead *overhead) { + EventItem memcpy_async = {"GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; + EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; + for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { + if (it->second.name.find("compute") != std::string::npos) { + overhead->compute_ratio += it->second.ratio; + } + if (it->second.name.find("GpuMemcpyAsync") != std::string::npos) { + memcpy_async.calls += it->second.calls; + memcpy_async.total_time += it->second.total_time; + memcpy_async.ratio += it->second.ratio; + } else if (it->second.name.find("GpuMemcpySync") != std::string::npos) { + memcpy_sync.calls += it->second.calls; + memcpy_sync.total_time += it->second.total_time; + memcpy_sync.ratio += it->second.ratio; + } + } + overhead->framework_ratio = 1.0f - overhead->compute_ratio; + overhead->memcpy_item.calls = memcpy_async.calls + memcpy_sync.calls; + overhead->memcpy_item.total_time = + memcpy_async.total_time + memcpy_sync.total_time; + overhead->memcpy_item.ratio = memcpy_async.ratio + memcpy_sync.ratio; + overhead->sub_memcpy_items = {memcpy_async, memcpy_sync}; +} + +// When TracerOption is KDefault, OpDetail will be recorded but only default +// profile result will be printed. +// GpuMemcpy should be printed in kDefault setting, however it offten occurs +// during 'compute' or 'prepare data' process, so the elements of sub_child_map +// need to be changed before being inserted into child_map. for instance: +// it->first: OpType/compute => OpType +// it->second.name: OpType/compute/GpuMemcpyAsync => OpType/GpuMemcpyAsync. 
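// For example (op and memcpy names below are only illustrative), an entry with
//   it->first       == "conv2d/compute"
//   it->second.name == "conv2d/compute/GpuMemcpySync"
// is re-keyed by the kDefault branch of GetChildMap() so that the memcpy is
// reported directly under the op: parent_name becomes "conv2d" (the text
// before the first '/') and the item name becomes "conv2d/GpuMemcpySync"
// (the parent plus the text after the last '/').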
+void GetChildMap(const std::multimap &sub_child_map, + std::multimap *child_map) { + if (platform::GetTracerOption() != TracerOption::kDefault) { + for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { + child_map->insert( + std::pair(it->first, it->second)); + } + } else { + for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { + if (it->second.name.find("GpuMemcpy") != std::string::npos) { + std::string parent_name = it->first; + auto left_pos = it->first.find("/"); + if (left_pos != std::string::npos) { + parent_name = it->first.substr(0, left_pos); + } + auto item = it->second; + auto right_pos = item.name.rfind("/"); + if (right_pos != std::string::npos) { + std::string child_name = item.name.substr( + right_pos + 1, item.name.length() - right_pos - 1); + item.name = parent_name + "/" + child_name; + } + child_map->insert(std::pair(parent_name, item)); + } + } + } +} + +// Print results +void PrintProfiler(const std::vector> &events_table, + const std::multimap &child_map, + const OverHead &overhead, const std::string &sorted_domain, + const size_t name_width, const size_t data_width, + bool merge_thread, int print_depth, int remove_len) { + if (print_depth == 0) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::string place; + if (g_state == ProfilerState::kCPU) { + place = "CPU"; + } else if (g_state == ProfilerState::kCUDA) { + place = "CUDA"; + } else if (g_state == ProfilerState::kAll) { + place = "All"; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but " + "received Invalid profiler state")); + } + + if (merge_thread) { + std::cout << "Note! This Report merge all thread info into one." 
+ << std::endl; + } + std::cout << "Place: " << place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + + if (overhead.print) { + double compute_time = overhead.total_time * overhead.compute_ratio; + double framework_time = overhead.total_time * overhead.framework_ratio; + std::cout.setf(std::ios::left); + std::cout << "Total time: " << overhead.total_time << std::endl; + std::cout << std::setw(25) << " Computation time" + << "Total: " << std::setw(data_width) << compute_time + << "Ratio: " << overhead.compute_ratio * 100 << "%" + << std::endl; + std::cout << std::setw(25) << " Framework overhead" + << "Total: " << std::setw(data_width) << framework_time + << "Ratio: " << overhead.framework_ratio * 100 << "%" + << std::endl; + + std::cout << "\n-------------------------" + << " GpuMemCpy Summary " + << "-------------------------\n\n"; + std::cout << std::setw(25) << "GpuMemcpy" + << "Calls: " << std::setw(data_width) + << overhead.memcpy_item.calls + << "Total: " << std::setw(data_width) + << overhead.memcpy_item.total_time + << "Ratio: " << overhead.memcpy_item.ratio * 100 << "%" + << std::endl; + for (size_t i = 0; i < overhead.sub_memcpy_items.size(); ++i) { + EventItem item = overhead.sub_memcpy_items[i]; + if (item.calls != 0) { + std::cout << std::setw(25) << " " + item.name + << "Calls: " << std::setw(data_width) << item.calls + << "Total: " << std::setw(data_width) << item.total_time + << "Ratio: " << item.ratio * 100 << "%" << std::endl; + } + } + } + std::cout << "\n-------------------------" + << " Event Summary " + << "-------------------------\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total"; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" + << std::setw(data_width * 2) << "GPU Time (Ratio)"; + } + std::cout << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." + << std::setw(data_width) << "Ratio." 
<< std::endl; + } + + if (events_table.size() <= 0) return; + + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + auto event_item = events_table[i][j]; + std::vector> child_table; + std::vector table; + for (auto it = child_map.begin(); it != child_map.end(); it++) { + if (it->first == event_item.name) { + table.push_back(it->second); + } + } + child_table.push_back(table); + + auto name_len = event_item.name.length(); + std::string print_name = event_item.name.substr(remove_len, name_len); + std::string delimiter; + for (int i = 0; i < print_depth; i++) { + delimiter = " " + delimiter; + } + print_name = delimiter + print_name; + + std::cout << std::setw(name_width) << print_name << std::setw(data_width) + << event_item.calls << std::setw(data_width) + << event_item.total_time; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) + << string::Sprintf( + "%f (%f)", event_item.cpu_time, + (event_item.cpu_time / event_item.total_time)) + << std::setw(data_width * 2) + << string::Sprintf( + "%f (%f)", event_item.gpu_time, + (event_item.gpu_time / event_item.total_time)); + } + std::cout << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time + << std::setw(data_width) << event_item.ratio << std::endl; + PrintProfiler(child_table, child_map, overhead, sorted_domain, name_width, + data_width, merge_thread, print_depth + 1, 0); + } + } +} + +// Parse the event list and output the profiling report +void ParseEvents(const std::vector> &events, + bool merge_thread, + EventSortingKey sorted_by = EventSortingKey::kDefault) { + if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; + + std::string sorted_domain; + std::function sorted_func; + sorted_func = SetSortedFunc(sorted_by, &sorted_domain); + + const std::vector> *analyze_events; + std::vector> merged_events_list; + if (merge_thread) { + std::vector merged_events; + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + merged_events.push_back(events[i][j]); + } + } + merged_events_list.push_back(merged_events); + analyze_events = &merged_events_list; + } else { + analyze_events = &events; + } + + std::vector> events_table; + std::multimap child_map; + size_t max_name_width = 0; + OverHead overhead; + + for (size_t i = 0; i < (*analyze_events).size(); i++) { + double total = 0.; // the total time in one thread + std::list pushed_events; + std::vector event_items; + std::vector main_event_items; + std::unordered_map event_idx; + std::multimap sub_child_map; + + for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { + Event analyze_event = (*analyze_events)[i][j]; + SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events, + &event_items, &event_idx); + } + + auto table_size = event_items.size(); + std::vector child_index(table_size, 0); + for (size_t j = 0; j < table_size; ++j) { + std::string fname = event_items[j].name; + std::string grad_name = event_items[j].name + "_grad"; + for (size_t k = 0; k < table_size; ++k) { + std::string cname = event_items[k].name; + bool condition = cname.length() > fname.length() && + cname.rfind(fname, 0) == 0 && + !cname.rfind(grad_name, 0) == 0 && + (cname[fname.length()] == '/' && + cname.rfind('/') == fname.length()); + if (condition) { + sub_child_map.insert( + std::pair(fname, event_items[k])); + child_index[k] = 1; + } + } + } + 
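// NOTE(illustration, hypothetical event names): given top-level items
//   "fc", "fc/compute", "fc/compute/GpuMemcpySync", "fc_grad"
// the condition above marks "fc/compute" as a direct child of "fc" (it starts
// with the parent name and its last '/' sits exactly at the end of that name),
// while "fc/compute/GpuMemcpySync" is only a direct child of "fc/compute", and
// "fc_grad" is excluded by the grad_name check. Items flagged in child_index
// are left out of the main table built below and reported under their parent.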
+ for (size_t j = 0; j < table_size; ++j) { + if (child_index[j] == 0) { + main_event_items.push_back(event_items[j]); + total += event_items[j].total_time; + } + } + + // average time + for (auto &item : main_event_items) { + item.ave_time = item.total_time / item.calls; + item.ratio = item.total_time / total; + } + for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { + it->second.ratio = it->second.total_time / total; + it->second.ave_time = it->second.total_time / it->second.calls; + } + + // When multi-threaded, overhead are printed only if merge_thread is true + if ((*analyze_events).size() == 1) { + overhead.total_time = total; + overhead.print = true; + ComputeOverhead(sub_child_map, &overhead); + } + + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(main_event_items.begin(), main_event_items.end(), sorted_func); + } + + events_table.push_back(main_event_items); + // log warning if there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + + GetChildMap(sub_child_map, &child_map); + } + + // Print report + PrintProfiler(events_table, child_map, overhead, sorted_domain, + max_name_width + 8, 12, merge_thread, 0, 0); +} + +} // namespace platform +} // namespace paddle
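For reference, a minimal sketch of how the profiler API declared in profiler.h is
typically driven. The function name, the event name and the output path below are
illustrative assumptions, not taken from this patch:

  #include "paddle/fluid/platform/profiler.h"

  void RunProfiledStep() {
    namespace plat = paddle::platform;
    // Start collecting events on the CPU; kCUDA / kAll additionally include
    // GPU time in the printed report.
    plat::EnableProfiler(plat::ProfilerState::kCPU);

    {
      // RecordEvent is an RAII guard: it pushes a range event in its
      // constructor and pops the matching event in its destructor.
      plat::RecordEvent guard("my_compute_step");
      // ... work to be measured ...
    }
    plat::Mark("step_done");  // a single point-in-time marker

    // Print the report sorted by total time; the second argument is the
    // output path used for the dumped trace.
    plat::DisableProfiler(plat::EventSortingKey::kTotal, "/tmp/profile");
  }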