# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import json import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( '--profile_path', type=str, default='', help='Input profile file name. If there are multiple file, the format ' 'should be trainer1=file1,trainer2=file2,ps=file3', ) parser.add_argument( '--timeline_path', type=str, default='', help='Output timeline file name.' ) args = parser.parse_args() class _ChromeTraceFormatter(object): def __init__(self): self._events = [] self._metadata = [] def _create_event(self, ph, category, name, pid, tid, timestamp): """Creates a new Chrome Trace event. For details of the file format, see: https://github.com/catapult-project/catapult/blob/master/tracing/README.md Args: ph: The type of event - usually a single character. category: The event category as a string. name: The event name as a string. pid: Identifier of the process generating this event as an integer. tid: Identifier of the thread generating this event as an integer. timestamp: The timestamp of this event as a long integer. Returns: A JSON compatible event object. """ event = {} event['ph'] = ph event['cat'] = category event['name'] = name.replace("ParallelExecutor::Run/", "") event['pid'] = pid event['tid'] = tid event['ts'] = timestamp return event def emit_pid(self, name, pid): """Adds a process metadata event to the trace. Args: name: The process name as a string. pid: Identifier of the process as an integer. """ event = {} event['name'] = 'process_name' event['ph'] = 'M' event['pid'] = pid event['args'] = {'name': name} self._metadata.append(event) def emit_region(self, timestamp, duration, pid, tid, category, name, args): """Adds a region event to the trace. Args: timestamp: The start timestamp of this region as a long integer. duration: The duration of this region as a long integer. pid: Identifier of the process generating this event as an integer. tid: Identifier of the thread generating this event as an integer. category: The event category as a string. name: The event name as a string. args: A JSON-compatible dictionary of event arguments. """ event = self._create_event('X', category, name, pid, tid, timestamp) event['dur'] = duration event['args'] = args self._events.append(event) def emit_counter(self, category, name, pid, timestamp, counter, value): """Emits a record for a single counter. Args: category: The event category as string name: The event name as string pid: Identifier of the process generating this event as integer timestamp: The timestamps of this event as long integer counter: Name of the counter as string value: Value of the counter as integer tid: Thread id of the allocation as integer """ event = self._create_event('C', category, name, pid, 0, timestamp) event['args'] = {counter: value} self._events.append(event) def format_to_string(self, pretty=False): """Formats the chrome trace to a string. Args: pretty: (Optional.) If True, produce human-readable JSON output. Returns: A JSON-formatted string in Chrome Trace format. """ trace = {} trace['traceEvents'] = self._metadata + self._events if pretty: return json.dumps(trace, indent=4, separators=(',', ': ')) else: return json.dumps(trace, separators=(',', ':')) class Timeline(object): def __init__(self, profile_dict): self._profile_dict = profile_dict self._pid = 0 self._devices = dict() self._mem_devices = dict() self._chrome_trace = _ChromeTraceFormatter() def _allocate_pid(self): cur_pid = self._pid self._pid += 1 return cur_pid def _allocate_pids(self): for k, profile_pb in self._profile_dict.items(): for event in profile_pb.events: if event.type == profiler_pb2.Event.CPU: if (k, event.device_id, "CPU") not in self._devices: pid = self._allocate_pid() self._devices[(k, event.device_id, "CPU")] = pid # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy) if event.device_id == -1: self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) else: self._chrome_trace.emit_pid( "%s:cpu:block:%d" % (k, event.device_id), pid ) elif event.type == profiler_pb2.Event.GPUKernel: if (k, event.device_id, "GPUKernel") not in self._devices: pid = self._allocate_pid() self._devices[(k, event.device_id, "GPUKernel")] = pid self._chrome_trace.emit_pid( "%s:gpu:%d" % (k, event.device_id), pid ) if not hasattr(profile_pb, "mem_events"): continue for mevent in profile_pb.mem_events: if mevent.place == profiler_pb2.MemEvent.CUDAPlace: if (k, mevent.device_id, "GPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "GPU")] = pid self._chrome_trace.emit_pid( "memory usage on %s:gpu:%d" % (k, mevent.device_id), pid, ) elif mevent.place == profiler_pb2.MemEvent.CPUPlace: if (k, mevent.device_id, "CPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "CPU")] = pid self._chrome_trace.emit_pid( "memory usage on %s:cpu:%d" % (k, mevent.device_id), pid, ) elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace: if ( k, mevent.device_id, "CUDAPinnedPlace", ) not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[ (k, mevent.device_id, "CUDAPinnedPlace") ] = pid self._chrome_trace.emit_pid( "memory usage on %s:cudapinnedplace:%d" % (k, mevent.device_id), pid, ) elif mevent.place == profiler_pb2.MemEvent.NPUPlace: if (k, mevent.device_id, "NPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "NPU")] = pid self._chrome_trace.emit_pid( "memory usage on %s:npu:%d" % (k, mevent.device_id), pid, ) if (k, 0, "CPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, 0, "CPU")] = pid self._chrome_trace.emit_pid( "memory usage on %s:cpu:%d" % (k, 0), pid ) if (k, 0, "GPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, 0, "GPU")] = pid self._chrome_trace.emit_pid( "memory usage on %s:gpu:%d" % (k, 0), pid ) if (k, 0, "CUDAPinnedPlace") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid self._chrome_trace.emit_pid( "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid ) if (k, 0, "NPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, 0, "NPU")] = pid self._chrome_trace.emit_pid( "memory usage on %s:npu:%d" % (k, 0), pid ) def _allocate_events(self): for k, profile_pb in self._profile_dict.items(): for event in profile_pb.events: if event.type == profiler_pb2.Event.CPU: type = "CPU" elif event.type == profiler_pb2.Event.GPUKernel: type = "GPUKernel" pid = self._devices[(k, event.device_id, type)] args = {'name': event.name} if event.memcopy.bytes > 0: args['mem_bytes'] = event.memcopy.bytes if hasattr(event, "detail_info") and event.detail_info: args['detail_info'] = event.detail_info # TODO(panyx0718): Chrome tracing only handles ms. However, some # ops takes micro-seconds. Hence, we keep the ns here. self._chrome_trace.emit_region( event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, event.sub_device_id, 'Op', event.name, args, ) def _allocate_memory_event(self): if not hasattr(profiler_pb2, "MemEvent"): return place_to_str = { profiler_pb2.MemEvent.CPUPlace: "CPU", profiler_pb2.MemEvent.CUDAPlace: "GPU", profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace", profiler_pb2.MemEvent.NPUPlace: "NPU", } for k, profile_pb in self._profile_dict.items(): mem_list = [] end_profiler = 0 for mevent in profile_pb.mem_events: crt_info = dict() crt_info['time'] = mevent.start_ns crt_info['size'] = mevent.bytes if mevent.place in place_to_str: place = place_to_str[mevent.place] else: place = "UnDefine" crt_info['place'] = place pid = self._mem_devices[(k, mevent.device_id, place)] crt_info['pid'] = pid crt_info['thread_id'] = mevent.thread_id crt_info['device_id'] = mevent.device_id mem_list.append(crt_info) crt_info = dict() crt_info['place'] = place crt_info['pid'] = pid crt_info['thread_id'] = mevent.thread_id crt_info['device_id'] = mevent.device_id crt_info['time'] = mevent.end_ns crt_info['size'] = -mevent.bytes mem_list.append(crt_info) end_profiler = max(end_profiler, crt_info['time']) mem_list.sort(key=lambda tmp: (tmp.get('time', 0))) i = 0 total_size = 0 while i < len(mem_list): total_size += mem_list[i]['size'] while ( i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[i + 1]['time'] ): total_size += mem_list[i + 1]['size'] i += 1 self._chrome_trace.emit_counter( "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'], 0, total_size, ) i += 1 def generate_chrome_trace(self): self._allocate_pids() self._allocate_events() self._allocate_memory_event() return self._chrome_trace.format_to_string() profile_path = '/tmp/profile' if args.profile_path: profile_path = args.profile_path timeline_path = '/tmp/timeline' if args.timeline_path: timeline_path = args.timeline_path profile_paths = profile_path.split(',') profile_dict = dict() if len(profile_paths) == 1: with open(profile_path, 'rb') as f: profile_s = f.read() profile_pb = profiler_pb2.Profile() profile_pb.ParseFromString(profile_s) profile_dict['trainer'] = profile_pb else: for profile_path in profile_paths: k, v = profile_path.split('=') with open(v, 'rb') as f: profile_s = f.read() profile_pb = profiler_pb2.Profile() profile_pb.ParseFromString(profile_s) profile_dict[k] = profile_pb tl = Timeline(profile_dict) with open(timeline_path, 'w') as f: f.write(tl.generate_chrome_trace())