timeline.py 13.8 KB
Newer Older
X
Xin Pan 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json

import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
22 23 24 25
    '--profile_path',
    type=str,
    default='',
    help='Input profile file name. If there are multiple file, the format '
26 27 28 29 30
    'should be trainer1=file1,trainer2=file2,ps=file3',
)
parser.add_argument(
    '--timeline_path', type=str, default='', help='Output timeline file name.'
)
X
Xin Pan 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
args = parser.parse_args()


class _ChromeTraceFormatter(object):
    def __init__(self):
        self._events = []
        self._metadata = []

    def _create_event(self, ph, category, name, pid, tid, timestamp):
        """Creates a new Chrome Trace event.

        For details of the file format, see:
        https://github.com/catapult-project/catapult/blob/master/tracing/README.md

        Args:
          ph:  The type of event - usually a single character.
          category: The event category as a string.
          name:  The event name as a string.
          pid:  Identifier of the process generating this event as an integer.
          tid:  Identifier of the thread generating this event as an integer.
          timestamp:  The timestamp of this event as a long integer.

        Returns:
          A JSON compatible event object.
        """
        event = {}
        event['ph'] = ph
        event['cat'] = category
59
        event['name'] = name.replace("ParallelExecutor::Run/", "")
X
Xin Pan 已提交
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
        event['pid'] = pid
        event['tid'] = tid
        event['ts'] = timestamp
        return event

    def emit_pid(self, name, pid):
        """Adds a process metadata event to the trace.

        Args:
          name:  The process name as a string.
          pid:  Identifier of the process as an integer.
        """
        event = {}
        event['name'] = 'process_name'
        event['ph'] = 'M'
        event['pid'] = pid
        event['args'] = {'name': name}
        self._metadata.append(event)

    def emit_region(self, timestamp, duration, pid, tid, category, name, args):
        """Adds a region event to the trace.

        Args:
          timestamp:  The start timestamp of this region as a long integer.
          duration:  The duration of this region as a long integer.
          pid:  Identifier of the process generating this event as an integer.
          tid:  Identifier of the thread generating this event as an integer.
          category: The event category as a string.
          name:  The event name as a string.
          args:  A JSON-compatible dictionary of event arguments.
        """
        event = self._create_event('X', category, name, pid, tid, timestamp)
        event['dur'] = duration
        event['args'] = args
        self._events.append(event)

C
chengduo 已提交
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
    def emit_counter(self, category, name, pid, timestamp, counter, value):
        """Emits a record for a single counter.

        Args:
            category: The event category as string
            name: The event name as string
            pid: Identifier of the process generating this event as integer
            timestamp: The timestamps of this event as long integer
            counter: Name of the counter as string
            value: Value of the counter as integer
            tid: Thread id of the allocation as integer
        """
        event = self._create_event('C', category, name, pid, 0, timestamp)
        event['args'] = {counter: value}
        self._events.append(event)

X
Xin Pan 已提交
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
    def format_to_string(self, pretty=False):
        """Formats the chrome trace to a string.

        Args:
          pretty: (Optional.)  If True, produce human-readable JSON output.

        Returns:
          A JSON-formatted string in Chrome Trace format.
        """
        trace = {}
        trace['traceEvents'] = self._metadata + self._events
        if pretty:
            return json.dumps(trace, indent=4, separators=(',', ': '))
        else:
            return json.dumps(trace, separators=(',', ':'))


class Timeline(object):
130 131
    def __init__(self, profile_dict):
        self._profile_dict = profile_dict
X
Xin Pan 已提交
132 133
        self._pid = 0
        self._devices = dict()
C
chengduo 已提交
134
        self._mem_devices = dict()
X
Xin Pan 已提交
135 136 137 138 139 140 141 142
        self._chrome_trace = _ChromeTraceFormatter()

    def _allocate_pid(self):
        cur_pid = self._pid
        self._pid += 1
        return cur_pid

    def _allocate_pids(self):
143
        for k, profile_pb in self._profile_dict.items():
144 145 146 147 148
            for event in profile_pb.events:
                if event.type == profiler_pb2.Event.CPU:
                    if (k, event.device_id, "CPU") not in self._devices:
                        pid = self._allocate_pid()
                        self._devices[(k, event.device_id, "CPU")] = pid
149
                        # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
150 151 152 153
                        if event.device_id == -1:
                            self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
                        else:
                            self._chrome_trace.emit_pid(
154 155
                                "%s:cpu:block:%d" % (k, event.device_id), pid
                            )
156 157 158 159
                elif event.type == profiler_pb2.Event.GPUKernel:
                    if (k, event.device_id, "GPUKernel") not in self._devices:
                        pid = self._allocate_pid()
                        self._devices[(k, event.device_id, "GPUKernel")] = pid
160
                        self._chrome_trace.emit_pid(
161 162
                            "%s:gpu:%d" % (k, event.device_id), pid
                        )
C
chengduo 已提交
163 164
            if not hasattr(profile_pb, "mem_events"):
                continue
C
chengduo 已提交
165 166 167 168 169 170 171
            for mevent in profile_pb.mem_events:
                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
172 173
                            pid,
                        )
C
chengduo 已提交
174 175 176 177 178 179
                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
180 181
                            pid,
                        )
C
chengduo 已提交
182
                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
183 184 185 186 187
                    if (
                        k,
                        mevent.device_id,
                        "CUDAPinnedPlace",
                    ) not in self._mem_devices:
C
chengduo 已提交
188
                        pid = self._allocate_pid()
189 190 191
                        self._mem_devices[
                            (k, mevent.device_id, "CUDAPinnedPlace")
                        ] = pid
C
chengduo 已提交
192
                        self._chrome_trace.emit_pid(
193 194 195 196
                            "memory usage on %s:cudapinnedplace:%d"
                            % (k, mevent.device_id),
                            pid,
                        )
197 198 199 200 201 202
                elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
                    if (k, mevent.device_id, "NPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "NPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:npu:%d" % (k, mevent.device_id),
203 204
                            pid,
                        )
C
chengduo 已提交
205 206 207
                if (k, 0, "CPU") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "CPU")] = pid
208
                    self._chrome_trace.emit_pid(
209 210
                        "memory usage on %s:cpu:%d" % (k, 0), pid
                    )
C
chengduo 已提交
211 212 213
                if (k, 0, "GPU") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "GPU")] = pid
214
                    self._chrome_trace.emit_pid(
215 216
                        "memory usage on %s:gpu:%d" % (k, 0), pid
                    )
C
chengduo 已提交
217 218 219 220
                if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
                    self._chrome_trace.emit_pid(
221 222
                        "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid
                    )
223 224 225
                if (k, 0, "NPU") not in self._mem_devices:
                    pid = self._allocate_pid()
                    self._mem_devices[(k, 0, "NPU")] = pid
226
                    self._chrome_trace.emit_pid(
227 228
                        "memory usage on %s:npu:%d" % (k, 0), pid
                    )
X
Xin Pan 已提交
229 230

    def _allocate_events(self):
231
        for k, profile_pb in self._profile_dict.items():
232 233 234 235 236 237 238 239
            for event in profile_pb.events:
                if event.type == profiler_pb2.Event.CPU:
                    type = "CPU"
                elif event.type == profiler_pb2.Event.GPUKernel:
                    type = "GPUKernel"
                pid = self._devices[(k, event.device_id, type)]
                args = {'name': event.name}
                if event.memcopy.bytes > 0:
240
                    args['mem_bytes'] = event.memcopy.bytes
C
chengduo 已提交
241
                if hasattr(event, "detail_info") and event.detail_info:
242
                    args['detail_info'] = event.detail_info
243 244 245
                # TODO(panyx0718): Chrome tracing only handles ms. However, some
                # ops takes micro-seconds. Hence, we keep the ns here.
                self._chrome_trace.emit_region(
246 247 248 249 250 251 252 253
                    event.start_ns,
                    (event.end_ns - event.start_ns) / 1.0,
                    pid,
                    event.sub_device_id,
                    'Op',
                    event.name,
                    args,
                )
X
Xin Pan 已提交
254

C
chengduo 已提交
255
    def _allocate_memory_event(self):
C
chengduo 已提交
256 257
        if not hasattr(profiler_pb2, "MemEvent"):
            return
C
chengduo 已提交
258 259 260
        place_to_str = {
            profiler_pb2.MemEvent.CPUPlace: "CPU",
            profiler_pb2.MemEvent.CUDAPlace: "GPU",
261
            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
262
            profiler_pb2.MemEvent.NPUPlace: "NPU",
C
chengduo 已提交
263
        }
264
        for k, profile_pb in self._profile_dict.items():
C
chengduo 已提交
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
            mem_list = []
            end_profiler = 0
            for mevent in profile_pb.mem_events:
                crt_info = dict()
                crt_info['time'] = mevent.start_ns
                crt_info['size'] = mevent.bytes
                if mevent.place in place_to_str:
                    place = place_to_str[mevent.place]
                else:
                    place = "UnDefine"
                crt_info['place'] = place
                pid = self._mem_devices[(k, mevent.device_id, place)]
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                mem_list.append(crt_info)
                crt_info = dict()
                crt_info['place'] = place
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                crt_info['time'] = mevent.end_ns
                crt_info['size'] = -mevent.bytes
                mem_list.append(crt_info)
                end_profiler = max(end_profiler, crt_info['time'])
            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
            i = 0
            total_size = 0
            while i < len(mem_list):
                total_size += mem_list[i]['size']
295 296 297 298
                while (
                    i < len(mem_list) - 1
                    and mem_list[i]['time'] == mem_list[i + 1]['time']
                ):
C
chengduo 已提交
299 300 301
                    total_size += mem_list[i + 1]['size']
                    i += 1

302 303 304 305 306 307 308 309
                self._chrome_trace.emit_counter(
                    "Memory",
                    "Memory",
                    mem_list[i]['pid'],
                    mem_list[i]['time'],
                    0,
                    total_size,
                )
C
chengduo 已提交
310 311
                i += 1

X
Xin Pan 已提交
312 313 314
    def generate_chrome_trace(self):
        self._allocate_pids()
        self._allocate_events()
C
chengduo 已提交
315
        self._allocate_memory_event()
X
Xin Pan 已提交
316 317 318 319 320 321 322 323 324 325
        return self._chrome_trace.format_to_string()


profile_path = '/tmp/profile'
if args.profile_path:
    profile_path = args.profile_path
timeline_path = '/tmp/timeline'
if args.timeline_path:
    timeline_path = args.timeline_path

326 327
profile_paths = profile_path.split(',')
profile_dict = dict()
C
chengduoZH 已提交
328
if len(profile_paths) == 1:
329
    with open(profile_path, 'rb') as f:
330 331 332 333 334 335 336
        profile_s = f.read()
        profile_pb = profiler_pb2.Profile()
        profile_pb.ParseFromString(profile_s)
    profile_dict['trainer'] = profile_pb
else:
    for profile_path in profile_paths:
        k, v = profile_path.split('=')
337
        with open(v, 'rb') as f:
338 339 340 341 342 343
            profile_s = f.read()
            profile_pb = profiler_pb2.Profile()
            profile_pb.ParseFromString(profile_s)
        profile_dict[k] = profile_pb

tl = Timeline(profile_dict)
X
Xin Pan 已提交
344 345
with open(timeline_path, 'w') as f:
    f.write(tl.generate_chrome_trace())