# test_profiler_statistic.py
#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import paddle
import paddle.profiler as profiler


class HostPythonNode:
    """Minimal host-side event node used to hand-build event trees in tests.

    Args:
        name: Event name.
        type: Event type tag (the tests in this file pass
            ``profiler.TracerEventType`` members). The parameter name shadows
            the builtin ``type``; it is kept so existing positional and
            keyword callers continue to work.
        start_ns: Start timestamp of the event.
        end_ns: End timestamp of the event.
        process_id: Id of the process the event is attributed to.
        thread_id: Id of the thread the event is attributed to.
    """

    def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
        self.name = name
        self.type = type
        self.start_ns = start_ns
        self.end_ns = end_ns
        self.process_id = process_id
        self.thread_id = thread_id
        # Each instance gets its own fresh lists so that linking nodes under
        # one parent never leaks into another node.
        self.children_node = []  # nested host events
        self.runtime_node = []  # runtime-call events attached to this event
        self.device_node = []  # device events attached to this event


class DevicePythonNode:
    """Minimal device-side event node used to hand-build event trees in tests.

    Args:
        name: Event name.
        type: Event type tag (the tests in this file pass
            ``profiler.TracerEventType`` members). The parameter name shadows
            the builtin ``type``; it is kept so existing positional and
            keyword callers continue to work.
        start_ns: Start timestamp of the event.
        end_ns: End timestamp of the event.
        device_id: Id of the device the event ran on.
        context_id: Id of the device context.
        stream_id: Id of the device stream.
    """

    def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
                 stream_id):
        self.name = name
        self.type = type
        self.start_ns = start_ns
        self.end_ns = end_ns
        self.device_id = device_id
        self.context_id = context_id
        self.stream_id = stream_id


class TestProfilerStatistic(unittest.TestCase):
    """Exercise ``profiler.profiler_statistic.StatisticData`` on hand-built trees.

    Each test assembles a small tree of ``HostPythonNode`` /
    ``DevicePythonNode`` objects, feeds it to ``StatisticData``, checks the
    resulting summaries and finally prints the table produced by
    ``profiler.profiler_statistic._build_table``.
    """

    # Every host node in these fixtures shares one process and one thread.
    _PROCESS_ID = 1000
    _THREAD_ID = 1001

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------
    @classmethod
    def _host(cls, name, event_type, start_ns, end_ns):
        """Create a host node on the shared fixture process/thread."""
        return HostPythonNode(name, event_type, start_ns, end_ns,
                              cls._PROCESS_ID, cls._THREAD_ID)

    @staticmethod
    def _device(name, event_type, start_ns, end_ns, stream_id=0):
        """Create a device node on device 0, context 0 and ``stream_id``."""
        return DevicePythonNode(name, event_type, start_ns, end_ns, 0, 0,
                                stream_id)

    def _build_root_and_step(self):
        """Return ``(root, step)`` with the 0-400 profile step under the root."""
        etype = profiler.TracerEventType
        root_node = self._host('Root Node', etype.UserDefined, 0,
                               float('inf'))
        profilerstep_node = self._host('ProfileStep#1', etype.ProfileStep, 0,
                                       400)
        root_node.children_node.append(profilerstep_node)
        return root_node, profilerstep_node

    def _build_conv2d_op(self):
        """Return the 'conv2d' operator subtree shared by case1 and case2."""
        etype = profiler.TracerEventType
        conv2d_node = self._host('conv2d', etype.Operator, 25, 40)
        infer_shape = self._host('conv2d::infer_shape', etype.OperatorInner,
                                 25, 30)
        compute = self._host('conv2d::compute', etype.OperatorInner, 30, 40)
        launchkernel = self._host('cudalaunchkernel', etype.CudaRuntime, 30,
                                  35)
        async_memcpy = self._host('AsyncMemcpy', etype.UserDefined, 35, 40)
        cuda_memcpy = self._host('cudaMemcpy', etype.CudaRuntime, 35, 40)

        conv2d_node.children_node.extend(
            [infer_shape, compute, async_memcpy])
        compute.runtime_node.append(launchkernel)
        async_memcpy.runtime_node.append(cuda_memcpy)
        launchkernel.device_node.append(
            self._device('conv2d_kernel', etype.Kernel, 35, 50))
        cuda_memcpy.device_node.append(
            self._device('conv2d_memcpy', etype.Memcpy, 50, 60))
        return conv2d_node

    def _build_sync_batch_norm_op(self, kernel_end_ns):
        """Return the 'sync_batch_norm' operator subtree.

        ``kernel_end_ns`` is the end timestamp of the device kernel, the only
        value that differs between case1 (155) and case2 (300).
        """
        etype = profiler.TracerEventType
        op_node = self._host('sync_batch_norm', etype.Operator, 60, 100)
        infer_shape = self._host('sync_batch_norm::infer_shape',
                                 etype.OperatorInner, 60, 70)
        compute = self._host('sync_batch_norm::compute', etype.OperatorInner,
                             80, 100)
        launchkernel = self._host('cudalaunchkernel', etype.CudaRuntime, 80,
                                  90)
        async_memcpy = self._host('AsyncMemcpy', etype.UserDefined, 90, 100)
        cuda_memcpy = self._host('cudaMemcpy', etype.CudaRuntime, 90, 100)

        op_node.children_node.extend([infer_shape, compute, async_memcpy])
        compute.runtime_node.append(launchkernel)
        async_memcpy.runtime_node.append(cuda_memcpy)
        launchkernel.device_node.append(
            self._device('sync_batch_norm_kernel', etype.Kernel, 95,
                         kernel_end_ns))
        # The memcpy runs on stream 1, unlike the other device events.
        cuda_memcpy.device_node.append(
            self._device('sync_batch_norm_memcpy', etype.Memcpy, 150, 200,
                         stream_id=1))
        return op_node

    @staticmethod
    def _make_statistic_data(root_node):
        """Wrap ``root_node`` as the single thread tree of a StatisticData."""
        thread_tree = {'thread1001': root_node}
        extra_info = {
            'Process Cpu Utilization': '1.02',
            'System Cpu Utilization': '0.68'
        }
        return profiler.profiler_statistic.StatisticData(
            thread_tree, extra_info)

    @staticmethod
    def _print_table(statistic_data, sorted_by):
        """Print the summary table for ``statistic_data`` sorted by ``sorted_by``."""
        print(
            profiler.profiler_statistic._build_table(statistic_data,
                                                     sorted_by=sorted_by,
                                                     op_detail=True,
                                                     thread_sep=False,
                                                     time_unit='ms'))

    # ------------------------------------------------------------------
    # Assertion helpers
    # ------------------------------------------------------------------
    def _assert_cpu_range_sums(self, time_range_summary, expected):
        """Check ``get_cpu_range_sum`` for every ``(event_type, total)`` pair."""
        for event_type, total in expected:
            self.assertEqual(
                time_range_summary.get_cpu_range_sum(event_type), total)

    def _assert_gpu_range_sums(self, time_range_summary, kernel, memcpy):
        """Check device-0 kernel and memcpy totals of ``time_range_summary``."""
        etype = profiler.TracerEventType
        self.assertEqual(
            time_range_summary.get_gpu_range_sum(0, etype.Kernel), kernel)
        self.assertEqual(
            time_range_summary.get_gpu_range_sum(0, etype.Memcpy), memcpy)

    def _assert_event_summary(self, event_summary, op_count,
                              forward_gpu_time):
        """Check the event-summary values shared by case1 and case2."""
        self.assertEqual(len(event_summary.items), op_count)
        self.assertEqual(len(event_summary.userdefined_items), 1)
        self.assertEqual(len(event_summary.model_perspective_items), 5)
        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].cpu_time, 90)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].general_gpu_time,
            forward_gpu_time)
        self.assertEqual(
            event_summary.model_perspective_items['Backward'].general_gpu_time,
            0)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].
            general_gpu_time, 60)

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------
    def test_statistic_case1(self):
        """One step with conv2d and sync_batch_norm operators plus a
        communication event nested in a Python user-defined range."""
        etype = profiler.TracerEventType
        root_node, profilerstep_node = self._build_root_and_step()
        dataloader_node = self._host('Dataloader', etype.Dataloader, 5, 15)
        mobilenet_node = self._host('MobileNet', etype.Forward, 20, 50)
        yolonet_node = self._host('Yolov3Net', etype.Forward, 50, 110)
        userdefined_node = self._host('Communication Time',
                                      etype.PythonUserDefined, 100, 110)
        communication_node = self._host('Communication', etype.Communication,
                                        105, 110)
        backward_node = self._host('Gradient Backward', etype.Backward, 120,
                                   200)
        optimization_node = self._host('Optimization', etype.Optimization,
                                       220, 300)

        profilerstep_node.children_node.extend([
            dataloader_node, mobilenet_node, yolonet_node, backward_node,
            optimization_node
        ])
        mobilenet_node.children_node.append(self._build_conv2d_op())
        yolonet_node.children_node.extend([
            self._build_sync_batch_norm_op(kernel_end_ns=155),
            userdefined_node
        ])
        userdefined_node.children_node.append(communication_node)

        statistic_data = self._make_statistic_data(root_node)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary

        self._assert_cpu_range_sums(time_range_summary, [
            (etype.ProfileStep, 400),
            (etype.Forward, 90),
            (etype.Backward, 80),
            (etype.Optimization, 80),
            (etype.Operator, 55),
            (etype.OperatorInner, 45),
            (etype.CudaRuntime, 30),
            (etype.UserDefined, 15),
            (etype.Communication, 5),
        ])
        self._assert_gpu_range_sums(time_range_summary, kernel=75, memcpy=60)
        self._assert_event_summary(event_summary,
                                   op_count=2,
                                   forward_gpu_time=135)
        self._print_table(statistic_data, profiler.SortedKeys.CPUTotal)

    def test_statistic_case2(self):
        """Case1's tree extended with three allreduce launches whose kernels
        run on stream 2, to check the distributed summary as well."""
        etype = profiler.TracerEventType
        root_node, profilerstep_node = self._build_root_and_step()
        dataloader_node = self._host('Dataloader', etype.Dataloader, 5, 15)
        mobilenet_node = self._host('MobileNet', etype.Forward, 20, 50)
        yolonet_node = self._host('Yolov3Net', etype.Forward, 50, 110)
        userdefined_node = self._host('Communication Time',
                                      etype.PythonUserDefined, 100, 110)
        communication_node = self._host('Communication', etype.Communication,
                                        105, 110)
        backward_node = self._host('Gradient Backward', etype.Backward, 120,
                                   200)
        optimization_node = self._host('Optimization', etype.Optimization,
                                       220, 300)

        # allreduce #0: launched directly from the user-defined range.
        allreduce_launchkernel0 = self._host('cudalaunchkernel',
                                             etype.CudaRuntime, 102, 104)
        nccl_allreduce_kernel0 = self._device('nccl_allreduce_kernel',
                                              etype.Kernel, 105, 120,
                                              stream_id=2)

        # allreduce #1: an operator nested inside the communication event.
        allreduce_op1 = self._host('allreduce_op1', etype.Operator, 105, 108)
        allreduce_op1_infershape = self._host('allreduce_op1::infershape',
                                              etype.OperatorInner, 105, 106)
        allreduce_launchkernel1 = self._host('cudalaunchkernel',
                                             etype.CudaRuntime, 106, 107)
        nccl_allreduce_kernel1 = self._device('nccl_allreduce_kernel',
                                              etype.Kernel, 130, 150,
                                              stream_id=2)

        # allreduce #2: an operator nested inside the optimization phase.
        allreduce_node2 = self._host('allreduce', etype.Operator, 230, 250)
        allreduce_node2_infershape = self._host('allreduce_node2::infershape',
                                                etype.OperatorInner, 231, 232)
        allreduce_launchkernel2 = self._host('cudalaunchkernel',
                                             etype.CudaRuntime, 235, 240)
        nccl_allreduce_kernel2 = self._device('nccl_allreduce_kernel',
                                              etype.Kernel, 250, 280,
                                              stream_id=2)

        profilerstep_node.children_node.extend([
            dataloader_node, mobilenet_node, yolonet_node, backward_node,
            optimization_node
        ])
        mobilenet_node.children_node.append(self._build_conv2d_op())
        yolonet_node.children_node.extend([
            self._build_sync_batch_norm_op(kernel_end_ns=300),
            userdefined_node
        ])
        userdefined_node.children_node.append(communication_node)
        userdefined_node.runtime_node.append(allreduce_launchkernel0)
        allreduce_launchkernel0.device_node.append(nccl_allreduce_kernel0)
        communication_node.children_node.append(allreduce_op1)
        allreduce_op1.children_node.append(allreduce_op1_infershape)
        allreduce_op1.runtime_node.append(allreduce_launchkernel1)
        allreduce_launchkernel1.device_node.append(nccl_allreduce_kernel1)
        optimization_node.children_node.append(allreduce_node2)
        allreduce_node2.children_node.append(allreduce_node2_infershape)
        allreduce_node2.runtime_node.append(allreduce_launchkernel2)
        allreduce_launchkernel2.device_node.append(nccl_allreduce_kernel2)

        statistic_data = self._make_statistic_data(root_node)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary
        distributed_summary = statistic_data.distributed_summary

        self._assert_cpu_range_sums(time_range_summary, [
            (etype.ProfileStep, 400),
            (etype.Forward, 90),
            (etype.Backward, 80),
            (etype.Optimization, 80),
            (etype.Operator, 78),
            (etype.OperatorInner, 47),
            (etype.CudaRuntime, 38),
            (etype.UserDefined, 15),
            (etype.Communication, 5),
        ])
        self._assert_gpu_range_sums(time_range_summary, kernel=220, memcpy=60)

        sum_ranges = profiler.statistic_helper.sum_ranges
        self.assertEqual(
            sum_ranges(distributed_summary.cpu_communication_range), 25)
        self.assertEqual(
            sum_ranges(distributed_summary.gpu_communication_range), 65)
        self.assertEqual(
            sum_ranges(distributed_summary.communication_range), 85)
        self.assertEqual(
            sum_ranges(distributed_summary.computation_range), 220)
        self.assertEqual(sum_ranges(distributed_summary.overlap_range), 85)

        self._assert_event_summary(event_summary,
                                   op_count=4,
                                   forward_gpu_time=315)
        self._print_table(statistic_data, profiler.SortedKeys.CPUTotal)

    def test_statistic_case3(self):
        """Coverage case: the operator, its inner/runtime events and both
        kernels have zero duration; the table is built for every sort key."""
        etype = profiler.TracerEventType
        root_node, profilerstep_node = self._build_root_and_step()
        dataloader_node = self._host('Dataloader', etype.Dataloader, 5, 15)
        mobilenet_node = self._host('MobileNet', etype.Forward, 20, 50)
        backward_node = self._host('Gradient Backward', etype.Backward, 120,
                                   200)
        optimization_node = self._host('Optimization', etype.Optimization,
                                       220, 300)
        userdefined_node = self._host('Communication Time',
                                      etype.PythonUserDefined, 60, 70)

        conv2d_node = self._host('conv2d', etype.Operator, 25, 25)
        conv2d_infer_shape = self._host('conv2d::infer_shape',
                                        etype.OperatorInner, 25, 25)
        conv2d_compute = self._host('conv2d::compute', etype.OperatorInner,
                                    25, 25)
        conv2d_launchkernel = self._host('cudalaunchkernel',
                                         etype.CudaRuntime, 25, 25)
        conv2d_kernel = self._device('conv2d_kernel', etype.Kernel, 35, 35)
        another_kernel = self._device(
            'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
            etype.Kernel, 35, 35)

        profilerstep_node.children_node.extend([
            dataloader_node, mobilenet_node, userdefined_node, backward_node,
            optimization_node
        ])
        mobilenet_node.children_node.append(conv2d_node)
        conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute])
        conv2d_compute.runtime_node.append(conv2d_launchkernel)
        conv2d_launchkernel.device_node.append(conv2d_kernel)
        conv2d_launchkernel.device_node.append(another_kernel)

        statistic_data = self._make_statistic_data(root_node)
        event_summary = statistic_data.event_summary

        self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
        self.assertEqual(
            event_summary.userdefined_items['Communication Time'].
            general_gpu_time, 0)

        for sort_key in [
                profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
                profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
                profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax,
                profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
        ]:
            self._print_table(statistic_data, sort_key)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()