未验证 提交 2ea5e02c 编写于 作者: C chenjian 提交者: GitHub

[cherry-pick] Refine user experience for profiler (#41989)

* fix divide zero error when cpu only (#41794)

* reduce performance influence by RecordEvent in Python (#41822)

* reduce performance influence

* add unit test

* fix

* Rebase for profiler statistic ratio (#41939)

* fix according to suggestion

* add kernel summary

* improve coverage
上级 85a4ecb6
......@@ -20,6 +20,7 @@ import tempfile
import paddle
import paddle.profiler as profiler
import paddle.profiler.utils as utils
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, DataLoader
......@@ -40,11 +41,17 @@ class TestProfiler(unittest.TestCase):
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof:
y = x / 2.0
prof = None
self.assertEqual(utils._is_profiler_used, False)
with profiler.RecordEvent(name='test'):
y = x / 2.0
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU],
scheduler=(1, 2)) as prof:
self.assertEqual(utils._is_profiler_used, True)
with profiler.RecordEvent(name='test'):
y = x / 2.0
prof = None
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU],
......
......@@ -51,8 +51,9 @@ class TestProfilerStatistic(unittest.TestCase):
profilerstep_node = HostPythonNode('ProfileStep#1',
profiler.TracerEventType.ProfileStep,
0, 400, 1000, 1001)
dataloader_node = HostPythonNode(
'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
dataloader_node = HostPythonNode('Dataloader',
profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001)
mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
yolonet_node = HostPythonNode(
......@@ -155,7 +156,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep), 400)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 100)
profiler.TracerEventType.Forward), 90)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80)
......@@ -185,12 +186,12 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.Communication), 5)
self.assertEqual(len(event_summary.items), 2)
self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 4)
self.assertEqual(len(event_summary.model_perspective_items), 5)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100)
event_summary.model_perspective_items['Forward'].cpu_time, 90)
self.assertEqual(
event_summary.model_perspective_items['Forward'].general_gpu_time,
135)
......@@ -217,8 +218,9 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep,
0, 400, 1000, 1001)
dataloader_node = HostPythonNode(
'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
dataloader_node = HostPythonNode('Dataloader',
profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001)
mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
......@@ -372,7 +374,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep), 400)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 100)
profiler.TracerEventType.Forward), 90)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80)
......@@ -417,12 +419,12 @@ class TestProfilerStatistic(unittest.TestCase):
distributed_summary.overlap_range), 85)
self.assertEqual(len(event_summary.items), 4)
self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 4)
self.assertEqual(len(event_summary.model_perspective_items), 5)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100)
event_summary.model_perspective_items['Forward'].cpu_time, 90)
self.assertEqual(
event_summary.model_perspective_items['Forward'].general_gpu_time,
315)
......@@ -441,6 +443,86 @@ class TestProfilerStatistic(unittest.TestCase):
thread_sep=False,
time_unit='ms'))
def test_statistic_case3(self):
    """Coverage test for the zero-duration path of the statistic code.

    Builds a profiling tree in which every operator/kernel event has zero
    duration (start == end), then checks that summary items report 0 time
    and that ``_build_table`` renders without a divide-by-zero error for
    every supported sort key.
    """
    # Root node spans the whole capture; its single step holds one of each
    # model-perspective phase (Dataloader/Forward/Backward/Optimization).
    root_node = HostPythonNode('Root Node',
                               profiler.TracerEventType.UserDefined, 0,
                               float('inf'), 1000, 1001)
    profilerstep_node = HostPythonNode('ProfileStep#1',
                                       profiler.TracerEventType.ProfileStep,
                                       0, 400, 1000, 1001)
    dataloader_node = HostPythonNode('Dataloader',
                                     profiler.TracerEventType.Dataloader, 5,
                                     15, 1000, 1001)
    mobilenet_node = HostPythonNode(
        'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
    backward_node = HostPythonNode('Gradient Backward',
                                   profiler.TracerEventType.Backward, 120,
                                   200, 1000, 1001)
    optimization_node = HostPythonNode(
        'Optimization', profiler.TracerEventType.Optimization, 220, 300,
        1000, 1001)
    userdefined_node = HostPythonNode('Communication Time',
                                      profiler.TracerEventType.UserDefined,
                                      60, 70, 1000, 1001)
    # All operator/kernel events below are deliberately zero-length
    # (start == end) to exercise the gpu_total_time == 0 branches.
    conv2d_node = HostPythonNode(
        'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
    conv2d_infer_shape = HostPythonNode(
        'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
        25, 1000, 1001)
    conv2d_compute = HostPythonNode('conv2d::compute',
                                    profiler.TracerEventType.OperatorInner,
                                    25, 25, 1000, 1001)
    conv2d_launchkernel = HostPythonNode(
        'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
        1000, 1001)
    conv2d_kernel = DevicePythonNode(
        'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
    # Long templated kernel name exercises the name-truncation logic in
    # the Kernel Summary table.
    another_kernel = DevicePythonNode(
        'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
        profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
    root_node.children_node.append(profilerstep_node)
    profilerstep_node.children_node.extend([
        dataloader_node, mobilenet_node, userdefined_node, backward_node,
        optimization_node
    ])
    mobilenet_node.children_node.append(conv2d_node)
    conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute])
    conv2d_compute.runtime_node.append(conv2d_launchkernel)
    conv2d_launchkernel.device_node.append(conv2d_kernel)
    conv2d_launchkernel.device_node.append(another_kernel)
    thread_tree = {'thread1001': root_node}
    extra_info = {
        'Process Cpu Utilization': '1.02',
        'System Cpu Utilization': '0.68'
    }
    statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
                                                               extra_info)
    event_summary = statistic_data.event_summary
    # Zero-duration events must aggregate to exactly 0, not raise.
    self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
    self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
    self.assertEqual(event_summary.userdefined_items['Communication Time']
                     .general_gpu_time, 0)
    # Render the report table under every sort key; any divide-by-zero
    # would surface here as an exception.
    for sort_key in [
            profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
            profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
            profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax,
            profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
    ]:
        print(
            profiler.profiler_statistic._build_table(
                statistic_data,
                sorted_by=sort_key,
                op_detail=True,
                thread_sep=False,
                time_unit='ms'))
# Run the profiler unit tests when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
......@@ -27,6 +27,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
from .utils import RecordEvent, wrap_optimizers
from .profiler_statistic import StatisticData, _build_table, SortedKeys
from paddle.profiler import utils
from .timer import benchmark
......@@ -475,6 +476,7 @@ class Profiler:
if self.timer_only:
return
# CLOSED -> self.current_state
utils._is_profiler_used = True
if self.current_state == ProfilerState.READY:
self.profiler.prepare()
elif self.current_state == ProfilerState.RECORD:
......@@ -527,6 +529,7 @@ class Profiler:
self.profiler_result = self.profiler.stop()
if self.on_trace_ready:
self.on_trace_ready(self)
utils._is_profiler_used = False
def step(self, num_samples: Optional[int]=None):
r"""
......
......@@ -78,15 +78,19 @@ class HostStatisticNode:
self.self_gpu_time = 0
self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset
self.self_general_gpu_time = 0
self.is_terminal_operator_node = True
def cal_statistic(self):
for child in self.children_node:
child.cal_statistic()
if child.is_terminal_operator_node == False:
self.is_terminal_operator_node = False
for rt in self.runtime_node:
rt.cal_statistic()
self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
for child in self.children_node:
if child.type == TracerEventType.Operator:
self.is_terminal_operator_node = False
self.gpu_time += child.gpu_time
self.general_gpu_time += child.general_gpu_time
self.self_cpu_time -= (child.end_ns - child.start_ns)
......@@ -419,10 +423,10 @@ class EventSummary:
for runtimenode in node.runtime_node:
for devicenode in runtimenode.device_node:
if devicenode.name not in self.devices:
self.devices[devicenode.name] = EventSummary.DeviceItem(
devicenode.name)
self.devices[devicenode.name].add_item(devicenode)
name = devicenode.name
if name not in self.devices:
self.devices[name] = EventSummary.DeviceItem(name)
self.devices[name].add_item(devicenode)
class GeneralItem:
def __init__(self, name):
......@@ -489,6 +493,7 @@ class EventSummary:
dict) # for userdefined summary
self.model_perspective_items = {} # for model summary
self.memory_manipulation_items = {} # for memory manipulation summary
self.kernel_items = {} # for kernel summary
def parse(self, nodetrees):
r"""
......@@ -508,6 +513,7 @@ class EventSummary:
self.add_memory_manipulation_item(host_statistic_node)
else:
self.add_userdefined_item(host_statistic_node)
self.add_kernel_item(host_statistic_nodes[0])
for threadid, root_statistic_node in node_statistic_trees.items():
deque = collections.deque()
......@@ -525,11 +531,7 @@ class EventSummary:
deque.append(child)
def add_operator_item(self, operator_node):
have_inner = False
for child in operator_node.children_node:
if child.type == TracerEventType.OperatorInner:
have_inner = True
if have_inner == False:
if operator_node.is_terminal_operator_node == False:
return
if operator_node.name not in self.items:
self.items[operator_node.name] = EventSummary.OperatorItem(
......@@ -585,6 +587,15 @@ class EventSummary:
self.model_perspective_items[name] = EventSummary.GeneralItem(name)
self.model_perspective_items[name].add_item(model_perspective_node)
def add_kernel_item(self, root_node):
    """Aggregate every device Kernel event under *root_node* into kernel_items.

    Walks all device nodes reachable from the given host node tree and
    records each Kernel-type event in a per-name DeviceItem bucket,
    creating the bucket on first sight of a kernel name.
    """
    for device_event in get_device_nodes(root_node):
        # Only GPU kernel launches belong in the kernel summary; skip
        # memcpy/memset and other device event kinds.
        if device_event.type != TracerEventType.Kernel:
            continue
        key = device_event.name
        if key not in self.kernel_items:
            self.kernel_items[key] = EventSummary.DeviceItem(key)
        self.kernel_items[key].add_item(device_event)
class StatisticData:
r"""
......@@ -752,6 +763,9 @@ def _build_table(statistic_data,
cpu_call_times[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].call
cpu_type_time[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].cpu_time
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
......@@ -800,7 +814,6 @@ def _build_table(statistic_data,
append(
"Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"The time with ratio 100% is the base time for calculating ratio. \n"
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n"
......@@ -820,13 +833,18 @@ def _build_table(statistic_data,
all_row_values = []
accmulation_time = 0
gpu_accmulation_time = 0
gpu_total_time = 0
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization'
]:
if name in model_perspective_items:
item = model_perspective_items[name]
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
name = '{}'.format(
name) if 'ProfileStep' in name else ' {}'.format(name)
row_values = [
......@@ -850,17 +868,19 @@ def _build_table(statistic_data,
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time))
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if 'ProfileStep' not in name:
accmulation_time += item.cpu_time
gpu_accmulation_time += item.gpu_time
else:
gpu_total_time = item.gpu_time
gpu_accmulation_time += item.general_gpu_time
other_time = total_time - accmulation_time
other_gpu_time = gpu_total_time - gpu_accmulation_time
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(other_gpu_time) / gpu_total_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(
......@@ -869,7 +889,7 @@ def _build_table(statistic_data,
'{} / - / - / - / {}'.format(
format_time(
other_gpu_time, unit=time_unit),
format_ratio(float(other_gpu_time) / gpu_total_time))
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
......@@ -913,7 +933,6 @@ def _build_table(statistic_data,
append(
"Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
"Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
"The time with ratio 100% is the base time for calculating ratio. \n"
)
append('-' * line_length)
append('')
......@@ -981,7 +1000,6 @@ def _build_table(statistic_data,
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersects with computation time.\n"
"The time with ratio 100% is the base time for calculating ratio. \n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
......@@ -1040,8 +1058,22 @@ def _build_table(statistic_data,
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_general_gpu_time)
total_op_cpu_time = 0
total_op_gpu_time = 0
for name, item in sorted_items:
total_op_cpu_time += item.cpu_time
total_op_gpu_time += item.general_gpu_time
for name, item in sorted_items:
if total_op_cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(item.cpu_time) / total_op_cpu_time
if total_op_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format(
format_time(
......@@ -1052,7 +1084,7 @@ def _build_table(statistic_data,
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.general_gpu_time, unit=time_unit),
......@@ -1062,13 +1094,22 @@ def _build_table(statistic_data,
item.max_general_gpu_time, unit=time_unit),
format_time(
item.min_general_gpu_time, unit=time_unit),
format_ratio(
float(item.general_gpu_time) / total_time))
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if op_detail:
for innerop_name, innerop_node in item.operator_inners.items(
):
if item.cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(
innerop_node.cpu_time) / item.cpu_time
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(innerop_node.general_gpu_time
) / item.general_gpu_time
if len(innerop_name) + 2 > name_column_width:
innerop_name = innerop_name[:name_column_width - 5]
innerop_name += "..."
......@@ -1083,8 +1124,7 @@ def _build_table(statistic_data,
innerop_node.max_cpu_time, unit=time_unit),
format_time(
innerop_node.min_cpu_time, unit=time_unit),
format_ratio(
float(innerop_node.cpu_time) / total_time)),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(
innerop_node.general_gpu_time,
......@@ -1098,13 +1138,17 @@ def _build_table(statistic_data,
format_time(
innerop_node.min_general_gpu_time,
unit=time_unit),
format_ratio(
float(innerop_node.general_gpu_time) /
total_time))
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in innerop_node.devices.items(
):
if innerop_node.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.
gpu_time) / innerop_node.general_gpu_time
if len(device_node_name) + 4 > name_column_width:
device_node_name = device_node_name[:
name_column_width
......@@ -1125,12 +1169,15 @@ def _build_table(statistic_data,
format_time(
device_node.min_gpu_time,
unit=time_unit),
format_ratio(
float(device_node.gpu_time) /
total_time))
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in item.devices.items():
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.gpu_time) / item.general_gpu_time
if len(device_node_name) + 2 > name_column_width:
device_node_name = device_node_name[:
name_column_width
......@@ -1148,8 +1195,7 @@ def _build_table(statistic_data,
device_node.max_gpu_time, unit=time_unit),
format_time(
device_node.min_gpu_time, unit=time_unit),
format_ratio(
float(device_node.gpu_time) / total_time))
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
......@@ -1197,11 +1243,106 @@ def _build_table(statistic_data,
append('')
append('')
###### Print Kernel Summary Report ######
if statistic_data.event_summary.kernel_items:
all_row_values = []
kernel_items = statistic_data.event_summary.kernel_items
if sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
kernel_items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
kernel_items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
kernel_items.items(), key=lambda x: x[1].min_gpu_time)
else:
sorted_items = sorted(
kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
total_kernel_gpu_time = 0
for name, item in sorted_items:
total_kernel_gpu_time += item.gpu_time
for name, item in sorted_items:
if total_kernel_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)']
# Calculate the column width
name_column_width = 90
calltime_width = 6
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[2])
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Kernel Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
indx = row_values[0].find('(')
if indx != -1:
name = row_values[0][:indx]
else:
name = row_values[0]
if len(name) > name_column_width:
row_values[0] = name[:name_column_width - 3] + '...'
else:
row_values[0] = name
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items:
all_row_values = []
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name, item in memory_manipulation_items.items():
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
row_values = [
name,
item.call,
......@@ -1224,7 +1365,7 @@ def _build_table(statistic_data,
item.max_general_gpu_time, unit=time_unit),
format_time(
item.min_general_gpu_time, unit=time_unit),
format_ratio(float(item.general_gpu_time) / total_time)),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
......@@ -1274,6 +1415,8 @@ def _build_table(statistic_data,
###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items:
all_row_values = []
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
if thread_sep == True:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else:
......@@ -1319,6 +1462,10 @@ def _build_table(statistic_data,
items.items(), key=lambda x: x[1].min_general_gpu_time)
for name, item in sorted_items:
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
row_values = [
name,
item.call,
......@@ -1341,8 +1488,7 @@ def _build_table(statistic_data,
item.max_general_gpu_time, unit=time_unit),
format_time(
item.min_general_gpu_time, unit=time_unit),
format_ratio(
float(item.general_gpu_time) / total_time)),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
......
......@@ -20,6 +20,8 @@ from contextlib import ContextDecorator
from paddle.fluid import core
from paddle.fluid.core import (_RecordEvent, TracerEventType)
_is_profiler_used = False
_AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep,
TracerEventType.UserDefined, TracerEventType.Forward,
......@@ -91,6 +93,8 @@ class RecordEvent(ContextDecorator):
result = data1 - data2
record_event.end()
"""
if not _is_profiler_used:
return
if self.event_type not in _AllowedEventTypeList:
warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
can be recorded.".format(*_AllowedEventTypeList))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册