diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py
index 6dfe6250f91b67c8edc9d2377bb55bda73edf341..c8ab1974beabe5ad32d268ca4930b78ea05331d7 100755
--- a/python/paddle/fluid/tests/unittests/test_newprofiler.py
+++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py
@@ -87,6 +87,7 @@ class TestProfiler(unittest.TestCase):
             targets=[profiler.ProfilerTarget.CPU],
             scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN,
             on_trace_ready=my_trace_back,
+            with_flops=True,
         ) as prof:
             for i in range(2):
                 y = x / 2.0
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py
index caa0d754cb649838a898b0bf36cd10c44987e45d..eef6714f2e20c94421587382607260ea678e3e6d 100644
--- a/python/paddle/profiler/profiler.py
+++ b/python/paddle/profiler/profiler.py
@@ -31,6 +31,11 @@ from paddle.fluid.core import (
     enable_memory_recorder,
     enable_op_info_recorder,
 )
 from paddle.profiler import utils
 
-from .profiler_statistic import SortedKeys, StatisticData, _build_table
+from .profiler_statistic import (
+    SortedKeys,
+    StatisticData,
+    _build_table,
+    gen_layer_flops,
+)
@@ -883,6 +888,18 @@ class Profiler:
                 )
             )
+        if self.with_flops:
+            self._print_flops()
+
+    def _print_flops(self, repeat=1):
+        if not self.with_flops:
+            print('ERROR: with_flops disabled.')
+            return
+
+        print(" Flops Profiler Begin ".center(100, "-"))
+        print(gen_layer_flops(self.profiler_result.get_data(), repeat))
+        print(" Flops Profiler End ".center(100, "-"))
+
 
 
 def get_profiler(config_path):
     try:
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index cedfac8b61417282f75fe8972ae20564b2261200..c928309b66eac8bc6c42d7a6c6ba76ebd0bc1bc1 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -78,6 +78,16 @@ class SortedKeys(Enum):
     GPUMin = 7
 
 
+def _nodename2opname(name):
+    r'''
+    Strip profiler suffixes from a host trace node name to get the op name.
+    '''
+    op_name = name.replace(' compute', '')
+    op_name = op_name.replace(' dygraph', '')
+    op_name = op_name.replace(' pybind_imperative_func', '')
+    return op_name
+
+
 class HostStatisticNode:
     r'''
     Wrap original node for calculating statistic metrics.
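Taken together, the test and `profiler.py` changes above wire `with_flops=True` end to end: when the flag is set, the summary path calls the new `_print_flops()`, which renders the layer table produced by `gen_layer_flops`. A minimal, hypothetical usage sketch of the new flag (the `Linear` module and input tensor are illustrative, not taken from the patch; the flag itself is exercised exactly this way by the updated unit test):

```python
# Sketch: profile a small model with the new with_flops flag.
import paddle
import paddle.profiler as profiler

linear = paddle.nn.Linear(16, 16)  # illustrative module
x = paddle.randn([4, 16])

with profiler.Profiler(
    targets=[profiler.ProfilerTarget.CPU],
    with_flops=True,  # new: summary() ends with the "Flops Profiler" section
) as prof:
    for _ in range(2):
        y = linear(x)
        prof.step()

prof.summary()  # prints the usual tables, then the layer-wise FLOPs table
```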
@@ -98,10 +108,7 @@ class HostStatisticNode:
     def cal_flops(self):
         if self.hostnode.type == TracerEventType.Operator:
             if hasattr(self.hostnode, 'input_shapes'):
-                op_name = self.hostnode.name
-                op_name = op_name.replace(' compute', '')
-                op_name = op_name.replace(' dygraph', '')
-                op_name = op_name.replace(' pybind_imperative_func', '')
+                op_name = _nodename2opname(self.hostnode.name)
                 self.flops = flops(
                     op_name,
                     self.hostnode.input_shapes,
@@ -111,6 +118,7 @@
     def cal_statistic(self):
         self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
         self.self_cpu_time = self.cpu_time
+        self.cal_flops()
         for child in self.children_node:
             child.cal_flops()
             child.cal_statistic()
@@ -177,6 +185,117 @@ def get_device_nodes(hostnode):
     return device_nodes
 
 
+def _build_layer_from_tree(nodetrees):
+    def build_layer(node, depth=0):
+
+        if "GradNode" in node.name:
+            return [], 0
+
+        if node.type in [
+            TracerEventType.Backward,
+            TracerEventType.Optimization,
+        ]:
+            return [], 0
+
+        if node.type == TracerEventType.Operator:
+            stat_node = HostStatisticNode(node)
+            stat_node.cal_statistic()
+            return stat_node, stat_node.flops
+
+        layer = []
+        nflops = 0
+        for c in node.children_node:
+            l, f = build_layer(c, depth + 1)
+            if l:
+                nflops += f
+                layer.append(l)
+
+        if node.type == TracerEventType.Forward:
+            stat_node = HostStatisticNode(node)
+            stat_node.cal_statistic()
+            stat_node.flops = nflops
+            return [stat_node, layer], nflops
+
+        return layer, nflops
+
+    ret = []
+    for _, rootnode in nodetrees.items():
+        layer, _ = build_layer(rootnode)
+        ret.append(layer)
+
+    return ret
+
+
+def _format_large_number(n, precision=2):
+    if n // 1e12 > 0:
+        return "{} T".format(round(n / 1e12, precision))
+    if n // 1e9 > 0:
+        return "{} G".format(round(n / 1e9, precision))
+    if n // 1e6 > 0:
+        return "{} M".format(round(n / 1e6, precision))
+    if n // 1e3 > 0:
+        return "{} K".format(round(n / 1e3, precision))
+    return "{}".format(round(n, precision))
+
+
+def _format_time(n, precision=2):
+    if n // 1e9 > 0:
+        return "{} s".format(round(n / 1e9, precision))
+    if n // 1e6 > 0:
+        return "{} ms".format(round(n / 1e6, precision))
+    if n // 1e3 > 0:
+        return "{} us".format(round(n / 1e3, precision))
+    return "{} ns".format(round(n, precision))
+
+
+def _gen_layer_flops(node, repeat=1):
+    ret = []
+    offset = []
+    loop = []
+
+    def print_layer_tree(node, depth=0):
+        if isinstance(node, list):
+            for n in node:
+                print_layer_tree(n, depth + 1)
+
+        elif node.type in [TracerEventType.Forward, TracerEventType.Operator]:
+            if len(offset) == 0:
+                offset.append(depth)
+
+            name = _nodename2opname(node.name)
+
+            if (
+                depth == offset[-1] and len(ret) > 0 and ret[0].startswith(name)
+            ):  # repeat begin
+                loop.append(1)
+
+            if len(loop) >= repeat:
+                return "".join(ret)
+
+            align = " " * (depth - offset[-1])
+            tm = _format_time(node.cpu_time)
+            flops_n = _format_large_number(node.flops)
+            flops_s = _format_large_number(node.flops * 1e9 / node.cpu_time)  # cpu_time is in ns
+            ret.append(
+                "{}{} latency: {}, FLOPs: {}, FLOPS: {}\n".format(
+                    align, name, tm, flops_n, flops_s
+                )
+            )
+
+    for n in node[1:]:
+        print_layer_tree(n)
+
+    return "".join(ret)
+
+
+def gen_layer_flops(nodetrees, repeat=1):
+    r'''
+    Generate per-layer/per-operator FLOPs and runtime information.
+    '''
+    layer_tree = _build_layer_from_tree(nodetrees)
+    return _gen_layer_flops(layer_tree, repeat)
+
+
 def wrap_tree(nodetrees):
     '''
     Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
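`_gen_layer_flops` walks the layer tree depth-first and emits one line per forward layer or operator, indenting by depth relative to the first layer seen; `repeat` stops the walk once the same top-level layer name comes around again, so one iteration stands in for all of them. A hedged sketch of the new entry point's output (`prof` is a finished `Profiler` as in the sketch above; `profiler_result` and `profiler_statistic` are internal, so this mirrors what `_print_flops` does rather than a public API, and the sample line is illustrative):

```python
# Assumed wiring, matching Profiler._print_flops() above:
#     gen_layer_flops(self.profiler_result.get_data(), repeat)
from paddle.profiler.profiler_statistic import gen_layer_flops

table = gen_layer_flops(prof.profiler_result.get_data(), repeat=1)
print(table)
# Each line follows "{indent}{op} latency: {t}, FLOPs: {n}, FLOPS: {n/s}", e.g.
#   matmul latency: 1.2 ms, FLOPs: 4.19 M, FLOPS: 3.49 G
# cpu_time is in nanoseconds, hence the 1e9 factor in the FLOPS (rate) column:
# 4.19e6 FLOPs * 1e9 / 1.2e6 ns ~= 3.49e9 FLOPS.
```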
@@ -387,30 +506,7 @@ class EventSummary:
     Analyse operator event in profiling data, correlate with its device event.
     """
 
-    class DeviceItem:
-        def __init__(self, name):
-            self.name = name
-            self.call = 0
-            self.gpu_time = 0
-            self.max_gpu_time = 0
-            self.min_gpu_time = float('inf')
-
-        @property
-        def avg_gpu_time(self):
-            return self.gpu_time / self.call
-
-        def add_gpu_time(self, time):
-            if time > self.max_gpu_time:
-                self.max_gpu_time = time
-            if time < self.min_gpu_time:
-                self.min_gpu_time = time
-            self.gpu_time += time
-
-        def add_item(self, node):
-            self.call += 1
-            self.add_gpu_time(node.end_ns - node.start_ns)
-
-    class OperatorItem:
+    class ItemBase:
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -470,6 +566,15 @@ class EventSummary:
         def add_flops(self, flops):
             self._flops += flops
 
+        def add_item(self, node):
+            raise NotImplementedError
+
+    class DeviceItem(ItemBase):
+        def add_item(self, node):
+            self.call += 1
+            self.add_gpu_time(node.end_ns - node.start_ns)
+
+    class OperatorItem(ItemBase):
         def add_item(self, node):
             self.add_call()
             self.add_cpu_time(node.cpu_time)
@@ -491,56 +596,22 @@ class EventSummary:
                     self.devices[name] = EventSummary.DeviceItem(name)
                 self.devices[name].add_item(devicenode)
 
-    class GeneralItem:
-        def __init__(self, name):
-            self.name = name
-            self.call = 0
-            self.cpu_time = 0
-            self.max_cpu_time = 0
-            self.min_cpu_time = float('inf')
-            self.gpu_time = 0
-            self.max_gpu_time = 0
-            self.min_gpu_time = float('inf')
-            self.general_gpu_time = 0
-            self.min_general_gpu_time = float('inf')
-            self.max_general_gpu_time = 0
-
-        @property
-        def avg_cpu_time(self):
-            return self.cpu_time / self.call
-
-        @property
-        def avg_gpu_time(self):
-            return self.gpu_time / self.call
-
-        @property
-        def avg_general_gpu_time(self):
-            return self.general_gpu_time / self.call
-
-        def add_cpu_time(self, time):
-            if time > self.max_cpu_time:
-                self.max_cpu_time = time
-            if time < self.min_cpu_time:
-                self.min_cpu_time = time
-            self.cpu_time += time
-
-        def add_gpu_time(self, time):
-            if time > self.max_gpu_time:
-                self.max_gpu_time = time
-            if time < self.min_gpu_time:
-                self.min_gpu_time = time
-            self.gpu_time += time
-
-        def add_general_gpu_time(self, time):
-            if time > self.max_general_gpu_time:
-                self.max_general_gpu_time = time
-            if time < self.min_general_gpu_time:
-                self.min_general_gpu_time = time
-            self.general_gpu_time += time
-
-        def add_call(self):
-            self.call += 1
-
+    class ForwardItem(ItemBase):
+        def add_item(self, node):
+            self.add_call()
+            self.add_cpu_time(node.cpu_time)
+            self.add_gpu_time(node.gpu_time)
+            self.add_general_gpu_time(node.general_gpu_time)
+            self.add_flops(node.flops)
+            for child in node.children_node:
+                if child.type != TracerEventType.Operator:
+                    if child.name not in self.operator_inners:
+                        self.operator_inners[
+                            child.name
+                        ] = EventSummary.OperatorItem(child.name)
+                    self.operator_inners[child.name].add_item(child)
+
+    class GeneralItem(ItemBase):
         def add_item(self, node):
             self.add_call()
             self.add_cpu_time(node.cpu_time)
@@ -613,6 +684,9 @@ class EventSummary:
                     self.add_model_perspective_item(child)
                     deque.append(child)
 
+    def add_forward_item(self, operator_node):
+        pass
+
     def add_operator_item(self, operator_node):
         if operator_node.name not in self.items:
             self.items[operator_node.name] = EventSummary.OperatorItem(
diff --git a/python/paddle/utils/flops.py b/python/paddle/utils/flops.py
index 114ca6d9ab6c77bc1676b374e91c58b2b154986d..9c1317676934110029a46c27ca322c6bce1d0a1a 100644
--- a/python/paddle/utils/flops.py
+++ b/python/paddle/utils/flops.py
@@ -164,8 +164,9 @@ def _matmul_flops(input_shapes, attrs):
     shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1)) ... dim_n_1, dim_m]
     equation: flops = 2 * numel(output) * dim_n
     """
-    x_shape = input_shapes.get("X")[0]
-    y_shape = input_shapes.get("Y")[0]
+
+    x_shape = input_shapes.get("X", input_shapes.get("x", [[0]]))[0]
+    y_shape = input_shapes.get("Y", input_shapes.get("y", [[0]]))[0]
     if attrs.get('transpose_X') or attrs.get('transpose_x'):
         x_shape[-1], x_shape[-2] = x_shape[-2], x_shape[-1]
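The lookup change makes `_matmul_flops` tolerate both the static-graph input names (`"X"`/`"Y"`) and their lowercase dygraph counterparts, falling back to a `[[0]]` placeholder (zero FLOPs) instead of raising when neither key is present. A worked instance of the docstring's rule, assuming `flops()` is the dispatcher defined in `paddle.utils.flops`:

```python
# Illustrative check of the matmul rule: broadcasted batched matmul
# X: [2, 3, 4] x Y: [4, 5] -> output [2, 3, 5]
# flops = 2 * numel(output) * K = 2 * (2 * 3 * 5) * 4 = 240
from paddle.utils.flops import flops

input_shapes = {"x": [[2, 3, 4]], "y": [[4, 5]]}  # lowercase keys now resolve
print(flops("matmul", input_shapes, {}))  # expected: 240
```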