未验证 提交 771811dc 编写于 作者: C Chitsing KUI 提交者: GitHub

[FLOPS] add flops for layer (#48447)

* add flops for layer

* hide api

* add unit test
上级 758fccfe
......@@ -87,6 +87,7 @@ class TestProfiler(unittest.TestCase):
targets=[profiler.ProfilerTarget.CPU],
scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN,
on_trace_ready=my_trace_back,
with_flops=True,
) as prof:
for i in range(2):
y = x / 2.0
......
......@@ -31,6 +31,14 @@ from paddle.fluid.core import (
enable_memory_recorder,
enable_op_info_recorder,
)
from .utils import RecordEvent, wrap_optimizers
from .profiler_statistic import (
StatisticData,
_build_table,
SortedKeys,
gen_layer_flops,
)
from paddle.profiler import utils
from .profiler_statistic import SortedKeys, StatisticData, _build_table
......@@ -883,6 +891,18 @@ class Profiler:
)
)
if self.with_flops:
self._print_flops()
def _print_flops(self, repeat=1):
    """Print the per-layer FLOPs report for the collected profile.

    Does nothing (beyond an error message) unless the profiler was
    created with ``with_flops=True``.
    """
    if not self.with_flops:
        print('ERROR: with_flops disabled.')
        return

    banner_width = 100
    print(" Flops Profiler Begin ".center(banner_width, "-"))
    report = gen_layer_flops(self.profiler_result.get_data(), repeat)
    print(report)
    print("- Flops Profiler End -".center(banner_width, "-"))
def get_profiler(config_path):
try:
......
......@@ -78,6 +78,16 @@ class SortedKeys(Enum):
GPUMin = 7
def _nodename2opname(name):
r'''
convert static host node name to operator name
'''
op_name = name.replace(' compute', '')
op_name = op_name.replace(' dygraph', '')
op_name = op_name.replace(' pybind_imperative_func', '')
return op_name
class HostStatisticNode:
r'''
Wrap original node for calculating statistic metrics.
......@@ -98,10 +108,7 @@ class HostStatisticNode:
def cal_flops(self):
if self.hostnode.type == TracerEventType.Operator:
if hasattr(self.hostnode, 'input_shapes'):
op_name = self.hostnode.name
op_name = op_name.replace(' compute', '')
op_name = op_name.replace(' dygraph', '')
op_name = op_name.replace(' pybind_imperative_func', '')
op_name = _nodename2opname(self.hostnode.name)
self.flops = flops(
op_name,
self.hostnode.input_shapes,
......@@ -111,6 +118,7 @@ class HostStatisticNode:
def cal_statistic(self):
self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
self.self_cpu_time = self.cpu_time
self.cal_flops()
for child in self.children_node:
child.cal_flops()
child.cal_statistic()
......@@ -177,6 +185,117 @@ def get_device_nodes(hostnode):
return device_nodes
def _build_layer_from_tree(nodetrees):
    """Convert profiler host node trees into nested layer lists of
    HostStatisticNode, accumulating FLOPs bottom-up.

    Backward/optimizer subtrees and GradNode entries are pruned; each
    Operator node becomes a leaf, each Forward node becomes a
    ``[stat_node, children]`` pair carrying the summed child FLOPs.
    """

    def _visit(node, depth=0):
        # Prune autograd and optimizer work — only forward compute counts.
        if "GradNode" in node.name:
            return [], 0
        if node.type in [
            TracerEventType.Backward,
            TracerEventType.Optimization,
        ]:
            return [], 0

        # Operator leaf: wrap it and report its own FLOPs.
        if node.type == TracerEventType.Operator:
            wrapped = HostStatisticNode(node)
            wrapped.cal_statistic()
            return wrapped, wrapped.flops

        children = []
        total_flops = 0
        for child in node.children_node:
            sub, sub_flops = _visit(child, depth + 1)
            if sub:
                total_flops += sub_flops
                children.append(sub)

        # Forward node: wrap it and attach the aggregated child FLOPs.
        if node.type == TracerEventType.Forward:
            wrapped = HostStatisticNode(node)
            wrapped.cal_statistic()
            wrapped.flops = total_flops
            return [wrapped, children], total_flops

        return children, total_flops

    return [_visit(rootnode)[0] for rootnode in nodetrees.values()]
def _format_large_number(n, precision=2):
if n // 1e12 > 0:
return "{} T".format(round(n / 1e12, precision))
if n // 1e9 > 0:
return "{} G".format(round(n / 1e9, precision))
if n // 1e6 > 0:
return "{} M".format(round(n / 1e6, precision))
if n // 1e3 > 0:
return "{} K".format(round(n / 1e3, precision))
return "{}".format(round(n, precision))
def _format_time(n, precision=2):
if n // 1e9 > 0:
return "{} s".format(round(n / 1e9, precision))
if n // 1e6 > 0:
return "{} ms".format(round(n / 1e6, precision))
if n // 1e3 > 0:
return "{} us".format(round(n / 1e3, precision))
return "{} ns".format(round(n, precision))
def _gen_layer_flops(node, repeat=1):
# Render the nested layer tree produced by _build_layer_from_tree as an
# indented text report: one line per Forward/Operator node with latency,
# FLOPs (count) and FLOPS (count per second).
# `ret`, `offset` and `loop` are closed over by the recursive printer:
# ret collects output lines, offset[-1] records the depth of the first
# printed node (used as the indentation baseline), and loop counts how
# many times the first layer name has repeated at the baseline depth so
# printing can stop after `repeat` iterations of a repeated model step.
ret = []
offset = []
loop = []
def print_layer_tree(node, depth=0):
if isinstance(node, list):
for n in node:
print_layer_tree(n, depth + 1)
elif node.type in [TracerEventType.Forward, TracerEventType.Operator]:
if len(offset) == 0:
offset.append(depth)
name = _nodename2opname(node.name)
if (
depth == offset[-1] and len(ret) > 0 and ret[0].startswith(name)
):  # repeat begin
loop.append(1)
if len(loop) >= repeat:
return "".join(ret)
align = " " * (depth - offset[-1])
tm = _format_time(node.cpu_time)
flops_n = _format_large_number(node.flops)
# FLOPS = flops / seconds; cpu_time is in ns — assumes cpu_time > 0,
# a zero-duration node would raise ZeroDivisionError (TODO confirm).
flops_s = _format_large_number(node.flops * 1e9 / node.cpu_time)
ret.append(
"{}{} latency: {}, FLOPs: {}, FLOPS: {}\n".format(
align, name, tm, flops_n, flops_s
)
)
# NOTE(review): the first root tree is skipped here — presumably a
# non-compute root (e.g. dataloader/process node); verify against
# _build_layer_from_tree's output ordering.
for n in node[1:]:
print_layer_tree(n)
return "".join(ret)
def gen_layer_flops(nodetrees, repeat=1):
    r'''
    Generate a per-layer/operator FLOPs and runtime report from the
    profiler host node trees.
    '''
    return _gen_layer_flops(_build_layer_from_tree(nodetrees), repeat)
def wrap_tree(nodetrees):
'''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
......@@ -387,30 +506,7 @@ class EventSummary:
Analyse operator event in profiling data, correlate with its device event.
"""
class DeviceItem:
    """Accumulates GPU-time statistics (total/max/min/avg) for one
    device event name."""

    def __init__(self, name):
        self.name = name
        self.call = 0
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')

    @property
    def avg_gpu_time(self):
        # Assumes at least one call has been recorded.
        return self.gpu_time / self.call

    def add_gpu_time(self, time):
        # Track extrema and accumulate the total.
        self.max_gpu_time = max(self.max_gpu_time, time)
        self.min_gpu_time = min(self.min_gpu_time, time)
        self.gpu_time += time

    def add_item(self, node):
        self.call += 1
        self.add_gpu_time(node.end_ns - node.start_ns)
class OperatorItem:
class ItemBase:
def __init__(self, name):
self.name = name
self.call = 0
......@@ -470,6 +566,15 @@ class EventSummary:
# Accumulate a FLOPs count into this item's running total.
def add_flops(self, flops):
self._flops += flops
# Abstract hook: subclasses define how a profiler node contributes
# to this item's statistics.
def add_item(self, node):
raise NotImplementedError
class DeviceItem(ItemBase):
# Each device node contributes its wall-clock duration as GPU time.
def add_item(self, node):
self.call += 1
self.add_gpu_time(node.end_ns - node.start_ns)
class OperatorItem(ItemBase):
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
......@@ -491,56 +596,22 @@ class EventSummary:
self.devices[name] = EventSummary.DeviceItem(name)
self.devices[name].add_item(devicenode)
class GeneralItem:
    """Accumulates call-count and CPU/GPU timing statistics
    (total/max/min/avg) for one event name."""

    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.general_gpu_time = 0
        self.min_general_gpu_time = float('inf')
        self.max_general_gpu_time = 0

    # Averages assume at least one recorded call.
    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    @property
    def avg_general_gpu_time(self):
        return self.general_gpu_time / self.call

    def add_cpu_time(self, time):
        # Track extrema and accumulate the total.
        self.max_cpu_time = max(self.max_cpu_time, time)
        self.min_cpu_time = min(self.min_cpu_time, time)
        self.cpu_time += time

    def add_gpu_time(self, time):
        self.max_gpu_time = max(self.max_gpu_time, time)
        self.min_gpu_time = min(self.min_gpu_time, time)
        self.gpu_time += time

    def add_general_gpu_time(self, time):
        self.max_general_gpu_time = max(self.max_general_gpu_time, time)
        self.min_general_gpu_time = min(self.min_general_gpu_time, time)
        self.general_gpu_time += time

    def add_call(self):
        self.call += 1
class ForwardItem(ItemBase):
# Aggregates one Forward event node: accumulates its call count,
# CPU/GPU/general-GPU time and FLOPs, then records its non-Operator
# child phases as inner OperatorItems.
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
self.add_gpu_time(node.gpu_time)
self.add_general_gpu_time(node.general_gpu_time)
self.add_flops(node.flops)
for child in node.children_node:
# NOTE(review): only children whose type is NOT Operator are
# collected into operator_inners — confirm the `!=` is intended
# (it mirrors the inner-phase collection pattern of OperatorItem).
if child.type != TracerEventType.Operator:
if child.name not in self.operator_inners:
self.operator_inners[
child.name
] = EventSummary.OperatorItem(child.name)
self.operator_inners[child.name].add_item(child)
class GeneralItem(ItemBase):
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
......@@ -613,6 +684,9 @@ class EventSummary:
self.add_model_perspective_item(child)
deque.append(child)
# Intentionally a no-op stub: forward events are not aggregated here —
# presumably handled elsewhere in the summary pipeline (TODO confirm).
def add_forward_item(self, operator_node):
pass
def add_operator_item(self, operator_node):
if operator_node.name not in self.items:
self.items[operator_node.name] = EventSummary.OperatorItem(
......
......@@ -164,8 +164,9 @@ def _matmul_flops(input_shapes, attrs):
shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1)) ... dim_n_1, dim_m]
equation: flops = 2 * numel(output) * dim_n
"""
x_shape = input_shapes.get("X")[0]
y_shape = input_shapes.get("Y")[0]
x_shape = input_shapes.get("X", input_shapes.get("x", [[0]]))[0]
y_shape = input_shapes.get("Y", input_shapes.get("y", [[0]]))[0]
if attrs.get('transpose_X') or attrs.get('transpose_x'):
x_shape[-1], x_shape[-2] = x_shape[-2], x_shape[-1]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册