未验证 提交 3bc3e2e6 编写于 作者: C chenjian 提交者: GitHub

Add protobuf file support in profiling

上级 8d3e2818
......@@ -8,3 +8,4 @@ requests
six >= 1.14.0
matplotlib
pandas
packaging
\ No newline at end of file
......@@ -17,6 +17,9 @@ import functools
import json
import re
import sys
import tempfile
from .utils import traverse_tree
_show_name_pattern = re.compile(r'(.+)(\[.+\])')
_show_tid_pattern = re.compile(r'\w+(\(.+\))')
......@@ -78,6 +81,25 @@ class HostNode:
self.mem_node = []
return self
@classmethod
def from_protobuf(cls, obj):
self = cls()
self.name = obj.name
self.type = str(obj.type).split('.')[1]
self.start_ns = obj.start_ns
self.end_ns = obj.end_ns
self.process_id = obj.process_id
self.thread_id = obj.thread_id
self.correlation_id = obj.correlation_id
self.input_shapes = obj.input_shapes
self.dtypes = obj.dtypes
self.callstack = obj.callstack
self.children_node = []
self.runtime_node = []
self.device_node = []
self.mem_node = []
return self
class MemNode:
def __init__(self):
......@@ -118,6 +140,22 @@ class MemNode:
'peak_reserved'] if 'peak_reserved' in json_obj['args'] else 0
return self
@classmethod
def from_protobuf(cls, obj):
self = cls()
self.type = str(obj.type).split('.')[1]
self.timestamp_ns = obj.timestamp_ns
self.addr = hex(int(obj.addr))
self.process_id = obj.process_id
self.thread_id = obj.thread_id
self.increase_bytes = obj.increase_bytes
self.place = obj.place
self.current_allocated = obj.current_allocated
self.current_reserved = obj.current_reserved
self.peak_allocated = obj.peak_allocated
self.peak_reserved = obj.peak_reserved
return self
class DeviceNode:
def __init__(self):
......@@ -176,9 +214,35 @@ class DeviceNode:
"warps per SM"] if "warps per SM" in json_obj['args'] else 0
return self
@classmethod
def from_protobuf(cls, obj):
self = cls()
self.name = obj.name
self.type = str(obj.type).split('.')[1]
self.start_ns = obj.start_ns
self.end_ns = obj.end_ns
self.device_id = obj.device_id
self.stream_id = obj.stream_id
self.context_id = obj.context_id
self.correlation_id = obj.correlation_id
self.block_x, self.block_y, self.block_z = [
obj.block_x, obj.block_y, obj.block_z
]
self.grid_x, self.grid_y, self.grid_z = [
obj.grid_x, obj.grid_y, obj.grid_z
]
self.shared_memory = obj.shared_memory
self.registers_per_thread = obj.registers_per_thread
self.num_bytes = obj.num_bytes
self.value = obj.value
self.occupancy = obj.occupancy * 100
self.blocks_per_sm = obj.blocks_per_sm
self.warps_per_sm = obj.warps_per_sm
return self
class ProfilerResult:
def __init__(self, json_data):
def __init__(self, data):
self.device_infos = None
self.span_idx = None
self.data = None
......@@ -187,11 +251,18 @@ class ProfilerResult:
self.has_hostnodes = True
self.has_devicenodes = True
self.has_memnodes = True
self.parse(json_data)
self.content = json_data
self.start_in_timeline_ns = None
if isinstance(data, dict):
self.parse_json(data)
self.content = data
else:
self.parse_protobuf(data)
with tempfile.NamedTemporaryFile("r") as fp:
data.save(fp.name, "json")
fp.seek(0)
self.content = json.loads(fp.read())
def parse(self, json_data):
def parse_json(self, json_data):
self.schema_version = json_data['schemaVersion']
self.span_idx = json_data['span_indx']
self.device_infos = {
......@@ -232,6 +303,82 @@ class ProfilerResult:
memnodes)
self.extra_info = json_data['ExtraInfo']
def parse_protobuf(self, protobuf_data): # noqa: C901
self.schema_version = protobuf_data.get_version()
self.span_idx = str(protobuf_data.get_span_indx())
try:
self.device_infos = {
device_id: {
'name': device_property.name,
'totalGlobalMem': device_property.total_memory,
'computeMajor': device_property.major,
'computeMinor': device_property.minor
}
for device_id, device_property in
protobuf_data.get_device_property().items()
}
except Exception:
print(
"paddlepaddle-gpu version is needed to get GPU device informations."
)
self.device_infos = {}
self.extra_info = protobuf_data.get_extra_info()
self.start_in_timeline_ns = float('inf')
self.has_hostnodes = False
self.has_devicenodes = False
self.has_memnodes = False
node_trees = protobuf_data.get_data()
new_node_trees = {}
for threadid, root in node_trees.items():
stack = []
new_stack = []
new_root = HostNode.from_protobuf(root)
new_node_trees[threadid] = new_root
stack.append(root)
new_stack.append(new_root)
while stack:
current_node = stack.pop()
new_current_node = new_stack.pop()
for child_node in current_node.children_node:
if self.has_hostnodes is False:
self.has_hostnodes = True
new_child_node = HostNode.from_protobuf(child_node)
new_current_node.children_node.append(new_child_node)
stack.append(child_node)
new_stack.append(new_child_node)
for runtime_node in current_node.runtime_node:
new_runtime_node = HostNode.from_protobuf(runtime_node)
new_current_node.runtime_node.append(new_runtime_node)
for device_node in runtime_node.device_node:
new_device_node = DeviceNode.from_protobuf(device_node)
new_runtime_node.device_node.append(new_device_node)
for mem_node in current_node.mem_node:
new_mem_node = MemNode.from_protobuf(mem_node)
new_current_node.mem_node.append(new_mem_node)
new_node_tree_list = traverse_tree(new_node_trees)
for threadid, node_tree_list in new_node_tree_list.items():
for node in node_tree_list[1:]: # skip root
if node.start_ns < self.start_in_timeline_ns:
self.start_in_timeline_ns = node.start_ns
for threadid, node_tree_list in new_node_tree_list.items():
for node in node_tree_list:
if node != node_tree_list[0]: # skip root
node.start_ns -= self.start_in_timeline_ns
node.end_ns -= self.start_in_timeline_ns
for runtimenode in node.runtime_node:
runtimenode.end_ns -= self.start_in_timeline_ns
runtimenode.start_ns -= self.start_in_timeline_ns
for device_node in runtimenode.device_node:
if self.has_devicenodes is False:
self.has_devicenodes = True
device_node.start_ns -= self.start_in_timeline_ns
device_node.end_ns -= self.start_in_timeline_ns
for mem_node in node.mem_node:
if self.has_memnodes is False:
self.has_memnodes = True
mem_node.timestamp_ns -= self.start_in_timeline_ns
self.data = new_node_trees
def build_tree( # noqa: C901
self, hostnodes, runtimenodes, devicenodes, memnodes):
thread2host_event_nodes = collections.defaultdict(list)
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import sys
StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']
......@@ -466,7 +468,6 @@ def format_time(time, unit='ms', inf_subs='-'):
result /= 1e6
elif unit == 'us':
result /= 1e3
# return '{:.2f}'.format(result)
return round(result, 2)
......@@ -474,7 +475,6 @@ def format_ratio(ratio):
r"""
Transform ratio within [0, 1] to percentage presentation.
"""
# return '{:.2f}'.format(ratio * 100)
return round(ratio * 100, 2)
......@@ -490,5 +490,27 @@ def format_memory(memory, memory_unit='KB'):
result /= (1024 * 1024)
elif memory_unit == 'KB':
result /= 1024
# return '{:.2f}'.format(result)
return round(result, 2)
class RedirectStdStreams(object):
def __init__(self, stdout=None, stderr=None):
self._stdout = stdout or sys.stdout
self._stderr = stderr or sys.stderr
def __enter__(self):
'''
Replace stdout and stderr to specified stream.
'''
sys.stdout.flush()
sys.stderr.flush()
self.old_stdout_fileno, self.old_stderr_fileno = os.dup(
sys.stdout.fileno()), os.dup(sys.stderr.fileno())
os.dup2(self._stdout.fileno(), sys.stdout.fileno())
os.dup2(self._stderr.fileno(), sys.stderr.fileno())
def __exit__(self, exc_type, exc_value, traceback):
os.dup2(self.old_stdout_fileno, sys.stdout.fileno())
os.dup2(self.old_stderr_fileno, sys.stderr.fileno())
os.close(self.old_stdout_fileno)
os.close(self.old_stderr_fileno)
......@@ -147,12 +147,14 @@ class ProfilerData:
else:
device_type = 'GPU'
gpu_id = int(next(iter(self.gpu_ids)))
if gpu_id in self.device_infos:
return {
"device_type": device_type,
"CPU": {
"process_utilization":
format_ratio(
float(self.extra_infos["Process Cpu Utilization"])),
float(
self.extra_infos["Process Cpu Utilization"])),
"system_utilization":
format_ratio(
float(self.extra_infos["System Cpu Utilization"]))
......@@ -166,14 +168,46 @@ class ProfilerData:
self.device_infos[gpu_id]['totalGlobalMem'],
'GB')),
"compute_capability":
'{}.{}'.format(self.device_infos[gpu_id]['computeMajor'],
'{}.{}'.format(
self.device_infos[gpu_id]['computeMajor'],
self.device_infos[gpu_id]['computeMinor']),
"utilization":
format_ratio(self.gpu_ulitization),
"sm_efficiency":
format_ratio(
self.sm_efficiency /
self.model_perspective_items['ProfileStep'].cpu_time),
self.sm_efficiency / self.
model_perspective_items['ProfileStep'].cpu_time),
"achieved_occupancy":
format_ratio(self.occupancy),
"tensor_core_percentage":
format_ratio(self.tensorcore_ratio)
}
}
else:
return {
"device_type": device_type,
"CPU": {
"process_utilization":
format_ratio(
float(
self.extra_infos["Process Cpu Utilization"])),
"system_utilization":
format_ratio(
float(self.extra_infos["System Cpu Utilization"]))
},
"GPU": {
"name":
"-",
"memory":
"-",
"compute_capability":
'-',
"utilization":
format_ratio(self.gpu_ulitization),
"sm_efficiency":
format_ratio(
self.sm_efficiency / self.
model_perspective_items['ProfileStep'].cpu_time),
"achieved_occupancy":
format_ratio(self.occupancy),
"tensor_core_percentage":
......
......@@ -16,11 +16,14 @@ import os
import re
from threading import Thread
import packaging.version
from multiprocess import Process
from multiprocess import Queue
from .parser.const_description import * # noqa: F403
from .parser.event_node import load_profiler_json
from .parser.event_node import ProfilerResult
from .parser.utils import RedirectStdStreams
from .run_manager import RunManager
from visualdl.io import bfile
......@@ -175,17 +178,44 @@ class ProfilerReader(object):
if match:
worker_name = match.group(1)
if '.pb' in filename:
def _load_profiler_protobuf(run, filename, worker_name):
devnull = open(os.devnull, 'w+')
with RedirectStdStreams(stdout=devnull, stderr=devnull):
try:
from paddle.profiler import load_profiler_result
except Exception:
print(
'Load paddle.profiler error. Please check paddle >= 2.3.0'
import paddle
except Exception as e:
print('Paddle is required to read protobuf file.\
Please install [paddlepaddle](https://www.paddlepaddle.org.cn/install/quick?\
docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html) first.'
)
raise RuntimeError(str(e))
if packaging.version.parse(
'2.4.0') > packaging.version.parse(
paddle.__version__):
raise RuntimeError(
"Please make sure paddlepaddle version >= 2.4.0"
)
exit(0)
profile_result = load_profiler_result(
from paddle.profiler import load_profiler_result
try:
content = load_profiler_result(
os.path.join(run, filename))
self.run_managers[run].add_profile_result(
filename, worker_name, profile_result)
if content is None:
raise RuntimeError("Missing required fields.")
profile_result = ProfilerResult(content)
except Exception as e:
raise RuntimeError(
"An error occurred while loading the protobuf file, which may be caused\
by the outdated version of paddle that generated the profile file. \
Please make sure protobuf file is exported by paddlepaddle version >= 2.4.0. \
Error message: {}".format(e))
self.profile_result_queue.put(
(run, filename, worker_name, profile_result))
Process(
target=_load_profiler_protobuf,
args=(run, filename, worker_name)).start()
else:
def _load_profiler_json(run, filename, worker_name):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册