未验证 提交 26e029ae 编写于 作者: C chenjian 提交者: GitHub

Fix a bug when device info does not exist in the JSON format (#1166) (#1176)

上级 4aec1570
......@@ -265,10 +265,16 @@ class ProfilerResult:
def parse_json(self, json_data):
self.schema_version = json_data['schemaVersion']
self.span_idx = json_data['span_indx']
self.device_infos = {
device_info['id']: device_info
for device_info in json_data['deviceProperties']
}
try:
self.device_infos = {
device_info['id']: device_info
for device_info in json_data['deviceProperties']
}
except Exception:
print(
"paddlepaddle-gpu version is needed to get GPU device informations."
)
self.device_infos = {}
hostnodes = []
runtimenodes = []
devicenodes = []
......
......@@ -1767,6 +1767,8 @@ class DistributedProfilerData:
data = []
for profile_data in self.profile_datas:
device_infos = profile_data.device_infos
if not device_infos:
return data
gpu_id = int(next(iter(profile_data.gpu_ids)))
data.append({
'worker_name':
......
......@@ -14,6 +14,7 @@
# =======================================================================
import os
import re
from threading import Lock
from threading import Thread
import packaging.version
......@@ -28,6 +29,7 @@ from .run_manager import RunManager
from visualdl.io import bfile
_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")
_lock = Lock()
def is_VDLProfiler_file(path):
......@@ -118,8 +120,10 @@ class ProfilerReader(object):
self.run_managers[run] = RunManager(run)
self.run_managers[run].set_all_filenames(filenames)
for filename in filenames:
if self.run_managers[run].has_handled(filename):
continue
with _lock: # we add this to prevent parallel requests for handling a file multiple times
if self.run_managers[run].has_handled(filename):
continue
self.run_managers[run].handled_filenames.add(filename)
self._read_data(run, filename)
return list(self.walks.keys())
......
......@@ -194,6 +194,8 @@ class ProfilerApi(object):
run_manager = self._reader.get_run_manager(run)
distributed_profiler_data = run_manager.get_distributed_profiler_data(
span)
if distributed_profiler_data is None:
return
return distributed_profiler_data.get_distributed_steps()
@result()
......@@ -201,6 +203,8 @@ class ProfilerApi(object):
run_manager = self._reader.get_run_manager(run)
distributed_profiler_data = run_manager.get_distributed_profiler_data(
span)
if distributed_profiler_data is None:
return
return distributed_profiler_data.get_distributed_histogram(
step, time_unit)
......
......@@ -104,11 +104,8 @@ class RunManager:
return
def join(self):
if self.has_join:
return
for thread in self.threads.values():
thread.join()
self.has_join = True
distributed_profiler_data = defaultdict(list)
for worker_name, span_data in self.profiler_data.items():
for span_idx, profiler_data in span_data.items():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册