From 812c142bdedcadf7337cd195f6cb65a543839bd7 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Nov 2022 20:29:20 +0800 Subject: [PATCH] fix a bug when device info not exists in json format (#1166) --- visualdl/component/profiler/parser/event_node.py | 14 ++++++++++---- visualdl/component/profiler/profiler_data.py | 2 ++ visualdl/component/profiler/profiler_reader.py | 8 ++++++-- visualdl/component/profiler/profiler_server.py | 4 ++++ visualdl/component/profiler/run_manager.py | 3 --- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/visualdl/component/profiler/parser/event_node.py b/visualdl/component/profiler/parser/event_node.py index b7df5830..b3d0eebf 100644 --- a/visualdl/component/profiler/parser/event_node.py +++ b/visualdl/component/profiler/parser/event_node.py @@ -265,10 +265,16 @@ class ProfilerResult: def parse_json(self, json_data): self.schema_version = json_data['schemaVersion'] self.span_idx = json_data['span_indx'] - self.device_infos = { - device_info['id']: device_info - for device_info in json_data['deviceProperties'] - } + try: + self.device_infos = { + device_info['id']: device_info + for device_info in json_data['deviceProperties'] + } + except Exception: + print( + "paddlepaddle-gpu version is needed to get GPU device informations." + ) + self.device_infos = {} hostnodes = [] runtimenodes = [] devicenodes = [] diff --git a/visualdl/component/profiler/profiler_data.py b/visualdl/component/profiler/profiler_data.py index 100f8a24..e6ce7f75 100644 --- a/visualdl/component/profiler/profiler_data.py +++ b/visualdl/component/profiler/profiler_data.py @@ -1767,6 +1767,8 @@ class DistributedProfilerData: data = [] for profile_data in self.profile_datas: device_infos = profile_data.device_infos + if not device_infos: + return data gpu_id = int(next(iter(profile_data.gpu_ids))) data.append({ 'worker_name': diff --git a/visualdl/component/profiler/profiler_reader.py b/visualdl/component/profiler/profiler_reader.py index 15985802..85230c72 100644 --- a/visualdl/component/profiler/profiler_reader.py +++ b/visualdl/component/profiler/profiler_reader.py @@ -14,6 +14,7 @@ # ======================================================================= import os import re +from threading import Lock from threading import Thread import packaging.version @@ -28,6 +29,7 @@ from .run_manager import RunManager from visualdl.io import bfile _name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))") +_lock = Lock() def is_VDLProfiler_file(path): @@ -130,8 +132,10 @@ class ProfilerReader(object): self.run_managers[run] = RunManager(run) self.run_managers[run].set_all_filenames(filenames) for filename in filenames: - if self.run_managers[run].has_handled(filename): - continue + with _lock: # we add this to prevent parallel requests for handling a file multiple times + if self.run_managers[run].has_handled(filename): + continue + self.run_managers[run].handled_filenames.add(filename) self._read_data(run, filename) return list(self.walks.keys()) diff --git a/visualdl/component/profiler/profiler_server.py b/visualdl/component/profiler/profiler_server.py index 408c3349..ad11434e 100644 --- a/visualdl/component/profiler/profiler_server.py +++ b/visualdl/component/profiler/profiler_server.py @@ -202,6 +202,8 @@ class ProfilerApi(object): run_manager = self._reader.get_run_manager(run) distributed_profiler_data = run_manager.get_distributed_profiler_data( span) + if distributed_profiler_data is None: + return return distributed_profiler_data.get_distributed_steps() @result() @@ -209,6 +211,8 @@ class ProfilerApi(object): run_manager = self._reader.get_run_manager(run) distributed_profiler_data = run_manager.get_distributed_profiler_data( span) + if distributed_profiler_data is None: + return return distributed_profiler_data.get_distributed_histogram( step, time_unit) diff --git a/visualdl/component/profiler/run_manager.py b/visualdl/component/profiler/run_manager.py index 037ca02e..418626ab 100644 --- a/visualdl/component/profiler/run_manager.py +++ b/visualdl/component/profiler/run_manager.py @@ -104,11 +104,8 @@ class RunManager: return def join(self): - if self.has_join: - return for thread in self.threads.values(): thread.join() - self.has_join = True distributed_profiler_data = defaultdict(list) for worker_name, span_data in self.profiler_data.items(): for span_idx, profiler_data in span_data.items(): -- GitLab