From 5ca50f84aae9ffae9efedec4184f1971485a4c51 Mon Sep 17 00:00:00 2001
From: gaotingquan
Date: Thu, 14 Apr 2022 07:36:39 +0000
Subject: [PATCH] fix: convert bn to sync_bn

The running_mean and running_var of bn are not synchronized in distributed
training, which leads to a bug: the eval loss computed during training is
inconsistent with the loss from running eval alone.
---
 ppcls/engine/engine.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index 019cf165..1dcf6535 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -243,6 +243,11 @@ class Engine(object):
                 level=amp_level,
                 save_dtype='float32')
 
+        # TODO(gaotingquan): convert_sync_batchnorm is not effective
+        # eval loss in training is inconsistent with the eval only if bn is used,
+        # because the running_mean and running_var of bn are not synced in dist.
+        self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model)
+
         # for distributed
         world_size = dist.get_world_size()
         self.config["Global"]["distributed"] = world_size != 1
-- 
GitLab
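
Editor's note: for context, below is a minimal standalone sketch (not part of the patch) of how
paddle.nn.SyncBatchNorm.convert_sync_batchnorm is typically applied in a distributed setup, so that
running_mean/running_var of every BatchNorm layer are aggregated across ranks. The ConvBN model and
the main() wiring are illustrative assumptions, not code from PaddleClas.

    # Illustrative sketch; the model definition below is a made-up example.
    import paddle
    import paddle.nn as nn
    import paddle.distributed as dist

    class ConvBN(nn.Layer):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2D(3, 16, kernel_size=3, padding=1)
            # Plain BatchNorm keeps per-rank running statistics by default.
            self.bn = nn.BatchNorm2D(16)

        def forward(self, x):
            return self.bn(self.conv(x))

    def main():
        # One process per GPU; initialize the distributed environment first.
        dist.init_parallel_env()

        model = ConvBN()
        # Replace every BatchNorm layer with SyncBatchNorm so running_mean and
        # running_var are synchronized across ranks, keeping eval-during-training
        # consistent with a standalone eval run.
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = paddle.DataParallel(model)

        # ... regular training loop follows ...

    if __name__ == "__main__":
        main()

The conversion must happen before the model is wrapped for data parallelism; this mirrors where the
patch inserts the call in Engine.__init__, ahead of the distributed setup.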