From 13d5e5905185dc2d42d890782823a0c91e691d23 Mon Sep 17 00:00:00 2001
From: gaotingquan
Date: Thu, 14 Apr 2022 07:36:39 +0000
Subject: [PATCH] fix: convert bn to sync_bn

The running_mean and running_var of BN are not synchronized in distributed
training, which leads to a bug: the eval loss computed during training is
inconsistent with the loss from an eval-only run.

---
 ppcls/engine/engine.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index 7ab29d8d..bc3a2a16 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -242,6 +242,11 @@ class Engine(object):
                 level=amp_level,
                 save_dtype='float32')
 
+        # TODO(gaotingquan): convert_sync_batchnorm is not effective
+        # eval loss in training is inconsistent with the eval only if bn is used,
+        # because the running_mean and running_var of bn are not synced in dist.
+        self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model)
+
         # for distributed
         world_size = dist.get_world_size()
         self.config["Global"]["distributed"] = world_size != 1
-- 
GitLab
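
Note (not part of the patch): a minimal, self-contained sketch of how the SyncBatchNorm
conversion described above is typically wired up in PaddlePaddle distributed training.
ToyNet and the surrounding setup are illustrative placeholders, not code from PaddleClas;
only nn.SyncBatchNorm.convert_sync_batchnorm mirrors the call added by this patch.

import paddle
import paddle.nn as nn
import paddle.distributed as dist

class ToyNet(nn.Layer):
    """Placeholder model with a BatchNorm layer that keeps per-GPU running stats."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2D(3, 8, 3)
        self.bn = nn.BatchNorm2D(8)

    def forward(self, x):
        return self.bn(self.conv(x))

dist.init_parallel_env()
model = ToyNet()
# Replace every BatchNorm layer with SyncBatchNorm so running_mean/running_var
# are aggregated across ranks; otherwise each rank keeps its own statistics and
# the eval loss computed during distributed training drifts from a single-process eval.
model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
model = paddle.DataParallel(model)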