From 598d32d664ba251abb2d8624e4984e42bdd475cb Mon Sep 17 00:00:00 2001
From: WangXi
Date: Tue, 14 Sep 2021 14:26:02 +0800
Subject: [PATCH] fix GradientClipByGlobalNorm in hybrid parallel (#35691)

---
 python/paddle/fluid/clip.py                                | 4 +++-
 .../tests/unittests/test_fleet_sharding_meta_optimizer.py | 7 +++----
 python/paddle/fluid/tests/unittests/test_gradient_clip.py | 7 +++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index e9f5c181a6..5a9ea1a445 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -522,7 +522,9 @@ class ClipGradByGlobalNorm(ClipGradBase):
                 # fp64
                 global_norm_var_other_dtype = layers.sums(sum_square_list)
                 global_norm_var.append(global_norm_var_other_dtype)
-            global_norm_var = layers.sums(global_norm_var)
+
+            global_norm_var = layers.sums(global_norm_var) if len(
+                global_norm_var) > 1 else global_norm_var[0]
             global_norm_var = layers.sqrt(x=global_norm_var)
             max_global_norm = layers.fill_constant(
                 shape=[1],
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
index c462896eed..3b0df74d3e 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
@@ -266,10 +266,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
             'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
             'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream',
             'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum',
-            'c_allreduce_sum', 'sum', 'c_allreduce_sum', 'sqrt',
-            'fill_constant', 'elementwise_max', 'elementwise_div',
-            'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum',
-            'momentum', 'momentum'
+            'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max',
+            'elementwise_div', 'elementwise_mul', 'elementwise_mul',
+            'elementwise_mul', 'momentum', 'momentum', 'momentum'
         ])
 
     def test_sharding_clone_for_test(self):
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index 4360214e7d..e2050cf32d 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -216,7 +216,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     def test_none_grad_fp32(self):
         ops = self._test_none_grad_helper("float32")
         self.assertListEqual(ops, [
-            'squared_l2_norm', 'squared_l2_norm', 'sum', 'sum', 'sqrt',
+            'squared_l2_norm', 'squared_l2_norm', 'sum', 'sqrt',
             'fill_constant', 'elementwise_max', 'elementwise_div',
             'elementwise_mul', 'elementwise_mul'
         ])
@@ -225,9 +225,8 @@ def test_none_grad_fp16(self):
         ops = self._test_none_grad_helper("float16")
         self.assertListEqual(ops, [
             'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', 'cast',
-            'sum', 'sqrt', 'fill_constant', 'elementwise_max',
-            'elementwise_div', 'cast', 'elementwise_mul', 'cast',
-            'elementwise_mul'
+            'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div',
+            'cast', 'elementwise_mul', 'cast', 'elementwise_mul'
         ])
 
     def _test_none_grad_helper(self, dtype):
--
GitLab
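
Note on the core change: `layers.sums` emits a `sum` operator into the static
program even when its input list holds a single tensor, which is the common
fp32-only case. The updated tests show the effect: one redundant 'sum' (and,
in the sharding case, a second 'c_allreduce_sum') disappears from the expected
op sequences. The sketch below is illustration only, not part of the patch: it
mimics the patched control flow in ClipGradByGlobalNorm's static-graph path,
and its `sums` helper is a plain-Python stand-in for paddle.fluid.layers.sums
rather than the real API; the numbers are made up for the example.

def sums(tensors):
    # Stand-in for layers.sums: elementwise sum of a list of values.
    # In Paddle's static graph this call is what adds a `sum` op.
    total = tensors[0]
    for t in tensors[1:]:
        total = total + t
    return total

def combine_global_norm(per_dtype_sums):
    """Combine per-dtype sums of squared gradient norms.

    Before the patch, sums() ran unconditionally, adding a redundant
    `sum` op when only one dtype group exists. The patch skips the
    reduction for single-entry lists and uses the entry directly.
    """
    if len(per_dtype_sums) > 1:
        return sums(per_dtype_sums)
    return per_dtype_sums[0]

print(combine_global_norm([9.0]))       # fp32 only: 9.0, no extra op
print(combine_global_norm([9.0, 7.0]))  # fp16 + fp32: 16.0 via a real sum

Keeping the op sequence minimal matters here because hybrid parallelism
(sharding plus data parallel) expects the same clip-related ops on every
rank; the unconditional `sum` produced the mismatched sequences that the
two test files above correct.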