From 4fd558556a0cb75e1de4af546aa8ebfcb74bcef7 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Fri, 24 Jul 2020 18:35:10 +0800 Subject: [PATCH] Fix PE Unit Test Failure, test=develop (#25693) Based on the comment here https://github.com/PaddlePaddle/Paddle/blob/b5f8784cab94eae785659787fc529870c87b254c/paddle/fluid/framework/details/build_strategy.h#L49 (build_strategy.h), the unit test that compares Reduce and AllReduce is bound to show some difference between the two. PR_CI_Night runs on a P40 machine with an 8GB GPU, which is smaller than the GPUs of the normal 16GB CI machines. So we decreased the batch size in the past to make it runnable: https://github.com/PaddlePaddle/Paddle/pull/24651/files . Decreasing the batch size makes the difference occur more often. So this PR replaces the absolute delta with a relative delta. Before this PR, the unit test failed with a probability of about 1/100. After this PR, the failure no longer occurs. --- .../test_parallel_executor_seresnext_with_reduce_cpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py index 62eb7e1155..57ff4890f6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py @@ -44,7 +44,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) if not use_cuda: return @@ -72,17 +72,17 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): self.assertAlmostEquals(loss[0], loss[1], 
delta=1e-5) for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) for loss in zip(reduce_first_loss, reduce_first_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(reduce_last_loss, reduce_last_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) class TestResnetWithReduceCPU(TestResnetWithReduceBase): -- GitLab