未验证 提交 4fd55855 编写于 作者: H Huihuang Zheng 提交者: GitHub

Fix PE Unit Test Failure, test=develop (#25693)

Based on the comment here https://github.com/PaddlePaddle/Paddle/blob/b5f8784cab94eae785659787fc529870c87b254c/paddle/fluid/framework/details/build_strategy.h#L49

The unit test which compares Reduce and AllReduce must have diff. The PR_CI_Night runs on P40 machine and it has an 8GB GPU, which is smaller than the 16GB normal CI machines. So we decreased the batch size in the past to make it runnable: https://github.com/PaddlePaddle/Paddle/pull/24651/files  . Decreasing the batch size makes the difference occur more often. So this PR replaces the absolute delta with a relative delta.

Before this PR, the unit test failure happened with a probability of about 1/100. After this PR it doesn't happen.
上级 cea50868
...@@ -44,7 +44,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): ...@@ -44,7 +44,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
for loss in zip(all_reduce_first_loss, reduce_first_loss): for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
for loss in zip(all_reduce_last_loss, reduce_last_loss): for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2)
if not use_cuda: if not use_cuda:
return return
...@@ -72,17 +72,17 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): ...@@ -72,17 +72,17 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq): for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2)
for loss in zip(reduce_first_loss, reduce_first_loss_seq): for loss in zip(reduce_first_loss, reduce_first_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
for loss in zip(reduce_last_loss, reduce_last_loss_seq): for loss in zip(reduce_last_loss, reduce_last_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2)
for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq): for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2)
class TestResnetWithReduceCPU(TestResnetWithReduceBase): class TestResnetWithReduceCPU(TestResnetWithReduceBase):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册