未验证 提交 0e178033 编写于 作者: C chengduo 提交者: GitHub

open compare_reduce_and_allreduce test (#15258)

test=develop
上级 fd854183
...@@ -74,7 +74,11 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -74,7 +74,11 @@ class TestMNIST(TestParallelExecutorBase):
label = np.ones(shape=[32, 1], dtype='int64') label = np.ones(shape=[32, 1], dtype='int64')
return img, label return img, label
def _compare_reduce_and_allreduce(self, model, use_cuda): def _compare_reduce_and_allreduce(self,
model,
use_cuda,
delta1=1e-6,
delta2=1e-4):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
...@@ -95,9 +99,9 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -95,9 +99,9 @@ class TestMNIST(TestParallelExecutorBase):
use_reduce=True) use_reduce=True)
for loss in zip(all_reduce_first_loss, reduce_first_loss): for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) self.assertAlmostEqual(loss[0], loss[1], delta=delta1)
for loss in zip(all_reduce_last_loss, reduce_last_loss): for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) self.assertAlmostEqual(loss[0], loss[1], delta=delta2)
# simple_fc # simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False): def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
...@@ -174,8 +178,9 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -174,8 +178,9 @@ class TestMNIST(TestParallelExecutorBase):
self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)
def test_batchnorm_fc_with_new_strategy(self): def test_batchnorm_fc_with_new_strategy(self):
# FIXME(zcd): close this test temporarily. # NOTE: the computation result of nccl_reduce is non-deterministic,
# self._compare_reduce_and_allreduce(fc_with_batchnorm, True) # related issue: https://github.com/NVIDIA/nccl/issues/157
self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-3)
self._compare_reduce_and_allreduce(fc_with_batchnorm, False) self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册