From bb20dcfc1ad8a93ceaf4dcd2b338da40baea790d Mon Sep 17 00:00:00 2001
From: liuyuhui
Date: Tue, 29 Dec 2020 10:46:42 +0800
Subject: [PATCH] [Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in
 parallel executor (#29961)

---
 .../unittests/parallel_executor_test_base.py  | 19 +++++++------------
 .../fluid/tests/unittests/seresnext_net.py    |  8 ++++----
 .../tests/unittests/seresnext_test_base.py    |  2 +-
 .../unittests/test_fuse_all_reduce_pass.py    |  8 ++++----
 .../test_fuse_elewise_add_act_pass.py         |  6 +++---
 .../unittests/test_fuse_optimizer_pass.py     | 10 +++++-----
 .../test_fuse_relu_depthwise_conv_pass.py     |  6 +++---
 .../tests/unittests/test_ir_inplace_pass.py   |  2 +-
 .../unittests/test_ir_memory_optimize_pass.py |  6 +++---
 .../test_ir_memory_optimize_transformer.py    |  4 ++--
 .../test_mix_precision_all_reduce_fuse.py     |  2 +-
 .../unittests/test_parallel_executor_mnist.py | 18 +++++++++---------
 .../unittests/test_parallel_executor_pg.py    |  4 ++--
 ...st_parallel_executor_seresnext_base_gpu.py |  2 +-
 ...utor_seresnext_with_fuse_all_reduce_gpu.py |  2 +-
 ...llel_executor_seresnext_with_reduce_cpu.py |  2 +-
 ...llel_executor_seresnext_with_reduce_gpu.py |  2 +-
 .../test_parallel_executor_transformer.py     |  4 ++--
 18 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 0d0e118e6e4..47f5c5085a0 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -28,19 +28,14 @@ import sys
 from feed_data_reader import FeedDataReader
 
 __all__ = ['TestParallelExecutorBase']
-
-
-class DeviceType:
-    CPU = 1
-    GPU = 2
-    XPU = 3
+DeviceType = core.DeviceType
 
 
 class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_network_convergence(cls,
                                   method,
-                                  use_device=DeviceType.GPU,
+                                  use_device=DeviceType.CUDA,
                                   iter=5,
                                   batch_size=None,
                                   feed_dict=None,
@@ -81,7 +76,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -102,7 +97,7 @@ class TestParallelExecutorBase(unittest.TestCase):
 
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
-            ) if use_device == DeviceType.GPU else fluid.core.get_xpu_device_count(
+            ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count(
             ) if use_device == DeviceType.XPU else int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
 
@@ -132,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_pass_conflict(cls,
                             method,
-                            use_device=DeviceType.GPU,
+                            use_device=DeviceType.CUDA,
                             feed_dict=None,
                             get_data_from_feeder=None,
                             use_reduce=False,
@@ -153,7 +148,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -191,7 +186,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.enable_inplace = enable_inplace
         build_strategy.enable_sequential_execution = enable_sequential_execution
 
-        if use_device == DeviceType.GPU and core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
         if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
             build_strategy.fuse_elewise_add_act_ops = False
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index d20cf70b14a..2e4b1828c5b 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -171,20 +171,20 @@ model = SE_ResNeXt50Small
 
 
 def batch_size(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
         return 8
     return 12
 
 
 def iter(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return 10
     return 1
 
 
 gpu_img, gpu_label = init_data(
-    batch_size=batch_size(use_device=DeviceType.GPU),
+    batch_size=batch_size(use_device=DeviceType.CUDA),
     img_shape=img_shape,
     label_range=999)
 cpu_img, cpu_label = init_data(
@@ -196,6 +196,6 @@ feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
 
 
 def feed_dict(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return feed_dict_gpu
     return feed_dict_cpu
diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
index a39ca59b656..cc40b89b585 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
@@ -26,7 +26,7 @@ class TestResnetBase(TestParallelExecutorBase):
                                           use_device,
                                           delta2=1e-5,
                                           compare_seperately=True):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         func_1_first_loss, func_1_last_loss = self.check_network_convergence(
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index aa520beb201..881b9d90579 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -35,7 +35,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                    get_data_from_feeder=None,
                                    optimizer=None,
                                    fuse_all_optimizer_ops=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         feed_dict_data = None
@@ -82,12 +82,12 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
             fuse_all_optimizer_ops=True)
 
     def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
-                                                DeviceType.GPU)
+                                                DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
 
@@ -126,7 +126,7 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
 
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index e5e8eee6f84..a1c20be9a92 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -26,7 +26,7 @@ class TestMNIST(TestParallelExecutorBase):
         os.environ['CPU_NUM'] = str(4)
 
     def _compare_fuse_elewise_add_act_ops(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = init_data()
@@ -66,12 +66,12 @@ class TestMNIST(TestParallelExecutorBase):
         self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
-                                               DeviceType.GPU)
+                                               DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
                                                DeviceType.CPU)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index 75aa07c4b9b..51c06bb79d7 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -38,7 +38,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
                                      feed_dict=None,
                                      get_data_from_feeder=None,
                                      optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
@@ -76,7 +76,7 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
 
@@ -121,7 +121,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
 
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.GPU, optimizer=self.optimizer)
+            model, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             model, DeviceType.CPU, optimizer=self.optimizer)
@@ -144,7 +144,7 @@ class TestPassConflictBase(TestFuseAdamOps):
                                      feed_dict=None,
                                      get_data_from_feeder=None,
                                      optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         self.check_pass_conflict(
@@ -165,7 +165,7 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
 
 
 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
index 0e54ebc7f45..9b739ebdfb2 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare(self, model, use_device, random_data=True, only_forward=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._init_data(random_data)
@@ -108,11 +108,11 @@ class TestMNIST(TestParallelExecutorBase):
         self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU)
+        self._compare(simple_depthwise_net, DeviceType.CUDA)
         self._compare(simple_depthwise_net, DeviceType.CPU)
 
     def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU, only_forward=True)
+        self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True)
         self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
index f8b2ec21bc5..e2094c76b7d 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_device=DeviceType.GPU,
+            use_device=DeviceType.CUDA,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
index 61ceefdad11..f4ec63a8b91 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
@@ -61,7 +61,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare_ir_memory_optimize(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._dummy_data()
@@ -84,11 +84,11 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc_net(self):
         self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA)
 
     def test_fc_with_reshape_net(self):
         self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
index 40c4fa74953..aa495c7533c 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
             # check python transpiler
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=True,
                 iter=2)
diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
index 7df3583f0d2..33393bc2fcd 100644
--- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
@@ -84,7 +84,7 @@ class TestResnet(TestParallelExecutorBase):
 
     def test_model(self):
         if core.is_compiled_with_cuda():
-            self.check_model(DeviceType.GPU)
+            self.check_model(DeviceType.CUDA)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 305c7703be8..2c79670f1a2 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -81,7 +81,7 @@ class TestMNIST(TestParallelExecutorBase):
                                        use_device,
                                        delta1=1e-6,
                                        delta2=1e-4):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
             return
@@ -110,7 +110,7 @@ class TestMNIST(TestParallelExecutorBase):
 
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
             return
@@ -127,7 +127,7 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc(self):
         # use_device
-        self.check_simple_fc_convergence(DeviceType.GPU)
+        self.check_simple_fc_convergence(DeviceType.CUDA)
         self.check_simple_fc_convergence(DeviceType.CPU)
         self.check_simple_fc_convergence(DeviceType.XPU)
 
@@ -135,13 +135,13 @@ class TestMNIST(TestParallelExecutorBase):
         # use_device, use_reduce
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.GPU, 1e-5,
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CUDA, 1e-5,
                                            1e-2)
         self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU, 1e-5,
                                            1e-2)
 
     def check_simple_fc_parallel_accuracy(self, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._init_data()
@@ -167,11 +167,11 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
         self.check_simple_fc_parallel_accuracy(DeviceType.CPU)
 
     def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
             return
@@ -185,7 +185,7 @@ class TestMNIST(TestParallelExecutorBase):
                 use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
-        for use_device in (DeviceType.CPU, DeviceType.GPU):
+        for use_device in (DeviceType.CPU, DeviceType.CUDA):
             for use_fast_executor in (False, True):
                 self.check_batchnorm_fc_convergence(use_device,
                                                     use_fast_executor)
@@ -193,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_batchnorm_fc_with_new_strategy(self):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.GPU,
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CUDA,
                                            1e-5, 1e-2)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU,
                                            1e-5, 1e-2)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
index 45008c20827..e07b89f7aae 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -32,7 +32,7 @@ class TestMNIST(TestParallelExecutorBase):
 
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = init_data()
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
index ef6c3e11870..9d1364cc592 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, compare_seperately=False)
+            check_func, use_device=DeviceType.CUDA, compare_seperately=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
index 111ea507c37..c747591c816 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, delta2=1e-2)
+            check_func, use_device=DeviceType.CUDA, delta2=1e-2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
index 2e5ab76377e..e67934d87f9 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
@@ -21,7 +21,7 @@ import paddle.fluid.core as core
 
 class TestResnetWithReduceBase(TestParallelExecutorBase):
     def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
index ff98d562c41..4de1a6092dc 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
@@ -20,7 +20,7 @@ from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduc
 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
         self._compare_reduce_and_allreduce(
-            use_device=DeviceType.GPU, delta2=1e-2)
+            use_device=DeviceType.CUDA, delta2=1e-2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 26036e41d9f..1cb39eb131b 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -191,11 +191,11 @@ class TestTransformer(TestParallelExecutorBase):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
-- 
GitLab
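
A minimal sketch of the device dispatch this patch standardizes the tests on. The helper name pick_place is hypothetical; the DeviceType members and the fluid/core calls are the ones that appear in the diff above.

# Illustrative sketch (not part of the patch): place selection keyed on
# core.DeviceType, mirroring check_network_convergence() above.
# The helper name `pick_place` is hypothetical.
import paddle.fluid as fluid
import paddle.fluid.core as core

DeviceType = core.DeviceType  # exposes CPU, CUDA and XPU members


def pick_place(use_device):
    # Try CUDA first, then XPU (Kunlun), falling back to CPU, guarded by the
    # same compile-time checks the unit tests use.
    if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
        return fluid.CUDAPlace(0)
    if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
        return fluid.XPUPlace(0)
    return fluid.CPUPlace()


if __name__ == '__main__':
    # On a CPU-only build every request degrades gracefully to CPUPlace.
    print(pick_place(DeviceType.CUDA))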