From bb20dcfc1ad8a93ceaf4dcd2b338da40baea790d Mon Sep 17 00:00:00 2001
From: liuyuhui
Date: Tue, 29 Dec 2020 10:46:42 +0800
Subject: [PATCH] [Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29961)

---
 .../unittests/parallel_executor_test_base.py  | 19 +++++++------------
 .../fluid/tests/unittests/seresnext_net.py    |  8 ++++----
 .../tests/unittests/seresnext_test_base.py    |  2 +-
 .../unittests/test_fuse_all_reduce_pass.py    |  8 ++++----
 .../test_fuse_elewise_add_act_pass.py         |  6 +++---
 .../unittests/test_fuse_optimizer_pass.py     | 10 +++++-----
 .../test_fuse_relu_depthwise_conv_pass.py     |  6 +++---
 .../tests/unittests/test_ir_inplace_pass.py   |  2 +-
 .../unittests/test_ir_memory_optimize_pass.py |  6 +++---
 .../test_ir_memory_optimize_transformer.py    |  4 ++--
 .../test_mix_precision_all_reduce_fuse.py     |  2 +-
 .../unittests/test_parallel_executor_mnist.py | 18 +++++++++---------
 .../unittests/test_parallel_executor_pg.py    |  4 ++--
 ...st_parallel_executor_seresnext_base_gpu.py |  2 +-
 ...utor_seresnext_with_fuse_all_reduce_gpu.py |  2 +-
 ...llel_executor_seresnext_with_reduce_cpu.py |  2 +-
 ...llel_executor_seresnext_with_reduce_gpu.py |  2 +-
 .../test_parallel_executor_transformer.py     |  4 ++--
 18 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 0d0e118e6e..47f5c5085a 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -28,19 +28,14 @@ import sys
 from feed_data_reader import FeedDataReader
 
 __all__ = ['TestParallelExecutorBase']
-
-
-class DeviceType:
-    CPU = 1
-    GPU = 2
-    XPU = 3
+DeviceType = core.DeviceType
 
 
 class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_network_convergence(cls,
                                   method,
-                                  use_device=DeviceType.GPU,
+                                  use_device=DeviceType.CUDA,
                                   iter=5,
                                   batch_size=None,
                                   feed_dict=None,
@@ -81,7 +76,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -102,7 +97,7 @@ class TestParallelExecutorBase(unittest.TestCase):
 
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
-            ) if use_device == DeviceType.GPU else fluid.core.get_xpu_device_count(
+            ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count(
             ) if use_device == DeviceType.XPU else int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
 
@@ -132,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_pass_conflict(cls,
                             method,
-                            use_device=DeviceType.GPU,
+                            use_device=DeviceType.CUDA,
                             feed_dict=None,
                             get_data_from_feeder=None,
                             use_reduce=False,
@@ -153,7 +148,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -191,7 +186,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.enable_inplace = enable_inplace
         build_strategy.enable_sequential_execution = enable_sequential_execution
 
-        if use_device == DeviceType.GPU and core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
         if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
             build_strategy.fuse_elewise_add_act_ops = False
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index d20cf70b14..2e4b1828c5 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -171,20 +171,20 @@ model = SE_ResNeXt50Small
 
 
 def batch_size(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
         return 8
     return 12
 
 
 def iter(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return 10
     return 1
 
 
 gpu_img, gpu_label = init_data(
-    batch_size=batch_size(use_device=DeviceType.GPU),
+    batch_size=batch_size(use_device=DeviceType.CUDA),
     img_shape=img_shape,
     label_range=999)
 cpu_img, cpu_label = init_data(
@@ -196,6 +196,6 @@ feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
 
 
 def feed_dict(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return feed_dict_gpu
     return feed_dict_cpu
diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
index a39ca59b65..cc40b89b58 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
@@ -26,7 +26,7 @@ class TestResnetBase(TestParallelExecutorBase):
                                            use_device,
                                            delta2=1e-5,
                                            compare_seperately=True):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         func_1_first_loss, func_1_last_loss = self.check_network_convergence(
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index aa520beb20..881b9d9057 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -35,7 +35,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                 get_data_from_feeder=None,
                                 optimizer=None,
                                 fuse_all_optimizer_ops=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         feed_dict_data = None
@@ -82,12 +82,12 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
             fuse_all_optimizer_ops=True)
 
     def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
-                                                DeviceType.GPU)
+                                                DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
 
@@ -126,7 +126,7 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
 
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index e5e8eee6f8..a1c20be9a9 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -26,7 +26,7 @@ class TestMNIST(TestParallelExecutorBase):
         os.environ['CPU_NUM'] = str(4)
 
     def _compare_fuse_elewise_add_act_ops(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = init_data()
@@ -66,12 +66,12 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
-                                               DeviceType.GPU)
+                                               DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
                                                DeviceType.CPU)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index 75aa07c4b9..51c06bb79d 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -38,7 +38,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
                                               feed_dict=None,
                                               get_data_from_feeder=None,
                                               optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
@@ -76,7 +76,7 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
 
@@ -121,7 +121,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.GPU, optimizer=self.optimizer)
+            model, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             model, DeviceType.CPU, optimizer=self.optimizer)
 
@@ -144,7 +144,7 @@ class TestPassConflictBase(TestFuseAdamOps):
                                               feed_dict=None,
                                               get_data_from_feeder=None,
                                               optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         self.check_pass_conflict(
@@ -165,7 +165,7 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
 
 
 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
index 0e54ebc7f4..9b739ebdfb 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare(self, model, use_device, random_data=True, only_forward=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._init_data(random_data)
@@ -108,11 +108,11 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU)
+        self._compare(simple_depthwise_net, DeviceType.CUDA)
         self._compare(simple_depthwise_net, DeviceType.CPU)
 
     def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU, only_forward=True)
+        self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True)
         self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
index f8b2ec21bc..e2094c76b7 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_device=DeviceType.GPU,
+            use_device=DeviceType.CUDA,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
index 61ceefdad1..f4ec63a8b9 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
@@ -61,7 +61,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare_ir_memory_optimize(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._dummy_data()
@@ -84,11 +84,11 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc_net(self):
         self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA)
 
     def test_fc_with_reshape_net(self):
         self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
index 40c4fa7495..aa495c7533 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
             # check python transpiler
            self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=True,
                 iter=2)
diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
index 7df3583f0d..33393bc2fc 100644
--- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
@@ -84,7 +84,7 @@ class TestResnet(TestParallelExecutorBase):
 
     def test_model(self):
         if core.is_compiled_with_cuda():
-            self.check_model(DeviceType.GPU)
+            self.check_model(DeviceType.CUDA)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 305c7703be..2c79670f1a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -81,7 +81,7 @@ class TestMNIST(TestParallelExecutorBase):
                                        use_device,
                                        delta1=1e-6,
                                        delta2=1e-4):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -110,7 +110,7 @@ class TestMNIST(TestParallelExecutorBase):
 
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -127,7 +127,7 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc(self):
         # use_device
-        self.check_simple_fc_convergence(DeviceType.GPU)
+        self.check_simple_fc_convergence(DeviceType.CUDA)
         self.check_simple_fc_convergence(DeviceType.CPU)
         self.check_simple_fc_convergence(DeviceType.XPU)
 
@@ -135,13 +135,13 @@ class TestMNIST(TestParallelExecutorBase):
         # use_device, use_reduce
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.GPU, 1e-5,
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CUDA, 1e-5,
                                            1e-2)
         self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU, 1e-5,
                                            1e-2)
 
     def check_simple_fc_parallel_accuracy(self, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._init_data()
@@ -167,11 +167,11 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
         self.check_simple_fc_parallel_accuracy(DeviceType.CPU)
 
     def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
             return
@@ -185,7 +185,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
-        for use_device in (DeviceType.CPU, DeviceType.GPU):
+        for use_device in (DeviceType.CPU, DeviceType.CUDA):
             for use_fast_executor in (False, True):
                 self.check_batchnorm_fc_convergence(use_device,
                                                     use_fast_executor)
@@ -193,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_batchnorm_fc_with_new_strategy(self):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.GPU,
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CUDA,
                                            1e-5, 1e-2)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU,
                                            1e-5, 1e-2)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
index 45008c2082..e07b89f7aa 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -32,7 +32,7 @@ class TestMNIST(TestParallelExecutorBase):
 
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = init_data()
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
index ef6c3e1187..9d1364cc59 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, compare_seperately=False)
+            check_func, use_device=DeviceType.CUDA, compare_seperately=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
index 111ea507c3..c747591c81 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, delta2=1e-2)
+            check_func, use_device=DeviceType.CUDA, delta2=1e-2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
index 2e5ab76377..e67934d87f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
@@ -21,7 +21,7 @@ import paddle.fluid.core as core
 
 class TestResnetWithReduceBase(TestParallelExecutorBase):
     def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
index ff98d562c4..4de1a6092d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
@@ -20,7 +20,7 @@ from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduc
 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
         self._compare_reduce_and_allreduce(
-            use_device=DeviceType.GPU, delta2=1e-2)
+            use_device=DeviceType.CUDA, delta2=1e-2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 26036e41d9..1cb39eb131 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -191,11 +191,11 @@ class TestTransformer(TestParallelExecutorBase):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
--
GitLab
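
Context, not part of the patch: the change above drops the per-file DeviceType enum (CPU = 1, GPU = 2, XPU = 3) in favor of the enum exposed by paddle.fluid.core, whose CUDA member replaces the old DeviceType.GPU spelling, and the tests then pick the matching fluid place from that value. A minimal sketch of this selection pattern follows, assuming a Paddle build from this era; the helper name _place_for is invented for illustration and does not appear anywhere in the patch.

    import paddle.fluid as fluid
    import paddle.fluid.core as core

    # Alias used throughout the tests; members include CPU, CUDA, and XPU.
    DeviceType = core.DeviceType


    def _place_for(use_device):
        # Hypothetical helper combining the place selection and the
        # is_compiled_with_* guards that the tests above use: fall back
        # to CPU when the requested device is not available in this build.
        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
            return fluid.CUDAPlace(0)
        if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
            return fluid.XPUPlace(0)
        return fluid.CPUPlace()


    # Example: build an executor on whichever device this build supports.
    exe = fluid.Executor(_place_for(DeviceType.CUDA))

Per the subject line, on Kunlun (XPU) builds the parallel executor is expected to communicate through BKCL rather than NCCL, which is why the XPU branch mirrors the CUDA one in the tests.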