Unverified commit bb20dcfc, authored by liuyuhui, committed by GitHub

[Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29961)

Parent 6a0102b0
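Note (not part of the original commit message): the diff below replaces the hand-rolled `DeviceType` enum in the parallel-executor test base with the `DeviceType` exposed by `paddle.fluid.core`, and renames every `DeviceType.GPU` reference to `DeviceType.CUDA`, so the CPU, CUDA, and Kunlun XPU branches of the tests share one enum. A minimal sketch of the resulting device-to-place mapping (the helper name `make_place` is ours; the calls are taken from the diff):

```python
import paddle.fluid as fluid
import paddle.fluid.core as core

# The tests alias the enum exposed by core instead of defining their own.
DeviceType = core.DeviceType  # members used by the tests: CPU, CUDA, XPU

def make_place(use_device):
    # Mirrors the chained conditional in TestParallelExecutorBase below.
    if use_device == DeviceType.CUDA:
        return fluid.CUDAPlace(0)
    if use_device == DeviceType.XPU:
        return fluid.XPUPlace(0)
    return fluid.CPUPlace()
```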
@@ -28,19 +28,14 @@ import sys
 from feed_data_reader import FeedDataReader

 __all__ = ['TestParallelExecutorBase']

-class DeviceType:
-    CPU = 1
-    GPU = 2
-    XPU = 3
+DeviceType = core.DeviceType

 class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_network_convergence(cls,
                                   method,
-                                  use_device=DeviceType.GPU,
+                                  use_device=DeviceType.CUDA,
                                   iter=5,
                                   batch_size=None,
                                   feed_dict=None,
@@ -81,7 +76,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -102,7 +97,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
-            ) if use_device == DeviceType.GPU else fluid.core.get_xpu_device_count(
+            ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count(
             ) if use_device == DeviceType.XPU else int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
@@ -132,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_pass_conflict(cls,
                             method,
-                            use_device=DeviceType.GPU,
+                            use_device=DeviceType.CUDA,
                             feed_dict=None,
                             get_data_from_feeder=None,
                             use_reduce=False,
@@ -153,7 +148,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -191,7 +186,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.enable_inplace = enable_inplace
         build_strategy.enable_sequential_execution = enable_sequential_execution
-        if use_device == DeviceType.GPU and core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
         if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
             build_strategy.fuse_elewise_add_act_ops = False
...
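Aside (ours, not in the diff): the same base class also scales `batch_size` by the number of participating devices, as in the `@@ -102,7` hunk above. A hedged sketch of that logic; `scaled_batch_size` is a hypothetical helper, while the device-count calls come straight from the diff:

```python
import multiprocessing
import os

import paddle.fluid as fluid
import paddle.fluid.core as core

DeviceType = core.DeviceType

def scaled_batch_size(batch_size, use_device):
    # ParallelExecutor splits each mini-batch across all visible devices,
    # so the effective batch size is batch_size times the device count.
    if use_device == DeviceType.CUDA:
        return batch_size * fluid.core.get_cuda_device_count()
    if use_device == DeviceType.XPU:
        return batch_size * fluid.core.get_xpu_device_count()
    # CPU runs use CPU_NUM, falling back to the host's core count.
    return batch_size * int(
        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
```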
@@ -171,20 +171,20 @@ model = SE_ResNeXt50Small

 def batch_size(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
         return 8
     return 12

 def iter(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return 10
     return 1

 gpu_img, gpu_label = init_data(
-    batch_size=batch_size(use_device=DeviceType.GPU),
+    batch_size=batch_size(use_device=DeviceType.CUDA),
     img_shape=img_shape,
     label_range=999)
 cpu_img, cpu_label = init_data(
@@ -196,6 +196,6 @@ feed_dict_cpu = {"image": cpu_img, "label": cpu_label}

 def feed_dict(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return feed_dict_gpu
     return feed_dict_cpu
@@ -26,7 +26,7 @@ class TestResnetBase(TestParallelExecutorBase):
                                          use_device,
                                          delta2=1e-5,
                                          compare_seperately=True):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         func_1_first_loss, func_1_last_loss = self.check_network_convergence(
...
@@ -35,7 +35,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                         get_data_from_feeder=None,
                                         optimizer=None,
                                         fuse_all_optimizer_ops=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         feed_dict_data = None
@@ -82,12 +82,12 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
             fuse_all_optimizer_ops=True)

     def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)

     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
-                                                DeviceType.GPU)
+                                                DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
@@ -126,7 +126,7 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
...
@@ -26,7 +26,7 @@ class TestMNIST(TestParallelExecutorBase):
         os.environ['CPU_NUM'] = str(4)

     def _compare_fuse_elewise_add_act_ops(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = init_data()
@@ -66,12 +66,12 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

     def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)

     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
-                                               DeviceType.GPU)
+                                               DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
                                                DeviceType.CPU)
...
@@ -38,7 +38,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
                                             feed_dict=None,
                                             get_data_from_feeder=None,
                                             optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
@@ -76,7 +76,7 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
@@ -121,7 +121,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.GPU, optimizer=self.optimizer)
+            model, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             model, DeviceType.CPU, optimizer=self.optimizer)
@@ -144,7 +144,7 @@ class TestPassConflictBase(TestFuseAdamOps):
                                             feed_dict=None,
                                             get_data_from_feeder=None,
                                             optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         self.check_pass_conflict(
@@ -165,7 +165,7 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)

 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
...
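Aside (ours): the fuse-optimizer tests above all follow one pattern: run the same model with the fuse pass disabled and then enabled, and require the losses to agree. The diff only shows the call sites, so the sketch below reconstructs the comparison under an assumed name (`compare_fused` is hypothetical; `check_network_convergence` and the `fuse_all_optimizer_ops` flag appear in the diff):

```python
def compare_fused(test, model, use_device, optimizer):
    # Run once without the fused-optimizer pass ...
    baseline_first, baseline_last = test.check_network_convergence(
        model, use_device=use_device, optimizer=optimizer,
        fuse_all_optimizer_ops=False)
    # ... and once with it, then demand near-identical convergence.
    fused_first, fused_last = test.check_network_convergence(
        model, use_device=use_device, optimizer=optimizer,
        fuse_all_optimizer_ops=True)
    for a, b in zip(baseline_first, fused_first):
        test.assertAlmostEqual(a, b, delta=1e-6)
    for a, b in zip(baseline_last, fused_last):
        test.assertAlmostEqual(a, b, delta=1e-6)
```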
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label

     def _compare(self, model, use_device, random_data=True, only_forward=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._init_data(random_data)
@@ -108,11 +108,11 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

     def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU)
+        self._compare(simple_depthwise_net, DeviceType.CUDA)
         self._compare(simple_depthwise_net, DeviceType.CPU)

     def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU, only_forward=True)
+        self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True)
         self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True)
...
@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_device=DeviceType.GPU,
+            use_device=DeviceType.CUDA,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
...
@@ -61,7 +61,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label

     def _compare_ir_memory_optimize(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._dummy_data()
@@ -84,11 +84,11 @@ class TestMNIST(TestParallelExecutorBase):
     def test_simple_fc_net(self):
         self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA)

     def test_fc_with_reshape_net(self):
         self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA)

 if __name__ == '__main__':
...
@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
             # check python transpiler
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=True,
                 iter=2)
...
@@ -84,7 +84,7 @@ class TestResnet(TestParallelExecutorBase):
     def test_model(self):
         if core.is_compiled_with_cuda():
-            self.check_model(DeviceType.GPU)
+            self.check_model(DeviceType.CUDA)

 if __name__ == '__main__':
...
@@ -81,7 +81,7 @@ class TestMNIST(TestParallelExecutorBase):
                                        use_device,
                                        delta1=1e-6,
                                        delta2=1e-4):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -110,7 +110,7 @@ class TestMNIST(TestParallelExecutorBase):
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
def test_simple_fc(self): def test_simple_fc(self):
# use_device # use_device
self.check_simple_fc_convergence(DeviceType.GPU) self.check_simple_fc_convergence(DeviceType.CUDA)
self.check_simple_fc_convergence(DeviceType.CPU) self.check_simple_fc_convergence(DeviceType.CPU)
self.check_simple_fc_convergence(DeviceType.XPU) self.check_simple_fc_convergence(DeviceType.XPU)
@@ -135,13 +135,13 @@ class TestMNIST(TestParallelExecutorBase):
         # use_device, use_reduce
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.GPU, 1e-5,
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CUDA, 1e-5,
                                            1e-2)
         self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU, 1e-5,
                                            1e-2)

     def check_simple_fc_parallel_accuracy(self, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._init_data()
np.mean(parallel_last_loss), single_last_loss, delta=1e-6) np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
def test_simple_fc_parallel_accuracy(self): def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(DeviceType.GPU) self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
self.check_simple_fc_parallel_accuracy(DeviceType.CPU) self.check_simple_fc_parallel_accuracy(DeviceType.CPU)
def check_batchnorm_fc_convergence(self, use_device, use_fast_executor): def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
if use_device == DeviceType.GPU and not core.is_compiled_with_cuda(): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return return
if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
return return
@@ -185,7 +185,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_fast_executor=use_fast_executor)

     def test_batchnorm_fc(self):
-        for use_device in (DeviceType.CPU, DeviceType.GPU):
+        for use_device in (DeviceType.CPU, DeviceType.CUDA):
             for use_fast_executor in (False, True):
                 self.check_batchnorm_fc_convergence(use_device,
                                                     use_fast_executor)
@@ -193,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_batchnorm_fc_with_new_strategy(self):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.GPU,
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CUDA,
                                            1e-5, 1e-2)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU,
                                            1e-5, 1e-2)
...
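Aside (ours): nearly every test in this file opens with the same guard, skipping devices the current build does not support. A sketch of that check factored into one helper (`device_supported` is our name; the `is_compiled_with_*` calls are from the diff):

```python
import paddle.fluid.core as core

DeviceType = core.DeviceType

def device_supported(use_device):
    # CUDA tests are skipped in CPU-only builds; XPU tests are skipped
    # unless Paddle was built with Kunlun (XPU) support.
    if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
        return False
    if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
        return False
    return True
```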
@@ -32,7 +32,7 @@ class TestMNIST(TestParallelExecutorBase):
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = init_data()
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)

     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)

 if __name__ == '__main__':
...
@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, compare_seperately=False)
+            check_func, use_device=DeviceType.CUDA, compare_seperately=False)

 if __name__ == '__main__':
...
@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, delta2=1e-2)
+            check_func, use_device=DeviceType.CUDA, delta2=1e-2)

 if __name__ == '__main__':
...
@@ -21,7 +21,7 @@ import paddle.fluid.core as core
 class TestResnetWithReduceBase(TestParallelExecutorBase):
     def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
...
@@ -20,7 +20,7 @@ from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduc
 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
         self._compare_reduce_and_allreduce(
-            use_device=DeviceType.GPU, delta2=1e-2)
+            use_device=DeviceType.CUDA, delta2=1e-2)

 if __name__ == '__main__':
...
@@ -191,11 +191,11 @@ class TestTransformer(TestParallelExecutorBase):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
...