未验证 提交 fde34eb8 编写于 作者: H Huihuang Zheng 提交者: GitHub

[Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu … (#43925)

* [Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu (#43812)
1. Fix the conflict between #43812 and the current release/2.3 branch.
2. Fix test_parallel_executor_seresnext_base_gpu, which failed on 2 P100 GPUs with the `470.82` driver.
上级 83520fd2
...@@ -32,6 +32,7 @@ DeviceType = core.DeviceType ...@@ -32,6 +32,7 @@ DeviceType = core.DeviceType
class TestParallelExecutorBase(unittest.TestCase): class TestParallelExecutorBase(unittest.TestCase):
@classmethod @classmethod
def check_network_convergence(cls, def check_network_convergence(cls,
method, method,
...@@ -52,6 +53,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -52,6 +53,7 @@ class TestParallelExecutorBase(unittest.TestCase):
optimizer=fluid.optimizer.Adam, optimizer=fluid.optimizer.Adam,
use_fast_executor=False, use_fast_executor=False,
enable_sequential_execution=False): enable_sequential_execution=False):
def run_executor(exe, binary, feed, fetch_list): def run_executor(exe, binary, feed, fetch_list):
if feed_data_reader is None: if feed_data_reader is None:
res = exe.run(binary, feed=feed, fetch_list=fetch_list) res = exe.run(binary, feed=feed, fetch_list=fetch_list)
...@@ -66,8 +68,8 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -66,8 +68,8 @@ class TestParallelExecutorBase(unittest.TestCase):
feed_data_reader, FeedDataReader feed_data_reader, FeedDataReader
), "feed_data_reader must be type of FeedDataReader" ), "feed_data_reader must be type of FeedDataReader"
paddle.seed(1) paddle.seed(0)
paddle.framework.random._manual_program_seed(1) paddle.framework.random._manual_program_seed(0)
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
...@@ -101,18 +103,29 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -101,18 +103,29 @@ class TestParallelExecutorBase(unittest.TestCase):
) if use_device == DeviceType.XPU else int( ) if use_device == DeviceType.XPU else int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
area_below_loss = 0
begin = time.time() begin = time.time()
first_loss, = run_executor( first_loss, = run_executor(exe=exe,
exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += 0.5 * first_loss.mean()
for _ in range(iter): for _ in range(iter):
run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[]) mid_loss = run_executor(exe=exe,
last_loss, = run_executor( binary=binary,
exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += mid_loss[0].mean()
last_loss, = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += 0.5 * last_loss.mean()
end = time.time() end = time.time()
if batch_size is not None: if batch_size is not None:
print("%.4f Instance per second" % ( print("%.4f Instance per second" % ((batch_size * iter + 2) /
(batch_size * iter + 2) / (end - begin))) (end - begin)))
avg_last_loss_val = np.array(last_loss).mean() avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean() avg_first_loss_val = np.array(first_loss).mean()
...@@ -120,9 +133,9 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -120,9 +133,9 @@ class TestParallelExecutorBase(unittest.TestCase):
float(avg_first_loss_val)): float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.") sys.exit("got NaN loss, training failed.")
print(first_loss, last_loss) print(first_loss, last_loss, area_below_loss)
# self.assertGreater(first_loss[0], last_loss[0]) # self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss return first_loss, last_loss, area_below_loss
@classmethod @classmethod
def check_pass_conflict(cls, def check_pass_conflict(cls,
......
...@@ -21,6 +21,7 @@ import numpy as np ...@@ -21,6 +21,7 @@ import numpy as np
class TestResnetBase(TestParallelExecutorBase): class TestResnetBase(TestParallelExecutorBase):
def _compare_result_with_origin_model(self, def _compare_result_with_origin_model(self,
check_func, check_func,
use_device, use_device,
...@@ -29,7 +30,7 @@ class TestResnetBase(TestParallelExecutorBase): ...@@ -29,7 +30,7 @@ class TestResnetBase(TestParallelExecutorBase):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return return
func_1_first_loss, func_1_last_loss = self.check_network_convergence( func_1_first_loss, func_1_last_loss, func_1_loss_area = self.check_network_convergence(
seresnext_net.model, seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device), feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device), iter=seresnext_net.iter(use_device),
...@@ -38,7 +39,7 @@ class TestResnetBase(TestParallelExecutorBase): ...@@ -38,7 +39,7 @@ class TestResnetBase(TestParallelExecutorBase):
use_reduce=False, use_reduce=False,
optimizer=seresnext_net.optimizer) optimizer=seresnext_net.optimizer)
func_2_first_loss, func_2_last_loss = check_func( func_2_first_loss, func_2_last_loss, func_2_loss_area = check_func(
seresnext_net.model, seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device), feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device), iter=seresnext_net.iter(use_device),
...@@ -51,7 +52,12 @@ class TestResnetBase(TestParallelExecutorBase): ...@@ -51,7 +52,12 @@ class TestResnetBase(TestParallelExecutorBase):
for loss in zip(func_1_last_loss, func_2_last_loss): for loss in zip(func_1_last_loss, func_2_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
else: else:
self.assertAlmostEquals( np.testing.assert_allclose(func_1_loss_area,
np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5) func_2_loss_area,
self.assertAlmostEquals( rtol=delta2)
np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2) self.assertAlmostEquals(np.mean(func_1_first_loss),
func_2_first_loss[0],
delta=1e-5)
self.assertAlmostEquals(np.mean(func_1_last_loss),
func_2_last_loss[0],
delta=delta2)
...@@ -26,6 +26,7 @@ paddle.enable_static() ...@@ -26,6 +26,7 @@ paddle.enable_static()
class TestFuseAllReduceOpsBase(TestParallelExecutorBase): class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
...@@ -47,7 +48,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): ...@@ -47,7 +48,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
img, label = init_feed_dict() img, label = init_feed_dict()
feed_dict_data = {"image": img, "label": label} feed_dict_data = {"image": img, "label": label}
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict=feed_dict_data, feed_dict=feed_dict_data,
get_data_from_feeder=get_data_from_feeder, get_data_from_feeder=get_data_from_feeder,
...@@ -55,7 +56,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): ...@@ -55,7 +56,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
fuse_all_reduce_ops=False, fuse_all_reduce_ops=False,
fuse_all_optimizer_ops=fuse_all_optimizer_ops, fuse_all_optimizer_ops=fuse_all_optimizer_ops,
optimizer=optimizer) optimizer=optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict=feed_dict_data, feed_dict=feed_dict_data,
get_data_from_feeder=get_data_from_feeder, get_data_from_feeder=get_data_from_feeder,
...@@ -77,13 +78,13 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): ...@@ -77,13 +78,13 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
class TestFuseAllReduceOps(TestFuseAllReduceOpsBase): class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
def _decorate_compare_fused_all_reduce(self, model, use_device): def _decorate_compare_fused_all_reduce(self, model, use_device):
self.compare_fuse_all_reduce_ops( self.compare_fuse_all_reduce_ops(model,
model, use_device,
use_device, init_feed_dict=init_data,
init_feed_dict=init_data, optimizer=self.optimizer,
optimizer=self.optimizer, fuse_all_optimizer_ops=True)
fuse_all_optimizer_ops=True)
def test_simple_fc_with_fuse_all_reduce(self): def test_simple_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA) self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
...@@ -101,16 +102,17 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase): ...@@ -101,16 +102,17 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps): class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
def _decorate_compare_fused_all_reduce(self, model, use_device): def _decorate_compare_fused_all_reduce(self, model, use_device):
self.compare_fuse_all_reduce_ops( self.compare_fuse_all_reduce_ops(model,
model, use_device,
use_device, init_feed_dict=init_data,
init_feed_dict=init_data, optimizer=self.optimizer,
optimizer=self.optimizer, fuse_all_optimizer_ops=True)
fuse_all_optimizer_ops=True)
class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase): class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
......
...@@ -21,6 +21,7 @@ import os ...@@ -21,6 +21,7 @@ import os
class TestMNIST(TestParallelExecutorBase): class TestMNIST(TestParallelExecutorBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
...@@ -41,19 +42,23 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -41,19 +42,23 @@ class TestMNIST(TestParallelExecutorBase):
# FIXME (liuwei12) # FIXME (liuwei12)
# the new memory optimize strategy will crash this unittest # the new memory optimize strategy will crash this unittest
# add enable_inplace=False here to force pass the unittest # add enable_inplace=False here to force pass the unittest
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
fuse_elewise_add_act_ops=False, fuse_elewise_add_act_ops=False,
use_ir_memory_optimize=False, use_ir_memory_optimize=False,
enable_inplace=False, enable_inplace=False,
optimizer=_optimizer) optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
fuse_elewise_add_act_ops=True, fuse_elewise_add_act_ops=True,
use_ir_memory_optimize=False, use_ir_memory_optimize=False,
......
...@@ -24,6 +24,7 @@ import os ...@@ -24,6 +24,7 @@ import os
class TestFuseOptimizationOps(TestParallelExecutorBase): class TestFuseOptimizationOps(TestParallelExecutorBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
...@@ -41,14 +42,14 @@ class TestFuseOptimizationOps(TestParallelExecutorBase): ...@@ -41,14 +42,14 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return return
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict=feed_dict, feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder, get_data_from_feeder=get_data_from_feeder,
use_device=use_device, use_device=use_device,
fuse_all_optimizer_ops=False, fuse_all_optimizer_ops=False,
optimizer=optimizer) optimizer=optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict=feed_dict, feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder, get_data_from_feeder=get_data_from_feeder,
...@@ -63,36 +64,41 @@ class TestFuseOptimizationOps(TestParallelExecutorBase): ...@@ -63,36 +64,41 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
def _decorate_compare_fused_optimizer_ops(self, model, use_device, def _decorate_compare_fused_optimizer_ops(self, model, use_device,
optimizer): optimizer):
self._compare_fused_optimizer_ops( self._compare_fused_optimizer_ops(model,
model, use_device,
use_device, feed_dict=self._get_feed_dict(),
feed_dict=self._get_feed_dict(), optimizer=optimizer)
optimizer=optimizer)
class TestFuseAdamOps(TestFuseOptimizationOps): class TestFuseAdamOps(TestFuseOptimizationOps):
def optimizer(self, learning_rate=1e-4): def optimizer(self, learning_rate=1e-4):
return fluid.optimizer.Adam(learning_rate=learning_rate) return fluid.optimizer.Adam(learning_rate=learning_rate)
def test_batchnorm_fc_with_fuse_op(self): def test_batchnorm_fc_with_fuse_op(self):
self._decorate_compare_fused_optimizer_ops( self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer) DeviceType.CUDA,
self._decorate_compare_fused_optimizer_ops( optimizer=self.optimizer)
fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer) self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CPU,
optimizer=self.optimizer)
class TestFuseSGDOps(TestFuseAdamOps): class TestFuseSGDOps(TestFuseAdamOps):
def optimizer(self, learning_rate=1e-3): def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate) return fluid.optimizer.SGD(learning_rate=learning_rate)
class TestFuseMomentumOps(TestFuseAdamOps): class TestFuseMomentumOps(TestFuseAdamOps):
def optimizer(self, learning_rate=1e-3): def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum( return fluid.optimizer.Momentum(learning_rate=learning_rate,
learning_rate=learning_rate, momentum=0.1) momentum=0.1)
class TestSpareFuseAdamOps(TestFuseOptimizationOps): class TestSpareFuseAdamOps(TestFuseOptimizationOps):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
...@@ -120,24 +126,29 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps): ...@@ -120,24 +126,29 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
def test_simple_bow_net_with_fuse_op(self): def test_simple_bow_net_with_fuse_op(self):
model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
self._decorate_compare_fused_optimizer_ops( self._decorate_compare_fused_optimizer_ops(model,
model, DeviceType.CUDA, optimizer=self.optimizer) DeviceType.CUDA,
self._decorate_compare_fused_optimizer_ops( optimizer=self.optimizer)
model, DeviceType.CPU, optimizer=self.optimizer) self._decorate_compare_fused_optimizer_ops(model,
DeviceType.CPU,
optimizer=self.optimizer)
class TestSpareFuseSGDOps(TestSpareFuseAdamOps): class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
def optimizer(self, learning_rate=1e-3): def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate) return fluid.optimizer.SGD(learning_rate=learning_rate)
class TestSpareFuseMomentumOps(TestSpareFuseAdamOps): class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
def optimizer(self, learning_rate=1e-3): def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum( return fluid.optimizer.Momentum(learning_rate=learning_rate,
learning_rate=learning_rate, momentum=0.1) momentum=0.1)
class TestPassConflictBase(TestFuseAdamOps): class TestPassConflictBase(TestFuseAdamOps):
def _compare_fused_optimizer_ops(self, def _compare_fused_optimizer_ops(self,
model, model,
use_device, use_device,
...@@ -147,36 +158,40 @@ class TestPassConflictBase(TestFuseAdamOps): ...@@ -147,36 +158,40 @@ class TestPassConflictBase(TestFuseAdamOps):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return return
self.check_pass_conflict( self.check_pass_conflict(model,
model, feed_dict=feed_dict,
feed_dict=feed_dict, get_data_from_feeder=get_data_from_feeder,
get_data_from_feeder=get_data_from_feeder, use_device=use_device,
use_device=use_device, fuse_all_optimizer_ops=True,
fuse_all_optimizer_ops=True, optimizer=optimizer,
optimizer=optimizer, enable_sequential_execution=True)
enable_sequential_execution=True)
class TestFuseAdamOpsPassConflict(TestPassConflictBase): class TestFuseAdamOpsPassConflict(TestPassConflictBase):
def optimizer(self, learning_rate=1e-4): def optimizer(self, learning_rate=1e-4):
return fluid.optimizer.Adam(learning_rate=learning_rate) return fluid.optimizer.Adam(learning_rate=learning_rate)
def test_batchnorm_fc_with_fuse_op(self): def test_batchnorm_fc_with_fuse_op(self):
self._decorate_compare_fused_optimizer_ops( self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer) DeviceType.CPU,
self._decorate_compare_fused_optimizer_ops( optimizer=self.optimizer)
fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer) self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CUDA,
optimizer=self.optimizer)
class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict): class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
def optimizer(self, learning_rate=1e-3): def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate) return fluid.optimizer.SGD(learning_rate=learning_rate)
class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict): class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
def optimizer(self, learning_rate=1e-3): def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum( return fluid.optimizer.Momentum(learning_rate=learning_rate,
learning_rate=learning_rate, momentum=0.1) momentum=0.1)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -28,21 +28,25 @@ def norm(*args, **kargs): ...@@ -28,21 +28,25 @@ def norm(*args, **kargs):
def sep_conv(input, channel, stride, filter, dilation=1, act=None): def sep_conv(input, channel, stride, filter, dilation=1, act=None):
# with scope('depthwise'): # with scope('depthwise'):
input = fluid.layers.conv2d( input = fluid.layers.conv2d(input,
input, input.shape[1],
input.shape[1], filter,
filter, stride,
stride, groups=input.shape[1],
groups=input.shape[1], padding=(filter // 2) * dilation,
padding=(filter // 2) * dilation, dilation=dilation,
dilation=dilation, use_cudnn=False,
use_cudnn=False, bias_attr=False)
bias_attr=False)
input = norm(input) input = norm(input)
if act: input = act(input) if act: input = act(input)
# with scope('pointwise'): # with scope('pointwise'):
input = fluid.layers.conv2d( input = fluid.layers.conv2d(input,
input, channel, 1, 1, groups=1, padding=0, bias_attr=False) channel,
1,
1,
groups=1,
padding=0,
bias_attr=False)
input = norm(input) input = norm(input)
if act: input = act(input) if act: input = act(input)
return input return input
...@@ -63,6 +67,7 @@ def simple_depthwise_net(use_feed): ...@@ -63,6 +67,7 @@ def simple_depthwise_net(use_feed):
class TestMNIST(TestParallelExecutorBase): class TestMNIST(TestParallelExecutorBase):
def _init_data(self, random=True): def _init_data(self, random=True):
np.random.seed(5) np.random.seed(5)
if random: if random:
...@@ -86,18 +91,22 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -86,18 +91,22 @@ class TestMNIST(TestParallelExecutorBase):
if only_forward: if only_forward:
_optimizer = None _optimizer = None
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
fuse_relu_depthwise_conv=True, fuse_relu_depthwise_conv=True,
use_ir_memory_optimize=True, use_ir_memory_optimize=True,
optimizer=_optimizer) optimizer=_optimizer)
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
fuse_relu_depthwise_conv=False, fuse_relu_depthwise_conv=False,
optimizer=_optimizer) optimizer=_optimizer)
......
...@@ -54,6 +54,7 @@ def fc_with_inplace_net(use_feed): ...@@ -54,6 +54,7 @@ def fc_with_inplace_net(use_feed):
class TestMNIST(TestParallelExecutorBase): class TestMNIST(TestParallelExecutorBase):
def _dummy_data(self): def _dummy_data(self):
np.random.seed(5) np.random.seed(5)
img = np.random.random(size=[32, 784]).astype(np.float32) img = np.random.random(size=[32, 784]).astype(np.float32)
...@@ -65,16 +66,20 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -65,16 +66,20 @@ class TestMNIST(TestParallelExecutorBase):
return return
img, label = self._dummy_data() img, label = self._dummy_data()
first_loss0, last_loss0 = self.check_network_convergence( first_loss0, last_loss0, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_ir_memory_optimize=False) use_ir_memory_optimize=False)
first_loss1, last_loss1 = self.check_network_convergence( first_loss1, last_loss1, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_ir_memory_optimize=True) use_ir_memory_optimize=True)
for loss in zip(first_loss0, first_loss1): for loss in zip(first_loss0, first_loss1):
......
...@@ -34,8 +34,8 @@ def simple_fc_net(use_feed): ...@@ -34,8 +34,8 @@ def simple_fc_net(use_feed):
hidden, hidden,
size=200, size=200,
act='tanh', act='tanh',
bias_attr=fluid.ParamAttr( bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
initializer=fluid.initializer.Constant(value=1.0))) value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax') prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss) loss = fluid.layers.mean(loss)
...@@ -73,6 +73,7 @@ def init_data(): ...@@ -73,6 +73,7 @@ def init_data():
class TestMNIST(TestParallelExecutorBase): class TestMNIST(TestParallelExecutorBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
...@@ -90,17 +91,21 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -90,17 +91,21 @@ class TestMNIST(TestParallelExecutorBase):
img, label = init_data() img, label = init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( all_reduce_first_loss, all_reduce_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_reduce=False) use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence( reduce_first_loss, reduce_last_loss, _ = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_reduce=True) use_reduce=True)
...@@ -119,12 +124,13 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -119,12 +124,13 @@ class TestMNIST(TestParallelExecutorBase):
img, label = init_data() img, label = init_data()
self.check_network_convergence( self.check_network_convergence(simple_fc_net,
simple_fc_net, feed_dict={
feed_dict={"image": img, "image": img,
"label": label}, "label": label
use_device=use_device, },
use_reduce=use_reduce) use_device=use_device,
use_reduce=use_reduce)
def test_simple_fc(self): def test_simple_fc(self):
# use_device # use_device
...@@ -147,25 +153,31 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -147,25 +153,31 @@ class TestMNIST(TestParallelExecutorBase):
img, label = init_data() img, label = init_data()
single_first_loss, single_last_loss = self.check_network_convergence( single_first_loss, single_last_loss, _ = self.check_network_convergence(
method=simple_fc_net, method=simple_fc_net,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_parallel_executor=False) use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence( parallel_first_loss, parallel_last_loss, _ = self.check_network_convergence(
method=simple_fc_net, method=simple_fc_net,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_parallel_executor=True) use_parallel_executor=True)
self.assertAlmostEquals( self.assertAlmostEquals(
np.mean(parallel_first_loss), np.mean(parallel_first_loss),
single_first_loss, single_first_loss,
delta=1e-6, ) delta=1e-6,
self.assertAlmostEquals( )
np.mean(parallel_last_loss), single_last_loss, delta=1e-6) self.assertAlmostEquals(np.mean(parallel_last_loss),
single_last_loss,
delta=1e-6)
def test_simple_fc_parallel_accuracy(self): def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(DeviceType.CUDA) self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
...@@ -178,12 +190,13 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -178,12 +190,13 @@ class TestMNIST(TestParallelExecutorBase):
return return
img, label = init_data() img, label = init_data()
self.check_network_convergence( self.check_network_convergence(fc_with_batchnorm,
fc_with_batchnorm, feed_dict={
feed_dict={"image": img, "image": img,
"label": label}, "label": label
use_device=use_device, },
use_fast_executor=use_fast_executor) use_device=use_device,
use_fast_executor=use_fast_executor)
def test_batchnorm_fc(self): def test_batchnorm_fc(self):
for use_device in (DeviceType.CPU, DeviceType.CUDA): for use_device in (DeviceType.CPU, DeviceType.CUDA):
...@@ -201,6 +214,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -201,6 +214,7 @@ class TestMNIST(TestParallelExecutorBase):
class TestMNISTNoReduce(unittest.TestCase): class TestMNISTNoReduce(unittest.TestCase):
def run_program(self, device_type): def run_program(self, device_type):
if device_type == DeviceType.CUDA: if device_type == DeviceType.CUDA:
if not paddle.is_compiled_with_cuda(): if not paddle.is_compiled_with_cuda():
...@@ -225,18 +239,16 @@ class TestMNISTNoReduce(unittest.TestCase): ...@@ -225,18 +239,16 @@ class TestMNISTNoReduce(unittest.TestCase):
build_strategy = paddle.static.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
build_strategy.reduce_strategy = no_reduce build_strategy.reduce_strategy = no_reduce
main_multi_place = paddle.static.CompiledProgram( main_multi_place = paddle.static.CompiledProgram(
main).with_data_parallel( main).with_data_parallel(loss_name=loss.name,
loss_name=loss.name, build_strategy=build_strategy,
build_strategy=build_strategy, places=places)
places=places)
build_strategy = paddle.static.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
build_strategy.reduce_strategy = no_reduce build_strategy.reduce_strategy = no_reduce
main_single_place = paddle.static.CompiledProgram(main.clone( main_single_place = paddle.static.CompiledProgram(
)).with_data_parallel( main.clone()).with_data_parallel(loss_name=loss.name,
loss_name=loss.name, build_strategy=build_strategy,
build_strategy=build_strategy, places=places[0])
places=places[0])
image, label = init_data() image, label = init_data()
feed = {'image': image, 'label': label} feed = {'image': image, 'label': label}
...@@ -256,13 +268,13 @@ class TestMNISTNoReduce(unittest.TestCase): ...@@ -256,13 +268,13 @@ class TestMNISTNoReduce(unittest.TestCase):
grads_single_place[i].append(g) grads_single_place[i].append(g)
for i in range(len(grads)): for i in range(len(grads)):
grads_single_place[i] = np.concatenate( grads_single_place[i] = np.concatenate(grads_single_place[i],
grads_single_place[i], axis=0) / len(places) axis=0) / len(places)
self.assertEqual(len(grads_multi_place), len(grads_single_place)) self.assertEqual(len(grads_multi_place), len(grads_single_place))
for g1, g2 in zip(grads_multi_place, grads_single_place): for g1, g2 in zip(grads_multi_place, grads_single_place):
self.assertTrue( self.assertTrue(np.allclose(g1, g2),
np.allclose(g1, g2), 'g1 = {}\ng2 = {}\n'.format(g1, g2)) 'g1 = {}\ng2 = {}\n'.format(g1, g2))
def split_feed(self, feed, n): def split_feed(self, feed, n):
image = feed['image'] image = feed['image']
......
...@@ -18,6 +18,7 @@ import unittest ...@@ -18,6 +18,7 @@ import unittest
import numpy as np import numpy as np
import os import os
os.environ['FLAGS_enable_parallel_graph'] = str(1) os.environ['FLAGS_enable_parallel_graph'] = str(1)
import paddle.fluid.core as core import paddle.fluid.core as core
import os import os
...@@ -26,6 +27,7 @@ from simple_nets import simple_fc_net, init_data ...@@ -26,6 +27,7 @@ from simple_nets import simple_fc_net, init_data
class TestMNIST(TestParallelExecutorBase): class TestMNIST(TestParallelExecutorBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
...@@ -36,12 +38,13 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -36,12 +38,13 @@ class TestMNIST(TestParallelExecutorBase):
return return
img, label = init_data() img, label = init_data()
self.check_network_convergence( self.check_network_convergence(simple_fc_net,
simple_fc_net, feed_dict={
feed_dict={"image": img, "image": img,
"label": label}, "label": label
use_device=use_device, },
use_reduce=use_reduce) use_device=use_device,
use_reduce=use_reduce)
def test_simple_fc(self): def test_simple_fc(self):
# use_device # use_device
...@@ -52,25 +55,31 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -52,25 +55,31 @@ class TestMNIST(TestParallelExecutorBase):
return return
img, label = init_data() img, label = init_data()
single_first_loss, single_last_loss = self.check_network_convergence( single_first_loss, single_last_loss, _ = self.check_network_convergence(
method=simple_fc_net, method=simple_fc_net,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_parallel_executor=False) use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence( parallel_first_loss, parallel_last_loss, _ = self.check_network_convergence(
method=simple_fc_net, method=simple_fc_net,
feed_dict={"image": img, feed_dict={
"label": label}, "image": img,
"label": label
},
use_device=use_device, use_device=use_device,
use_parallel_executor=True) use_parallel_executor=True)
self.assertAlmostEquals( self.assertAlmostEquals(
np.mean(parallel_first_loss), np.mean(parallel_first_loss),
single_first_loss, single_first_loss,
delta=1e-6, ) delta=1e-6,
self.assertAlmostEquals( )
np.mean(parallel_last_loss), single_last_loss, delta=1e-6) self.assertAlmostEquals(np.mean(parallel_last_loss),
single_last_loss,
delta=1e-6)
def test_simple_fc_parallel_accuracy(self): def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(DeviceType.CUDA) self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
......
...@@ -20,17 +20,19 @@ from functools import partial ...@@ -20,17 +20,19 @@ from functools import partial
class TestResnetGPU(TestResnetBase): class TestResnetGPU(TestResnetBase):
def test_seresnext_with_learning_rate_decay(self): def test_seresnext_with_learning_rate_decay(self):
# NOTE(zcd): This test is compare the result of use parallel_executor # NOTE(zcd): This test is compare the result of use parallel_executor
# and executor, and the result of drop_out op and batch_norm op in # and executor, and the result of drop_out op and batch_norm op in
# this two executor have diff, so the two ops should be removed # this two executor have diff, so the two ops should be removed
# from the model. # from the model.
check_func = partial( check_func = partial(self.check_network_convergence,
self.check_network_convergence, optimizer=seresnext_net.optimizer,
optimizer=seresnext_net.optimizer, use_parallel_executor=False)
use_parallel_executor=False) self._compare_result_with_origin_model(check_func,
self._compare_result_with_origin_model( use_device=DeviceType.CUDA,
check_func, use_device=DeviceType.CUDA, compare_seperately=False) delta2=1e-5,
compare_seperately=False)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -20,11 +20,12 @@ import paddle.fluid.core as core ...@@ -20,11 +20,12 @@ import paddle.fluid.core as core
class TestResnetWithReduceBase(TestParallelExecutorBase): class TestResnetWithReduceBase(TestParallelExecutorBase):
def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5): def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return return
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( all_reduce_first_loss, all_reduce_last_loss, _ = self.check_network_convergence(
seresnext_net.model, seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device), feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device), iter=seresnext_net.iter(use_device),
...@@ -32,7 +33,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): ...@@ -32,7 +33,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
use_device=use_device, use_device=use_device,
use_reduce=False, use_reduce=False,
optimizer=seresnext_net.optimizer) optimizer=seresnext_net.optimizer)
reduce_first_loss, reduce_last_loss = self.check_network_convergence( reduce_first_loss, reduce_last_loss, _ = self.check_network_convergence(
seresnext_net.model, seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device), feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device), iter=seresnext_net.iter(use_device),
...@@ -49,7 +50,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): ...@@ -49,7 +50,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
if not use_device: if not use_device:
return return
all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence( all_reduce_first_loss_seq, all_reduce_last_loss_seq, _ = self.check_network_convergence(
seresnext_net.model, seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device), feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device), iter=seresnext_net.iter(use_device),
...@@ -59,7 +60,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): ...@@ -59,7 +60,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
optimizer=seresnext_net.optimizer, optimizer=seresnext_net.optimizer,
enable_sequential_execution=True) enable_sequential_execution=True)
reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( reduce_first_loss_seq, reduce_last_loss_seq, _ = self.check_network_convergence(
seresnext_net.model, seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device), feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device), iter=seresnext_net.iter(use_device),
...@@ -86,9 +87,10 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): ...@@ -86,9 +87,10 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
class TestResnetWithReduceCPU(TestResnetWithReduceBase): class TestResnetWithReduceCPU(TestResnetWithReduceBase):
def test_seresnext_with_reduce(self): def test_seresnext_with_reduce(self):
self._compare_reduce_and_allreduce( self._compare_reduce_and_allreduce(use_device=DeviceType.CPU,
use_device=DeviceType.CPU, delta2=1e-3) delta2=1e-3)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册