Unverified commit fde34eb8 authored by Huihuang Zheng, committed by GitHub

[Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu … (#43925)

* [Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu (#43812)
1. Fix the conflict between #43812 and the current release/2.3 branch.
2. test_parallel_executor_seresnext_base_gpu failed on 2 P100 GPUs with the `470.82` driver.
Parent 83520fd2
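Note on the change: instead of comparing only the first and last loss of the two execution modes, the patched `check_network_convergence` also integrates the whole loss curve with the trapezoid rule (the "IOU"/area-below-loss value), and the derived tests compare that area with a relative tolerance, which is less sensitive to per-iteration jitter such as the failure observed on 2 P100 GPUs with the `470.82` driver. A minimal standalone sketch of that accumulation, outside of any Paddle executor (the helper name `loss_curve_area` and the sample losses are illustrative only, not part of the patch):

import numpy as np

def loss_curve_area(losses):
    # Trapezoid-rule integral of a loss curve sampled once per iteration:
    # 0.5 * first + sum(intermediate losses) + 0.5 * last, which mirrors
    # how the patched check_network_convergence accumulates its new
    # `area_below_loss` return value.
    losses = [float(np.mean(l)) for l in losses]
    if len(losses) < 2:
        return sum(losses)
    return 0.5 * losses[0] + sum(losses[1:-1]) + 0.5 * losses[-1]

# Two hypothetical runs of the same model; close curves give close areas.
run_a = [1.20, 0.95, 0.80, 0.72]
run_b = [1.21, 0.94, 0.81, 0.71]
np.testing.assert_allclose(loss_curve_area(run_a),
                           loss_curve_area(run_b),
                           rtol=1e-2)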
......@@ -32,6 +32,7 @@ DeviceType = core.DeviceType
class TestParallelExecutorBase(unittest.TestCase):
@classmethod
def check_network_convergence(cls,
method,
......@@ -52,6 +53,7 @@ class TestParallelExecutorBase(unittest.TestCase):
optimizer=fluid.optimizer.Adam,
use_fast_executor=False,
enable_sequential_execution=False):
def run_executor(exe, binary, feed, fetch_list):
if feed_data_reader is None:
res = exe.run(binary, feed=feed, fetch_list=fetch_list)
......@@ -66,8 +68,8 @@ class TestParallelExecutorBase(unittest.TestCase):
feed_data_reader, FeedDataReader
), "feed_data_reader must be type of FeedDataReader"
paddle.seed(1)
paddle.framework.random._manual_program_seed(1)
paddle.seed(0)
paddle.framework.random._manual_program_seed(0)
main = fluid.Program()
startup = fluid.Program()
......@@ -101,18 +103,29 @@ class TestParallelExecutorBase(unittest.TestCase):
) if use_device == DeviceType.XPU else int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
area_below_loss = 0
begin = time.time()
first_loss, = run_executor(
exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
first_loss, = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += 0.5 * first_loss.mean()
for _ in range(iter):
run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
last_loss, = run_executor(
exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
mid_loss = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += mid_loss[0].mean()
last_loss, = run_executor(exe=exe,
binary=binary,
feed=feed_dict,
fetch_list=[loss.name])
area_below_loss += 0.5 * last_loss.mean()
end = time.time()
if batch_size is not None:
print("%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin)))
print("%.4f Instance per second" % ((batch_size * iter + 2) /
(end - begin)))
avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
......@@ -120,9 +133,9 @@ class TestParallelExecutorBase(unittest.TestCase):
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
print(first_loss, last_loss)
print(first_loss, last_loss, area_below_loss)
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
return first_loss, last_loss, area_below_loss
@classmethod
def check_pass_conflict(cls,
......
......@@ -21,6 +21,7 @@ import numpy as np
class TestResnetBase(TestParallelExecutorBase):
def _compare_result_with_origin_model(self,
check_func,
use_device,
......@@ -29,7 +30,7 @@ class TestResnetBase(TestParallelExecutorBase):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return
func_1_first_loss, func_1_last_loss = self.check_network_convergence(
func_1_first_loss, func_1_last_loss, func_1_loss_area = self.check_network_convergence(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
......@@ -38,7 +39,7 @@ class TestResnetBase(TestParallelExecutorBase):
use_reduce=False,
optimizer=seresnext_net.optimizer)
func_2_first_loss, func_2_last_loss = check_func(
func_2_first_loss, func_2_last_loss, func_2_loss_area = check_func(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
......@@ -51,7 +52,12 @@ class TestResnetBase(TestParallelExecutorBase):
for loss in zip(func_1_last_loss, func_2_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
else:
self.assertAlmostEquals(
np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
self.assertAlmostEquals(
np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
np.testing.assert_allclose(func_1_loss_area,
func_2_loss_area,
rtol=delta2)
self.assertAlmostEquals(np.mean(func_1_first_loss),
func_2_first_loss[0],
delta=1e-5)
self.assertAlmostEquals(np.mean(func_1_last_loss),
func_2_last_loss[0],
delta=delta2)
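In the non-separate comparison path above, the integrated losses are checked with `np.testing.assert_allclose(..., rtol=delta2)`, i.e. a relative tolerance, while the point losses keep the absolute `delta` of `assertAlmostEquals`. A small illustrative contrast of the two tolerance styles (the numbers are made up, not taken from the test):

import numpy as np

a, b = 123.400, 123.405          # two hypothetical loss areas
rtol = delta = 1e-4

# A relative tolerance scales with the magnitude of the reference value:
# |a - b| = 0.005 <= rtol * |b| ~= 0.0123, so this check passes.
np.testing.assert_allclose(a, b, rtol=rtol)

# The same nominal value used as an absolute delta would reject the pair:
# |a - b| = 0.005 > delta = 1e-4.
assert abs(a - b) > delta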
......@@ -26,6 +26,7 @@ paddle.enable_static()
class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......@@ -47,7 +48,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
img, label = init_feed_dict()
feed_dict_data = {"image": img, "label": label}
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict_data,
get_data_from_feeder=get_data_from_feeder,
......@@ -55,7 +56,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
fuse_all_reduce_ops=False,
fuse_all_optimizer_ops=fuse_all_optimizer_ops,
optimizer=optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict_data,
get_data_from_feeder=get_data_from_feeder,
......@@ -77,13 +78,13 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
def _decorate_compare_fused_all_reduce(self, model, use_device):
self.compare_fuse_all_reduce_ops(
model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)
self.compare_fuse_all_reduce_ops(model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)
def test_simple_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
......@@ -101,16 +102,17 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
def _decorate_compare_fused_all_reduce(self, model, use_device):
self.compare_fuse_all_reduce_ops(
model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)
self.compare_fuse_all_reduce_ops(model,
use_device,
init_feed_dict=init_data,
optimizer=self.optimizer,
fuse_all_optimizer_ops=True)
class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......
......@@ -21,6 +21,7 @@ import os
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......@@ -41,19 +42,23 @@ class TestMNIST(TestParallelExecutorBase):
# FIXME (liuwei12)
# the new memory optimize strategy will crash this unittest
# add enable_inplace=False here to force pass the unittest
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
fuse_elewise_add_act_ops=False,
use_ir_memory_optimize=False,
enable_inplace=False,
optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
fuse_elewise_add_act_ops=True,
use_ir_memory_optimize=False,
......
......@@ -24,6 +24,7 @@ import os
class TestFuseOptimizationOps(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......@@ -41,14 +42,14 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_optimizer_ops=False,
optimizer=optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
......@@ -63,36 +64,41 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
def _decorate_compare_fused_optimizer_ops(self, model, use_device,
optimizer):
self._compare_fused_optimizer_ops(
model,
use_device,
feed_dict=self._get_feed_dict(),
optimizer=optimizer)
self._compare_fused_optimizer_ops(model,
use_device,
feed_dict=self._get_feed_dict(),
optimizer=optimizer)
class TestFuseAdamOps(TestFuseOptimizationOps):
def optimizer(self, learning_rate=1e-4):
return fluid.optimizer.Adam(learning_rate=learning_rate)
def test_batchnorm_fc_with_fuse_op(self):
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CUDA,
optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CPU,
optimizer=self.optimizer)
class TestFuseSGDOps(TestFuseAdamOps):
def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate)
class TestFuseMomentumOps(TestFuseAdamOps):
def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=0.1)
return fluid.optimizer.Momentum(learning_rate=learning_rate,
momentum=0.1)
class TestSpareFuseAdamOps(TestFuseOptimizationOps):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......@@ -120,24 +126,29 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
def test_simple_bow_net_with_fuse_op(self):
model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
self._decorate_compare_fused_optimizer_ops(
model, DeviceType.CUDA, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(
model, DeviceType.CPU, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(model,
DeviceType.CUDA,
optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(model,
DeviceType.CPU,
optimizer=self.optimizer)
class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate)
class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=0.1)
return fluid.optimizer.Momentum(learning_rate=learning_rate,
momentum=0.1)
class TestPassConflictBase(TestFuseAdamOps):
def _compare_fused_optimizer_ops(self,
model,
use_device,
......@@ -147,36 +158,40 @@ class TestPassConflictBase(TestFuseAdamOps):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return
self.check_pass_conflict(
model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_optimizer_ops=True,
optimizer=optimizer,
enable_sequential_execution=True)
self.check_pass_conflict(model,
feed_dict=feed_dict,
get_data_from_feeder=get_data_from_feeder,
use_device=use_device,
fuse_all_optimizer_ops=True,
optimizer=optimizer,
enable_sequential_execution=True)
class TestFuseAdamOpsPassConflict(TestPassConflictBase):
def optimizer(self, learning_rate=1e-4):
return fluid.optimizer.Adam(learning_rate=learning_rate)
def test_batchnorm_fc_with_fuse_op(self):
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(
fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CPU,
optimizer=self.optimizer)
self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
DeviceType.CUDA,
optimizer=self.optimizer)
class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.SGD(learning_rate=learning_rate)
class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
def optimizer(self, learning_rate=1e-3):
return fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=0.1)
return fluid.optimizer.Momentum(learning_rate=learning_rate,
momentum=0.1)
if __name__ == '__main__':
......
......@@ -28,21 +28,25 @@ def norm(*args, **kargs):
def sep_conv(input, channel, stride, filter, dilation=1, act=None):
# with scope('depthwise'):
input = fluid.layers.conv2d(
input,
input.shape[1],
filter,
stride,
groups=input.shape[1],
padding=(filter // 2) * dilation,
dilation=dilation,
use_cudnn=False,
bias_attr=False)
input = fluid.layers.conv2d(input,
input.shape[1],
filter,
stride,
groups=input.shape[1],
padding=(filter // 2) * dilation,
dilation=dilation,
use_cudnn=False,
bias_attr=False)
input = norm(input)
if act: input = act(input)
# with scope('pointwise'):
input = fluid.layers.conv2d(
input, channel, 1, 1, groups=1, padding=0, bias_attr=False)
input = fluid.layers.conv2d(input,
channel,
1,
1,
groups=1,
padding=0,
bias_attr=False)
input = norm(input)
if act: input = act(input)
return input
......@@ -63,6 +67,7 @@ def simple_depthwise_net(use_feed):
class TestMNIST(TestParallelExecutorBase):
def _init_data(self, random=True):
np.random.seed(5)
if random:
......@@ -86,18 +91,22 @@ class TestMNIST(TestParallelExecutorBase):
if only_forward:
_optimizer = None
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
fuse_relu_depthwise_conv=True,
use_ir_memory_optimize=True,
optimizer=_optimizer)
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
fuse_relu_depthwise_conv=False,
optimizer=_optimizer)
......
......@@ -54,6 +54,7 @@ def fc_with_inplace_net(use_feed):
class TestMNIST(TestParallelExecutorBase):
def _dummy_data(self):
np.random.seed(5)
img = np.random.random(size=[32, 784]).astype(np.float32)
......@@ -65,16 +66,20 @@ class TestMNIST(TestParallelExecutorBase):
return
img, label = self._dummy_data()
first_loss0, last_loss0 = self.check_network_convergence(
first_loss0, last_loss0, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_ir_memory_optimize=False)
first_loss1, last_loss1 = self.check_network_convergence(
first_loss1, last_loss1, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_ir_memory_optimize=True)
for loss in zip(first_loss0, first_loss1):
......
......@@ -34,8 +34,8 @@ def simple_fc_net(use_feed):
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
......@@ -73,6 +73,7 @@ def init_data():
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......@@ -90,17 +91,21 @@ class TestMNIST(TestParallelExecutorBase):
img, label = init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
all_reduce_first_loss, all_reduce_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
reduce_first_loss, reduce_last_loss, _ = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_reduce=True)
......@@ -119,12 +124,13 @@ class TestMNIST(TestParallelExecutorBase):
img, label = init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_device=use_device,
use_reduce=use_reduce)
self.check_network_convergence(simple_fc_net,
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_reduce=use_reduce)
def test_simple_fc(self):
# use_device
......@@ -147,25 +153,31 @@ class TestMNIST(TestParallelExecutorBase):
img, label = init_data()
single_first_loss, single_last_loss = self.check_network_convergence(
single_first_loss, single_last_loss, _ = self.check_network_convergence(
method=simple_fc_net,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
parallel_first_loss, parallel_last_loss, _ = self.check_network_convergence(
method=simple_fc_net,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_parallel_executor=True)
self.assertAlmostEquals(
np.mean(parallel_first_loss),
single_first_loss,
delta=1e-6, )
self.assertAlmostEquals(
np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
delta=1e-6,
)
self.assertAlmostEquals(np.mean(parallel_last_loss),
single_last_loss,
delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
......@@ -178,12 +190,13 @@ class TestMNIST(TestParallelExecutorBase):
return
img, label = init_data()
self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_device=use_device,
use_fast_executor=use_fast_executor)
self.check_network_convergence(fc_with_batchnorm,
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_fast_executor=use_fast_executor)
def test_batchnorm_fc(self):
for use_device in (DeviceType.CPU, DeviceType.CUDA):
......@@ -201,6 +214,7 @@ class TestMNIST(TestParallelExecutorBase):
class TestMNISTNoReduce(unittest.TestCase):
def run_program(self, device_type):
if device_type == DeviceType.CUDA:
if not paddle.is_compiled_with_cuda():
......@@ -225,18 +239,16 @@ class TestMNISTNoReduce(unittest.TestCase):
build_strategy = paddle.static.BuildStrategy()
build_strategy.reduce_strategy = no_reduce
main_multi_place = paddle.static.CompiledProgram(
main).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
places=places)
main).with_data_parallel(loss_name=loss.name,
build_strategy=build_strategy,
places=places)
build_strategy = paddle.static.BuildStrategy()
build_strategy.reduce_strategy = no_reduce
main_single_place = paddle.static.CompiledProgram(main.clone(
)).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
places=places[0])
main_single_place = paddle.static.CompiledProgram(
main.clone()).with_data_parallel(loss_name=loss.name,
build_strategy=build_strategy,
places=places[0])
image, label = init_data()
feed = {'image': image, 'label': label}
......@@ -256,13 +268,13 @@ class TestMNISTNoReduce(unittest.TestCase):
grads_single_place[i].append(g)
for i in range(len(grads)):
grads_single_place[i] = np.concatenate(
grads_single_place[i], axis=0) / len(places)
grads_single_place[i] = np.concatenate(grads_single_place[i],
axis=0) / len(places)
self.assertEqual(len(grads_multi_place), len(grads_single_place))
for g1, g2 in zip(grads_multi_place, grads_single_place):
self.assertTrue(
np.allclose(g1, g2), 'g1 = {}\ng2 = {}\n'.format(g1, g2))
self.assertTrue(np.allclose(g1, g2),
'g1 = {}\ng2 = {}\n'.format(g1, g2))
def split_feed(self, feed, n):
image = feed['image']
......
......@@ -18,6 +18,7 @@ import unittest
import numpy as np
import os
os.environ['FLAGS_enable_parallel_graph'] = str(1)
import paddle.fluid.core as core
import os
......@@ -26,6 +27,7 @@ from simple_nets import simple_fc_net, init_data
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
......@@ -36,12 +38,13 @@ class TestMNIST(TestParallelExecutorBase):
return
img, label = init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_device=use_device,
use_reduce=use_reduce)
self.check_network_convergence(simple_fc_net,
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_reduce=use_reduce)
def test_simple_fc(self):
# use_device
......@@ -52,25 +55,31 @@ class TestMNIST(TestParallelExecutorBase):
return
img, label = init_data()
single_first_loss, single_last_loss = self.check_network_convergence(
single_first_loss, single_last_loss, _ = self.check_network_convergence(
method=simple_fc_net,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
parallel_first_loss, parallel_last_loss, _ = self.check_network_convergence(
method=simple_fc_net,
feed_dict={"image": img,
"label": label},
feed_dict={
"image": img,
"label": label
},
use_device=use_device,
use_parallel_executor=True)
self.assertAlmostEquals(
np.mean(parallel_first_loss),
single_first_loss,
delta=1e-6, )
self.assertAlmostEquals(
np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
delta=1e-6,
)
self.assertAlmostEquals(np.mean(parallel_last_loss),
single_last_loss,
delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
......
......@@ -20,17 +20,19 @@ from functools import partial
class TestResnetGPU(TestResnetBase):
def test_seresnext_with_learning_rate_decay(self):
# NOTE(zcd): This test is compare the result of use parallel_executor
# and executor, and the result of drop_out op and batch_norm op in
# this two executor have diff, so the two ops should be removed
# from the model.
check_func = partial(
self.check_network_convergence,
optimizer=seresnext_net.optimizer,
use_parallel_executor=False)
self._compare_result_with_origin_model(
check_func, use_device=DeviceType.CUDA, compare_seperately=False)
check_func = partial(self.check_network_convergence,
optimizer=seresnext_net.optimizer,
use_parallel_executor=False)
self._compare_result_with_origin_model(check_func,
use_device=DeviceType.CUDA,
delta2=1e-5,
compare_seperately=False)
if __name__ == '__main__':
......
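For the GPU seresnext test above, `functools.partial` pre-binds the plain-executor configuration, and the newly passed `delta2=1e-5` becomes the `rtol` of the loss-area comparison inside `_compare_result_with_origin_model`. A trimmed, hedged sketch of that wiring with stand-in loss curves (none of these names or numbers come from the real model):

from functools import partial

import numpy as np

def fake_convergence(model, use_parallel_executor=True):
    # Stand-in for check_network_convergence: returns (first, last, area).
    curve = [1.0, 0.8, 0.7] if use_parallel_executor else [1.0, 0.8, 0.700001]
    area = 0.5 * curve[0] + sum(curve[1:-1]) + 0.5 * curve[-1]
    return curve[0], curve[-1], area

check_func = partial(fake_convergence, use_parallel_executor=False)

_, _, area_parallel = fake_convergence("seresnext")
_, _, area_single = check_func("seresnext")

# Mirrors the area check in _compare_result_with_origin_model with delta2=1e-5.
np.testing.assert_allclose(area_parallel, area_single, rtol=1e-5)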
......@@ -20,11 +20,12 @@ import paddle.fluid.core as core
class TestResnetWithReduceBase(TestParallelExecutorBase):
def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
all_reduce_first_loss, all_reduce_last_loss, _ = self.check_network_convergence(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
......@@ -32,7 +33,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
use_device=use_device,
use_reduce=False,
optimizer=seresnext_net.optimizer)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
reduce_first_loss, reduce_last_loss, _ = self.check_network_convergence(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
......@@ -49,7 +50,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
if not use_device:
return
all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
all_reduce_first_loss_seq, all_reduce_last_loss_seq, _ = self.check_network_convergence(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
......@@ -59,7 +60,7 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
optimizer=seresnext_net.optimizer,
enable_sequential_execution=True)
reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
reduce_first_loss_seq, reduce_last_loss_seq, _ = self.check_network_convergence(
seresnext_net.model,
feed_dict=seresnext_net.feed_dict(use_device),
iter=seresnext_net.iter(use_device),
......@@ -86,9 +87,10 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
class TestResnetWithReduceCPU(TestResnetWithReduceBase):
def test_seresnext_with_reduce(self):
self._compare_reduce_and_allreduce(
use_device=DeviceType.CPU, delta2=1e-3)
self._compare_reduce_and_allreduce(use_device=DeviceType.CPU,
delta2=1e-3)
if __name__ == '__main__':
......