未验证 提交 610c6442 编写于 作者: C chengduo 提交者: GitHub

Make test_parallel_executor_seresnet.py Faster (#16701)

* slimming test_parallel_executor_seresnet.py
上级 112f1614
...@@ -29,7 +29,8 @@ __all__ = ['TestParallelExecutorBase'] ...@@ -29,7 +29,8 @@ __all__ = ['TestParallelExecutorBase']
class TestParallelExecutorBase(unittest.TestCase): class TestParallelExecutorBase(unittest.TestCase):
def check_network_convergence(self, @classmethod
def check_network_convergence(cls,
method, method,
use_cuda=True, use_cuda=True,
memory_opt=True, memory_opt=True,
......
...@@ -29,7 +29,7 @@ import unittest ...@@ -29,7 +29,7 @@ import unittest
import math import math
import numpy as np import numpy as np
from functools import partial from functools import partial
os.environ['CPU_NUM'] = str(4)
# FIXME(zcd): If the neural net has dropout_op, the outputs of ParallelExecutor # FIXME(zcd): If the neural net has dropout_op, the outputs of ParallelExecutor
# and Executor are different, because for ParallelExecutor the dropout_op of # and Executor are different, because for ParallelExecutor the dropout_op of
# the neural net is copied N times (N is the number of devices). This will # the neural net is copied N times (N is the number of devices). This will
...@@ -113,7 +113,6 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): ...@@ -113,7 +113,6 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu') return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
batch_size = 12
img_shape = [3, 224, 224] img_shape = [3, 224, 224]
...@@ -181,43 +180,84 @@ def optimizer(learning_rate=0.01): ...@@ -181,43 +180,84 @@ def optimizer(learning_rate=0.01):
return optimizer return optimizer
class TestResnet(TestParallelExecutorBase): def _batch_size():
@classmethod return 12
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
global remove_dropout def _iter(use_cuda):
global remove_bn if use_cuda:
remove_dropout = False return 10
remove_bn = False return 2
gpu_img, gpu_label = init_data(
batch_size=_batch_size(), img_shape=img_shape, label_range=999)
cpu_img, cpu_label = init_data(
batch_size=_batch_size(), img_shape=img_shape, label_range=999)
feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
model = SE_ResNeXt50Small
def _feed_dict(use_cuda):
if use_cuda:
return feed_dict_gpu
return feed_dict_cpu
def _compare_reduce_and_allreduce(self,
def _get_result_of_origin_model(use_cuda):
global remove_bn
global remove_dropout
remove_bn = True
remove_dropout = True
first_loss, last_loss = TestParallelExecutorBase.check_network_convergence(
model, model,
use_cuda, feed_dict=_feed_dict(use_cuda),
iter=20, iter=_iter(use_cuda),
delta2=1e-5): batch_size=_batch_size(),
use_cuda=use_cuda,
use_reduce=False,
optimizer=optimizer)
return first_loss, last_loss
origin_cpu_first_loss, origin_cpu_last_loss = _get_result_of_origin_model(False)
if core.is_compiled_with_cuda():
origin_gpu_first_loss, origin_gpu_last_loss = _get_result_of_origin_model(
True)
def _get_origin_result(use_cuda):
if use_cuda:
assert core.is_compiled_with_cuda(), "Doesn't compiled with CUDA."
return origin_gpu_first_loss, origin_gpu_last_loss
return origin_cpu_first_loss, origin_cpu_last_loss
class TestResnet(TestParallelExecutorBase):
def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
global remove_bn global remove_bn
global remove_dropout
remove_bn = True remove_bn = True
remove_dropout = True
img, label = init_data(
batch_size=batch_size, img_shape=img_shape, label_range=999)
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict=_feed_dict(use_cuda),
"label": label}, iter=_iter(use_cuda),
iter=iter, batch_size=_batch_size(),
batch_size=batch_size,
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=False, use_reduce=False,
optimizer=optimizer) optimizer=optimizer)
reduce_first_loss, reduce_last_loss = self.check_network_convergence( reduce_first_loss, reduce_last_loss = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict=_feed_dict(use_cuda),
"label": label}, iter=_iter(use_cuda),
iter=iter, batch_size=_batch_size(),
batch_size=batch_size,
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=True, use_reduce=True,
optimizer=optimizer) optimizer=optimizer)
...@@ -232,10 +272,9 @@ class TestResnet(TestParallelExecutorBase): ...@@ -232,10 +272,9 @@ class TestResnet(TestParallelExecutorBase):
all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence( all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict=_feed_dict(use_cuda),
"label": label}, iter=_iter(use_cuda),
iter=iter, batch_size=_batch_size(),
batch_size=batch_size,
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=False, use_reduce=False,
optimizer=optimizer, optimizer=optimizer,
...@@ -243,10 +282,9 @@ class TestResnet(TestParallelExecutorBase): ...@@ -243,10 +282,9 @@ class TestResnet(TestParallelExecutorBase):
reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict=_feed_dict(use_cuda),
"label": label}, iter=_iter(use_cuda),
iter=iter, batch_size=_batch_size(),
batch_size=batch_size,
use_cuda=use_cuda, use_cuda=use_cuda,
use_reduce=True, use_reduce=True,
optimizer=optimizer, optimizer=optimizer,
...@@ -267,37 +305,28 @@ class TestResnet(TestParallelExecutorBase): ...@@ -267,37 +305,28 @@ class TestResnet(TestParallelExecutorBase):
for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
def _check_resnet_convergence(self, def _compare_result_with_origin_model(self,
model, get_origin_result,
check_func_1,
check_func_2, check_func_2,
use_cuda, use_cuda,
iter=20,
delta2=1e-5, delta2=1e-5,
compare_seperately=True): compare_seperately=True,
rm_drop_out=False,
rm_bn=False):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
global remove_dropout
global remove_bn global remove_bn
remove_dropout = True global remove_dropout
remove_bn = True remove_bn = rm_bn or use_cuda
remove_dropout = rm_drop_out
img, label = init_data( func_1_first_loss, func_1_last_loss = get_origin_result(use_cuda)
batch_size=batch_size, img_shape=img_shape, label_range=999)
func_1_first_loss, func_1_last_loss = check_func_1(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda)
func_2_first_loss, func_2_last_loss = check_func_2( func_2_first_loss, func_2_last_loss = check_func_2(
model, model,
feed_dict={"image": img, feed_dict=_feed_dict(use_cuda),
"label": label}, iter=_iter(use_cuda),
iter=iter, batch_size=_batch_size(),
batch_size=batch_size,
use_cuda=use_cuda) use_cuda=use_cuda)
if compare_seperately: if compare_seperately:
...@@ -311,97 +340,55 @@ class TestResnet(TestParallelExecutorBase): ...@@ -311,97 +340,55 @@ class TestResnet(TestParallelExecutorBase):
self.assertAlmostEquals( self.assertAlmostEquals(
np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2) np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
def _compare_with_fused_all_reduce(self,
model,
use_cuda,
iter=20,
delta2=1e-5):
if use_cuda and not core.is_compiled_with_cuda():
return
global remove_bn
remove_bn = True
img, label = init_data(
batch_size=batch_size, img_shape=img_shape, label_range=999)
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
fuse_all_reduce_ops=False,
optimizer=optimizer)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
fuse_all_reduce_ops=True,
optimizer=optimizer)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
def test_seresnext_with_reduce(self): def test_seresnext_with_reduce(self):
self._compare_reduce_and_allreduce( self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)
self._compare_reduce_and_allreduce(
model=SE_ResNeXt50Small, use_cuda=False, iter=5)
def test_seresnext_with_fused_all_reduce(self):
self._compare_with_fused_all_reduce(
model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3)
self._compare_with_fused_all_reduce(
model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
def test_seresnext_with_learning_rate_decay(self): def test_seresnext_with_learning_rate_decay(self):
check_func_1 = partial( # NOTE(zcd): This test compares the results of using parallel_executor and executor.
self.check_network_convergence, # The outputs of the drop_out op and batch_norm op differ between these two executors,
optimizer=optimizer, # so both ops should be removed from the model.
use_parallel_executor=True) check_func_1 = _get_origin_result
check_func_2 = partial( check_func_2 = partial(
self.check_network_convergence, self.check_network_convergence,
optimizer=optimizer, optimizer=optimizer,
use_parallel_executor=False) use_parallel_executor=False)
self._check_resnet_convergence( self._compare_result_with_origin_model(
SE_ResNeXt50Small,
check_func_1,
check_func_2,
use_cuda=True,
compare_seperately=False)
self._check_resnet_convergence(
SE_ResNeXt50Small,
check_func_1, check_func_1,
check_func_2, check_func_2,
use_cuda=False, use_cuda=False,
rm_drop_out=True,
rm_bn=True,
compare_seperately=False, compare_seperately=False,
iter=2,
delta2=1e-3) delta2=1e-3)
self._compare_result_with_origin_model(
check_func_1,
check_func_2,
use_cuda=True,
rm_drop_out=True,
rm_bn=True,
compare_seperately=False)
def test_seresnext_with_fused_optimizer_ops(self): def test_seresnext_with_fused_all_reduce(self):
check_func_1 = partial( # NOTE(zcd): In order to make the program faster,
self.check_network_convergence, fuse_all_optimizer_ops=False) # this unit test removes drop_out and batch_norm.
check_func_1 = _get_origin_result
check_func_2 = partial( check_func_2 = partial(
self.check_network_convergence, fuse_all_optimizer_ops=True) self.check_network_convergence,
# TODO(zcd): this test fails randomly; I will fix it in the next PR. optimizer=optimizer,
# self._check_resnet_convergence( fuse_all_reduce_ops=True)
# SE_ResNeXt50Small, self._compare_result_with_origin_model(
# check_func_1,
# check_func_2,
# use_cuda=True,
# delta2=1e-3)
self._check_resnet_convergence(
SE_ResNeXt50Small,
check_func_1, check_func_1,
check_func_2, check_func_2,
use_cuda=False, use_cuda=False,
iter=2, rm_drop_out=True,
rm_bn=True)
self._compare_result_with_origin_model(
check_func_1,
check_func_2,
use_cuda=True,
rm_drop_out=True,
rm_bn=True,
delta2=1e-3) delta2=1e-3)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册