Unverified commit f63368db, authored by Wu Yi, committed by GitHub

Add async dist tests (#12798)

* add async dist tests

* update delta

* fix transformer test

* refine rmsprop transpile

* update

* fix dist seresnet
Parent 9f332222
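
For context, here is a minimal sketch (not part of this commit; the toy network, endpoints, and ports below are placeholders) of how the new sync_mode flag is meant to flow: the test scripts now take an extra [sync_mode] command-line argument and forward it to DistributeTranspiler.transpile, which builds either a synchronous or an asynchronous trainer/pserver program.

# Hypothetical, self-contained sketch mirroring what run_trainer/run_pserver
# in test_dist_base.py do after this change. The model below is illustrative
# only and is not code from this commit.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_pred = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_pred, label=y))
fluid.optimizer.SGD(learning_rate=0.001).minimize(cost)

sync_mode = False  # what the new [sync_mode] CLI argument toggles
t = fluid.DistributeTranspiler()
t.transpile(
    trainer_id=0,
    program=fluid.default_main_program(),
    pservers="127.0.0.1:9123,127.0.0.1:9124",
    trainers=2,
    sync_mode=sync_mode)
trainer_prog = t.get_trainer_program()                  # run on each trainer
pserver_prog = t.get_pserver_program("127.0.0.1:9123")  # run on each pserver

In async mode the pserver applies each trainer's gradients as they arrive instead of aggregating them per step, which is why the new test cases below accept a much larger delta against the local baseline.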
@@ -46,7 +46,8 @@ def cnn_model(data):
         pool_size=2,
         pool_stride=2,
         act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant()))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.3)))
     conv_pool_2 = fluid.nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
@@ -54,7 +55,8 @@ def cnn_model(data):
         pool_size=2,
         pool_stride=2,
         act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant()))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.2)))
     SIZE = 10
     input_shape = conv_pool_2.shape
@@ -66,8 +68,7 @@ def cnn_model(data):
         size=SIZE,
         act="softmax",
         param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale, seed=1)))
+            initializer=fluid.initializer.Constant(value=0.1)))
     return predict
...
@@ -129,7 +129,12 @@ class SE_ResNeXt():
             input=conv, pool_size=7, pool_type='avg', global_pooling=True)
         drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
         stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
-        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        out = fluid.layers.fc(
+            input=drop,
+            size=class_dim,
+            act='softmax',
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.2)))
         return out

     def shortcut(self, input, ch_out, stride):
@@ -179,7 +184,7 @@ class SE_ResNeXt():
             act=None,
             # avoid pserver CPU init differs from GPU
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant()),
+                initializer=fluid.initializer.Constant(value=0.2)),
             bias_attr=False)
         return fluid.layers.batch_norm(input=conv, act=act)
@@ -228,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase):
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

         optimizer = fluid.optimizer.Momentum(
-            # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
-            #learning_rate=fluid.layers.piecewise_decay(
-            #    boundaries=bd, values=lr),
-            learning_rate=base_lr,
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
             momentum=0.9,
             regularization=fluid.regularizer.L2Decay(1e-4))
         optimizer.minimize(avg_cost)
...
@@ -265,9 +265,9 @@ def main(role="pserver",

 if __name__ == "__main__":
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
         print(
-            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
         )
     role = sys.argv[1]
     endpoints = sys.argv[2]
@@ -275,6 +275,8 @@ if __name__ == "__main__":
     current_endpoint = sys.argv[4]
     trainers = int(sys.argv[5])
     is_dist = True if sys.argv[6] == "TRUE" else False
+    # FIXME(typhoonzero): refine this test.
+    is_async = True if sys.argv[7] == "TRUE" else False
     main(
         role=role,
         endpoints=endpoints,
...
@@ -30,7 +30,7 @@ class TestDistRunnerBase(object):
             "get_model should be implemented by child classes.")

     def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
-                       trainers):
+                       trainers, sync_mode):
         # NOTE: import fluid until runtime, or else forking processes will cause error.
         import paddle
         import paddle.fluid as fluid
@@ -39,17 +39,22 @@ class TestDistRunnerBase(object):
             trainer_id=trainer_id,
             program=main_program,
             pservers=pserver_endpoints,
-            trainers=trainers)
+            trainers=trainers,
+            sync_mode=sync_mode)
         return t

-    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
-                    trainer_id):
+    def run_pserver(self,
+                    pserver_endpoints,
+                    trainers,
+                    current_endpoint,
+                    trainer_id,
+                    sync_mode=True):
         import paddle
         import paddle.fluid as fluid
         self.get_model(batch_size=2)
         t = self.get_transpiler(trainer_id,
                                 fluid.default_main_program(), pserver_endpoints,
-                                trainers)
+                                trainers, sync_mode)
         pserver_prog = t.get_pserver_program(current_endpoint)
         startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
         place = fluid.CPUPlace()
@@ -57,7 +62,13 @@ class TestDistRunnerBase(object):
         exe.run(startup_prog)
         exe.run(pserver_prog)

-    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+    def run_trainer(self,
+                    place,
+                    endpoints,
+                    trainer_id,
+                    trainers,
+                    is_dist=True,
+                    sync_mode=True):
         import paddle
         import paddle.fluid as fluid
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
@@ -65,7 +76,7 @@ class TestDistRunnerBase(object):
         if is_dist:
             t = self.get_transpiler(trainer_id,
                                     fluid.default_main_program(), endpoints,
-                                    trainers)
+                                    trainers, sync_mode)
             trainer_prog = t.get_trainer_program()
         else:
             trainer_prog = fluid.default_main_program()
@@ -106,9 +117,9 @@ def runtime_main(test_class):
     import paddle.fluid as fluid
     import paddle.fluid.core as core
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
         print(
-            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
         )
     role = sys.argv[1]
     endpoints = sys.argv[2]
@@ -116,34 +127,43 @@ def runtime_main(test_class):
     current_endpoint = sys.argv[4]
     trainers = int(sys.argv[5])
     is_dist = True if sys.argv[6] == "TRUE" else False
+    sync_mode = True if sys.argv[7] == "TRUE" else False

     model = test_class()
     if role == "pserver":
-        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id,
+                          sync_mode)
     else:
         p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist,
+                          sync_mode)


 import paddle.compat as cpt


 class TestDistBase(unittest.TestCase):
+    def _setup_config(self):
+        raise NotImplementedError("tests should have _setup_config implemented")
+
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
         self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
         self._python_interp = "python"
+        self._sync_mode = True
+        self._setup_config()

     def start_pserver(self, model_file, check_error_log):
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
-        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+             self._trainers, sync_mode_str)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)

         ps0_pipe = subprocess.PIPE
         ps1_pipe = subprocess.PIPE
@@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase):
         # Run local to get a base line
         env_local = {"CUDA_VISIBLE_DEVICES": "0"}
         env_local.update(required_envs)
-        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \
            (self._python_interp, model_file,
-             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+             "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str)
         if not check_error_log:
             local_proc = subprocess.Popen(
                 local_cmd.split(" "),
@@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase):
         self._wait_ps_ready(ps1.pid)

         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)
-        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)

         env0 = {"CUDA_VISIBLE_DEVICES": "0"}
         env1 = {"CUDA_VISIBLE_DEVICES": "1"}
...
@@ -17,10 +17,21 @@ import unittest
 from test_dist_base import TestDistBase


-class TestDistSeResneXt2x2(TestDistBase):
+class TestDistMnist2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_mnist.py", delta=1e-7)


+class TestDistMnistAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_mnist.py", delta=200)
+
+
 if __name__ == "__main__":
     unittest.main()
@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase


 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_se_resnext.py", delta=1e-7)


+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_se_resnext.py", delta=100)
+
+
 if __name__ == "__main__":
     unittest.main()
@@ -19,6 +19,9 @@ from test_dist_base import TestDistBase


 class TestDistTransformer2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_transformer(self):
         # TODO(paddle-dev): check if the delta is OK.
         # Usually start around ~8000 and converge to ~5000
...
@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase


 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_word2vec.py", delta=1e-7)


+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_word2vec.py", delta=1)
+
+
 if __name__ == "__main__":
     unittest.main()
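
Taken together, the new asynchronous cases above all follow one pattern; here it is in one place as a sketch (the class name, model file name, and delta are placeholders, not an additional test in this commit):

import unittest

from test_dist_base import TestDistBase


class TestDistSomeModelAsync(TestDistBase):
    def _setup_config(self):
        # transpile and run the distributed program in asynchronous mode
        self._sync_mode = False

    def test_some_model(self):
        # async SGD is not bit-wise reproducible, so the tolerated gap
        # between the local and distributed loss is much larger than the
        # 1e-7 used by the synchronous cases
        self.check_with_place("dist_some_model.py", delta=100)


if __name__ == "__main__":
    unittest.main()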
@@ -1209,8 +1209,8 @@ class DistributeTranspiler(object):
         elif op_type == "momentum":
             if varkey == "Velocity":
                 return param_shape
-        elif op_type == "":
-            if varkey == "Moment":
+        elif op_type == "rmsprop":
+            if varkey in ["Moment", "MeanSquare"]:
                 return param_shape
         elif op_type == "sgd":
             pass
@@ -1289,8 +1289,6 @@ class DistributeTranspiler(object):
         pserver_block = program.global_block()
         new_inputs = collections.OrderedDict()
-        # update param/grad shape first, then other inputs like
-        # moment can use the updated shape

         def _get_param_block(opt_op):
             # param is already created on global program
             param_block = None
@@ -1303,22 +1301,6 @@ class DistributeTranspiler(object):
         for key in opt_op.input_names:
             if key == "Grad":
                 new_inputs[key] = merged_var
-            # For RMSProp optimizer
-            elif key == "Moment" or key == "MeanSquare":
-                param_block = _get_param_block(opt_op)
-                if not param_block:
-                    return
-                moment_var = origin_program.global_block().vars[opt_op.input(
-                    key)[0]]
-                tmpvar = pserver_block.create_var(
-                    name=moment_var.name,
-                    persistable=moment_var.persistable,
-                    dtype=moment_var.dtype,
-                    # change to use same shape as param
-                    # TODO(typhoonzero): didn't append .block in the var name,
-                    # may affect checkpoint saving? Need to verify.
-                    shape=param_block.shape)
-                new_inputs[key] = tmpvar
             elif key == "Param":
                 param_block = _get_param_block(opt_op)
                 if not param_block:
@@ -1346,7 +1328,7 @@ class DistributeTranspiler(object):
         for key in opt_op.input_names:
             new_shape = None
-            if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
+            if key in ["Param", "Grad", "LearningRate"]:
                 continue
             var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             # update accumulator variable shape
...
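
Why the rmsprop branch above matters: the RMSProp op keeps Moment and MeanSquare accumulators for every parameter, and on a parameter server each accumulator has to take the shape of the parameter slice stored on that pserver, which is what the new elif op_type == "rmsprop" case returns. A minimal sketch that exercises this path (assumed setup; the network and endpoints are placeholders, not code from this commit):

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
loss = fluid.layers.mean(
    fluid.layers.cross_entropy(input=prediction, label=label))

# RMSPropOptimizer creates the Moment and MeanSquare accumulator variables
fluid.optimizer.RMSPropOptimizer(learning_rate=0.001).minimize(loss)

t = fluid.DistributeTranspiler()
t.transpile(
    trainer_id=0,
    pservers="127.0.0.1:9123,127.0.0.1:9124",
    trainers=2)
# Each pserver program holds a slice of the fc weight plus Moment and
# MeanSquare variables reshaped to match that slice.
pserver_prog = t.get_pserver_program("127.0.0.1:9123")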