未验证 提交 2b6a5793 编写于 作者: S ShenLiang 提交者: GitHub

remove auto mode from localsgd optimizer (#27237)

* rm auto from localsgd
上级 cc3f4b81
...@@ -36,7 +36,10 @@ message AMPConfig { ...@@ -36,7 +36,10 @@ message AMPConfig {
repeated string custom_black_varnames = 9; repeated string custom_black_varnames = 9;
} }
message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; } message LocalSGDConfig {
optional int32 k_steps = 1 [ default = 1 ];
optional int32 begin_step = 2 [ default = 1 ];
}
message GradientMergeConfig { message GradientMergeConfig {
optional int32 k_steps = 1 [ default = 1 ]; optional int32 k_steps = 1 [ default = 1 ];
......
...@@ -707,11 +707,7 @@ class DistributedStrategy(object): ...@@ -707,11 +707,7 @@ class DistributedStrategy(object):
**Notes**: **Notes**:
k_steps(int) The local steps for training before parameter synchronization. Default 1. k_steps(int) The local steps for training before parameter synchronization. Default 1.
begin_step(int) The step of beginning training by localsgd. Default 1.
If strategy.auto is set True, the local steps will be calculated automatically during training.
The algorithm is referenced in this paper:
`Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
In this case, k_steps indicates the initial number of local steps, which is suggested to be set to 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -719,7 +715,8 @@ class DistributedStrategy(object): ...@@ -719,7 +715,8 @@ class DistributedStrategy(object):
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.localsgd = True strategy.localsgd = True
strategy.localsgd_configs = {"k_steps": 4} strategy.localsgd_configs = {"k_steps": 4,
"begin_step": 30}
""" """
return get_msg_dict(self.strategy.localsgd_configs) return get_msg_dict(self.strategy.localsgd_configs)
......
...@@ -49,7 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): ...@@ -49,7 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
def _enable_strategy(self, dist_strategy, context): def _enable_strategy(self, dist_strategy, context):
dist_strategy.localsgd = True dist_strategy.localsgd = True
dist_strategy.localsgd_configs = {"k_steps": 1} dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1}
def snapshot_name(self, param_name): def snapshot_name(self, param_name):
return param_name + self.snapshot_key return param_name + self.snapshot_key
...@@ -86,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase): ...@@ -86,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
minimized = self.inner_opt.minimize( minimized = self.inner_opt.minimize(
loss, startup_program=startup_program) loss, startup_program=startup_program)
init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps'] k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
auto_steps = self.user_defined_strategy.auto begin_step_value = self.user_defined_strategy.localsgd_configs[
'begin_step']
if startup_program is None: if startup_program is None:
startup_program = default_startup_program() startup_program = default_startup_program()
...@@ -101,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase): ...@@ -101,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase):
p2s = self.create_snapshot_vars(main_block.program) p2s = self.create_snapshot_vars(main_block.program)
with program_guard(main_block.program, startup_program): with program_guard(main_block.program, startup_program):
step = layers.autoincreased_step_counter(begin=0) step = layers.autoincreased_step_counter(begin=1)
k_steps = layers.create_global_var( k_steps = layers.create_global_var(
name="k_steps", name="k_steps",
shape=[1], shape=[1],
value=init_k_steps, value=k_steps_value,
dtype='int64', dtype='int64',
persistable=True) persistable=True)
last_step = layers.create_global_var(
name="last_step", begin_step = layers.create_global_var(
name="begin_step",
shape=[1], shape=[1],
value=int(0), value=begin_step_value,
dtype='int64', dtype='int64',
persistable=True) persistable=True)
if auto_steps: last_step = layers.create_global_var(
avg_loss = layers.collective._c_allreduce( name="last_step",
loss) / self.role_maker.worker_num()
lr_0 = layers.create_global_var(
name="lr_0",
shape=[1],
value=float(0),
dtype='float32',
persistable=True)
loss_0 = layers.create_global_var(
name="loss_0",
shape=[1], shape=[1],
value=float(0), value=begin_step_value,
dtype='float32', dtype='int64',
persistable=True) persistable=True)
global_lr = self.inner_opt._global_learning_rate()
def initialize():
layers.assign(loss, loss_0)
layers.assign(global_lr, lr_0)
layers.cond(step == 0, initialize)
def communicate(): def communicate():
sub_block = default_main_program().current_block() sub_block = default_main_program().current_block()
ring_id = -1 ring_id = -1
...@@ -195,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase): ...@@ -195,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase):
inputs={'X': [param]}, inputs={'X': [param]},
outputs={'Out': [snapshot]}, outputs={'Out': [snapshot]},
attrs={OP_ROLE_KEY: OpRole.Optimize}) attrs={OP_ROLE_KEY: OpRole.Optimize})
if auto_steps:
next_local_steps = layers.cast(
layers.ceil(
layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
float(init_k_steps))),
dtype='int64')
max_local_steps = layers.fill_constant(
shape=[1], dtype='int64', value=16)
next_local_steps = layers.elementwise_min(next_local_steps,
max_local_steps)
layers.assign(next_local_steps, k_steps)
layers.assign(step, last_step) layers.assign(step, last_step)
def begin_localsgd():
layers.cond(step - last_step == k_steps, communicate) layers.cond(step - last_step == k_steps, communicate)
layers.cond(step > begin_step, begin_localsgd, communicate)
return minimized return minimized
...@@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase): ...@@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase):
def test_localsgd_configs(self): def test_localsgd_configs(self):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
configs = {"k_steps": 4} configs = {"k_steps": 4, "begin_step": 120}
strategy.localsgd_configs = configs strategy.localsgd_configs = configs
self.assertEqual(strategy.localsgd_configs["k_steps"], 4) self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
def test_dgc(self): def test_dgc(self):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
...@@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase): ...@@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase):
strategy.a_sync = True strategy.a_sync = True
strategy.localsgd = True strategy.localsgd = True
strategy.dgc = True strategy.dgc = True
localsgd_configs = {"k_steps": 5} localsgd_configs = {"k_steps": 5, "begin_step": 1}
strategy.localsgd_configs = localsgd_configs strategy.localsgd_configs = localsgd_configs
build_strategy = paddle.fluid.BuildStrategy() build_strategy = paddle.fluid.BuildStrategy()
build_strategy.enable_sequential_execution = True build_strategy.enable_sequential_execution = True
......
...@@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): ...@@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
strategy.auto = True strategy.auto = True
config = strategy.localsgd_configs config = strategy.localsgd_configs
config['k_steps'] = 1 config['k_steps'] = 1
config['begin_step'] = 1
strategy.localsgd_configs = config strategy.localsgd_configs = config
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册