Unverified commit 2b6a5793, authored by S ShenLiang, committed by GitHub

remove auto mode from localsgd optimizer (#27237)

* rm auto from localsgd
Parent: cc3f4b81
@@ -36,7 +36,10 @@ message AMPConfig {
   repeated string custom_black_varnames = 9;
 }
 
-message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+message LocalSGDConfig {
+  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}
 
 message GradientMergeConfig {
   optional int32 k_steps = 1 [ default = 1 ];
......
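For context on how the two LocalSGDConfig fields above are meant to be used, here is a minimal Python sketch that sets them through the fleet DistributedStrategy, mirroring the docstring example further down in this diff; the concrete values are illustrative only.

# --- sketch (not part of this diff) ---
# Setting the two LocalSGDConfig fields from user code via DistributedStrategy.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.localsgd = True
strategy.localsgd_configs = {
    "k_steps": 4,      # synchronize parameters every 4 local steps
    "begin_step": 30,  # run ordinary synchronous training for the first 30 steps
}
# Keys omitted from the dict keep the proto defaults above (both default to 1).
# --- end sketch ---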
@@ -707,11 +707,7 @@ class DistributedStrategy(object):
 
         **Notes**:
             k_steps(int) The local steps for training before parameter synchronization. Default 1.
-            If strategy.auto is set True, the local steps will be calculated automatically during training.
-            The algorithm is referenced in this paper:
-            `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
-            In this case, k_steps indicates the first local steps which is suggested setting to 1.
+            begin_step(int) The step at which training with localsgd begins. Default 1.
 
         Examples:
 
           .. code-block:: python
@@ -719,7 +715,8 @@ class DistributedStrategy(object):
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.localsgd = True
-            strategy.localsgd_configs = {"k_steps": 4}
+            strategy.localsgd_configs = {"k_steps": 4,
+                                         "begin_step": 30}
         """
         return get_msg_dict(self.strategy.localsgd_configs)
......
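Because the getter returns the protobuf message as a plain dict (via get_msg_dict), the configured values can be read back with ordinary indexing, as the unit tests below rely on. A short usage sketch, assuming the strategy object from the docstring example:

# --- sketch (not part of this diff) ---
strategy.localsgd_configs = {"k_steps": 4, "begin_step": 30}
configs = strategy.localsgd_configs   # plain dict built by get_msg_dict(...)
assert configs["k_steps"] == 4
assert configs["begin_step"] == 30
# --- end sketch ---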
@@ -49,7 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
     def _enable_strategy(self, dist_strategy, context):
         dist_strategy.localsgd = True
-        dist_strategy.localsgd_configs = {"k_steps": 1}
+        dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1}
 
     def snapshot_name(self, param_name):
         return param_name + self.snapshot_key
@@ -86,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         minimized = self.inner_opt.minimize(
             loss, startup_program=startup_program)
 
-        init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps']
-        auto_steps = self.user_defined_strategy.auto
+        k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
+        begin_step_value = self.user_defined_strategy.localsgd_configs[
+            'begin_step']
 
         if startup_program is None:
             startup_program = default_startup_program()
@@ -101,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         p2s = self.create_snapshot_vars(main_block.program)
 
         with program_guard(main_block.program, startup_program):
-            step = layers.autoincreased_step_counter(begin=0)
+            step = layers.autoincreased_step_counter(begin=1)
             k_steps = layers.create_global_var(
                 name="k_steps",
                 shape=[1],
-                value=init_k_steps,
+                value=k_steps_value,
                 dtype='int64',
                 persistable=True)
+            begin_step = layers.create_global_var(
+                name="begin_step",
+                shape=[1],
+                value=begin_step_value,
+                dtype='int64',
+                persistable=True)
             last_step = layers.create_global_var(
                 name="last_step",
                 shape=[1],
-                value=int(0),
+                value=begin_step_value,
                 dtype='int64',
                 persistable=True)
 
-            if auto_steps:
-                avg_loss = layers.collective._c_allreduce(
-                    loss) / self.role_maker.worker_num()
-                lr_0 = layers.create_global_var(
-                    name="lr_0",
-                    shape=[1],
-                    value=float(0),
-                    dtype='float32',
-                    persistable=True)
-                loss_0 = layers.create_global_var(
-                    name="loss_0",
-                    shape=[1],
-                    value=float(0),
-                    dtype='float32',
-                    persistable=True)
-                global_lr = self.inner_opt._global_learning_rate()
-
-                def initialize():
-                    layers.assign(loss, loss_0)
-                    layers.assign(global_lr, lr_0)
-
-                layers.cond(step == 0, initialize)
-
             def communicate():
                 sub_block = default_main_program().current_block()
                 ring_id = -1
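The body of communicate() is largely collapsed in this hunk. Conceptually, one LocalSGD synchronization averages every parameter across workers and refreshes the per-parameter snapshot; the numpy sketch below illustrates only that idea and is not the op sequence the optimizer actually appends (the real code builds collective all-reduce and assign ops into the program).

# --- sketch (not part of this diff) ---
# Simulate what one LocalSGD "communicate" step achieves, using plain numpy.
import numpy as np

def communicate(worker_params):
    """worker_params: one dict of {name: ndarray} per worker."""
    nranks = len(worker_params)
    averaged = {name: np.mean([p[name] for p in worker_params], axis=0)
                for name in worker_params[0]}
    # Every worker continues from the averaged weights; the snapshot
    # (the value remembered at the last sync) is refreshed to the same tensors.
    new_params = [dict(averaged) for _ in range(nranks)]
    new_snapshots = [dict(averaged) for _ in range(nranks)]
    return new_params, new_snapshots
# --- end sketch ---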
@@ -195,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                         inputs={'X': [param]},
                         outputs={'Out': [snapshot]},
                         attrs={OP_ROLE_KEY: OpRole.Optimize})
 
-                if auto_steps:
-                    next_local_steps = layers.cast(
-                        layers.ceil(
-                            layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
-                                        float(init_k_steps))),
-                        dtype='int64')
-                    max_local_steps = layers.fill_constant(
-                        shape=[1], dtype='int64', value=16)
-                    next_local_steps = layers.elementwise_min(next_local_steps,
-                                                              max_local_steps)
-                    layers.assign(next_local_steps, k_steps)
-
                 layers.assign(step, last_step)
 
-            layers.cond(step - last_step == k_steps, communicate)
+            def begin_localsgd():
+                layers.cond(step - last_step == k_steps, communicate)
+
+            layers.cond(step > begin_step, begin_localsgd, communicate)
 
         return minimized
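To make the new control flow concrete: while the step counter has not yet passed begin_step, every step takes the communicate branch (ordinary synchronous training); afterwards begin_localsgd only communicates when step - last_step == k_steps. The pure-Python rehearsal below is a sketch of that cond() logic, not graph code.

# --- sketch (not part of this diff) ---
def sync_steps(total_steps, k_steps, begin_step):
    synced, last_step = [], begin_step       # mirrors last_step's initial value
    for step in range(1, total_steps + 1):   # step counter begins at 1
        if step > begin_step:
            if step - last_step == k_steps:  # begin_localsgd branch
                synced.append(step)
                last_step = step
        else:                                # warm-up: communicate every step
            synced.append(step)
            last_step = step
    return synced

# begin_step=3, k_steps=4 -> syncs at steps 1, 2, 3 (warm-up), then 7, 11, 15
print(sync_steps(16, k_steps=4, begin_step=3))
# --- end sketch ---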
@@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase):
 
     def test_localsgd_configs(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
-        configs = {"k_steps": 4}
+        configs = {"k_steps": 4, "begin_step": 120}
        strategy.localsgd_configs = configs
         self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
+        self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
 
     def test_dgc(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase):
         strategy.a_sync = True
         strategy.localsgd = True
         strategy.dgc = True
-        localsgd_configs = {"k_steps": 5}
+        localsgd_configs = {"k_steps": 5, "begin_step": 1}
         strategy.localsgd_configs = localsgd_configs
         build_strategy = paddle.fluid.BuildStrategy()
         build_strategy.enable_sequential_execution = True
......
@@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
         strategy.auto = True
         config = strategy.localsgd_configs
         config['k_steps'] = 1
+        config['begin_step'] = 1
         strategy.localsgd_configs = config
 
         optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
......
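The unit test above only adjusts the config dict; for completeness, here is a hedged end-to-end sketch of how LocalSGD is typically wired into a static-graph fleet job. The toy network, learning rate, and launch setup are illustrative assumptions rather than code from this PR, and the sketch is meant to run under paddle.distributed.launch with two or more GPUs.

# --- sketch (not part of this diff) ---
import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

# Placeholder network: two-class classifier over 32-dim features.
x = fluid.layers.data(name='x', shape=[32], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=x, size=2, act='softmax')
avg_cost = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=y))

strategy = fleet.DistributedStrategy()
strategy.localsgd = True
strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1}

optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
# --- end sketch ---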