diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index 8d0093388b484a5c16bd4c6a0d1aeae52bb200ab..edd1700ae7284c77883af6abd2cd7d511097685f --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -36,7 +36,10 @@ message AMPConfig { repeated string custom_black_varnames = 9; } -message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; } +message LocalSGDConfig { + optional int32 k_steps = 1 [ default = 1 ]; + optional int32 begin_step = 2 [ default = 1 ]; +} message GradientMergeConfig { optional int32 k_steps = 1 [ default = 1 ]; diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index d65be0dd4b141a48946fae86ff33957130d74db9..1b86056c00443be4170757cee3cc60bbafd0f40b 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -707,11 +707,7 @@ class DistributedStrategy(object): **Notes**: k_steps(int) The local steps for training before parameter synchronization. Default 1. - - If strategy.auto is set True, the local steps will be calculated automatically during training. - The algorithm is referenced in this paper: - `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. - In this case, k_steps indicates the first local steps which is suggested setting to 1. + begin_step(int) The step of beginning training by localsgd. Default 1. Examples: .. 
code-block:: python @@ -719,7 +715,8 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4} + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 4d33dfe74564225de701b4eb5092ce69d0dd88e1..6fa34d8d28a907d936500907db3e4c65ab4f4da8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -49,7 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): def _enable_strategy(self, dist_strategy, context): dist_strategy.localsgd = True - dist_strategy.localsgd_configs = {"k_steps": 1} + dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1} def snapshot_name(self, param_name): return param_name + self.snapshot_key @@ -86,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase): minimized = self.inner_opt.minimize( loss, startup_program=startup_program) - init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps'] - auto_steps = self.user_defined_strategy.auto + k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps'] + begin_step_value = self.user_defined_strategy.localsgd_configs[ + 'begin_step'] if startup_program is None: startup_program = default_startup_program() @@ -101,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase): p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = layers.autoincreased_step_counter(begin=0) + step = layers.autoincreased_step_counter(begin=1) k_steps = layers.create_global_var( name="k_steps", shape=[1], - value=init_k_steps, + value=k_steps_value, dtype='int64', persistable=True) + + 
begin_step = layers.create_global_var( + name="begin_step", + shape=[1], + value=begin_step_value, + dtype='int64', + persistable=True) + last_step = layers.create_global_var( name="last_step", shape=[1], - value=int(0), + value=begin_step_value, dtype='int64', persistable=True) - if auto_steps: - avg_loss = layers.collective._c_allreduce( - loss) / self.role_maker.worker_num() - - lr_0 = layers.create_global_var( - name="lr_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - loss_0 = layers.create_global_var( - name="loss_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - - global_lr = self.inner_opt._global_learning_rate() - - def initialize(): - layers.assign(loss, loss_0) - layers.assign(global_lr, lr_0) - - layers.cond(step == 0, initialize) - def communicate(): sub_block = default_main_program().current_block() ring_id = -1 @@ -195,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase): inputs={'X': [param]}, outputs={'Out': [snapshot]}, attrs={OP_ROLE_KEY: OpRole.Optimize}) - - if auto_steps: - next_local_steps = layers.cast( - layers.ceil( - layers.sqrt(lr_0 * loss / (global_lr * loss_0) * - float(init_k_steps))), - dtype='int64') - max_local_steps = layers.fill_constant( - shape=[1], dtype='int64', value=16) - next_local_steps = layers.elementwise_min(next_local_steps, - max_local_steps) - layers.assign(next_local_steps, k_steps) layers.assign(step, last_step) - layers.cond(step - last_step == k_steps, communicate) + def begin_localsgd(): + layers.cond(step - last_step == k_steps, communicate) + layers.cond(step > begin_step, begin_localsgd, communicate) return minimized diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 83db1b3355136fd3e9057f51e26fff2448a46259..6f8af3017efcb9010b129131a01c5ee071b5bc36 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase): def test_localsgd_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() - configs = {"k_steps": 4} + configs = {"k_steps": 4, "begin_step": 120} strategy.localsgd_configs = configs self.assertEqual(strategy.localsgd_configs["k_steps"], 4) + self.assertEqual(strategy.localsgd_configs["begin_step"], 120) def test_dgc(self): strategy = paddle.distributed.fleet.DistributedStrategy() @@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase): strategy.a_sync = True strategy.localsgd = True strategy.dgc = True - localsgd_configs = {"k_steps": 5} + localsgd_configs = {"k_steps": 5, "begin_step": 1} strategy.localsgd_configs = localsgd_configs build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 07b988bf8752057e68925bc42f564a72d466361d..945f5ae57454b2c4a509badb93574a6e03b607e8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): strategy.auto = True config = strategy.localsgd_configs config['k_steps'] = 1 + config['begin_step'] = 1 strategy.localsgd_configs = config optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)