remove auto mode from localsgd optimizer (#27237)

* rm auto from localsgd

remove auto mode from localsgd optimizer (#27237)
* rm auto from localsgd
2b6a5793 · ShenLiang · GitHub · cc3f4b81 · 2b6a5793 · 2b6a5793
5 changed files
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -36,7 +36,10 @@ message AMPConfig {
  repeated string custom_black_varnames = 9;
 }
-message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+message LocalSGDConfig {
+  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}
 message GradientMergeConfig {
  optional int32 k_steps = 1 [ default = 1 ];

--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -707,11 +707,7 @@ class DistributedStrategy(object):
        **Notes**:
            k_steps(int) The local steps for training before parameter synchronization. Default 1.
+            begin_step(int) The step at which training with localsgd begins. Default 1.
-            If strategy.auto is set True, the local steps will be calculated automatically during training.
-            The algorithm is referenced in this paper: 
-            `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
-            In this case, k_steps indicates the first local steps which is suggested setting to 1.
        Examples:
          .. code-block:: python
@@ -719,7 +715,8 @@ class DistributedStrategy(object):
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.localsgd = True
-            strategy.localsgd_configs = {"k_steps": 4}
+            strategy.localsgd_configs = {"k_steps": 4,
+                                         "begin_step": 30}
        """
        return get_msg_dict(self.strategy.localsgd_configs)

--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -49,7 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
    def _enable_strategy(self, dist_strategy, context):
        dist_strategy.localsgd = True
-        dist_strategy.localsgd_configs = {"k_steps": 1}
+        dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1}
    def snapshot_name(self, param_name):
        return param_name + self.snapshot_key
@@ -86,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
        minimized = self.inner_opt.minimize(
            loss, startup_program=startup_program)
-        init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps']
+        k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
-        auto_steps = self.user_defined_strategy.auto
+        begin_step_value = self.user_defined_strategy.localsgd_configs[
+            'begin_step']
        if startup_program is None:
            startup_program = default_startup_program()
@@ -101,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase):
        p2s = self.create_snapshot_vars(main_block.program)
        with program_guard(main_block.program, startup_program):
-            step = layers.autoincreased_step_counter(begin=0)
+            step = layers.autoincreased_step_counter(begin=1)
            k_steps = layers.create_global_var(
                name="k_steps",
                shape=[1],
-                value=init_k_steps,
+                value=k_steps_value,
                dtype='int64',
                persistable=True)
+            begin_step = layers.create_global_var(
+                name="begin_step",
+                shape=[1],
+                value=begin_step_value,
+                dtype='int64',
+                persistable=True)
            last_step = layers.create_global_var(
                name="last_step",
                shape=[1],
-                value=int(0),
+                value=begin_step_value,
                dtype='int64',
                persistable=True)
-            if auto_steps:
-                avg_loss = layers.collective._c_allreduce(
-                    loss) / self.role_maker.worker_num()
-                lr_0 = layers.create_global_var(
-                    name="lr_0",
-                    shape=[1],
-                    value=float(0),
-                    dtype='float32',
-                    persistable=True)
-                loss_0 = layers.create_global_var(
-                    name="loss_0",
-                    shape=[1],
-                    value=float(0),
-                    dtype='float32',
-                    persistable=True)
-                global_lr = self.inner_opt._global_learning_rate()
-                def initialize():
-                    layers.assign(loss, loss_0)
-                    layers.assign(global_lr, lr_0)
-                layers.cond(step == 0, initialize)
            def communicate():
                sub_block = default_main_program().current_block()
                ring_id = -1
@@ -195,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                        inputs={'X': [param]},
                        outputs={'Out': [snapshot]},
                        attrs={OP_ROLE_KEY: OpRole.Optimize})
-                if auto_steps:
-                    next_local_steps = layers.cast(
-                        layers.ceil(
-                            layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
-                                        float(init_k_steps))),
-                        dtype='int64')
-                    max_local_steps = layers.fill_constant(
-                        shape=[1], dtype='int64', value=16)
-                    next_local_steps = layers.elementwise_min(next_local_steps,
-                                                              max_local_steps)
-                    layers.assign(next_local_steps, k_steps)
                layers.assign(step, last_step)
-            layers.cond(step - last_step == k_steps, communicate)
+            def begin_localsgd():
+                layers.cond(step - last_step == k_steps, communicate)
+            layers.cond(step > begin_step, begin_localsgd, communicate)
        return minimized
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase):
    def test_localsgd_configs(self):
        strategy = paddle.distributed.fleet.DistributedStrategy()
-        configs = {"k_steps": 4}
+        configs = {"k_steps": 4, "begin_step": 120}
        strategy.localsgd_configs = configs
        self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
+        self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
    def test_dgc(self):
        strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase):
        strategy.a_sync = True
        strategy.localsgd = True
        strategy.dgc = True
-        localsgd_configs = {"k_steps": 5}
+        localsgd_configs = {"k_steps": 5, "begin_step": 1}
        strategy.localsgd_configs = localsgd_configs
        build_strategy = paddle.fluid.BuildStrategy()
        build_strategy.enable_sequential_execution = True

--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
        strategy.auto = True
        config = strategy.localsgd_configs
        config['k_steps'] = 1
+        config['begin_step'] = 1
        strategy.localsgd_configs = config
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)