From c70f592002ad8d479b1c10a1a3dc8a0530383194 Mon Sep 17 00:00:00 2001
From: mapingshuo
Date: Thu, 30 Jul 2020 14:43:21 +0800
Subject: [PATCH] add gradient Merge optimizer to meta (#25763)

* add gradient Merge optimizer to meta, test=develop
---
 .../paddle/fleet/base/distributed_strategy.py | 29 ++++++++++
 .../fleet/base/meta_optimizer_factory.py      |  7 ++-
 .../paddle/fleet/meta_optimizers/__init__.py  |  6 ++-
 .../gradient_merge_optimizer.py               | 53 +++++++++++++++++++
 python/paddle/fluid/optimizer.py              |  6 +++
 .../fluid/tests/unittests/CMakeLists.txt      |  2 +
 ...est_fleet_gradient_merge_meta_optimizer.py | 53 +++++++++++++++++++
 7 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py

diff --git a/python/paddle/fleet/base/distributed_strategy.py b/python/paddle/fleet/base/distributed_strategy.py
index bd49abd998b..21391130a6c 100644
--- a/python/paddle/fleet/base/distributed_strategy.py
+++ b/python/paddle/fleet/base/distributed_strategy.py
@@ -521,6 +521,23 @@ class DistributedStrategy(object):
 
     @property
     def gradient_merge(self):
+        """
+        Gradient Merge, also known as Gradient Accumulation,
+        is a strategy for training with large batch sizes. With this
+        strategy, the model parameters are not updated at every step.
+        At each step, the forward and backward networks run and the
+        gradients of the model parameters are accumulated; once every
+        k steps, the optimization network runs and applies a specific
+        optimization method (such as SGD or Adam) to the parameters.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.gradient_merge = True
+            strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
+        """
         return self.strategy.gradient_merge
 
     @gradient_merge.setter
@@ -532,6 +549,18 @@ class DistributedStrategy(object):
 
     @property
     def gradient_merge_configs(self):
+        """
+        The key-value configs of the gradient_merge strategy.
+        Keys:
+            k_steps (int): the update period of the parameters; gradients are accumulated for k_steps mini-batches before each update
+            avg (bool): whether to average the accumulated gradients of the k_steps mini-batches,
+                the default value is `True`
+        Example:
+            import paddle.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.gradient_merge = True
+            strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
+        """
         return get_msg_dict(self.strategy.gradient_merge_configs)
 
     @gradient_merge_configs.setter
diff --git a/python/paddle/fleet/base/meta_optimizer_factory.py b/python/paddle/fleet/base/meta_optimizer_factory.py
index 8d42c2a0c89..9b94ac51339 100644
--- a/python/paddle/fleet/base/meta_optimizer_factory.py
+++ b/python/paddle/fleet/base/meta_optimizer_factory.py
@@ -13,11 +13,16 @@
 # limitations under the License.
 
 from ..meta_optimizers import RecomputeOptimizer
+from ..meta_optimizers import GradientMergeOptimizer
 from ..meta_optimizers import GraphExecutionOptimizer
 
 __all__ = ["MetaOptimizerFactory"]
 
-meta_optimizer_names = ["RecomputeOptimizer", "GraphExecutionOptimizer"]
+meta_optimizer_names = [
+    "RecomputeOptimizer",
+    "GradientMergeOptimizer",
+    "GraphExecutionOptimizer",
+]
 
 
 class MetaOptimizerFactory(object):
diff --git a/python/paddle/fleet/meta_optimizers/__init__.py b/python/paddle/fleet/meta_optimizers/__init__.py
index 8a87a31e903..2133eba0810 100644
--- a/python/paddle/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/fleet/meta_optimizers/__init__.py
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 
 from .recompute_optimizer import RecomputeOptimizer
+from .gradient_merge_optimizer import GradientMergeOptimizer
 from .graph_execution_optimizer import GraphExecutionOptimizer
 
-__all__ = ['RecomputeOptimizer']
+__all__ = [
+    'RecomputeOptimizer',
+    'GradientMergeOptimizer',
+]
diff --git a/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
new file mode 100644
index 00000000000..668cf605def
--- /dev/null
+++ b/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle.fluid.optimizer import GradientMergeOptimizer as GM
+from .meta_optimizer_base import MetaOptimizerBase
+
+__all__ = ["GradientMergeOptimizer"]
+
+
+class GradientMergeOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(GradientMergeOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        self.wrapped_opt = GM(optimizer)
+        self.meta_optimizers_white_list = []
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(GradientMergeOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        self.wrapped_opt._set_k_steps(
+            self.user_defined_strategy.gradient_merge_configs["k_steps"])
+        self.wrapped_opt._set_avg(
+            self.user_defined_strategy.gradient_merge_configs["avg"])
+
+    def _can_apply(self):
+        can_apply = (self.user_defined_strategy.gradient_merge == True) and \
+            self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
+        return can_apply
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.gradient_merge = False
+        dist_strategy.gradient_merge_configs = {"k_steps": 1, "avg": True}
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        optimize_ops, params_grads = \
+            self.wrapped_opt.minimize(loss, startup_program,
+                                      parameter_list, no_grad_set)
+        return optimize_ops, params_grads
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 2ce95131f05..b6a95943182 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -5017,6 +5017,12 @@ class GradientMergeOptimizer(object):
         self.type = "gradient_merge"
         self.avg = avg
 
+    def _set_k_steps(self, k_steps):
+        self.k_steps = k_steps
+
+    def _set_avg(self, avg):
+        self.avg = avg
+
     def minimize(self,
                  loss,
                  startup_program=None,
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 4ba3bf4389b..710376de56b 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -32,6 +32,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
 list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
@@ -364,6 +365,7 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS})
         py_test_modules(test_fleet_meta_optimizer MODULES test_fleet_meta_optimizer ENVS ${dist_ENVS})
+        py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
     endif(NOT APPLE)
     if(WITH_DGC)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
new file mode 100644
index 00000000000..36d5912cb7e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.fleet as fleet
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_gradient_merge_optimizer(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.gradient_merge = True
+        strategy.gradient_merge_configs = {"k_steps": 2, "avg": True}
+        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab
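
The k_steps / avg semantics that gradient_merge_configs controls can be sketched in a few lines of framework-agnostic Python. This is an illustrative sketch only, not the Paddle implementation; the names (train_with_gradient_merge, grad_fn, acc) are invented for the example.

def train_with_gradient_merge(param, batches, grad_fn, lr=0.01, k_steps=4, avg=True):
    # Accumulate gradients for k_steps mini-batches, then apply one update.
    acc = 0.0
    for step, batch in enumerate(batches, start=1):
        acc += grad_fn(param, batch)      # forward + backward run every step
        if step % k_steps == 0:           # the optimizer runs once every k_steps
            grad = acc / k_steps if avg else acc
            param -= lr * grad            # e.g. a plain SGD update
            acc = 0.0
    return param


# Toy usage: minimize (param - 3)^2, whose gradient is 2 * (param - 3).
final = train_with_gradient_merge(
    param=0.0,
    batches=range(40),
    grad_fn=lambda p, _: 2.0 * (p - 3.0),
    lr=0.1,
    k_steps=4,
    avg=True)
print(final)  # approaches 3.0 as updates accumulate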