From 54003b873e47c4f6e6289a4471e48f20b921c1b1 Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Wed, 12 Aug 2020 10:08:56 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91add=20lamb=20to?= =?UTF-8?q?=20fleet=20meta=20optimizer=20(#26025)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add lamb to fleet meta optimizer --- .../framework/distributed_strategy.proto | 5 +- .../paddle/fleet/base/distributed_strategy.py | 9 ++ .../paddle/fleet/meta_optimizers/__init__.py | 2 + .../fleet/meta_optimizers/lamb_optimizer.py | 99 ++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 2 + .../test_fleet_dgc_meta_optimizer.py | 58 ++++++---- .../test_fleet_lamb_meta_optimizer.py | 108 ++++++++++++++++++ .../test_fleet_lars_meta_optimizer.py | 54 +++++---- 8 files changed, 292 insertions(+), 45 deletions(-) mode change 100644 => 100755 python/paddle/fleet/base/distributed_strategy.py create mode 100755 python/paddle/fleet/meta_optimizers/lamb_optimizer.py mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py create mode 100755 python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 0c0f9d82287..4b984210ed1 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -55,9 +55,8 @@ message LarsConfig { } message LambConfig { - optional float beta1 = 1 [ default = 0.001 ]; - optional float beta2 = 2 [ default = 0.999 ]; - optional float epsilon = 3 [ default = 0.000001 ]; + optional float lamb_weight_decay = 1 [ default = 0.01 ]; + repeated string exclude_from_weight_decay = 2; } message BuildStrategy { diff --git a/python/paddle/fleet/base/distributed_strategy.py b/python/paddle/fleet/base/distributed_strategy.py old mode 100644 new mode 100755 index 6ada175b89e..26b4c8f572a --- a/python/paddle/fleet/base/distributed_strategy.py +++ b/python/paddle/fleet/base/distributed_strategy.py @@ -627,6 +627,15 @@ class DistributedStrategy(object): else: print("WARNING: lamb should have value of bool type") + @property + def lamb_configs(self): + return get_msg_dict(self.strategy.lamb_configs) + + @lamb_configs.setter + def lamb_configs(self, configs): + check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs") + assign_configs_value(self.strategy.lamb_configs, configs) + @property def elastic(self): return self.strategy.elastic diff --git a/python/paddle/fleet/meta_optimizers/__init__.py b/python/paddle/fleet/meta_optimizers/__init__.py index 81ea958f321..075e8b6c430 100644 --- a/python/paddle/fleet/meta_optimizers/__init__.py +++ b/python/paddle/fleet/meta_optimizers/__init__.py @@ -21,6 +21,7 @@ from .localsgd_optimizer import LocalSGDOptimizer from .lars_optimizer import LarsOptimizer from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer from .dgc_optimizer import DGCOptimizer +from .lamb_optimizer import LambOptimizer __all__ = [ 'AMPOptimizer', @@ -33,4 +34,5 @@ __all__ = [ 'LarsOptimizer', 'AsyncGraphExecutionOptimizer', 'DGCOptimizer', + 'LambOptimizer', ] diff --git a/python/paddle/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/fleet/meta_optimizers/lamb_optimizer.py new file mode 100755 index 00000000000..cf4b479b523 --- /dev/null +++ b/python/paddle/fleet/meta_optimizers/lamb_optimizer.py @@ -0,0 +1,99 @@ +# Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.optimizer import LambOptimizer as LAMB +from .meta_optimizer_base import MetaOptimizerBase +import logging + +__all__ = ["LambOptimizer"] + + +class LambOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(LambOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + self.lamb_opt = None + # we do not allow meta optimizer to be inner optimizer currently + self.meta_optimizers_white_list = [] + + def _set_basic_info(self, loss, role_maker, user_defined_optimizer, + user_defined_strategy): + super(LambOptimizer, self)._set_basic_info( + loss, role_maker, user_defined_optimizer, user_defined_strategy) + + opt = self.inner_opt + if not isinstance(opt, AdamOptimizer): + return + + configs = self.user_defined_strategy.lamb_configs + if len(configs['exclude_from_weight_decay']) == 0: + _exclude_from_weight_decay_fn = None + else: + + def exclude_fn(param): + exclude_list = configs['exclude_from_weight_decay'] + for name in exclude_list: + if param.name.endswith(name): + return True + return False + + _exclude_from_weight_decay_fn = exclude_fn + + self.lamb_opt = LAMB( + learning_rate=opt._learning_rate, + lamb_weight_decay=configs['lamb_weight_decay'], + beta1=opt._beta1, + beta2=opt._beta2, + epsilon=opt._epsilon, + parameter_list=opt._parameter_list, + regularization=opt.regularization, + grad_clip=opt._grad_clip, + exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn, + name=opt._name) + + def _can_apply(self): + if self.user_defined_strategy.lamb: + if not isinstance(self.inner_opt, AdamOptimizer): + logging.warn( + "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.". 
+ format(self.inner_opt.type)) + return False + return True + return False + + def _disable_strategy(self, dist_strategy): + dist_strategy.lamb = False + dist_strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + return self.lamb_opt.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + optimize_ops, params_grads = \ + self.lamb_opt.minimize(loss, startup_program, + parameter_list, no_grad_set) + return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 489ea8d0c25..43bedb99415 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -40,6 +40,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) @@ -386,6 +387,7 @@ if(WITH_DISTRIBUTE) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py old mode 100644 new mode 100755 index 0590650bd02..b0a841ecba7 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -14,6 +14,7 @@ import unittest import paddle +from paddle import fluid import os import paddle.fleet as fleet import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -25,31 +26,40 @@ class TestFleetDGCOptimizer(unittest.TestCase): os.environ[ "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - def net(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = 
paddle.fluid.layers.mean(x=cost) + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) - strategy = paddle.fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = { - "rampup_begin_step": 128, - "rampup_step": 100, - "sparsity": [0.996, 0.999] - } + strategy = paddle.fleet.DistributedStrategy() + strategy.dgc = True + strategy.dgc_configs = { + "rampup_begin_step": 128, + "rampup_step": 100, + "sparsity": [0.996, 0.999] + } return avg_cost, strategy def test_dgc_optimizer(self): - avg_cost, strategy = self.net() + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -59,7 +69,9 @@ class TestFleetDGCOptimizer(unittest.TestCase): self.assertIn('dgc_momentum', ops) def test_dgc_not_apply_with_adam(self): - avg_cost, strategy = self.net() + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -72,7 +84,9 @@ class TestFleetDGCOptimizer(unittest.TestCase): os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - avg_cost, strategy = self.net() + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py new file mode 100755 index 00000000000..7384e3b4dfa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +from paddle import fluid +import os +import paddle.fleet as fleet +import paddle.fluid.incubate.fleet.base.role_maker as role_maker + + +class TestFleetLambMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.fleet.DistributedStrategy() + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + + return avg_cost, strategy + + def test_lamb_optimizer(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lamb', ops) + + def test_lamb_not_apply_with_momentum(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertNotIn('lamb', ops) + + def test_lamb_exclude_fn(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': ['.b_0'], + } + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops_with_bias = [ + op for op in avg_cost.block.ops + if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0') + ] + for op in ops_with_bias: + self.assertEqual(op.attr('weight_decay'), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index 960ffbd4035..d8a56016ff3 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -14,6 +14,7 @@ import unittest import paddle +from paddle import fluid import os import paddle.fleet as fleet import paddle.fluid.incubate.fleet.base.role_maker as 
role_maker @@ -27,31 +28,40 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ "127.0.0.1:36001,127.0.0.2:36001" - def net(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) - strategy = paddle.fleet.DistributedStrategy() - strategy.lars = True - strategy.lars_configs = { - "lars_coeff": 0.001, - "lars_weight_decay": 0.0005, - } + strategy = paddle.fleet.DistributedStrategy() + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + } return avg_cost, strategy def test_lars_optimizer(self): - avg_cost, strategy = self.net() + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -60,7 +70,11 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): self.assertIn('lars_momentum', ops) def test_lars_not_apply_with_adam(self): - avg_cost, strategy = self.net() + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) -- GitLab
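
For reference, the end-user flow this patch enables is the one exercised by test_fleet_lamb_meta_optimizer.py: turn on strategy.lamb, optionally tune lamb_configs, and wrap an Adam inner optimizer with fleet.distributed_optimizer. The sketch below is a condensed version of that test; the two-layer network and the environment setup are stand-ins taken from the test code, not a prescribed recipe.

    # Minimal sketch of enabling the LAMB meta optimizer (based on the unit
    # test added in this patch). Assumes the collective environment variables
    # (POD_IP, PADDLE_TRAINER_ENDPOINTS, ...) are exported as in the test's
    # setUp(), and that the paddle.fleet API paths shown above are available.
    import paddle
    import paddle.fleet as fleet
    import paddle.fluid as fluid
    import paddle.fluid.incubate.fleet.base.role_maker as role_maker

    fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

    # A small stand-in network, mirroring the one used in the tests.
    input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
    fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
    prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = fluid.layers.mean(x=cost)

    strategy = paddle.fleet.DistributedStrategy()
    strategy.lamb = True
    strategy.lamb_configs = {
        'lamb_weight_decay': 0.01,
        # Parameters whose names end with any of these suffixes are excluded
        # from weight decay (see exclude_fn in lamb_optimizer.py).
        'exclude_from_weight_decay': ['.b_0'],
    }

    # LambOptimizer only takes over when the inner optimizer is Adam; with,
    # e.g., Momentum, _can_apply() returns False and the inner optimizer is
    # used unchanged (covered by test_lamb_not_apply_with_momentum).
    optimizer = paddle.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)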
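
The exclude_from_weight_decay entries are treated as name suffixes: exclude_fn in lamb_optimizer.py returns True when param.name.endswith(name) for any configured entry, and the LAMB op then applies a weight decay of 0 to that parameter (this is what test_lamb_exclude_fn asserts). A standalone sketch of that matching rule, using hypothetical parameter names in Paddle's default fc_0.w_0 / fc_0.b_0 naming scheme:

    # Suffix matching as implemented by exclude_fn; the parameter names below
    # are hypothetical examples, not values produced by this patch.
    exclude_list = ['.b_0']

    def is_excluded(param_name):
        # Equivalent to the endswith loop in lamb_optimizer.py.
        return any(param_name.endswith(suffix) for suffix in exclude_list)

    assert is_excluded('fc_0.b_0')      # bias: weight decay forced to 0
    assert not is_excluded('fc_0.w_0')  # weight: decays with lamb_weight_decay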