diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index 04955fb80c9ba5ecdcad8ab53b1cfef570efd9a5..e4081f481d0b9f1b33fd1a7e67a3b26dac22a575 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -46,7 +46,7 @@ class AMPOptimizer(MetaOptimizerBase): custom_white_list = set(config['custom_white_list']) custom_black_list = set(config['custom_black_list']) custom_black_varnames = set(config['custom_black_varnames']) - self.amp_lists = mixed_precision.AutoMixedPrecisionLists( + amp_lists = mixed_precision.AutoMixedPrecisionLists( custom_white_list, custom_black_list, custom_black_varnames) self.wrapped_opt = mixed_precision.decorate( diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index df9887759e16fddb0579abdcdf3ef5f9024825e7..64d54ae3bab03b4511340c3ae222001aa7942f9c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -98,6 +98,10 @@ class LambOptimizer(MetaOptimizerBase): def apply_gradients(self, params_grads): return self.lamb_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return self.lamb_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 609d8b85e714c1c7247898f8d506f9dadab9f499..32c6be505a5467b2fe6cc3f155cc8df7e21bfeca 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -85,6 +85,10 @@ class LarsOptimizer(MetaOptimizerBase): def apply_gradients(self, params_grads): return self.lars_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return self.lars_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_combine_meta_optimizer.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py similarity index 59% rename from python/paddle/fluid/tests/unittests/test_fleet_combine_meta_optimizer.py rename to python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 2ed0d7bd685f3a6212f428ce69114b6a4a704748..979eabd7ec1b484a0c2be3452355eca13222626e 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_combine_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -19,10 +19,8 @@ import os import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker -paddle.enable_static() - -class TestFleetCombineOptimizer(unittest.TestCase): +class TestFleetMetaOptimizer(unittest.TestCase): def setUp(self): os.environ["PADDLE_TRAINER_ID"] = "1" os.environ[ @@ -50,19 +48,21 @@ class TestFleetCombineOptimizer(unittest.TestCase): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = { - "rampup_begin_step": 128, - "rampup_step": 100, - "sparsity": [0.996, 0.999] - } return avg_cost, strategy - def optimizer(self, loss, strategy, train_prog, startup_prog): + def optimizer(self, + loss, + strategy, + train_prog, + startup_prog, + name='momentum'): with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) + if name == 'momentum': + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + elif name == 'adam': + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(loss) @@ -70,53 +70,41 @@ class TestFleetCombineOptimizer(unittest.TestCase): def set_strategy(self, strategy, name): if name == 'amp': strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } elif name == 'dgc': strategy.dgc = True + strategy.dgc_configs = { + "rampup_begin_step": 128, + "rampup_step": 100, + "sparsity": [0.996, 0.999] + } elif name == 'recompute': strategy.recompute = True strategy.recompute_configs = { "checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"] } - - def test_dgc_recompute_optimizer(self): - train_prog = fluid.Program() - startup_prog = fluid.Program() - avg_cost, strategy = self.net(train_prog, startup_prog) - - self.set_strategy(strategy, 'dgc') - self.set_strategy(strategy, 'recompute') - - self.optimizer(avg_cost, strategy, train_prog, startup_prog) - - ops = [op.type for op in avg_cost.block.ops] - outs = [ - op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' - ] - self.assertIn('dgc', ops) - self.assertIn('dgc_momentum', ops) - - self.assertIn('subprog', ''.join(outs)) - - def test_amp_recompute_optimizer(self): - train_prog = fluid.Program() - startup_prog = fluid.Program() - avg_cost, strategy = self.net(train_prog, startup_prog) - - self.set_strategy(strategy, 'amp') - self.set_strategy(strategy, 'recompute') - - self.optimizer(avg_cost, strategy, train_prog, startup_prog) - - ops = [op.type for op in avg_cost.block.ops] - outs = [ - op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' - ] - print(train_prog) - self.assertIn('cast', ops) - self.assertIn('check_finite_and_unscale', ops) - - self.assertIn('subprog', ''.join(outs)) - - -if __name__ == "__main__": - unittest.main() + elif name == 'lars': + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], + } + elif name == 'lamb': + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + else: + raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py index 49b93e0dfaaacddc9916f91a9ccd6c7e8bbd1714..d615f7cb7044e588557b1b14dbb54a881bdb8730 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py @@ -16,12 +16,14 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer import paddle.fluid.clip as clip import paddle.compat as cpt from paddle.fluid.backward import append_backward +paddle.enable_static() class TestDGCMomentumOptimizer(unittest.TestCase): @@ -86,13 +88,17 @@ class TestDGCMomentumOptimizer(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) # params_grads = append_backward(mean_out) - params_grads = dgc_momentum_optimizer.backward(mean_out) + params_grads = dgc_momentum_optimizer.backward( + mean_out, startup_program=init_program) + + with framework.program_guard(program, init_program): + opts = dgc_momentum_optimizer.apply_gradients(params_grads) + accumulator_count = 1 if name == "momentum" else 2 self.assertEqual(len(params_grads), 1) self.assertEqual( len(dgc_momentum_optimizer.get_accumulators()), accumulator_count) - with framework.program_guard(program, init_program): - opts = dgc_momentum_optimizer.apply_gradients(params_grads) + self.assertEqual(len(opts), 2) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], ["scale", name]) @@ -108,8 +114,11 @@ class TestDGCMomentumOptimizer(unittest.TestCase): self.assertTrue(mul_x.name in velocity_acc) # Check init_program + # dgc not apply include: lr, dgc(count, nranks, begin step), (u,) + # dgc apply include: lr, dgc(count, nranks, begin_step), (u,v,k,encode,gather) + init_ops_count = 5 if name == "momentum" else 9 init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 1) + self.assertEqual(len(init_ops), init_ops_count) self.assertEqual(init_ops[0].type, "fill_constant") self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 362428631e68cc7ac88be93d7ba1ff449a035822..8178dbcd859c27aab3671b9e74184897aeca86d8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -16,53 +16,42 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import unittest import paddle +import paddle.fluid as fluid import os +from fleet_meta_optimizer_base import TestFleetMetaOptimizer paddle.enable_static() -class TestFleetAMPOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "0" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - +class TestFleetAMPOptimizer(TestFleetMetaOptimizer): def test_amp_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.amp = True - strategy.amp_configs = { - "init_loss_scaling": 32768, - "decr_every_n_nan_or_inf": 2, - "incr_every_n_steps": 1000, - "incr_ratio": 2.0, - "use_dynamic_loss_scaling": True, - "decr_ratio": 0.5, - "custom_white_list": ['softmax'], - "custom_black_list": ['tanh'], - } + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + def test_amp_recompute_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) strategy = fleet._final_strategy() ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] self.assertIn('cast', ops) self.assertIn('check_finite_and_unscale', ops) + self.assertIn('subprog', ''.join(outs)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index d3615c9605a00a63fd79310660713cf1f85ed066..e77f6883124d5fdb4fa7c461b3ea0d60e27d1cce 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -18,66 +18,27 @@ from paddle import fluid import os import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +from fleet_meta_optimizer_base import TestFleetMetaOptimizer paddle.enable_static() -class TestFleetDGCOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - - def net(self, main_prog, startup_prog): - with fluid.program_guard(main_prog, startup_prog): - with fluid.unique_name.guard(): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data( - name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, - size=64, - act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], - size=2, - act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = { - "rampup_begin_step": 128, - "rampup_step": 100, - "sparsity": [0.996, 0.999] - } - return avg_cost, strategy - +class TestFleetDGCOptimizer(TestFleetMetaOptimizer): def test_dgc_optimizer(self): - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] self.assertIn('dgc', ops) self.assertIn('dgc_momentum', ops) def test_dgc_not_apply_with_adam(self): - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('dgc', ops) @@ -87,18 +48,31 @@ class TestFleetDGCOptimizer(unittest.TestCase): os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('dgc', ops) self.assertNotIn('dgc_momentum', ops) + def test_dgc_recompute_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'dgc') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) + + self.assertIn('subprog', ''.join(outs)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py index a42010a4eaa5066821adb817e7a5df2b81bedf7c..0b869cb93bac61f10464e074b3bba3a4e51d7aba 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py @@ -14,40 +14,55 @@ import unittest import paddle +import paddle.fluid as fluid import os +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +paddle.enable_static() -class TestFleetRecomputeMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" +class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer): def test_recompute_optimizer(self): - import paddle.distributed.fleet as fleet - import paddle.distributed.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.recompute = True - strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]} - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_lars_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lars') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('lars_momentum', ops) + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_lamb_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lamb') + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('lamb', ops) + self.assertIn('subprog', ''.join(outs)) if __name__ == "__main__":