diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 7cf8d55aeeb1d99acd2f501461f0563f87a25e78..55ba9b0a0f4bd5aa34bd8c3b503279a3add21723 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -44,6 +44,8 @@ message AMPConfig {
   repeated string custom_white_list = 7;
   repeated string custom_black_list = 8;
   repeated string custom_black_varnames = 9;
+  optional bool use_pure_fp16 = 10 [ default = false ];
+  optional bool use_fp16_guard = 11 [ default = true ];
 }
 
 message LocalSGDConfig {
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index f7a28f15e9b70be3280ce29eb97487a238e78ce6..186d9263dc57df822130e333be082d283e6bb845 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -49,6 +49,9 @@ def assign_configs_value(msg, config):
     for key in config:
         for f in fields:
             if key == f.name:
+                # LABEL_OPTIONAL = 1
+                # LABEL_REPEATED = 3
+                # LABEL_REQUIRED = 2
                 if f.label == 3:
                     getattr(msg, f.name).extend(config[f.name])
                 elif f.label == 1 or f.label == 2:
@@ -366,7 +369,14 @@ class DistributedStrategy(object):
 
             custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.
 
-        Examples:
+            custom_black_varnames(list[str]): Users' custom black variables' names.
+
+            use_pure_fp16(bool): Whether to use pure fp16 training. Default False.
+
+            use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
+                   Default True. Only takes effect when `use_pure_fp16` is turned on.
+
+        Examples 1:
 
           .. code-block:: python
 
@@ -376,6 +386,19 @@ class DistributedStrategy(object):
             strategy.amp_configs = {
                 "init_loss_scaling": 32768,
                 "custom_white_list": ['conv2d']}
+
+        Examples 2:
+
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.amp = True
+            # pure fp16
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "use_pure_fp16": True
+            }
         """
         return get_msg_dict(self.strategy.amp_configs)
 
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index f4703a47cb78791436df82c2ebad31ffe573886f..f4075e92c4c4488ae2ffb8a2c14b34eeee35123b 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -196,6 +196,7 @@ class Fleet(object):
         else:
             if isinstance(role_maker, RoleMakerBase):
                 self._role_maker = role_maker
+                self._is_collective = role_maker._is_collective
             else:
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
@@ -1022,9 +1023,22 @@ class Fleet(object):
 
            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
        """
+        # imitate target optimizer retrieval
-        return self.user_defined_optimizer.amp_init(place, scope, test_program,
-                                                    use_fp16_test)
+        amp_optimizer = None
+        for optimizer in self.strategy_compiler._get_applied_meta_optimizer():
+            if hasattr(optimizer, 'amp_init'):
+                amp_optimizer = optimizer
+                break
+
+        if amp_optimizer is None:
+            if hasattr(self.user_defined_optimizer, 'amp_init'):
+                amp_optimizer = self.user_defined_optimizer
+
+        assert amp_optimizer is not None, \
+            "amp_init can only be used when the amp (auto mixed precision) strategy is turned on."
+
+        return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test)
 
     def _final_strategy(self):
         if "valid_strategy" not in self._context:
diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
index 1d6fcee5442947b0139383e5e1275548ba16792f..7b146318abe62a4a3de84860193567fe5b008604 100644
--- a/python/paddle/distributed/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -129,6 +129,9 @@ class StrategyCompiler(StrategyCompilerBase):
         self._meta_optimizer_candidates = []
         self._graph_optimizer_candidates = []
 
+    def _get_applied_meta_optimizer(self):
+        return self._meta_optimizers
+
     def _get_applied_meta_list(self):
         return [type(opt).__name__ for opt in self._meta_optimizers]
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index c751e229cbbe2b900ead900297ff9956946b9e75..dba3c944f70ab8e434d75269e5a3876e1fa49461 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -50,7 +50,8 @@ class AMPOptimizer(MetaOptimizerBase):
             self.inner_opt, amp_lists, config['init_loss_scaling'],
             config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'],
             config['incr_ratio'], config['decr_ratio'],
-            config['use_dynamic_loss_scaling'])
+            config['use_dynamic_loss_scaling'], config['use_pure_fp16'],
+            config['use_fp16_guard'])
 
         # if worker_num > 1, all cards will communication with each other,
         # add is_distributed to optimize amp, overlap communication and
@@ -112,3 +113,11 @@ class AMPOptimizer(MetaOptimizerBase):
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
         return optimize_ops, params_grads
+
+    def amp_init(self,
+                 place,
+                 scope=None,
+                 test_program=None,
+                 use_fp16_test=False):
+        return self.wrapped_opt.amp_init(place, scope, test_program,
+                                         use_fp16_test)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 7ee184cfc5eb774a6508e645d069ca3df811162c..dd73577ae2e85389e2a931d4f01ca8707ff068e1 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -165,7 +165,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         main_program._hierarchical_allreduce_inter_nranks = local_build_strategy.hierarchical_allreduce_inter_nranks
 
         # TODO(guru4elephant): should be an independent optimizer
-        self._setup_nccl_op(startup_program, main_program, local_build_strategy)
+        if worker_num > 1:
+            self._setup_nccl_op(startup_program, main_program,
+                                local_build_strategy)
 
         local_build_strategy.num_trainers = self.role_maker._worker_num()
         local_build_strategy.trainer_id = self.role_maker._worker_index()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 67fc04b3ca8c46421b833a6c14b28964262d73b7..e367d051f087e80ee4a27d5f6c540cdd1e200029 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -48,6 +48,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_sharding_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
@@ -506,6 +507,7 @@ if(WITH_DISTRIBUTE)
     py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS ${dist_ENVS})
     py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
     py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
index b5eacecd003be519772adf77213def21d3528c95..1c74a11cc4d2e63570ecdc1f420fbd403008cfa3 100755
--- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
+++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
@@ -88,6 +88,21 @@ class TestFleetMetaOptimizer(unittest.TestCase):
                 "custom_white_list": ['softmax'],
                 "custom_black_list": ['tanh'],
             }
+        elif name == 'pure_fp16':
+            strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "decr_every_n_nan_or_inf": 2,
+                "incr_every_n_steps": 1000,
+                "incr_ratio": 2.0,
+                "use_dynamic_loss_scaling": True,
+                "decr_ratio": 0.5,
+                "custom_white_list": ['softmax'],
+                "custom_black_list": ['tanh'],
+                "use_pure_fp16": True,
+                "use_fp16_guard": False,
+            }
+
         elif name == 'dgc':
             strategy.dgc = True
             strategy.dgc_configs = {
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
index 2fa6bf54769e0fa0bd8b0c97a40bd523c623bee6..869ca41a1923daa099112df95f9b8e3b520883d7 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
@@ -46,34 +46,88 @@ class TestFleetAMPInit(unittest.TestCase):
     def test_fleet_amp_init(self):
         if not fluid.core.is_compiled_with_cuda():
             return
-        input_x = paddle.static.data(
-            name="x", shape=[None, 32], dtype='float32')
-        input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
 
-        cost = mlp(input_x, input_y)
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.001,
-            momentum=0.9,
-            weight_decay=fluid.regularizer.L2Decay(1e-4),
-            multi_precision=True)
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
 
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
 
-        optimizer = paddle.static.amp.decorate(optimizer)
-        optimizer = fleet.distributed_optimizer(optimizer)
-        optimizer.minimize(cost)
+        with paddle.static.program_guard(main_program, startup_program):
+            input_x = paddle.static.data(
+                name="x", shape=[None, 32], dtype='float32')
+            input_y = paddle.static.data(
+                name="y", shape=[None, 1], dtype='int64')
+
+            cost = mlp(input_x, input_y)
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                momentum=0.9,
+                weight_decay=fluid.regularizer.L2Decay(1e-4),
+                multi_precision=True)
+
+            optimizer = paddle.static.amp.decorate(optimizer)
+            optimizer = fleet.distributed_optimizer(optimizer)
+            optimizer.minimize(cost)
+
         place = paddle.CUDAPlace(0)
 
         exe = paddle.static.Executor(place)
-        exe.run(paddle.static.default_startup_program())
+        exe.run(startup_program)
         optimizer.amp_init(place)
 
         step = 1
         for i in range(step):
-            cost_val = exe.run(program=paddle.static.default_main_program(),
+            cost_val = exe.run(program=main_program,
+                               feed=gen_data(),
+                               fetch_list=[cost.name])
+
+    def test_fleet_amp_meta_optimizer_init(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with paddle.static.program_guard(main_program, startup_program):
+            input_x = paddle.static.data(
+                name="x", shape=[None, 32], dtype='float32')
+            input_y = paddle.static.data(
+                name="y", shape=[None, 1], dtype='int64')
+
+            cost = mlp(input_x, input_y)
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                momentum=0.9,
+                weight_decay=fluid.regularizer.L2Decay(1e-4),
+                multi_precision=True)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.amp = True
+            strategy.amp_configs = {'use_pure_fp16': True}
+            strategy.gradient_merge = True
+            strategy.gradient_merge_configs = {"k_steps": 2}
+
+            optimizer = fleet.distributed_optimizer(optimizer, strategy)
+            optimizer.minimize(cost)
+
+        print(fleet._get_applied_meta_list())
+
+        place = paddle.CUDAPlace(0)
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_program)
+        optimizer.amp_init(place)
+
+        step = 3
+        for i in range(step):
+            cost_val = exe.run(program=main_program,
                                feed=gen_data(),
                                fetch_list=[cost.name])
+            print(cost_val)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index 30f6607df9d8ad99e1c2bb87b03e157390f848bb..982ec4eb5c7a041e14ffa869da86f393df5a0aee 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -93,6 +93,21 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
         self.assertIn('cast', ops)
         self.assertIn('check_finite_and_unscale', ops)
 
+    def test_pure_fp16_optimizer(self):
+        """ test pure fp16 """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'pure_fp16')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        params = train_prog.all_parameters()
+        for param in train_prog.all_parameters():
+            self.assertEqual(param.dtype, fluid.core.VarDesc.VarType.FP16)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('cast', ops)
+        self.assertIn('check_finite_and_unscale', ops)
+
     def test_amp_distributed_optimizer(self):
         """ test amp when distributed """
         train_prog, startup_prog = fluid.Program(), fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_single.py b/python/paddle/fluid/tests/unittests/test_fleet_base_single.py
index 03e2939948273f7886cca00d80067b48465c7370..42b30e45b686b0e89b2e328ae34eff0822a65fd1 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_single.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_single.py
@@ -78,6 +78,7 @@ class TestFleetBaseSingleRunCollective(unittest.TestCase):
         }
 
     def test_single_run_collective_minimize(self):
+        paddle.enable_static()
         input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
         input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
 
@@ -114,6 +115,7 @@ class TestFleetBaseSingleRunPS(unittest.TestCase):
         }
 
     def test_single_run_ps_minimize(self):
+        paddle.enable_static()
         input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
         input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
index 2d03b267fe9e3c969a82976fd3309374c53efa46..efe62a32fc3f7870393a5c4739fba17a25f4e054 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -53,8 +53,25 @@ class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer):
         self.set_strategy(strategy, 'gradient_merge')
         self.set_strategy(strategy, 'amp')
         self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        vars = [x.name for x in train_prog.list_vars()]
+        self.assertIn('@GradientMerge', ''.join(vars))
+        self.assertIn('cast', ''.join(vars))
+
+    def test_gm_pure_fp16_optimizer(self):
+        train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
+        )
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'gradient_merge')
+        self.set_strategy(strategy, 'pure_fp16')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
         print(train_prog)
 
+        params = train_prog.all_parameters()
+        for param in train_prog.all_parameters():
+            self.assertEqual(param.dtype,
+                             paddle.fluid.core.VarDesc.VarType.FP16)
+
         vars = [x.name for x in train_prog.list_vars()]
         self.assertIn('@GradientMerge', ''.join(vars))
         self.assertIn('cast', ''.join(vars))
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index f07f64257c5dbeae6f9dc9427398407fc2377031..aad7ec63fc7f2d93690f89c14a7d5afed9cb12a5 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -244,7 +244,7 @@ class Adam(Optimizer):
             if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
                 warnings.warn(
                     "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
-                    "Consider using multi_precision=True option of the Momentum optimizer."
+                    "Consider using multi_precision=True option of the Adam optimizer."
                 )
             self._add_moments_pows(p)
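
Editor's note (not part of the patch): the sketch below pieces together how the pieces introduced above are meant to be used end to end, following the `amp_configs` docstring example and `test_fleet_amp_init.py`: enable AMP through `DistributedStrategy`, set the new `use_pure_fp16` / `use_fp16_guard` keys, and call `amp_init` on the distributed optimizer, which the patch now resolves through the applied AMP meta optimizer. The `mlp()` network and `gen_data()` feeder stand in for the helpers defined in the unit test and are placeholders here.

  .. code-block:: python

    import paddle
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    paddle.enable_static()
    fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

    # The pure fp16 switches added to AMPConfig by this patch.
    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.amp_configs = {
        "init_loss_scaling": 32768,
        "use_pure_fp16": True,
        "use_fp16_guard": False,
    }

    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
        y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
        cost = mlp(x, y)  # placeholder: mlp() as defined in test_fleet_amp_init.py
        optimizer = paddle.optimizer.Momentum(
            learning_rate=0.001, momentum=0.9, multi_precision=True)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(cost)

    place = paddle.CUDAPlace(0)
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    # Fleet.amp_init now looks up the applied AMP meta optimizer first and only
    # falls back to the user-defined optimizer's amp_init, so this works when
    # AMP is enabled purely through DistributedStrategy.
    optimizer.amp_init(place)

    for _ in range(3):
        # placeholder: gen_data() as defined in test_fleet_amp_init.py
        exe.run(main_program, feed=gen_data(), fetch_list=[cost.name])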