diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index a86e1234e4a351454ccd1a4daa9287e68b4bdc75..6282b9021b411f493654248ebfb3f65b28010ff2 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -34,6 +34,8 @@ __all__ = [
 
 fleet = Fleet()
 _final_strategy = fleet._final_strategy
+_get_applied_meta_list = fleet._get_applied_meta_list
+_get_applied_graph_list = fleet._get_applied_graph_list
 init = fleet.init
 is_first_worker = fleet.is_first_worker
 worker_index = fleet.worker_index
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index c46911da0f2293df403168fcf4710abc065ce8e6..4bd94c13ae0f218fd0d3459b3b3741655350a510 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -925,6 +925,24 @@ class Fleet(object):
         else:
             return self._context["valid_strategy"]
 
+    def _get_applied_meta_list(self):
+        if "applied_meta_list" not in self._context:
+            print(
+                "WARNING: You may need to call the minimize function before _get_applied_meta_list is called"
+            )
+            return []
+        else:
+            return self._context["applied_meta_list"]
+
+    def _get_applied_graph_list(self):
+        if "applied_graph_list" not in self._context:
+            print(
+                "WARNING: You may need to call the minimize function before _get_applied_graph_list is called"
+            )
+            return []
+        else:
+            return self._context["applied_graph_list"]
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -1043,6 +1061,12 @@ class Fleet(object):
 
         context["valid_strategy"] = copy.deepcopy(valid_strategy)
 
+        applied_meta_list = self.strategy_compiler._get_applied_meta_list()
+        applied_graph_list = self.strategy_compiler._get_applied_graph_list()
+
+        context['applied_meta_list'] = applied_meta_list
+        context['applied_graph_list'] = applied_graph_list
+
         self._context = context
 
         self.valid_strategy = valid_strategy
diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
index d598dd8ed4bbdd2d79268aae595a70d5f7209e1f..1d6fcee5442947b0139383e5e1275548ba16792f 100644
--- a/python/paddle/distributed/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -122,13 +122,19 @@ class StrategyCompiler(StrategyCompilerBase):
     def __init__(self):
         super(StrategyCompiler, self).__init__()
-        self._meta_optimizer = None
-        self._graph_optimizer = None
+        self._meta_optimizers = []
+        self._graph_optimizers = []
         self._valid_optimizer_list = None
         self._user_defined_strategy = None
         self._meta_optimizer_candidates = []
         self._graph_optimizer_candidates = []
 
+    def _get_applied_meta_list(self):
+        return [type(opt).__name__ for opt in self._meta_optimizers]
+
+    def _get_applied_graph_list(self):
+        return [type(opt).__name__ for opt in self._graph_optimizers]
+
     def _get_valid_strategy(self, dist_strategy, can_not_apply_optimizer_list):
         import copy
         valid_strategy = copy.deepcopy(dist_strategy)
@@ -178,8 +184,8 @@ class StrategyCompiler(StrategyCompilerBase):
         # and graph_optimizer, the corresponding distributed strategy
         # should be updated.
-        self._meta_optimizers = meta_optimizers
-        self._graph_optimizers = graph_optimizers
+        self._meta_optimizers = [] if meta_optimizers is None else meta_optimizers
+        self._graph_optimizers = [] if graph_optimizers is None else graph_optimizers
 
         return_meta = None if meta_optimizers == None else meta_optimizers[
             0]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 283589c5f332089ecb1e4e97e326c7314ee437c3..d861aa7579f461c61032e77e38e59f9376df0210 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -72,7 +72,7 @@ class AMPOptimizer(MetaOptimizerBase):
             "incr_every_n_steps": 1000,
             "decr_every_n_nan_or_inf": 2,
             "incr_ratio": 2.0,
-            "decr_ratio": 8.0,
+            "decr_ratio": 0.8,
             "use_dynamic_loss_scaling": True
         }
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index b4112e88860cd21ebd96400079c51e1676af8f63..eb4ac1356eaaff4359854176ad18edb0cef178e6 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -133,8 +133,14 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
         self.set_strategy(strategy, 'amp')
         self.set_strategy(strategy, 'recompute')
         self.set_strategy(strategy, 'lamb')
+
         self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')
 
+        applied_meta_list = fleet._get_applied_meta_list()
+        applied_graph_list = fleet._get_applied_graph_list()
+        print(applied_meta_list, applied_graph_list)
+        self.assertEqual(len(applied_meta_list), 3)
+
         ops = [op.type for op in avg_cost.block.ops]
         outs = [
             op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py
index 0a4e2f631d60cf1dad790d720c88074090a08ca8..3e5b479fab559b36a0b2fc7ea14b65e538ed0ad4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py
@@ -48,6 +48,9 @@ class TestDistributedStrategyAuto(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
+        applied_meta_list = fleet._get_applied_meta_list()
+        print("applied_meta_list: {}".format(applied_meta_list))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
index 25801793f1f2e70c404727ed4f64c7d3c830aec9..6be05f436328e0b255d67408ee25e5259949c128 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
@@ -18,6 +18,7 @@ import paddle
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
+paddle.enable_static()
 
 
 class TestFleetBase(unittest.TestCase):
@@ -48,5 +49,44 @@ class TestFleetBase(unittest.TestCase):
         optimizer.minimize(avg_cost)
 
 
+class TestFleetBase2(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_fleet_get_applied_optimizer(self):
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        fleet.init(is_collective=True)
+
+        meta_list = fleet._get_applied_meta_list()
+        graph_list = fleet._get_applied_graph_list()
+        # minimize has not been called yet, so both lists are empty
+        self.assertEqual(len(meta_list), 0)
+        self.assertEqual(len(graph_list), 0)
+
+        strategy = fleet.DistributedStrategy()
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        meta_list = fleet._get_applied_meta_list()
+        graph_list = fleet._get_applied_graph_list()
+        self.assertEqual(len(meta_list), 0)
+        self.assertEqual(len(graph_list), 1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
index 1b3fbb86a4af55d6838df3a628bf2cf194c5235d..dba409ec9200e5675f2c8ca3fd74ce46691f41aa 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
@@ -16,6 +16,9 @@ import unittest
 import paddle
 import os
 import paddle.fluid as fluid
+import paddle.distributed.fleet as fleet
+
+paddle.enable_static()
 
 
 class TestFleetBase(unittest.TestCase):
@@ -27,7 +30,6 @@ class TestFleetBase(unittest.TestCase):
             "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_fleet_init(self):
-        import paddle.distributed.fleet as fleet
         os.environ["TRAINING_ROLE"] = "PSERVER"
         os.environ["POD_IP"] = "127.0.0.1"
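
Note on the amp_optimizer.py hunk: with dynamic loss scaling, decr_ratio is the multiplier applied to the loss-scaling factor after decr_every_n_nan_or_inf overflow steps, so it must be less than 1. The old default of 8.0 would have grown the scale on overflow instead of shrinking it; 0.8 matches the default used by fluid's mixed-precision utilities.

Below is a minimal sketch of how the new introspection helpers are meant to be queried after minimize(). The toy network, the strategy.amp flag, and the single-process collective setup are illustrative assumptions rather than part of this patch; the returned lists depend on which meta/graph optimizers the strategy actually enables.

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)

    # Toy network, for illustration only.
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
    prediction = paddle.fluid.layers.fc(input=input_x, size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = fleet.DistributedStrategy()
    strategy.amp = True  # assumption: request the AMP meta optimizer

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    # After minimize(), the strategy compiler has recorded the class names
    # of the meta/graph optimizers that were actually applied.
    print(fleet._get_applied_meta_list())    # e.g. ['AMPOptimizer']
    print(fleet._get_applied_graph_list())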