diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 96ddc82b1c95ab3480b8badaa274118e40d2beda..d17e68276cd1ce576029cf306a18469aef2ffdb0 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -31,6 +31,9 @@ message AMPConfig { optional float incr_ratio = 4 [ default = 2.0 ]; optional float decr_ratio = 5 [ default = 0.8 ]; optional bool use_dynamic_loss_scaling = 6 [ default = true ]; + repeated string custom_white_list = 7; + repeated string custom_black_list = 8; + repeated string custom_black_varnames = 9; } message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; } diff --git a/python/paddle/fleet/base/meta_optimizer_factory.py b/python/paddle/fleet/base/meta_optimizer_factory.py index bbbd5fcacd66757f1a2932d33b86e6ea42fe8793..89ebb0ec601e249c58fd43995df1530f44940af4 100755 --- a/python/paddle/fleet/base/meta_optimizer_factory.py +++ b/python/paddle/fleet/base/meta_optimizer_factory.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ..meta_optimizers import AMPOptimizer from ..meta_optimizers import RecomputeOptimizer from ..meta_optimizers import GradientMergeOptimizer from ..meta_optimizers import GraphExecutionOptimizer @@ -22,6 +23,7 @@ from ..meta_optimizers import LarsOptimizer __all__ = ["MetaOptimizerFactory"] meta_optimizer_names = [ + "AMPOptimizer", "RecomputeOptimizer", "GradientMergeOptimizer", "GraphExecutionOptimizer", diff --git a/python/paddle/fleet/meta_optimizers/__init__.py b/python/paddle/fleet/meta_optimizers/__init__.py index 0beb06eacf8cf066c244b24392c697e2dab57948..aa6708e758a78cf2cb10f8ebda81d50ac796b548 100755 --- a/python/paddle/fleet/meta_optimizers/__init__.py +++ b/python/paddle/fleet/meta_optimizers/__init__.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and +from .amp_optimizer import AMPOptimizer from .recompute_optimizer import RecomputeOptimizer from .gradient_merge_optimizer import GradientMergeOptimizer from .graph_execution_optimizer import GraphExecutionOptimizer @@ -19,6 +20,7 @@ from .localsgd_optimizer import LocalSGDOptimizer from .lars_optimizer import LarsOptimizer __all__ = [ + 'AMPOptimizer', 'RecomputeOptimizer', 'GradientMergeOptimizer', 'PipelineOptimizer', diff --git a/python/paddle/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/fleet/meta_optimizers/amp_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8316d807fa87062a8e3fba0bcb3bd057d2231032 --- /dev/null +++ b/python/paddle/fleet/meta_optimizers/amp_optimizer.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import paddle.fluid.contrib.mixed_precision as mixed_precision +from .meta_optimizer_base import MetaOptimizerBase + +__all__ = ["AMPOptimizer"] + + +class AMPOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(AMPOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + self.amp_opt = None + # we do not allow meta optimizer to be inner optimizer currently + self.meta_optimizers_white_list = [] + + def _set_basic_info(self, loss, role_maker, user_defined_optimizer, + user_defined_strategy): + super(AMPOptimizer, self)._set_basic_info( + loss, role_maker, user_defined_optimizer, user_defined_strategy) + + def _can_apply(self): + if self.user_defined_strategy.amp: + return True + return False + + def _disable_strategy(self, dist_strategy): + dist_strategy.amp = False + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + if self.amp_opt is None: + config = self.user_defined_strategy.amp_configs + custom_white_list = set(config['custom_white_list']) + custom_black_list = set(config['custom_black_list']) + custom_black_varnames = set(config['custom_black_varnames']) + amp_lists = mixed_precision.AutoMixedPrecisionLists( + custom_white_list, custom_black_list, custom_black_varnames) + + self.amp_opt = mixed_precision.decorate( + self.inner_opt, amp_lists, config['init_loss_scaling'], + config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], + config['incr_ratio'], config['decr_ratio'], + config['use_dynamic_loss_scaling']) + + optimize_ops, params_grads = \ + self.amp_opt.minimize(loss, startup_program, + parameter_list, no_grad_set) + return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index db3dc6b859495a4c060a5d4fe6e6eebd90c19b86..971a94f549fcff467bbe57256946016e40cff6bf 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -34,6 +34,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base) list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) @@ -372,6 +373,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_graph_execution_meta_optimizer MODULES test_fleet_graph_execution_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) if(NOT WIN32) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..ae4b5d7ecd7c5131e38904a0d8fde0b9bb4fbb89 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os + + +class TestFleetAMPOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + + def test_amp_optimizer(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('isfinite', ops) + + +if __name__ == "__main__": + unittest.main()