gradient_merge_optimizer.py 2.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

from paddle.fluid.optimizer import GradientMergeOptimizer as GM
from .meta_optimizer_base import MetaOptimizerBase

__all__ = ["GradientMergeOptimizer"]

19 20
# amp + gradient merge + lamb

21 22 23 24 25 26

class GradientMergeOptimizer(MetaOptimizerBase):
    def __init__(self, optimizer):
        super(GradientMergeOptimizer, self).__init__(optimizer)
        self.inner_opt = optimizer
        self.wrapped_opt = GM(optimizer)
27 28 29 30 31 32
        self.meta_optimizers_white_list = [
            "LarsOptimizer",
            "LambOptimizer",
            "GraphExecutionOptimizer",
        ]
        self.meta_optimizers_black_list = []
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49

    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                        user_defined_strategy):
        super(GradientMergeOptimizer, self)._set_basic_info(
            loss, role_maker, user_defined_optimizer, user_defined_strategy)
        self.wrapped_opt._set_k_steps(
            self.user_defined_strategy.gradient_merge_configs["k_steps"])
        self.wrapped_opt._set_avg(
            self.user_defined_strategy.gradient_merge_configs["avg"])

    def _can_apply(self):
        can_apply = (self.user_defined_strategy.gradient_merge == True) and \
                  self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
        return can_apply

    def _disable_strategy(self, dist_strategy):
        dist_strategy.gradient_merge = False
50
        dist_strategy.gradient_merge_configs = {}
51 52 53 54 55 56 57 58 59 60

    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        optimize_ops, params_grads = \
            self.wrapped_opt.minimize(loss, startup_program,
                                      parameter_list, no_grad_set)
        return optimize_ops, params_grads