#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy

from paddle.distributed import fleet
from paddle.framework import in_dynamic_mode

from .meta_optimizers import HeterParallelOptimizer, HybridParallelOptimizer
from .utils.log_util import logger


def _dygraph_distributed_optimizer(optimizer, strategy=None):
    """
    Optimizer for distributed training.

    For distributed training, this method rebuilds a new instance of
    DistributedOptimizer, which provides the basic Optimizer functionality
    plus features specific to distributed training.

    Args:
        optimizer(Optimizer): The optimizer to be wrapped for distributed training.
        strategy(DistributedStrategy): Extra properties for the distributed
            optimizer. It is recommended to set the strategy in fleet.init();
            the argument here exists only for compatibility. If the strategy
            passed to fleet.distributed_optimizer() is not None, it overwrites
            the one given to fleet.init() and takes effect in distributed
            training.

    Returns:
        Fleet: instance of fleet.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.distributed.fleet as fleet
            fleet.init(is_collective=True)
            strategy = fleet.DistributedStrategy()
            optimizer = paddle.optimizer.SGD(learning_rate=0.001)
            optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    """
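    # Register the user-defined optimizer on the global fleet environment.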
    fleet_env = fleet.fleet
    fleet_env.user_defined_optimizer = optimizer

    if strategy is not None:
        if fleet_env._is_collective:
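            # A strategy passed here takes precedence over the one given to
            # fleet_env.init(), so warn the user about the override.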
            logger.warning(
                "It is recommended to use DistributedStrategy "
                "in fleet_env.init(). The strategy here is only for compatibility. "
                "If the strategy in fleet_env.distributed_optimizer() is "
                "not None, then it will overwrite the DistributedStrategy in fleet_env.init(), "
                "which will take effect in distributed training."
            )
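        # Deep-copy so that later changes to the caller's strategy object do
        # not silently alter the strategy used by fleet.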
        fleet_env._user_defined_strategy = copy.deepcopy(strategy)

    fleet_env._context = {}

    if fleet_env.worker_num() > 1:
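        # With regular collective communication (heter_ccl_mode off), wrap the
        # optimizer with HybridParallelOptimizer built on the hybrid
        # communication group (fleet_env._hcg).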
        if not fleet_env._user_defined_strategy.heter_ccl_mode:
            hp_optim = HybridParallelOptimizer(
                optimizer, fleet_env._hcg, fleet_env._user_defined_strategy
            )

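            # When the pipeline config already overlaps data-parallel gradient
            # communication, turn off the optimizer's own data-parallel
            # handling so the communication is not done twice.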
            if fleet_env._user_defined_strategy.hybrid_configs[
                "pp_configs"
            ].dp_comm_overlap:
                hp_optim._dp_enable = False

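            # Likewise, let the pipeline layer handle sharding gradient
            # communication when sharding_comm_overlap is enabled.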
            if fleet_env._user_defined_strategy.hybrid_configs[
                "pp_configs"
            ].sharding_comm_overlap:
                hp_optim._sharding_enable = False

            return hp_optim
        else:
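            # heter_ccl_mode: heterogeneous devices are handled by
            # HeterParallelOptimizer instead.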
            return HeterParallelOptimizer(
                optimizer, fleet_env._user_defined_strategy
            )
    else:
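        # Single worker: no distributed wrapping is needed.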
        return optimizer


def distributed_optimizer(*args, **kwargs):
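    # In dynamic-graph (dygraph) mode build the wrapper here; otherwise defer
    # to the static-graph implementation on the fleet instance.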
    if in_dynamic_mode():
        return _dygraph_distributed_optimizer(*args, **kwargs)
    else:
        return fleet.fleet.distributed_optimizer(*args, **kwargs)