# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import re

import numpy as np
import paddle as P
import paddle.distributed.fleet as fleet
from propeller.paddle.train.hooks import RunHook

from demo.utils import create_if_not_exists, get_warmup_and_linear_decay

log = logging.getLogger(__name__)


def optimization(
        loss,
        warmup_steps,
        num_train_steps,
        learning_rate,
        train_program,
        startup_prog,
        weight_decay,
        scheduler='linear_warmup_decay',
        use_fp16=False, ):
    """Build the backward pass and AdamW optimizer for static-graph training.

    Returns an `LRStepHook` that advances the LR scheduler after every step,
    and the AMP loss-scaling variable (`None` when `use_fp16` is False).
    """

    def exclude_from_weight_decay(param):
        # Strip the `.master` suffix that AMP appends to the fp32 master copy
        # of an fp16 parameter, then skip decay for layer_norm and bias params.
        name = re.sub(r'\.master$', '', param)
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        learning_rate,
        get_warmup_and_linear_decay(num_train_steps, warmup_steps))
    # `apply_decay_param_fun` must return True for parameters that *should*
    # receive weight decay, so the exclusion predicate is negated here.
    optimizer = P.optimizer.AdamW(
        learning_rate=lr_scheduler,
        weight_decay=weight_decay,
        grad_clip=g_clip,
        apply_decay_param_fun=lambda n: not exclude_from_weight_decay(n))
    if use_fp16:
        log.info('AMP activated')
        if weight_decay > 0.:
            raise ValueError(
                'paddle amp will ignore `weight_decay`, see https://github.com/PaddlePaddle/Paddle/issues/29794'
            )
        #amp_list = P.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
        #    custom_white_list=['softmax', 'layer_norm', 'gelu'])
        optimizer = P.fluid.contrib.mixed_precision.decorate(
            optimizer, init_loss_scaling=2**15, use_dynamic_loss_scaling=True)
        _, param_grads = optimizer.minimize(loss)
        loss_scaling = P.static.default_main_program().global_block().var(
            'loss_scaling_0')
    else:
        _, param_grads = optimizer.minimize(loss)
        loss_scaling = None

    class LRStepHook(RunHook):
        def after_run(self, _, __):
            # Advance the LR scheduler once per training step.
            lr_scheduler.step()
            log.debug('lr step: %.5f' % lr_scheduler.get_lr())

    return LRStepHook(), loss_scaling
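

# Minimal usage sketch (illustrative only, not part of the original demo):
# it builds a toy static-graph regression program and wires it to
# `optimization`. The network, shapes, hyper-parameters and the manual
# `after_run` call are assumptions for demonstration; in a real training run,
# propeller invokes the returned hook after each step.
if __name__ == '__main__':
    P.enable_static()
    startup_prog = P.static.Program()
    train_prog = P.static.Program()
    with P.static.program_guard(train_prog, startup_prog):
        x = P.static.data(name='x', shape=[-1, 16], dtype='float32')
        y = P.static.data(name='y', shape=[-1, 1], dtype='float32')
        pred = P.static.nn.fc(x, size=1)
        loss = P.mean(P.nn.functional.square_error_cost(pred, y))
        lr_hook, loss_scaling = optimization(
            loss=loss,
            warmup_steps=10,
            num_train_steps=100,
            learning_rate=1e-3,
            train_program=train_prog,
            startup_prog=startup_prog,
            weight_decay=0.01,
            use_fp16=False)

    exe = P.static.Executor(P.CPUPlace())
    exe.run(startup_prog)
    for step in range(3):
        exe.run(train_prog,
                feed={
                    'x': np.random.rand(4, 16).astype('float32'),
                    'y': np.random.rand(4, 1).astype('float32'),
                },
                fetch_list=[loss])
        # Step the LR scheduler by hand here, as the propeller runner would.
        lr_hook.after_run(None, None)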