# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import contextlib
import unittest

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.io import Dataset
from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_model_to_fp16

paddle.enable_static()


class RandomDataset(Dataset):
    """Deterministic synthetic CIFAR-10-shaped samples, so the test needs no data files."""

    def __init__(self, num_samples, seed=123):
        super(RandomDataset, self).__init__()
        np.random.seed(seed)
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([3, 32, 32]).astype('float32')
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples


def reader_decorator(reader):
    def __reader__():
        for i in range(len(reader)):
            yield reader[i]

    return __reader__


def resnet_cifar10(input, depth=32):
    def conv_bn_layer(input,
                      ch_out,
                      filter_size,
                      stride,
                      padding,
                      act='relu',
                      bias_attr=False):
        tmp = fluid.layers.conv2d(
            input=input,
            filter_size=filter_size,
            num_filters=ch_out,
            stride=stride,
            padding=padding,
            act=None,
            bias_attr=bias_attr)
        return fluid.layers.batch_norm(input=tmp, act=act)

    def shortcut(input, ch_in, ch_out, stride):
        if ch_in != ch_out:
            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
        else:
            return input

    def basicblock(input, ch_in, ch_out, stride):
        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
        short = shortcut(input, ch_in, ch_out, stride)
        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')

    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
        tmp = block_func(input, ch_in, ch_out, stride)
        for i in range(1, count):
            tmp = block_func(tmp, ch_out, ch_out, 1)
        return tmp

    assert (depth - 2) % 6 == 0
    n = (depth - 2) // 6
    conv1 = conv_bn_layer(
        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
    # Only ops built under fp16_guard are candidates for FP16 execution when
    # the guard is honored; conv1 above is deliberately left outside.
    with paddle.static.amp.fp16_guard():
        res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
        res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
        res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
        pool = fluid.layers.pool2d(
            input=res3, pool_size=8, pool_type='avg', pool_stride=1)
    return pool
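
# A minimal sketch (purely illustrative; _fp16_guard_sketch is not part of the
# original test and is never called) of how the fp16_guard region above pairs
# with cast_model_to_fp16: when the guard is honored (use_fp16_guard=True),
# only ops created inside the guard are rewritten to FP16, plus the casts
# needed at the FP32/FP16 boundary.
def _fp16_guard_sketch():
    prog = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(prog, startup):
        x = fluid.layers.data(
            name='x_sketch', shape=[3, 32, 32], dtype='float32')
        # Built outside the guard: stays FP32.
        y = fluid.layers.conv2d(input=x, num_filters=8, filter_size=3)
        with paddle.static.amp.fp16_guard():
            # Built inside the guard: eligible for FP16.
            y = fluid.layers.conv2d(input=y, num_filters=8, filter_size=3)
    cast_model_to_fp16(prog, use_fp16_guard=True)
    return prog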
def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
    classdim = 10
    data_shape = [3, 32, 32]
    PASS_NUM = 1

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    train_program.random_seed = 123
    startup_prog.random_seed = 456
    with fluid.program_guard(train_program, startup_prog):
        images = fluid.layers.data(
            name='pixel', shape=data_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        net = resnet_cifar10(images)
        logits = fluid.layers.fc(input=net, size=classdim, act="softmax")
        cost = fluid.layers.softmax_with_cross_entropy(
            logits, label, return_softmax=False)
        sum_cost = fluid.layers.reduce_sum(cost)

        # Clone the test program before the optimizer adds backward ops.
        test_program = train_program.clone(for_test=True)

        if optimizer == "Adam":
            optimizer = paddle.optimizer.AdamW(
                learning_rate=0.001,
                epsilon=1e-8,
                weight_decay=0.0,
                multi_precision=True)
        elif optimizer == "Lars":
            optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
                learning_rate=0.001,
                momentum=0.9,
                multi_precision=use_pure_fp16)
        else:
            optimizer = paddle.optimizer.Momentum(
                learning_rate=0.001,
                momentum=0.9,
                use_nesterov=use_nesterov,
                weight_decay=fluid.regularizer.L2Decay(1e-4),
                multi_precision=use_pure_fp16)

        if use_pure_fp16:
            # Wrap the optimizer for pure-FP16 training; dynamic loss scaling
            # starts at 128 and is adjusted automatically.
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True)

        optimizer.minimize(sum_cost)

    train_reader = paddle.batch(
        reader_decorator(RandomDataset(
            16 * 5, seed=123)),
        batch_size=16,
        drop_last=True)

    test_reader = paddle.batch(
        reader_decorator(RandomDataset(
            4 * 5, seed=456)),
        batch_size=4,
        drop_last=True)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])

    def train_loop():
        exe.run(startup_prog)
        if use_pure_fp16:
            # Cast parameters for FP16 execution after the startup program has
            # initialized them, keeping FP32 master weights for the update.
            optimizer.amp_init(
                place, test_program=test_program, use_fp16_test=True)

        train_loss_list = []
        test_loss_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                loss, = exe.run(train_program,
                                feed=feeder.feed(data),
                                fetch_list=[sum_cost])
                loss_v = loss[0] if isinstance(loss, np.ndarray) else loss
                print('PassID {0:1}, Train Batch ID {1:04}, train loss {2:2.4}'.
                      format(pass_id, batch_id + 1, float(loss_v)))
                train_loss_list.append(float(loss_v))

            for tid, test_data in enumerate(test_reader()):
                loss_t, = exe.run(program=test_program,
                                  feed=feeder.feed(test_data),
                                  fetch_list=[sum_cost])
                test_loss_list.append(float(loss_t))
                print('PassID {0:1}, Test Batch ID {1:04}, test loss {2:2.4}'.
                      format(pass_id, tid + 1, float(loss_t)))

        return train_loss_list, test_loss_list

    return train_loop()


class TestImageMultiPrecision(unittest.TestCase):
    def test_resnet_pure_fp16(self):
        if not fluid.core.is_compiled_with_cuda():
            return

        def do_test(use_nesterov=False, optimizer=""):
            if optimizer == "Adam":
                suffix = "use Adam"
            elif optimizer == "Lars":
                suffix = "use Lars"
            else:
                suffix = "with Nesterov" if use_nesterov else "without Nesterov"
            with self.scope_prog_guard():
                print("-----------------FP16 Train {}-----------------".format(
                    suffix))
                train_loss_fp16, test_loss_fp16 = train(
                    use_pure_fp16=True,
                    use_nesterov=use_nesterov,
                    optimizer=optimizer)
            with self.scope_prog_guard():
                print("-----------------FP32 Train {}-----------------".format(
                    suffix))
                train_loss_fp32, test_loss_fp32 = train(
                    use_pure_fp16=False,
                    use_nesterov=use_nesterov,
                    optimizer=optimizer)

            self.assertTrue(
                np.allclose(
                    np.array(train_loss_fp16),
                    np.array(train_loss_fp32),
                    rtol=1e-02,
                    atol=1e-05,
                    equal_nan=True),
                msg='Failed to train in pure FP16.')
            self.assertTrue(
                np.allclose(
                    np.array(test_loss_fp16),
                    np.array(test_loss_fp32),
                    rtol=1e-02,
                    atol=1e-05,
                    equal_nan=True),
                msg='Failed to test in pure FP16.')

        do_test(use_nesterov=False)
        do_test(use_nesterov=True)
        do_test(optimizer="Adam")
        do_test(optimizer="Lars")

    @contextlib.contextmanager
    def scope_prog_guard(self):
        prog = fluid.Program()
        startup_prog = fluid.Program()
        scope = fluid.core.Scope()
        with fluid.scope_guard(scope):
            with fluid.program_guard(prog, startup_prog):
                yield
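
# train() above relies on dynamic loss scaling: the loss is multiplied by a
# scale factor before backward so small FP16 gradients do not flush to zero,
# and the gradients are unscaled before the optimizer update. A minimal sketch
# of the scale-update policy (_loss_scale_update_sketch is illustrative, is
# simplified to react to a single step, and its parameter values are
# placeholders rather than Paddle's defaults):
def _loss_scale_update_sketch(scale,
                              found_inf,
                              good_steps,
                              incr_every_n_steps=1000,
                              incr_ratio=2.0,
                              decr_ratio=0.5):
    if found_inf:
        # Overflow in the scaled gradients: skip the update, shrink the scale,
        # and restart the run of "good" steps.
        return scale * decr_ratio, 0
    good_steps += 1
    if good_steps >= incr_every_n_steps:
        # A long run of finite gradients: try a larger scale for more headroom.
        return scale * incr_ratio, 0
    return scale, good_steps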
class TestAmpWithNonIterableDataLoader(unittest.TestCase):
    def decorate_with_data_loader(self):
        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, start_prog):
            with paddle.fluid.unique_name.guard():
                image = fluid.layers.data(
                    name='image', shape=[3, 224, 224], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                py_reader = fluid.io.DataLoader.from_generator(
                    feed_list=[image, label],
                    capacity=4,
                    iterable=False,
                    use_double_buffer=False)

                # In-graph label rewrite via Switch, so the program contains
                # control-flow ops when it is cast below.
                zero_var = fluid.layers.fill_constant(
                    shape=[1], dtype='int64', value=0)
                one_var = fluid.layers.fill_constant(
                    shape=[1], dtype='int64', value=1)
                with fluid.layers.control_flow.Switch() as switch:
                    with switch.case(label != zero_var):
                        fluid.layers.assign(input=zero_var, output=label)
                    with switch.default():
                        fluid.layers.assign(input=one_var, output=label)

                net = resnet_cifar10(image)
                logits = fluid.layers.fc(input=net, size=10, act="softmax")

        # Reset the dtype attributes of all mul ops to FP32 before casting the
        # whole model to FP16.
        block = main_prog.global_block()
        for op in block.ops:
            if op.type == "mul":
                op._set_attr('in_dtype', fluid.core.VarDesc.VarType.FP32)
                op._set_attr('out_dtype', fluid.core.VarDesc.VarType.FP32)
                op._set_attr('dtype', fluid.core.VarDesc.VarType.FP32)

        cast_model_to_fp16(main_prog, use_fp16_guard=False)

    def test_non_iterable_dataloader(self):
        if fluid.core.is_compiled_with_cuda():
            self.decorate_with_data_loader()


if __name__ == '__main__':
    unittest.main()
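
# Note: the whole suite requires a CUDA build of Paddle; on CPU-only builds
# every case above returns early. To run a single case (the module name shown
# is illustrative; substitute this file's actual name):
#
#   python -m unittest test_image_classification_fp16.TestImageMultiPrecision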