# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import sys
import time
import unittest

import numpy as np

import paddle
from paddle import fluid
from paddle.fluid import compiler, core

# Enable eager deletion mode so the IR memory-optimization passes are
# actually exercised during the test run.
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'
os.environ['CPU_NUM'] = '2'


class BuildIrMemOptBase(unittest.TestCase):
    def setup_reader(self):
        self.batch_size = 32
        self.word_dict = paddle.dataset.imdb.word_dict()
        self.train_reader = paddle.batch(
            paddle.dataset.imdb.train(self.word_dict),
            batch_size=self.batch_size,
        )

    def check_network_convergence(
        self,
        network,
        use_cuda=True,
        use_ir_memory_optimize=True,
        enable_inplace=True,
        iter=5,
    ):
        if use_cuda and not core.is_compiled_with_cuda():
            self.skipTest(
                'Skip use_cuda=True because Paddle is not compiled with CUDA'
            )
        if os.name == 'nt':
            self.skipTest(
                'Skip because Paddle comes without parallel support on Windows'
            )
        fluid.default_startup_program().random_seed = 100
        fluid.default_main_program().random_seed = 100

        data = paddle.static.data(
            name="words", shape=[-1, 1], dtype="int64", lod_level=1
        )
        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")

        cost = network(data, label, len(self.word_dict))
        optimizer = paddle.optimizer.Adam(learning_rate=0.001)
        optimizer.minimize(cost)
        build_strategy = fluid.BuildStrategy()
        build_strategy.enable_inplace = enable_inplace
        build_strategy.memory_optimize = use_ir_memory_optimize

        # execution
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        train_cp = compiler.CompiledProgram(
            fluid.default_main_program(), build_strategy=build_strategy
        )
        fetch_list = [cost.name]

        begin = time.time()
        first_loss, last_loss = None, None
        step_id = 0
        custom_iter = getattr(self, "iter", None)
        if custom_iter is not None:
            iter = custom_iter
        for batch in self.train_reader():
            # DataFeeder.feed converts one mini-batch of samples into the
            # feed dict expected by Executor.run.
            ret = exe.run(
                train_cp, feed=feeder.feed(batch), fetch_list=fetch_list
            )
            print(ret)
            step_id += 1
            if step_id == 1:
                first_loss = ret[0]
            if step_id == iter:
                last_loss = ret[0]
                break
        end = time.time()

        print(
            "%.4f instances per second"
            % ((self.batch_size * iter) / (end - begin))
        )

        print(first_loss, last_loss)
        avg_last_loss_val = np.array(last_loss).mean()
        avg_first_loss_val = np.array(first_loss).mean()
        if math.isnan(float(avg_last_loss_val)) or math.isnan(
            float(avg_first_loss_val)
        ):
            sys.exit("got NaN loss, training failed.")

        return first_loss, last_loss


class TestIrMemOptBase(BuildIrMemOptBase):
    def setUp(self):
        self.network = None

    def test_network(self):
        if self.network is None or not core.is_compiled_with_cuda():
            return

        self.setup_reader()

        # Build and train the same network twice, each run in a fresh
        # program and scope so the two graphs do not collide; with fixed
        # seeds both runs should reach identical losses.
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            with fluid.scope_guard(core.Scope()):
                (
                    baseline_first_loss,
                    baseline_last_loss,
                ) = self.check_network_convergence(self.network)
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            with fluid.scope_guard(core.Scope()):
                cur_first_loss, cur_last_loss = self.check_network_convergence(
                    self.network
                )

        self.assertAlmostEqual(
            np.mean(baseline_last_loss),
            np.mean(cur_last_loss),
            delta=1e-6,
        )
        self.assertAlmostEqual(
            np.mean(baseline_first_loss),
            np.mean(cur_first_loss),
            delta=1e-6,
        )
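
# Usage sketch (illustrative only, assuming the legacy ``fluid.layers`` API
# is available in this build): concrete tests subclass TestIrMemOptBase and
# point ``self.network`` at a function taking (data, label, dict_dim) and
# returning the loss to minimize. The bag-of-words builder below is a
# hypothetical example, not part of this harness.
def _example_bow_net(data, label, dict_dim, emb_dim=128):
    # Embed the word ids, sum-pool over the sequence, then classify.
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    prediction = fluid.layers.fc(input=bow, size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    return paddle.mean(cost)

# A concrete test would then look like the following (kept commented out so
# importing this base module does not register an extra test):
#
#     class TestBowNetMemOpt(TestIrMemOptBase):
#         def setUp(self):
#             self.network = _example_bow_net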