# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import unittest

from get_gpt_model import FakeDataset

import paddle
from paddle.distributed.fleet import auto

sys.path.append("../legacy_test")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import (
    GPTForPretraining,
    GPTModel,
    GPTPretrainingCriterion,
)


def generate_model():
    # Build a serial (single-device) GPT model with full recompute enabled,
    # so the tuning pass can decide which segments not to recompute.
    modeling.init_global()
    modeling._global_parallel_strategy = "serial"

    gpt = GPTModel(
        vocab_size=50304,
        hidden_size=1024,
        num_hidden_layers=13,
        num_attention_heads=16,
        intermediate_size=1024 * 4,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=1024,
        type_vocab_size=1,
        initializer_range=0.02,
        pad_token_id=0,
        eos_token_id=7,
        bos_token_id=0,
        eol_token_id=3,
        use_new_recompute=True,
        recompute_granularity="full",
    )
    model = GPTForPretraining(
        gpt, vocab_size=50304, hidden_size=1024, initializer_range=0.02
    )
    criterion = GPTPretrainingCriterion()
    return model, criterion


def apply_pass():
    # Enable the recompute pass and its tuner; profile steps 1-2, then
    # continue the run with the tuned configuration.
    strategy = auto.Strategy()
    strategy.auto_mode = "semi"

    recompute = strategy.recompute
    recompute.enable = True
    recompute.enable_tuning = True

    tuning = strategy.tuning
    tuning.enable = True
    tuning.profile_start_step = 1
    tuning.profile_end_step = 2
    tuning.run_after_tuning = True
    tuning.verbose = True

    return strategy


class TestRecomputePassTuning(unittest.TestCase):
    def setUp(self):
        self.batch_size = 8
        self.batch_num = 200
        self.dataset = FakeDataset(
            self.batch_size * self.batch_num,
            vocab_size=50304,
            sequence_len=1024,
        )

    def test_recompute_pass(self):
        strategy = apply_pass()
        clip = paddle.nn.ClipGradByGlobalNorm(0.2)
        opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip)
        model, loss = generate_model()

        engine = auto.Engine(model, loss, opt, strategy=strategy)
        engine._tune(self.dataset, 3, batch_size=self.batch_size)

        # Round the device's total memory to whole GiB so we can branch on
        # the expected tuning outcome for common GPU sizes.
        gpu_memory_size = round(
            paddle.device.cuda.get_device_properties(0).total_memory
            / 1024
            / 1024
            / 1024
        )
        dist_strategy = engine._dist_contexts['train'].strategy
        if gpu_memory_size in [16, 32]:
            self.assertGreater(
                len(dist_strategy.recompute.no_recompute_segments),
                0,
                "When the GPU memory size is 16GB or 32GB, the length of no_recompute_segments should be greater than 0.",
            )
        elif gpu_memory_size >= 40:
            self.assertEqual(
                dist_strategy.recompute.enable,
                False,
                "When the GPU memory size is 40GB or larger, the recompute strategy should be disabled.",
            )


if __name__ == "__main__":
    unittest.main()