# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import unittest

import numpy as np

import paddle
from paddle import static

sys.path.append("..")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import (
    GPTForPretraining,
    GPTModel,
    GPTPretrainingCriterion,
)


def get_gpt_model(
    train_program, start_program, place, batch_size, sequence_len, vocab_size
):
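    """Build a small GPT pretraining program in static-graph mode.

    Returns the (train_program, start_program) pair, the loss variable, and
    a generator that produces one synthetic batch of inputs.
    """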
    with static.program_guard(train_program, start_program):
        tokens = paddle.static.data(
            name="tokens", shape=[batch_size, sequence_len], dtype='int64'
        )
        position_ids = paddle.static.data(
            name="position_ids", shape=[batch_size, sequence_len], dtype='int64'
        )
        attention_mask = paddle.static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float32',
        )
        labels = paddle.static.data(
            name="labels", shape=[batch_size, sequence_len], dtype='int64'
        )
        loss_mask = paddle.static.data(
            name="loss_mask", shape=[batch_size, sequence_len], dtype='float32'
        )

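        # A deliberately small GPT configuration (2 layers, hidden size 64)
        # so the program builds and tunes quickly in a unit test.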
        gpt = GPTModel(
            vocab_size=1000,
            hidden_size=64,
            num_hidden_layers=2,
            num_attention_heads=8,
            intermediate_size=256,
            hidden_act="gelu",
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            max_position_embeddings=1024,
            type_vocab_size=1,
            initializer_range=0.02,
            pad_token_id=0,
            eos_token_id=7,
            bos_token_id=0,
            eol_token_id=3,
        )

        model = GPTForPretraining(
            gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02
        )
        preds = model(tokens, position_ids, attention_mask)
        criterion = GPTPretrainingCriterion()
        loss = criterion(preds, labels, loss_mask)

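    # Synthetic batch: random tokens and labels, sequential position ids,
    # a lower-triangular (causal) attention mask, and an all-ones loss mask.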
    def gen_data():
        np.random.seed(2021)
        tokens = []
        position_ids = []
        attention_mask = []
        labels = []
        loss_mask = []
        for _ in range(batch_size):
            tokens.append(np.random.randint(vocab_size, size=sequence_len))
            position_ids.append(np.arange(sequence_len))
            attention_mask.append([np.tril(np.ones(sequence_len))])
            labels.append(np.random.randint(vocab_size, size=sequence_len))
            loss_mask.append(np.ones(sequence_len))

        return tokens, position_ids, attention_mask, labels, loss_mask

    return train_program, start_program, loss, gen_data


class TestRuleBasedTuner(unittest.TestCase):
    def test_gpt_o2(self):
        modeling.init_global()
        train_program = static.Program()
        start_program = static.Program()
        batch_size = 8
        sequence_len = 512
        vocab_size = 1000
        place = None
        train_program, start_program, loss, gen_data = get_gpt_model(
            train_program,
            start_program,
            place,
            batch_size,
            sequence_len,
            vocab_size,
        )

        from paddle.distributed.auto_parallel.static.cluster import Cluster
        from paddle.distributed.auto_parallel.static.dist_context import (
            DistributedContext,
        )
        from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import (
            RuleBasedTuner,
        )

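        # Serial optimizer with global gradient-norm clipping, handed to the
        # DistributedContext below.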
        clip = paddle.nn.ClipGradByGlobalNorm(0.2)
        opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip)

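        # Target hardware description: a default single-node cluster with 8 devices.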
        cluster = Cluster()
        cluster.gen_default_config_cluster(node_count=1, device_count=8)
        dist_context = DistributedContext(
            serial_main_prog=train_program,
            serial_startup_prog=start_program,
            serial_optimizer=opt,
            serial_loss=loss,
            cluster=cluster,
        )
        dist_context.initialize()
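        # Run the rule-based tuner at optimization level "o2" to derive a
        # parallelization plan for the serial program.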
        tuner = RuleBasedTuner(dist_context, level="o2")
        tuner.tune()


if __name__ == "__main__":
    unittest.main()