# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import unittest

import numpy as np

import paddle
from paddle import static

sys.path.append("..")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import (
    GPTForPretraining,
    GPTModel,
    GPTPretrainingCriterion,
)


def get_gpt_model(
    train_program, start_program, place, batch_size, sequence_len, vocab_size
):
    """Build a small GPT pretraining network in static graph mode.

    Returns the main/startup programs, the loss variable, and a generator
    for synthetic batch data.
    """
    with static.program_guard(train_program, start_program):
        tokens = paddle.static.data(
            name="tokens", shape=[batch_size, sequence_len], dtype='int64'
        )
        position_ids = paddle.static.data(
            name="position_ids", shape=[batch_size, sequence_len], dtype='int64'
        )
        attention_mask = paddle.static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float32',
        )
        labels = paddle.static.data(
            name="labels", shape=[batch_size, sequence_len], dtype='int64'
        )
        loss_mask = paddle.static.data(
            name="loss_mask", shape=[batch_size, sequence_len], dtype='float32'
        )

        gpt = GPTModel(
            vocab_size=1000,
            hidden_size=64,
            num_hidden_layers=2,
            num_attention_heads=8,
            intermediate_size=256,
            hidden_act="gelu",
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            max_position_embeddings=1024,
            type_vocab_size=1,
            initializer_range=0.02,
            pad_token_id=0,
            eos_token_id=7,
            bos_token_id=0,
            eol_token_id=3,
        )

        model = GPTForPretraining(
            gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02
        )
        preds = model(tokens, position_ids, attention_mask)
        criterion = GPTPretrainingCriterion()
        loss = criterion(preds, labels, loss_mask)

    def gen_data():
        # Produce one batch of random tokens, positions, causal masks,
        # labels, and loss masks matching the feed shapes above.
        np.random.seed(2021)
        tokens = []
        position_ids = []
        attention_mask = []
        labels = []
        loss_mask = []
        for _ in range(batch_size):
            tokens.append(np.random.randint(vocab_size, size=sequence_len))
            position_ids.append(np.arange(sequence_len))
            attention_mask.append([np.tril(np.ones(sequence_len))])
            labels.append(np.random.randint(vocab_size, size=sequence_len))
            loss_mask.append(np.ones(sequence_len))
        return tokens, position_ids, attention_mask, labels, loss_mask

    return train_program, start_program, loss, gen_data


class TestRuleBasedTuner(unittest.TestCase):
    def test_gpt_o2(self):
        # Build the serial GPT program, wrap it in a distributed context for
        # a one-node, eight-device cluster, and run the rule-based tuner at
        # optimization level "o2".
        modeling.init_global()
        train_program = static.Program()
        start_program = static.Program()
        batch_size = 8
        sequence_len = 512
        vocab_size = 1000
        place = None
        train_program, start_program, loss, gen_data = get_gpt_model(
            train_program,
            start_program,
            place,
            batch_size,
            sequence_len,
            vocab_size,
        )
        from paddle.distributed.auto_parallel.cluster import Cluster
        from paddle.distributed.auto_parallel.dist_context import (
            DistributedContext,
        )
        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
            RuleBasedTuner,
        )

        clip = paddle.nn.ClipGradByGlobalNorm(0.2)
        opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip)
        cluster = Cluster()
        cluster.gen_default_config_cluster(node_count=1, device_count=8)
        dist_context = DistributedContext(
            serial_main_prog=train_program,
            serial_startup_prog=start_program,
            serial_optimizer=opt,
            serial_loss=loss,
            cluster=cluster,
        )
        dist_context.initialize()
        tuner = RuleBasedTuner(dist_context, level="o2")
        tuner.tune()


if __name__ == "__main__":
    unittest.main()