diff --git a/test/collective/fleet/hybrid_parallel_mp_amp.py b/test/collective/fleet/hybrid_parallel_mp_amp.py
index 2ae5fcf70291c85e579036985cc0991066f0117d..7b139c096647f48b278345099b5e2d8b518cca88 100644
--- a/test/collective/fleet/hybrid_parallel_mp_amp.py
+++ b/test/collective/fleet/hybrid_parallel_mp_amp.py
@@ -14,13 +14,13 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_bf16.py b/test/collective/fleet/hybrid_parallel_mp_bf16.py
index ae977f9891768b4567021452be629d32a8bb8bb6..2ddf1868dd0e51cdd4a4b140165fbb754ffa1144 100644
--- a/test/collective/fleet/hybrid_parallel_mp_bf16.py
+++ b/test/collective/fleet/hybrid_parallel_mp_bf16.py
@@ -14,14 +14,14 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
 from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
index b376afa1a9913a9617ec13162a7895c112470d80..9872bbf20e9275971d4f4b069aeeedd1bfae6d70 100644
--- a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
+++ b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
@@ -19,7 +19,7 @@ import numpy as np
 from hybrid_parallel_mp_model import (
     SimpleDPNet,
     SimpleMPNet,
-    TestDistMPTraning,
+    TestDistMPTraining,
     parallel_matmul,
     set_random_seed,
 )
@@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet):
         return x
-class TestMPBroadcastObj(TestDistMPTraning):
+class TestMPBroadcastObj(TestDistMPTraining):
     def build_model_optimizer(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
diff --git a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
index 1f674675ecb8c58f757160410302c89b5b94726b..dab3998f9814de5742d8857b43d560c68e615214 100644
--- a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
+++ b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
@@ -14,7 +14,7 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
@@ -22,7 +22,7 @@ import paddle
 # log.setLevel(logging.WARNING)
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_fp16.py b/test/collective/fleet/hybrid_parallel_mp_fp16.py
index 5d10ac0a7aebcf96afbee7478cdf277cb0d42088..ca5fdc3a4062e096d2fba95cf60c012dd2b3e702 100644
--- a/test/collective/fleet/hybrid_parallel_mp_fp16.py
+++ b/test/collective/fleet/hybrid_parallel_mp_fp16.py
@@ -14,13 +14,13 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_layers.py b/test/collective/fleet/hybrid_parallel_mp_layers.py
index 751bc9255c100a5bbdb14a991fe59776edff7a92..4a7b223c6879a4f9d3cb10ab95be16d88457415d 100644
--- a/test/collective/fleet/hybrid_parallel_mp_layers.py
+++ b/test/collective/fleet/hybrid_parallel_mp_layers.py
@@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer):
         return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_model.py b/test/collective/fleet/hybrid_parallel_mp_model.py
index 44ceb368b9b67920a47a31bcc2ad54ac6fa2cc48..08ae8f51e47f139db456d08d041528c4345cc690 100644
--- a/test/collective/fleet/hybrid_parallel_mp_model.py
+++ b/test/collective/fleet/hybrid_parallel_mp_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistMPSyncTraning(unittest.TestCase):
+class TestDistMPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase):
         )
-class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
+class TestDistMPSyncModelTraining(TestDistMPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
index fa78482601f4035a3ca8e7d32fa62b03077b4a31..a4f11294f3815be33275f0d87c33c48bb9d84266 100644
--- a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
+++ b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
@@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistSPSyncTraning(unittest.TestCase):
+class TestDistSPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase):
         )
-class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
+class TestDistSPSyncModelTraining(TestDistSPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistSPTraning(unittest.TestCase):
+class TestDistSPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_random.py b/test/collective/fleet/hybrid_parallel_mp_random.py
index 717e983e262596790aa98cf4f438fe7e1fc191c9..00877818b1e7ecb5c2169fea075c1dd13b933934 100644
--- a/test/collective/fleet/hybrid_parallel_mp_random.py
+++ b/test/collective/fleet/hybrid_parallel_mp_random.py
@@ -20,7 +20,7 @@ import paddle
 from paddle.distributed import fleet
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_pp_amp.py b/test/collective/fleet/hybrid_parallel_pp_amp.py
index 7c9f973ee1302c616996ef6b400fbc36e2f3395c..f3fe88a9161cfc5f2a11912ed12acfa134558af8 100644
--- a/test/collective/fleet/hybrid_parallel_pp_amp.py
+++ b/test/collective/fleet/hybrid_parallel_pp_amp.py
@@ -34,7 +34,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_bf16.py b/test/collective/fleet/hybrid_parallel_pp_bf16.py
index 70b3aec1515a84867602c98c12374a1a7fc95583..f260cd88f2f20b10d510cd7c064f2b8e3aaaa4a4 100644
--- a/test/collective/fleet/hybrid_parallel_pp_bf16.py
+++ b/test/collective/fleet/hybrid_parallel_pp_bf16.py
@@ -35,7 +35,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
index 695e3ee90ef6590bdc6cb33f79cf8b495aadf349..76b0fa90c05aab2a06311def6956ef5f8ab46583 100644
--- a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
+++ b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
@@ -17,12 +17,12 @@ import unittest
 sys.path.append("../../legacy_test")
-from hybrid_parallel_pp_alexnet import TestDistPPTraning
+from hybrid_parallel_pp_alexnet import TestDistPPTraining
 import paddle
-class TestPPClipGrad(TestDistPPTraning):
+class TestPPClipGrad(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
@@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
-class TestPPClipGradParamGroup(TestDistPPTraning):
+class TestPPClipGradParamGroup(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
diff --git a/test/collective/fleet/hybrid_parallel_pp_embedding.py b/test/collective/fleet/hybrid_parallel_pp_embedding.py
index 2dd335eb0efec6e979c470976d97c1f2ee311934..d485e77a79972857d5012a77c6c5b2c4aa0d384b 100644
--- a/test/collective/fleet/hybrid_parallel_pp_embedding.py
+++ b/test/collective/fleet/hybrid_parallel_pp_embedding.py
@@ -120,7 +120,7 @@ class SimpleNetPipe(Layer):
         return feat
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_fp16.py b/test/collective/fleet/hybrid_parallel_pp_fp16.py
index 84430f4be3730360e79177405d1950705a3f68e0..c6c107a852a222340f99b49860431a8a1589cefc 100644
--- a/test/collective/fleet/hybrid_parallel_pp_fp16.py
+++ b/test/collective/fleet/hybrid_parallel_pp_fp16.py
@@ -38,7 +38,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py
index 793f2effd09bc9f4c70b62d24cb8f3f585d31ebb..fd03b562f25a36acb375baeb5f0969000cccd804 100644
--- a/test/collective/fleet/hybrid_parallel_pp_recompute.py
+++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load.py b/test/collective/fleet/hybrid_parallel_pp_save_load.py
index a2ab6a0654d5a44fd3587100213ec8d0a5998f75..cfa7c6961aef15e28e07d288fef7df84729b7454 100644
--- a/test/collective/fleet/hybrid_parallel_pp_save_load.py
+++ b/test/collective/fleet/hybrid_parallel_pp_save_load.py
@@ -30,7 +30,7 @@ micro_batch_size = 2
 vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
index 5d6152f4e9f05c964f9fb32ce811e59dbb0ab8a3..8e922792cdb8a4c04e73b868b42674b737c297d7 100644
--- a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
@@ -33,7 +33,7 @@ micro_batch_size = 2
 vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer.py b/test/collective/fleet/hybrid_parallel_pp_transformer.py
index 216f37796daf08dd904117b9b720ff86b5432a94..18986e3df34fbe976f0fca8c821bdf9152fbb9cc 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
index a8cf970f73d229cd68af1831df477022ad9359ac..1bd865e107ec4ed3a5b5e95de47efb7aaf3812d1 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
@@ -29,7 +29,7 @@ vocab_size = 128
 transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
index 372cbe7f48d93a4295d921561b9302b32c9f71be..fea23b62661b9118e928c68ad504c8f01acc0c0f 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
@@ -33,7 +33,7 @@ vocab_size = 128
 transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
index 60b1e1052bda3d613a87e3be286078381afe5fdd..bc43f514ea9a62cff8672e12e0492c65cdc82122 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 from hybrid_parallel_pp_transformer import (
     ModelPipe,
-    TestDistPPTraning,
+    TestDistPPTraining,
     batch_size,
     length,
     micro_batch_size,
@@ -30,7 +30,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
-class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+class TestDistPPTrainingUnbalancedData(TestDistPPTraining):
     def test_pp_model(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
index 3d7991727ee83599e94d5fef25f09a104953b10a..35a17f17acea249cdfc32f4f6564043247f5a85b 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
@@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_qat.py b/test/collective/fleet/hybrid_parallel_qat.py
index 11662d849eb4d1d0bfecbfe2497e227f53807d89..00bc0f746e761b34f30c6a35b245c55a35da5fc7 100644
--- a/test/collective/fleet/hybrid_parallel_qat.py
+++ b/test/collective/fleet/hybrid_parallel_qat.py
@@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer):
         return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_sharding_model.py b/test/collective/fleet/hybrid_parallel_sharding_model.py
index bb1ace6a6a47653f272174dc456cdea3e06bb768..41343d2dbda9edabb868ed889dee2f225baa3f81 100644
--- a/test/collective/fleet/hybrid_parallel_sharding_model.py
+++ b/test/collective/fleet/hybrid_parallel_sharding_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         random.seed(2021)
         np.random.seed(2021)
diff --git a/test/collective/fleet/hybrid_parallel_shared_weight.py b/test/collective/fleet/hybrid_parallel_shared_weight.py
index c110d5d00a4a91f5694ff5f9b12588dfcf33f6ff..f54b994768740ad040923d4d4354a97aa9ae9939 100644
--- a/test/collective/fleet/hybrid_parallel_shared_weight.py
+++ b/test/collective/fleet/hybrid_parallel_shared_weight.py
@@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer):
         super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs)
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
index a3f4d5f0b1610dd2910bb324fc269173b9823c27..85f24f61d794dcae7c06fef10f54b66a2dbc2409 100644
--- a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
+++ b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/collective/fleet/test_fleet_static_mp_layers.py b/test/collective/fleet/test_fleet_static_mp_layers.py
index 6f20943a12837194f23a062fcaec271d03f80a5e..3cd07943d114d306ad66a609766ee2b599ee33a8 100644
--- a/test/collective/fleet/test_fleet_static_mp_layers.py
+++ b/test/collective/fleet/test_fleet_static_mp_layers.py
@@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer):
         return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "2"
         os.environ[
diff --git a/test/legacy_test/auto_parallel_gpt_model.py b/test/legacy_test/auto_parallel_gpt_model.py
index 1be27f9bc803ab43a2296f58cefb392d156ecede..4d5e1955e23f118b2bd5131ec3a5805ec67a95f3 100644
--- a/test/legacy_test/auto_parallel_gpt_model.py
+++ b/test/legacy_test/auto_parallel_gpt_model.py
@@ -35,7 +35,7 @@ def init_global():
 class MultiHeadAttention(nn.Layer):
     """
-    Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly
     attending to information from different representation subspaces.
     """
@@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer):
     def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         """
-        Prapares linear projected queries, keys and values for usage of subsequnt
+        Prepares linear projected queries, keys and values for usage of subsequent
         multiple parallel attention. If `cache` is not None, using cached results
         to reduce redundant calculations.
         """
@@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer):
     def gen_cache(self, key, value=None, type=Cache):
         """
-        Generates cache for `forward` usage in inference accroding to arguments.
+        Generates cache for `forward` usage in inference according to arguments.
         The generated cache is an instance of `MultiHeadAttention.Cache` or an
         instance of `MultiHeadAttention.StaticCache`.
         """
@@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer):
             ones = paddle.ones_like(input_ids, dtype="int64")
             seq_length = paddle.cumsum(ones, axis=-1)
             position_ids = seq_length - ones
-        input_embedings = self.word_embeddings(input_ids)
+        input_embeddings = self.word_embeddings(input_ids)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight, _global_process_mesh, ["x", None]
@@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer):
             )
         position_embeddings = self.position_embeddings(position_ids)
-        embeddings = input_embedings + position_embeddings
+        embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout(embeddings)
         return embeddings
diff --git a/test/legacy_test/benchmark_sum_op.py b/test/legacy_test/benchmark_sum_op.py
index 6854fd7f208c5edeffb78f4941027c93f8fa150d..7bc15957e1cdabdf109c0ac4de076f6a2072125e 100644
--- a/test/legacy_test/benchmark_sum_op.py
+++ b/test/legacy_test/benchmark_sum_op.py
@@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite):
     def test_timeit_output(self):
         """
-        perf the op, time cost will be averged in iters.
+        perf the op, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
         >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
@@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite):
     def test_timeit_grad(self):
         """
-        perf the op gradient, time cost will be averged in iters.
+        perf the op gradient, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
         >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
diff --git a/test/legacy_test/dist_fleet_ctr.py b/test/legacy_test/dist_fleet_ctr.py
index a5634a0cfba28fd23ccc1a23ead9858a536861a8..419344edfae5de7df40a4b1b8b775090ce93d21f 100644
--- a/test/legacy_test/dist_fleet_ctr.py
+++ b/test/legacy_test/dist_fleet_ctr.py
@@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             dnn_out = fc
         # build lr model
-        lr_embbding = paddle.static.nn.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
@@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             padding_idx=0,
         )
         lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=lr_embbding.squeeze(-2), pool_type="sum"
+            input=lr_embedding.squeeze(-2), pool_type="sum"
         )
         merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
diff --git a/test/legacy_test/dist_hapi_mnist_dynamic.py b/test/legacy_test/dist_hapi_mnist_dynamic.py
index 7fa896cf3ddc270e8144e87885cab9c93eb838d4..66b5f66119b4f8e5eea03477aaba55134a66b13f 100644
--- a/test/legacy_test/dist_hapi_mnist_dynamic.py
+++ b/test/legacy_test/dist_hapi_mnist_dynamic.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')
diff --git a/test/legacy_test/dist_hapi_mnist_static.py b/test/legacy_test/dist_hapi_mnist_static.py
index 9d9b488f907e6f238afdba8b120c57cf50b667e6..c465ef7fe85a368e4226bdb920ade3b950fb4f3a 100644
--- a/test/legacy_test/dist_hapi_mnist_static.py
+++ b/test/legacy_test/dist_hapi_mnist_static.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_static_multiple_gpus(self):
         paddle.enable_static()
         device = set_device('gpu')
diff --git a/test/legacy_test/dist_hapi_pure_fp16_static.py b/test/legacy_test/dist_hapi_pure_fp16_static.py
index f63866991fe267640893b85efa4d7690ab184842..3506b22d31f3003765c015444940206f5a488428 100644
--- a/test/legacy_test/dist_hapi_pure_fp16_static.py
+++ b/test/legacy_test/dist_hapi_pure_fp16_static.py
@@ -26,10 +26,10 @@ from paddle.vision.models import LeNet
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraningWithPureFP16(unittest.TestCase):
+class TestDistTrainingWithPureFP16(unittest.TestCase):
     def test_amp_training_purefp16(self):
         if not fluid.is_compiled_with_cuda():
-            self.skipTest('module not tested when ONLY_CPU compling')
+            self.skipTest('module not tested when ONLY_CPU compiling')
         data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32)
         label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64)
diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py
index 23e69c6b8ae62d5313b6edc71c4913d246cb0542..c24691eda1503057a48f16e9fdb789b35dd76f85 100644
--- a/test/legacy_test/gradient_checker.py
+++ b/test/legacy_test/gradient_checker.py
@@ -269,7 +269,7 @@ def grad_check(
     if program is None:
         program = fluid.default_main_program()
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -595,7 +595,7 @@ def get_static_double_grad(
     if program is None:
         program = fluid.default_main_program()
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -657,7 +657,7 @@ def get_eager_double_grad(
         the second order derivative and the inputs of second order derivative's calculation
         will be returned for higher order derivative's calculation.
         If 'return_mid_result' set False.
-        A list of numpy array that stores second derivative result calulated by dygraph.
+        A list of numpy array that stores second derivative result calculated by dygraph.
     """
     if isinstance(place, fluid.CPUPlace):
         paddle.set_device("cpu")
@@ -684,7 +684,7 @@ def get_eager_double_grad(
     )
     d_inputs = [d_input for d_input in d_inputs if d_input is not None]
-    # calcluate second derivative
+    # calculate second derivative
     inputs = inputs + dys
     ddys = []
     if return_mid_result:
@@ -808,7 +808,7 @@ def get_static_triple_grad(
         program (Program|None): a Program with forward pass.
             If None, use fluid.default_main_program().
     Returns:
-        A list of numpy array that stores third derivative result calulated by static graph.
+        A list of numpy array that stores third derivative result calculated by static graph.
     """
     if program is None:
         program = fluid.default_main_program()
@@ -858,13 +858,13 @@ def get_eager_triple_grad(
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
         return_mid_result (list[Tensor], list[Tensor]): If set True, the
     Returns:
-        A list of numpy array that stores second derivative result calulated by dygraph
+        A list of numpy array that stores second derivative result calculated by dygraph
     """
     dd_y, dd_x = get_eager_double_grad(
         func, x_init, dy_init, place, return_mid_result=True
     )
-    # calcluate third derivative
+    # calculate third derivative
     dddys = []
     for dd_yi in dd_y:
         dd_yi.stop_gradient = False
diff --git a/test/legacy_test/hybrid_parallel_pp_alexnet.py b/test/legacy_test/hybrid_parallel_pp_alexnet.py
index d8e433456d1250ab7280ca9a67448c8a07d323d4..dc461176da1679ac2798012721f73545e4a8b441 100644
--- a/test/legacy_test/hybrid_parallel_pp_alexnet.py
+++ b/test/legacy_test/hybrid_parallel_pp_alexnet.py
@@ -41,7 +41,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase):
         )
-class TestDistPPDelayScaleLoss(TestDistPPTraning):
+class TestDistPPDelayScaleLoss(TestDistPPTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistPPMainGrad(TestDistPPTraning):
+class TestDistPPMainGrad(TestDistPPTraining):
     def wrapper_mix_precision(self, model, optimizer):
         model = MixPrecisionLayer(model, dtype="float16")
         optimizer = MixPrecisionOptimizer(optimizer)
diff --git a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
index febf4b5f87d4ac52adc863ad071869723aff22d2..87167fd7318ab5fffa3ef70f111f456bf70b0570 100644
--- a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/legacy_test/parallel_dygraph_gradient_check.py b/test/legacy_test/parallel_dygraph_gradient_check.py
index a6c47b65d8f5a2f22ad6aed2843627a87bbbc7ed..5566b74f9aa65454636d833a6d2019c6a0a87e52 100644
--- a/test/legacy_test/parallel_dygraph_gradient_check.py
+++ b/test/legacy_test/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
index df66ff7616a08d372fb04d49ae08a4fcf2c6db05..7cb8c326a3edef36dc17d1c55e813a08e24d7beb 100644
--- a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
diff --git a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
index 6db6e9e62a5f94650b4f8cb85d99dbc0829e4784..805bf251acce6f51c9a150bbfc5b2dae5fcfcfcd 100644
--- a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/xpu/parallel_dygraph_gradient_check.py b/test/xpu/parallel_dygraph_gradient_check.py
index 9de687f524aae6a03b0e398478d952870731184f..4f9ac5e3b98c78977e74d434993627efb2e9d3da 100644
--- a/test/xpu/parallel_dygraph_gradient_check.py
+++ b/test/xpu/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
index f0e46b2db27c21e27b735e48e9acb9fb876b35eb..497b4b025a2bb839655f289060653bd848075e66 100644
--- a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()