From 4d094b0c20c54ff14068e9ada9c34a781493ac66 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 7 Aug 2023 15:12:44 +0800 Subject: [PATCH] Fix typos (#56008) --- test/collective/fleet/hybrid_parallel_mp_amp.py | 4 ++-- test/collective/fleet/hybrid_parallel_mp_bf16.py | 4 ++-- .../fleet/hybrid_parallel_mp_broadcast_obj.py | 4 ++-- .../fleet/hybrid_parallel_mp_clip_grad.py | 4 ++-- test/collective/fleet/hybrid_parallel_mp_fp16.py | 4 ++-- test/collective/fleet/hybrid_parallel_mp_layers.py | 2 +- test/collective/fleet/hybrid_parallel_mp_model.py | 6 +++--- ...rid_parallel_mp_model_with_sequence_parallel.py | 6 +++--- test/collective/fleet/hybrid_parallel_mp_random.py | 2 +- test/collective/fleet/hybrid_parallel_pp_amp.py | 2 +- test/collective/fleet/hybrid_parallel_pp_bf16.py | 2 +- .../fleet/hybrid_parallel_pp_clip_grad.py | 6 +++--- .../fleet/hybrid_parallel_pp_embedding.py | 2 +- test/collective/fleet/hybrid_parallel_pp_fp16.py | 2 +- .../fleet/hybrid_parallel_pp_recompute.py | 2 +- .../fleet/hybrid_parallel_pp_save_load.py | 2 +- ...rid_parallel_pp_save_load_with_virtual_stage.py | 2 +- .../fleet/hybrid_parallel_pp_transformer.py | 2 +- .../fleet/hybrid_parallel_pp_transformer_save.py | 2 +- ...allel_pp_transformer_save_with_virtual_stage.py | 2 +- ...brid_parallel_pp_transformer_unbalanced_data.py | 4 ++-- ...d_parallel_pp_transformer_with_virtual_stage.py | 2 +- test/collective/fleet/hybrid_parallel_qat.py | 2 +- .../fleet/hybrid_parallel_sharding_model.py | 2 +- .../fleet/hybrid_parallel_shared_weight.py | 2 +- .../parallel_dygraph_no_sync_gradient_check.py | 2 +- .../fleet/test_fleet_static_mp_layers.py | 2 +- test/legacy_test/auto_parallel_gpt_model.py | 10 +++++----- test/legacy_test/benchmark_sum_op.py | 4 ++-- test/legacy_test/dist_fleet_ctr.py | 4 ++-- test/legacy_test/dist_hapi_mnist_dynamic.py | 2 +- test/legacy_test/dist_hapi_mnist_static.py | 2 +- test/legacy_test/dist_hapi_pure_fp16_static.py | 4 ++-- test/legacy_test/gradient_checker.py | 14 +++++++------- test/legacy_test/hybrid_parallel_pp_alexnet.py | 6 +++--- .../parallel_dygraph_dataparallel_with_pylayer.py | 2 +- .../legacy_test/parallel_dygraph_gradient_check.py | 2 +- ...arallel_dygraph_gradient_check_in_eager_mode.py | 2 +- .../parallel_dygraph_dataparallel_with_pylayer.py | 2 +- test/xpu/parallel_dygraph_gradient_check.py | 2 +- ...arallel_dygraph_gradient_check_in_eager_mode.py | 2 +- 41 files changed, 68 insertions(+), 68 deletions(-) diff --git a/test/collective/fleet/hybrid_parallel_mp_amp.py b/test/collective/fleet/hybrid_parallel_mp_amp.py index 2ae5fcf7029..7b139c09664 100644 --- a/test/collective/fleet/hybrid_parallel_mp_amp.py +++ b/test/collective/fleet/hybrid_parallel_mp_amp.py @@ -14,13 +14,13 @@ import unittest -from hybrid_parallel_mp_model import TestDistMPTraning +from hybrid_parallel_mp_model import TestDistMPTraining import paddle from paddle.distributed import fleet -class TestMPClipGrad(TestDistMPTraning): +class TestMPClipGrad(TestDistMPTraining): def build_optimizer(self, model): grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0) scheduler = paddle.optimizer.lr.ExponentialDecay( diff --git a/test/collective/fleet/hybrid_parallel_mp_bf16.py b/test/collective/fleet/hybrid_parallel_mp_bf16.py index ae977f98917..2ddf1868dd0 100644 --- a/test/collective/fleet/hybrid_parallel_mp_bf16.py +++ b/test/collective/fleet/hybrid_parallel_mp_bf16.py @@ -14,14 +14,14 @@ import unittest -from hybrid_parallel_mp_model import TestDistMPTraning +from hybrid_parallel_mp_model import TestDistMPTraining import paddle from paddle.distributed import fleet from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16 -class TestMPFP16(TestDistMPTraning): +class TestMPFP16(TestDistMPTraining): def build_optimizer(self, model): grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) scheduler = paddle.optimizer.lr.ExponentialDecay( diff --git a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py index b376afa1a99..9872bbf20e9 100644 --- a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py +++ b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py @@ -19,7 +19,7 @@ import numpy as np from hybrid_parallel_mp_model import ( SimpleDPNet, SimpleMPNet, - TestDistMPTraning, + TestDistMPTraining, parallel_matmul, set_random_seed, ) @@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet): return x -class TestMPBroadcastObj(TestDistMPTraning): +class TestMPBroadcastObj(TestDistMPTraining): def build_model_optimizer(self): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() diff --git a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py index 1f674675ecb..dab3998f981 100644 --- a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py +++ b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py @@ -14,7 +14,7 @@ import unittest -from hybrid_parallel_mp_model import TestDistMPTraning +from hybrid_parallel_mp_model import TestDistMPTraining import paddle @@ -22,7 +22,7 @@ import paddle # log.setLevel(logging.WARNING) -class TestMPClipGrad(TestDistMPTraning): +class TestMPClipGrad(TestDistMPTraining): def build_optimizer(self, model): grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0) scheduler = paddle.optimizer.lr.ExponentialDecay( diff --git a/test/collective/fleet/hybrid_parallel_mp_fp16.py b/test/collective/fleet/hybrid_parallel_mp_fp16.py index 5d10ac0a7ae..ca5fdc3a406 100644 --- a/test/collective/fleet/hybrid_parallel_mp_fp16.py +++ b/test/collective/fleet/hybrid_parallel_mp_fp16.py @@ -14,13 +14,13 @@ import unittest -from hybrid_parallel_mp_model import TestDistMPTraning +from hybrid_parallel_mp_model import TestDistMPTraining import paddle from paddle.distributed import fleet -class TestMPFP16(TestDistMPTraning): +class TestMPFP16(TestDistMPTraining): def build_optimizer(self, model): grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) scheduler = paddle.optimizer.lr.ExponentialDecay( diff --git a/test/collective/fleet/hybrid_parallel_mp_layers.py b/test/collective/fleet/hybrid_parallel_mp_layers.py index 751bc9255c1..4a7b223c687 100644 --- a/test/collective/fleet/hybrid_parallel_mp_layers.py +++ b/test/collective/fleet/hybrid_parallel_mp_layers.py @@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer): return output -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 diff --git a/test/collective/fleet/hybrid_parallel_mp_model.py b/test/collective/fleet/hybrid_parallel_mp_model.py index 44ceb368b9b..08ae8f51e47 100644 --- a/test/collective/fleet/hybrid_parallel_mp_model.py +++ b/test/collective/fleet/hybrid_parallel_mp_model.py @@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer): return x -class TestDistMPSyncTraning(unittest.TestCase): +class TestDistMPSyncTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 @@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase): ) -class TestDistMPSyncModelTraning(TestDistMPSyncTraning): +class TestDistMPSyncModelTraining(TestDistMPSyncTraining): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 @@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning): fleet.init(is_collective=True, strategy=strategy) -class TestDistMPTraning(unittest.TestCase): +class TestDistMPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 diff --git a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py index fa78482601f..a4f11294f38 100644 --- a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py +++ b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py @@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer): return x -class TestDistSPSyncTraning(unittest.TestCase): +class TestDistSPSyncTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 @@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase): ) -class TestDistSPSyncModelTraning(TestDistSPSyncTraning): +class TestDistSPSyncModelTraining(TestDistSPSyncTraining): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 @@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning): fleet.init(is_collective=True, strategy=strategy) -class TestDistSPTraning(unittest.TestCase): +class TestDistSPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 diff --git a/test/collective/fleet/hybrid_parallel_mp_random.py b/test/collective/fleet/hybrid_parallel_mp_random.py index 717e983e262..00877818b1e 100644 --- a/test/collective/fleet/hybrid_parallel_mp_random.py +++ b/test/collective/fleet/hybrid_parallel_mp_random.py @@ -20,7 +20,7 @@ import paddle from paddle.distributed import fleet -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 diff --git a/test/collective/fleet/hybrid_parallel_pp_amp.py b/test/collective/fleet/hybrid_parallel_pp_amp.py index 7c9f973ee13..f3fe88a9161 100644 --- a/test/collective/fleet/hybrid_parallel_pp_amp.py +++ b/test/collective/fleet/hybrid_parallel_pp_amp.py @@ -34,7 +34,7 @@ batch_size = 4 micro_batch_size = 2 -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_bf16.py b/test/collective/fleet/hybrid_parallel_pp_bf16.py index 70b3aec1515..f260cd88f2f 100644 --- a/test/collective/fleet/hybrid_parallel_pp_bf16.py +++ b/test/collective/fleet/hybrid_parallel_pp_bf16.py @@ -35,7 +35,7 @@ batch_size = 4 micro_batch_size = 2 -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py index 695e3ee90ef..76b0fa90c05 100644 --- a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py +++ b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py @@ -17,12 +17,12 @@ import unittest sys.path.append("../../legacy_test") -from hybrid_parallel_pp_alexnet import TestDistPPTraning +from hybrid_parallel_pp_alexnet import TestDistPPTraining import paddle -class TestPPClipGrad(TestDistPPTraning): +class TestPPClipGrad(TestDistPPTraining): def build_optimizer(self, model): grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5) scheduler = paddle.optimizer.lr.PiecewiseDecay( @@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning): return scheduler, optimizer -class TestPPClipGradParamGroup(TestDistPPTraning): +class TestPPClipGradParamGroup(TestDistPPTraining): def build_optimizer(self, model): grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5) scheduler = paddle.optimizer.lr.PiecewiseDecay( diff --git a/test/collective/fleet/hybrid_parallel_pp_embedding.py b/test/collective/fleet/hybrid_parallel_pp_embedding.py index 2dd335eb0ef..d485e77a799 100644 --- a/test/collective/fleet/hybrid_parallel_pp_embedding.py +++ b/test/collective/fleet/hybrid_parallel_pp_embedding.py @@ -120,7 +120,7 @@ class SimpleNetPipe(Layer): return feat -class TestDistEmbeddingTraning(unittest.TestCase): +class TestDistEmbeddingTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_fp16.py b/test/collective/fleet/hybrid_parallel_pp_fp16.py index 84430f4be37..c6c107a852a 100644 --- a/test/collective/fleet/hybrid_parallel_pp_fp16.py +++ b/test/collective/fleet/hybrid_parallel_pp_fp16.py @@ -38,7 +38,7 @@ batch_size = 4 micro_batch_size = 2 -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py index 793f2effd09..fd03b562f25 100644 --- a/test/collective/fleet/hybrid_parallel_pp_recompute.py +++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py @@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer): ) -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load.py b/test/collective/fleet/hybrid_parallel_pp_save_load.py index a2ab6a0654d..cfa7c6961ae 100644 --- a/test/collective/fleet/hybrid_parallel_pp_save_load.py +++ b/test/collective/fleet/hybrid_parallel_pp_save_load.py @@ -30,7 +30,7 @@ micro_batch_size = 2 vocab_size = 128 -class TestDistPPSaveLoadTraning(unittest.TestCase): +class TestDistPPSaveLoadTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py index 5d6152f4e9f..8e922792cdb 100644 --- a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py +++ b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py @@ -33,7 +33,7 @@ micro_batch_size = 2 vocab_size = 128 -class TestDistPPSaveLoadTraning(unittest.TestCase): +class TestDistPPSaveLoadTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer.py b/test/collective/fleet/hybrid_parallel_pp_transformer.py index 216f37796da..18986e3df34 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer.py @@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer): ) -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py index a8cf970f73d..1bd865e107e 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py @@ -29,7 +29,7 @@ vocab_size = 128 transformer_layer_num = 8 -class TestDistPPSaveTraning(unittest.TestCase): +class TestDistPPSaveTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py index 372cbe7f48d..fea23b62661 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py @@ -33,7 +33,7 @@ vocab_size = 128 transformer_layer_num = 8 -class TestDistPPSaveTraning(unittest.TestCase): +class TestDistPPSaveTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py index 60b1e1052bd..bc43f514ea9 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py @@ -17,7 +17,7 @@ import unittest import numpy as np from hybrid_parallel_pp_transformer import ( ModelPipe, - TestDistPPTraning, + TestDistPPTraining, batch_size, length, micro_batch_size, @@ -30,7 +30,7 @@ import paddle.distributed as dist from paddle.distributed import fleet -class TestDistPPTraningUnbalancedData(TestDistPPTraning): +class TestDistPPTrainingUnbalancedData(TestDistPPTraining): def test_pp_model(self): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py index 3d7991727ee..35a17f17ace 100644 --- a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py +++ b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py @@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer): ) -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/hybrid_parallel_qat.py b/test/collective/fleet/hybrid_parallel_qat.py index 11662d849eb..00bc0f746e7 100644 --- a/test/collective/fleet/hybrid_parallel_qat.py +++ b/test/collective/fleet/hybrid_parallel_qat.py @@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer): return x -class TestDistMPTraning(unittest.TestCase): +class TestDistMPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 diff --git a/test/collective/fleet/hybrid_parallel_sharding_model.py b/test/collective/fleet/hybrid_parallel_sharding_model.py index bb1ace6a6a4..41343d2dbda 100644 --- a/test/collective/fleet/hybrid_parallel_sharding_model.py +++ b/test/collective/fleet/hybrid_parallel_sharding_model.py @@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer): return x -class TestDistMPTraning(unittest.TestCase): +class TestDistMPTraining(unittest.TestCase): def setUp(self): random.seed(2021) np.random.seed(2021) diff --git a/test/collective/fleet/hybrid_parallel_shared_weight.py b/test/collective/fleet/hybrid_parallel_shared_weight.py index c110d5d00a4..f54b9947687 100644 --- a/test/collective/fleet/hybrid_parallel_shared_weight.py +++ b/test/collective/fleet/hybrid_parallel_shared_weight.py @@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer): super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs) -class TestDistEmbeddingTraning(unittest.TestCase): +class TestDistEmbeddingTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 diff --git a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py index a3f4d5f0b16..85f24f61d79 100644 --- a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py +++ b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py @@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer): return self.share_net(tmp) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_gpus(self): self.trainer_id = dist.get_rank() dist.init_parallel_env() diff --git a/test/collective/fleet/test_fleet_static_mp_layers.py b/test/collective/fleet/test_fleet_static_mp_layers.py index 6f20943a128..3cd07943d11 100644 --- a/test/collective/fleet/test_fleet_static_mp_layers.py +++ b/test/collective/fleet/test_fleet_static_mp_layers.py @@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer): return output -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def setUp(self): os.environ["PADDLE_TRAINER_ID"] = "2" os.environ[ diff --git a/test/legacy_test/auto_parallel_gpt_model.py b/test/legacy_test/auto_parallel_gpt_model.py index 1be27f9bc80..4d5e1955e23 100644 --- a/test/legacy_test/auto_parallel_gpt_model.py +++ b/test/legacy_test/auto_parallel_gpt_model.py @@ -35,7 +35,7 @@ def init_global(): class MultiHeadAttention(nn.Layer): """ - Attention mapps queries and a set of key-value pairs to outputs, and + Attention maps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. """ @@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer): def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ - Prapares linear projected queries, keys and values for usage of subsequnt + Prepares linear projected queries, keys and values for usage of subsequent multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. """ @@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer): def gen_cache(self, key, value=None, type=Cache): """ - Generates cache for `forward` usage in inference accroding to arguments. + Generates cache for `forward` usage in inference according to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ @@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer): ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones - input_embedings = self.word_embeddings(input_ids) + input_embeddings = self.word_embeddings(input_ids) if _global_parallel_strategy == "mp": auto.shard_tensor( self.word_embeddings.weight, _global_process_mesh, ["x", None] @@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer): ) position_embeddings = self.position_embeddings(position_ids) - embeddings = input_embedings + position_embeddings + embeddings = input_embeddings + position_embeddings embeddings = self.dropout(embeddings) return embeddings diff --git a/test/legacy_test/benchmark_sum_op.py b/test/legacy_test/benchmark_sum_op.py index 6854fd7f208..7bc15957e1c 100644 --- a/test/legacy_test/benchmark_sum_op.py +++ b/test/legacy_test/benchmark_sum_op.py @@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite): def test_timeit_output(self): """ - perf the op, time cost will be averged in iters. + perf the op, time cost will be averaged in iters. output example >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818 >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596 @@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite): def test_timeit_grad(self): """ - perf the op gradient, time cost will be averged in iters. + perf the op gradient, time cost will be averaged in iters. output example >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536 >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653 diff --git a/test/legacy_test/dist_fleet_ctr.py b/test/legacy_test/dist_fleet_ctr.py index a5634a0cfba..419344edfae 100644 --- a/test/legacy_test/dist_fleet_ctr.py +++ b/test/legacy_test/dist_fleet_ctr.py @@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): dnn_out = fc # build lr model - lr_embbding = paddle.static.nn.embedding( + lr_embedding = paddle.static.nn.embedding( is_distributed=False, input=lr_data, size=[lr_input_dim, 1], @@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): padding_idx=0, ) lr_pool = paddle.static.nn.sequence_lod.sequence_pool( - input=lr_embbding.squeeze(-2), pool_type="sum" + input=lr_embedding.squeeze(-2), pool_type="sum" ) merge_layer = paddle.concat([dnn_out, lr_pool], axis=1) diff --git a/test/legacy_test/dist_hapi_mnist_dynamic.py b/test/legacy_test/dist_hapi_mnist_dynamic.py index 7fa896cf3dd..66b5f66119b 100644 --- a/test/legacy_test/dist_hapi_mnist_dynamic.py +++ b/test/legacy_test/dist_hapi_mnist_dynamic.py @@ -52,7 +52,7 @@ def compute_accuracy(pred, gt): @unittest.skipIf( not fluid.is_compiled_with_cuda(), 'CPU testing is not supported' ) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_dynamic_multiple_gpus(self): device = set_device('gpu') diff --git a/test/legacy_test/dist_hapi_mnist_static.py b/test/legacy_test/dist_hapi_mnist_static.py index 9d9b488f907..c465ef7fe85 100644 --- a/test/legacy_test/dist_hapi_mnist_static.py +++ b/test/legacy_test/dist_hapi_mnist_static.py @@ -52,7 +52,7 @@ def compute_accuracy(pred, gt): @unittest.skipIf( not fluid.is_compiled_with_cuda(), 'CPU testing is not supported' ) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_static_multiple_gpus(self): paddle.enable_static() device = set_device('gpu') diff --git a/test/legacy_test/dist_hapi_pure_fp16_static.py b/test/legacy_test/dist_hapi_pure_fp16_static.py index f63866991fe..3506b22d31f 100644 --- a/test/legacy_test/dist_hapi_pure_fp16_static.py +++ b/test/legacy_test/dist_hapi_pure_fp16_static.py @@ -26,10 +26,10 @@ from paddle.vision.models import LeNet @unittest.skipIf( not fluid.is_compiled_with_cuda(), 'CPU testing is not supported' ) -class TestDistTraningWithPureFP16(unittest.TestCase): +class TestDistTrainingWithPureFP16(unittest.TestCase): def test_amp_training_purefp16(self): if not fluid.is_compiled_with_cuda(): - self.skipTest('module not tested when ONLY_CPU compling') + self.skipTest('module not tested when ONLY_CPU compiling') data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32) label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py index 23e69c6b8ae..c24691eda15 100644 --- a/test/legacy_test/gradient_checker.py +++ b/test/legacy_test/gradient_checker.py @@ -269,7 +269,7 @@ def grad_check( if program is None: program = fluid.default_main_program() - # init variable in strtup program + # init variable in startup program scope = fluid.executor.global_scope() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -595,7 +595,7 @@ def get_static_double_grad( if program is None: program = fluid.default_main_program() - # init variable in strtup program + # init variable in startup program scope = fluid.executor.global_scope() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -657,7 +657,7 @@ def get_eager_double_grad( the second order derivative and the inputs of second order derivative's calculation will be returned for higher order derivative's calculation. If 'return_mid_result' set False. - A list of numpy array that stores second derivative result calulated by dygraph. + A list of numpy array that stores second derivative result calculated by dygraph. """ if isinstance(place, fluid.CPUPlace): paddle.set_device("cpu") @@ -684,7 +684,7 @@ def get_eager_double_grad( ) d_inputs = [d_input for d_input in d_inputs if d_input is not None] - # calcluate second derivative + # calculate second derivative inputs = inputs + dys ddys = [] if return_mid_result: @@ -808,7 +808,7 @@ def get_static_triple_grad( program (Program|None): a Program with forward pass. If None, use fluid.default_main_program(). Returns: - A list of numpy array that stores third derivative result calulated by static graph. + A list of numpy array that stores third derivative result calculated by static graph. """ if program is None: program = fluid.default_main_program() @@ -858,13 +858,13 @@ def get_eager_triple_grad( place (fluid.CPUPlace or fluid.CUDAPlace): the device. return_mid_result (list[Tensor], list[Tensor]): If set True, the Returns: - A list of numpy array that stores second derivative result calulated by dygraph + A list of numpy array that stores second derivative result calculated by dygraph """ dd_y, dd_x = get_eager_double_grad( func, x_init, dy_init, place, return_mid_result=True ) - # calcluate third derivative + # calculate third derivative dddys = [] for dd_yi in dd_y: dd_yi.stop_gradient = False diff --git a/test/legacy_test/hybrid_parallel_pp_alexnet.py b/test/legacy_test/hybrid_parallel_pp_alexnet.py index d8e433456d1..dc461176da1 100644 --- a/test/legacy_test/hybrid_parallel_pp_alexnet.py +++ b/test/legacy_test/hybrid_parallel_pp_alexnet.py @@ -41,7 +41,7 @@ batch_size = 4 micro_batch_size = 2 -class TestDistPPTraning(unittest.TestCase): +class TestDistPPTraining(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 @@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase): ) -class TestDistPPDelayScaleLoss(TestDistPPTraning): +class TestDistPPDelayScaleLoss(TestDistPPTraining): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 1 @@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning): fleet.init(is_collective=True, strategy=strategy) -class TestDistPPMainGrad(TestDistPPTraning): +class TestDistPPMainGrad(TestDistPPTraining): def wrapper_mix_precision(self, model, optimizer): model = MixPrecisionLayer(model, dtype="float16") optimizer = MixPrecisionOptimizer(optimizer) diff --git a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py index febf4b5f87d..87167fd7318 100644 --- a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py +++ b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py @@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer): return self.linear(inputs) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_gpus(self): self.trainer_id = dist.get_rank() dist.init_parallel_env() diff --git a/test/legacy_test/parallel_dygraph_gradient_check.py b/test/legacy_test/parallel_dygraph_gradient_check.py index a6c47b65d8f..5566b74f9aa 100644 --- a/test/legacy_test/parallel_dygraph_gradient_check.py +++ b/test/legacy_test/parallel_dygraph_gradient_check.py @@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer): return self.share_net(tmp) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_gpus(self): dist.init_parallel_env() self.trainer_id = dist.get_rank() diff --git a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py index df66ff7616a..7cb8c326a3e 100644 --- a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py +++ b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py @@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer): return self.share_net(tmp) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_gpus(self): self.trainer_id = dist.get_rank() self.pg = dist.init_parallel_env() diff --git a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py index 6db6e9e62a5..805bf251acc 100644 --- a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py +++ b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py @@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer): return self.linear(inputs) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_xpus(self): self.trainer_id = dist.get_rank() dist.init_parallel_env() diff --git a/test/xpu/parallel_dygraph_gradient_check.py b/test/xpu/parallel_dygraph_gradient_check.py index 9de687f524a..4f9ac5e3b98 100644 --- a/test/xpu/parallel_dygraph_gradient_check.py +++ b/test/xpu/parallel_dygraph_gradient_check.py @@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer): return self.share_net(tmp) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_xpus(self): dist.init_parallel_env() self.trainer_id = dist.get_rank() diff --git a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py index f0e46b2db27..497b4b025a2 100644 --- a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py +++ b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py @@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer): return self.share_net(tmp) -class TestDistTraning(unittest.TestCase): +class TestDistTraining(unittest.TestCase): def test_multiple_xpus(self): self.trainer_id = dist.get_rank() self.pg = dist.init_parallel_env() -- GitLab