Unverified commit 4d094b0c, authored by co63oc, committed by GitHub

Fix typos (#56008)

Parent c1913a5f
......@@ -14,13 +14,13 @@
import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
import paddle
from paddle.distributed import fleet
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
def build_optimizer(self, model):
grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
scheduler = paddle.optimizer.lr.ExponentialDecay(
......
......@@ -14,14 +14,14 @@
import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
import paddle
from paddle.distributed import fleet
from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
def build_optimizer(self, model):
grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
scheduler = paddle.optimizer.lr.ExponentialDecay(
......
......@@ -19,7 +19,7 @@ import numpy as np
from hybrid_parallel_mp_model import (
SimpleDPNet,
SimpleMPNet,
-TestDistMPTraning,
+TestDistMPTraining,
parallel_matmul,
set_random_seed,
)
......@@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet):
return x
-class TestMPBroadcastObj(TestDistMPTraning):
+class TestMPBroadcastObj(TestDistMPTraining):
def build_model_optimizer(self):
hcg = fleet.get_hybrid_communicate_group()
word_size = hcg.get_model_parallel_world_size()
......
......@@ -14,7 +14,7 @@
import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
import paddle
......@@ -22,7 +22,7 @@ import paddle
# log.setLevel(logging.WARNING)
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
def build_optimizer(self, model):
grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
scheduler = paddle.optimizer.lr.ExponentialDecay(
......
......@@ -14,13 +14,13 @@
import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
import paddle
from paddle.distributed import fleet
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
def build_optimizer(self, model):
grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
scheduler = paddle.optimizer.lr.ExponentialDecay(
......
......@@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer):
return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......
......@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
return x
-class TestDistMPSyncTraning(unittest.TestCase):
+class TestDistMPSyncTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......@@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase):
)
-class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
+class TestDistMPSyncModelTraining(TestDistMPSyncTraining):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......@@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
fleet.init(is_collective=True, strategy=strategy)
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......
......@@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer):
return x
-class TestDistSPSyncTraning(unittest.TestCase):
+class TestDistSPSyncTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......@@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase):
)
-class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
+class TestDistSPSyncModelTraining(TestDistSPSyncTraining):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......@@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
fleet.init(is_collective=True, strategy=strategy)
-class TestDistSPTraning(unittest.TestCase):
+class TestDistSPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......
......@@ -20,7 +20,7 @@ import paddle
from paddle.distributed import fleet
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......
......@@ -34,7 +34,7 @@ batch_size = 4
micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -35,7 +35,7 @@ batch_size = 4
micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -17,12 +17,12 @@ import unittest
sys.path.append("../../legacy_test")
-from hybrid_parallel_pp_alexnet import TestDistPPTraning
+from hybrid_parallel_pp_alexnet import TestDistPPTraining
import paddle
-class TestPPClipGrad(TestDistPPTraning):
+class TestPPClipGrad(TestDistPPTraining):
def build_optimizer(self, model):
grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
scheduler = paddle.optimizer.lr.PiecewiseDecay(
......@@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning):
return scheduler, optimizer
-class TestPPClipGradParamGroup(TestDistPPTraning):
+class TestPPClipGradParamGroup(TestDistPPTraining):
def build_optimizer(self, model):
grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
scheduler = paddle.optimizer.lr.PiecewiseDecay(
......
......@@ -120,7 +120,7 @@ class SimpleNetPipe(Layer):
return feat
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -38,7 +38,7 @@ batch_size = 4
micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
)
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -30,7 +30,7 @@ micro_batch_size = 2
vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -33,7 +33,7 @@ micro_batch_size = 2
vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
)
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -29,7 +29,7 @@ vocab_size = 128
transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -33,7 +33,7 @@ vocab_size = 128
transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -17,7 +17,7 @@ import unittest
import numpy as np
from hybrid_parallel_pp_transformer import (
ModelPipe,
-TestDistPPTraning,
+TestDistPPTraining,
batch_size,
length,
micro_batch_size,
......@@ -30,7 +30,7 @@ import paddle.distributed as dist
from paddle.distributed import fleet
-class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+class TestDistPPTrainingUnbalancedData(TestDistPPTraining):
def test_pp_model(self):
hcg = fleet.get_hybrid_communicate_group()
word_size = hcg.get_model_parallel_world_size()
......
......@@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer):
)
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer):
return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 2
......
......@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
def setUp(self):
random.seed(2021)
np.random.seed(2021)
......
......@@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer):
super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs)
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......
......@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_gpus(self):
self.trainer_id = dist.get_rank()
dist.init_parallel_env()
......
......@@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer):
return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "2"
os.environ[
......
......@@ -35,7 +35,7 @@ def init_global():
class MultiHeadAttention(nn.Layer):
"""
-Attention mapps queries and a set of key-value pairs to outputs, and
+Attention maps queries and a set of key-value pairs to outputs, and
Multi-Head Attention performs multiple parallel attention to jointly attending
to information from different representation subspaces.
"""
......@@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer):
def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
"""
-Prapares linear projected queries, keys and values for usage of subsequnt
+Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.
"""
......@@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer):
def gen_cache(self, key, value=None, type=Cache):
"""
-Generates cache for `forward` usage in inference accroding to arguments.
+Generates cache for `forward` usage in inference according to arguments.
The generated cache is an instance of `MultiHeadAttention.Cache` or an
instance of `MultiHeadAttention.StaticCache`.
"""
......@@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer):
ones = paddle.ones_like(input_ids, dtype="int64")
seq_length = paddle.cumsum(ones, axis=-1)
position_ids = seq_length - ones
-input_embedings = self.word_embeddings(input_ids)
+input_embeddings = self.word_embeddings(input_ids)
if _global_parallel_strategy == "mp":
auto.shard_tensor(
self.word_embeddings.weight, _global_process_mesh, ["x", None]
......@@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer):
)
position_embeddings = self.position_embeddings(position_ids)
-embeddings = input_embedings + position_embeddings
+embeddings = input_embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
......
......@@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite):
def test_timeit_output(self):
"""
-perf the op, time cost will be averged in iters.
+perf the op, time cost will be averaged in iters.
output example
>>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
>>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
......@@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite):
def test_timeit_grad(self):
"""
-perf the op gradient, time cost will be averged in iters.
+perf the op gradient, time cost will be averaged in iters.
output example
>>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
>>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
......
......@@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
dnn_out = fc
# build lr model
-lr_embbding = paddle.static.nn.embedding(
+lr_embedding = paddle.static.nn.embedding(
is_distributed=False,
input=lr_data,
size=[lr_input_dim, 1],
......@@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
padding_idx=0,
)
lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
input=lr_embbding.squeeze(-2), pool_type="sum"
input=lr_embedding.squeeze(-2), pool_type="sum"
)
merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
......
......@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
@unittest.skipIf(
not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_dynamic_multiple_gpus(self):
device = set_device('gpu')
......
......@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
@unittest.skipIf(
not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_static_multiple_gpus(self):
paddle.enable_static()
device = set_device('gpu')
......
......@@ -26,10 +26,10 @@ from paddle.vision.models import LeNet
@unittest.skipIf(
not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
)
-class TestDistTraningWithPureFP16(unittest.TestCase):
+class TestDistTrainingWithPureFP16(unittest.TestCase):
def test_amp_training_purefp16(self):
if not fluid.is_compiled_with_cuda():
-self.skipTest('module not tested when ONLY_CPU compling')
+self.skipTest('module not tested when ONLY_CPU compiling')
data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32)
label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64)
......
......@@ -269,7 +269,7 @@ def grad_check(
if program is None:
program = fluid.default_main_program()
-# init variable in strtup program
+# init variable in startup program
scope = fluid.executor.global_scope()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -595,7 +595,7 @@ def get_static_double_grad(
if program is None:
program = fluid.default_main_program()
-# init variable in strtup program
+# init variable in startup program
scope = fluid.executor.global_scope()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -657,7 +657,7 @@ def get_eager_double_grad(
the second order derivative and the inputs of second order derivative's calculation
will be returned for higher order derivative's calculation.
If 'return_mid_result' set False.
-A list of numpy array that stores second derivative result calulated by dygraph.
+A list of numpy array that stores second derivative result calculated by dygraph.
"""
if isinstance(place, fluid.CPUPlace):
paddle.set_device("cpu")
......@@ -684,7 +684,7 @@ def get_eager_double_grad(
)
d_inputs = [d_input for d_input in d_inputs if d_input is not None]
-# calcluate second derivative
+# calculate second derivative
inputs = inputs + dys
ddys = []
if return_mid_result:
......@@ -808,7 +808,7 @@ def get_static_triple_grad(
program (Program|None): a Program with forward pass.
If None, use fluid.default_main_program().
Returns:
-A list of numpy array that stores third derivative result calulated by static graph.
+A list of numpy array that stores third derivative result calculated by static graph.
"""
if program is None:
program = fluid.default_main_program()
......@@ -858,13 +858,13 @@ def get_eager_triple_grad(
place (fluid.CPUPlace or fluid.CUDAPlace): the device.
return_mid_result (list[Tensor], list[Tensor]): If set True, the
Returns:
-A list of numpy array that stores second derivative result calulated by dygraph
+A list of numpy array that stores second derivative result calculated by dygraph
"""
dd_y, dd_x = get_eager_double_grad(
func, x_init, dy_init, place, return_mid_result=True
)
-# calcluate third derivative
+# calculate third derivative
dddys = []
for dd_yi in dd_y:
dd_yi.stop_gradient = False
......
......@@ -41,7 +41,7 @@ batch_size = 4
micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......@@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase):
)
-class TestDistPPDelayScaleLoss(TestDistPPTraning):
+class TestDistPPDelayScaleLoss(TestDistPPTraining):
def setUp(self):
strategy = fleet.DistributedStrategy()
self.model_parallel_size = 1
......@@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning):
fleet.init(is_collective=True, strategy=strategy)
-class TestDistPPMainGrad(TestDistPPTraning):
+class TestDistPPMainGrad(TestDistPPTraining):
def wrapper_mix_precision(self, model, optimizer):
model = MixPrecisionLayer(model, dtype="float16")
optimizer = MixPrecisionOptimizer(optimizer)
......
......@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_gpus(self):
self.trainer_id = dist.get_rank()
dist.init_parallel_env()
......
......@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_gpus(self):
dist.init_parallel_env()
self.trainer_id = dist.get_rank()
......
......@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_gpus(self):
self.trainer_id = dist.get_rank()
self.pg = dist.init_parallel_env()
......
......@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_xpus(self):
self.trainer_id = dist.get_rank()
dist.init_parallel_env()
......
......@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_xpus(self):
dist.init_parallel_env()
self.trainer_id = dist.get_rank()
......
......@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
def test_multiple_xpus(self):
self.trainer_id = dist.get_rank()
self.pg = dist.init_parallel_env()
......