Unverified commit 4d094b0c authored by co63oc, committed by GitHub

Fix typos (#56008)

Parent c1913a5f
@@ -14,13 +14,13 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
...
@@ -14,14 +14,14 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
 from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
...
@@ -19,7 +19,7 @@ import numpy as np
 from hybrid_parallel_mp_model import (
     SimpleDPNet,
     SimpleMPNet,
-    TestDistMPTraning,
+    TestDistMPTraining,
     parallel_matmul,
     set_random_seed,
 )
@@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet):
         return x
-class TestMPBroadcastObj(TestDistMPTraning):
+class TestMPBroadcastObj(TestDistMPTraining):
     def build_model_optimizer(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
...
@@ -14,7 +14,7 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
@@ -22,7 +22,7 @@ import paddle
 # log.setLevel(logging.WARNING)
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
...
@@ -14,13 +14,13 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
...
@@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer):
         return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
...
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistMPSyncTraning(unittest.TestCase):
+class TestDistMPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase):
         )
-class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
+class TestDistMPSyncModelTraining(TestDistMPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
...
@@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistSPSyncTraning(unittest.TestCase):
+class TestDistSPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase):
         )
-class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
+class TestDistSPSyncModelTraining(TestDistSPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistSPTraning(unittest.TestCase):
+class TestDistSPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
...
@@ -20,7 +20,7 @@ import paddle
 from paddle.distributed import fleet
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
...
@@ -34,7 +34,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -35,7 +35,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -17,12 +17,12 @@ import unittest
 sys.path.append("../../legacy_test")
-from hybrid_parallel_pp_alexnet import TestDistPPTraning
+from hybrid_parallel_pp_alexnet import TestDistPPTraining
 import paddle
-class TestPPClipGrad(TestDistPPTraning):
+class TestPPClipGrad(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
@@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
-class TestPPClipGradParamGroup(TestDistPPTraning):
+class TestPPClipGradParamGroup(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
...
@@ -120,7 +120,7 @@ class SimpleNetPipe(Layer):
         return feat
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -38,7 +38,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -30,7 +30,7 @@ micro_batch_size = 2
 vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -33,7 +33,7 @@ micro_batch_size = 2
 vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -29,7 +29,7 @@ vocab_size = 128
 transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -33,7 +33,7 @@ vocab_size = 128
 transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 from hybrid_parallel_pp_transformer import (
     ModelPipe,
-    TestDistPPTraning,
+    TestDistPPTraining,
     batch_size,
     length,
     micro_batch_size,
@@ -30,7 +30,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
-class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+class TestDistPPTrainingUnbalancedData(TestDistPPTraining):
     def test_pp_model(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
...
@@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer):
         return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
...
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         random.seed(2021)
         np.random.seed(2021)
...
@@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer):
         super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs)
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
...
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
...
@@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer):
         return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "2"
         os.environ[
...
@@ -35,7 +35,7 @@ def init_global():
 class MultiHeadAttention(nn.Layer):
     """
-    Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces.
     """
@@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer):
     def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         """
-        Prapares linear projected queries, keys and values for usage of subsequnt
+        Prepares linear projected queries, keys and values for usage of subsequent
         multiple parallel attention. If `cache` is not None, using cached results
         to reduce redundant calculations.
         """
@@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer):
     def gen_cache(self, key, value=None, type=Cache):
         """
-        Generates cache for `forward` usage in inference accroding to arguments.
+        Generates cache for `forward` usage in inference according to arguments.
         The generated cache is an instance of `MultiHeadAttention.Cache` or an
         instance of `MultiHeadAttention.StaticCache`.
         """
@@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer):
         ones = paddle.ones_like(input_ids, dtype="int64")
         seq_length = paddle.cumsum(ones, axis=-1)
         position_ids = seq_length - ones
-        input_embedings = self.word_embeddings(input_ids)
+        input_embeddings = self.word_embeddings(input_ids)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight, _global_process_mesh, ["x", None]
@@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer):
             )
         position_embeddings = self.position_embeddings(position_ids)
-        embeddings = input_embedings + position_embeddings
+        embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout(embeddings)
         return embeddings
...
@@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite):
     def test_timeit_output(self):
         """
-        perf the op, time cost will be averged in iters.
+        perf the op, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
         >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
@@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite):
     def test_timeit_grad(self):
         """
-        perf the op gradient, time cost will be averged in iters.
+        perf the op gradient, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
         >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
...
@@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         dnn_out = fc
         # build lr model
-        lr_embbding = paddle.static.nn.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
@@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             padding_idx=0,
         )
         lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=lr_embbding.squeeze(-2), pool_type="sum"
+            input=lr_embedding.squeeze(-2), pool_type="sum"
         )
         merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
...
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')
...
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_static_multiple_gpus(self):
         paddle.enable_static()
         device = set_device('gpu')
...
@@ -26,10 +26,10 @@ from paddle.vision.models import LeNet
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraningWithPureFP16(unittest.TestCase):
+class TestDistTrainingWithPureFP16(unittest.TestCase):
     def test_amp_training_purefp16(self):
         if not fluid.is_compiled_with_cuda():
-            self.skipTest('module not tested when ONLY_CPU compling')
+            self.skipTest('module not tested when ONLY_CPU compiling')
         data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32)
         label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64)
...
@@ -269,7 +269,7 @@ def grad_check(
     if program is None:
         program = fluid.default_main_program()
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -595,7 +595,7 @@ def get_static_double_grad(
     if program is None:
         program = fluid.default_main_program()
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -657,7 +657,7 @@ def get_eager_double_grad(
         the second order derivative and the inputs of second order derivative's calculation
         will be returned for higher order derivative's calculation.
         If 'return_mid_result' set False.
-        A list of numpy array that stores second derivative result calulated by dygraph.
+        A list of numpy array that stores second derivative result calculated by dygraph.
     """
     if isinstance(place, fluid.CPUPlace):
         paddle.set_device("cpu")
@@ -684,7 +684,7 @@ def get_eager_double_grad(
     )
     d_inputs = [d_input for d_input in d_inputs if d_input is not None]
-    # calcluate second derivative
+    # calculate second derivative
     inputs = inputs + dys
     ddys = []
     if return_mid_result:
@@ -808,7 +808,7 @@ def get_static_triple_grad(
         program (Program|None): a Program with forward pass.
             If None, use fluid.default_main_program().
     Returns:
-        A list of numpy array that stores third derivative result calulated by static graph.
+        A list of numpy array that stores third derivative result calculated by static graph.
     """
     if program is None:
         program = fluid.default_main_program()
@@ -858,13 +858,13 @@ def get_eager_triple_grad(
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
         return_mid_result (list[Tensor], list[Tensor]): If set True, the
     Returns:
-        A list of numpy array that stores second derivative result calulated by dygraph
+        A list of numpy array that stores second derivative result calculated by dygraph
     """
     dd_y, dd_x = get_eager_double_grad(
         func, x_init, dy_init, place, return_mid_result=True
     )
-    # calcluate third derivative
+    # calculate third derivative
     dddys = []
     for dd_yi in dd_y:
         dd_yi.stop_gradient = False
...
@@ -41,7 +41,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase):
         )
-class TestDistPPDelayScaleLoss(TestDistPPTraning):
+class TestDistPPDelayScaleLoss(TestDistPPTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistPPMainGrad(TestDistPPTraning):
+class TestDistPPMainGrad(TestDistPPTraining):
     def wrapper_mix_precision(self, model, optimizer):
         model = MixPrecisionLayer(model, dtype="float16")
         optimizer = MixPrecisionOptimizer(optimizer)
...
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
...
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
...
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
...
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
...
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
...
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
...