From 1f829f6e48b91db59a0561fe420aa10eb4778b42 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 5 Apr 2022 14:57:26 +0800 Subject: [PATCH] [Dygraph] Support process group in dp with fleet api (#41119) * support process group in dp with fleet api * update * fix uts * update --- python/paddle/distributed/parallel.py | 1 + python/paddle/fluid/dygraph/parallel.py | 32 +-- .../fluid/tests/unittests/CMakeLists.txt | 17 +- .../tests/unittests/dygraph_fleet_api.py | 2 + ...llel_dygraph_dataparallel_in_eager_mode.py | 137 ---------- ...el_dygraph_gradient_check_in_eager_mode.py | 28 +-- .../unittests/parallel_dygraph_no_sync.py | 81 +++--- .../tests/unittests/spawn_runner_base.py | 15 +- .../fluid/tests/unittests/test_dist_base.py | 234 +++--------------- .../tests/unittests/test_imperative_group.py | 26 +- ...llel_dygraph_control_flow_in_eager_mode.py | 84 ------- .../test_parallel_dygraph_dataparallel.py | 5 - ...t_parallel_dygraph_dataparallel_cpuonly.py | 2 +- ..._parallel_dygraph_no_sync_in_eager_mode.py | 111 --------- .../test_parallel_dygraph_sparse_embedding.py | 42 ---- ...el_dygraph_sparse_embedding_over_height.py | 27 -- .../test_parallel_dygraph_sync_batch_norm.py | 16 -- .../test_parallel_dygraph_transformer.py | 23 -- .../test_parallel_dygraph_unused_variables.py | 66 ----- 19 files changed, 107 insertions(+), 842 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow_in_eager_mode.py delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_in_eager_mode.py diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 71ac15bd4b..b90f24d377 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -217,6 +217,7 @@ def init_parallel_env(): "required to create a process group.") master_addr = os.getenv("MASTER_ADDR", None) master_port = os.getenv("MASTER_PORT", None) + endpoints = None if not master_addr or not master_port: endpoints = os.getenv("PADDLE_MASTER", None) if endpoints is None: diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index cac67a02dd..ac15034ffb 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -398,16 +398,6 @@ def sync_params_buffers(model, 'axis': 0}) -@imperative_base.no_grad -@framework.dygraph_only -def sync_eager_params(model, comm_group=None, src_rank=0): - for _, param in model._obtain_parameters_buffers().items(): - if not isinstance(param, core.eager.Tensor): - raise TypeError("The data type of '%s' must be '%s'" % - (param.name, core.eager.Tensor)) - comm_group.broadcast(param, src_rank).synchronize() - - class DataParallel(layers.Layer): """ Run the dygraph module with data parallelism. 
@@ -575,7 +565,7 @@ class DataParallel(layers.Layer): comm_buffer_size=25, last_comm_buffer_size=1, find_unused_parameters=False, - process_group=None): + group=None): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") @@ -585,7 +575,7 @@ class DataParallel(layers.Layer): self._layers = layers self.find_unused_parameters = find_unused_parameters self.grad_need_sync = True - self.process_group = process_group + self.group = group self.var_dtype = core.eager.Tensor if in_dygraph_mode( ) else core.VarBase @@ -604,20 +594,18 @@ class DataParallel(layers.Layer): "ParallelContext must be initialized before. You should use init_parallel_env() before" \ "constructing the DataParallel." - if self.process_group is None and in_dygraph_mode(): - raise RuntimeError( - "Process group should be built for DataParallel in eager mode." - ) + if in_dygraph_mode(): + self.group = paddle.distributed.collective._get_default_group( + ) if self.group is None else self.group + + assert isinstance(self.group, paddle.distributed.collective.Group), \ + "ProcessGroup must be an instance of Group in DataParallel." # sync buffer and params # TODO(liuyuhui) Currently not support xpu. xpu is # still broadcasting parameters when calling layer if not paddle.is_compiled_with_xpu(): - if in_dygraph_mode(): - sync_eager_params( - self._layers, comm_group=self.process_group) - elif _in_legacy_dygraph(): - sync_params_buffers(self._layers) + sync_params_buffers(self._layers) self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024) # NOTE(shenliang03): We can set environment variables to control @@ -678,7 +666,7 @@ class DataParallel(layers.Layer): self._reducer = core.EagerReducer( trainable_parameters, list(reversed(self.group_indices)), is_sparse_gradient, - self.process_group, + self.group.process_group, [self.last_comm_buffer_size, self.comm_buffer_size], self.find_unused_parameters) elif _in_legacy_dygraph(): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b4d6f9b941..51bedda407 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -39,9 +39,7 @@ if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) endif() list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow_in_eager_mode) list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync_in_eager_mode) list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync_gradient_check) list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) @@ -279,9 +277,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow_in_eager_mode) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync_in_eager_mode) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync_gradient_check) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) @@ -1128,12 +1124,11 @@ set_tests_properties(test_cumprod_op PROPERTIES 
TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 300) - set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_control_flow_in_eager_mode PROPERTIES TIMEOUT 150) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) - set_tests_properties(test_parallel_dygraph_no_sync_in_eager_mode PROPERTIES TIMEOUT 150) + set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT 200) + set_tests_properties(test_parallel_dygraph_se_resnext PROPERTIES TIMEOUT 200) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 350) + set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 350) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) @@ -1155,8 +1150,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 200) + set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height PROPERTIES TIMEOUT 150) - set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 150) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py b/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py index 2a9d74e4af..de4457a58f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py @@ -57,4 +57,6 @@ class TestDygraphFleetAPI(unittest.TestCase): if __name__ == "__main__": + with _test_eager_guard(): + pass unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py deleted file mode 100644 index d48a7f09ce..0000000000 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import division -from __future__ import print_function - -import unittest -import os -import copy -import numpy as np -import random -import socket - -import paddle -import paddle.nn as nn -from paddle.fluid.dygraph.nn import Linear -import paddle.fluid.core as core -from paddle.fluid.framework import _test_eager_guard -import paddle.distributed as dist -from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.optimizer import SGD -from paddle.fluid.initializer import NumpyArrayInitializer -from test_parallel_dygraph_dataparallel import get_dist_port_from_flags - - -def init_process_group(strategy=None): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank - is_master = True if rank == 0 else False - envs = copy.copy(os.environ.copy()) - port = get_dist_port_from_flags() - store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, nranks) - if 'PADDLE_DISTRI_BACKEND' in envs.keys() and envs[ - 'PADDLE_DISTRI_BACKEND'] == 'gloo': - group = core.ProcessGroupGloo(store, rank, nranks) - else: - group = core.ProcessGroupNCCL(store, rank, nranks) - return group - - -class LinearModel(nn.Layer): - def __init__(self, attr_list): - super(LinearModel, self).__init__() - self._linear1 = paddle.nn.Linear( - 50, 30, weight_attr=attr_list[0], bias_attr=False) - self._linear2 = paddle.nn.Linear( - 30, 10, weight_attr=attr_list[1], bias_attr=False) - self._linear3 = paddle.nn.Linear( - 10, 10, weight_attr=attr_list[2], bias_attr=False) - - def forward(self, x): - output = self._linear1(x) - output = self._linear2(output) - output = self._linear3(output) - return output - - -class TestDistTraning(unittest.TestCase): - def test_multiple_gpus(self): - process_group = init_process_group() - self.generate_reducer("float32", process_group) - if paddle.get_device() != "cpu": - self.generate_reducer("float16", process_group) - - def generate_reducer(self, dtype, process_group): - local_rank = ParallelEnv().local_rank - np.random.seed(2022 + local_rank) - paddle.set_default_dtype(dtype) - - w_1 = paddle.ParamAttr(initializer=NumpyArrayInitializer( - np.random.rand(50, 30).astype(dtype))) - w_2 = paddle.ParamAttr(initializer=NumpyArrayInitializer( - np.random.rand(30, 10).astype(dtype))) - w_3 = paddle.ParamAttr(initializer=NumpyArrayInitializer( - np.random.rand(10, 10).astype(dtype))) - - attr_list = [w_1, w_2, w_3] - inp = np.random.rand(10, 50).astype(dtype) - - # original reducer - params_a = self.model_train(attr_list, inp) - - # refactored reducer in eager mode - with _test_eager_guard(): - params_b = self.model_train( - attr_list, inp, process_group=process_group) - - for i in range(len(params_a)): - np.testing.assert_allclose(params_a[i].numpy(), params_b[i].numpy()) - - def model_train(self, attr_list, inp, process_group=None): - model = LinearModel(attr_list) - model = paddle.DataParallel(model, process_group=process_group) - optimizer = SGD(learning_rate=0.0003, parameters=model.parameters()) - - x = paddle.to_tensor(inp) - x.stop_gradient = False - - for step in range(10): - y = model(x) - loss = y.mean() - - loss.backward() - optimizer.step() - optimizer.clear_grad() - - return model.parameters() - - -class TestCatchErrors1(unittest.TestCase): - def test_multiple_gpus(self): - linear = paddle.nn.Linear(2, 4) - with _test_eager_guard(): - self.assertRaises(RuntimeError, paddle.DataParallel, linear) - - -class TestCatchErrors2(unittest.TestCase): - def test_multiple_gpus(self): - with _test_eager_guard(): - linear = paddle.nn.Linear(2, 4) - 
self.assertRaises(RuntimeError, paddle.DataParallel, linear) - - -if __name__ == '__main__': - dist.init_parallel_env() - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py index bf337d4864..db41236dd5 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py @@ -36,19 +36,6 @@ in_dim = 10 out_dim = 20 -def init_process_group(strategy=None): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank - is_master = True if rank == 0 else False - current_env = copy.copy(os.environ.copy()) - port = 6175 - if 'PADDLE_DIST_UT_PORT' in current_env.keys(): - port = int(current_env['PADDLE_DIST_UT_PORT']) - store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, nranks) - group = core.ProcessGroupNCCL(store, rank, nranks) - return group - - class SimpleNet(fluid.Layer): def __init__(self, train_id): super(SimpleNet, self).__init__() @@ -83,12 +70,9 @@ class SimpleNet(fluid.Layer): class TestDistTraning(unittest.TestCase): def test_multiple_gpus(self): - dist.init_parallel_env() self.trainer_id = dist.get_rank() - - process_group = init_process_group() - self.pg = process_group with _test_eager_guard(): + self.pg = dist.init_parallel_env() model_a = SimpleNet(self.trainer_id) model_b = SimpleNet(self.trainer_id) @@ -97,13 +81,9 @@ class TestDistTraning(unittest.TestCase): model_b.set_state_dict(state_dict) model_a = paddle.DataParallel( - model_a, - find_unused_parameters=True, - process_group=process_group) + model_a, find_unused_parameters=True, group=self.pg) model_b = paddle.DataParallel( - model_b, - find_unused_parameters=True, - process_group=process_group) + model_b, find_unused_parameters=True, group=self.pg) ones_input = paddle.ones(shape=(batch, in_dim)) ones_input.stop_gradient = True @@ -150,7 +130,7 @@ class TestDistTraning(unittest.TestCase): print(*args) def broadcast_param(self, param, root): - self.pg.broadcast(param, root) + self.pg.process_group.broadcast(param, root) return param def check_gradient(self, params): diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py index f5af896f73..9a3b5ee2f0 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py @@ -69,18 +69,6 @@ class TestNoSync(TestParallelDyGraphRunnerBase): loss = out.sum() / len(batch) return loss - def run_trainer(self, args): - if args.eager_mode: - self.run_trainer_in_eager_mode(args) - else: - self.run_trainer_func(args) - - def run_trainer_with_spawn(self, args): - if args.eager_mode: - return self.run_trainer_with_spawn_in_eager_mode(args) - else: - return self.run_trainer_with_spawn_func(args) - def run_trainer_func(self, args): if fluid.core.is_compiled_with_cuda(): device_id = int(os.getenv("FLAGS_selected_gpus", "0")) @@ -103,41 +91,36 @@ class TestNoSync(TestParallelDyGraphRunnerBase): model = paddle.DataParallel( model, find_unused_parameters=args.find_unused_parameters) print_to_err(type(self).__name__, "model built in dygraph") - return self.model_train(args, model, opt, train_reader) - - def run_trainer_in_eager_mode(self, args): - if fluid.core.is_compiled_with_cuda(): - device_id = int(os.getenv("FLAGS_selected_gpus", "0")) - place = 
fluid.CUDAPlace(device_id) - else: - assert ("Only support CUDAPlace for now.") - - with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - np.random.seed(seed) - random.seed(seed) - - with _test_eager_guard(): - model, train_reader, opt = self.get_model() - if args.update_method == "nccl2": - dist.init_parallel_env() - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank - is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore( - "127.0.0.1", args.dist_port, is_master, nranks) - group = core.ProcessGroupNCCL(store, rank, nranks) - model = paddle.DataParallel( - model, - process_group=group, - find_unused_parameters=args.find_unused_parameters) - print_to_err(type(self).__name__, "model built in dygraph") - return self.model_train(args, model, opt, train_reader) + out_losses = self.model_train(args, model, opt, train_reader) + print_to_out(out_losses) + return out_losses + + def run_trainer_with_spawn_func(self, args): + # 1. enable dygraph + paddle.disable_static() + + # 2. init seed + seed = 90 + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + # get trainer id + args.trainer_id = paddle.distributed.get_rank() + + # 3. init parallel env + if args.update_method in ["nccl2", "gloo"]: + paddle.distributed.init_parallel_env() + + # 4. train model + model, train_reader, opt = self.get_model() + if args.update_method in ["nccl2", "gloo"]: + model = paddle.DataParallel( + model, find_unused_parameters=args.find_unused_parameters) + + out_losses = self.model_train(args, model, opt, train_reader) + print_to_out(out_losses) + return out_losses def model_train(self, args, model, opt, train_reader): out_losses = [] @@ -157,12 +140,8 @@ class TestNoSync(TestParallelDyGraphRunnerBase): loss = self.run_one_loop(model, opt, data) loss.backward() opt.minimize(loss) - print_to_err( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss.numpy())) out_losses.append(loss.numpy()) model.clear_gradients() - print_to_out(out_losses) return out_losses diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index e7057f95d2..11f8cd559d 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -21,7 +21,7 @@ import paddle # used by model.run_trainer in test_dist_base from test_dist_base import RUN_STEP -from test_parallel_dygraph_dataparallel import get_dist_port_from_flags +from paddle.fluid.framework import _test_eager_guard # NOTE: compatible TestParallelDyGraphRunnerBase args @@ -29,8 +29,6 @@ class SpawnAssistTestArgs(object): update_method = "local" trainer_id = 0 find_unused_parameters = False - eager_mode = False - dist_port = get_dist_port_from_flags() class TestDistSpawnRunner(unittest.TestCase): @@ -55,14 +53,17 @@ class TestDistSpawnRunner(unittest.TestCase): result_list.append(res_queue.get()) return result_list - def _args_config(self, args): - return - def check_dist_result_with_spawn(self, test_class, delta=1e-3): + with _test_eager_guard(): + self.check_dist_result_with_spawn_func( + test_class=test_class, delta=delta) + self.check_dist_result_with_spawn_func( + test_class=test_class, delta=delta) + + def 
check_dist_result_with_spawn_func(self, test_class, delta=1e-3): # 0. prepare model and args model = test_class() args = SpawnAssistTestArgs() - self._args_config(args) # 1. calc signal card loss losses = self._run(model, args) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index a2faf1e395..11972059c8 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -36,7 +36,6 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.parallel import DataParallel, ParallelEnv from paddle.fluid.framework import _test_eager_guard - from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -543,12 +542,6 @@ class TestParallelDyGraphRunnerBase(object): return batch def run_trainer(self, args): - if args.eager_mode: - self.run_trainer_in_eager_mode(args) - else: - self.run_trainer_func(args) - - def run_trainer_func(self, args): seed = 90 if args.update_method == 'gloo': place = fluid.CPUPlace() @@ -580,6 +573,7 @@ class TestParallelDyGraphRunnerBase(object): strategy.local_rank = args.trainer_id strategy.trainer_endpoints = args.endpoints.split(",") strategy.current_endpoint = args.current_endpoint + paddle.distributed.init_parallel_env() print_to_err( type(self).__name__, "begin to prepare context in dygraph with nccl2") @@ -621,82 +615,7 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() print_to_out(out_losses) - def run_trainer_in_eager_mode(self, args): - seed = 90 - if args.update_method == 'gloo': - place = fluid.CPUPlace() - elif fluid.core.is_compiled_with_cuda(): - device_id = int(os.getenv("FLAGS_selected_gpus", "0")) - place = fluid.CUDAPlace(device_id) - elif fluid.core.is_compiled_with_xpu(): - device_id = int(os.getenv("FLAGS_selected_xpus", "0")) - place = fluid.XPUPlace(device_id) - elif fluid.core.is_compiled_with_npu(): - device_id = int(os.getenv("FLAGS_selected_npus", "0")) - place = fluid.NPUPlace(device_id) - else: - assert ("Only support CUDAPlace or XPUPlace or CPU(Gloo) for now.") - - with _test_eager_guard(): - with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - np.random.seed(seed) - import random - random.seed(seed) - - model, train_reader, opt = self.get_model() - - #if args.update_method == "nccl2": - if args.update_method in ["nccl2", "gloo"]: - paddle.distributed.init_parallel_env() - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank - is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore( - "127.0.0.1", args.dist_port, is_master, nranks) - if args.update_method == "nccl2": - group = core.ProcessGroupNCCL(store, rank, nranks) - elif args.update_method == "gloo": - group = core.ProcessGroupGloo(store, rank, nranks) - - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - model = dygraph.parallel.DataParallel( - model, - process_group=group, - find_unused_parameters=args.find_unused_parameters) - print_to_err(type(self).__name__, "model built in dygraph") - - out_losses = [] - print_to_err( - type(self).__name__, "begin to run dygraph training") - for step_id, data in enumerate(train_reader()): - data = self._get_data(data, args) - if step_id == RUN_STEP: - break - loss = self.run_one_loop(model, 
opt, data) - if step_id % 10 == 0: - print_to_err( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss.numpy())) - out_losses.append(loss.numpy()) - - loss.backward() - - opt.minimize(loss) - if not args.accumulate_gradient: - model.clear_gradients() - print_to_out(out_losses) - def run_trainer_with_spawn(self, args): - if args.eager_mode: - return self.run_trainer_with_spawn_in_eager_mode(args) - else: - return self.run_trainer_with_spawn_func(args) - - def run_trainer_with_spawn_func(self, args): # 1. enable dygraph paddle.disable_static() @@ -733,64 +652,7 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() return out_losses - def run_trainer_with_spawn_in_eager_mode(self, args): - # 1. enable dygraph - paddle.disable_static() - - # 2. init seed - seed = 90 - paddle.static.default_startup_program().random_seed = seed - paddle.static.default_main_program().random_seed = seed - np.random.seed(seed) - random.seed(seed) - # get trainer id - args.trainer_id = paddle.distributed.get_rank() - - # 3. init parallel env - if args.update_method in ["nccl2", "gloo"]: - paddle.distributed.init_parallel_env() - - # 4. build process group - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank - is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore("127.0.0.1", args.dist_port, - is_master, nranks) - if args.update_method == "nccl2": - group = core.ProcessGroupNCCL(store, rank, nranks) - elif args.update_method == "gloo": - group = core.ProcessGroupGloo(store, rank, nranks) - - # 5. train model - with _test_eager_guard(): - model, train_reader, opt = self.get_model() - if args.update_method in ["nccl2", "gloo"]: - model = paddle.DataParallel( - model, - process_group=group, - find_unused_parameters=args.find_unused_parameters) - - out_losses = [] - for step_id, data in enumerate(train_reader()): - data = self._get_data(data, args) - if step_id == RUN_STEP: - break - loss = self.run_one_loop(model, opt, data) - out_losses.append(loss.numpy()) - - loss.backward() - - opt.minimize(loss) - model.clear_gradients() - return out_losses - def run_use_fleet_api_trainer(self, args): - if args.eager_mode: - self.run_use_fleet_api_trainer_in_eager_mode(args) - else: - self.run_use_fleet_api_trainer_func(args) - - def run_use_fleet_api_trainer_func(self, args): import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker # 1. enable dygraph @@ -835,52 +697,6 @@ class TestParallelDyGraphRunnerBase(object): opt.clear_grad() print_to_out(out_losses) - def run_use_fleet_api_trainer_in_eager_mode(self, args): - import paddle.distributed.fleet as fleet - import paddle.distributed.fleet.base.role_maker as role_maker - # 1. enable dygraph - paddle.disable_static() - - # 2. init seed - seed = 90 - paddle.static.default_startup_program().random_seed = seed - paddle.static.default_main_program().random_seed = seed - np.random.seed(seed) - random.seed(seed) - # get trainer id - args.trainer_id = paddle.distributed.get_rank() - - # set strategy - strategy = fleet.DistributedStrategy() - if args.find_unused_parameters: - strategy.find_unused_parameters = True - - # 3. init parallel env - if args.update_method == "nccl2" or "bkcl" or "hccl": - fleet.init(is_collective=True, strategy=strategy) - - # 4. 
train model - with _test_eager_guard(): - model, train_reader, opt = self.get_model() - if args.update_method == "nccl2" or "bkcl" or "hccl": - opt = fleet.distributed_optimizer(opt) - model = fleet.distributed_model(model) - - out_losses = [] - for step_id, data in enumerate(train_reader()): - data = self._get_data(data, args) - if step_id == RUN_STEP: - break - loss = self.run_one_loop(model, opt, data) - out_losses.append(loss.numpy()) - - loss.backward() - - opt.step() - if not args.accumulate_gradient: - opt.clear_grad() - print_to_out(out_losses) - def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') @@ -911,8 +727,6 @@ def runtime_main(test_class): parser.add_argument( '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') - parser.add_argument('--eager_mode', action='store_true') - parser.add_argument('--dist_port', type=int, required=False, default=6175) parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_xpu', action='store_true') @@ -1005,8 +819,6 @@ class TestDistBase(unittest.TestCase): self._port_set = set() self._python_interp = sys.executable self._sync_mode = True - self._dist_port = 6175 - self._eager_mode = False self._hogwild_mode = False self._enforce_place = None self._use_reduce = False @@ -1168,10 +980,6 @@ class TestDistBase(unittest.TestCase): if len(devices) > 1 and self._use_dgc: cmd += " --use_dgc" - if self._eager_mode: - cmd += " --eager_mode" - cmd += " --dist_port {}".format(self._dist_port) - if self._accumulate_gradient: cmd += " --accumulate_gradient" @@ -1245,11 +1053,6 @@ class TestDistBase(unittest.TestCase): if self._sync_mode: tr0_cmd += " --sync_mode" tr1_cmd += " --sync_mode" - if self._eager_mode: - tr0_cmd += " --eager_mode" - tr1_cmd += " --eager_mode" - tr0_cmd += " --dist_port {}".format(self._dist_port) - tr1_cmd += " --dist_port {}".format(self._dist_port) if self._hogwild_mode: tr0_cmd += " --hogwild" tr1_cmd += " --hogwild" @@ -1356,10 +1159,6 @@ class TestDistBase(unittest.TestCase): assert self._use_dgc == False, "gloo not support use dgc" - if self._eager_mode: - tr_cmd += " --eager_mode" - tr_cmd += " --dist_port {}".format(self._dist_port) - if self._accumulate_gradient: tr_cmd += " --accumulate_gradient" @@ -1437,10 +1236,6 @@ class TestDistBase(unittest.TestCase): if self._use_dgc: tr_cmd += " --use_dgc" - if self._eager_mode: - tr_cmd += " --eager_mode" - tr_cmd += " --dist_port {}".format(self._dist_port) - if self._accumulate_gradient: tr_cmd += " --accumulate_gradient" @@ -1665,7 +1460,34 @@ class TestDistBase(unittest.TestCase): check_error_log=False, need_envs={}, log_name=""): + if self._dygraph and (self._gloo_mode or self._nccl2_mode): + with _test_eager_guard(): + self.check_with_place_func( + model_file=model_file, + delta=delta, + check_error_log=check_error_log, + need_envs=need_envs, + log_name=log_name) + self.check_with_place_func( + model_file=model_file, + delta=delta, + check_error_log=check_error_log, + need_envs=need_envs, + log_name=log_name) + else: + self.check_with_place_func( + model_file=model_file, + delta=delta, + check_error_log=check_error_log, + need_envs=need_envs, + log_name=log_name) + def check_with_place_func(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}, + log_name=""): required_envs = self._get_required_envs(check_error_log, need_envs) if self._gloo_mode: diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_group.py b/python/paddle/fluid/tests/unittests/test_imperative_group.py index 89535797ed..994ae27a29 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_group.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py @@ -26,7 +26,7 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.nn import Linear import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, in_dygraph_mode class TestDataParallelGroup(unittest.TestCase): @@ -34,7 +34,10 @@ class TestDataParallelGroup(unittest.TestCase): return paddle.rand(shape=shape, dtype=dtype) def assign_group_by_size(self, *args): - return core.assign_group_by_size(*args) + if in_dygraph_mode(): + return core.eager_assign_group_by_size(*args) + elif _in_legacy_dygraph(): + return core.assign_group_by_size(*args) def test_construct_group0(self): # one dtype & one limit capability @@ -160,14 +163,19 @@ class TestDataParallelGroup(unittest.TestCase): [300], [1, 0, 2, 3]) self.assertEqual([[1, 0], [3], [2]], res) - -class TestDataParallelGroupEager(TestDataParallelGroup): - def create_varbase(self, dtype, shape): + def test_construct_group_in_legacy_mode(self): with _test_eager_guard(): - return paddle.rand(shape=shape, dtype=dtype) - - def assign_group_by_size(self, *args): - return core.eager_assign_group_by_size(*args) + pass + self.test_construct_group0() + self.test_construct_group1() + self.test_construct_group2() + self.test_construct_group3() + self.test_construct_group4() + self.test_construct_group5() + self.test_construct_group6() + self.test_construct_group7() + self.test_construct_group8() + self.test_construct_group9() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow_in_eager_mode.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow_in_eager_mode.py deleted file mode 100644 index dde0c4b260..0000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow_in_eager_mode.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import os -import sys -import unittest - -import paddle.fluid as fluid -from test_dist_base import TestDistBase -from spawn_runner_base import TestDistSpawnRunner - -flag_name = os.path.splitext(__file__)[0] - - -class TestDygraphControlFlowSameEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._eager_mode = True - self._dygraph = True - self._find_unused_parameters = True - - def test_net(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_control_flow_same.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestDygraphControlFlowSameAccGradEager(TestDygraphControlFlowSameEager): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._eager_mode = True - self._dygraph = True - self._accumulate_gradient = True - self._find_unused_parameters = True - - -class TestDygraphControlFlowDiffEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._eager_mode = True - self._dygraph = True - self._find_unused_parameters = True - - def test_net(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_control_flow_different.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestFleetDygraphControlFlowDiffAccGradEager( - TestDygraphControlFlowDiffEager): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._eager_mode = True - self._dygraph = True - self._accumulate_gradient = True - self._find_unused_parameters = True - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index cbf08856e7..d2e7949981 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -208,11 +208,6 @@ class TestDataParallelWithPyLayer(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py') -class TestDataParallelInEagerMode(TestMultipleGpus): - def test_multiple_gpus_dynamic(self): - self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') - - class TestGradientCheckInEagerMode(TestMultipleGpus): def test_multiple_gpus_dynamic(self): self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py index 6c5a2375f6..ce67a2ce4d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py @@ -136,7 +136,7 @@ class TestDataParallelGradientCheck(TestMultipleGpus): class TestDataParallelGradientCheckInEagerMode(TestMultipleGpus): def test_multiple_gpus_dynamic(self): - self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') + self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_in_eager_mode.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_in_eager_mode.py deleted file mode 100644 index d0e7d41395..0000000000 --- 
a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_in_eager_mode.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import sys -import unittest - -import paddle.fluid as fluid -from test_dist_base import TestDistBase -from spawn_runner_base import TestDistSpawnRunner -from parallel_dygraph_no_sync import TestNoSync -from parallel_dygraph_no_sync_unused_params import TestNoSyncUnusedParam -from parallel_dygraph_no_sync_control_flow import TestNoSyncControlFlow - -flag_name = os.path.splitext(__file__)[0] - - -class TestParallelDygraphNoSync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - self._find_unused_parameters = False - - def test_no_sync(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_no_sync.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphNoSyncUnusedParam(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - self._find_unused_parameters = True - - def test_no_sync_ununsed_param(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_no_sync_unused_params.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphNoSyncControlFlow(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - self._find_unused_parameters = True - - def test_no_sync_control_flow(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_no_sync_control_flow.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphNoSyncSpawn(TestDistSpawnRunner): - def test_no_sync_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn(test_class=TestNoSync, delta=1e-5) - - -class TestParallelDygraphNoSyncUnusedParamSpawn(TestDistSpawnRunner): - def _args_config(self, args): - args.find_unused_parameters = True - args.eager_mode = True - - def test_no_sync_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestNoSyncUnusedParam, delta=1e-5) - - -class TestParallelDygraphNoSyncControlFlowSpawn(TestDistSpawnRunner): - def _args_config(self, args): - args.find_unused_parameters = True - args.eager_mode = True - - def test_no_sync_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestNoSyncControlFlow, delta=1e-5) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py index 30349270b9..43907da609 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py @@ -64,47 +64,5 @@ class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner): test_class=TestSparseEmbedding, delta=1e-5) -class TestParallelDygraphSparseEmdeddingEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._eager_mode = True - self._dygraph = True - - def test_sparse_embedding(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_sparse_embedding.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphSparseEmdeddingFP64Eager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_sparse_embedding_fp64(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_sparse_embedding_fp64.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphSparseEmdeddingSpawnEager(TestDistSpawnRunner): - def _args_config(self, args): - args.eager_mode = True - - def test_sparse_embedding_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestSparseEmbedding, delta=1e-5) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py index fb4c992d35..9aca448f16 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py @@ -48,32 +48,5 @@ class TestParallelDygraphSparseEmdeddingOverHeightSpawn(TestDistSpawnRunner): test_class=TestSparseEmbeddingOverHeight, delta=1e-5) -class TestParallelDygraphSparseEmdeddingOverHeightEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_sparse_embedding(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_sparse_embedding_over_height.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphSparseEmdeddingOverHeightSpawnEager( - TestDistSpawnRunner): - def _args_config(self, args): - args.eager_mode = True - - def test_sparse_embedding_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestSparseEmbeddingOverHeight, delta=1e-5) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py index 3a7a32c2ec..7cf1e9711b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -36,21 +36,5 @@ class TestParallelDygraphMnist(TestDistBase): log_name=flag_name) -class 
TestParallelDygraphMnistEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_mnist(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_sync_batch_norm.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index 2141cceb79..71a8c7347e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -41,13 +41,6 @@ class TestParallelDygraphTransformer(TestDistBase): log_name=flag_name) -class TestParallelDygraphTransformerSpawn(TestDistSpawnRunner): - def test_transformer_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestTransformer, delta=1e-5) - - class TestParallelDygraphTransformerAccGrad(TestDistBase): def _setup_config(self): self._sync_mode = False @@ -65,21 +58,5 @@ class TestParallelDygraphTransformerAccGrad(TestDistBase): log_name=flag_name) -class TestParallelDygraphTransformerEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_transformer(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_transformer.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py index f2225111d1..75fa6f7c71 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py @@ -86,71 +86,5 @@ class TestParallelDygraphSharedUnusedVariables(TestDistBase): log_name=flag_name) -class TestParallelDygraphUnusedVarEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_net(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_unused_variables.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestDygraphUnusedVarEager(TestParallelDygraphUnusedVar): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - -class TestSparseEmbeddingUnusedVarsSpawnEager(TestDistSpawnRunner): - def _args_config(self, args): - args.eager_mode = True - - def test_mnist_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) - - -class TestParallelDygraphNoVarEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_net(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_none_var.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class 
TestParallelDygraphSharedUnusedVariablesEager(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._eager_mode = True - self._nccl2_mode = True - self._dygraph = True - - def test_mnist(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_shared_unused_var.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - if __name__ == "__main__": unittest.main() -- GitLab
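For reference, a minimal usage sketch of the API change carried by this patch: `paddle.DataParallel` now accepts `group` instead of `process_group`, and in dygraph (eager) mode it falls back to the default collective group created by `init_parallel_env()` when `group` is None. The model, hyperparameters, loop, and launch command below are illustrative assumptions, not part of the patch:

# Illustrative sketch only -- the model, sizes, and training loop are
# assumptions; only the DataParallel(group=...) behavior comes from this patch.
import paddle
import paddle.distributed as dist
import paddle.nn as nn


def train():
    # init_parallel_env() creates the default process group; with this patch,
    # DataParallel(group=None) picks up that default group in dygraph mode.
    dist.init_parallel_env()

    model = paddle.DataParallel(nn.Linear(10, 10))
    # An explicit group object can still be passed via the renamed argument:
    #   model = paddle.DataParallel(nn.Linear(10, 10), group=some_group)

    opt = paddle.optimizer.SGD(learning_rate=0.001,
                               parameters=model.parameters())
    for _ in range(10):
        x = paddle.rand([4, 10])
        loss = model(x).mean()
        loss.backward()
        opt.step()
        opt.clear_grad()


if __name__ == "__main__":
    # Typically launched with: python -m paddle.distributed.launch demo.py
    train()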