From e50d883efe64441841b136738f088889ec90eea6 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 17 Jan 2022 11:30:40 +0800 Subject: [PATCH] Add NoReduce mode for ParallelExecutor (#38969) * add no reduce mode for pe * add NoReduce ut --- .../fluid/framework/details/build_strategy.cc | 4 + .../fluid/framework/details/build_strategy.h | 2 +- .../framework/distributed_strategy.proto | 1 + .../multi_devices_graph_pass.cc | 2 + .../multi_devices_graph_pass.h | 9 ++ paddle/fluid/pybind/pybind.cc | 3 +- .../fleet/base/distributed_strategy.py | 16 ++- python/paddle/fluid/compiler.py | 15 ++- .../unittests/test_parallel_executor_mnist.py | 99 +++++++++++++++++-- 9 files changed, 134 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index cee97820d6a..c99200ec98a 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -239,6 +239,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); break; + case BuildStrategy::ReduceStrategy::kNoReduce: + multi_devices_pass = AppendPass("no_reduce_multi_devices_pass").get(); + break; default: PADDLE_THROW( platform::errors::Unimplemented("Unknown reduce strategy.")); @@ -475,6 +478,7 @@ USE_PASS(fuse_bn_act_pass); USE_PASS(fuse_bn_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); +USE_PASS(no_reduce_multi_devices_pass); USE_PASS(reduce_mode_multi_devices_pass); USE_PASS(all_reduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index f9c28cbee50..70a083dd70b 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -72,7 +72,7 @@ struct BuildStrategy { // For CPU, if you want to fix the order of summing to make the result // of kAllReduce and kReduce no diff, you can add // `FLAGS_cpu_deterministic=true` to env. - enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 }; + enum class ReduceStrategy { kAllReduce = 0, kReduce = 1, kNoReduce = 2 }; enum class GradientScaleStrategy { kCoeffNumDevice = 0, diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 28108e78d9d..da2147f4036 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -117,6 +117,7 @@ message BuildStrategy { optional bool enable_addto = 12 [ default = false ]; optional bool fix_op_run_order = 13 [ default = false ]; optional bool allow_cuda_graph_capture = 14 [ default = false ]; + optional int32 reduce_strategy = 15 [ default = 0 ]; } message ExecutionStrategy { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index c50e00f9995..5dbc3e38ea1 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -1283,3 +1283,5 @@ REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, paddle::framework::ir::DistSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS(async_multi_devices_pass, paddle::framework::ir::AsyncSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(no_reduce_multi_devices_pass, + paddle::framework::ir::NoReduceSSAGraphBuilder); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index 27eda22828e..c76f3001676 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -144,6 +144,15 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool IsEncoded(const std::string &p_name) const; }; +class NoReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + void InsertCollectiveOp(ir::Graph *result, ir::Node *node, + const std::string &p_name, + const std::string &g_name) const override {} + + void InsertPostprocessOps(ir::Graph *result) const override {} +}; + class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { protected: void InsertCollectiveOp(ir::Graph *result, ir::Node *node, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5f4e9a88613..3eabf255ccb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2984,7 +2984,8 @@ All parameter, weight, gradient are variables in Paddle. py::enum_(build_strategy, "ReduceStrategy") .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce); + .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) + .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); py::enum_(build_strategy, "GradientScaleStrategy") .value("CoeffNumDevice", diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 3b8b36a61e2..8c9499628e7 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -102,6 +102,10 @@ class DistributedJobInfo(object): self.job_info.strategy = dist_strategy +ReduceStrategyFluid = paddle.fluid.BuildStrategy.ReduceStrategy +ReduceStrategyFleet = int + + class DistributedStrategy(object): __lock_attr = False @@ -239,8 +243,10 @@ class DistributedStrategy(object): build_strategy = paddle.fluid.BuildStrategy() fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: - setattr(build_strategy, f.name, - getattr(self.strategy.build_strategy, f.name)) + value = getattr(self.strategy.build_strategy, f.name) + if f.name == 'reduce_strategy': + value = ReduceStrategyFluid(value) + setattr(build_strategy, f.name, value) return build_strategy @build_strategy.setter @@ -249,8 +255,10 @@ class DistributedStrategy(object): fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: if f.label == 1 or f.label == 2: # optional and required field - setattr(self.strategy.build_strategy, f.name, - getattr(strategy, f.name)) + value = getattr(strategy, f.name) + if f.name == 'reduce_strategy': + value = ReduceStrategyFleet(value) + setattr(self.strategy.build_strategy, f.name, value) elif f.label == 3: # repeated field getattr(self.strategy.build_strategy, f.name).extend(getattr(strategy, f.name)) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 2698f1a00dc..1fa86d0aeea 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -85,6 +85,16 @@ def _has_optimizer_in_control_flow(program): return False +def _should_broadcast_or_not_exists(program, var_name): + block = program.global_block() + var = block.vars.get(var_name, None) + if var is None: + return True + is_distributed = getattr(var, '_is_distributed', False) or getattr( + var, 'is_distributed', False) + return not is_distributed + + class CompiledProgram(object): """ :api_attr: Static Graph @@ -398,7 +408,10 @@ class CompiledProgram(object): for node in self._graph.nodes(): if node.is_var() and node.var() is not None and node.var().persistable() and \ node.var().type() != core.VarDesc.VarType.RAW: - self._persistable_vars.append(cpt.to_text(node.name())) + name = cpt.to_text(node.name()) + if self._program is not None and _should_broadcast_or_not_exists( + self._program, name): + self._persistable_vars.append(cpt.to_text(node.name())) places = list(map(_place_obj, places)) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 2c79670f1a2..61d643f24c1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -65,17 +65,18 @@ def fc_with_batchnorm(use_feed): return loss +def init_data(): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + class TestMNIST(TestParallelExecutorBase): @classmethod def setUpClass(cls): os.environ['CPU_NUM'] = str(4) - def _init_data(self): - np.random.seed(5) - img = np.random.random(size=[32, 784]).astype(np.float32) - label = np.ones(shape=[32, 1], dtype='int64') - return img, label - def _compare_reduce_and_allreduce(self, model, use_device, @@ -87,7 +88,7 @@ class TestMNIST(TestParallelExecutorBase): if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): return - img, label = self._init_data() + img, label = init_data() all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, @@ -116,7 +117,7 @@ class TestMNIST(TestParallelExecutorBase): if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): return - img, label = self._init_data() + img, label = init_data() self.check_network_convergence( simple_fc_net, @@ -144,7 +145,7 @@ class TestMNIST(TestParallelExecutorBase): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return - img, label = self._init_data() + img, label = init_data() single_first_loss, single_last_loss = self.check_network_convergence( method=simple_fc_net, @@ -175,7 +176,7 @@ class TestMNIST(TestParallelExecutorBase): return if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): return - img, label = self._init_data() + img, label = init_data() self.check_network_convergence( fc_with_batchnorm, @@ -199,6 +200,84 @@ class TestMNIST(TestParallelExecutorBase): 1e-5, 1e-2) +class TestMNISTNoReduce(unittest.TestCase): + def run_program(self, device_type): + if device_type == DeviceType.CUDA: + if not paddle.is_compiled_with_cuda(): + return + places = paddle.static.cuda_places() + else: + self.assertEqual(device_type, DeviceType.CPU) + places = paddle.static.cpu_places(4) + + paddle.seed(10) + with paddle.fluid.unique_name.guard(): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + loss = simple_fc_net(use_feed=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.0) + optimizer.minimize(loss) + + grads = [p.name + '@GRAD' for p in main.all_parameters()] + no_reduce = paddle.static.BuildStrategy.ReduceStrategy._NoReduce + + build_strategy = paddle.static.BuildStrategy() + build_strategy.reduce_strategy = no_reduce + main_multi_place = paddle.static.CompiledProgram( + main).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + places=places) + + build_strategy = paddle.static.BuildStrategy() + build_strategy.reduce_strategy = no_reduce + main_single_place = paddle.static.CompiledProgram(main.clone( + )).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + places=places[0]) + + image, label = init_data() + feed = {'image': image, 'label': label} + exe = paddle.static.Executor(places[0]) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + exe.run(startup) + grads_multi_place = exe.run(main_multi_place, + feed=feed, + fetch_list=[grads]) + + feeds = self.split_feed(feed, len(places)) + grads_single_place = [list() for _ in range(len(grads))] + for f in feeds: + gs = exe.run(main_single_place, feed=f, fetch_list=[grads]) + for i, g in enumerate(gs): + grads_single_place[i].append(g) + + for i in range(len(grads)): + grads_single_place[i] = np.concatenate( + grads_single_place[i], axis=0) / len(places) + + self.assertEqual(len(grads_multi_place), len(grads_single_place)) + for g1, g2 in zip(grads_multi_place, grads_single_place): + self.assertTrue( + np.allclose(g1, g2), 'g1 = {}\ng2 = {}\n'.format(g1, g2)) + + def split_feed(self, feed, n): + image = feed['image'] + label = feed['label'] + self.assertEqual(image.shape[0] % n, 0) + self.assertEqual(label.shape[0] % n, 0) + images = np.split(image, n) + labels = np.split(label, n) + return [{'image': images[i], 'label': labels[i]} for i in range(n)] + + def test_main(self): + self.run_program(DeviceType.CUDA) + self.run_program(DeviceType.CPU) + + if __name__ == '__main__': paddle.enable_static() unittest.main() -- GitLab