From b75a69bad6ecc73a057faae3316d29a3a5c9386f Mon Sep 17 00:00:00 2001
From: chengduo
Date: Tue, 2 Apr 2019 02:09:00 -0500
Subject: [PATCH] Add Stream for fetch op handle (#16600)

* expose fuse broadcast ops
---
 paddle/fluid/framework/details/build_strategy.h    |  6 +++++-
 paddle/fluid/framework/details/fetch_op_handle.cc  |  3 ++-
 .../framework/details/multi_devices_graph_pass.cc  |  4 ++--
 paddle/fluid/pybind/pybind.cc                       | 15 ++++++++++++++-
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 5811693b7..cc48c51e9 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -91,7 +91,11 @@ struct BuildStrategy {
 
   bool enable_sequential_execution_{false};
 
-  bool fuse_broadcast_op_{false};
+  // NOTE(zcd): In Reduce mode, fusing broadcast ops may make the program
+  // faster, because fusing them amounts to delaying the execution of all
+  // broadcast ops; during that period, all NCCL streams are used only for
+  // reduce operations.
+  bool fuse_broadcast_ops_{false};
 
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 81e200c0d..6c8b8937e 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -63,7 +63,8 @@ void FetchOpHandle::RunImpl() {
     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopySync(t, cpu, &tensors_[i]);
+      TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index e22bd3917..f213e07b5 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -658,7 +658,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
 
 void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
   if (UseGPU()) {
-    if (strategy_.fuse_broadcast_op_) {
+    if (strategy_.fuse_broadcast_ops_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
       for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
@@ -1021,7 +1021,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
     return;
   }
-  if (strategy_.fuse_broadcast_op_) {
+  if (strategy_.fuse_broadcast_ops_) {
     CreateFusedBroadcastOp(result, bcast_var_name_set_);
   } else {
     for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5b79b759d..044677fb7 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1299,7 +1299,20 @@ All parameter, weight, gradient are variables in Paddle.
                     to fuse relu and depthwise_conv2d,
                     it will save GPU memory and may make the execution faster.
                     This options is only available in GPU devices.
-                    Default False)DOC")
+                    Default False.)DOC")
+      .def_property(
+          "fuse_broadcast_ops",
+          [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
+            self.fuse_broadcast_ops_ = b;
+          },
+          R"DOC(The type is BOOL. fuse_broadcast_ops indicates whether
+                    to fuse the broadcast ops. Note that, in Reduce mode,
+                    fusing broadcast ops may make the program faster, because
+                    fusing them amounts to delaying the execution of all
+                    broadcast ops; during that period, all NCCL streams are
+                    used only for NCCLReduce operations. Default False.)DOC")
       .def_property("fuse_all_optimizer_ops",
                     [](const BuildStrategy &self) {
                       return self.fuse_all_optimizer_ops_;
--
GitLab
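
Usage note (not part of the patch): the fuse_broadcast_ops flag exposed above is toggled from Python through BuildStrategy. The sketch below is an assumption, written against the Fluid 1.x API of this period (fluid.BuildStrategy, fluid.compiler.CompiledProgram); the tiny network is hypothetical and exists only to make the example self-contained.

import paddle.fluid as fluid

# Minimal toy program so the sketch is self-contained (hypothetical network).
x = fluid.layers.data(name='x', shape=[4], dtype='float32')
y = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(y)

build_strategy = fluid.BuildStrategy()
# fuse_broadcast_ops mainly pays off in Reduce mode: the fused broadcast is
# delayed, so the NCCL streams serve only reduce operations in the meantime.
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
build_strategy.fuse_broadcast_ops = True  # property added by this patch

# Compile for data-parallel execution with the strategy applied.
compiled_prog = fluid.compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)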