diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index d5ca061944f33939cea59a5275e691b1966194fa..1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() { int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; + int type = platform::ToNCCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + for (auto out_var_handle : out_var_handles) { Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) ->FindVar(out_var_handle->name_); @@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() { send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { - send_recv_buffer = - VariableVisitor::GetMutableTensor(out_var).mutable_data( - out_var_handle->place_); + send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) + .Resize(in_tensor.dims()) + .mutable_data(out_var_handle->place_); } - int type = platform::ToNCCLDataType(in_tensor.type()); - size_t numel = static_cast(in_tensor.numel()); broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE(platform::dynload::ncclBcast( diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 99dcaf27134e879fa85f57e5a675382442e9edf2..a6fe64fa80d6bf036893d49de56d7274d49a3b30 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -351,7 +351,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); - auto var = new VarHandle(vars.size() - 1, i, og, p); + auto var = new VarHandle(vars.size(), i, og, p); vars.emplace_back(var); op_handle->AddOutput(var); } @@ -447,8 +447,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, op_handle->AddInput(prev_grad.get()); } auto &vars = result->vars_[dst_dev_id][og]; - auto var = - new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]); + auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]); vars.emplace_back(var); op_handle->AddOutput(var); return var; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index f79565fe71c4aef140475c922cbbf5a1e0b7fe03..1f84c3b9e2d7ee9ae51959988fceeb3451b7b3b8 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/op_handle_base.h" +#include namespace paddle { namespace framework { @@ -122,11 +122,16 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event std::function method = callback; - + // NOTE(zcd): device context must be ordered here because RecordEvent + // will use a mutex to ensure the safe of multi-threads. + std::map ordered_ctxes; for (auto &p : dev_ctxes_) { + ordered_ctxes.emplace(p.second, p.first); + } + for (auto &p : ordered_ctxes) { method = [method, p, this]() { - static_cast(p.second)->RecordEvent( - events_.at(boost::get(p.first).device), + static_cast(p.first)->RecordEvent( + events_.at(boost::get(p.second).device), method); }; }