//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"

namespace paddle {
namespace framework {
namespace details {

void BroadcastOpHandle::RunImpl() {
C
chengduoZH 已提交
24
  if (places_.size() == 1) return;
Y
Yu Yang 已提交
25

C
chengduoZH 已提交
26 27
  // The input and output may have dummy vars.
  VarHandle *in_var_handle;
Y
Yu Yang 已提交
28 29 30 31 32 33 34 35
  {
    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
                      "The number of input should be one.");
    in_var_handle = in_var_handles[0];
  }

  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
C
chengduoZH 已提交
36

C
chengduoZH 已提交
37
  PADDLE_ENFORCE_EQ(
38
      out_var_handles.size(), places_.size(),
C
chengduoZH 已提交
39
      "The number of output should equal to the number of places.");
C
chengduoZH 已提交
40

C
chengduoZH 已提交
41
  WaitInputVarGenerated();
C
chengduoZH 已提交
42

C
chengduoZH 已提交
43 44 45 46 47 48 49
  std::vector<const Scope *> var_scopes;
  for (auto *s : local_scopes_) {
    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }

  auto *in_var =
      var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
Y
Yu Yang 已提交
50 51
  PADDLE_ENFORCE_NOT_NULL(in_var);
  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
C
chengduoZH 已提交
52

53
  InitOutputValue(*in_var_handle, out_var_handles);
C
chengduoZH 已提交
54

C
chengduoZH 已提交
55
  if (platform::is_cpu_place(in_tensor.place())) {
C
chengduoZH 已提交
56
    for (auto *out_var_handle : out_var_handles) {
C
chengduoZH 已提交
57
      if (out_var_handle->IsTheSameVar(*in_var_handle)) {
C
chengduoZH 已提交
58 59
        continue;
      }
C
chengduoZH 已提交
60 61 62 63
      auto &out_p = out_var_handle->place_;
      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
                          ->FindVar(out_var_handle->name_);

C
chengduoZH 已提交
64
      RunAndRecordEvent(out_p, [in_tensor, out_var] {
C
chengduoZH 已提交
65
        paddle::framework::TensorCopy(
C
chengduoZH 已提交
66
            in_tensor, platform::CPUPlace(),
C
chengduoZH 已提交
67 68 69 70 71
            &VariableVisitor::GetMutableTensor(out_var));
      });
    }
  } else {
#ifdef PADDLE_WITH_CUDA
C
chengduoZH 已提交
72 73
    VarHandle *out_handle = nullptr;
    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
C
chengduoZH 已提交
74 75
    std::vector<std::function<void()>> broadcast_calls;

76 77 78
    int type = platform::ToNCCLDataType(in_tensor.type());
    size_t numel = static_cast<size_t>(in_tensor.numel());

C
chengduoZH 已提交
79
    for (auto out_var_handle : out_var_handles) {
C
chengduoZH 已提交
80 81 82
      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
                              ->FindVar(out_var_handle->name_);

C
chengduoZH 已提交
83 84
      int dst_id =
          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
C
chengduoZH 已提交
85

C
chengduoZH 已提交
86
      auto &nccl_ctx = nccl_ctxs_->at(dst_id);
C
chengduoZH 已提交
87 88

      void *send_recv_buffer = nullptr;
C
chengduoZH 已提交
89
      if (root_id == dst_id) {
C
chengduoZH 已提交
90 91 92
        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
        out_handle = out_var_handle;
      } else {
93 94 95
        send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
                               .Resize(in_tensor.dims())
                               .mutable_data(out_var_handle->place_);
C
chengduoZH 已提交
96 97
      }

C
chengduoZH 已提交
98 99 100 101 102 103
      broadcast_calls.emplace_back(
          [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
            PADDLE_ENFORCE(platform::dynload::ncclBcast(
                send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
                root_id, nccl_ctx.comm_, nccl_ctx.stream()));
          });
Y
Yu Yang 已提交
104 105
    }

C
chengduoZH 已提交
106 107 108 109 110 111 112
    this->RunAndRecordEvent([&] {
      {
        platform::NCCLGroupGuard guard;
        for (auto &call : broadcast_calls) {
          call();
        }
      }
C
chengduoZH 已提交
113 114

      if (!out_handle->IsTheSameVar(*in_var_handle)) {
C
chengduoZH 已提交
115 116 117 118 119 120 121
        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
                           ->FindVar(out_var_handles[0]->name_);
        paddle::framework::TensorCopy(
            in_tensor, in_var_handle->place_,
            *(dev_ctxes_.at(in_var_handle->place_)),
            &VariableVisitor::GetMutableTensor(out_var));
      }
C
chengduoZH 已提交
122
    });
C
chengduoZH 已提交
123
#else
C
chengduoZH 已提交
124
    PADDLE_THROW("CUDA is not enabled.");
C
chengduoZH 已提交
125
#endif
C
chengduoZH 已提交
126 127 128
  }
}

void BroadcastOpHandle::InitOutputValue(
    const VarHandle &in_var_handle,
    const std::vector<VarHandle *> &out_var_handles) const {
  std::vector<const Scope *> var_scopes;
  for (auto *s : local_scopes_) {
    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }
  auto *in_var =
      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);

  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);

  // NOTE: The tensors' Place of input and output must be all on GPU or all on
  // CPU.
  for (auto *out_var_handle : out_var_handles) {
    if (out_var_handle->IsTheSameVar(in_var_handle)) {
      continue;
    }
    auto t_out_p = out_var_handle->place_;
    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
                        ->FindVar(out_var_handle->name_);
    PADDLE_ENFORCE_NOT_NULL(out_var);
    if (is_gpu_place(in_tensor.place())) {
      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
                     "Places of input and output must be all on GPU.");
    } else {
      t_out_p = platform::CPUPlace();
    }
    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
                                                            in_tensor.type());
  }
}

C
chengduoZH 已提交
163
std::string BroadcastOpHandle::Name() const { return "broadcast"; }
}  // namespace details
}  // namespace framework
}  // namespace paddle