//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {
namespace details {

void BroadcastOpHandle::RunImpl() {
Y
Yancey1989 已提交
25
  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
Y
Yancey1989 已提交
26

C
chengduoZH 已提交
27
  if (places_.size() == 1) return;
Y
Yu Yang 已提交
28

C
chengduoZH 已提交
29 30
  // The input and output may have dummy vars.
  VarHandle *in_var_handle;
Y
Yu Yang 已提交
31 32 33 34 35 36 37 38
  {
    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
                      "The number of input should be one.");
    in_var_handle = in_var_handles[0];
  }

  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
C
chengduoZH 已提交
39

C
chengduoZH 已提交
40
  PADDLE_ENFORCE_EQ(
41
      out_var_handles.size(), places_.size(),
C
chengduoZH 已提交
42
      "The number of output should equal to the number of places.");
C
chengduoZH 已提交
43

C
chengduoZH 已提交
44
  WaitInputVarGenerated();
C
chengduoZH 已提交
45

C
chengduoZH 已提交
46 47 48 49 50
  std::vector<const Scope *> var_scopes;
  for (auto *s : local_scopes_) {
    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }

51 52 53 54 55 56 57
  BroadcastOneVar(*in_var_handle, out_var_handles, var_scopes);
}

// Broadcasts the tensor held by `in_var_handle` to every variable named by
// `out_var_handles`.
//
// in_var_handle:   handle of the source variable.
// out_var_handles: handles of the destination variables.
// var_scopes:      scopes indexed by a handle's scope_idx_; each variable is
//                  looked up by name in the scope its handle points to.
//
// Returns early (after logging at VLOG level 3) when the input tensor is not
// initialized. A CPU input is copied to each output with TensorCopy; any
// other input is sent with ncclBcast when built with PADDLE_WITH_CUDA, and
// raises "CUDA is not enabled." otherwise.
void BroadcastOpHandle::BroadcastOneVar(
    const VarHandle &in_var_handle,
    const std::vector<VarHandle *> &out_var_handles,
    const std::vector<const Scope *> &var_scopes) {
  auto *in_var =
      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
  PADDLE_ENFORCE_NOT_NULL(in_var);
  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
  if (!in_tensor.IsInitialized()) {
    // `in_var_handle` is a reference, so its members are reached with `.`.
    VLOG(3) << "in var " << in_var_handle.name_ << " not inited, return!";
    return;
  }

  // Give every output the input's dims/LoD and allocate its memory.
  InitOutputValue(in_var_handle, out_var_handles);

  if (platform::is_cpu_place(in_tensor.place())) {
    for (auto *out_var_handle : out_var_handles) {
      // An output that is the input itself already holds the data.
      if (out_var_handle->IsTheSameVar(in_var_handle)) {
        continue;
      }
      auto &out_p = out_var_handle->place_;
      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
                          ->FindVar(out_var_handle->name_);

      // `in_tensor` is captured by value, `out_var` as a pointer.
      RunAndRecordEvent(out_p, [in_tensor, out_var] {
        paddle::framework::TensorCopy(
            in_tensor, platform::CPUPlace(),
            &VariableVisitor::GetMutableTensor(out_var));
      });
    }
  } else {
#ifdef PADDLE_WITH_CUDA
    // Output handle that lives on the same device as the input (the root).
    VarHandle *out_handle = nullptr;
    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
    std::vector<std::function<void()>> broadcast_calls;

    int type = platform::ToNCCLDataType(in_tensor.type());
    size_t numel = static_cast<size_t>(in_tensor.numel());

    for (auto *out_var_handle : out_var_handles) {
      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
                              ->FindVar(out_var_handle->name_);

      int dst_id =
          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;

      auto &nccl_ctx = nccl_ctxs_->at(dst_id);

      void *send_recv_buffer = nullptr;
      if (root_id == dst_id) {
        // The root sends straight from the input tensor's buffer.
        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
        out_handle = out_var_handle;
      } else {
        // Every other device receives into its own, resized output tensor.
        send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
                               .Resize(in_tensor.dims())
                               .mutable_data(out_var_handle->place_);
      }

      // Only queue the call here; all calls are issued together below.
      broadcast_calls.emplace_back(
          [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
            PADDLE_ENFORCE(platform::dynload::ncclBcast(
                send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
                root_id, nccl_ctx.comm_, nccl_ctx.stream()));
          });
    }

    // `out_handle` is dereferenced after the broadcast, so fail with a clear
    // error instead of a null dereference when no output is on the root
    // device.
    PADDLE_ENFORCE_NOT_NULL(out_handle);

    this->RunAndRecordEvent([&] {
      {
        // All queued ncclBcast calls are issued while this guard is alive.
        platform::NCCLGroupGuard guard;
        for (auto &call : broadcast_calls) {
          call();
        }
      }

      // The root sent from the input buffer; if its output is a different
      // variable, that output still needs a copy of the data.
      // NOTE(review): the destination is looked up by the first output's name
      // in the input's scope, which presumably relies on all outputs sharing
      // one name -- confirm.
      if (!out_handle->IsTheSameVar(in_var_handle)) {
        auto out_var = var_scopes.at(in_var_handle.scope_idx_)
                           ->FindVar(out_var_handles[0]->name_);
        paddle::framework::TensorCopy(
            in_tensor, in_var_handle.place_,
            *(dev_ctxes_.at(in_var_handle.place_)),
            &VariableVisitor::GetMutableTensor(out_var));
      }
    });
#else
    PADDLE_THROW("CUDA is not enabled.");
#endif
  }
}

143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
void BroadcastOpHandle::InitOutputValue(
    const VarHandle &in_var_handle,
    const std::vector<VarHandle *> &out_var_handles) const {
  std::vector<const Scope *> var_scopes;
  for (auto *s : local_scopes_) {
    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }
  auto *in_var =
      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);

  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);

  // NOTE: The tensors' Place of input and output must be all on GPU or all on
  // CPU.
  for (auto *out_var_handle : out_var_handles) {
    if (out_var_handle->IsTheSameVar(in_var_handle)) {
      continue;
    }
    auto t_out_p = out_var_handle->place_;
    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
                        ->FindVar(out_var_handle->name_);
    PADDLE_ENFORCE_NOT_NULL(out_var);
    if (is_gpu_place(in_tensor.place())) {
      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
                     "Places of input and output must be all on GPU.");
    } else {
      t_out_p = platform::CPUPlace();
    }
    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
                                                            in_tensor.type());
  }
}

C
chengduoZH 已提交
177
// Returns the fixed name of this op handle.
std::string BroadcastOpHandle::Name() const {
  return "broadcast";
}
}  // namespace details
}  // namespace framework
}  // namespace paddle