//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>

#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {
namespace details {

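// The CUDA (non-Windows) build also records the NCCL context map so that
// each place's all-reduce can run on its own communicator and stream; the
// non-CUDA build only keeps the node, local scopes, and places.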
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
                                     const platform::NCCLContextMap *ctxs)
    : OpHandleBase(node),
      local_scopes_(local_scopes),
      places_(places),
      nccl_ctxs_(ctxs) {
  if (nccl_ctxs_) {
    for (auto &p : places_) {
      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
    }
  }
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif

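// RunImpl reduces one tensor per place. On the GPU path it issues an
// in-place ncclAllReduce for every device; on the CPU path it reduces all
// tensors into scope 0 and copies the result back to the other places.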
void AllReduceOpHandle::RunImpl() {
  int64_t start_ts = GetTS();
  int64_t func_ts = GetTS();
  VLOG(5) << "all_reduce_op_handle::RunImpl start";
  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);

// FIXME(typhoonzero): If scope0 (the global scope) has NCCL_ID_VAR,
// this is a distributed or inter-process call; find a better way.
#ifdef PADDLE_WITH_CUDA
  if (NoDummyInputSize() == 1 &&
      local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) {
#else
  if (NoDummyInputSize() == 1) {
#endif
    return;  // No need to all-reduce when there is only one GPU.
  } else {
    // Wait until all input variables have been generated.
    start_ts = GetTS();
    WaitInputVarGenerated();
    VLOG(5) << "all_reduce_op_handle wait input var spent: "
            << GetTS() - start_ts << " (ns).";
    start_ts = GetTS();
    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
    PADDLE_ENFORCE_EQ(
        in_var_handles.size(), places_.size(),
        "The NoDummyInputSize should be equal to the number of places.");
    PADDLE_ENFORCE_EQ(
        in_var_handles.size(), out_var_handles.size(),
        "The NoDummyInputSize and NoDummyOutputSize should be equal.");

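    // Collect the input tensor from each place's local execution scope.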
    std::vector<const LoDTensor *> lod_tensors;
    for (size_t i = 0; i < local_scopes_.size(); ++i) {
      auto *s = local_scopes_[i];
      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
      auto &lod_tensor =
          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
      lod_tensors.emplace_back(&lod_tensor);
      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
                        "The name of input and output should be equal.");
    }

    if (platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
      int dtype = -1;
      size_t numel = 0;
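      // dtype and numel are captured from the first tensor below; all inputs
      // are assumed to share the same data type and element count.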
      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &p = places_[i];
        auto &lod_tensor = *lod_tensors[i];
        void *buffer = const_cast<void *>(lod_tensor.data<void>());
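        // The all-reduce runs in place: buffer serves as both the send and
        // the receive buffer.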

        if (dtype == -1) {
          dtype = platform::ToNCCLDataType(lod_tensor.type());
        }

        if (numel == 0) {
          numel = static_cast<size_t>(lod_tensor.numel());
        }

        int dev_id = boost::get<platform::CUDAPlace>(p).device;
        VLOG(5) << "call allreduce: " << in_var_handles[i]->name_
                << " on dev: " << dev_id;
        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
        auto stream = nccl_ctx.stream();
        auto comm = nccl_ctx.comm_;
        all_reduce_calls.emplace_back([=] {
          PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
              buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
              ncclSum, comm, stream));
        });
      }
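      // Launch the per-device calls together; in the group path the
      // NCCLGroupGuard below batches them into a single NCCL group call.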
      this->RunAndRecordEvent([&] {
        // TODO(Yancey1989): need an allreduce operator to avoid this flag
        if (nccl_ctxs_->need_group_call_) {
          platform::NCCLGroupGuard guard;
          for (auto &call : all_reduce_calls) {
            call();
          }
        } else {
          // Only used when executor_type == ParallelGraph (one thread per GPU).
          // TODO(Yancey1989): use an allreduce operator to avoid this trick.
          PADDLE_ENFORCE(all_reduce_calls.size() == 1UL);
          all_reduce_calls[0]();
        }
      });

#else
      PADDLE_THROW("Not compiled with CUDA");
#endif
    } else {  // Special handling for CPU-only operators' gradients, e.g. CRF.
      auto &trg = *this->local_scopes_[0]
                       ->FindVar(kLocalExecScopeName)
                       ->Get<Scope *>()
                       ->FindVar(out_var_handles[0]->name_)
                       ->GetMutable<framework::LoDTensor>();

      // Reduce all tensors into trg on the CPU.
      ReduceLoDTensor func(lod_tensors, &trg);
      VisitDataType(ToDataType(lod_tensors[0]->type()), func);

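      // Copy the reduced result from scope 0 back to every other place.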
      for (size_t i = 1; i < local_scopes_.size(); ++i) {
        auto &scope =
            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
        auto &p = places_[i];
        auto *var = scope.FindVar(out_var_handles[i]->name_);
        auto *dev_ctx = dev_ctxes_.at(p);

        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
          auto &tensor_cpu = trg;
          TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
        });
      }
    }
  }
  VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts
          << " (ns).";
}

std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
}  // namespace details
}  // namespace framework
}  // namespace paddle