all_reduce_op_handle.cc 6.0 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
15
#include <algorithm>
C
chengduoZH 已提交
16
#include "paddle/fluid/framework/details/container_cast.h"
C
chengduoZH 已提交
17
#include "paddle/fluid/framework/details/reduce_and_gather.h"
C
chengduoZH 已提交
18
#include "paddle/fluid/framework/details/variable_visitor.h"
19 20
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/gpu_info.h"
21
#include "paddle/fluid/platform/profiler.h"
Y
Stash  
Yu Yang 已提交
22

23 24 25
#ifdef PADDLE_WITH_CUDA
DECLARE_bool(sync_nccl_allreduce);
#endif
Y
Yancey1989 已提交
26

Y
Yu Yang 已提交
27 28 29
namespace paddle {
namespace framework {
namespace details {
C
chengduoZH 已提交
30

P
peizhilin 已提交
31
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
X
Xin Pan 已提交
32 33
// Constructor used when the build has CUDA enabled and is not targeting
// Windows.
//
// node:         graph node, forwarded to NCCLOpHandleBase.
// local_scopes: scopes stored in local_scopes_; one is expected per place.
// places:       places, forwarded to NCCLOpHandleBase.
// ctxs:         NCCL communicator, forwarded to NCCLOpHandleBase.
AllReduceOpHandle::AllReduceOpHandle(
    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    const platform::NCCLCommunicator *ctxs)
    : NCCLOpHandleBase(node, places, ctxs),
      local_scopes_(local_scopes) {
  // places_ is not initialised here; presumably NCCLOpHandleBase fills it from
  // `places`. RunImpl indexes scopes and places with the same index, so the
  // two containers must have the same length.
  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
C
chengduoZH 已提交
39
#else
X
Xin Pan 已提交
40 41
// Constructor used when NCCL support is not compiled in (no CUDA, or a
// Windows build).
//
// node:         graph node, forwarded to OpHandleBase.
// local_scopes: scopes stored in local_scopes_; one is expected per place.
// places:       places stored in places_.
AllReduceOpHandle::AllReduceOpHandle(
    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places)
    : OpHandleBase(node),
      local_scopes_(local_scopes),
      places_(places) {
  // Same invariant as the NCCL constructor: RunImpl indexes scopes and places
  // with the same index, so a size mismatch would read out of bounds.
  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
C
chengduoZH 已提交
44
#endif
Y
Yu Yang 已提交
45

46
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
G
gongweibao 已提交
47 48
void AllReduceOpHandle::RunAllReduceFuncs(
    const std::vector<std::function<void()>> &all_reduce_calls) {
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
  this->RunAndRecordEvent([&] {
    if (all_reduce_calls.size() == 1UL) {
      // Do not use NCCLGroup when manage NCCL by per thread per device
      all_reduce_calls[0]();
    } else {
      platform::NCCLGroupGuard guard;
      for (auto &call : all_reduce_calls) {
        call();
      }
    }
  });

  if (FLAGS_sync_nccl_allreduce) {
    for (auto &p : places_) {
      int dev_id = boost::get<platform::CUDAPlace>(p).device;
64 65 66
      auto *nccl_ctxs =
          nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
      auto &nccl_ctx = nccl_ctxs->at(dev_id);
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
      auto stream = nccl_ctx.stream();
      cudaError_t e_sync = cudaStreamSynchronize(stream);
      if (e_sync != 0) {
        LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
      }

      cudaError_t e_get = cudaGetLastError();
      if (e_get != 0) {
        LOG(FATAL) << "cudaGetLastError  " << cudaGetErrorString(e_get)
                   << " errno:" << e_get;
      }
    }
  }
}
#endif

83
void AllReduceOpHandle::RunImpl() {
84
  platform::RecordEvent record_event(Name());
Y
Yancey1989 已提交
85

Y
Yancey1989 已提交
86
  WaitInputVarGenerated();
87

Y
Yancey1989 已提交
88 89 90 91 92 93 94 95 96 97 98
  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), places_.size(),
      "The NoDummyInputSize should be equal to the number of places.");
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), out_var_handles.size(),
      "The NoDummyInputSize and NoDummyOutputSize should be equal.");

  std::vector<const LoDTensor *> lod_tensors;
  for (size_t i = 0; i < local_scopes_.size(); ++i) {
99
    auto &local_scope = local_exec_scopes_[i];
Y
Yancey1989 已提交
100
    auto &lod_tensor =
101
        local_scope->FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
Y
Yancey1989 已提交
102
    lod_tensors.emplace_back(&lod_tensor);
103 104
    VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
             << ", out_name:" << out_var_handles[i]->name();
G
gongweibao 已提交
105
    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
Y
Yancey1989 已提交
106 107
                      "The name of input and output should be equal.");
  }
Y
Stash  
Yu Yang 已提交
108

Y
Yancey1989 已提交
109
  if (platform::is_gpu_place(lod_tensors[0]->place())) {
P
peizhilin 已提交
110
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
Y
Yancey1989 已提交
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
    PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
    int dtype = -1;
    size_t numel = 0;
    std::vector<std::function<void()>> all_reduce_calls;
    for (size_t i = 0; i < local_scopes_.size(); ++i) {
      auto &p = places_[i];
      auto &lod_tensor = *lod_tensors[i];
      void *buffer = const_cast<void *>(lod_tensor.data<void>());

      if (dtype == -1) {
        dtype = platform::ToNCCLDataType(lod_tensor.type());
      }

      if (numel == 0) {
        numel = static_cast<size_t>(lod_tensor.numel());
      }

      all_reduce_calls.emplace_back([=] {
129 130
        NCCLAllReduce(p, buffer, buffer, numel,
                      static_cast<ncclDataType_t>(dtype), ncclSum);
Y
Yancey1989 已提交
131 132
      });
    }
133
    VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type());
G
gongweibao 已提交
134
    RunAllReduceFuncs(all_reduce_calls);
C
chengduoZH 已提交
135
#else
Y
Yancey1989 已提交
136
    PADDLE_THROW("Not compiled with CUDA");
C
chengduoZH 已提交
137
#endif
Y
Yancey1989 已提交
138
  } else {  // Special handle CPU only Operator's gradient. Like CRF
139
    auto &trg = *this->local_exec_scopes_[0]
G
gongweibao 已提交
140
                     ->FindVar(out_var_handles[0]->name())
Y
Yancey1989 已提交
141 142 143 144 145 146 147
                     ->GetMutable<framework::LoDTensor>();

    // Reduce All Tensor to trg in CPU
    ReduceLoDTensor func(lod_tensors, &trg);
    VisitDataType(lod_tensors[0]->type(), func);

    for (size_t i = 1; i < local_scopes_.size(); ++i) {
148
      auto &scope = local_exec_scopes_[i];
Y
Yancey1989 已提交
149
      auto &p = places_[i];
150
      auto *var = scope->FindVar(out_var_handles[i]->name());
Y
Yancey1989 已提交
151 152 153 154 155 156 157
      auto *dev_ctx = dev_ctxes_.at(p);

      RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
        auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
        auto &tensor_cpu = trg;
        TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
      });
Y
Yu Yang 已提交
158 159 160
    }
  }
}
Y
Yu Yang 已提交
161

C
chengduoZH 已提交
162
// Returns the fixed name of this op handle; RunImpl passes it to
// platform::RecordEvent.
std::string AllReduceOpHandle::Name() const {
  return "all_reduce";
}
Y
Yu Yang 已提交
163 164 165
}  // namespace details
}  // namespace framework
}  // namespace paddle