carrier.cc 10.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/fleet_executor/carrier.h"
16
#include "paddle/fluid/distributed/fleet_executor/global_map.h"
17 18
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h"
19
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
20
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
21
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
22
#include "paddle/fluid/framework/garbage_collector.h"
23
#include "paddle/fluid/framework/scope.h"
24 25 26 27

namespace paddle {
namespace distributed {

28
// Reference the "Compute" interceptor type from this translation unit.
// NOTE(review): presumably this keeps the interceptor's factory registration
// from being dropped at link time -- confirm against the macro definition.
USE_INTERCEPTOR(Compute);
29
// Reference the "Amplifier" interceptor type from this translation unit.
// NOTE(review): presumably this keeps the interceptor's factory registration
// from being dropped at link time -- confirm against the macro definition.
USE_INTERCEPTOR(Amplifier);
30

31 32 33 34
void Carrier::Init(
    int64_t rank,
    const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank,
    const std::unordered_set<int64_t>& interceptor_ids) {
35
  rank_ = rank;
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
  interceptor_id_to_rank_ = interceptor_id_to_rank;
  interceptor_ids_ = interceptor_ids;

  // TODO(fleet_exe dev): thread pool
  thread_num_ = 1;
  thread_pool_.SetThreadNum(thread_num_);
  thread_pool_.Start();
}

// Full initialization: stores the execution context (place, device context,
// scopes, task nodes), performs the basic rank/table/thread-pool setup,
// creates every interceptor listed in interceptor_ids, and finally marks the
// carrier as initialized.
//
// @param rank                    rank id owned by this carrier.
// @param interceptor_id_to_rank  interceptor id -> rank id table (copied).
// @param interceptor_ids         ids of the interceptors to create (copied).
// @param interceptor_id_to_node  interceptor id -> task node table (copied;
//                                the TaskNode pointers are stored as given).
// @param root_scope              stored as-is and handed to interceptors.
// @param minibatch_scope         stored as-is and handed to interceptors.
// @param microbatch_scopes       stored (vector copied) and handed to
//                                interceptors.
// @param place                   place used to look up the device context and
//                                handed to interceptors.
void Carrier::Init(
    int64_t rank,
    const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank,
    const std::unordered_set<int64_t>& interceptor_ids,
    const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node,
    framework::Scope* root_scope, framework::Scope* minibatch_scope,
    const std::vector<framework::Scope*>& microbatch_scopes,
    const platform::Place& place) {
  // Execution context. The device context is resolved from the stored place
  // before the thread pool is started, exactly as before.
  place_ = place;
  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
  root_scope_ = root_scope;
  minibatch_scope_ = minibatch_scope;
  microbatch_scopes_ = microbatch_scopes;
  interceptor_id_to_node_ = interceptor_id_to_node;

  // The three-argument overload stores rank_, interceptor_id_to_rank_ and
  // interceptor_ids_, and starts the single-threaded task-loop pool.
  Init(rank, interceptor_id_to_rank, interceptor_ids);

  // Interceptors need everything above to be in place.
  CreateInterceptors();
  is_init_ = true;
}

72
// Currently a no-op: the body is empty.
void Carrier::Release() {
}
73

74 75
// Destructor. Emits a verbose (level 3) log line; no explicit cleanup is
// performed in the body.
Carrier::~Carrier() {
  VLOG(3) << "Carrier's destructor.";
}

76 77
bool Carrier::EnqueueInterceptorMessage(
    const InterceptorMessage& interceptor_message) {
78
  if (interceptor_message.ctrl_message()) {
79 80 81
    VLOG(3) << "Receiving control message from rank "
            << interceptor_message.src_id() << " to rank "
            << interceptor_message.dst_id();
82 83
    // for barrier
    msg_bus_->IncreaseBarrierCount();
84 85 86
  } else {
    int64_t dst_id = interceptor_message.dst_id();
    Interceptor* dst_interceptor = GetInterceptor(dst_id);
87
    dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message);
88
  }
89
  return true;
90 91 92 93 94 95 96 97 98 99
}

// Looks up the interceptor registered under `interceptor_id`.
//
// @param interceptor_id  id of the wanted interceptor.
// @return non-owning pointer to the interceptor; ownership stays with the
//         carrier's interceptor table.
// @throws InvalidArgument (via PADDLE_ENFORCE_NE) if no interceptor is
//         registered under that id.
Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) {
  auto iter = interceptor_idx_to_interceptor_.find(interceptor_id);
  PADDLE_ENFORCE_NE(iter, interceptor_idx_to_interceptor_.end(),
                    platform::errors::InvalidArgument(
                        "Cannot find interceptor instance for interceptor "
                        "id %lld. Wrong dst? Call before init?",
                        interceptor_id));
  // The table holds owning smart pointers; expose the raw pointer only.
  Interceptor* interceptor = iter->second.get();
  return interceptor;
}

102 103 104 105 106
// Blocks the calling thread on cond_var_ until it is notified (see WakeUp()).
//
// NOTE(review): the wait has no predicate, so a notification sent before this
// call starts waiting is not remembered, and a spurious wakeup returns early.
// Making this robust needs a flag member guarded by running_mutex_, which is
// a header change outside this function.
void Carrier::Wait() {
  std::unique_lock<std::mutex> guard(running_mutex_);
  cond_var_.wait(guard);
}

107 108 109 110 111
// Wakes every thread currently blocked on cond_var_ (i.e. callers of Wait()).
void Carrier::WakeUp() {
  // probably double notify, but ok for ut
  // (notifying when nobody is waiting has no effect on the condition
  // variable itself)
  cond_var_.notify_all();
}

112
void Carrier::Start() {
113
  PADDLE_ENFORCE_EQ(msg_bus_->IsInit(), true,
114
                    platform::errors::PreconditionNotMet(
115 116 117
                        "Using message bus since it has not been initialized. "
                        "Please invoke MessageBus::Init() before using it or "
                        "neccessary components are not ready."));
118 119
  PADDLE_ENFORCE_EQ(is_init_, true, platform::errors::PreconditionNotMet(
                                        "Using carrier before initialized."));
120 121 122 123 124 125 126 127
  for (int64_t id : source_interceptor_ids_) {
    VLOG(3) << "Carrier Start is sending start to source interceptor " << id
            << ".";
    InterceptorMessage start_msg;
    // source node data_is_ready is send by carrier, so set src_id=-1
    start_msg.set_src_id(-1);
    start_msg.set_dst_id(id);
    start_msg.set_message_type(DATA_IS_READY);
128
    Send(start_msg);
129
  }
130
  // TODO(wangxi): async step
131
  Wait();
132
  dev_ctx_->Wait();
133 134 135 136
}

// Returns the is_init_ flag, which the full Init() overload sets to true
// after the interceptors have been created.
bool Carrier::IsInit() const {
  return is_init_;
}

137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
// Returns the rank id recorded for `interceptor_id`.
//
// The table is searched once and the resulting iterator is reused for the
// return value (previously a find() was followed by a second at() lookup).
//
// @param interceptor_id  id of the interceptor whose rank is wanted.
// @return the rank id stored in interceptor_id_to_rank_.
// @throws NotFound (via PADDLE_ENFORCE_NE) if the id is not in the table.
int64_t Carrier::GetRank(int64_t interceptor_id) const {
  const auto iter = interceptor_id_to_rank_.find(interceptor_id);
  PADDLE_ENFORCE_NE(
      iter, interceptor_id_to_rank_.end(),
      platform::errors::NotFound("Cannot find rank for interceptor id %lld.",
                                 interceptor_id));
  return iter->second;
}

bool Carrier::Send(const InterceptorMessage& msg) {
  int64_t src_id = (msg.src_id() == -1) ? msg.dst_id() : msg.src_id();
  int64_t dst_id = msg.dst_id();
  int64_t src_rank = GetRank(src_id);
  int64_t dst_rank = GetRank(dst_id);
  PADDLE_ENFORCE_EQ(
      src_rank, rank_,
      platform::errors::Fatal("The source rank id %lld, which is not equal to "
                              "the carrier rank id %lld.",
                              src_rank, rank_));
  if (src_rank == dst_rank) {
    VLOG(3) << "Send a message from interceptor " << src_id
            << " to interceptor " << dst_id << ", which are in the same ranks.";
159 160 161
    int64_t carrier_id = *GlobalMap<int64_t, int64_t>::Get(dst_id);
    return GlobalMap<int64_t, Carrier>::Get(carrier_id)
        ->EnqueueInterceptorMessage(msg);
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
  } else {
    PADDLE_ENFORCE_NOT_NULL(
        msg_bus_.get(),
        platform::errors::Unavailable("Message bus is released accidently"));
    PADDLE_ENFORCE_EQ(
        msg_bus_->IsInit(), true,
        platform::errors::PreconditionNotMet(
            "Using message bus since it has not been initialized. "
            "Please invoke MessageBus::Init() before using it or "
            "neccessary components are not ready."));
    VLOG(3) << "Send a message from interceptor " << src_id
            << " to interceptor " << dst_id
            << ", which are in different ranks.";
    return msg_bus_->Send(dst_rank, msg);
  }
177 178
}

179 180 181 182 183 184 185 186
// Registers `interceptor` under `interceptor_id` and takes ownership of it.
//
// Steps: reject duplicate ids, register this carrier with the interceptor,
// give it the task loop at index `interceptor_id % thread_num_`, record the
// interceptor id -> carrier id mapping in the GlobalMap, then move the
// interceptor into the carrier's table.
//
// @param interceptor_id  unique id for the interceptor.
// @param interceptor     the interceptor to register (ownership transferred).
// @return non-owning pointer to the registered interceptor.
// @throws AlreadyExists if the id is already registered; Fatal if no task
//         loop is available for the interceptor.
Interceptor* Carrier::SetInterceptor(int64_t interceptor_id,
                                     std::unique_ptr<Interceptor> interceptor) {
  auto iter = interceptor_idx_to_interceptor_.find(interceptor_id);
  PADDLE_ENFORCE_EQ(iter, interceptor_idx_to_interceptor_.end(),
                    platform::errors::AlreadyExists(
                        "The interceptor id %lld has already been created! "
                        "The interceptor id should be unique.",
                        interceptor_id));

  // Grab the raw pointer now; the unique_ptr is moved into the table below.
  Interceptor* const raw = interceptor.get();
  raw->RegisterCarrier(this);

  // TODO(fleet_exe dev): get loop
  auto* loop = thread_pool_.GetLoop(interceptor_id % thread_num_);
  PADDLE_ENFORCE_NOT_NULL(
      loop, platform::errors::Fatal("thread task loop must not null"));
  raw->RegisterTaskLoop(loop);

  // TODO(liyurui): Using struct InterceptorID replace int64_t
  GlobalMap<int64_t, int64_t>::Create(interceptor_id, carrier_id_);

  interceptor_idx_to_interceptor_.emplace(interceptor_id,
                                          std::move(interceptor));
  return raw;
}

204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
// Builds the garbage collector shared by the interceptors of a carrier.
//
// A collector is created only when all of the following hold: the eager
// deletion threshold is non-negative, the build has CUDA or HIP enabled,
// `place` is a GPU place, and fast eager deletion mode is enabled. In every
// other case an empty (null) shared_ptr is returned.
//
// @param place  the place the interceptors run on.
// @return the collector, or a null shared_ptr.
static std::shared_ptr<framework::GarbageCollector> GetGC(
    const platform::Place& place) {
  int64_t max_memory_size = framework::GetEagerDeletionThreshold();
  std::shared_ptr<framework::GarbageCollector> collector;
  if (max_memory_size < 0) {
    // Negative threshold: no collector is created.
    return collector;
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::is_gpu_place(place) &&
      framework::IsFastEagerDeletionModeEnabled()) {
    collector.reset(new framework::UnsafeFastGPUGarbageCollector(
        BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
  }
#endif

  return collector;
}

222
void Carrier::CreateInterceptors() {
223
  if (interceptor_ids_.empty()) return;
224 225 226

  auto gc = GetGC(place_);

227
  // create each Interceptor
228
  // no auto init since there is no config
229 230 231 232 233 234 235
  for (int64_t interceptor_id : interceptor_ids_) {
    const auto& task_node_iter = interceptor_id_to_node_.find(interceptor_id);
    PADDLE_ENFORCE_NE(
        task_node_iter, interceptor_id_to_node_.end(),
        platform::errors::NotFound("Can not find task node for interceptor %ld",
                                   interceptor_id));
    TaskNode* task_node = task_node_iter->second;
236

237 238 239 240 241 242
    PADDLE_ENFORCE_LT(
        task_node->run_at_offset(), task_node->run_per_steps(),
        platform::errors::InvalidArgument(
            "Interceptor's run_at_offset must < run_per_steps, must now "
            "run_at_offset=%ld run_per_steps=%ld",
            task_node->run_at_offset(), task_node->run_per_steps()));
243

244
    std::unique_ptr<Interceptor> interceptor;
245 246 247 248 249 250
    PADDLE_ENFORCE_NE(task_node->type().empty(), true,
                      platform::errors::NotFound(
                          "Cannot found type for task node with id %lld",
                          task_node->task_id()));
    interceptor = InterceptorFactory::Create(task_node->type(), interceptor_id,
                                             task_node);
251 252 253 254 255 256 257 258 259 260 261 262
    interceptor->SetPlace(place_);
    interceptor->SetMiniBatchScope(minibatch_scope_);
    interceptor->SetMicroBatchScope(microbatch_scopes_);
    interceptor->SetRootScope(root_scope_);
    interceptor->SetGC(gc);

    SetInterceptor(interceptor_id, std::move(interceptor));
    VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
            << " with type: " << task_node->type() << ".";

    if (task_node->upstream().empty()) {
      source_interceptor_ids_.emplace_back(interceptor_id);
263
    }
264
  }
265 266 267 268
}

}  // namespace distributed
}  // namespace paddle