// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"

namespace paddle {
namespace distributed {

// Initializes the carrier from a mapping of interceptor id -> TaskNode*.
//
// The mapping is copied into interceptor_id_to_node_ and CreateInterceptors()
// is then invoked to build the interceptors described by it.
//
// @param interceptor_id_to_node  interceptor ids paired with the task node
//                                each interceptor is constructed with. The
//                                TaskNode pointers are stored as-is.
void Carrier::Init(
    const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node) {
  interceptor_id_to_node_ = interceptor_id_to_node;
  CreateInterceptors();
}

bool Carrier::EnqueueInterceptorMessage(
    const InterceptorMessage& interceptor_message) {
  // enqueue message to interceptor
32 33 34 35
  if (interceptor_message.ctrl_message()) {
    // handle control message
    return true;
  } else {
36 37 38 39 40 41 42
    if (creating_interceptors_) {
      // Cannot handle the message to interceptor since interceptors
      // are still under creating. Will enqueue into a tmp stack.
      VLOG(3) << "Receiving message while creating interceptors.";
      message_tmp_.emplace_back(interceptor_message);
      return true;
    }
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
    int64_t dst_id = interceptor_message.dst_id();
    Interceptor* dst_interceptor = GetInterceptor(dst_id);
    bool rst =
        dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message);
    if (rst) {
      std::condition_variable& interceptor_cond_var =
          dst_interceptor->GetCondVar();
      interceptor_cond_var.notify_all();
    }
    return rst;
  }
}

// Looks up the interceptor registered under `interceptor_id`.
//
// Raises an InvalidArgument error through PADDLE_ENFORCE_NE when no entry
// exists in interceptor_idx_to_interceptor_ for the id (e.g. a wrong
// destination, or a call made before the interceptors were created).
//
// @param interceptor_id  id of the interceptor to fetch.
// @return raw pointer to the stored interceptor, obtained via get() on the
//         map entry; the map entry itself is left untouched.
Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) {
  auto iter = interceptor_idx_to_interceptor_.find(interceptor_id);
  PADDLE_ENFORCE_NE(iter, interceptor_idx_to_interceptor_.end(),
                    platform::errors::InvalidArgument(
                        "Cannot find interceptor instance for interceptor "
                        "id %lld. Wrong dst? Call before init?",
                        interceptor_id));
  return iter->second.get();
}

// Registers `interceptor` under `interceptor_id` and takes ownership of it.
//
// Raises an AlreadyExists error through PADDLE_ENFORCE_EQ when the id is
// already present in interceptor_idx_to_interceptor_.
//
// @param interceptor_id  unique id for the interceptor.
// @param interceptor     the interceptor to store; moved into the map.
// @return raw pointer to the interceptor that was just stored.
Interceptor* Carrier::SetInterceptor(int64_t interceptor_id,
                                     std::unique_ptr<Interceptor> interceptor) {
  auto existing = interceptor_idx_to_interceptor_.find(interceptor_id);
  PADDLE_ENFORCE_EQ(existing, interceptor_idx_to_interceptor_.end(),
                    platform::errors::AlreadyExists(
                        "The interceptor id %lld has already been created! "
                        "The interceptor id should be unique.",
                        interceptor_id));
  // Grab the raw pointer first: `interceptor` is empty once it is moved.
  Interceptor* const registered = interceptor.get();
  interceptor_idx_to_interceptor_.emplace(interceptor_id,
                                          std::move(interceptor));
  return registered;
}

// Updates creating_interceptors_ to `flag`, logging the transition.
//
// When the flag is cleared (flag == false), the messages buffered in the
// meantime are re-submitted via HandleTmpMessages().
//
// @param flag  new value for the creating flag.
void Carrier::SetCreatingFlag(bool flag) {
  const bool previous = creating_interceptors_;
  VLOG(3) << "Carrier is set the creating flag from " << previous << " to "
          << flag << ".";
  creating_interceptors_ = flag;
  if (flag) {
    // Still creating: keep buffering incoming messages.
    return;
  }
  // Creation finished outside the carrier; replay the buffered messages.
  HandleTmpMessages();
}

void Carrier::HandleTmpMessages() {
  VLOG(3) << "Carrier has received " << message_tmp_.size()
          << " messages during creating interceptors.";
  for (const auto& msg : message_tmp_) {
    EnqueueInterceptorMessage(msg);
  }
  message_tmp_.clear();
}

void Carrier::CreateInterceptors() {
  // create each Interceptor
102 103 104 105 106
  if (!interceptor_id_to_node_.empty()) {
    // no auto init since there is no config
    for (const auto& item : interceptor_id_to_node_) {
      int64_t interceptor_id = item.first;
      TaskNode* task_node = item.second;
107

108 109 110 111 112 113 114 115 116 117 118
      // TODO(wangxi): use node_type to select different Interceptor
      auto interceptor =
          std::make_unique<Interceptor>(interceptor_id, task_node);
      SetInterceptor(interceptor_id, std::move(interceptor));
      VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
              << ".";
    }
    // The carrier will be always waiting for outside initializer
    // since there is no interceptor has been created during auto init
    creating_interceptors_ = false;
    HandleTmpMessages();
119
  }
120 121 122 123
}

}  // namespace distributed
}  // namespace paddle