rpc_server.cc 6.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

W
Wu Yi 已提交
15 16
#include "paddle/fluid/operators/distributed/rpc_server.h"

17 18 19 20
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
21 22
#include "paddle/fluid/platform/profiler.h"

23 24
namespace paddle {
namespace operators {
25
namespace distributed {
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41

void RPCServer::ShutDown() {
  LOG(INFO) << "RPCServer ShutDown ";
  ShutDownImpl();

  exit_flag_ = true;
  barrier_cond_.notify_all();
  rpc_cond_.notify_all();
}

void RPCServer::SavePort() const {
  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
  std::ofstream port_file;
  port_file.open(file_path);
  port_file << selected_port_;
  port_file.close();
Q
Qiao Longfei 已提交
42
  VLOG(3) << "selected port written to " << file_path;
43 44 45
}

void RPCServer::WaitBarrier(const std::string& rpc_name) {
Q
Qiao Longfei 已提交
46
  VLOG(3) << "WaitBarrier in: " << rpc_name;
47
  std::unique_lock<std::mutex> lock(this->mutex_);
W
Wu Yi 已提交
48
  barrier_cond_.wait(lock, [this, &rpc_name] {
Y
Yancey1989 已提交
49 50
    return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
            exit_flag_.load());
51 52
  });

Q
Qiao Longfei 已提交
53 54
  VLOG(3) << "WaitBarrier out: " << rpc_name
          << " counter: " << barrier_counter_[rpc_name];
55 56 57
}

void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
Q
Qiao Longfei 已提交
58
  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
59 60
  // barrier msg should make sure that it's in the right cond(send|recv)
  WaitCond(rpc_name);
61
  int b = 0;
W
Wu Yi 已提交
62 63
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
Q
Qiao Longfei 已提交
64
  VLOG(3) << rpc_name << " barrier_counter: " << b;
65
  if (b >= client_num_) {
W
Wu Yi 已提交
66
    lock.unlock();
Q
Qiao Longfei 已提交
67 68
    VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for "
            << rpc_name;
69
    barrier_cond_.notify_all();
W
Wu Yi 已提交
70
    lock.lock();
71 72 73
  }
}

Y
Yancey1989 已提交
74
void RPCServer::Complete() {
Y
Yancey1989 已提交
75
  {
Y
Yancey1989 已提交
76
    std::unique_lock<std::mutex> lock(mutex_);
W
Wu Yi 已提交
77
    client_num_--;
Y
Yancey1989 已提交
78 79
    need_reset_all_vars_ = true;

Q
Qiao Longfei 已提交
80
    VLOG(3) << "decrease client_num to: " << client_num_;
Y
Yancey1989 已提交
81 82 83
    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
      barrier_counter_[kRequestGet]--;
    }
W
Wu Yi 已提交
84 85 86 87
  }
  barrier_cond_.notify_all();
}

Y
Yancey1989 已提交
88 89 90 91 92
// Returns the current value of need_reset_all_vars_, read under mutex_.
bool RPCServer::NeedResetAllVars() {
  std::unique_lock<std::mutex> guard(mutex_);
  const bool reset_pending = need_reset_all_vars_;
  return reset_pending;
}

Y
Yancey1989 已提交
93 94 95 96 97
// Returns the current value of client_num_, read under mutex_.
int RPCServer::GetClientNum() {
  std::unique_lock<std::mutex> guard(mutex_);
  const int current_clients = client_num_;
  return current_clients;
}

98
void RPCServer::ResetBarrierCounter() {
M
minqiyang 已提交
99
  VLOG(3) << "RPCServer ResetBarrierCounter ";
100 101 102 103
  std::unique_lock<std::mutex> lock(mutex_);
  for (auto& t : barrier_counter_) {
    t.second = 0;
  }
Y
Yancey1989 已提交
104
  need_reset_all_vars_ = false;
105 106 107 108 109 110 111 112 113
}

void RPCServer::RegisterRPC(const std::string& rpc_name,
                            RequestHandler* handler, int thread_num) {
  rpc_call_map_[rpc_name] = handler;
  rpc_thread_num_[rpc_name] = thread_num;

  static int cond = -1;
  rpc_cond_map_[rpc_name] = ++cond;
Q
Qiao Longfei 已提交
114
  VLOG(3) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
M
minqiyang 已提交
115
          << ", cond:" << rpc_cond_map_[rpc_name];
116 117 118
}

void RPCServer::SetCond(const std::string& rpc_name) {
M
minqiyang 已提交
119
  VLOG(3) << "RPCServer SetCond " << rpc_name;
120 121 122 123 124 125 126 127 128
  {
    std::unique_lock<std::mutex> lock(mutex_);
    cur_cond_ = rpc_cond_map_[rpc_name];
  }

  rpc_cond_.notify_all();
}

void RPCServer::WaitCond(const std::string& rpc_name) {
129
  VLOG(3) << "RPCServer WaitCond in " << rpc_name;
130 131 132 133 134 135 136 137 138
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);
    cond = rpc_cond_map_[rpc_name];
  }

  std::unique_lock<std::mutex> lock(mutex_);
  rpc_cond_.wait(
      lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
139
  VLOG(3) << "RPCServer WaitCond out " << rpc_name;
140 141
}

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
void RPCServer::RegisterVar(const std::string& var_name,
                            const std::string& rpc_name,
                            framework::Scope* scope,
                            platform::DeviceContext* dev_ctx) {
  MonomerHandle h;
  h.var_name_ = var_name;
  h.rpc_name_ = rpc_name;
  h.scope_ = scope;
  h.dev_ctx_ = dev_ctx;

  {
    std::unique_lock<std::mutex> lock(mutex_);
    if (var_map_.find(var_name) != var_map_.end()) {
      PADDLE_ENFORCE(false, "%s alreay in var_map", var_name);
    }
    var_map_[var_name] = h;
  }

  rpc_cond_.notify_all();
Q
Qiao Longfei 已提交
161
  VLOG(3) << "RegisterVar context:" << h.String();
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
}

void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
  int b = 0;
  MonomerHandle h;
  {
    std::unique_lock<std::mutex> lock(mutex_);
    b = ++var_map_[var_name].barrier_;
    h = var_map_[var_name];
  }

  if (b >= client_num_) {
    barrier_cond_.notify_all();
  }

Q
Qiao Longfei 已提交
177
  VLOG(3) << "IncreaseVarBarrier context:" << h.String();
178 179 180
}

void RPCServer::WaitVarBarrier(const std::string& var_name) {
Q
Qiao Longfei 已提交
181
  VLOG(3) << "WaitVarBarrier var_name:" << var_name;
182 183 184 185 186 187 188

  std::unique_lock<std::mutex> lock(mutex_);
  barrier_cond_.wait(lock, [&]() {
    return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
            exit_flag_.load());
  });

Q
Qiao Longfei 已提交
189
  VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String();
190 191 192
}

void RPCServer::SetVarCond(const std::string& var_name) {
Q
Qiao Longfei 已提交
193
  VLOG(3) << "SetVarCond var_name:" << var_name;
194 195 196 197 198 199 200 201 202
  {
    std::unique_lock<std::mutex> lock(mutex_);
    if (var_map_.find(var_name) != var_map_.end()) {
      rpc_cond_.notify_all();
    }
  }
}

void RPCServer::WaitVarCond(const std::string& var_name) {
Q
Qiao Longfei 已提交
203
  VLOG(3) << "WaitVarCond var_name:" << var_name;
204 205 206 207 208 209

  std::unique_lock<std::mutex> lock(mutex_);
  rpc_cond_.wait(lock, [=] {
    return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
  });

Q
Qiao Longfei 已提交
210
  VLOG(3) << "WaitVarCond var_name:" << var_name << " end";
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
}

// Returns a copy of the MonomerHandle stored for `var_name`, taken under
// mutex_. If no such variable is in var_map_, a value-initialized handle is
// returned.
//
// The lookup uses find() rather than operator[] so that asking about an
// unknown variable does not insert an empty entry into var_map_. Such an
// entry would satisfy the presence check in WaitVarCond and make a later
// RegisterVar for the same name fail.
MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
  MonomerHandle h = MonomerHandle();
  {
    std::unique_lock<std::mutex> lock(mutex_);
    auto it = var_map_.find(var_name);
    if (it != var_map_.end()) {
      h = it->second;
    }
  }

  return h;
}

// Removes every entry from var_map_ while holding mutex_.
void RPCServer::ClearRegisteredVars() {
  {
    std::unique_lock<std::mutex> guard(mutex_);
    var_map_.clear();
  }
}

// Removes the entry for `var_name` from var_map_ while holding mutex_; does
// nothing when the name is not present.
void RPCServer::ClearVar(const std::string& var_name) {
  std::unique_lock<std::mutex> guard(mutex_);
  auto pos = var_map_.find(var_name);
  if (pos != var_map_.end()) {
    var_map_.erase(pos);
  }
}
232
}  // namespace distributed
233 234
}  // namespace operators
}  // namespace paddle