grpc_server.cc 12.3 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
G
gongweibao 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Y
Yi Wang 已提交
15
#include "paddle/fluid/operators/detail/grpc_server.h"
16 17 18

#include <limits>
#include <string>
G
gongweibao 已提交
19

20
using ::grpc::ServerAsyncResponseWriter;
G
gongweibao 已提交
21 22 23 24 25 26 27 28 29 30 31

namespace paddle {
namespace operators {
namespace detail {

enum CallStatus { PROCESS = 0, FINISH };

// reference:
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase {
 public:
32
  explicit RequestBase(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
33
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
34
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
35 36 37 38 39
      : service_(service),
        cq_(cq),
        sync_mode_(sync_mode),
        status_(PROCESS),
        dev_ctx_(dev_ctx) {
G
gongweibao 已提交
40 41
    PADDLE_ENFORCE(cq_);
  }
G
gongweibao 已提交
42 43 44 45 46
  virtual ~RequestBase() {}
  virtual void Process() { assert(false); }

  CallStatus Status() { return status_; }
  void SetStatus(CallStatus status) { status_ = status; }
T
typhoonzero 已提交
47 48 49 50
  virtual std::string GetReqName() {
    assert(false);
    return "";
  }
G
gongweibao 已提交
51 52

 protected:
53 54 55
  ::grpc::ServerContext ctx_;
  GrpcService::AsyncService* service_;
  ::grpc::ServerCompletionQueue* cq_;
Q
qiaolongfei 已提交
56
  const bool sync_mode_;
G
gongweibao 已提交
57
  CallStatus status_;
58
  const platform::DeviceContext* dev_ctx_;
G
gongweibao 已提交
59 60 61 62
};

class RequestSend final : public RequestBase {
 public:
63
  explicit RequestSend(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
64
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
65 66
                       framework::Scope* scope, ReceivedQueue* queue,
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
67 68 69 70
      : RequestBase(service, cq, sync_mode, dev_ctx),
        queue_(queue),
        responder_(&ctx_) {
    if (sync_mode_) {
71
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
72
    } else {
73
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
74
    }
75 76 77
    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
G
gongweibao 已提交
78 79 80 81
  }

  virtual ~RequestSend() {}

82
  virtual std::string GetReqName() { return request_->Varname(); }
G
gongweibao 已提交
83

G
gongweibao 已提交
84
  virtual void Process() {
85 86 87 88
    queue_->Push(std::make_pair(request_->Varname(), request_));

    sendrecv::VoidMessage reply;
    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
89
    status_ = FINISH;
G
gongweibao 已提交
90 91 92
  }

 protected:
93 94
  std::shared_ptr<VariableResponse> request_;
  ReceivedQueue* queue_;
G
gongweibao 已提交
95 96 97 98 99
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};

class RequestGet final : public RequestBase {
 public:
100
  explicit RequestGet(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
101
                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
102
                      framework::Scope* scope,
T
typhoonzero 已提交
103
                      const platform::DeviceContext* dev_ctx,
T
typhoonzero 已提交
104
                      framework::BlockingQueue<MessageWithName>* queue)
Q
qiaolongfei 已提交
105
      : RequestBase(service, cq, sync_mode, dev_ctx),
Y
Yancey1989 已提交
106 107
        responder_(&ctx_),
        scope_(scope),
T
typhoonzero 已提交
108
        queue_(queue) {
109 110 111
    int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
                                cq_, this);
G
gongweibao 已提交
112 113 114 115
  }

  virtual ~RequestGet() {}

G
gongweibao 已提交
116 117
  virtual std::string GetReqName() { return request_.varname(); }

G
gongweibao 已提交
118 119 120 121
  virtual void Process() {
    // proc request.
    std::string var_name = request_.varname();
    auto* var = scope_->FindVar(var_name);
122 123

    ::grpc::ByteBuffer reply;
124
    if (var_name != FETCH_BARRIER_MESSAGE) {
125
      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
126
    }
127 128

    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
129
    status_ = FINISH;
130 131 132 133 134 135

    if (var_name == FETCH_BARRIER_MESSAGE) {
      sendrecv::VariableMessage msg;
      MessageWithName msg_with_name = std::make_pair(var_name, msg);
      queue_->Push(msg_with_name);
    }
G
gongweibao 已提交
136 137 138 139
  }

 protected:
  sendrecv::VariableMessage request_;
140
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
G
gongweibao 已提交
141
  framework::Scope* scope_;
T
typhoonzero 已提交
142
  framework::BlockingQueue<MessageWithName>* queue_;
G
gongweibao 已提交
143 144
};

145 146 147
class RequestPrefetch final : public RequestBase {
 public:
  explicit RequestPrefetch(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
148
                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
149 150 151
                           framework::Scope* scope,
                           const platform::DeviceContext* dev_ctx,
                           framework::Executor* executor,
Y
Yancey1989 已提交
152 153
                           framework::ProgramDesc* program,
                           framework::ExecutorPrepareContext* prefetch_ctx)
Q
qiaolongfei 已提交
154
      : RequestBase(service, cq, sync_mode, dev_ctx),
155 156 157 158
        responder_(&ctx_),
        scope_(scope),
        executor_(executor),
        program_(program),
Y
Yancey1989 已提交
159
        prefetch_ctx_(prefetch_ctx) {
Q
qiaolongfei 已提交
160
    if (sync_mode_) {
161
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
162
    } else {
163
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
164
    }
165
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
Y
Yancey1989 已提交
166 167
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
168 169 170 171
  }

  virtual ~RequestPrefetch() {}

Y
Yancey1989 已提交
172
  virtual std::string GetReqName() { return request_->Varname(); }
173 174 175

  virtual void Process() {
    // prefetch process...
Y
Yancey1989 已提交
176
    ::grpc::ByteBuffer reply;
177

Y
Yancey1989 已提交
178
    std::string var_name = request_->OutVarname();
179
    VLOG(3) << "prefetch var " << var_name;
Y
Yancey1989 已提交
180 181 182 183
    auto var_desc = program_->Block(0).FindVar(var_name);
    framework::Scope* local_scope = &scope_->NewScope();
    auto* var = local_scope->FindVar(var_name);
    InitializeVariable(var, var_desc->GetType());
Y
Yancey1989 已提交
184
    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
Y
Yancey1989 已提交
185 186

    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
Q
qiaolongfei 已提交
187

Y
Yancey1989 已提交
188
    responder_.Finish(reply, ::grpc::Status::OK, this);
189 190 191 192
    status_ = FINISH;
  }

 protected:
Y
Yancey1989 已提交
193
  std::shared_ptr<VariableResponse> request_;
194 195 196 197
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* scope_;
  framework::Executor* executor_;
  framework::ProgramDesc* program_;
Y
Yancey1989 已提交
198
  framework::ExecutorPrepareContext* prefetch_ctx_;
199 200
};

T
typhoonzero 已提交
201
// Pops messages from the get-queue until `count` of them were named
// FETCH_BARRIER_MESSAGE.  Messages with any other name are discarded.
void AsyncGRPCServer::WaitClientGet(int count) {
  for (int barriers_seen = 0; barriers_seen < count;) {
    auto popped = var_get_queue_.Pop();
    if (popped.first == FETCH_BARRIER_MESSAGE) {
      ++barriers_seen;
    }
  }
}

T
update  
typhoonzero 已提交
211 212 213 214 215
// Blocks the calling thread until ready_ equals 1 (RunSyncUpdate sets it and
// notifies condition_ready_ once the handler threads have been started).
//
// Returns true after the wait completes.  The function previously had no
// return statement even though it is declared to return bool, which is
// undefined behaviour as soon as the caller's wait finishes.
bool AsyncGRPCServer::WaitServerReady() {
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [this] { return this->ready_ == 1; });
  return true;
}

G
gongweibao 已提交
216
void AsyncGRPCServer::RunSyncUpdate() {
217
  ::grpc::ServerBuilder builder;
T
typhoonzero 已提交
218 219
  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
                           &selected_port_);
G
gongweibao 已提交
220 221
  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
G
gongweibao 已提交
222 223 224 225
  builder.RegisterService(&service_);

  cq_send_ = builder.AddCompletionQueue();
  cq_get_ = builder.AddCompletionQueue();
226
  cq_prefetch_ = builder.AddCompletionQueue();
Y
Yancey 已提交
227

G
gongweibao 已提交
228
  server_ = builder.BuildAndStart();
T
typhoonzero 已提交
229 230
  LOG(INFO) << "Server listening on " << address_
            << " selected port: " << selected_port_;
G
gongweibao 已提交
231 232 233 234 235

  std::function<void()> send_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
  std::function<void()> get_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
236 237
  std::function<void()> prefetch_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
G
gongweibao 已提交
238

T
typhoonzero 已提交
239
  // TODO(wuyi): Run these "HandleRequest" in thread pool
G
gongweibao 已提交
240
  t_send_.reset(
Y
Yancey 已提交
241
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
242 243
                                cq_send_.get(), "cq_send", send_register)));
  t_get_.reset(
Y
Yancey 已提交
244
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
245
                                cq_get_.get(), "cq_get", get_register)));
246 247 248
  t_prefetch_.reset(new std::thread(
      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
                "cq_prefetch", prefetch_register)));
T
wip  
typhoonzero 已提交
249 250 251 252 253 254

  {
    std::lock_guard<std::mutex> lock(this->mutex_ready_);
    ready_ = 1;
  }
  condition_ready_.notify_all();
G
gongweibao 已提交
255 256 257 258
  // wait server
  server_->Wait();
  t_send_->join();
  t_get_->join();
259
  t_prefetch_->join();
G
gongweibao 已提交
260 261 262 263 264 265
}

void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
266
  cq_prefetch_->Shutdown();
G
gongweibao 已提交
267 268 269 270
}

// Shutting down is complicated (the URL this comment referred to is missing).
void AsyncGRPCServer::ShutDown() {
T
typhoonzero 已提交
271
  is_shut_down_ = true;
G
gongweibao 已提交
272
  ShutdownQueue();
T
typhoonzero 已提交
273
  server_->Shutdown();
G
gongweibao 已提交
274 275 276 277 278
}

void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
279
    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
G
gongweibao 已提交
280 281
    return;
  }
Q
qiaolongfei 已提交
282 283
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
                                      scope_, &var_recv_queue_, dev_ctx_);
Y
Yancey 已提交
284
  VLOG(4) << "Create RequestSend status:" << send->Status();
G
gongweibao 已提交
285 286 287 288 289
}

void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
290
    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
G
gongweibao 已提交
291 292
    return;
  }
Q
qiaolongfei 已提交
293 294
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
                                   dev_ctx_, &var_get_queue_);
Y
Yancey 已提交
295
  VLOG(4) << "Create RequestGet status:" << get->Status();
G
gongweibao 已提交
296 297
}

298 299 300
void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
301
    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
302 303 304
    return;
  }
  RequestPrefetch* prefetch =
Q
qiaolongfei 已提交
305 306
      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
                          dev_ctx_, executor_, program_, prefetch_ctx_);
307 308 309 310

  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
}

Y
Yancey 已提交
311
// FIXME(typhoonzero): change cq_name to enum.
312
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
Y
Yi Wang 已提交
313
                                    const std::string& cq_name,
G
gongweibao 已提交
314 315 316 317 318
                                    std::function<void()> TryToRegisterNewOne) {
  TryToRegisterNewOne();

  void* tag = NULL;
  bool ok = false;
319

G
gongweibao 已提交
320
  while (true) {
321
    VLOG(3) << "HandleRequest for " << cq_name << " while in";
G
gongweibao 已提交
322
    if (!cq->Next(&tag, &ok)) {
T
typhoonzero 已提交
323
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
G
gongweibao 已提交
324 325
      break;
    }
326
    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";
G
gongweibao 已提交
327

G
gongweibao 已提交
328
    PADDLE_ENFORCE(tag);
Q
qiaolongfei 已提交
329 330 331 332 333
    if (sync_mode_) {
      // FIXME(typhoonzero): de-couple the barriers with recv_op
      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
    }
G
gongweibao 已提交
334

335
    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
G
gongweibao 已提交
336 337 338 339
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
G
gongweibao 已提交
340
    if (!ok) {
Q
qiaolongfei 已提交
341 342
      LOG(WARNING) << cq_name << " recv no regular event:argument name["
                   << base->GetReqName() << "]";
G
gongweibao 已提交
343 344 345 346 347 348 349
      TryToRegisterNewOne();
      delete base;
      continue;
    }

    switch (base->Status()) {
      case PROCESS: {
Q
qiaolongfei 已提交
350
        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
G
gongweibao 已提交
351 352 353 354 355
        TryToRegisterNewOne();
        base->Process();
        break;
      }
      case FINISH: {
Q
qiaolongfei 已提交
356
        VLOG(4) << cq_name << " FINISH status:" << base->Status();
G
gongweibao 已提交
357 358 359 360 361 362 363 364
        delete base;
        break;
      }
      default: { assert(false); }
    }
  }
}

T
typhoonzero 已提交
365 366 367 368
// Blocks until barrier_cond_step_ equals `cond`.  The check is repeated
// after every wake-up of barrier_condition_.
void AsyncGRPCServer::WaitCond(int cond) {
  std::unique_lock<std::mutex> guard(this->barrier_mutex_);
  while (this->barrier_cond_step_ != cond) {
    barrier_condition_.wait(guard);
  }
}

T
typhoonzero 已提交
371
void AsyncGRPCServer::SetCond(int cond) {
G
gongweibao 已提交
372
  {
T
typhoonzero 已提交
373 374
    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
    barrier_cond_step_ = cond;
G
gongweibao 已提交
375
  }
T
typhoonzero 已提交
376
  barrier_condition_.notify_all();
G
gongweibao 已提交
377 378 379 380 381
}

}  // namespace detail
}  // namespace operators
}  // namespace paddle