grpc_server.cc 12.3 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
G
gongweibao 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Y
Yi Wang 已提交
15
#include "paddle/fluid/operators/detail/grpc_server.h"
16 17 18

#include <limits>
#include <string>
G
gongweibao 已提交
19

20
using ::grpc::ServerAsyncResponseWriter;
G
gongweibao 已提交
21 22 23 24 25 26 27 28 29 30 31

namespace paddle {
namespace operators {
namespace detail {

// Lifecycle state of an asynchronous request object, as consumed by the
// completion-queue loop in this file: a PROCESS request still has to be
// handled via Process(); a FINISH request has issued its reply and is deleted
// the next time its tag is returned by the queue.
enum CallStatus {
  PROCESS = 0,
  FINISH = 1,
};

// reference:
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase {
 public:
32
  explicit RequestBase(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
33
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
34
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
35 36 37 38 39
      : service_(service),
        cq_(cq),
        sync_mode_(sync_mode),
        status_(PROCESS),
        dev_ctx_(dev_ctx) {
G
gongweibao 已提交
40 41
    PADDLE_ENFORCE(cq_);
  }
G
gongweibao 已提交
42 43 44 45 46
  virtual ~RequestBase() {}
  virtual void Process() { assert(false); }

  CallStatus Status() { return status_; }
  void SetStatus(CallStatus status) { status_ = status; }
T
typhoonzero 已提交
47 48 49 50
  virtual std::string GetReqName() {
    assert(false);
    return "";
  }
G
gongweibao 已提交
51 52

 protected:
53 54 55
  ::grpc::ServerContext ctx_;
  GrpcService::AsyncService* service_;
  ::grpc::ServerCompletionQueue* cq_;
Q
qiaolongfei 已提交
56
  const bool sync_mode_;
G
gongweibao 已提交
57
  CallStatus status_;
58
  const platform::DeviceContext* dev_ctx_;
G
gongweibao 已提交
59 60 61 62
};

class RequestSend final : public RequestBase {
 public:
63
  explicit RequestSend(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
64
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
65 66
                       framework::Scope* scope, ReceivedQueue* queue,
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
67 68 69 70
      : RequestBase(service, cq, sync_mode, dev_ctx),
        queue_(queue),
        responder_(&ctx_) {
    if (sync_mode_) {
71
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
72
    } else {
73
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
74
    }
75 76 77
    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
G
gongweibao 已提交
78 79 80 81
  }

  virtual ~RequestSend() {}

82
  virtual std::string GetReqName() { return request_->Varname(); }
G
gongweibao 已提交
83

G
gongweibao 已提交
84
  virtual void Process() {
Q
qiaolongfei 已提交
85 86 87
    std::string var_name = GetReqName();
    VLOG(3) << "RequestSend " << var_name;
    queue_->Push(std::make_pair(var_name, request_));
88 89 90

    sendrecv::VoidMessage reply;
    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
91
    status_ = FINISH;
G
gongweibao 已提交
92 93 94
  }

 protected:
95 96
  std::shared_ptr<VariableResponse> request_;
  ReceivedQueue* queue_;
G
gongweibao 已提交
97 98 99 100 101
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};

class RequestGet final : public RequestBase {
 public:
102
  explicit RequestGet(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
103
                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
104
                      framework::Scope* scope,
T
typhoonzero 已提交
105
                      const platform::DeviceContext* dev_ctx,
T
typhoonzero 已提交
106
                      framework::BlockingQueue<MessageWithName>* queue)
Q
qiaolongfei 已提交
107
      : RequestBase(service, cq, sync_mode, dev_ctx),
Y
Yancey1989 已提交
108 109
        responder_(&ctx_),
        scope_(scope),
T
typhoonzero 已提交
110
        queue_(queue) {
Q
qiaolongfei 已提交
111
    auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
112 113
    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
                                cq_, this);
G
gongweibao 已提交
114 115 116 117
  }

  virtual ~RequestGet() {}

G
gongweibao 已提交
118 119
  virtual std::string GetReqName() { return request_.varname(); }

G
gongweibao 已提交
120 121 122
  virtual void Process() {
    // proc request.
    std::string var_name = request_.varname();
Q
qiaolongfei 已提交
123
    VLOG(3) << "RequestGet " << var_name;
G
gongweibao 已提交
124
    auto* var = scope_->FindVar(var_name);
125 126

    ::grpc::ByteBuffer reply;
127
    if (var_name != FETCH_BARRIER_MESSAGE) {
128
      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
129
    }
130 131

    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
132
    status_ = FINISH;
133 134 135 136 137 138

    if (var_name == FETCH_BARRIER_MESSAGE) {
      sendrecv::VariableMessage msg;
      MessageWithName msg_with_name = std::make_pair(var_name, msg);
      queue_->Push(msg_with_name);
    }
G
gongweibao 已提交
139 140 141 142
  }

 protected:
  sendrecv::VariableMessage request_;
143
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
G
gongweibao 已提交
144
  framework::Scope* scope_;
T
typhoonzero 已提交
145
  framework::BlockingQueue<MessageWithName>* queue_;
G
gongweibao 已提交
146 147
};

148 149 150
class RequestPrefetch final : public RequestBase {
 public:
  explicit RequestPrefetch(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
151
                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
152 153 154
                           framework::Scope* scope,
                           const platform::DeviceContext* dev_ctx,
                           framework::Executor* executor,
Y
Yancey1989 已提交
155 156
                           framework::ProgramDesc* program,
                           framework::ExecutorPrepareContext* prefetch_ctx)
Q
qiaolongfei 已提交
157
      : RequestBase(service, cq, sync_mode, dev_ctx),
158 159 160 161
        responder_(&ctx_),
        scope_(scope),
        executor_(executor),
        program_(program),
Y
Yancey1989 已提交
162
        prefetch_ctx_(prefetch_ctx) {
Q
qiaolongfei 已提交
163
    if (sync_mode_) {
164
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
165
    } else {
166
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
167
    }
168
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
Y
Yancey1989 已提交
169 170
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
171 172 173 174
  }

  virtual ~RequestPrefetch() {}

Y
Yancey1989 已提交
175
  virtual std::string GetReqName() { return request_->Varname(); }
176 177 178

  virtual void Process() {
    // prefetch process...
Y
Yancey1989 已提交
179
    ::grpc::ByteBuffer reply;
180

Y
Yancey1989 已提交
181
    std::string var_name = request_->OutVarname();
Q
qiaolongfei 已提交
182
    VLOG(3) << "RequestPrefetch " << var_name;
Y
Yancey1989 已提交
183 184 185 186
    auto var_desc = program_->Block(0).FindVar(var_name);
    framework::Scope* local_scope = &scope_->NewScope();
    auto* var = local_scope->FindVar(var_name);
    InitializeVariable(var, var_desc->GetType());
Y
Yancey1989 已提交
187
    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
Y
Yancey1989 已提交
188 189

    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
Q
qiaolongfei 已提交
190

Y
Yancey1989 已提交
191
    responder_.Finish(reply, ::grpc::Status::OK, this);
192 193 194 195
    status_ = FINISH;
  }

 protected:
Y
Yancey1989 已提交
196
  std::shared_ptr<VariableResponse> request_;
197 198 199 200
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* scope_;
  framework::Executor* executor_;
  framework::ProgramDesc* program_;
Y
Yancey1989 已提交
201
  framework::ExecutorPrepareContext* prefetch_ctx_;
202 203
};

T
typhoonzero 已提交
204
// Pops messages from the get queue until `count` of them carried the
// fetch-barrier name. Other messages are discarded. Returns immediately
// when `count` is not positive.
void AsyncGRPCServer::WaitClientGet(int count) {
  for (int seen = 0; seen < count;) {
    if (var_get_queue_.Pop().first == FETCH_BARRIER_MESSAGE) {
      ++seen;
    }
  }
}

G
gongweibao 已提交
214
void AsyncGRPCServer::RunSyncUpdate() {
215
  ::grpc::ServerBuilder builder;
T
typhoonzero 已提交
216 217
  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
                           &selected_port_);
G
gongweibao 已提交
218 219
  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
G
gongweibao 已提交
220 221 222 223
  builder.RegisterService(&service_);

  cq_send_ = builder.AddCompletionQueue();
  cq_get_ = builder.AddCompletionQueue();
224
  cq_prefetch_ = builder.AddCompletionQueue();
Y
Yancey 已提交
225

G
gongweibao 已提交
226
  server_ = builder.BuildAndStart();
T
typhoonzero 已提交
227 228
  LOG(INFO) << "Server listening on " << address_
            << " selected port: " << selected_port_;
G
gongweibao 已提交
229 230 231 232 233

  std::function<void()> send_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
  std::function<void()> get_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
234 235
  std::function<void()> prefetch_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
G
gongweibao 已提交
236

T
typhoonzero 已提交
237
  // TODO(wuyi): Run these "HandleRequest" in thread pool
G
gongweibao 已提交
238
  t_send_.reset(
Y
Yancey 已提交
239
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
240 241
                                cq_send_.get(), "cq_send", send_register)));
  t_get_.reset(
Y
Yancey 已提交
242
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
243
                                cq_get_.get(), "cq_get", get_register)));
244 245 246
  t_prefetch_.reset(new std::thread(
      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
                "cq_prefetch", prefetch_register)));
G
gongweibao 已提交
247 248 249 250
  // wait server
  server_->Wait();
  t_send_->join();
  t_get_->join();
251
  t_prefetch_->join();
G
gongweibao 已提交
252 253 254 255 256 257
}

void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
258
  cq_prefetch_->Shutdown();
G
gongweibao 已提交
259 260 261 262
}

// Shutdown is complicated (the URL this comment meant to cite is missing):
void AsyncGRPCServer::ShutDown() {
T
typhoonzero 已提交
263
  is_shut_down_ = true;
G
gongweibao 已提交
264
  ShutdownQueue();
T
typhoonzero 已提交
265
  server_->Shutdown();
G
gongweibao 已提交
266 267 268 269 270
}

void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
271
    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
G
gongweibao 已提交
272 273
    return;
  }
Q
qiaolongfei 已提交
274 275
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
                                      scope_, &var_recv_queue_, dev_ctx_);
Y
Yancey 已提交
276
  VLOG(4) << "Create RequestSend status:" << send->Status();
G
gongweibao 已提交
277 278 279 280 281
}

void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
282
    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
G
gongweibao 已提交
283 284
    return;
  }
Q
qiaolongfei 已提交
285 286
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
                                   dev_ctx_, &var_get_queue_);
Y
Yancey 已提交
287
  VLOG(4) << "Create RequestGet status:" << get->Status();
G
gongweibao 已提交
288 289
}

290 291 292
void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
293
    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
294 295 296
    return;
  }
  RequestPrefetch* prefetch =
Q
qiaolongfei 已提交
297 298
      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
                          dev_ctx_, executor_, program_, prefetch_ctx_);
299 300 301 302

  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
}

Y
Yancey 已提交
303
// FIXME(typhoonzero): change cq_name to enum.
304
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
Y
Yi Wang 已提交
305
                                    const std::string& cq_name,
G
gongweibao 已提交
306 307 308 309 310
                                    std::function<void()> TryToRegisterNewOne) {
  TryToRegisterNewOne();

  void* tag = NULL;
  bool ok = false;
311

G
gongweibao 已提交
312
  while (true) {
Q
qiaolongfei 已提交
313
    VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
G
gongweibao 已提交
314
    if (!cq->Next(&tag, &ok)) {
T
typhoonzero 已提交
315
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
G
gongweibao 已提交
316 317
      break;
    }
Q
qiaolongfei 已提交
318
    VLOG(3) << "HandleRequest for " << cq_name << " get Next";
G
gongweibao 已提交
319

G
gongweibao 已提交
320
    PADDLE_ENFORCE(tag);
Q
qiaolongfei 已提交
321

Q
qiaolongfei 已提交
322 323
    if (sync_mode_) {
      // FIXME(typhoonzero): de-couple the barriers with recv_op
Q
qiaolongfei 已提交
324
      VLOG(3) << "HandleRequest for " << cq_name << " before WaitCond";
Q
qiaolongfei 已提交
325 326
      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
Q
qiaolongfei 已提交
327
      VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
Q
qiaolongfei 已提交
328
    }
G
gongweibao 已提交
329

330
    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
G
gongweibao 已提交
331 332 333 334
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
G
gongweibao 已提交
335
    if (!ok) {
Q
qiaolongfei 已提交
336 337
      LOG(WARNING) << cq_name << " recv no regular event:argument name["
                   << base->GetReqName() << "]";
G
gongweibao 已提交
338 339 340 341 342 343 344 345 346
      TryToRegisterNewOne();
      delete base;
      continue;
    }

    switch (base->Status()) {
      case PROCESS: {
        TryToRegisterNewOne();
        base->Process();
Q
qiaolongfei 已提交
347
        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
G
gongweibao 已提交
348 349 350
        break;
      }
      case FINISH: {
Q
qiaolongfei 已提交
351
        VLOG(4) << cq_name << " FINISH status:" << base->Status();
G
gongweibao 已提交
352 353 354 355 356 357 358 359
        delete base;
        break;
      }
      default: { assert(false); }
    }
  }
}

T
typhoonzero 已提交
360 361
void AsyncGRPCServer::WaitCond(int cond) {
  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
Q
qiaolongfei 已提交
362
  VLOG(3) << "WaitCond " << cond << " in";
T
typhoonzero 已提交
363 364
  barrier_condition_.wait(lock,
                          [=] { return this->barrier_cond_step_ == cond; });
Q
qiaolongfei 已提交
365
  VLOG(3) << "WaitCond " << cond << " out";
G
gongweibao 已提交
366 367
}

T
typhoonzero 已提交
368
void AsyncGRPCServer::SetCond(int cond) {
G
gongweibao 已提交
369
  {
T
typhoonzero 已提交
370 371
    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
    barrier_cond_step_ = cond;
G
gongweibao 已提交
372
  }
T
typhoonzero 已提交
373
  barrier_condition_.notify_all();
G
gongweibao 已提交
374 375 376 377 378
}

}  // namespace detail
}  // namespace operators
}  // namespace paddle