/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/detail/grpc_server.h"

#include <cassert>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <utility>

using ::grpc::ServerAsyncResponseWriter;

namespace paddle {
namespace operators {
namespace detail {

// Lifecycle state of an asynchronous request object.
enum CallStatus {
  PROCESS = 0,  // initial state; Process() has not replied yet
  FINISH        // set by Process() once the reply was handed to the responder
};

// reference:
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase {
 public:
32
  explicit RequestBase(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
33
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
34
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
35 36 37 38 39
      : service_(service),
        cq_(cq),
        sync_mode_(sync_mode),
        status_(PROCESS),
        dev_ctx_(dev_ctx) {
G
gongweibao 已提交
40 41
    PADDLE_ENFORCE(cq_);
  }
G
gongweibao 已提交
42 43 44 45 46
  virtual ~RequestBase() {}
  virtual void Process() { assert(false); }

  CallStatus Status() { return status_; }
  void SetStatus(CallStatus status) { status_ = status; }
T
typhoonzero 已提交
47 48 49 50
  virtual std::string GetReqName() {
    assert(false);
    return "";
  }
G
gongweibao 已提交
51 52

 protected:
53 54 55
  ::grpc::ServerContext ctx_;
  GrpcService::AsyncService* service_;
  ::grpc::ServerCompletionQueue* cq_;
Q
qiaolongfei 已提交
56
  const bool sync_mode_;
G
gongweibao 已提交
57
  CallStatus status_;
58
  const platform::DeviceContext* dev_ctx_;
G
gongweibao 已提交
59 60 61 62
};

class RequestSend final : public RequestBase {
 public:
63
  explicit RequestSend(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
64
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
65 66
                       framework::Scope* scope, ReceivedQueue* queue,
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
67 68 69 70
      : RequestBase(service, cq, sync_mode, dev_ctx),
        queue_(queue),
        responder_(&ctx_) {
    if (sync_mode_) {
71
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
72
    } else {
73
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
74
    }
75 76 77
    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
G
gongweibao 已提交
78 79 80 81
  }

  virtual ~RequestSend() {}

82
  virtual std::string GetReqName() { return request_->Varname(); }
G
gongweibao 已提交
83

G
gongweibao 已提交
84
  virtual void Process() {
85 86 87 88
    queue_->Push(std::make_pair(request_->Varname(), request_));

    sendrecv::VoidMessage reply;
    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
89
    status_ = FINISH;
G
gongweibao 已提交
90 91 92
  }

 protected:
93 94
  std::shared_ptr<VariableResponse> request_;
  ReceivedQueue* queue_;
G
gongweibao 已提交
95 96 97 98 99
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};

class RequestGet final : public RequestBase {
 public:
100
  explicit RequestGet(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
101
                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
102
                      framework::Scope* scope,
T
typhoonzero 已提交
103
                      const platform::DeviceContext* dev_ctx,
T
typhoonzero 已提交
104
                      framework::BlockingQueue<MessageWithName>* queue)
Q
qiaolongfei 已提交
105
      : RequestBase(service, cq, sync_mode, dev_ctx),
Y
Yancey1989 已提交
106 107
        responder_(&ctx_),
        scope_(scope),
T
typhoonzero 已提交
108
        queue_(queue) {
109 110 111
    int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
                                cq_, this);
G
gongweibao 已提交
112 113 114 115
  }

  virtual ~RequestGet() {}

G
gongweibao 已提交
116 117
  virtual std::string GetReqName() { return request_.varname(); }

G
gongweibao 已提交
118 119 120 121
  virtual void Process() {
    // proc request.
    std::string var_name = request_.varname();
    auto* var = scope_->FindVar(var_name);
122 123

    ::grpc::ByteBuffer reply;
124
    if (var_name != FETCH_BARRIER_MESSAGE) {
125
      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
126
    }
127 128

    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
129
    status_ = FINISH;
130 131 132 133 134 135

    if (var_name == FETCH_BARRIER_MESSAGE) {
      sendrecv::VariableMessage msg;
      MessageWithName msg_with_name = std::make_pair(var_name, msg);
      queue_->Push(msg_with_name);
    }
G
gongweibao 已提交
136 137 138 139
  }

 protected:
  sendrecv::VariableMessage request_;
140
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
G
gongweibao 已提交
141
  framework::Scope* scope_;
T
typhoonzero 已提交
142
  framework::BlockingQueue<MessageWithName>* queue_;
G
gongweibao 已提交
143 144
};

class RequestPrefetch final : public RequestBase {
 public:
  explicit RequestPrefetch(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
148
                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
149 150 151
                           framework::Scope* scope,
                           const platform::DeviceContext* dev_ctx,
                           framework::Executor* executor,
Y
Yancey1989 已提交
152 153
                           framework::ProgramDesc* program,
                           framework::ExecutorPrepareContext* prefetch_ctx)
Q
qiaolongfei 已提交
154
      : RequestBase(service, cq, sync_mode, dev_ctx),
155 156 157 158
        responder_(&ctx_),
        scope_(scope),
        executor_(executor),
        program_(program),
Y
Yancey1989 已提交
159
        prefetch_ctx_(prefetch_ctx) {
Q
qiaolongfei 已提交
160
    if (sync_mode_) {
161
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
162
    } else {
163
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
164
    }
165
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
Y
Yancey1989 已提交
166 167
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
168 169 170 171
  }

  virtual ~RequestPrefetch() {}

Y
Yancey1989 已提交
172
  virtual std::string GetReqName() { return request_->Varname(); }
173 174 175

  virtual void Process() {
    // prefetch process...
Y
Yancey1989 已提交
176
    ::grpc::ByteBuffer reply;
177

Y
Yancey1989 已提交
178
    std::string var_name = request_->OutVarname();
179
    VLOG(3) << "prefetch var " << var_name;
Y
Yancey1989 已提交
180 181 182 183
    auto var_desc = program_->Block(0).FindVar(var_name);
    framework::Scope* local_scope = &scope_->NewScope();
    auto* var = local_scope->FindVar(var_name);
    InitializeVariable(var, var_desc->GetType());
Y
Yancey1989 已提交
184
    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
Y
Yancey1989 已提交
185 186

    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
Q
qiaolongfei 已提交
187

Y
Yancey1989 已提交
188
    responder_.Finish(reply, ::grpc::Status::OK, this);
189 190 191 192
    status_ = FINISH;
  }

 protected:
Y
Yancey1989 已提交
193
  std::shared_ptr<VariableResponse> request_;
194 195 196 197
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* scope_;
  framework::Executor* executor_;
  framework::ProgramDesc* program_;
Y
Yancey1989 已提交
198
  framework::ExecutorPrepareContext* prefetch_ctx_;
199 200
};

// Blocks until `count` FETCH_BARRIER_MESSAGE entries have been popped from
// the get queue. Entries with any other name are discarded.
void AsyncGRPCServer::WaitClientGet(int count) {
  for (int seen_barriers = 0; seen_barriers < count;) {
    auto entry = var_get_queue_.Pop();
    if (entry.first == FETCH_BARRIER_MESSAGE) {
      ++seen_barriers;
    }
  }
}

void AsyncGRPCServer::RunSyncUpdate() {
212
  ::grpc::ServerBuilder builder;
T
typhoonzero 已提交
213 214
  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
                           &selected_port_);
G
gongweibao 已提交
215 216
  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
G
gongweibao 已提交
217 218 219 220
  builder.RegisterService(&service_);

  cq_send_ = builder.AddCompletionQueue();
  cq_get_ = builder.AddCompletionQueue();
221
  cq_prefetch_ = builder.AddCompletionQueue();
Y
Yancey 已提交
222

G
gongweibao 已提交
223
  server_ = builder.BuildAndStart();
T
typhoonzero 已提交
224 225
  LOG(INFO) << "Server listening on " << address_
            << " selected port: " << selected_port_;
G
gongweibao 已提交
226 227 228 229 230

  std::function<void()> send_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
  std::function<void()> get_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
231 232
  std::function<void()> prefetch_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
G
gongweibao 已提交
233

T
typhoonzero 已提交
234
  // TODO(wuyi): Run these "HandleRequest" in thread pool
G
gongweibao 已提交
235
  t_send_.reset(
Y
Yancey 已提交
236
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
237 238
                                cq_send_.get(), "cq_send", send_register)));
  t_get_.reset(
Y
Yancey 已提交
239
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
240
                                cq_get_.get(), "cq_get", get_register)));
241 242 243
  t_prefetch_.reset(new std::thread(
      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
                "cq_prefetch", prefetch_register)));
G
gongweibao 已提交
244 245 246 247
  // wait server
  server_->Wait();
  t_send_->join();
  t_get_->join();
248
  t_prefetch_->join();
G
gongweibao 已提交
249 250 251 252 253 254
}

void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
255
  cq_prefetch_->Shutdown();
G
gongweibao 已提交
256 257 258 259
}

// This URL explains why shutdown is complicate:
void AsyncGRPCServer::ShutDown() {
T
typhoonzero 已提交
260
  is_shut_down_ = true;
G
gongweibao 已提交
261
  ShutdownQueue();
T
typhoonzero 已提交
262
  server_->Shutdown();
G
gongweibao 已提交
263 264 265 266 267
}

void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
268
    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
G
gongweibao 已提交
269 270
    return;
  }
Q
qiaolongfei 已提交
271 272
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
                                      scope_, &var_recv_queue_, dev_ctx_);
Y
Yancey 已提交
273
  VLOG(4) << "Create RequestSend status:" << send->Status();
G
gongweibao 已提交
274 275 276 277 278
}

void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
279
    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
G
gongweibao 已提交
280 281
    return;
  }
Q
qiaolongfei 已提交
282 283
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
                                   dev_ctx_, &var_get_queue_);
Y
Yancey 已提交
284
  VLOG(4) << "Create RequestGet status:" << get->Status();
G
gongweibao 已提交
285 286
}

void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
290
    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
291 292 293
    return;
  }
  RequestPrefetch* prefetch =
Q
qiaolongfei 已提交
294 295
      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
                          dev_ctx_, executor_, program_, prefetch_ctx_);
296 297 298 299

  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
}

// FIXME(typhoonzero): change cq_name to enum.
301
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
Y
Yi Wang 已提交
302
                                    const std::string& cq_name,
G
gongweibao 已提交
303 304 305 306 307
                                    std::function<void()> TryToRegisterNewOne) {
  TryToRegisterNewOne();

  void* tag = NULL;
  bool ok = false;
308

G
gongweibao 已提交
309
  while (true) {
310
    VLOG(3) << "HandleRequest for " << cq_name << " while in";
G
gongweibao 已提交
311
    if (!cq->Next(&tag, &ok)) {
T
typhoonzero 已提交
312
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
G
gongweibao 已提交
313 314
      break;
    }
315
    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";
G
gongweibao 已提交
316

G
gongweibao 已提交
317
    PADDLE_ENFORCE(tag);
Q
qiaolongfei 已提交
318 319 320 321 322
    if (sync_mode_) {
      // FIXME(typhoonzero): de-couple the barriers with recv_op
      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
    }
G
gongweibao 已提交
323

324
    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
G
gongweibao 已提交
325 326 327 328
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
G
gongweibao 已提交
329
    if (!ok) {
Q
qiaolongfei 已提交
330 331
      LOG(WARNING) << cq_name << " recv no regular event:argument name["
                   << base->GetReqName() << "]";
G
gongweibao 已提交
332 333 334 335 336 337 338 339 340
      TryToRegisterNewOne();
      delete base;
      continue;
    }

    switch (base->Status()) {
      case PROCESS: {
        TryToRegisterNewOne();
        base->Process();
Q
qiaolongfei 已提交
341
        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
G
gongweibao 已提交
342 343 344
        break;
      }
      case FINISH: {
Q
qiaolongfei 已提交
345
        VLOG(4) << cq_name << " FINISH status:" << base->Status();
G
gongweibao 已提交
346 347 348 349 350 351 352 353
        delete base;
        break;
      }
      default: { assert(false); }
    }
  }
}

void AsyncGRPCServer::WaitCond(int cond) {
  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
  barrier_condition_.wait(lock,
                          [=] { return this->barrier_cond_step_ == cond; });
G
gongweibao 已提交
358 359
}

void AsyncGRPCServer::SetCond(int cond) {
G
gongweibao 已提交
361
  {
T
typhoonzero 已提交
362 363
    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
    barrier_cond_step_ = cond;
G
gongweibao 已提交
364
  }
T
typhoonzero 已提交
365
  barrier_condition_.notify_all();
G
gongweibao 已提交
366 367 368 369 370
}

}  // namespace detail
}  // namespace operators
}  // namespace paddle