grpc_server.cc 12.1 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
G
gongweibao 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Y
Yi Wang 已提交
15
#include "paddle/fluid/operators/detail/grpc_server.h"
16 17 18

#include <limits>
#include <string>
G
gongweibao 已提交
19

20
using ::grpc::ServerAsyncResponseWriter;
G
gongweibao 已提交
21 22 23 24 25 26 27 28 29 30 31

namespace paddle {
namespace operators {
namespace detail {

// Stage of a request object: it starts in PROCESS and is switched to FINISH
// by its Process() implementation once the reply has been issued.
enum CallStatus {
  PROCESS = 0,
  FINISH
};

// reference:
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase {
 public:
32
  explicit RequestBase(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
33
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
34
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
35 36 37 38 39
      : service_(service),
        cq_(cq),
        sync_mode_(sync_mode),
        status_(PROCESS),
        dev_ctx_(dev_ctx) {
G
gongweibao 已提交
40 41
    PADDLE_ENFORCE(cq_);
  }
G
gongweibao 已提交
42 43 44 45 46
  virtual ~RequestBase() {}
  virtual void Process() { assert(false); }

  CallStatus Status() { return status_; }
  void SetStatus(CallStatus status) { status_ = status; }
T
typhoonzero 已提交
47 48 49 50
  virtual std::string GetReqName() {
    assert(false);
    return "";
  }
G
gongweibao 已提交
51 52

 protected:
53 54 55
  ::grpc::ServerContext ctx_;
  GrpcService::AsyncService* service_;
  ::grpc::ServerCompletionQueue* cq_;
Q
qiaolongfei 已提交
56
  const bool sync_mode_;
G
gongweibao 已提交
57
  CallStatus status_;
58
  const platform::DeviceContext* dev_ctx_;
G
gongweibao 已提交
59 60 61 62
};

class RequestSend final : public RequestBase {
 public:
63
  explicit RequestSend(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
64
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
65 66
                       framework::Scope* scope, ReceivedQueue* queue,
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
67 68 69 70
      : RequestBase(service, cq, sync_mode, dev_ctx),
        queue_(queue),
        responder_(&ctx_) {
    if (sync_mode_) {
71
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
72
    } else {
73
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
74
    }
75 76 77
    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
G
gongweibao 已提交
78 79 80 81
  }

  virtual ~RequestSend() {}

82
  virtual std::string GetReqName() { return request_->Varname(); }
G
gongweibao 已提交
83

G
gongweibao 已提交
84
  virtual void Process() {
85 86 87 88
    queue_->Push(std::make_pair(request_->Varname(), request_));

    sendrecv::VoidMessage reply;
    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
89
    status_ = FINISH;
G
gongweibao 已提交
90 91 92
  }

 protected:
93 94
  std::shared_ptr<VariableResponse> request_;
  ReceivedQueue* queue_;
G
gongweibao 已提交
95 96 97 98 99
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};

class RequestGet final : public RequestBase {
 public:
100
  explicit RequestGet(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
101
                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
102
                      framework::Scope* scope,
T
typhoonzero 已提交
103
                      const platform::DeviceContext* dev_ctx,
T
typhoonzero 已提交
104
                      framework::BlockingQueue<MessageWithName>* queue)
Q
qiaolongfei 已提交
105
      : RequestBase(service, cq, sync_mode, dev_ctx),
Y
Yancey1989 已提交
106 107
        responder_(&ctx_),
        scope_(scope),
T
typhoonzero 已提交
108
        queue_(queue) {
109 110 111
    int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
                                cq_, this);
G
gongweibao 已提交
112 113 114 115
  }

  virtual ~RequestGet() {}

G
gongweibao 已提交
116 117
  virtual std::string GetReqName() { return request_.varname(); }

G
gongweibao 已提交
118 119 120 121
  virtual void Process() {
    // proc request.
    std::string var_name = request_.varname();
    auto* var = scope_->FindVar(var_name);
122 123

    ::grpc::ByteBuffer reply;
124
    if (var_name != FETCH_BARRIER_MESSAGE) {
125
      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
126
    }
127 128

    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
129
    status_ = FINISH;
130 131 132 133 134 135

    if (var_name == FETCH_BARRIER_MESSAGE) {
      sendrecv::VariableMessage msg;
      MessageWithName msg_with_name = std::make_pair(var_name, msg);
      queue_->Push(msg_with_name);
    }
G
gongweibao 已提交
136 137 138 139
  }

 protected:
  sendrecv::VariableMessage request_;
140
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
G
gongweibao 已提交
141
  framework::Scope* scope_;
T
typhoonzero 已提交
142
  framework::BlockingQueue<MessageWithName>* queue_;
G
gongweibao 已提交
143 144
};

145 146 147
class RequestPrefetch final : public RequestBase {
 public:
  explicit RequestPrefetch(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
148
                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
149 150 151
                           framework::Scope* scope,
                           const platform::DeviceContext* dev_ctx,
                           framework::Executor* executor,
Y
Yancey1989 已提交
152 153
                           framework::ProgramDesc* program,
                           framework::ExecutorPrepareContext* prefetch_ctx)
Q
qiaolongfei 已提交
154
      : RequestBase(service, cq, sync_mode, dev_ctx),
155 156 157 158
        responder_(&ctx_),
        scope_(scope),
        executor_(executor),
        program_(program),
Y
Yancey1989 已提交
159
        prefetch_ctx_(prefetch_ctx) {
Q
qiaolongfei 已提交
160
    if (sync_mode_) {
161
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
162
    } else {
163
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
164
    }
165
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
Y
Yancey1989 已提交
166 167
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
168 169 170 171
  }

  virtual ~RequestPrefetch() {}

Y
Yancey1989 已提交
172
  virtual std::string GetReqName() { return request_->Varname(); }
173 174 175

  virtual void Process() {
    // prefetch process...
Y
Yancey1989 已提交
176
    ::grpc::ByteBuffer reply;
177

Y
Yancey1989 已提交
178
    std::string var_name = request_->OutVarname();
179
    VLOG(3) << "prefetch var " << var_name;
Y
Yancey1989 已提交
180 181 182 183
    auto var_desc = program_->Block(0).FindVar(var_name);
    framework::Scope* local_scope = &scope_->NewScope();
    auto* var = local_scope->FindVar(var_name);
    InitializeVariable(var, var_desc->GetType());
Y
Yancey1989 已提交
184
    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
Y
Yancey1989 已提交
185 186

    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
Q
qiaolongfei 已提交
187

Y
Yancey1989 已提交
188
    responder_.Finish(reply, ::grpc::Status::OK, this);
189 190 191 192
    status_ = FINISH;
  }

 protected:
Y
Yancey1989 已提交
193
  std::shared_ptr<VariableResponse> request_;
194 195 196 197
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* scope_;
  framework::Executor* executor_;
  framework::ProgramDesc* program_;
Y
Yancey1989 已提交
198
  framework::ExecutorPrepareContext* prefetch_ctx_;
199 200
};

T
typhoonzero 已提交
201
// Blocks until `count` fetch-barrier messages have been popped from the get
// queue. Messages with any other name are popped and ignored.
void AsyncGRPCServer::WaitClientGet(int count) {
  for (int barriers_seen = 0; barriers_seen < count;) {
    auto item = var_get_queue_.Pop();
    if (item.first == FETCH_BARRIER_MESSAGE) {
      ++barriers_seen;
    }
  }
}

G
gongweibao 已提交
211
void AsyncGRPCServer::RunSyncUpdate() {
212
  ::grpc::ServerBuilder builder;
T
typhoonzero 已提交
213 214
  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
                           &selected_port_);
G
gongweibao 已提交
215 216
  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
G
gongweibao 已提交
217 218 219 220
  builder.RegisterService(&service_);

  cq_send_ = builder.AddCompletionQueue();
  cq_get_ = builder.AddCompletionQueue();
221
  cq_prefetch_ = builder.AddCompletionQueue();
Y
Yancey 已提交
222

G
gongweibao 已提交
223
  server_ = builder.BuildAndStart();
T
typhoonzero 已提交
224 225
  LOG(INFO) << "Server listening on " << address_
            << " selected port: " << selected_port_;
G
gongweibao 已提交
226 227 228 229 230

  std::function<void()> send_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
  std::function<void()> get_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
231 232
  std::function<void()> prefetch_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
G
gongweibao 已提交
233

T
typhoonzero 已提交
234
  // TODO(wuyi): Run these "HandleRequest" in thread pool
G
gongweibao 已提交
235
  t_send_.reset(
Y
Yancey 已提交
236
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
237 238
                                cq_send_.get(), "cq_send", send_register)));
  t_get_.reset(
Y
Yancey 已提交
239
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
240
                                cq_get_.get(), "cq_get", get_register)));
241 242 243
  t_prefetch_.reset(new std::thread(
      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
                "cq_prefetch", prefetch_register)));
T
wip  
typhoonzero 已提交
244 245 246 247 248 249

  {
    std::lock_guard<std::mutex> lock(this->mutex_ready_);
    ready_ = 1;
  }
  condition_ready_.notify_all();
G
gongweibao 已提交
250 251 252 253
  // wait server
  server_->Wait();
  t_send_->join();
  t_get_->join();
254
  t_prefetch_->join();
G
gongweibao 已提交
255 256 257 258 259 260
}

void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
261
  cq_prefetch_->Shutdown();
G
gongweibao 已提交
262 263 264 265
}

// This URL explains why shutdown is complicate:
void AsyncGRPCServer::ShutDown() {
T
typhoonzero 已提交
266
  is_shut_down_ = true;
G
gongweibao 已提交
267
  ShutdownQueue();
T
typhoonzero 已提交
268
  server_->Shutdown();
G
gongweibao 已提交
269 270 271 272 273
}

void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
274
    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
G
gongweibao 已提交
275 276
    return;
  }
Q
qiaolongfei 已提交
277 278
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
                                      scope_, &var_recv_queue_, dev_ctx_);
Y
Yancey 已提交
279
  VLOG(4) << "Create RequestSend status:" << send->Status();
G
gongweibao 已提交
280 281 282 283 284
}

void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
285
    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
G
gongweibao 已提交
286 287
    return;
  }
Q
qiaolongfei 已提交
288 289
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
                                   dev_ctx_, &var_get_queue_);
Y
Yancey 已提交
290
  VLOG(4) << "Create RequestGet status:" << get->Status();
G
gongweibao 已提交
291 292
}

293 294 295
void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
296
    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
297 298 299
    return;
  }
  RequestPrefetch* prefetch =
Q
qiaolongfei 已提交
300 301
      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
                          dev_ctx_, executor_, program_, prefetch_ctx_);
302 303 304 305

  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
}

Y
Yancey 已提交
306
// FIXME(typhoonzero): change cq_name to enum.
307
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
Y
Yi Wang 已提交
308
                                    const std::string& cq_name,
G
gongweibao 已提交
309 310 311 312 313
                                    std::function<void()> TryToRegisterNewOne) {
  TryToRegisterNewOne();

  void* tag = NULL;
  bool ok = false;
314

G
gongweibao 已提交
315
  while (true) {
316
    VLOG(3) << "HandleRequest for " << cq_name << " while in";
G
gongweibao 已提交
317
    if (!cq->Next(&tag, &ok)) {
T
typhoonzero 已提交
318
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
G
gongweibao 已提交
319 320
      break;
    }
321
    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";
G
gongweibao 已提交
322

G
gongweibao 已提交
323
    PADDLE_ENFORCE(tag);
Q
qiaolongfei 已提交
324 325 326 327 328
    if (sync_mode_) {
      // FIXME(typhoonzero): de-couple the barriers with recv_op
      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
    }
G
gongweibao 已提交
329

330
    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
G
gongweibao 已提交
331 332 333 334
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
G
gongweibao 已提交
335
    if (!ok) {
Q
qiaolongfei 已提交
336 337
      LOG(WARNING) << cq_name << " recv no regular event:argument name["
                   << base->GetReqName() << "]";
G
gongweibao 已提交
338 339 340 341 342 343 344
      TryToRegisterNewOne();
      delete base;
      continue;
    }

    switch (base->Status()) {
      case PROCESS: {
Q
qiaolongfei 已提交
345
        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
G
gongweibao 已提交
346 347 348 349 350
        TryToRegisterNewOne();
        base->Process();
        break;
      }
      case FINISH: {
Q
qiaolongfei 已提交
351
        VLOG(4) << cq_name << " FINISH status:" << base->Status();
G
gongweibao 已提交
352 353 354 355 356 357 358 359
        delete base;
        break;
      }
      default: { assert(false); }
    }
  }
}

T
typhoonzero 已提交
360 361 362 363
void AsyncGRPCServer::WaitCond(int cond) {
  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
  barrier_condition_.wait(lock,
                          [=] { return this->barrier_cond_step_ == cond; });
G
gongweibao 已提交
364 365
}

T
typhoonzero 已提交
366
void AsyncGRPCServer::SetCond(int cond) {
G
gongweibao 已提交
367
  {
T
typhoonzero 已提交
368 369
    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
    barrier_cond_step_ = cond;
G
gongweibao 已提交
370
  }
T
typhoonzero 已提交
371
  barrier_condition_.notify_all();
G
gongweibao 已提交
372 373 374 375 376
}

}  // namespace detail
}  // namespace operators
}  // namespace paddle