grpc_server.cc 12.4 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
G
gongweibao 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Y
Yi Wang 已提交
15
#include "paddle/fluid/operators/detail/grpc_server.h"
16 17 18

#include <limits>
#include <string>
G
gongweibao 已提交
19

20
using ::grpc::ServerAsyncResponseWriter;
G
gongweibao 已提交
21 22 23 24 25 26 27 28 29 30 31

namespace paddle {
namespace operators {
namespace detail {

// Lifecycle state of an asynchronous request object. A request starts in
// PROCESS and is switched to FINISH once its reply has been handed to the
// responder; the handler loop deletes requests it dequeues in the FINISH state.
enum CallStatus { PROCESS = 0, FINISH };

// reference:
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase {
 public:
32
  explicit RequestBase(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
33
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
34
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
35 36 37 38 39
      : service_(service),
        cq_(cq),
        sync_mode_(sync_mode),
        status_(PROCESS),
        dev_ctx_(dev_ctx) {
G
gongweibao 已提交
40 41
    PADDLE_ENFORCE(cq_);
  }
G
gongweibao 已提交
42 43 44 45 46
  virtual ~RequestBase() {}
  virtual void Process() { assert(false); }

  CallStatus Status() { return status_; }
  void SetStatus(CallStatus status) { status_ = status; }
T
typhoonzero 已提交
47 48 49 50
  virtual std::string GetReqName() {
    assert(false);
    return "";
  }
G
gongweibao 已提交
51 52

 protected:
53 54 55
  ::grpc::ServerContext ctx_;
  GrpcService::AsyncService* service_;
  ::grpc::ServerCompletionQueue* cq_;
Q
qiaolongfei 已提交
56
  const bool sync_mode_;
G
gongweibao 已提交
57
  CallStatus status_;
58
  const platform::DeviceContext* dev_ctx_;
G
gongweibao 已提交
59 60 61 62
};

class RequestSend final : public RequestBase {
 public:
63
  explicit RequestSend(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
64
                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
65 66
                       framework::Scope* scope, ReceivedQueue* queue,
                       const platform::DeviceContext* dev_ctx)
Q
qiaolongfei 已提交
67 68 69 70
      : RequestBase(service, cq, sync_mode, dev_ctx),
        queue_(queue),
        responder_(&ctx_) {
    if (sync_mode_) {
71
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
72
    } else {
73
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
74
    }
75 76 77
    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
G
gongweibao 已提交
78 79 80 81
  }

  virtual ~RequestSend() {}

82
  virtual std::string GetReqName() { return request_->Varname(); }
G
gongweibao 已提交
83

G
gongweibao 已提交
84
  virtual void Process() {
Q
qiaolongfei 已提交
85 86 87
    std::string var_name = GetReqName();
    VLOG(3) << "RequestSend " << var_name;
    queue_->Push(std::make_pair(var_name, request_));
88 89 90

    sendrecv::VoidMessage reply;
    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
91
    status_ = FINISH;
G
gongweibao 已提交
92 93 94
  }

 protected:
95 96
  std::shared_ptr<VariableResponse> request_;
  ReceivedQueue* queue_;
G
gongweibao 已提交
97 98 99 100 101
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};

class RequestGet final : public RequestBase {
 public:
102
  explicit RequestGet(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
103
                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
104
                      framework::Scope* scope,
T
typhoonzero 已提交
105
                      const platform::DeviceContext* dev_ctx,
T
typhoonzero 已提交
106
                      framework::BlockingQueue<MessageWithName>* queue)
Q
qiaolongfei 已提交
107
      : RequestBase(service, cq, sync_mode, dev_ctx),
Y
Yancey1989 已提交
108 109
        responder_(&ctx_),
        scope_(scope),
T
typhoonzero 已提交
110
        queue_(queue) {
Q
qiaolongfei 已提交
111
    auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
112 113
    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
                                cq_, this);
G
gongweibao 已提交
114 115 116 117
  }

  virtual ~RequestGet() {}

G
gongweibao 已提交
118 119
  virtual std::string GetReqName() { return request_.varname(); }

G
gongweibao 已提交
120 121 122
  virtual void Process() {
    // proc request.
    std::string var_name = request_.varname();
Q
qiaolongfei 已提交
123
    VLOG(3) << "RequestGet " << var_name;
G
gongweibao 已提交
124
    auto* var = scope_->FindVar(var_name);
125 126

    ::grpc::ByteBuffer reply;
127
    if (var_name != FETCH_BARRIER_MESSAGE) {
128
      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
129
    }
130 131

    responder_.Finish(reply, ::grpc::Status::OK, this);
G
gongweibao 已提交
132
    status_ = FINISH;
133 134 135 136 137 138

    if (var_name == FETCH_BARRIER_MESSAGE) {
      sendrecv::VariableMessage msg;
      MessageWithName msg_with_name = std::make_pair(var_name, msg);
      queue_->Push(msg_with_name);
    }
G
gongweibao 已提交
139 140 141 142
  }

 protected:
  sendrecv::VariableMessage request_;
143
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
G
gongweibao 已提交
144
  framework::Scope* scope_;
T
typhoonzero 已提交
145
  framework::BlockingQueue<MessageWithName>* queue_;
G
gongweibao 已提交
146 147
};

148 149 150
class RequestPrefetch final : public RequestBase {
 public:
  explicit RequestPrefetch(GrpcService::AsyncService* service,
Q
qiaolongfei 已提交
151
                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
152 153 154
                           framework::Scope* scope,
                           const platform::DeviceContext* dev_ctx,
                           framework::Executor* executor,
Y
Yancey1989 已提交
155 156
                           framework::ProgramDesc* program,
                           framework::ExecutorPrepareContext* prefetch_ctx)
Q
qiaolongfei 已提交
157
      : RequestBase(service, cq, sync_mode, dev_ctx),
158 159 160 161
        responder_(&ctx_),
        scope_(scope),
        executor_(executor),
        program_(program),
Y
Yancey1989 已提交
162
        prefetch_ctx_(prefetch_ctx) {
Q
qiaolongfei 已提交
163
    if (sync_mode_) {
164
      request_.reset(new VariableResponse(scope, dev_ctx_, false));
Q
qiaolongfei 已提交
165
    } else {
166
      request_.reset(new VariableResponse(scope, dev_ctx_, true));
Q
qiaolongfei 已提交
167
    }
168
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
Y
Yancey1989 已提交
169 170
    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
                                cq_, cq_, this);
171 172 173 174
  }

  virtual ~RequestPrefetch() {}

Y
Yancey1989 已提交
175
  virtual std::string GetReqName() { return request_->Varname(); }
176 177 178

  virtual void Process() {
    // prefetch process...
Y
Yancey1989 已提交
179
    ::grpc::ByteBuffer reply;
180

Y
Yancey1989 已提交
181
    std::string var_name = request_->OutVarname();
Q
qiaolongfei 已提交
182
    VLOG(3) << "RequestPrefetch " << var_name;
Y
Yancey1989 已提交
183 184 185 186
    auto var_desc = program_->Block(0).FindVar(var_name);
    framework::Scope* local_scope = &scope_->NewScope();
    auto* var = local_scope->FindVar(var_name);
    InitializeVariable(var, var_desc->GetType());
Y
Yancey1989 已提交
187
    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
Y
Yancey1989 已提交
188 189

    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
Q
qiaolongfei 已提交
190

Y
Yancey1989 已提交
191
    responder_.Finish(reply, ::grpc::Status::OK, this);
192 193 194 195
    status_ = FINISH;
  }

 protected:
Y
Yancey1989 已提交
196
  std::shared_ptr<VariableResponse> request_;
197 198 199 200
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* scope_;
  framework::Executor* executor_;
  framework::ProgramDesc* program_;
Y
Yancey1989 已提交
201
  framework::ExecutorPrepareContext* prefetch_ctx_;
202 203
};

T
typhoonzero 已提交
204
// Blocks until `count` fetch-barrier messages have been popped from the get
// queue. Popped messages with any other name are discarded. Returns at once
// (without popping) when `count` is not positive.
void AsyncGRPCServer::WaitClientGet(int count) {
  for (int barriers_seen = 0; barriers_seen < count;) {
    if (var_get_queue_.Pop().first == FETCH_BARRIER_MESSAGE) {
      ++barriers_seen;
    }
  }
}

T
done  
typhoonzero 已提交
214
void AsyncGRPCServer::WaitServerReady() {
T
update  
typhoonzero 已提交
215
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
T
done  
typhoonzero 已提交
216
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
T
update  
typhoonzero 已提交
217 218
}

G
gongweibao 已提交
219
void AsyncGRPCServer::RunSyncUpdate() {
220
  ::grpc::ServerBuilder builder;
T
typhoonzero 已提交
221 222
  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
                           &selected_port_);
G
gongweibao 已提交
223 224
  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
G
gongweibao 已提交
225 226 227 228
  builder.RegisterService(&service_);

  cq_send_ = builder.AddCompletionQueue();
  cq_get_ = builder.AddCompletionQueue();
229
  cq_prefetch_ = builder.AddCompletionQueue();
Y
Yancey 已提交
230

G
gongweibao 已提交
231
  server_ = builder.BuildAndStart();
T
typhoonzero 已提交
232 233
  LOG(INFO) << "Server listening on " << address_
            << " selected port: " << selected_port_;
G
gongweibao 已提交
234 235 236 237 238

  std::function<void()> send_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
  std::function<void()> get_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
239 240
  std::function<void()> prefetch_register =
      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
G
gongweibao 已提交
241

T
typhoonzero 已提交
242
  // TODO(wuyi): Run these "HandleRequest" in thread pool
G
gongweibao 已提交
243
  t_send_.reset(
Y
Yancey 已提交
244
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
245 246
                                cq_send_.get(), "cq_send", send_register)));
  t_get_.reset(
Y
Yancey 已提交
247
      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
G
gongweibao 已提交
248
                                cq_get_.get(), "cq_get", get_register)));
249 250 251
  t_prefetch_.reset(new std::thread(
      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
                "cq_prefetch", prefetch_register)));
T
wip  
typhoonzero 已提交
252 253 254 255 256 257

  {
    std::lock_guard<std::mutex> lock(this->mutex_ready_);
    ready_ = 1;
  }
  condition_ready_.notify_all();
G
gongweibao 已提交
258 259 260 261
  // wait server
  server_->Wait();
  t_send_->join();
  t_get_->join();
262
  t_prefetch_->join();
G
gongweibao 已提交
263 264 265 266 267 268
}

void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
269
  cq_prefetch_->Shutdown();
G
gongweibao 已提交
270 271 272 273
}

// This URL explains why shutdown is complicate:
void AsyncGRPCServer::ShutDown() {
T
typhoonzero 已提交
274
  is_shut_down_ = true;
G
gongweibao 已提交
275
  ShutdownQueue();
T
typhoonzero 已提交
276
  server_->Shutdown();
G
gongweibao 已提交
277 278 279 280 281
}

void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
282
    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
G
gongweibao 已提交
283 284
    return;
  }
Q
qiaolongfei 已提交
285 286
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
                                      scope_, &var_recv_queue_, dev_ctx_);
Y
Yancey 已提交
287
  VLOG(4) << "Create RequestSend status:" << send->Status();
G
gongweibao 已提交
288 289 290 291 292
}

void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
293
    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
G
gongweibao 已提交
294 295
    return;
  }
Q
qiaolongfei 已提交
296 297
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
                                   dev_ctx_, &var_get_queue_);
Y
Yancey 已提交
298
  VLOG(4) << "Create RequestGet status:" << get->Status();
G
gongweibao 已提交
299 300
}

301 302 303
void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
304
    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
305 306 307
    return;
  }
  RequestPrefetch* prefetch =
Q
qiaolongfei 已提交
308 309
      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
                          dev_ctx_, executor_, program_, prefetch_ctx_);
310 311 312 313

  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
}

Y
Yancey 已提交
314
// FIXME(typhoonzero): change cq_name to enum.
315
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
Y
Yi Wang 已提交
316
                                    const std::string& cq_name,
G
gongweibao 已提交
317 318 319 320 321
                                    std::function<void()> TryToRegisterNewOne) {
  TryToRegisterNewOne();

  void* tag = NULL;
  bool ok = false;
322

G
gongweibao 已提交
323
  while (true) {
Q
qiaolongfei 已提交
324
    VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
G
gongweibao 已提交
325
    if (!cq->Next(&tag, &ok)) {
T
typhoonzero 已提交
326
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
G
gongweibao 已提交
327 328
      break;
    }
Q
qiaolongfei 已提交
329
    VLOG(3) << "HandleRequest for " << cq_name << " get Next";
G
gongweibao 已提交
330

G
gongweibao 已提交
331
    PADDLE_ENFORCE(tag);
Q
qiaolongfei 已提交
332

Q
qiaolongfei 已提交
333 334 335 336
    if (sync_mode_) {
      // FIXME(typhoonzero): de-couple the barriers with recv_op
      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
Q
qiaolongfei 已提交
337
      VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
Q
qiaolongfei 已提交
338
    }
G
gongweibao 已提交
339

340
    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
G
gongweibao 已提交
341 342 343 344
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
G
gongweibao 已提交
345
    if (!ok) {
Q
qiaolongfei 已提交
346 347
      LOG(WARNING) << cq_name << " recv no regular event:argument name["
                   << base->GetReqName() << "]";
G
gongweibao 已提交
348 349 350 351 352 353 354 355 356
      TryToRegisterNewOne();
      delete base;
      continue;
    }

    switch (base->Status()) {
      case PROCESS: {
        TryToRegisterNewOne();
        base->Process();
Q
qiaolongfei 已提交
357
        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
G
gongweibao 已提交
358 359 360
        break;
      }
      case FINISH: {
Q
qiaolongfei 已提交
361
        VLOG(4) << cq_name << " FINISH status:" << base->Status();
G
gongweibao 已提交
362 363 364 365 366 367 368 369
        delete base;
        break;
      }
      default: { assert(false); }
    }
  }
}

T
typhoonzero 已提交
370 371 372 373
void AsyncGRPCServer::WaitCond(int cond) {
  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
  barrier_condition_.wait(lock,
                          [=] { return this->barrier_cond_step_ == cond; });
G
gongweibao 已提交
374 375
}

T
typhoonzero 已提交
376
void AsyncGRPCServer::SetCond(int cond) {
G
gongweibao 已提交
377
  {
T
typhoonzero 已提交
378 379
    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
    barrier_cond_step_ = cond;
G
gongweibao 已提交
380
  }
T
typhoonzero 已提交
381
  barrier_condition_.notify_all();
G
gongweibao 已提交
382 383 384 385 386
}

}  // namespace detail
}  // namespace operators
}  // namespace paddle