fix prefetch hang problem, add some more logs

45363069 · qiaolongfei · fdecae5f · 45363069 · 45363069 · 45363069
4 changed file
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "grpc_client.h"
-#include <sys/time.h>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+#include <limits>
+
 #include "paddle/fluid/framework/threadpool.h"

 namespace paddle {
@@ -52,7 +54,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
    auto call = s->stub_g_.PrepareUnaryCall(
        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  });

  req_count_++;
@@ -109,7 +111,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
    auto call = s->stub_g_.PrepareUnaryCall(
        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  });

  req_count_++;
@@ -153,7 +155,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
        s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
        &cq_);
    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  });

  req_count_++;
@@ -169,7 +171,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(BATCH_BARRIER_MESSAGE);
  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  req_count_++;
 }

@@ -181,7 +183,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(FETCH_BARRIER_MESSAGE);
  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  req_count_++;
 }


--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/detail/grpc_server.h"
-#include <paddle/fluid/operators/detail/send_recv.pb.h>
+
+#include <limits>
+#include <string>

 using ::grpc::ServerAsyncResponseWriter;

@@ -224,6 +226,7 @@ void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
+  cq_prefetch_->Shutdown();
 }

 // This URL explains why shutdown is complicate:
@@ -236,6 +239,7 @@ void AsyncGRPCServer::ShutDown() {
 void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
@@ -246,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
 void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
    return;
  }
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
@@ -257,6 +262,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  VLOG(4) << "TryToRegisterNewPrefetchOne in";
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
    return;
  }
  RequestPrefetch* prefetch =
@@ -274,18 +280,21 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,

  void* tag = NULL;
  bool ok = false;
+
  while (true) {
+    VLOG(3) << "HandleRequest for " << cq_name << " while in";
    if (!cq->Next(&tag, &ok)) {
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
      break;
    }
+    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";

    PADDLE_ENFORCE(tag);
    // FIXME(typhoonzero): de-couple the barriers with recv_op
    if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
    if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);

-    RequestBase* base = (RequestBase*)tag;
+    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM

--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -89,7 +89,7 @@ inline const char* GrpcMethodName(GrpcMethod id) {
    case GrpcMethod::kGetVariable:
      return "/sendrecv.SendRecvService/GetVariable";
    case GrpcMethod::kPrefetchVariable:
-      return "/sendrecv.SendREcvService/PrefetchVariable";
+      return "/sendrecv.SendRecvService/PrefetchVariable";
  }

  // Shouldn't be reached.
@@ -117,5 +117,5 @@ class GrpcService final {
 };

 }  // namespace detail
-}  // namespace operator
+}  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,22 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <stdint.h>
-#include <sys/stat.h>
 #include <ostream>
-#include <thread>
-
-#include <unistd.h>

 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/proto_desc.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace operators {
@@ -177,7 +168,8 @@ class ListenAndServOp : public framework::OperatorBase {
      }
      ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);

-      VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
+      VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
+              << "(ms)";

      // Reset the received sparse variables, the sum operator would not
      // sum the input sparse variables which rows is empty at the next