diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index ba25c8d079b5a5305b20fe12214101d14a2517d2..7fb67afbe15a5a019c978092d5ba3a4a0f66d996 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -50,7 +50,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 - PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/include/grpcpp/impl/codegen/completion_queue.h + PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h # NOTE(yuyang18): # Disable -Werror, otherwise the compile will fail in MacOS. # It seems that we cannot configure that by make command. diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 265f964ddc682868c64669744b130aebbbf86692..b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -49,6 +49,7 @@ void GRPCClient::SendComplete() { } GRPCClient::~GRPCClient() { + stopped_ = true; Wait(); cq_.Shutdown(); { @@ -275,7 +276,7 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; - while (cq_.Next(&tag, &ok)) { + while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 8351d825f817437e1b3691e916952dd9a86af491..29b95fad768dea860185d64eb33c0b485478a871 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { class GRPCClient : public RPCClient { public: - GRPCClient() : ok_(true), completed_(false) {} + GRPCClient() : ok_(true), completed_(false), stopped_(false) {} virtual ~GRPCClient(); bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, @@ -237,6 +237,8 @@ class GRPCClient : public RPCClient { // mutex for sending complete message only once std::mutex completed_mutex_; bool completed_; + + bool stopped_; }; } // namespace distributed diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..6e92c60ea2db00cc6e227830228888f9a06735c4 --- /dev/null +++ b/patches/grpc/completion_queue.h @@ -0,0 +1,386 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// A completion queue implements a concurrent producer-consumer queue, with +/// two main API-exposed methods: \a Next and \a AsyncNext. These +/// methods are the essential component of the gRPC C++ asynchronous API. +/// There is also a \a Shutdown method to indicate that a given completion queue +/// will no longer have regular events. This must be called before the +/// completion queue is destroyed. +/// All completion queue APIs are thread-safe and may be used concurrently with +/// any other completion queue API invocation; it is acceptable to have +/// multiple threads calling \a Next or \a AsyncNext on the same or different +/// completion queues, or to call these methods concurrently with a \a Shutdown +/// elsewhere. +/// \remark{All other API calls on completion queue should be completed before +/// a completion queue destructor is called.} +#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H +#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H + +#include + +#include +#include +#include +#include +#include +#include + +struct grpc_completion_queue; + +namespace grpc { + +template +class ClientReader; +template +class ClientWriter; +template +class ClientReaderWriter; +template +class ServerReader; +template +class ServerWriter; +namespace internal { +template +class ServerReaderWriterBody; +} // namespace internal + +class Channel; +class ChannelInterface; +class ClientContext; +class CompletionQueue; +class Server; +class ServerBuilder; +class ServerContext; +class ServerInterface; + +namespace internal { +class CompletionQueueTag; +class RpcMethod; +template +class RpcMethodHandler; +template +class ClientStreamingHandler; +template +class ServerStreamingHandler; +template +class BidiStreamingHandler; +class UnknownMethodHandler; +template +class TemplatedBidiStreamingHandler; +template +class BlockingUnaryCallImpl; +} // namespace internal + +extern CoreCodegenInterface* g_core_codegen_interface; + +/// A thin wrapper around \ref grpc_completion_queue (see \ref +/// src/core/lib/surface/completion_queue.h). +/// See \ref doc/cpp/perf_notes.md for notes on best practices for high +/// performance servers. +class CompletionQueue : private GrpcLibraryCodegen { + public: + /// Default constructor. Implicitly creates a \a grpc_completion_queue + /// instance. + CompletionQueue() + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {} + + /// Wrap \a take, taking ownership of the instance. + /// + /// \param take The completion queue instance to wrap. Ownership is taken. + explicit CompletionQueue(grpc_completion_queue* take); + + /// Destructor. Destroys the owned wrapped completion queue / instance. + ~CompletionQueue() { + if (typeid(*g_core_codegen_interface).hash_code() != + typeid(CoreCodegenInterface).hash_code()) { + g_core_codegen_interface->grpc_completion_queue_destroy(cq_); + } + } + + /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT. + enum NextStatus { + SHUTDOWN, ///< The completion queue has been shutdown and fully-drained + GOT_EVENT, ///< Got a new event; \a tag will be filled in with its + ///< associated value; \a ok indicating its success. + TIMEOUT ///< deadline was reached. + }; + + /// Read from the queue, blocking until an event is available or the queue is + /// shutting down. + /// + /// \param tag[out] Updated to point to the read event's tag. + /// \param ok[out] true if read a successful event, false otherwise. + /// + /// Note that each tag sent to the completion queue (through RPC operations + /// or alarms) will be delivered out of the completion queue by a call to + /// Next (or a related method), regardless of whether the operation succeeded + /// or not. Success here means that this operation completed in the normal + /// valid manner. + /// + /// Server-side RPC request: \a ok indicates that the RPC has indeed + /// been started. If it is false, the server has been Shutdown + /// before this particular call got matched to an incoming RPC. + /// + /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is + /// going to go to the wire. If it is false, it not going to the wire. This + /// would happen if the channel is either permanently broken or + /// transiently broken but with the fail-fast option. (Note that async unary + /// RPCs don't post a CQ tag at this point, nor do client-streaming + /// or bidi-streaming RPCs that have the initial metadata corked option set.) + /// + /// Client-side Write, Client-side WritesDone, Server-side Write, + /// Server-side Finish, Server-side SendInitialMetadata (which is + /// typically included in Write or Finish when not done explicitly): + /// \a ok means that the data/metadata/status/etc is going to go to the + /// wire. If it is false, it not going to the wire because the call + /// is already dead (i.e., canceled, deadline expired, other side + /// dropped the channel, etc). + /// + /// Client-side Read, Server-side Read, Client-side + /// RecvInitialMetadata (which is typically included in Read if not + /// done explicitly): \a ok indicates whether there is a valid message + /// that got read. If not, you know that there are certainly no more + /// messages that can ever be read from this stream. For the client-side + /// operations, this only happens because the call is dead. For the + /// server-sider operation, though, this could happen because the client + /// has done a WritesDone already. + /// + /// Client-side Finish: \a ok should always be true + /// + /// Server-side AsyncNotifyWhenDone: \a ok should always be true + /// + /// Alarm: \a ok is true if it expired, false if it was canceled + /// + /// \return true if got an event, false if the queue is fully drained and + /// shut down. + bool Next(void** tag, bool* ok) { + return (AsyncNextInternal(tag, + ok, + g_core_codegen_interface->gpr_inf_future( + GPR_CLOCK_REALTIME)) != SHUTDOWN); + } + + /// Read from the queue, blocking up to \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param tag[out] Upon sucess, updated to point to the event's tag. + /// \param ok[out] Upon sucess, true if a successful event, false otherwise + /// See documentation for CompletionQueue::Next for explanation of ok + /// \param deadline[in] How long to block in wait for an event. + /// + /// \return The type of event read. + template + NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) { + TimePoint deadline_tp(deadline); + return AsyncNextInternal(tag, ok, deadline_tp.raw_time()); + } + + /// EXPERIMENTAL + /// First executes \a F, then reads from the queue, blocking up to + /// \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param F[in] Function to execute before calling AsyncNext on this queue. + /// \param tag[out] Upon sucess, updated to point to the event's tag. + /// \param ok[out] Upon sucess, true if read a regular event, false otherwise. + /// \param deadline[in] How long to block in wait for an event. + /// + /// \return The type of event read. + template + NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) { + CompletionQueueTLSCache cache = CompletionQueueTLSCache(this); + f(); + if (cache.Flush(tag, ok)) { + return GOT_EVENT; + } else { + return AsyncNext(tag, ok, deadline); + } + } + + /// Request the shutdown of the queue. + /// + /// \warning This method must be called at some point if this completion queue + /// is accessed with Next or AsyncNext. \a Next will not return false + /// until this method has been called and all pending tags have been drained. + /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .) + /// Only once either one of these methods does that (that is, once the queue + /// has been \em drained) can an instance of this class be destroyed. + /// Also note that applications must ensure that no work is enqueued on this + /// completion queue after this method is called. + void Shutdown(); + + /// Returns a \em raw pointer to the underlying \a grpc_completion_queue + /// instance. + /// + /// \warning Remember that the returned instance is owned. No transfer of + /// owership is performed. + grpc_completion_queue* cq() { return cq_; } + + protected: + /// Private constructor of CompletionQueue only visible to friend classes + CompletionQueue(const grpc_completion_queue_attributes& attributes) { + cq_ = g_core_codegen_interface->grpc_completion_queue_create( + g_core_codegen_interface->grpc_completion_queue_factory_lookup( + &attributes), + &attributes, + NULL); + InitialAvalanching(); // reserve this for the future shutdown + } + + private: + // Friend synchronous wrappers so that they can access Pluck(), which is + // a semi-private API geared towards the synchronous implementation. + template + friend class ::grpc::ClientReader; + template + friend class ::grpc::ClientWriter; + template + friend class ::grpc::ClientReaderWriter; + template + friend class ::grpc::ServerReader; + template + friend class ::grpc::ServerWriter; + template + friend class ::grpc::internal::ServerReaderWriterBody; + template + friend class ::grpc::internal::RpcMethodHandler; + template + friend class ::grpc::internal::ClientStreamingHandler; + template + friend class ::grpc::internal::ServerStreamingHandler; + template + friend class ::grpc::internal::TemplatedBidiStreamingHandler; + friend class ::grpc::internal::UnknownMethodHandler; + friend class ::grpc::Server; + friend class ::grpc::ServerContext; + friend class ::grpc::ServerInterface; + template + friend class ::grpc::internal::BlockingUnaryCallImpl; + + /// EXPERIMENTAL + /// Creates a Thread Local cache to store the first event + /// On this completion queue queued from this thread. Once + /// initialized, it must be flushed on the same thread. + class CompletionQueueTLSCache { + public: + CompletionQueueTLSCache(CompletionQueue* cq); + ~CompletionQueueTLSCache(); + bool Flush(void** tag, bool* ok); + + private: + CompletionQueue* cq_; + bool flushed_; + }; + + NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline); + + /// Wraps \a grpc_completion_queue_pluck. + /// \warning Must not be mixed with calls to \a Next. + bool Pluck(internal::CompletionQueueTag* tag) { + auto deadline = + g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok)); + GPR_CODEGEN_ASSERT(ignored == tag); + // Ignore mutations by FinalizeResult: Pluck returns the C API status + return ev.success != 0; + } + + /// Performs a single polling pluck on \a tag. + /// \warning Must not be mixed with calls to \a Next. + /// + /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already + /// shutdown. This is most likely a bug and if it is a bug, then change this + /// implementation to simple call the other TryPluck function with a zero + /// timeout. i.e: + /// TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME)) + void TryPluck(internal::CompletionQueueTag* tag) { + auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT) return; + bool ok = ev.success != 0; + void* ignored = tag; + // the tag must be swallowed if using TryPluck + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if + /// the pluck() was successful and returned the tag. + /// + /// This exects tag->FinalizeResult (if called) to return 'false' i.e expects + /// that the tag is internal not something that is returned to the user. + void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) { + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) { + return; + } + + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Manage state of avalanching operations : completion queue tags that + /// trigger other completion queue operations. The underlying core completion + /// queue should not really shutdown until all avalanching operations have + /// been finalized. Note that we maintain the requirement that an avalanche + /// registration must take place before CQ shutdown (which must be maintained + /// elsehwere) + void InitialAvalanching() { + gpr_atm_rel_store(&avalanches_in_flight_, static_cast(1)); + } + void RegisterAvalanching() { + gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_, + static_cast(1)); + } + void CompleteAvalanching(); + + grpc_completion_queue* cq_; // owned + + gpr_atm avalanches_in_flight_; +}; + +/// A specific type of completion queue used by the processing of notifications +/// by servers. Instantiated by \a ServerBuilder. +class ServerCompletionQueue : public CompletionQueue { + public: + bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; } + + private: + grpc_cq_polling_type polling_type_; + friend class ServerBuilder; + /// \param is_frequently_polled Informs the GRPC library about whether the + /// server completion queue would be actively polled (by calling Next() or + /// AsyncNext()). By default all server completion queues are assumed to be + /// frequently polled. + ServerCompletionQueue(grpc_cq_polling_type polling_type) + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}), + polling_type_(polling_type) {} +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h new file mode 100644 index 0000000000000000000000000000000000000000..4870a1cda4b2a6489bc379fe53cf3e9659fffc47 --- /dev/null +++ b/patches/grpc/grpc_library.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H +#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H + +#include + +#include + +namespace grpc { + +class GrpcLibraryInterface { + public: + virtual ~GrpcLibraryInterface() = default; + virtual void init() = 0; + virtual void shutdown() = 0; +}; + +/// Initialized by \a grpc::GrpcLibraryInitializer from +/// +extern GrpcLibraryInterface* g_glip; + +/// Classes that require gRPC to be initialized should inherit from this class. +class GrpcLibraryCodegen { + public: + GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) { + if (call_grpc_init) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->init(); + grpc_init_called_ = true; + } + } + virtual ~GrpcLibraryCodegen() { + if (grpc_init_called_ && + typeid(*g_glip).hash_code() != + typeid(GrpcLibraryInterface).hash_code()) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->shutdown(); + } + } + + private: + bool grpc_init_called_; +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,7 +4,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do - if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then + if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then continue; else cpplint --filter=-readability/fn_size $file;