diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 7ca694886e9209a49e214352f5babc473a1f275a..e6ee28ea8d920ef80fead258a9bd0d5f6762c879 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -211,6 +211,11 @@ void AsyncGRPCServer::WaitClientGet(int count) { } } +void AsyncGRPCServer::WaitServerReady() { + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); +} + void AsyncGRPCServer::RunSyncUpdate() { ::grpc::ServerBuilder builder; builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(), @@ -244,6 +249,12 @@ void AsyncGRPCServer::RunSyncUpdate() { t_prefetch_.reset(new std::thread( std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), "cq_prefetch", prefetch_register))); + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); // wait server server_->Wait(); t_send_->join(); diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 99b87b8c6cb3e597778b88c395e4abf400d82c39..7f9cae21ccca8dd51f9fbe98148d01a51ac6eb84 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -45,8 +45,9 @@ class RequestBase; class AsyncGRPCServer final { public: explicit AsyncGRPCServer(const std::string &address, bool sync_mode) - : address_(address), sync_mode_(sync_mode) {} + : address_(address), sync_mode_(sync_mode), ready_(0) {} + void WaitServerReady(); void RunSyncUpdate(); // functions to sync server barrier status. @@ -118,6 +119,10 @@ class AsyncGRPCServer final { framework::ProgramDesc *program_; framework::Executor *executor_; int selected_port_; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; }; }; // namespace detail diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index f22f8b261030c0c536e2118351ec2aa1a9be6cd0..59b94511552874e1557d8a9d7a687af14f96c31c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -66,12 +66,7 @@ static void ParallelExecuteBlocks( for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -static void SavePort(std::shared_ptr rpc_service) { - std::ofstream port_file; - port_file.open("/tmp/paddle.selected_port"); - port_file << rpc_service->GetSelectedPort(); - port_file.close(); -} +std::atomic_int ListenAndServOp::selected_port_{0}; ListenAndServOp::ListenAndServOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -79,15 +74,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type, const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} -int ListenAndServOp::GetSelectedPort() const { - return rpc_service_->GetSelectedPort(); -} - void ListenAndServOp::Stop() { rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); server_thread_->join(); } +void ListenAndServOp::SavePort(const std::string &file_path) const { + // NOTE: default write file to /tmp/paddle.selected_port + selected_port_ = rpc_service_->GetSelectedPort(); + + std::ofstream port_file; + port_file.open(file_path); + port_file << selected_port_.load(); + port_file.close(); + VLOG(4) << "selected port written to " << file_path; +} + +void ListenAndServOp::WaitServerReady() { + while (selected_port_.load() == 0) { + } +} + void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, @@ -318,9 +325,13 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); VLOG(3) << "wait server thread to become ready..."; - sleep(5); + rpc_service_->WaitServerReady(); + // Write to a file of server selected port for python use. - SavePort(rpc_service_); + std::string file_path = + string::Sprintf("/tmp/paddle.%d.selected_port", + static_cast(::getpid())); + SavePort(file_path); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); } else { diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 5c8fc31c9774a0f2e8233824459b29b42469bd1a..f52a55c5c2d6902df6cb7e0a0d7242c6e86dc786 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include @@ -39,8 +40,6 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - int GetSelectedPort() const; - void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, @@ -49,14 +48,25 @@ class ListenAndServOp : public framework::OperatorBase { void RunAsyncLoop(framework::Executor* executor, framework::ProgramDesc* program) const; + void SavePort( + const std::string& file_path = "/tmp/paddle.selected_port") const; + + void WaitServerReady(); + + int GetSelectedPort() { return selected_port_; } + void Stop() override; void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + static void ResetPort() { selected_port_ = 0; } + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr server_thread_; + // FIXME(wuyi): it's static so that the operator can be cloned. + static std::atomic_int selected_port_; }; } // namespace operators diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index 93e55d410388a672eb749302162ea81de4c6cba1..eb51f301bfe2a97c65dd1fec23ff5a44f3843b05 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -116,6 +116,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, void StartServerNet(bool is_sparse, std::atomic *initialized) { f::Scope scope; p::CPUPlace place; + VLOG(4) << "before init tensor"; if (is_sparse) { InitSelectedRowsInScope(place, &scope); } else { @@ -137,6 +138,7 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); + VLOG(4) << "before init op"; listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); *initialized = true; @@ -149,7 +151,9 @@ TEST(SendRecvOp, CPUDense) { std::thread server_thread(StartServerNet, false, &initialized); while (!initialized) { } - sleep(5); // wait server to start + static_cast(listen_and_serv_op.get()) + ->WaitServerReady(); + // local net f::Scope scope; p::CPUPlace place; @@ -185,6 +189,7 @@ TEST(SendRecvOp, CPUDense) { listen_and_serv_op->Stop(); server_thread.join(); listen_and_serv_op.reset(nullptr); + paddle::operators::ListenAndServOp::ResetPort(); } TEST(SendRecvOp, CPUSparse) { @@ -193,7 +198,12 @@ TEST(SendRecvOp, CPUSparse) { std::thread server_thread(StartServerNet, true, &initialized); while (!initialized) { } - sleep(5); // wait server to start + auto *listen_and_serv_op_ptr = + static_cast( + listen_and_serv_op.get()); + ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); + listen_and_serv_op_ptr->WaitServerReady(); + // local net f::Scope scope; p::CPUPlace place; @@ -201,10 +211,6 @@ TEST(SendRecvOp, CPUSparse) { InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; - auto *listen_and_serv_op_ptr = - static_cast( - listen_and_serv_op.get()); - ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); selected_port = listen_and_serv_op_ptr->GetSelectedPort(); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); attrs.insert({"endpoints", std::vector({endpoint})}); @@ -236,4 +242,5 @@ TEST(SendRecvOp, CPUSparse) { listen_and_serv_op->Stop(); server_thread.join(); listen_and_serv_op.reset(); + paddle::operators::ListenAndServOp::ResetPort(); } diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index c7fdd06f105e3b5fd906d3524d41df8f84160e63..77e9a8f7e72a9e0790ce1d1f48356abcca8eaccf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -34,7 +34,7 @@ class TestSendOp(unittest.TestCase): p.start() time.sleep(10) - with open("/tmp/paddle.selected_port", "r") as fn: + with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn: selected_port = int(fn.readlines()[0]) self.init_client(place, selected_port)