diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 076ecc1f01d89913081892eb6aa828b095b09656..f5d5627815c7320dad5051b0f7d95b8ec6703687 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -86,7 +86,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, // stub context s->response_call_back_ = nullptr; - platform::RecordEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); @@ -143,7 +143,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, // stub context s->response_call_back_ = ProcGetResponse; - platform::RecordEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); @@ -191,7 +191,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, // stub context s->response_call_back_ = ProcGetResponse; - platform::RecordEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, @@ -221,7 +221,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); - platform::RecordEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method, nullptr); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -246,7 +246,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - platform::RecordEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method, nullptr); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -271,7 +271,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(COMPLETE_MESSAGE); - platform::RecordEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method, nullptr); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -301,7 +301,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_out_varname(dir); - platform::RecordEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method, nullptr); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index ffe8f082db34b2ffd6b277080030463080feeb1d..bac098b892658beece85271765eb31eeb3eeda17 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -36,7 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name) { - platform::RecordEvent record_event("serial", &ctx); + platform::RecordRPCEvent record_event("serial", &ctx); // Default DestroyCallback does nothing, When using GPU // the CPU buffer need to be freed. DestroyCallback destroy_callback = [](void* backing) {}; @@ -148,7 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, const framework::Scope* scope, framework::Variable** var) { - platform::RecordEvent record_event("deserial", &ctx); + platform::RecordRPCEvent record_event("deserial", &ctx); operators::distributed::GRPCVariableResponse resp(scope, &ctx); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); *var = resp.GetVar(); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index a35147da90e87af85308431fd7dbe965bb1fd1d7..da46a1abe12258b47b2fd4afb5f146daf15e026d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -30,6 +30,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/string/printf.h" +DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); + namespace paddle { namespace platform { @@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() { PopEvent(name_, dev_ctx_); } +RecordRPCEvent::RecordRPCEvent(const std::string& name, + const DeviceContext* dev_ctx) { + if (FLAGS_enable_rpc_profiler) { + event_.reset(new platform::RecordEvent(name, dev_ctx)); + } +} + RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { std::lock_guard l(profiler_mu); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 62c1762f32a0457e1292711dea57e064b93fbda1..e8eae874afa3d17f0d3374eef457cdbacb3f8424 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -87,6 +87,16 @@ struct RecordEvent { std::string full_name_; }; +class RecordRPCEvent { + public: + // dev_ctx can be set to nullptr if device is cpu. + RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx); + ~RecordRPCEvent() {} + + private: + std::unique_ptr event_; +}; + struct RecordBlock { explicit RecordBlock(int block_id); ~RecordBlock(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 41678918b8bb54078091f892ce7a519dfc8a0014..bcd4e4f6073eff1ea0449da8096030743158dd0f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -120,6 +120,7 @@ def __bootstrap__(): read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_period') read_env_flags.append('rpc_server_profile_path') + read_env_flags.append('enable_rpc_profiler') if core.is_compiled_with_cuda(): read_env_flags += [