diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 076ecc1f01d89913081892eb6aa828b095b09656..f5d5627815c7320dad5051b0f7d95b8ec6703687 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -86,7 +86,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
     // stub context
     s->response_call_back_ = nullptr;
 
-    platform::RecordEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method, p_ctx);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
@@ -143,7 +143,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
     // stub context
     s->response_call_back_ = ProcGetResponse;
 
-    platform::RecordEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method, p_ctx);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
@@ -191,7 +191,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
     // stub context
     s->response_call_back_ = ProcGetResponse;
 
-    platform::RecordEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method, p_ctx);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
@@ -221,7 +221,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(BATCH_BARRIER_MESSAGE);
 
-  platform::RecordEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method, nullptr);
 
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -246,7 +246,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
 
-  platform::RecordEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method, nullptr);
 
   auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -271,7 +271,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(COMPLETE_MESSAGE);
 
-  platform::RecordEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method, nullptr);
 
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -301,7 +301,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   req.set_varname(CHECKPOINT_SAVE_MESSAGE);
   req.set_out_varname(dir);
 
-  platform::RecordEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method, nullptr);
 
   auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index ffe8f082db34b2ffd6b277080030463080feeb1d..bac098b892658beece85271765eb31eeb3eeda17 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -36,7 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg,
                            const std::string& out_name) {
-  platform::RecordEvent record_event("serial", &ctx);
+  platform::RecordRPCEvent record_event("serial", &ctx);
   // Default DestroyCallback does nothing, When using GPU
   // the CPU buffer need to be freed.
   DestroyCallback destroy_callback = [](void* backing) {};
@@ -148,7 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
                                framework::Variable** var) {
-  platform::RecordEvent record_event("deserial", &ctx);
+  platform::RecordRPCEvent record_event("deserial", &ctx);
   operators::distributed::GRPCVariableResponse resp(scope, &ctx);
   PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
   *var = resp.GetVar();
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index a35147da90e87af85308431fd7dbe965bb1fd1d7..da46a1abe12258b47b2fd4afb5f146daf15e026d 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -30,6 +30,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/string/printf.h"
 
+DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
+
 namespace paddle {
 namespace platform {
 
@@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() {
   PopEvent(name_, dev_ctx_);
 }
 
+RecordRPCEvent::RecordRPCEvent(const std::string& name,
+                               const DeviceContext* dev_ctx) {
+  if (FLAGS_enable_rpc_profiler) {  // Gflag; defaults to false.
+    event_.reset(new platform::RecordEvent(name, dev_ctx));
+  }
+}
+
 RecordBlock::RecordBlock(int block_id)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
   std::lock_guard<std::mutex> l(profiler_mu);
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 62c1762f32a0457e1292711dea57e064b93fbda1..e8eae874afa3d17f0d3374eef457cdbacb3f8424 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -87,6 +87,16 @@ struct RecordEvent {
   std::string full_name_;
 };
 
+class RecordRPCEvent {  // RecordEvent gated by FLAGS_enable_rpc_profiler.
+ public:
+  // dev_ctx can be set to nullptr if device is cpu.
+  RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
+  ~RecordRPCEvent() {}
+
+ private:
+  std::unique_ptr<RecordEvent> event_;  // Null when the flag is off.
+};
+
 struct RecordBlock {
   explicit RecordBlock(int block_id);
   ~RecordBlock();
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 41678918b8bb54078091f892ce7a519dfc8a0014..bcd4e4f6073eff1ea0449da8096030743158dd0f 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -120,6 +120,7 @@ def __bootstrap__():
         read_env_flags.append('rpc_deadline')
         read_env_flags.append('rpc_server_profile_period')
         read_env_flags.append('rpc_server_profile_path')
+        read_env_flags.append('enable_rpc_profiler')
 
     if core.is_compiled_with_cuda():
         read_env_flags += [