diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index 9681b33c0aff676d20b1c3acc4659f2379f207eb..4e5be2e53503fde5168de6f59959b2f41a4bcd6d 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -103,7 +103,7 @@ static void BuildVar(const std::string& param_name,
 }
 
 TEST(Operator, CPUtoGPU) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
 
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace cpu_place;
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
index 7f0f46b1bb362b0b3983c1e61921d5c306e8d15f..d58cfe447e88a2cac4aed65626acaea667a09c92 100644
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -26,7 +26,7 @@ __global__ void test(size_t* a, int size) {
 }
 
 TEST(LoD, data) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
 
   paddle::framework::LoD lod{{0, 1, 2}};
   lod.push_back({0, 2, 4, 5});
@@ -42,7 +42,7 @@ TEST(LoD, data) {
 }
 
 TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
 
   paddle::framework::LoDTensor lod_tensor;
   paddle::platform::CUDAPlace place(0);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 218fc8880bb276a75ed1dd71b04fcd9f387c9a54..368913700167ec65d8cf3e308968bce392421ae3 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -76,7 +76,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
                              paddle::framework::OpWithoutKernelCheckerMaker);
 
 TEST(OperatorBase, all) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -228,7 +228,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
 
 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_kernel");
   BuildVar("x", {"IN1"}, op_desc.add_inputs());
@@ -268,7 +268,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 
 // test with multi inputs
 TEST(OpKernel, multi_inputs) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
 
   op_desc.set_type("op_multi_inputs_with_kernel");
@@ -419,7 +419,7 @@ REGISTER_OP_CPU_KERNEL(indicate_other_data_type_test,
                            paddle::platform::CPUDeviceContext, int>);
 
 TEST(IndicateVarDataTypeTest, lodtensor) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("indicate_lod_tensor_data_type_test");
   BuildVar("LoDTensor", {"lodtensor_1"}, op_desc.add_inputs());
@@ -447,7 +447,7 @@ TEST(IndicateVarDataTypeTest, lodtensor) {
 }
 
 TEST(IndicateVarDataTypeTest, selectedrows) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("indicate_selected_rows_data_type_test");
   BuildVar("SelectedRows", {"selected_rows_1"}, op_desc.add_inputs());
@@ -474,7 +474,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) {
 }
 
 TEST(IndicateVarDataTypeTest, other) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("indicate_other_data_type_test");
   BuildVar("Other", {"lod_rank_table_1"}, op_desc.add_inputs());
@@ -504,7 +504,7 @@ TEST(IndicateVarDataTypeTest, other) {
 }
 
 TEST(ExecutionContextAttrAndInOut, new_api) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -596,7 +596,7 @@ REGISTER_OP_CPU_KERNEL(set_lod_level_test,
                            paddle::platform::CPUDeviceContext, float>);
 
 void SetGetLoDLevelTestMain(std::string op_type) {
-  paddle::framework::InitDevices(false, {});
+  paddle::framework::InitDevices({});
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type(op_type);
   BuildVar("X", {"x.0"}, op_desc.add_inputs());
@@ -701,7 +701,7 @@ REGISTER_OP_CPU_KERNEL(op_without_unused_var,
 TEST(OpWithUnusedVar, all) {
   // enable the unused_var_check
   FLAGS_enable_unused_var_check = true;
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_unused_var");
   BuildVar("X", {"X"}, op_desc.add_inputs());
@@ -726,7 +726,7 @@ TEST(OpWithoutUnusedVar, all) {
   // enable the unused_var_check
   FLAGS_enable_unused_var_check = true;
 
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_without_unused_var");
   BuildVar("X", {"X"}, op_desc.add_inputs());
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 535ec9cd7d950588fd7877d0913e3e851f8fe8dc..d9ddf49f46b795b3dc28000ce850baa71490a687 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -36,6 +36,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
 DECLARE_double(eager_delete_tensor_gb);
 
 #ifdef WITH_GPERFTOOLS
@@ -55,6 +59,10 @@ static std::once_flag gProfileOnce;
 static bool gProfileStarted = false;
 #endif
 
+#ifdef PADDLE_WITH_CUDA
+std::once_flag p2p_init_flag;
+#endif
+
 class ParallelExecutorPrivate {
  public:
   ParallelExecutorPrivate(const std::vector<platform::Place> &places,
@@ -458,6 +466,41 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
   return executor && executor->NeedCreateLocalExeScope();
 }
 
+void InitP2P(const std::vector<platform::Place> &places) {
+#ifdef PADDLE_WITH_CUDA
+  std::call_once(p2p_init_flag, [&]() {
+    int count = places.size();
+    if (count <= 1) return;
+
+    std::vector<int> devices;
+    for (int i = 0; i < count; i++) {
+      if (!is_gpu_place(places[i])) return;
+
+      platform::CUDAPlace device =
+          BOOST_GET_CONST(platform::CUDAPlace, places[i]);
+      devices.push_back(device.GetDeviceId());
+    }
+
+    for (int i = 0; i < count; ++i) {
+      for (int j = 0; j < count; ++j) {
+        if (devices[i] == devices[j]) continue;
+        int can_acess = -1;
+        cudaError_t ret =
+            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
+        if (ret != cudaSuccess || can_acess != 1) {
+          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
+                       << " to " << devices[j];
+        } else {
+          platform::CUDADeviceGuard guard(devices[i]);
+          cudaDeviceEnablePeerAccess(devices[j], 0);
+        }
+      }
+    }
+    VLOG(1) << "init p2p";
+  });
+#endif
+}
+
 ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const std::vector<std::string> &bcast_vars,
                                    const std::string &loss_var_name,
@@ -470,6 +513,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
                  platform::errors::Unavailable(
                      "XPU is not supported in ParallelExecutor"));
+  InitP2P(places);
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
   member_->use_cuda_ = exec_strategy.use_cuda_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 7bfdb2107c9a99f58034cf7d355b87ad0e470f16..ca75e30b9ea79ab37c1d1b9b3e2fa84f678c17b4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -174,7 +174,7 @@ bool AnalysisPredictor::PrepareScope(
     scope_ = parent_scope;
     status_is_cloned_ = true;
   } else {
-    paddle::framework::InitDevices(false);
+    paddle::framework::InitDevices();
     scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) {
       delete scope;
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index c78cdf24dec561f5fd5643cb50ee243a58b3ab6a..9a5b301fdd4110ca5029e8f74823a5ed8426bd08 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -91,7 +91,7 @@ bool NativePaddlePredictor::Init(
                             platform::errors::PreconditionNotMet(
                                 "The sub_scope should not be nullptr."));
   } else {
-    paddle::framework::InitDevices(false);
+    paddle::framework::InitDevices();
     scope_.reset(new paddle::framework::Scope());
   }
 
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 84e011c6505a8fe974effbecf54101e0e51d29fa..d2bc95e7c3eb3d7bd7adf28d5d828cef6ec69e2f 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -25,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/pybind/pybind.h"
 
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
-DEFINE_bool(init_p2p, false, "Whether to init p2p.");
 DEFINE_int32(math_num_threads, 1,
              "Number of threads used to run math functions.");
 
@@ -42,7 +41,7 @@ void Init(const std::vector<std::string> argv) {
   while (std::getline(tokenStream, token, ',')) {
     devices.push_back(std::stoi(token));
   }
-  framework::InitDevices(FLAGS_init_p2p, devices);
+  framework::InitDevices(devices);
 }
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4515dba4363babea0ff31d10207599c1e014f6cb..03c252909d92392e50a847d415b6ae5cec689bef 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -37,6 +37,7 @@
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
index 654df5ccd5e9df324f6e127addadd4e71a641d94..e01b66b7a125cca1c48a54b7a62d9cfcd3dfb99d 100644
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -57,7 +57,7 @@ void OpTester::Init(const OpTesterConfig &config) {
     place_ = paddle::platform::CPUPlace();
   }
 
-  framework::InitDevices(false);
+  framework::InitDevices();
   scope_.reset(new paddle::framework::Scope());
 
   op_ = framework::OpRegistry::CreateOp(op_desc_);
diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc
index d50c829b475752cfad5a41500c6d66d1ecc4c8bf..55b4dce4929b8cbdb69772b7ecfac5ccdeb65dcf 100644
--- a/paddle/fluid/operators/fused/fusion_group_op_test.cc
+++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc
@@ -140,7 +140,7 @@ void TestMain(const std::vector<std::string>& input_names,
               std::string func_name, std::string cuda_kernel_str,
               CPUKernelFunc cpu_kernel_func) {
   // Compile the device code
-  paddle::framework::InitDevices(false, {0});
+  paddle::framework::InitDevices({0});
   platform::CUDAPlace place = platform::CUDAPlace(0);
   PrepareDeviceCode(place, func_name, cuda_kernel_str);
 
diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc
index 9331532058095df61b0a07d78c9ebb14e8be8ce8..93bccd5cb8540aea4a770bb841787ee8245ec870 100644
--- a/paddle/fluid/platform/device_code_test.cc
+++ b/paddle/fluid/platform/device_code_test.cc
@@ -35,7 +35,7 @@ TEST(DeviceCode, cuda) {
     return;
   }
 
-  paddle::framework::InitDevices(false, {0});
+  paddle::framework::InitDevices({0});
   paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
 
@@ -90,7 +90,7 @@ TEST(DeviceCodePool, cuda) {
     return;
   }
 
-  paddle::framework::InitDevices(false, {0});
+  paddle::framework::InitDevices({0});
   paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
   paddle::platform::DeviceCodePool& pool =
       paddle::platform::DeviceCodePool::Init({place});
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index a594044e9bc2738d8305d22f0880db63b7be38ec..a3e035a8125274b0f073c472b28bda05511d5a92 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -63,7 +63,6 @@ namespace framework {
 
 std::once_flag gflags_init_flag;
 std::once_flag glog_init_flag;
-std::once_flag p2p_init_flag;
 
 bool InitGflags(std::vector<std::string> args) {
   bool successed = false;
@@ -95,28 +94,7 @@ bool InitGflags(std::vector<std::string> args) {
   return successed;
 }
 
-void InitP2P(std::vector<int> devices) {
-#ifdef PADDLE_WITH_CUDA
-  std::call_once(p2p_init_flag, [&]() {
-    int count = devices.size();
-    for (int i = 0; i < count; ++i) {
-      for (int j = 0; j < count; ++j) {
-        if (devices[i] == devices[j]) continue;
-        int can_acess = -1;
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]));
-        if (can_acess != 1) {
-          VLOG(2) << "Cannot enable P2P access from " << devices[i] << " to "
-                  << devices[j];
-        } else {
-          platform::CUDADeviceGuard guard(devices[i]);
-          cudaDeviceEnablePeerAccess(devices[j], 0);
-        }
-      }
-    }
-  });
-#endif
-}
+
 
 void InitCupti() {
 #ifdef PADDLE_WITH_CUPTI
@@ -144,7 +122,7 @@ void InitCupti() {
 #endif
 }
 
-void InitDevices(bool init_p2p) {
+void InitDevices() {
   // CUPTI attribute should be set before any CUDA context is created (see CUPTI
   // documentation about CUpti_ActivityAttribute).
   InitCupti();
@@ -166,10 +144,10 @@ void InitDevices(bool init_p2p) {
     LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
   }
 #endif
-  InitDevices(init_p2p, devices);
+  InitDevices(devices);
 }
 
-void InitDevices(bool init_p2p, const std::vector<int> devices) {
+void InitDevices(const std::vector<int> devices) {
   std::vector<platform::Place> places;
 
   for (size_t i = 0; i < devices.size(); ++i) {
@@ -187,9 +165,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
     places.emplace_back(platform::XPUPlace(devices[i]));
 #endif
   }
-  if (init_p2p) {
-    InitP2P(devices);
-  }
   places.emplace_back(platform::CPUPlace());
 #ifdef PADDLE_WITH_CUDA
   places.emplace_back(platform::CUDAPinnedPlace());
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index 5bd5a640ade351fc66b01e89cf670ed8b0fd3b05..cd5ef843fa8f7d436b9ca10367e46cee6d2822cb 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -35,9 +35,9 @@ bool InitGflags(std::vector<std::string> argv);
 
 void InitGLOG(const std::string& prog_name);
 
-void InitDevices(bool init_p2p);
+void InitDevices();
 
-void InitDevices(bool init_p2p, const std::vector<int> devices);
+void InitDevices(const std::vector<int> devices);
 
 #ifndef _WIN32
 class SignalMessageDumper {
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
index f1832206a1abbbfccb5e79b39b1c67307aca6769..5866ede40322b041285df3ef64eb3d09d385dd04 100644
--- a/paddle/fluid/platform/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -22,7 +22,7 @@ TEST(InitDevices, CPU) {
   using paddle::platform::DeviceContextPool;
 
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
-  InitDevices(true);
+  InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U);
 #endif
@@ -34,7 +34,7 @@ TEST(InitDevices, CUDA) {
 
 #ifdef PADDLE_WITH_CUDA
   int count = paddle::platform::GetCUDADeviceCount();
-  InitDevices(true);
+  InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 2U + static_cast<unsigned>(count));
 #endif
@@ -46,7 +46,7 @@ TEST(InitDevices, XPU) {
 
 #ifdef PADDLE_WITH_XPU
   int count = paddle::platform::GetXPUDeviceCount();
-  InitDevices(true);
+  InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
 #endif
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a7e3cd82d26a44d49930c0001f04a39819b85140..879748c7db78aea38188cb0c5943e37d8aa50be8 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1715,7 +1715,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_glog", framework::InitGLOG);
   m.def("load_op_library", framework::LoadOpLib);
   m.def("init_devices",
-        [](bool init_p2p) { framework::InitDevices(init_p2p); });
+        []() { framework::InitDevices(); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index 1ef98720f83697715c05e868177faba489fd8760..830f00b8db1d5c4bab40710d8fd22ea77768aa9e 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -55,7 +55,7 @@ std::unique_ptr<paddle::framework::ProgramDesc> Load(
 }  // namespace paddle
 
 int main() {
-  paddle::framework::InitDevices(false);
+  paddle::framework::InitDevices();
 
   const auto cpu_place = paddle::platform::CPUPlace();
 
diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc
index a08069a57ca824f307b4bf8836237f573ab3c429..6272478deaead65c47cd040e7ff7f3e5d4083e08 100644
--- a/paddle/fluid/train/imdb_demo/demo_trainer.cc
+++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
       platform::errors::InvalidArgument(
           "At least one file to train, but received number of file is %d.",
           file_vec.size()));
-  paddle::framework::InitDevices(false);
+  paddle::framework::InitDevices();
   const auto cpu_place = paddle::platform::CPUPlace();
   paddle::framework::Executor executor(cpu_place);
   paddle::framework::Scope scope;
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
index fb993439bb8e4bf31bea8f987c17d8b9022eadfe..7a980cbac8b95f3aa887e50030b0fd9c12c59e2a 100644
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
@@ -33,7 +33,7 @@ DEFINE_string(dirname, "", "Directory of the train model.");
 namespace paddle {
 
 void Train(std::string model_dir) {
-  framework::InitDevices(false);
+  framework::InitDevices();
   const auto cpu_place = platform::CPUPlace();
   framework::Executor executor(cpu_place);
   framework::Scope scope;
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 5400c55a0b150ecc0f534ed1bbb83aba236ec24b..eb038fb98d60c9830043371db7de06ff8b349063 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -121,7 +121,7 @@ int main(int argc, char** argv) {
   int internal_argc = internal_argv.size();
   char** arr = internal_argv.data();
   paddle::platform::ParseCommandLineFlags(internal_argc, arr, true);
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
 
   int ret = RUN_ALL_TESTS();
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 74b56b842cf9644ac54d42a30f2d197549803f05..7865dc04e3fd9c70481926a519696c6d3b132a54 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -254,7 +254,7 @@ def __bootstrap__():
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
-    core.init_devices(not in_test)
+    core.init_devices()
 
 
 # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.