Unverified commit 1dad8cea, authored by gongweibao, committed by GitHub

Fix gpu memory allocation bug. (#28703)

Parent 91bab752
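This commit removes the init_p2p flag from framework::InitDevices and moves GPU peer-to-peer (P2P) initialization into the ParallelExecutor constructor, so plain device initialization no longer touches peer access. Below is a minimal call-site sketch of the API change; the header path is an assumption and is not part of this diff.

// Call-site sketch only; header path assumed, not shown in this diff.
#include "paddle/fluid/platform/init.h"

int main() {
  // Before this commit these calls took an init_p2p flag, e.g.
  //   paddle::framework::InitDevices(/*init_p2p=*/true);
  //   paddle::framework::InitDevices(/*init_p2p=*/false, {0});
  // After this commit the flag is gone; P2P setup happens later, inside
  // ParallelExecutor, and only when multiple GPU places are used.
  paddle::framework::InitDevices();        // initialize all visible devices
  // paddle::framework::InitDevices({0});  // or: initialize only device 0
  return 0;
}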
@@ -103,7 +103,7 @@ static void BuildVar(const std::string& param_name,
 }
 TEST(Operator, CPUtoGPU) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace cpu_place;
...
@@ -26,7 +26,7 @@ __global__ void test(size_t* a, int size) {
 }
 TEST(LoD, data) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::LoD lod{{0, 1, 2}};
   lod.push_back({0, 2, 4, 5});
@@ -42,7 +42,7 @@ TEST(LoD, data) {
 }
 TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::LoDTensor lod_tensor;
   paddle::platform::CUDAPlace place(0);
...
@@ -76,7 +76,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
                              paddle::framework::OpWithoutKernelCheckerMaker);
 TEST(OperatorBase, all) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -228,7 +228,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_kernel");
   BuildVar("x", {"IN1"}, op_desc.add_inputs());
@@ -268,7 +268,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 // test with multi inputs
 TEST(OpKernel, multi_inputs) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_multi_inputs_with_kernel");
@@ -419,7 +419,7 @@ REGISTER_OP_CPU_KERNEL(indicate_other_data_type_test,
                        paddle::platform::CPUDeviceContext, int>);
 TEST(IndicateVarDataTypeTest, lodtensor) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("indicate_lod_tensor_data_type_test");
   BuildVar("LoDTensor", {"lodtensor_1"}, op_desc.add_inputs());
@@ -447,7 +447,7 @@ TEST(IndicateVarDataTypeTest, lodtensor) {
 }
 TEST(IndicateVarDataTypeTest, selectedrows) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("indicate_selected_rows_data_type_test");
   BuildVar("SelectedRows", {"selected_rows_1"}, op_desc.add_inputs());
@@ -474,7 +474,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) {
 }
 TEST(IndicateVarDataTypeTest, other) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("indicate_other_data_type_test");
   BuildVar("Other", {"lod_rank_table_1"}, op_desc.add_inputs());
@@ -504,7 +504,7 @@ TEST(IndicateVarDataTypeTest, other) {
 }
 TEST(ExecutionContextAttrAndInOut, new_api) {
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -596,7 +596,7 @@ REGISTER_OP_CPU_KERNEL(set_lod_level_test,
                        paddle::platform::CPUDeviceContext, float>);
 void SetGetLoDLevelTestMain(std::string op_type) {
-  paddle::framework::InitDevices(false, {});
+  paddle::framework::InitDevices({});
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type(op_type);
   BuildVar("X", {"x.0"}, op_desc.add_inputs());
@@ -701,7 +701,7 @@ REGISTER_OP_CPU_KERNEL(op_without_unused_var,
 TEST(OpWithUnusedVar, all) {
   // enable the unused_var_check
   FLAGS_enable_unused_var_check = true;
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_unused_var");
   BuildVar("X", {"X"}, op_desc.add_inputs());
@@ -726,7 +726,7 @@ TEST(OpWithoutUnusedVar, all) {
   // enable the unused_var_check
   FLAGS_enable_unused_var_check = true;
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_without_unused_var");
   BuildVar("X", {"X"}, op_desc.add_inputs());
...
@@ -36,6 +36,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 DECLARE_double(eager_delete_tensor_gb);
 #ifdef WITH_GPERFTOOLS
@@ -55,6 +59,10 @@ static std::once_flag gProfileOnce;
 static bool gProfileStarted = false;
 #endif
+#ifdef PADDLE_WITH_CUDA
+std::once_flag p2p_init_flag;
+#endif
 class ParallelExecutorPrivate {
  public:
  ParallelExecutorPrivate(const std::vector<platform::Place> &places,
@@ -458,6 +466,41 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
   return executor && executor->NeedCreateLocalExeScope();
 }
+void InitP2P(const std::vector<platform::Place> &places) {
+#ifdef PADDLE_WITH_CUDA
+  std::call_once(p2p_init_flag, [&]() {
+    int count = places.size();
+    if (count <= 1) return;
+    std::vector<int> devices;
+    for (int i = 0; i < count; i++) {
+      if (!is_gpu_place(places[i])) return;
+      platform::CUDAPlace device =
+          BOOST_GET_CONST(platform::CUDAPlace, places[i]);
+      devices.push_back(device.GetDeviceId());
+    }
+    for (int i = 0; i < count; ++i) {
+      for (int j = 0; j < count; ++j) {
+        if (devices[i] == devices[j]) continue;
+        int can_acess = -1;
+        cudaError_t ret =
+            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
+        if (ret != cudaSuccess || can_acess != 1) {
+          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
+                       << " to " << devices[j];
+        } else {
+          platform::CUDADeviceGuard guard(devices[i]);
+          cudaDeviceEnablePeerAccess(devices[j], 0);
+        }
+      }
+    }
+    VLOG(1) << "init p2p";
+  });
+#endif
+}
 ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const std::vector<std::string> &bcast_vars,
                                    const std::string &loss_var_name,
@@ -470,6 +513,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
                  platform::errors::Unavailable(
                      "XPU is not supported in ParallelExecutor"));
+  InitP2P(places);
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
   member_->use_cuda_ = exec_strategy.use_cuda_;
...
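The new InitP2P above probes every ordered pair of GPU places and enables peer access where the driver reports it is possible; failures are now only logged as warnings rather than enforced as fatal errors, and the routine runs once (std::call_once) when the first multi-GPU ParallelExecutor is constructed. Below is a minimal standalone sketch of the same probe-and-enable pattern using only the CUDA runtime API and no Paddle types; it is illustrative, not the committed code.

// Standalone sketch of pairwise P2P enabling, assuming a plain CUDA toolkit.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count <= 1) return 0;
  for (int i = 0; i < count; ++i) {
    for (int j = 0; j < count; ++j) {
      if (i == j) continue;
      int can_access = -1;
      cudaError_t ret = cudaDeviceCanAccessPeer(&can_access, i, j);
      if (ret != cudaSuccess || can_access != 1) {
        std::printf("Cannot enable P2P access from %d to %d\n", i, j);
      } else {
        cudaSetDevice(i);                  // stands in for CUDADeviceGuard
        cudaDeviceEnablePeerAccess(j, 0);  // flags argument must be 0
      }
    }
  }
  return 0;
}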
@@ -174,7 +174,7 @@ bool AnalysisPredictor::PrepareScope(
     scope_ = parent_scope;
     status_is_cloned_ = true;
   } else {
-    paddle::framework::InitDevices(false);
+    paddle::framework::InitDevices();
     scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) {
       delete scope;
 #ifdef PADDLE_WITH_CUDA
...
@@ -91,7 +91,7 @@ bool NativePaddlePredictor::Init(
                     platform::errors::PreconditionNotMet(
                         "The sub_scope should not be nullptr."));
   } else {
-    paddle::framework::InitDevices(false);
+    paddle::framework::InitDevices();
     scope_.reset(new paddle::framework::Scope());
   }
...
@@ -25,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/pybind/pybind.h"
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
-DEFINE_bool(init_p2p, false, "Whether to init p2p.");
 DEFINE_int32(math_num_threads, 1,
              "Number of threads used to run math functions.");
@@ -42,7 +41,7 @@ void Init(const std::vector<std::string> argv) {
   while (std::getline(tokenStream, token, ',')) {
     devices.push_back(std::stoi(token));
   }
-  framework::InitDevices(FLAGS_init_p2p, devices);
+  framework::InitDevices(devices);
 }
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
...
@@ -37,6 +37,7 @@
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_XPU
...
@@ -57,7 +57,7 @@ void OpTester::Init(const OpTesterConfig &config) {
     place_ = paddle::platform::CPUPlace();
   }
-  framework::InitDevices(false);
+  framework::InitDevices();
   scope_.reset(new paddle::framework::Scope());
   op_ = framework::OpRegistry::CreateOp(op_desc_);
...
@@ -140,7 +140,7 @@ void TestMain(const std::vector<std::string>& input_names,
               std::string func_name, std::string cuda_kernel_str,
               CPUKernelFunc cpu_kernel_func) {
   // Compile the device code
-  paddle::framework::InitDevices(false, {0});
+  paddle::framework::InitDevices({0});
   platform::CUDAPlace place = platform::CUDAPlace(0);
   PrepareDeviceCode(place, func_name, cuda_kernel_str);
...
@@ -35,7 +35,7 @@ TEST(DeviceCode, cuda) {
     return;
   }
-  paddle::framework::InitDevices(false, {0});
+  paddle::framework::InitDevices({0});
   paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
@@ -90,7 +90,7 @@ TEST(DeviceCodePool, cuda) {
     return;
   }
-  paddle::framework::InitDevices(false, {0});
+  paddle::framework::InitDevices({0});
   paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
   paddle::platform::DeviceCodePool& pool =
       paddle::platform::DeviceCodePool::Init({place});
...
@@ -63,7 +63,6 @@ namespace framework {
 std::once_flag gflags_init_flag;
 std::once_flag glog_init_flag;
-std::once_flag p2p_init_flag;
 bool InitGflags(std::vector<std::string> args) {
   bool successed = false;
@@ -95,28 +94,7 @@ bool InitGflags(std::vector<std::string> args) {
   return successed;
 }
-void InitP2P(std::vector<int> devices) {
-#ifdef PADDLE_WITH_CUDA
-  std::call_once(p2p_init_flag, [&]() {
-    int count = devices.size();
-    for (int i = 0; i < count; ++i) {
-      for (int j = 0; j < count; ++j) {
-        if (devices[i] == devices[j]) continue;
-        int can_acess = -1;
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]));
-        if (can_acess != 1) {
-          VLOG(2) << "Cannot enable P2P access from " << devices[i] << " to "
-                  << devices[j];
-        } else {
-          platform::CUDADeviceGuard guard(devices[i]);
-          cudaDeviceEnablePeerAccess(devices[j], 0);
-        }
-      }
-    }
-  });
-#endif
-}
 void InitCupti() {
 #ifdef PADDLE_WITH_CUPTI
@@ -144,7 +122,7 @@ void InitCupti() {
 #endif
 }
-void InitDevices(bool init_p2p) {
+void InitDevices() {
   // CUPTI attribute should be set before any CUDA context is created (see CUPTI
   // documentation about CUpti_ActivityAttribute).
   InitCupti();
@@ -166,10 +144,10 @@ void InitDevices(bool init_p2p) {
     LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
   }
 #endif
-  InitDevices(init_p2p, devices);
+  InitDevices(devices);
 }
-void InitDevices(bool init_p2p, const std::vector<int> devices) {
+void InitDevices(const std::vector<int> devices) {
   std::vector<platform::Place> places;
   for (size_t i = 0; i < devices.size(); ++i) {
@@ -187,9 +165,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
     places.emplace_back(platform::XPUPlace(devices[i]));
 #endif
   }
-  if (init_p2p) {
-    InitP2P(devices);
-  }
   places.emplace_back(platform::CPUPlace());
 #ifdef PADDLE_WITH_CUDA
   places.emplace_back(platform::CUDAPinnedPlace());
...
@@ -35,9 +35,9 @@ bool InitGflags(std::vector<std::string> argv);
 void InitGLOG(const std::string& prog_name);
-void InitDevices(bool init_p2p);
-void InitDevices(bool init_p2p, const std::vector<int> devices);
+void InitDevices();
+void InitDevices(const std::vector<int> devices);
 #ifndef _WIN32
 class SignalMessageDumper {
...
@@ -22,7 +22,7 @@ TEST(InitDevices, CPU) {
   using paddle::platform::DeviceContextPool;
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
-  InitDevices(true);
+  InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U);
 #endif
@@ -34,7 +34,7 @@ TEST(InitDevices, CUDA) {
 #ifdef PADDLE_WITH_CUDA
   int count = paddle::platform::GetCUDADeviceCount();
-  InitDevices(true);
+  InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 2U + static_cast<unsigned>(count));
 #endif
@@ -46,7 +46,7 @@ TEST(InitDevices, XPU) {
 #ifdef PADDLE_WITH_XPU
   int count = paddle::platform::GetXPUDeviceCount();
-  InitDevices(true);
+  InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
 #endif
...
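The expected pool sizes in these assertions follow directly from InitDevices as shown above: it always registers a CPUPlace, additionally registers a CUDAPinnedPlace when built with CUDA (hence 2U plus one context per GPU), and adds one place per detected XPU (hence 1U plus the XPU count).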
@@ -1715,7 +1715,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_glog", framework::InitGLOG);
   m.def("load_op_library", framework::LoadOpLib);
   m.def("init_devices",
-        [](bool init_p2p) { framework::InitDevices(init_p2p); });
+        []() { framework::InitDevices(); });
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
...
@@ -55,7 +55,7 @@ std::unique_ptr<paddle::framework::ProgramDesc> Load(
 }  // namespace paddle
 int main() {
-  paddle::framework::InitDevices(false);
+  paddle::framework::InitDevices();
   const auto cpu_place = paddle::platform::CPUPlace();
...
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
                     platform::errors::InvalidArgument(
                         "At least one file to train, but received number of file is %d.",
                         file_vec.size()));
-  paddle::framework::InitDevices(false);
+  paddle::framework::InitDevices();
   const auto cpu_place = paddle::platform::CPUPlace();
   paddle::framework::Executor executor(cpu_place);
   paddle::framework::Scope scope;
...
@@ -33,7 +33,7 @@ DEFINE_string(dirname, "", "Directory of the train model.");
 namespace paddle {
 void Train(std::string model_dir) {
-  framework::InitDevices(false);
+  framework::InitDevices();
   const auto cpu_place = platform::CPUPlace();
   framework::Executor executor(cpu_place);
   framework::Scope scope;
...
@@ -121,7 +121,7 @@ int main(int argc, char** argv) {
   int internal_argc = internal_argv.size();
   char** arr = internal_argv.data();
   paddle::platform::ParseCommandLineFlags(internal_argc, arr, true);
-  paddle::framework::InitDevices(true);
+  paddle::framework::InitDevices();
   int ret = RUN_ALL_TESTS();
...
@@ -254,7 +254,7 @@ def __bootstrap__():
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
-    core.init_devices(not in_test)
+    core.init_devices()
     # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
...