From 1dad8ceaabfb7d46d229a67ce54846d583c071de Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 20 Nov 2020 20:50:06 +0800 Subject: [PATCH] Fix gpu memory allocation bug. (#28703) --- .../framework/data_device_transform_test.cu | 2 +- paddle/fluid/framework/lod_tensor_test.cu | 4 +- paddle/fluid/framework/operator_test.cc | 20 ++++----- paddle/fluid/framework/parallel_executor.cc | 44 +++++++++++++++++++ .../fluid/inference/api/analysis_predictor.cc | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/io.cc | 3 +- .../memory/allocation/allocator_facade.cc | 1 + paddle/fluid/operators/benchmark/op_tester.cc | 2 +- .../operators/fused/fusion_group_op_test.cc | 2 +- paddle/fluid/platform/device_code_test.cc | 4 +- paddle/fluid/platform/init.cc | 33 ++------------ paddle/fluid/platform/init.h | 4 +- paddle/fluid/platform/init_test.cc | 6 +-- paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/train/demo/demo_trainer.cc | 2 +- paddle/fluid/train/imdb_demo/demo_trainer.cc | 2 +- .../train/test_train_recognize_digits.cc | 2 +- paddle/testing/paddle_gtest_main.cc | 2 +- python/paddle/fluid/__init__.py | 2 +- 20 files changed, 80 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 9681b33c0af..4e5be2e5350 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -103,7 +103,7 @@ static void BuildVar(const std::string& param_name, } TEST(Operator, CPUtoGPU) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::Scope scope; paddle::platform::CPUPlace cpu_place; diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index 7f0f46b1bb3..d58cfe447e8 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -26,7 +26,7 @@ __global__ void test(size_t* a, int size) { } TEST(LoD, data) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::LoD lod{{0, 1, 2}}; lod.push_back({0, 2, 4, 5}); @@ -42,7 +42,7 @@ TEST(LoD, data) { } TEST(LoDTensor, LoDInGPU) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::LoDTensor lod_tensor; paddle::platform::CUDAPlace place(0); diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 218fc8880bb..36891370016 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -76,7 +76,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator, paddle::framework::OpWithoutKernelCheckerMaker); TEST(OperatorBase, all) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); @@ -228,7 +228,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( // test with single input TEST(OpKernel, all) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_kernel"); BuildVar("x", {"IN1"}, op_desc.add_inputs()); @@ -268,7 +268,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, // test with multi inputs TEST(OpKernel, multi_inputs) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_multi_inputs_with_kernel"); @@ -419,7 +419,7 @@ REGISTER_OP_CPU_KERNEL(indicate_other_data_type_test, paddle::platform::CPUDeviceContext, int>); TEST(IndicateVarDataTypeTest, lodtensor) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("indicate_lod_tensor_data_type_test"); BuildVar("LoDTensor", {"lodtensor_1"}, op_desc.add_inputs()); @@ -447,7 +447,7 @@ TEST(IndicateVarDataTypeTest, lodtensor) { } TEST(IndicateVarDataTypeTest, selectedrows) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("indicate_selected_rows_data_type_test"); BuildVar("SelectedRows", {"selected_rows_1"}, op_desc.add_inputs()); @@ -474,7 +474,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) { } TEST(IndicateVarDataTypeTest, other) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("indicate_other_data_type_test"); BuildVar("Other", {"lod_rank_table_1"}, op_desc.add_inputs()); @@ -504,7 +504,7 @@ TEST(IndicateVarDataTypeTest, other) { } TEST(ExecutionContextAttrAndInOut, new_api) { - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); @@ -596,7 +596,7 @@ REGISTER_OP_CPU_KERNEL(set_lod_level_test, paddle::platform::CPUDeviceContext, float>); void SetGetLoDLevelTestMain(std::string op_type) { - paddle::framework::InitDevices(false, {}); + paddle::framework::InitDevices({}); paddle::framework::proto::OpDesc op_desc; op_desc.set_type(op_type); BuildVar("X", {"x.0"}, op_desc.add_inputs()); @@ -701,7 +701,7 @@ REGISTER_OP_CPU_KERNEL(op_without_unused_var, TEST(OpWithUnusedVar, all) { // enable the unused_var_check FLAGS_enable_unused_var_check = true; - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_unused_var"); BuildVar("X", {"X"}, op_desc.add_inputs()); @@ -726,7 +726,7 @@ TEST(OpWithoutUnusedVar, all) { // enable the unused_var_check FLAGS_enable_unused_var_check = true; - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_without_unused_var"); BuildVar("X", {"X"}, op_desc.add_inputs()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 535ec9cd7d9..d9ddf49f46b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -36,6 +36,10 @@ limitations under the License. */ #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + DECLARE_double(eager_delete_tensor_gb); #ifdef WITH_GPERFTOOLS @@ -55,6 +59,10 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif +#ifdef PADDLE_WITH_CUDA +std::once_flag p2p_init_flag; +#endif + class ParallelExecutorPrivate { public: ParallelExecutorPrivate(const std::vector &places, @@ -458,6 +466,41 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { return executor && executor->NeedCreateLocalExeScope(); } +void InitP2P(const std::vector &places) { +#ifdef PADDLE_WITH_CUDA + std::call_once(p2p_init_flag, [&]() { + int count = places.size(); + if (count <= 1) return; + + std::vector devices; + for (int i = 0; i < count; i++) { + if (!is_gpu_place(places[i])) return; + + platform::CUDAPlace device = + BOOST_GET_CONST(platform::CUDAPlace, places[i]); + devices.push_back(device.GetDeviceId()); + } + + for (int i = 0; i < count; ++i) { + for (int j = 0; j < count; ++j) { + if (devices[i] == devices[j]) continue; + int can_acess = -1; + cudaError_t ret = + cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != cudaSuccess || can_acess != 1) { + LOG(WARNING) << "Cannot enable P2P access from " << devices[i] + << " to " << devices[j]; + } else { + platform::CUDADeviceGuard guard(devices[i]); + cudaDeviceEnablePeerAccess(devices[j], 0); + } + } + } + VLOG(1) << "init p2p"; + }); +#endif +} + ParallelExecutor::ParallelExecutor(const std::vector &places, const std::vector &bcast_vars, const std::string &loss_var_name, @@ -470,6 +513,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]), platform::errors::Unavailable( "XPU is not supported in ParallelExecutor")); + InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); member_->use_cuda_ = exec_strategy.use_cuda_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7bfdb2107c9..ca75e30b9ea 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -174,7 +174,7 @@ bool AnalysisPredictor::PrepareScope( scope_ = parent_scope; status_is_cloned_ = true; } else { - paddle::framework::InitDevices(false); + paddle::framework::InitDevices(); scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { delete scope; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c78cdf24dec..9a5b301fdd4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -91,7 +91,7 @@ bool NativePaddlePredictor::Init( platform::errors::PreconditionNotMet( "The sub_scope should not be nullptr.")); } else { - paddle::framework::InitDevices(false); + paddle::framework::InitDevices(); scope_.reset(new paddle::framework::Scope()); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 84e011c6505..d2bc95e7c3e 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -25,7 +25,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); -DEFINE_bool(init_p2p, false, "Whether to init p2p."); DEFINE_int32(math_num_threads, 1, "Number of threads used to run math functions."); @@ -42,7 +41,7 @@ void Init(const std::vector argv) { while (std::getline(tokenStream, token, ',')) { devices.push_back(std::stoi(token)); } - framework::InitDevices(FLAGS_init_p2p, devices); + framework::InitDevices(devices); } void ReadBinaryFile(const std::string& filename, std::string* contents) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4515dba4363..03c252909d9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -37,6 +37,7 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/gpu_info.h" #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 654df5ccd5e..e01b66b7a12 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -57,7 +57,7 @@ void OpTester::Init(const OpTesterConfig &config) { place_ = paddle::platform::CPUPlace(); } - framework::InitDevices(false); + framework::InitDevices(); scope_.reset(new paddle::framework::Scope()); op_ = framework::OpRegistry::CreateOp(op_desc_); diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc index d50c829b475..55b4dce4929 100644 --- a/paddle/fluid/operators/fused/fusion_group_op_test.cc +++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc @@ -140,7 +140,7 @@ void TestMain(const std::vector& input_names, std::string func_name, std::string cuda_kernel_str, CPUKernelFunc cpu_kernel_func) { // Compile the device code - paddle::framework::InitDevices(false, {0}); + paddle::framework::InitDevices({0}); platform::CUDAPlace place = platform::CUDAPlace(0); PrepareDeviceCode(place, func_name, cuda_kernel_str); diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 93315320580..93bccd5cb85 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -35,7 +35,7 @@ TEST(DeviceCode, cuda) { return; } - paddle::framework::InitDevices(false, {0}); + paddle::framework::InitDevices({0}); paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code); @@ -90,7 +90,7 @@ TEST(DeviceCodePool, cuda) { return; } - paddle::framework::InitDevices(false, {0}); + paddle::framework::InitDevices({0}); paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0); paddle::platform::DeviceCodePool& pool = paddle::platform::DeviceCodePool::Init({place}); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index a594044e9bc..a3e035a8125 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -63,7 +63,6 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag glog_init_flag; -std::once_flag p2p_init_flag; bool InitGflags(std::vector args) { bool successed = false; @@ -95,28 +94,7 @@ bool InitGflags(std::vector args) { return successed; } -void InitP2P(std::vector devices) { -#ifdef PADDLE_WITH_CUDA - std::call_once(p2p_init_flag, [&]() { - int count = devices.size(); - for (int i = 0; i < count; ++i) { - for (int j = 0; j < count; ++j) { - if (devices[i] == devices[j]) continue; - int can_acess = -1; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j])); - if (can_acess != 1) { - VLOG(2) << "Cannot enable P2P access from " << devices[i] << " to " - << devices[j]; - } else { - platform::CUDADeviceGuard guard(devices[i]); - cudaDeviceEnablePeerAccess(devices[j], 0); - } - } - } - }); -#endif -} + void InitCupti() { #ifdef PADDLE_WITH_CUPTI @@ -144,7 +122,7 @@ void InitCupti() { #endif } -void InitDevices(bool init_p2p) { +void InitDevices() { // CUPTI attribute should be set before any CUDA context is created (see CUPTI // documentation about CUpti_ActivityAttribute). InitCupti(); @@ -166,10 +144,10 @@ void InitDevices(bool init_p2p) { LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; } #endif - InitDevices(init_p2p, devices); + InitDevices(devices); } -void InitDevices(bool init_p2p, const std::vector devices) { +void InitDevices(const std::vector devices) { std::vector places; for (size_t i = 0; i < devices.size(); ++i) { @@ -187,9 +165,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::XPUPlace(devices[i])); #endif } - if (init_p2p) { - InitP2P(devices); - } places.emplace_back(platform::CPUPlace()); #ifdef PADDLE_WITH_CUDA places.emplace_back(platform::CUDAPinnedPlace()); diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 5bd5a640ade..cd5ef843fa8 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -35,9 +35,9 @@ bool InitGflags(std::vector argv); void InitGLOG(const std::string& prog_name); -void InitDevices(bool init_p2p); +void InitDevices(); -void InitDevices(bool init_p2p, const std::vector devices); +void InitDevices(const std::vector devices); #ifndef _WIN32 class SignalMessageDumper { diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index f1832206a1a..5866ede4032 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -22,7 +22,7 @@ TEST(InitDevices, CPU) { using paddle::platform::DeviceContextPool; #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) - InitDevices(true); + InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 1U); #endif @@ -34,7 +34,7 @@ TEST(InitDevices, CUDA) { #ifdef PADDLE_WITH_CUDA int count = paddle::platform::GetCUDADeviceCount(); - InitDevices(true); + InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 2U + static_cast(count)); #endif @@ -46,7 +46,7 @@ TEST(InitDevices, XPU) { #ifdef PADDLE_WITH_XPU int count = paddle::platform::GetXPUDeviceCount(); - InitDevices(true); + InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 1U + static_cast(count)); #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a7e3cd82d26..879748c7db7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1715,7 +1715,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_glog", framework::InitGLOG); m.def("load_op_library", framework::LoadOpLib); m.def("init_devices", - [](bool init_p2p) { framework::InitDevices(init_p2p); }); + []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_xpu", IsCompiledWithXPU); diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index 1ef98720f83..830f00b8db1 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -55,7 +55,7 @@ std::unique_ptr Load( } // namespace paddle int main() { - paddle::framework::InitDevices(false); + paddle::framework::InitDevices(); const auto cpu_place = paddle::platform::CPUPlace(); diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc index a08069a57ca..6272478deae 100644 --- a/paddle/fluid/train/imdb_demo/demo_trainer.cc +++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) { platform::errors::InvalidArgument( "At least one file to train, but received number of file is %d.", file_vec.size())); - paddle::framework::InitDevices(false); + paddle::framework::InitDevices(); const auto cpu_place = paddle::platform::CPUPlace(); paddle::framework::Executor executor(cpu_place); paddle::framework::Scope scope; diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index fb993439bb8..7a980cbac8b 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -33,7 +33,7 @@ DEFINE_string(dirname, "", "Directory of the train model."); namespace paddle { void Train(std::string model_dir) { - framework::InitDevices(false); + framework::InitDevices(); const auto cpu_place = platform::CPUPlace(); framework::Executor executor(cpu_place); framework::Scope scope; diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 5400c55a0b1..eb038fb98d6 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -121,7 +121,7 @@ int main(int argc, char** argv) { int internal_argc = internal_argv.size(); char** arr = internal_argv.data(); paddle::platform::ParseCommandLineFlags(internal_argc, arr, true); - paddle::framework::InitDevices(true); + paddle::framework::InitDevices(); int ret = RUN_ALL_TESTS(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 74b56b842cf..7865dc04e3f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -254,7 +254,7 @@ def __bootstrap__(): core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) # don't init_p2p when in unittest to save time. - core.init_devices(not in_test) + core.init_devices() # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py. -- GitLab