Unverified commit 1dad8cea authored by gongweibao, committed by GitHub

Fix gpu memory allocation bug. (#28703)

Parent 91bab752
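
Summary of the change: the init_p2p flag is removed from framework::InitDevices, and peer-to-peer (P2P) GPU access is now set up by ParallelExecutor through a new InitP2P(places) helper guarded by std::call_once. The sketch below illustrates the call-site change assembled from the hunks that follow; it is an illustration, not an excerpt of the patch:

// Old signatures (removed by this commit):
//   void InitDevices(bool init_p2p);
//   void InitDevices(bool init_p2p, const std::vector<int> devices);
// New signatures:
void InitDevices();                                 // all visible devices
void InitDevices(const std::vector<int> devices);   // only the listed device ids
// Typical call sites change from InitDevices(true) / InitDevices(false, {0})
// to InitDevices() / InitDevices({0}); P2P setup moves to ParallelExecutor::InitP2P.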
......@@ -103,7 +103,7 @@ static void BuildVar(const std::string& param_name,
}
TEST(Operator, CPUtoGPU) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;
......
......@@ -26,7 +26,7 @@ __global__ void test(size_t* a, int size) {
}
TEST(LoD, data) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
......@@ -42,7 +42,7 @@ TEST(LoD, data) {
}
TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0);
......
......@@ -76,7 +76,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
paddle::framework::OpWithoutKernelCheckerMaker);
TEST(OperatorBase, all) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("test_operator");
BuildVar("input", {"IN1"}, op_desc.add_inputs());
......@@ -228,7 +228,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
// test with single input
TEST(OpKernel, all) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_kernel");
BuildVar("x", {"IN1"}, op_desc.add_inputs());
......@@ -268,7 +268,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
// test with multi inputs
TEST(OpKernel, multi_inputs) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel");
......@@ -419,7 +419,7 @@ REGISTER_OP_CPU_KERNEL(indicate_other_data_type_test,
paddle::platform::CPUDeviceContext, int>);
TEST(IndicateVarDataTypeTest, lodtensor) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("indicate_lod_tensor_data_type_test");
BuildVar("LoDTensor", {"lodtensor_1"}, op_desc.add_inputs());
......@@ -447,7 +447,7 @@ TEST(IndicateVarDataTypeTest, lodtensor) {
}
TEST(IndicateVarDataTypeTest, selectedrows) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("indicate_selected_rows_data_type_test");
BuildVar("SelectedRows", {"selected_rows_1"}, op_desc.add_inputs());
......@@ -474,7 +474,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) {
}
TEST(IndicateVarDataTypeTest, other) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("indicate_other_data_type_test");
BuildVar("Other", {"lod_rank_table_1"}, op_desc.add_inputs());
......@@ -504,7 +504,7 @@ TEST(IndicateVarDataTypeTest, other) {
}
TEST(ExecutionContextAttrAndInOut, new_api) {
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("test_operator");
BuildVar("input", {"IN1"}, op_desc.add_inputs());
......@@ -596,7 +596,7 @@ REGISTER_OP_CPU_KERNEL(set_lod_level_test,
paddle::platform::CPUDeviceContext, float>);
void SetGetLoDLevelTestMain(std::string op_type) {
paddle::framework::InitDevices(false, {});
paddle::framework::InitDevices({});
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type(op_type);
BuildVar("X", {"x.0"}, op_desc.add_inputs());
......@@ -701,7 +701,7 @@ REGISTER_OP_CPU_KERNEL(op_without_unused_var,
TEST(OpWithUnusedVar, all) {
// enable the unused_var_check
FLAGS_enable_unused_var_check = true;
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_unused_var");
BuildVar("X", {"X"}, op_desc.add_inputs());
......@@ -726,7 +726,7 @@ TEST(OpWithoutUnusedVar, all) {
// enable the unused_var_check
FLAGS_enable_unused_var_check = true;
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_without_unused_var");
BuildVar("X", {"X"}, op_desc.add_inputs());
......
......@@ -36,6 +36,10 @@ limitations under the License. */
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
DECLARE_double(eager_delete_tensor_gb);
#ifdef WITH_GPERFTOOLS
......@@ -55,6 +59,10 @@ static std::once_flag gProfileOnce;
static bool gProfileStarted = false;
#endif
#ifdef PADDLE_WITH_CUDA
std::once_flag p2p_init_flag;
#endif
class ParallelExecutorPrivate {
public:
ParallelExecutorPrivate(const std::vector<platform::Place> &places,
......@@ -458,6 +466,41 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
return executor && executor->NeedCreateLocalExeScope();
}
void InitP2P(const std::vector<platform::Place> &places) {
#ifdef PADDLE_WITH_CUDA
std::call_once(p2p_init_flag, [&]() {
int count = places.size();
if (count <= 1) return;
std::vector<int> devices;
for (int i = 0; i < count; i++) {
if (!is_gpu_place(places[i])) return;
platform::CUDAPlace device =
BOOST_GET_CONST(platform::CUDAPlace, places[i]);
devices.push_back(device.GetDeviceId());
}
for (int i = 0; i < count; ++i) {
for (int j = 0; j < count; ++j) {
if (devices[i] == devices[j]) continue;
int can_acess = -1;
cudaError_t ret =
cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
if (ret != cudaSuccess || can_acess != 1) {
LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
<< " to " << devices[j];
} else {
platform::CUDADeviceGuard guard(devices[i]);
cudaDeviceEnablePeerAccess(devices[j], 0);
}
}
}
VLOG(1) << "init p2p";
});
#endif
}
ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name,
......@@ -470,6 +513,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
platform::errors::Unavailable(
"XPU is not supported in ParallelExecutor"));
InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
member_->use_cuda_ = exec_strategy.use_cuda_;
......
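
The InitP2P helper added above follows the standard CUDA peer-access pattern: query cudaDeviceCanAccessPeer for every ordered device pair and, where supported, enable the mapping with cudaDeviceEnablePeerAccess while the source device is current. Below is a self-contained sketch of that pattern for reference; it is illustrative only (Paddle's version additionally checks is_gpu_place, uses platform::CUDADeviceGuard, and wraps the whole loop in std::call_once as shown in the hunk above):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Enable peer access between every ordered pair of the given CUDA devices.
void EnablePeerAccess(const std::vector<int>& devices) {
  for (int src : devices) {
    for (int dst : devices) {
      if (src == dst) continue;
      int can_access = 0;
      if (cudaDeviceCanAccessPeer(&can_access, src, dst) != cudaSuccess ||
          can_access != 1) {
        std::printf("P2P access from %d to %d is unavailable\n", src, dst);
        continue;
      }
      // Peer access is enabled from the currently active device.
      cudaSetDevice(src);
      cudaError_t err = cudaDeviceEnablePeerAccess(dst, 0);
      if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
        std::printf("Failed to enable P2P from %d to %d\n", src, dst);
      }
    }
  }
}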
......@@ -174,7 +174,7 @@ bool AnalysisPredictor::PrepareScope(
scope_ = parent_scope;
status_is_cloned_ = true;
} else {
paddle::framework::InitDevices(false);
paddle::framework::InitDevices();
scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) {
delete scope;
#ifdef PADDLE_WITH_CUDA
......
......@@ -91,7 +91,7 @@ bool NativePaddlePredictor::Init(
platform::errors::PreconditionNotMet(
"The sub_scope should not be nullptr."));
} else {
paddle::framework::InitDevices(false);
paddle::framework::InitDevices();
scope_.reset(new paddle::framework::Scope());
}
......
......@@ -25,7 +25,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
DEFINE_bool(init_p2p, false, "Whether to init p2p.");
DEFINE_int32(math_num_threads, 1,
"Number of threads used to run math functions.");
......@@ -42,7 +41,7 @@ void Init(const std::vector<std::string> argv) {
while (std::getline(tokenStream, token, ',')) {
devices.push_back(std::stoi(token));
}
framework::InitDevices(FLAGS_init_p2p, devices);
framework::InitDevices(devices);
}
void ReadBinaryFile(const std::string& filename, std::string* contents) {
......
......@@ -37,6 +37,7 @@
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -57,7 +57,7 @@ void OpTester::Init(const OpTesterConfig &config) {
place_ = paddle::platform::CPUPlace();
}
framework::InitDevices(false);
framework::InitDevices();
scope_.reset(new paddle::framework::Scope());
op_ = framework::OpRegistry::CreateOp(op_desc_);
......
......@@ -140,7 +140,7 @@ void TestMain(const std::vector<std::string>& input_names,
std::string func_name, std::string cuda_kernel_str,
CPUKernelFunc cpu_kernel_func) {
// Compile the device code
paddle::framework::InitDevices(false, {0});
paddle::framework::InitDevices({0});
platform::CUDAPlace place = platform::CUDAPlace(0);
PrepareDeviceCode(place, func_name, cuda_kernel_str);
......
......@@ -35,7 +35,7 @@ TEST(DeviceCode, cuda) {
return;
}
paddle::framework::InitDevices(false, {0});
paddle::framework::InitDevices({0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
......@@ -90,7 +90,7 @@ TEST(DeviceCodePool, cuda) {
return;
}
paddle::framework::InitDevices(false, {0});
paddle::framework::InitDevices({0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::DeviceCodePool& pool =
paddle::platform::DeviceCodePool::Init({place});
......
......@@ -63,7 +63,6 @@ namespace framework {
std::once_flag gflags_init_flag;
std::once_flag glog_init_flag;
std::once_flag p2p_init_flag;
bool InitGflags(std::vector<std::string> args) {
bool successed = false;
......@@ -95,28 +94,7 @@ bool InitGflags(std::vector<std::string> args) {
return successed;
}
void InitP2P(std::vector<int> devices) {
#ifdef PADDLE_WITH_CUDA
std::call_once(p2p_init_flag, [&]() {
int count = devices.size();
for (int i = 0; i < count; ++i) {
for (int j = 0; j < count; ++j) {
if (devices[i] == devices[j]) continue;
int can_acess = -1;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]));
if (can_acess != 1) {
VLOG(2) << "Cannot enable P2P access from " << devices[i] << " to "
<< devices[j];
} else {
platform::CUDADeviceGuard guard(devices[i]);
cudaDeviceEnablePeerAccess(devices[j], 0);
}
}
}
});
#endif
}
void InitCupti() {
#ifdef PADDLE_WITH_CUPTI
......@@ -144,7 +122,7 @@ void InitCupti() {
#endif
}
void InitDevices(bool init_p2p) {
void InitDevices() {
// CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute).
InitCupti();
......@@ -166,10 +144,10 @@ void InitDevices(bool init_p2p) {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
}
#endif
InitDevices(init_p2p, devices);
InitDevices(devices);
}
void InitDevices(bool init_p2p, const std::vector<int> devices) {
void InitDevices(const std::vector<int> devices) {
std::vector<platform::Place> places;
for (size_t i = 0; i < devices.size(); ++i) {
......@@ -187,9 +165,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places.emplace_back(platform::XPUPlace(devices[i]));
#endif
}
if (init_p2p) {
InitP2P(devices);
}
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
places.emplace_back(platform::CUDAPinnedPlace());
......
......@@ -35,9 +35,9 @@ bool InitGflags(std::vector<std::string> argv);
void InitGLOG(const std::string& prog_name);
void InitDevices(bool init_p2p);
void InitDevices();
void InitDevices(bool init_p2p, const std::vector<int> devices);
void InitDevices(const std::vector<int> devices);
#ifndef _WIN32
class SignalMessageDumper {
......
......@@ -22,7 +22,7 @@ TEST(InitDevices, CPU) {
using paddle::platform::DeviceContextPool;
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
InitDevices(true);
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
#endif
......@@ -34,7 +34,7 @@ TEST(InitDevices, CUDA) {
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices(true);
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 2U + static_cast<unsigned>(count));
#endif
......@@ -46,7 +46,7 @@ TEST(InitDevices, XPU) {
#ifdef PADDLE_WITH_XPU
int count = paddle::platform::GetXPUDeviceCount();
InitDevices(true);
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
......
......@@ -1715,7 +1715,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_glog", framework::InitGLOG);
m.def("load_op_library", framework::LoadOpLib);
m.def("init_devices",
[](bool init_p2p) { framework::InitDevices(init_p2p); });
[]() { framework::InitDevices(); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
......
......@@ -55,7 +55,7 @@ std::unique_ptr<paddle::framework::ProgramDesc> Load(
} // namespace paddle
int main() {
paddle::framework::InitDevices(false);
paddle::framework::InitDevices();
const auto cpu_place = paddle::platform::CPUPlace();
......
......@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
platform::errors::InvalidArgument(
"At least one file to train, but received number of file is %d.",
file_vec.size()));
paddle::framework::InitDevices(false);
paddle::framework::InitDevices();
const auto cpu_place = paddle::platform::CPUPlace();
paddle::framework::Executor executor(cpu_place);
paddle::framework::Scope scope;
......
......@@ -33,7 +33,7 @@ DEFINE_string(dirname, "", "Directory of the train model.");
namespace paddle {
void Train(std::string model_dir) {
framework::InitDevices(false);
framework::InitDevices();
const auto cpu_place = platform::CPUPlace();
framework::Executor executor(cpu_place);
framework::Scope scope;
......
......@@ -121,7 +121,7 @@ int main(int argc, char** argv) {
int internal_argc = internal_argv.size();
char** arr = internal_argv.data();
paddle::platform::ParseCommandLineFlags(internal_argc, arr, true);
paddle::framework::InitDevices(true);
paddle::framework::InitDevices();
int ret = RUN_ALL_TESTS();
......
......@@ -254,7 +254,7 @@ def __bootstrap__():
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
# don't init_p2p when in unittest to save time.
core.init_devices(not in_test)
core.init_devices()
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
......