Unverified commit cee7a3db, authored by ShenLiang, committed by GitHub

fix bug of reducer in best_fit (#48668)

Parent 89f024e3
@@ -17,10 +17,16 @@
 #include "paddle/phi/backends/device_manager.h"
 DECLARE_bool(use_stream_safe_cuda_allocator);
+DECLARE_string(allocator_strategy);
 namespace paddle {
 namespace distributed {
+static bool IsStreamSafeAllocator() {
+  return FLAGS_allocator_strategy == "auto_growth" &&
+         FLAGS_use_stream_safe_cuda_allocator;
+}
 static Backend TransToBackend(platform::Place place) {
   static const std::map<phi::AllocationType, Backend> type_backend = {
       {phi::AllocationType::GPU, Backend::GPU},
@@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
   }
 }
-void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
+void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto &gpu_context = static_cast<const phi::GPUContext &>(context);
     SplitTensorsWithType(
         gpu_context, &dense_contents_, &dense_tensors_, dtype_);
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (IsStreamSafeAllocator()) {
       auto dense_tensor =
           std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
       VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
@@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
   for (auto &group : groups_) {
     if (!group.is_sparse_) {
       group.task->Synchronize();
-    }
-  }
-
-  for (auto &group : groups_) {
-    if (!group.is_sparse_) {
-      group.dense_contents_.reset();
+      if (!IsStreamSafeAllocator()) {
+        auto *default_ctx =
+            platform::DeviceContextPool::Instance().Get(inner_place_);
+        group.SplitTensors(*default_ctx);
+      }
     }
   }
@@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->task = process_group_->AllReduce(in_out, in_out, opts);
   auto *context = process_group_->GetDeviceContext(inner_place_);
-  group->SplitTensorsDev(*context);
-  group->task->UpdateWaitChain(*context);
-  // split in FinalizeBackward()
+  if (IsStreamSafeAllocator()) {
+    // NOTE(shenliang03): The best_fit allocator strategy is multi-stream
+    // insecure. In the Split operator, additional memory will be applied for
+    // calculation, and if it is asynchronous, an illegal memory access may be
+    // encountered.
+    group->SplitTensors(*context);
+    group->task->UpdateWaitChain(*context);
+  }
 }

 void EagerReducer::AllReduceSparse(EagerGroup *group,
......
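To make the control flow above easier to follow: under the stream-safe auto_growth allocator the split is issued right after the all-reduce on the communication stream, while under naive_best_fit it is deferred to FinalizeBackward and runs on the default stream only after the task has been synchronized. Below is a minimal plain-Python sketch of that scheduling decision, not the implementation; only the two FLAGS_* names and the strategy strings come from the diff, everything else is illustrative.

```python
# Toy model (no Paddle dependency) of the scheduling decision in this commit.

def is_stream_safe_allocator(flags):
    # Mirrors the new IsStreamSafeAllocator() helper in the C++ diff above.
    return (flags.get("FLAGS_allocator_strategy") == "auto_growth"
            and flags.get("FLAGS_use_stream_safe_cuda_allocator", False))


def fused_all_reduce_schedule(flags, group):
    group["allreduce_issued"] = True  # stands in for process_group_->AllReduce
    if is_stream_safe_allocator(flags):
        # Stream-safe path: split immediately on the communication stream; the
        # stream-safe auto_growth allocator tracks cross-stream usage.
        group["split"] = "async on comm stream"


def finalize_backward(flags, group):
    assert group["allreduce_issued"]  # stands in for group.task->Synchronize()
    if not is_stream_safe_allocator(flags):
        # best_fit path: split only after synchronization, on the default
        # stream, so the Split kernel's temporary allocations cannot race
        # with the communication stream.
        group["split"] = "after sync on default stream"


if __name__ == "__main__":
    for strategy in ("auto_growth", "naive_best_fit"):
        flags = {"FLAGS_allocator_strategy": strategy,
                 "FLAGS_use_stream_safe_cuda_allocator": True}
        group = {"allreduce_issued": False, "split": None}
        fused_all_reduce_schedule(flags, group)
        finalize_backward(flags, group)
        print(f"{strategy}: split happens {group['split']}")
```

Either way the split happens exactly once per group; the commit only changes where it is scheduled depending on the allocator strategy.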
@@ -75,7 +75,7 @@ class EagerGroup {
   // context is used to select the stream for split
-  void SplitTensorsDev(const platform::DeviceContext &);
+  void SplitTensors(const platform::DeviceContext &);
   friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
 };
......
@@ -103,6 +103,7 @@ def start_local_trainers(
     training_script,
     training_script_args,
     eager_mode=True,
+    allocator_strategy="auto_growth",
     log_dir=None,
 ):
     current_env = copy.copy(os.environ.copy())
@@ -126,6 +127,10 @@ def start_local_trainers(
         if not eager_mode:
             proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
+        proc_env["FLAGS_allocator_strategy"] = allocator_strategy
+        if allocator_strategy == "auto_growth":
+            proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
         current_env.update(proc_env)
         print("trainer proc env:{}".format(current_env))
@@ -153,7 +158,12 @@ def start_local_trainers(
 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
+    def run_mnist_2gpu(
+        self,
+        target_file_name,
+        eager_mode=True,
+        allocator_strategy="auto_growth",
+    ):
         if (
             not fluid.core.is_compiled_with_cuda()
             or fluid.core.get_cuda_device_count() == 0
@@ -170,6 +180,7 @@ class TestMultipleGpus(unittest.TestCase):
             cluster,
             pod,
             eager_mode=eager_mode,
+            allocator_strategy=allocator_strategy,
             training_script=target_file_name,
             training_script_args=[],
         )
@@ -218,6 +229,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus):
         self.run_mnist_2gpu(
             'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False
         )
+        self.run_mnist_2gpu(
+            'parallel_dygraph_dataparallel_with_pylayer.py',
+            allocator_strategy="naive_best_fit",
+        )

 class TestGradientCheckInEagerMode(TestMultipleGpus):
......
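The test hooks above pass the allocator strategy to the spawned trainer processes through environment variables. As a usage sketch of the same idea outside the test harness: the FLAGS_* variable names and the 0.1 fraction mirror the diff, while the launch_trainer helper and the subprocess call are illustrative only.

```python
import copy
import os
import subprocess


def launch_trainer(script, allocator_strategy="auto_growth"):
    """Toy launcher mirroring the env handling added to start_local_trainers."""
    env = copy.copy(os.environ.copy())
    env["FLAGS_allocator_strategy"] = allocator_strategy
    if allocator_strategy == "auto_growth":
        # Same knob the test sets to keep the auto_growth memory pool small.
        env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
    return subprocess.Popen(["python", script], env=env)


# Example: exercise the naive_best_fit path that this commit fixes.
# launch_trainer("parallel_dygraph_dataparallel_with_pylayer.py",
#                allocator_strategy="naive_best_fit")
```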