未验证 提交 cee7a3db 编写于 作者: S ShenLiang 提交者: GitHub

fix bug of reducer in best_fit (#48668)

上级 89f024e3
......@@ -17,10 +17,16 @@
#include "paddle/phi/backends/device_manager.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_string(allocator_strategy);
namespace paddle {
namespace distributed {
// Whether the current allocator configuration is safe for multi-stream use:
// only the "auto_growth" strategy combined with the stream-safe CUDA
// allocator guarantees cross-stream memory safety. Other strategies (e.g.
// naive_best_fit) must synchronize before freeing fused buffers.
static bool IsStreamSafeAllocator() {
  if (FLAGS_allocator_strategy != "auto_growth") {
    return false;
  }
  return FLAGS_use_stream_safe_cuda_allocator;
}
static Backend TransToBackend(platform::Place place) {
static const std::map<phi::AllocationType, Backend> type_backend = {
{phi::AllocationType::GPU, Backend::GPU},
......@@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
}
}
void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto &gpu_context = static_cast<const phi::GPUContext &>(context);
SplitTensorsWithType(
gpu_context, &dense_contents_, &dense_tensors_, dtype_);
if (FLAGS_use_stream_safe_cuda_allocator) {
if (IsStreamSafeAllocator()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
......@@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.task->Synchronize();
}
}
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.dense_contents_.reset();
if (!IsStreamSafeAllocator()) {
auto *default_ctx =
platform::DeviceContextPool::Instance().Get(inner_place_);
group.SplitTensors(*default_ctx);
}
}
}
......@@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
group->task = process_group_->AllReduce(in_out, in_out, opts);
auto *context = process_group_->GetDeviceContext(inner_place_);
group->SplitTensorsDev(*context);
group->task->UpdateWaitChain(*context);
// split in FinalizeBackward()
if (IsStreamSafeAllocator()) {
// NOTE(shenliang03): The best_fit allocator strategy is multi-stream
// insecure. In the Split operator, additional memory will be applied for
// calculation, and if it is asynchronous, an illegal memory access may be
// encountered.
group->SplitTensors(*context);
group->task->UpdateWaitChain(*context);
}
}
void EagerReducer::AllReduceSparse(EagerGroup *group,
......
......@@ -75,7 +75,7 @@ class EagerGroup {
// context is used to select the stream for split
void SplitTensorsDev(const platform::DeviceContext &);
void SplitTensors(const platform::DeviceContext &);
friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};
......
......@@ -103,6 +103,7 @@ def start_local_trainers(
training_script,
training_script_args,
eager_mode=True,
allocator_strategy="auto_growth",
log_dir=None,
):
current_env = copy.copy(os.environ.copy())
......@@ -126,6 +127,10 @@ def start_local_trainers(
if not eager_mode:
proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
proc_env["FLAGS_allocator_strategy"] = allocator_strategy
if allocator_strategy == "auto_growth":
proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
current_env.update(proc_env)
print("trainer proc env:{}".format(current_env))
......@@ -153,7 +158,12 @@ def start_local_trainers(
class TestMultipleGpus(unittest.TestCase):
def run_mnist_2gpu(self, target_file_name, eager_mode=True):
def run_mnist_2gpu(
self,
target_file_name,
eager_mode=True,
allocator_strategy="auto_growth",
):
if (
not fluid.core.is_compiled_with_cuda()
or fluid.core.get_cuda_device_count() == 0
......@@ -170,6 +180,7 @@ class TestMultipleGpus(unittest.TestCase):
cluster,
pod,
eager_mode=eager_mode,
allocator_strategy=allocator_strategy,
training_script=target_file_name,
training_script_args=[],
)
......@@ -218,6 +229,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus):
self.run_mnist_2gpu(
'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False
)
self.run_mnist_2gpu(
'parallel_dygraph_dataparallel_with_pylayer.py',
allocator_strategy="naive_best_fit",
)
class TestGradientCheckInEagerMode(TestMultipleGpus):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册