From 15c875283bc888ff099d5b84419df694c7794ec3 Mon Sep 17 00:00:00 2001 From: XiaociZhang Date: Wed, 28 Jun 2023 11:14:08 +0800 Subject: [PATCH] [XPU] fix the dataloader problem in RDMA env (#54150) * [kunlun] fix the dataloader problem in RDMA env When running multi-machine training with Paddle DataLoader, an unexpected segmentation fault is raised in the DataLoader process, and the traceback leads back to a runtime error reporting that the dataloader workers exited unexpectedly. Similar problems have been discussed and traced to OpenCV misbehaving in a multiprocessing environment. See https://stackoverflow.com/questions/54013846/pytorch-dataloader-stucked-if-using-opencv-resize-method * code style * fix 'RuntimeError: context has already been set' * Update dataloader_iter.py The spawn start method raises the error 'Can't pickle local object' in some situations * code format check * code style --- python/paddle/io/dataloader/dataloader_iter.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index 0ffe7c46e77..c15d3377eb6 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -427,7 +427,21 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._shutdown = False def _init_workers(self): - from paddle.incubate import multiprocessing + # NOTE(zhangxiaoci): When trained in XPU multi-node RDMA environment, an unexpected + # segmentfault will be raised in dataloader process, where the traceback goes all + # back to a runtime error that dataloader workers exit unexpectedly. Similar problems + # have been discussed that lead to a misbehavior of OpenCV working in multiprocessing + # environment. A possible solution is to change default 'fork' mode of multiprocessing + # start method to 'spawn'. See https://stackoverflow.com/questions/54013846 for details. 
+ # NOTE(zhangxiaoci): Replace multiprocessing with multiprocess since in some training + # environments the former will raise 'AttributeError: Can't pickle local object xxx', + # which is a side effect of changing the default start method. + if paddle.is_compiled_with_xpu(): + import multiprocess as multiprocessing + + multiprocessing.set_start_method('spawn', force=True) + else: + from paddle.incubate import multiprocessing # multiprocess worker and indice queue list initial as empty self._workers = [] -- GitLab