From 86694ce39fed10fe9815ccb7cd81119efd0d12ab Mon Sep 17 00:00:00 2001 From: XiaociZhang Date: Thu, 6 Jul 2023 10:12:10 +0800 Subject: [PATCH] Revert "[XPU] fix the dataloader problem in RDMA env (#54150)" (#55150) This reverts commit 15c875283bc888ff099d5b84419df694c7794ec3. --- cmake/external/xpu.cmake | 3 ++- python/paddle/io/dataloader/dataloader_iter.py | 16 +--------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 45adc981562..1c01f760b47 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -134,7 +134,8 @@ ExternalProject_Add( ${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} && wget ${XPU_XFT_GET_DEPENCE_URL} && bash - get_xft_dependence.sh ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && bash + get_xft_dependence.sh ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && + WITH_XPTI=${WITH_XPTI} bash ${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL} ${XPU_XPTI_DIR_NAME} DOWNLOAD_NO_PROGRESS 1 diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index c15d3377eb6..0ffe7c46e77 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -427,21 +427,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._shutdown = False def _init_workers(self): - # NOTE(zhangxiaoci): When trained in XPU multi-node RDMA environment, an unexpected - # segmentfault will be raised in dataloader process, where the traceback goes all - # back to a runtime error that dataloader workers exit unexpectedly. Similar problems - # have been discussed that lead to a misbehavior of OpenCV working in multiprocessing - # environment. A possible solution is to change default 'fork' mode of multiprocessing - # start method to 'spawn'. See https://stackoverflow.com/questions/54013846 for details. - # NOTE(zhangxiaoci): Replace multiprocessing with multiprocess since in some training - # environments the former will raise 'AttributeError: Can't pickle local object xxx', - # which is a side effect of changing the default start method. - if paddle.is_compiled_with_xpu(): - import multiprocess as multiprocessing - - multiprocessing.set_start_method('spawn', force=True) - else: - from paddle.incubate import multiprocessing + from paddle.incubate import multiprocessing # multiprocess worker and indice queue list initial as empty self._workers = [] -- GitLab