fix dataloader exit terminate error (#34501)

* fix DataLoader exit with SIGABRT/SIGSEGV. test=develop

fix dataloader exit terminate error (#34501)
* fix DataLoader exit with SIGABRT/SIGSEGV. test=develop
e93c18a3 · Kaipeng Deng · GitHub · 2df74aa6 · e93c18a3 · e93c18a3
3 changed file
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -43,6 +43,36 @@ from .flat import _flatten_batch, _restore_batch
 __all__ = ['get_worker_info']
+# NOTE: fix `terminate called without an active exception`.
+# If the for loop breaks and the program exits immediately (with
+# no model layers processing) after iterating **the first few
+# data** in distributed launch mode, distributed launch will call
+# terminate() to kill the main process on each device, but the
+# thread is still iterating to fill the blocking queue cache,
+# which may cause the thread error `terminate called without an
+# active exception`, because terminate is a strong signal and
+# `__del__` of DataLoader may not be called. So we keep a global
+# link to the last DataLoader instance and call its `__del__` to
+# clean up resources.
+# NOTE: we cannot simply register `__del__` to CleanupFuncRegistrar,
+# because that would keep a global link to every DataLoader
+# instance, which would prevent GC from collecting DataLoader
+# instances automatically and cause a memory leak.
+_loader = None
+def _clear_loader():
+    global _loader
+    if _loader is not None:
+        try:
+            _loader.__del__()
+            del _loader
+        except:
+            pass
+CleanupFuncRegistrar.register(_clear_loader)
 class _DataLoaderIterBase(object):
    """
@@ -100,6 +130,16 @@ class _DataLoaderIterBase(object):
    def __len__(self):
        return len(self._batch_sampler)
+    def _exit_thread_expectedly(self):
+        self._thread_done_event.set()
+        if self._blocking_queue:
+            self._blocking_queue.close()
+    def _exit_thread_unexpectedly(self):
+        self._thread_done_event.set()
+        if self._blocking_queue:
+            self._blocking_queue.kill()
 class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
    """
@@ -125,9 +165,13 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
        # NOTE: len(self._places) batch data compose as an output
        # iteration; the blocking_queue capacity set below is
        # 1 * len(self._places), i.e. it caches at most 1 iteration
        # of data
-        self._blocking_queue_capacity = 2 * len(self._places)
+        self._blocking_queue_capacity = 1 * len(self._places)
        self._init_thread()
+        self._shutdown = False
+        global _loader
+        _loader = self
    def _init_thread(self):
        self._var_names = [v.name for v in self._feed_list]
@@ -151,22 +195,35 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
        self._thread.start()
    def _thread_loop(self, legacy_expected_place):
-        try:
+        #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
-            #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
+        # and it will call platform::SetDeviceId() in c++ internally.
-            # and it will call platform::SetDeviceId() in c++ internally.
+        # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
-            # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
+        # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda 
-            # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda 
+        # APIs in this thread.
-            # APIs in this thread.
+        _set_expected_place(legacy_expected_place)
-            _set_expected_place(legacy_expected_place)
+        while not self._thread_done_event.is_set():
-            for indices in self._sampler_iter:
+            try:
+                indices = next(self._sampler_iter)
                # read data from dataset in mini-batch
-                batch = self._dataset_fetcher.fetch(indices)
+                # with paddle.fluid.dygraph.guard(place=paddle.CPUPlace()):
+                # read data from dataset in mini-batch
+                batch = self._dataset_fetcher.fetch(indices,
+                                                    self._thread_done_event)
+            except StopIteration:
+                self._exit_thread_expectedly()
+                return
+            if batch is None or self._thread_done_event.is_set(): break
+            # flat batch and record structure infos
+            batch, structure = _flatten_batch(batch)
+            self._structure_infos.append(structure)
-                # flat batch and record structure infos
+            if self._thread_done_event.is_set(): break
-                batch, structure = _flatten_batch(batch)
-                self._structure_infos.append(structure)
+            try:
                # pack as LoDTensorArray
                array = core.LoDTensorArray()
                for slot in batch:
@@ -179,21 +236,18 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
                    array.append(slot)
-                if not self._blocking_queue.push(array):
+                if self._thread_done_event.is_set(): break
-                    break
-                if self._thread_done_event.is_set():
+                try:
-                    break
+                    self._blocking_queue.push(array)
+                except:
+                    self._exit_thread_expectedly()
-            self._blocking_queue.close()
+            except:
-            self._shutdown_thread()
+                self._exit_thread_unexpectedly()
-        except StopIteration:
+                six.reraise(*sys.exc_info())
-            self._blocking_queue.close()
-        except Exception:
+        self._exit_thread_expectedly()
-            self._blocking_queue.kill()
-            self._shutdown_thread()
-            logging.warning("DataLoader reader thread raised an exception.")
-            six.reraise(*sys.exc_info())
    def __next__(self):
        try:
@@ -221,28 +275,46 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
            return data
        except StopIteration:
            self._reader.shutdown()
+            self._try_shutdown_all()
            six.reraise(*sys.exc_info())
    def _shutdown_thread(self):
        if self._thread:
            self._thread_done_event.set()
-            if self._thread is not threading.current_thread():
+            # NOTE: we wait for _thread exit for 3 seconds, if
-                self._thread.join()
+            #       thread has not exited by then, fall back to join()
+            for _ in range(3):
+                if self._thread.is_alive():
+                    time.sleep(1)
+                else:
+                    break
+            else:
+                if self._thread is not threading.current_thread():
+                    self._thread.join()
            self._thread = None
    # python2 compatibility
    def next(self):
        return self.__next__()
+    def _try_shutdown_all(self):
+        if not self._shutdown:
+            try:
+                # _blocking_queue in keep order mode holds sub-threads,
+                # need to release thread resources on unexpected exit
+                if self._blocking_queue:
+                    self._blocking_queue.close()
+                    self._blocking_queue = None
+                # NOTE: the blocking queue should be closed first, because
+                # a blocking queue read may hang and then
+                # _thread_done_event cannot be checked
+                self._shutdown_thread()
+            finally:
+                self._shutdown = True
    def __del__(self):
-        # _blocking_queue in keep order mode holds sub-threads
+        self._try_shutdown_all()
-        # need to release thread resources on unexpected exit
-        if self._blocking_queue:
-            self._blocking_queue.close()
-        # NOTE: blocking queue should be closed firstly for
-        # blocking queue read may hang and _thread_done_event
-        # cannot be checked
-        self._shutdown_thread()
 class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
@@ -421,15 +493,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                core._erase_process_pids(id(self))
                self._shutdown = True
-    def _exit_thread_expectedly(self):
-        self._thread_done_event.set()
-        self._blocking_queue.close()
-    def _exit_thread_unexpectedly(self):
-        self._thread_done_event.set()
-        self._blocking_queue.kill()
-        logging.error("DataLoader reader thread raised an exception!")
    def _thread_loop(self, legacy_expected_place):
        #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
        # and it will call platform::SetDeviceId() in c++ internally.

--- a/python/paddle/fluid/dataloader/fetcher.py
+++ b/python/paddle/fluid/dataloader/fetcher.py
@@ -26,7 +26,16 @@ class _DatasetFetcher(object):
        self.collate_fn = collate_fn
        self.drop_last = drop_last
-    def fetch(self, batch_indices):
+    # NOTE: the fetch function here performs the whole pipeline of dataset
+    #       reading and data transforms of a batch in each call, which
+    #       may take a long time. If the DataLoader exits in the meantime,
+    #       fetch needs to perceive the exit, so we pass done_event
+    #       here for fetch to check the exit status
+    # NOTE: if DataLoader exits by `break`, performing GPU tensor operations,
+    #       e.g. to_tensor, may cause SIGSEGV in the thread, so we pass the
+    #       done_event argument to check DataLoader exit status between
+    #       each sample processing in the batch
+    def fetch(self, batch_indices, done_event=None):
        raise NotImplementedError("'fetch' not implement for class {}".format(
            self.__class__.__name__))
@@ -69,15 +78,18 @@ class _IterableDatasetFetcher(_DatasetFetcher):
            dataset, auto_collate_batch, collate_fn, drop_last)
        self.dataset_iter = iter(dataset)
-    def fetch(self, batch_indices):
+    def fetch(self, batch_indices, done_event=None):
        if self.auto_collate_batch:
            data = []
            for _ in batch_indices:
-                try:
+                if done_event is None or not done_event.is_set():
-                    data.append(next(self.dataset_iter))
+                    try:
-                except StopIteration:
+                        data.append(next(self.dataset_iter))
-                    break
+                    except StopIteration:
+                        break
+                else:
+                    return None
            if len(data) == 0 or (self.drop_last and
                                  len(data) < len(batch_indices)):
@@ -101,9 +113,14 @@ class _MapDatasetFetcher(_DatasetFetcher):
        super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch,
                                                 collate_fn, drop_last)
-    def fetch(self, batch_indices):
+    def fetch(self, batch_indices, done_event=None):
        if self.auto_collate_batch:
-            data = [self.dataset[idx] for idx in batch_indices]
+            data = []
+            for idx in batch_indices:
+                if done_event is None or not done_event.is_set():
+                    data.append(self.dataset[idx])
+                else:
+                    return None
            global _WARNING_TO_LOG
            if not isinstance(data[0], (Sequence, Mapping)) \

--- a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
@@ -43,14 +43,18 @@ class TestDatasetAbstract(unittest.TestCase):
 class TestDatasetWithDiffOutputPlace(unittest.TestCase):
    def get_dataloader(self, num_workers):
        dataset = paddle.vision.datasets.MNIST(
-            mode='test', transform=transforms.ToTensor())
+            mode='test',
+            transform=transforms.Compose([
+                transforms.CenterCrop(20), transforms.RandomResizedCrop(14),
+                transforms.Normalize(), transforms.ToTensor()
+            ]))
        loader = paddle.io.DataLoader(
            dataset, batch_size=32, num_workers=num_workers, shuffle=True)
        return loader
    def run_check_on_cpu(self):
        paddle.set_device('cpu')
-        loader = self.get_dataloader(0)
+        loader = self.get_dataloader(1)
        for image, label in loader:
            self.assertTrue(image.place.is_cpu_place())
            self.assertTrue(label.place.is_cpu_place())
@@ -66,12 +70,7 @@ class TestDatasetWithDiffOutputPlace(unittest.TestCase):
            for image, label in loader:
                self.assertTrue(image.place.is_gpu_place())
                self.assertTrue(label.place.is_cuda_pinned_place())
-                # FIXME(dkp): when input tensor is in GPU place and
+                break
-                # iteration break in the median, it seems the GPU
-                # tensor put into blocking_queue cannot be safely
-                # released and may cause ABRT/SEGV, this should
-                # be fixed
-                # break
    def test_multi_process(self):
        # DataLoader with multi-process mode is not supported on MacOs and Windows currently