From 7d8d573453d6fbb890a5ac386bcc4666e1cda7b1 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 2 Mar 2020 14:24:14 +0800
Subject: [PATCH] Speed up dygraph DataLoader based on shared memory and
 LoDTensor serialization (#22541)

* add lodtensor share memory & serialization, test=develop

* fix windows compile error, test=develop

* deal vartype pickle & fix unittest matching error message, test=develop

* update timeout variable name, test=develop

* refactor memory map implement, test=develop

* clear mmap file discripter when exit unexpectedly, test=develop

* remove the child process fd in advance, test=develop

* remove mmap fds after Queue.put in child process, test=develop

* add hard unittests for register exit func, test=develop

* fix python2 compatibility problem in unittest, test=develop

* fix exception unittest error, test=develop

* polish code based review comment, test=develop
---
 paddle/fluid/memory/allocation/CMakeLists.txt |   5 +
 .../fluid/memory/allocation/mmap_allocator.cc | 142 +++++++++++++
 .../fluid/memory/allocation/mmap_allocator.h  |  90 ++++++++
 .../memory/allocation/mmap_allocator_test.cc  |  54 +++++
 paddle/fluid/pybind/CMakeLists.txt            |   1 +
 paddle/fluid/pybind/imperative.cc             |  93 +++++++--
 paddle/fluid/pybind/pybind.cc                 |  52 +++++
 python/paddle/fluid/core.py                   |   6 +
 python/paddle/fluid/reader.py                 | 197 +++++++++++++-----
 .../fluid/tests/unittests/CMakeLists.txt      |   2 +
 .../unittests/test_imperative_data_loader.py  |   2 +-
 .../test_imperative_data_loader_exception.py  |   6 +-
 .../test_imperative_data_loader_exit_func.py  |  84 ++++++++
 .../test_imperative_data_loader_fds_clear.py  |  78 +++++++
 .../test_imperative_data_loader_process.py    |   5 +-
 15 files changed, 745 insertions(+), 72 deletions(-)
 create mode 100644 paddle/fluid/memory/allocation/mmap_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/mmap_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/mmap_allocator_test.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py

diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index dc3d9a1f56..dc26c19cbc 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -66,3 +66,8 @@ cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.c
 cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
+
+if(NOT WIN32)
+  cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
+  cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
+endif(NOT WIN32)
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc
new file mode 100644
index 0000000000..0ef084bafd
--- /dev/null
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
@@ -0,0 +1,142 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _WIN32
+
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <random>
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
+  PADDLE_ENFORCE_NE(
+      munmap(this->ptr(), this->size()), -1,
+      platform::errors::Unavailable("could not unmap the shared memory file %s",
+                                    this->ipc_name()));
+}
+
+MemoryMapReaderAllocation::~MemoryMapReaderAllocation() {
+  PADDLE_ENFORCE_NE(
+      munmap(this->ptr(), this->size()), -1,
+      platform::errors::Unavailable("could not unmap the shared memory file %s",
+                                    this->ipc_name()));
+  /* Here we do not pay attention to the result of shm_unlink,
+     because the memory mapped file may have been cleared due to the
+     MemoryMapFdSet::Clear() */
+  shm_unlink(this->ipc_name().c_str());
+  MemoryMapFdSet::Instance().Remove(this->ipc_name());
+  VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name();
+}
+
+std::string GetIPCName() {
+  static std::random_device rd;
+  std::string handle = "/paddle_";
+#ifdef _WIN32
+  handle += std::to_string(GetCurrentProcessId());
+#else
+  handle += std::to_string(getpid());
+#endif
+  handle += "_";
+  handle += std::to_string(rd());
+  return std::move(handle);
+}
+
+std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
+    size_t size) {
+  const std::string &ipc_name = GetIPCName();
+  int flags = O_RDWR | O_CREAT;
+
+  int fd = shm_open(ipc_name.c_str(), flags, 0644);
+  PADDLE_ENFORCE_NE(
+      fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
+                                            ipc_name.c_str()));
+  PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0,
+                    platform::errors::Unavailable(
+                        "Fruncate a file to a specified length failed!"));
+
+  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  PADDLE_ENFORCE_NE(ptr, MAP_FAILED,
+                    platform::errors::Unavailable(
+                        "Memory map failed when create shared memory."));
+  close(fd);
+
+  return std::make_shared<MemoryMapWriterAllocation>(ptr, size, ipc_name);
+}
+
+std::shared_ptr<MemoryMapReaderAllocation> RebuildMemoryMapReaderAllocation(
+    const std::string &ipc_name, size_t size) {
+  int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644);
+  PADDLE_ENFORCE_NE(
+      fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
+                                            ipc_name.c_str()));
+
+  void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
+  PADDLE_ENFORCE_NE(ptr, MAP_FAILED,
+                    platform::errors::Unavailable(
+                        "Memory map failed when rebuild shared memory."));
+  close(fd);
+  return std::make_shared<MemoryMapReaderAllocation>(ptr, size, ipc_name);
+}
+
+MemoryMapFdSet &MemoryMapFdSet::Instance() {  // NOLINT
+  static MemoryMapFdSet set;
+  return set;
+}
+
+void MemoryMapFdSet::Insert(const std::string &ipc_name) {
+  std::lock_guard<std::mutex> guard(mtx_);
+  fd_set_.emplace(ipc_name);
+  VLOG(3) << "PID: " << getpid() << ", MemoryMapFdSet: insert " << ipc_name
+          << ", set size: " << fd_set_.size();
+}
+
+void MemoryMapFdSet::Remove(const std::string &ipc_name) {
+  std::lock_guard<std::mutex> guard(mtx_);
+  fd_set_.erase(ipc_name);
+  VLOG(3) << "PID: " << getpid() << ", MemoryMapFdSet: erase " << ipc_name
+          << ", set size: " << fd_set_.size();
+}
+
+void MemoryMapFdSet::Clear() {
+  VLOG(3) << "PID: " << getpid() << ", MemoryMapFdSet: set size - "
+          << fd_set_.size();
+  std::lock_guard<std::mutex> guard(mtx_);
+  for (auto fd : fd_set_) {
+    int rlt = shm_unlink(fd.c_str());
+    if (rlt == 0) {
+      VLOG(3) << "PID: " << getpid() << ", MemoryMapFdSet: clear " << fd;
+    }
+  }
+  fd_set_.clear();
+}
+
+MemoryMapFdSet::~MemoryMapFdSet() { Clear(); }
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h
new file mode 100644
index 0000000000..3f91e5c427
--- /dev/null
+++ b/paddle/fluid/memory/allocation/mmap_allocator.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifndef _WIN32
+
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class MemoryMapWriterAllocation : public Allocation {
+ public:
+  explicit MemoryMapWriterAllocation(void *ptr, size_t size,
+                                     std::string ipc_name)
+      : Allocation(ptr, size, platform::CPUPlace()),
+        ipc_name_(std::move(ipc_name)) {}
+
+  inline const std::string &ipc_name() const { return ipc_name_; }
+
+  ~MemoryMapWriterAllocation() override;
+
+ private:
+  std::string ipc_name_;
+};
+
+class MemoryMapReaderAllocation : public Allocation {
+ public:
+  explicit MemoryMapReaderAllocation(void *ptr, size_t size,
+                                     std::string ipc_name)
+      : Allocation(ptr, size, platform::CPUPlace()),
+        ipc_name_(std::move(ipc_name)) {}
+
+  inline const std::string &ipc_name() const { return ipc_name_; }
+
+  ~MemoryMapReaderAllocation() override;
+
+ private:
+  std::string ipc_name_;
+};
+
+std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
+    size_t size);
+
+std::shared_ptr<MemoryMapReaderAllocation> RebuildMemoryMapReaderAllocation(
+    const std::string &ipc_name, size_t size);
+
+class MemoryMapFdSet {
+ public:
+  static MemoryMapFdSet &Instance();  // NOLINT
+
+  void Insert(const std::string &ipc_name);
+
+  void Remove(const std::string &ipc_name);
+
+  void Clear();
+
+  ~MemoryMapFdSet();
+
+ private:
+  MemoryMapFdSet() = default;
+
+  std::unordered_set<std::string> fd_set_;
+  std::mutex mtx_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/mmap_allocator_test.cc b/paddle/fluid/memory/allocation/mmap_allocator_test.cc
new file mode 100644
index 0000000000..5b66920be2
--- /dev/null
+++ b/paddle/fluid/memory/allocation/mmap_allocator_test.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _WIN32
+
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
+
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(MemoryMapAllocation, test_allocation_base) {
+  size_t data_size = 4UL * 1024;
+  // 1. allocate writer holader
+  auto mmap_writer_holder = AllocateMemoryMapWriterAllocation(data_size);
+  std::string ipc_name = mmap_writer_holder->ipc_name();
+  // 2. write data
+  auto* writer_ptr = static_cast<int32_t*>(mmap_writer_holder->ptr());
+  for (int32_t i = 0; i < 1024; ++i) {
+    writer_ptr[i] = i;
+  }
+  // 3. create child process
+  pid_t fpid = fork();
+  if (fpid == 0) {
+    // 4. rebuild reader holder
+    auto mmap_reader_holder =
+        RebuildMemoryMapReaderAllocation(ipc_name, data_size);
+    auto* reader_ptr = static_cast<int32_t*>(mmap_reader_holder->ptr());
+    for (int32_t i = 0; i < 1024; ++i) {
+      ASSERT_EQ(reader_ptr[i], i);
+    }
+  }
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 320951debb..87dceb1850 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -9,6 +9,7 @@ endif()
 
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
+  set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
   if (WITH_NCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
   endif()
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index beeae143f5..c9f455d66e 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/imperative/profiler.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/pybind/op_function.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 #include "paddle/fluid/pybind/tensor_py.h"
@@ -220,6 +221,85 @@ void BindImperative(py::module *m_ptr) {
 
   BindOpFunctions(&m);
 
+#ifndef _WIN32
+  // Dygraph DataLoader signal handler
+  m.def("_set_process_pid", [](int64_t key, pid_t pid) {
+    imperative::SetLoadProcessPID(key, pid);
+  });
+  m.def("_erase_process_pid",
+        [](int64_t key) { imperative::EraseLoadProcessPID(key); });
+  m.def("_set_process_signal_handler",
+        []() { imperative::SetLoadProcessSignalHandler(); });
+  m.def("_throw_error_if_process_failed",
+        []() { imperative::ThrowErrorIfLoadProcessFailed(); });
+
+  // Dygraph DataLoader reader process & thread related functions
+  m.def(
+      "_convert_to_tensor_list",
+      [](py::object &obj) -> py::list {
+        // 0. input data check
+        PADDLE_ENFORCE(
+            py::isinstance<py::tuple>(obj) || py::isinstance<py::list>(obj),
+            platform::errors::InvalidArgument(
+                "The batch data read into DataLoader is illegal."
+                "Expected data type is tuple or list, but received %s",
+                obj.get_type()));
+        py::list batch = py::cast<py::list>(obj);
+        py::list tensors;
+        for (size_t i = 0; i < batch.size(); ++i) {
+          // 1. cast to python array
+          auto array = batch[i].cast<py::array>();
+          PADDLE_ENFORCE_NE(
+              string::Sprintf("%s", array.dtype()).compare("object"), 0,
+              platform::errors::InvalidArgument(
+                  "Faild to convert input data to a regular ndarray.\n  * "
+                  "Usually this means the input data contains nested "
+                  "lists with different lengths.\n  * Check the reader "
+                  "function passed to 'set_(sample/sample_list/batch)"
+                  "_generator' to locate the data causes this issue."));
+          // 2. construcct LoDTensor
+          framework::LoDTensor t;
+          SetTensorFromPyArray<platform::CPUPlace>(&t, array,
+                                                   platform::CPUPlace(), true);
+          // 3. allocate shared memory
+          void *data_ptr = t.data<void>();
+          size_t data_size = t.numel() * framework::SizeOfType(t.type());
+          auto shared_writer_holder =
+              memory::allocation::AllocateMemoryMapWriterAllocation(data_size);
+          // 4. maintain mmap fd set & backup ipc_name
+          const std::string &ipc_name = shared_writer_holder->ipc_name();
+          memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
+          // 5. copy data & reset holder
+          memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(),
+                       platform::CPUPlace(), data_ptr, data_size);
+          t.ResetHolder(shared_writer_holder);
+          // 6. append to result list
+          tensors.append(t);
+        }
+        return tensors;
+      },
+      py::return_value_policy::take_ownership);
+
+  m.def("_remove_tensor_list_mmap_fds", [](py::list &tensor_list) {
+    for (size_t i = 0; i < tensor_list.size(); ++i) {
+      auto t = tensor_list[i].cast<framework::LoDTensor>();
+      auto *mmap_writer_allocation =
+          dynamic_cast<memory::allocation::MemoryMapWriterAllocation *>(
+              t.Holder().get());
+      PADDLE_ENFORCE_NOT_NULL(
+          mmap_writer_allocation,
+          platform::errors::NotFound("The shared memory of LoDTensor in "
+                                     "DataLoader's child process has been "
+                                     "released."));
+      memory::allocation::MemoryMapFdSet::Instance().Remove(
+          mmap_writer_allocation->ipc_name());
+    }
+  });
+
+  m.def("_cleanup_mmap_fds",
+        []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
+#endif
+
   py::class_<imperative::detail::BackwardStrategy> backward_strategy(
       m, "BackwardStrategy", R"DOC(
 
@@ -277,19 +357,6 @@ void BindImperative(py::module *m_ptr) {
           imperative::SetCurrentTracer(tracer);
         });
 
-#ifndef _WIN32
-  // Dygraph DataLoader signal handler
-  m.def("_set_process_pid", [](int64_t key, pid_t pid) {
-    imperative::SetLoadProcessPID(key, pid);
-  });
-  m.def("_erase_process_pid",
-        [](int64_t key) { imperative::EraseLoadProcessPID(key); });
-  m.def("_set_process_signal_handler",
-        []() { imperative::SetLoadProcessSignalHandler(); });
-  m.def("_throw_error_if_process_failed",
-        []() { imperative::ThrowErrorIfLoadProcessFailed(); });
-#endif
-
   py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
       m, "VarBase",
       R"DOC()DOC")
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 03078a767b..a0cf35ea29 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -47,6 +47,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -860,7 +861,58 @@ PYBIND11_MODULE(core_noavx, m) {
         }
         dst.set_lod(self.lod());
         return dst;
+#ifdef _WIN32
       });
+#else
+           })
+      .def(py::pickle(
+          [](const LoDTensor &t) {  // __getstate__
+            auto holder = t.Holder();
+            PADDLE_ENFORCE_EQ(
+              platform::is_cpu_place(holder->place()), true,
+              platform::errors::PreconditionNotMet(
+                  "LoDTensor is not on CPU."
+                  "Now only LoDTensor on CPU can be serialized."));
+            auto* mmap_writer_allocation =
+              dynamic_cast<memory::allocation::MemoryMapWriterAllocation *>(
+                holder.get());
+            PADDLE_ENFORCE_NOT_NULL(mmap_writer_allocation,
+              platform::errors::PreconditionNotMet(
+                "LoDTensor is not in shared memory."
+                "Now only LoDTensor on shared memory can be serialized."));
+            int type_idx = static_cast<int>(t.type());
+
+            return py::make_tuple(mmap_writer_allocation->ipc_name(),
+                                  mmap_writer_allocation->size(),
+                                  type_idx, vectorize(t.dims()), t.lod());
+          },
+          [](py::tuple t) {  // __setstate__
+            if (t.size() != 5)
+              throw std::runtime_error("Invalid LoDTensor state!");
+
+            // 1. Create a new C++ instance
+            LoDTensor tensor;
+
+            // 2. Rebuild Allocation
+            const std::string &ipc_name = t[0].cast<std::string>();
+            size_t size = t[1].cast<size_t>();
+            auto shared_reader_holder =
+              memory::allocation::RebuildMemoryMapReaderAllocation(
+                ipc_name, size);
+
+            // 3. Maintain global fd set
+            VLOG(3) << "LoDTensor ipc name: " << ipc_name;
+            memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
+
+            // 4. Rebuild LoDTensor
+            tensor.ResetHolderWithType(shared_reader_holder,
+              static_cast<proto::VarType::Type>(t[2].cast<int>()));
+            tensor.Resize(make_ddim(t[3].cast<std::vector<int>>()));
+            tensor.set_lod(t[4].cast<framework::LoD>());
+
+            return tensor;
+          }));
+#endif
 
   py::class_<SelectedRows>(m, "SelectedRows")
       .def("__init__",
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 067e733094..71ed95a906 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -189,6 +189,9 @@ if avx_supported():
             from .core_avx import _erase_process_pid
             from .core_avx import _set_process_signal_handler
             from .core_avx import _throw_error_if_process_failed
+            from .core_avx import _convert_to_tensor_list
+            from .core_avx import _cleanup_mmap_fds
+            from .core_avx import _remove_tensor_list_mmap_fds
     except Exception as e:
         if has_avx_core:
             raise e
@@ -230,6 +233,9 @@ if load_noavx:
             from .core_noavx import _erase_process_pid
             from .core_noavx import _set_process_signal_handler
             from .core_noavx import _throw_error_if_process_failed
+            from .core_noavx import _convert_to_tensor_list
+            from .core_noavx import _cleanup_mmap_fds
+            from .core_noavx import _remove_tensor_list_mmap_fds
     except Exception as e:
         if has_noavx_core:
             sys.stderr.write(
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index b6723bfe90..ef99b82b02 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -27,6 +27,8 @@ import logging
 from .dataset import DatasetBase, InMemoryDataset
 
 ### Dygraph DataLoader configs ###
+import atexit
+import os
 import multiprocessing
 import signal
 # NOTE: queue has a different name in python2 and python3
@@ -34,9 +36,13 @@ if sys.version_info[0] == 2:
     import Queue as queue
 else:
     import queue
-# NOTE: [ avoid hanging ] These value is used in getting data from another process
-QUEUE_GET_TIMEOUT = 5
-MAX_GET_FAILED_TIME = 12
+# NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process
+QUEUE_GET_TIMEOUT = 60
+
+# NOTE: [ mmap files clear ] If there is still data in the multiprocess queue when the main process finishes reading,
+# the data in the queue needs to be popped. Then the LoDTensor read by the main process
+# from the child process will automatically clear the memory-mapped file.
+multiprocess_queue_set = set()
 
 __all__ = ['PyReader', 'DataLoader']
 
@@ -58,6 +64,82 @@ def _convert_places(places):
     return ret
 
 
+def _clear_multiprocess_queue_set():
+    global multiprocess_queue_set
+    for data_queue in multiprocess_queue_set:
+        while True:
+            try:
+                data_queue.get_nowait()
+            except queue.Empty:
+                break
+
+
+# NOTE: main process clear function at exit
+def _cleanup():
+    # NOTE: inter-process Queue shared memory objects clear function
+    _clear_multiprocess_queue_set()
+    # NOTE: main process memory map files clear funciton
+    core._cleanup_mmap_fds()
+
+
+# NOTE used for register a function to be executed at interpreter exit.
+class CleanupFuncRegistrar():
+    # Record the cleanup functions that have been executed
+    _executed_func_set = set()
+    # Record the cleanup functions that have been registered
+    _registered_func_set = set()
+
+    @classmethod
+    def register(cls, function, signals=[signal.SIGTERM]):
+        def _func_exectuor():
+            if function not in cls._executed_func_set:
+                try:
+                    function()
+                finally:
+                    cls._executed_func_set.add(function)
+
+        def _func_register(function):
+            if not callable(function):
+                raise TypeError("%s is not callable object." % (function))
+            # check function object whether hash-able
+            set([function])
+            if function not in cls._registered_func_set:
+                atexit.register(_func_exectuor)
+                cls._registered_func_set.add(function)
+
+        def _signal_handler(signum=None, frame=None):
+            _func_exectuor()
+            if signum is not None:
+                if signum == signal.SIGINT:
+                    raise KeyboardInterrupt
+                sys.exit(signum)
+
+        def _signal_register(signals):
+            signals = set(signals)
+            for sig in signals:
+                orig_handler = signal.signal(sig, _signal_handler)
+                if orig_handler not in (signal.SIG_DFL, signal.SIG_IGN):
+                    if (sig == signal.SIGINT and
+                            orig_handler is signal.default_int_handler):
+                        continue
+                    if orig_handler not in cls._registered_func_set:
+                        atexit.register(orig_handler)
+                        cls._registered_func_set.add(orig_handler)
+
+        # deal with signals
+        _signal_register(signals)
+        # deal with function
+        _func_register(function)
+
+
+# NOTE: [ mmap files clear ] When the main process exits unexpectedly, the remaining
+# shared memory objects in the inter-process Queue and the main process (mostly in the
+# BlockingQueue) may not be completely released, resulting in the corresponding
+# memory-mapped file remaining on the disk (/dev/shm), so register this function
+# to clean up shared memory objects in these two queues before the python interpreter exits.
+CleanupFuncRegistrar.register(_cleanup)
+
+
 class DataLoaderBase(object):
     def __init__(self):
         self._places = None
@@ -383,6 +465,16 @@ class DygraphGeneratorLoader(DataLoaderBase):
     def iterable(self):
         return self._iterable
 
+    def _clear_and_remove_data_queue(self):
+        if self._data_queue is not None:
+            while True:
+                try:
+                    self._data_queue.get_nowait()
+                except queue.Empty:
+                    break
+            global multiprocess_queue_set
+            multiprocess_queue_set.remove(self._data_queue)
+
     def _wait_thread_ends(self):
         thread = self._thread
         if thread is not None:
@@ -392,12 +484,25 @@ class DygraphGeneratorLoader(DataLoaderBase):
     def _wait_process_ends(self):
         process = self._process
         if process is not None:
-            self._data_queue.cancel_join_thread()
-            self._data_queue.close()
             process.join()
             # erase process id
             core._erase_process_pid(id(self))
 
+    def _set_child_signal_handler(self):
+        core._set_process_pid(id(self), self._process.pid)
+        current_handler = signal.getsignal(signal.SIGCHLD)
+        if not callable(current_handler):
+            current_handler = None
+
+        def __handler__(signum, frame):
+            # NOTE: Here the signum is SIGDHLD, when the child process exits, this handler
+            # will be called whenever the child process exits normally or abnormally.
+            core._throw_error_if_process_failed()
+            if current_handler is not None:
+                current_handler(signum, frame)
+
+        signal.signal(signal.SIGCHLD, __handler__)
+
     def _init_iterable(self):
         self._wait_thread_ends()
         if self._use_multiprocess:
@@ -414,8 +519,13 @@ class DygraphGeneratorLoader(DataLoaderBase):
 
     def _start(self):
         if self._use_multiprocess:
-            # Set data_queue and process
+            # clear old _data_queue and remove it from multiprocess_queue_set
+            self._clear_and_remove_data_queue()
+            # set data_queue and process
             self._data_queue = multiprocessing.Queue(self._capacity)
+            # add _data_queue into global queue set
+            global multiprocess_queue_set
+            multiprocess_queue_set.add(self._data_queue)
             self._process = multiprocessing.Process(
                 target=self._reader_process_loop)
             self._process.daemon = True
@@ -473,28 +583,13 @@ class DygraphGeneratorLoader(DataLoaderBase):
                 " to locate the data causes this issue.\n\t* Please consider using "
                 "'fluid.create_lod_tensor' to convert it to a LoD-Tensor.")
 
-    def _set_child_signal_handler(self):
-        core._set_process_pid(id(self), self._process.pid)
-        current_handler = signal.getsignal(signal.SIGCHLD)
-        if not callable(current_handler):
-            current_handler = None
-
-        def __handler__(signum, frame):
-            core._throw_error_if_process_failed()
-            if current_handler is not None:
-                current_handler(signum, frame)
-
-        signal.signal(signal.SIGCHLD, __handler__)
-
     def _exit_thread_expectedly(self):
         self._thread_done_event.set()
         self._blocking_queue.close()
-        self._data_queue.close()
 
     def _exit_thread_unexpectedly(self):
         self._thread_done_event.set()
         self._blocking_queue.kill()
-        self._data_queue.close()
         logging.error("DataLoader reader thread raised an exception!")
 
     def _reader_process_loop(self):
@@ -502,23 +597,29 @@ class DygraphGeneratorLoader(DataLoaderBase):
             # set signal handler
             core._set_process_signal_handler()
 
-            for sample in self._batch_reader():
-                if sample is None:
-                    raise ValueError(
-                        "Sample in reader is None. Please check whether your dataset is valid."
-                    )
-                self._data_queue.put(sample)
+            # child process clear function at exit
+            def _cleanup():
+                # clear memory map files in child process
+                core._cleanup_mmap_fds()
+
+            # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
+            # some shared memory objects may have been applied for but have not yet
+            # been put into the inter-process Queue. This part of the object needs
+            # to be cleaned up when the process ends.
+            CleanupFuncRegistrar.register(_cleanup)
+
+            for batch in self._batch_reader():
+                tensor_list = core._convert_to_tensor_list(batch)
+                self._data_queue.put(tensor_list)
+                core._remove_tensor_list_mmap_fds(tensor_list)
             self._data_queue.put(None)
         except KeyboardInterrupt:
             # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
             pass
         except:
-            self._data_queue.cancel_join_thread()
-            self._data_queue.close()
             six.reraise(*sys.exc_info())
 
     def _reader_thread_loop_with_process(self):
-        get_sample_try_time = 0
         while not self._thread_done_event.is_set():
             try:
                 # NOTE: [ avoid hanging ] Even with carefully designed data dependencies 
@@ -526,33 +627,25 @@ class DygraphGeneratorLoader(DataLoaderBase):
                 # still happen when data in queue is corrupted (e.g., due to 
                 # Queue.cancel_join_thread or unexpected exit). So we set a timeout whenever 
                 # we try to get data from `data_queue`
-                sample = self._data_queue.get(timeout=QUEUE_GET_TIMEOUT)
-                get_sample_try_time = 0
+                # NOTE: [ avoid failed quickly ] Here, the time setting of QUEUE_GET_TIMEOUT
+                # is relatively long, currently it is 60 seconds, because in some models,
+                # if the reader child process starts with a heavy burden, the child process
+                # has no enough time to put the data in the queue when the main process
+                # start trying to get data from queue. At this time, the child thread needs
+                # to wait slightly longer
+                tensor_list = self._data_queue.get(timeout=QUEUE_GET_TIMEOUT)
             except queue.Empty:
-                get_sample_try_time += 1
-                if get_sample_try_time > MAX_GET_FAILED_TIME:
-                    self._exit_thread_unexpectedly()
-                    raise RuntimeError(
-                        "DataLoader reader thread has not read data for a long time (60s)."
-                    )
-                else:
-                    # NOTE: [ avoid failed quickly ] Sometimes if the reader child process has a heavy burden,
-                    # the child process has no enough time to put the data in the queue when the main process
-                    # start trying to get data from queue. At this time, failure to read data should not be
-                    # counted as a fatal error, there should be a certain number of attempts.
-                    continue
+                self._exit_thread_unexpectedly()
+                raise RuntimeError(
+                    "DataLoader reader thread has not read data for a long time (60s)."
+                )
 
             if not self._thread_done_event.is_set():
-                if sample is not None:
+                if tensor_list is not None:
                     try:
                         array = core.LoDTensorArray()
-                        for item in sample:
-                            if not isinstance(item, core.LoDTensor):
-                                self._check_input_array(item)
-                                tmp = core.LoDTensor()
-                                tmp.set(item, core.CPUPlace())
-                                item = tmp
-                            array.append(item)
+                        for tensor in tensor_list:
+                            array.append(tensor)
                         if not self._blocking_queue.push(array):
                             self._blocking_queue.close()
                     except:
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 3a503396ea..7bd2eabaf5 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -200,6 +200,8 @@ if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_imperative_data_loader)
   list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
   list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_process)
+  list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_fds_clear)
+  list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exit_func)
   list(REMOVE_ITEM TEST_OPS test_imperative_signal_handler)
 endif()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader.py
index 50a7e21609..eabcdc4b7c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader.py
@@ -44,7 +44,7 @@ def batch_generator_creator(batch_size, batch_num):
     return __reader__
 
 
-class TestDygraphhDataLoader(unittest.TestCase):
+class TestDygraphDataLoader(unittest.TestCase):
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 4
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py
index ca3995d602..d317d943dc 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py
@@ -27,7 +27,7 @@ def get_random_images_and_labels(image_shape, label_shape):
     return image, label
 
 
-class TestDygraphhDataLoaderWithException(unittest.TestCase):
+class TestDygraphDataLoaderWithException(unittest.TestCase):
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 4
@@ -63,7 +63,7 @@ class TestDygraphhDataLoaderWithException(unittest.TestCase):
                 exception = ex
             self.assertIsNotNone(exception)
 
-    def test_multi_process_with_thread_expection(self):
+    def test_multi_process_with_process_expection(self):
         def error_sample_genarator(batch_num):
             def __reader__():
                 for _ in range(batch_num):
@@ -81,8 +81,6 @@ class TestDygraphhDataLoaderWithException(unittest.TestCase):
                 for _ in loader():
                     print("test_multi_process_with_thread_expection")
             except core.EnforceNotMet as ex:
-                self.assertIn("Blocking queue is killed",
-                              cpt.get_exception_message(ex))
                 exception = ex
             self.assertIsNotNone(exception)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py
new file mode 100644
index 0000000000..ba98a343a4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import signal
+import unittest
+import multiprocessing
+import time
+
+import paddle.compat as cpt
+
+if sys.version_info[0] == 2:
+    import Queue as queue
+else:
+    import queue
+
+from paddle.fluid.reader import multiprocess_queue_set, _cleanup, CleanupFuncRegistrar
+
+# NOTE: These special functions cannot be detected by the existing coverage mechanism,
+# so the following unittests are added for these internal functions.
+
+
+class TestDygraphDataLoaderCleanUpFunc(unittest.TestCase):
+    def setUp(self):
+        self.capacity = 10
+
+    def test_clear_queue_set(self):
+        test_queue = queue.Queue(self.capacity)
+        global multiprocess_queue_set
+        multiprocess_queue_set.add(test_queue)
+        for i in range(0, self.capacity):
+            test_queue.put(i)
+        _cleanup()
+
+
+class TestRegisterExitFunc(unittest.TestCase):
+    # This function does not need to be implemented in this case
+    def none_func(self):
+        pass
+
+    def test_not_callable_func(self):
+        exception = None
+        try:
+            CleanupFuncRegistrar.register(5)
+        except TypeError as ex:
+            self.assertIn("is not callable", cpt.get_exception_message(ex))
+            exception = ex
+        self.assertIsNotNone(exception)
+
+    def test_old_handler_for_sigint(self):
+        CleanupFuncRegistrar.register(
+            function=self.none_func, signals=[signal.SIGINT])
+
+    def test_signal_wrapper_by_sigchld(self):
+        # This function does not need to be implemented in this case
+        def __test_process__():
+            pass
+
+        CleanupFuncRegistrar.register(
+            function=self.none_func, signals=[signal.SIGCHLD])
+
+        exception = None
+        try:
+            test_process = multiprocessing.Process(target=__test_process__)
+            test_process.start()
+            time.sleep(3)
+        except SystemExit as ex:
+            exception = ex
+        self.assertIsNotNone(exception)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py
new file mode 100644
index 0000000000..664d4078d2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+def get_random_images_and_labels(image_shape, label_shape):
+    image = np.random.random(size=image_shape).astype('float32')
+    label = np.random.random(size=label_shape).astype('int64')
+    return image, label
+
+
+def batch_generator_creator(batch_size, batch_num):
+    def __reader__():
+        for _ in range(batch_num):
+            batch_image, batch_label = get_random_images_and_labels(
+                [batch_size, 784], [batch_size, 1])
+            yield batch_image, batch_label
+
+    return __reader__
+
+
+class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 8
+        self.batch_num = 100
+        self.epoch_num = 2
+        self.capacity = 50
+
+    def prepare_data_loader(self):
+        loader = fluid.io.DataLoader.from_generator(
+            capacity=self.capacity, use_multiprocess=True)
+        loader.set_batch_generator(
+            batch_generator_creator(self.batch_size, self.batch_num),
+            places=fluid.CPUPlace())
+        return loader
+
+    def run_one_epoch_with_break(self, loader):
+        for step_id, data in enumerate(loader()):
+            image, label = data
+            relu = fluid.layers.relu(image)
+            self.assertEqual(image.shape, [self.batch_size, 784])
+            self.assertEqual(label.shape, [self.batch_size, 1])
+            self.assertEqual(relu.shape, [self.batch_size, 784])
+            if step_id == 30:
+                break
+
+    def test_data_loader_break(self):
+        with fluid.dygraph.guard():
+            loader = self.prepare_data_loader()
+            for _ in range(self.epoch_num):
+                self.run_one_epoch_with_break(loader)
+                break
+
+    def test_data_loader_continue_break(self):
+        with fluid.dygraph.guard():
+            loader = self.prepare_data_loader()
+            for _ in range(self.epoch_num):
+                self.run_one_epoch_with_break(loader)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
index 8ca1460741..051cb3b5bc 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
@@ -16,6 +16,7 @@ import sys
 import unittest
 import numpy as np
 import paddle.fluid as fluid
+from paddle.fluid import core
 
 if sys.version_info[0] == 2:
     import Queue as queue
@@ -41,7 +42,7 @@ def batch_generator_creator(batch_size, batch_num):
 
 # NOTE: coverage CI can't cover child process code, so need these test.
 # Here test child process loop function in main process
-class TestDygraphhDataLoaderProcess(unittest.TestCase):
+class TestDygraphDataLoaderProcess(unittest.TestCase):
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 4
@@ -77,7 +78,7 @@ class TestDygraphhDataLoaderProcess(unittest.TestCase):
             exception = None
             try:
                 loader._reader_process_loop()
-            except AttributeError as ex:
+            except core.EnforceNotMet as ex:
                 exception = ex
             self.assertIsNotNone(exception)
 
-- 
GitLab