Fix dygraph mem leak (#18082)

* fix dygraph mem leak, test=develop * polish msg, test=develop

Fix dygraph mem leak (#18082)
* fix dygraph mem leak, test=develop * polish msg, test=develop
25ab23be · Zeng Jinle · GitHub · 1c6e5606 · 25ab23be · 25ab23be
11 changed files
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
+cc_library(imperative_flag SRCS flags.cc DEPS gflags) 
 if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
 cc_library(engine SRCS engine.cc)
 cc_library(imperative_profiler SRCS profiler.cc)

--- a/paddle/fluid/imperative/flags.cc
+++ b/paddle/fluid/imperative/flags.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/imperative/flags.h"
+#include "gflags/gflags.h"
+// Internal debug level for dygraph mode; 0 (the default) means disabled.
+DEFINE_uint64(dygraph_debug, 0,
+              "Debug level of dygraph. This flag is not open to users");
+namespace paddle {
+namespace imperative {
+// Reports whether FLAGS_dygraph_debug holds any non-zero debug level.
+bool IsDebugEnabled() {
+  const bool enabled = (FLAGS_dygraph_debug != 0);
+  return enabled;
+}
+// Returns the raw value of FLAGS_dygraph_debug.
+uint64_t GetDebugLevel() {
+  const uint64_t level = FLAGS_dygraph_debug;
+  return level;
+}
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/flags.h
+++ b/paddle/fluid/imperative/flags.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <cstdint>
+namespace paddle {
+namespace imperative {
+// Returns true when the dygraph debug level is non-zero.
+bool IsDebugEnabled();
+// Returns the current dygraph debug level.
+uint64_t GetDebugLevel();
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -34,6 +34,27 @@
 namespace paddle {
 namespace imperative {
+// Records one more entry for `name` while holding the mutex.
+void ThreadSafeNameSet::Insert(const std::string& name) {
+  std::lock_guard<std::mutex> lock(mtx_);
+  set_.emplace(name);
+}
+// Drops a single entry for `name` while holding the mutex. The name is
+// checked with PADDLE_ENFORCE to be present before it is erased.
+void ThreadSafeNameSet::Remove(const std::string& name) {
+  std::lock_guard<std::mutex> lock(mtx_);
+  const auto pos = set_.find(name);
+  PADDLE_ENFORCE(pos != set_.end(), "%s does not exist", name);
+  // Erase through the iterator so that only one entry goes away; erasing by
+  // key would remove every duplicate of `name` from the multiset.
+  set_.erase(pos);
+}
+// Returns a copy of all stored names (duplicates included), taken while
+// holding the mutex.
+std::vector<std::string> ThreadSafeNameSet::Names() const {
+  std::lock_guard<std::mutex> lock(mtx_);
+  std::vector<std::string> names(set_.begin(), set_.end());
+  return names;
+}
+// Storage for the static name set declared in VarBase.
+ThreadSafeNameSet VarBase::name_set_;
+// Returns the names currently recorded in VarBase::name_set_.
+std::vector<std::string> VarBase::AliveVarNames() {
+  return name_set_.Names();
+}
 using framework::Variable;
 namespace detail {

--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -14,8 +14,11 @@
 #pragma once
-#include <map>            // NOLINT
+#include <cstdint>
-#include <memory>         // NOLINT
+#include <map>     // NOLINT
+#include <memory>  // NOLINT
+#include <mutex>   // NOLINT
+#include <set>
 #include <string>         // NOLINT
 #include <unordered_map>  // NOLINT
 #include <utility>
@@ -34,6 +37,7 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/imperative/flags.h"
 namespace paddle {
 namespace imperative {
@@ -108,6 +112,19 @@ class PreparedOp {
 class OpBase;
+// A multiset of names whose accessors are meant to be called concurrently.
+// Duplicate names are stored as separate entries.
+class ThreadSafeNameSet {
+ public:
+  // Adds one entry for `name`.
+  void Insert(const std::string& name);
+  // Removes one entry for `name`.
+  void Remove(const std::string& name);
+  // Returns a copy of every stored name.
+  std::vector<std::string> Names() const;
+ private:
+  std::multiset<std::string> set_;
+  // Declared mutable so that const member functions can lock it.
+  mutable std::mutex mtx_;
+};
 /* The wrapper for Variable which holds a Variable and a VarBase of its
 * gradient. This object should be managed totally by Python interpreter.
 *
@@ -115,6 +132,8 @@ class OpBase;
 */
 class VarBase {
 public:
+  static std::vector<std::string> AliveVarNames();
  // Internal interface, create VarBase from exist variable
  VarBase(const std::string& name, std::unique_ptr<framework::Variable> var,
          VarBase* grad, bool stop_gradient)
@@ -180,6 +199,10 @@ class VarBase {
    }
    VLOG(8) << "create varbase: " << name_ << " type: " << dtype
            << " place: " << place << "Stop gradient: " << stop_gradient_;
+    if (IsDebugEnabled()) {
+      name_set_.Insert(name_);
+    }
  }
 public:
@@ -187,6 +210,9 @@ class VarBase {
    pre_op_ = nullptr;
    pre_op_out_idx_ = -1;
    VLOG(8) << "destruct varbase: " << name_;
+    if (IsDebugEnabled()) {
+      name_set_.Remove(name_);
+    }
  }
  inline void SetName(const std::string& name) { name_ = name; }
@@ -297,6 +323,9 @@ class VarBase {
  OpBase* pre_op_;
  std::string pre_op_out_name_;
  int pre_op_out_idx_;
+  // A private flag to check memory leak
+  static ThreadSafeNameSet name_set_;
 };
 /* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its

--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -194,8 +194,13 @@ void BindImperative(pybind11::module *m_ptr) {
  m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
+  m.def("_is_dygraph_debug_enabled",
+        []() { return imperative::IsDebugEnabled(); });
+  m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
  py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
      m, "VarBase", R"DOC()DOC")
+      .def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
      .def(
          py::init<const std::string &, paddle::framework::proto::VarType::Type,
                   const std::vector<int64_t>, const paddle::platform::CPUPlace,

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -143,7 +143,7 @@ def __bootstrap__():
        'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
        'enable_parallel_graph', 'fuse_parameter_groups_size',
        'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-        'tracer_profile_fname'
+        'tracer_profile_fname', 'dygraph_debug'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')

--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -57,6 +57,8 @@ if 'avx' in get_cpu_info()['flags']:
        from .core_avx import _set_eager_deletion_mode
        from .core_avx import _set_fuse_parameter_group_size
        from .core_avx import _set_fuse_parameter_memory_size
+        from .core_avx import _is_dygraph_debug_enabled
+        from .core_avx import _dygraph_debug_level
    except ImportError:
        sys.stderr.write(
            'WARNING: Can not import avx core. You may not build with AVX, '
@@ -78,6 +80,8 @@ if load_noavx:
        from .core_noavx import _set_eager_deletion_mode
        from .core_noavx import _set_fuse_parameter_group_size
        from .core_noavx import _set_fuse_parameter_memory_size
+        from .core_noavx import _is_dygraph_debug_enabled
+        from .core_noavx import _dygraph_debug_level
    except ImportError as error:
        sys.exit("Error: Can not load core_noavx.* ." +
                 error.__class__.__name__)

--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -14,10 +14,12 @@
 from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
 import contextlib
 import numpy as np
+import os
 from paddle.fluid import core
 from paddle.fluid import framework
 from .tracer import Tracer
+import logging
 __all__ = [
    'enabled',
@@ -136,6 +138,21 @@ def guard(place=None):
                    yield
+def _print_debug_msg():
+    if not core._is_dygraph_debug_enabled():
+        logging.warn(
+            'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
+        )
+        return
+    unique_name_size = len(framework.unique_name.generator.ids)
+    tracer_var_size = len(framework._dygraph_tracer()._vars)
+    alive_cpp_var_size = len(core.VarBase._alive_vars())
+    logging.warn(
+        'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
+        .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
 def to_variable(value, block=None, name=None):
    """
    This function will create a variable from ndarray

--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -60,7 +60,7 @@ class LearningRateDecay(object):
            shape=[1],
            value=float(lr),
            dtype=self.dtype,
-            persistable=True)
+            persistable=False)
        return lr
    def step(self):

--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -79,7 +79,7 @@ def generate(key):
 # FIXME(zjl): The previous naming rule in static graph would
 # cause memory leak in dygraph mode. It is because the previous
-# nameing rule would use `conv_0.tmp` as the key, and in dygraph
+# naming rule would use `conv_0.tmp` as the key, and in dygraph
 # mode, `conv_i` increases as batch increases. Thus, keys would
 # increase in a way like `conv_0.tmp`, `conv_1.tmp`, .... 
 # Not find a better way to fix this bug in dygraph mode. In TF,
@@ -87,7 +87,7 @@ def generate(key):
 # PyTorch, there is no variable name at all. Maybe we should
 # discard variable name in dygraph mode.
 #
-# Another concern is that save/load inference. Usually, user
+# Another concern is the save/load interfaces. Usually, user
 # would save model in static graph mode, and load it in dygraph
 # mode. Therefore, we keep the variable name of Parameter currently.
 #