From 253c7f4bba33fe25d35067816d442d2c70382300 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Tue, 18 Jun 2019 11:33:55 +0800
Subject: [PATCH] [Cherry-pick][Release/1.5][Bug fix]Fix dygraph mem leak
 release/1.5 (#18133)

* fix dygraph mem leak, test=release/1.5

* polish msg, test=release/1.5
---
 paddle/fluid/imperative/CMakeLists.txt        |  4 ++-
 paddle/fluid/imperative/flags.cc              | 30 +++++++++++++++++
 paddle/fluid/imperative/flags.h               | 26 +++++++++++++++
 paddle/fluid/imperative/layer.cc              | 21 ++++++++++++
 paddle/fluid/imperative/layer.h               | 33 +++++++++++++++++--
 paddle/fluid/pybind/imperative.cc             |  5 +++
 python/paddle/fluid/__init__.py               |  2 +-
 python/paddle/fluid/core.py                   |  4 +++
 python/paddle/fluid/dygraph/base.py           | 17 ++++++++++
 .../fluid/dygraph/learning_rate_scheduler.py  |  2 +-
 python/paddle/fluid/unique_name.py            |  4 +--
 11 files changed, 141 insertions(+), 7 deletions(-)
 create mode 100644 paddle/fluid/imperative/flags.cc
 create mode 100644 paddle/fluid/imperative/flags.h

diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index bd811bd8eb2..73c629fd227 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,5 +1,7 @@
+cc_library(imperative_flag SRCS flags.cc DEPS gflags)
+
 if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
 cc_library(engine SRCS engine.cc)
 cc_library(imperative_profiler SRCS profiler.cc)
diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc
new file mode 100644
index 00000000000..57656d64ab7
--- /dev/null
+++ b/paddle/fluid/imperative/flags.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/flags.h"
+#include "gflags/gflags.h"
+
+DEFINE_uint64(dygraph_debug, 0,
+              "Debug level of dygraph. This flag is not "
+              "open to users");
+
+namespace paddle {
+namespace imperative {
+
+bool IsDebugEnabled() { return FLAGS_dygraph_debug != 0; }
+
+uint64_t GetDebugLevel() { return FLAGS_dygraph_debug; }
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/flags.h b/paddle/fluid/imperative/flags.h
new file mode 100644
index 00000000000..094bce831c4
--- /dev/null
+++ b/paddle/fluid/imperative/flags.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+
+namespace paddle {
+namespace imperative {
+
+extern bool IsDebugEnabled();
+extern uint64_t GetDebugLevel();
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 27463c0470a..fb22d334902 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -34,6 +34,27 @@
 namespace paddle {
 namespace imperative {
 
+void ThreadSafeNameSet::Insert(const std::string& name) {
+  std::lock_guard<std::mutex> guard(mtx_);
+  set_.insert(name);
+}
+
+void ThreadSafeNameSet::Remove(const std::string& name) {
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto iter = set_.find(name);
+  PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name);
+  set_.erase(iter);
+}
+
+std::vector<std::string> ThreadSafeNameSet::Names() const {
+  std::lock_guard<std::mutex> guard(mtx_);
+  return std::vector<std::string>(set_.begin(), set_.end());
+}
+
+ThreadSafeNameSet VarBase::name_set_;
+
+std::vector<std::string> VarBase::AliveVarNames() { return name_set_.Names(); }
+
 using framework::Variable;
 
 namespace detail {
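The ThreadSafeNameSet added above is the core of the leak check: every VarBase registers its name on construction and unregisters it on destruction (both gated by IsDebugEnabled(), see layer.h below), so a snapshot of the set at any moment names every VarBase still alive. Below is a minimal Python sketch of the same bookkeeping idea; it is illustrative only and not part of the patch:

    import threading


    class ThreadSafeNameSet(object):
        """Mutex-guarded multiset of names, mirroring the C++ class above."""

        def __init__(self):
            self._names = []  # a list gives multiset semantics (duplicates allowed)
            self._mutex = threading.Lock()

        def insert(self, name):
            with self._mutex:
                self._names.append(name)

        def remove(self, name):
            with self._mutex:
                if name not in self._names:
                    raise ValueError('%s does not exist' % name)
                self._names.remove(name)

        def names(self):
            with self._mutex:
                return list(self._names)  # snapshot of everything still alive
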
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index d0d02f0f424..2fbedd82ea5 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -14,8 +14,11 @@
 
 #pragma once
 
-#include <map>     // NOLINT
-#include <string>  // NOLINT
+#include <cstdint>
+#include <map>     // NOLINT
+#include <mutex>   // NOLINT
+#include <set>     // NOLINT
+#include <string>
 #include <vector>  // NOLINT
 #include <memory>  // NOLINT
 #include <unordered_map>
@@ -34,6 +37,7 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/imperative/flags.h"
 
 namespace paddle {
 namespace imperative {
@@ -108,6 +112,19 @@ class PreparedOp {
 
 class OpBase;
 
+class ThreadSafeNameSet {
+ public:
+  void Insert(const std::string& name);
+
+  void Remove(const std::string& name);
+
+  std::vector<std::string> Names() const;
+
+ private:
+  std::multiset<std::string> set_;
+  mutable std::mutex mtx_;
+};
+
 /* The wrapper for Variable which holds a Variable and a VarBase of its
  * gradient. This object should be managed totally by Python interpreter.
  *
@@ -115,6 +132,8 @@ class OpBase;
  */
 class VarBase {
  public:
+  static std::vector<std::string> AliveVarNames();
+
   // Internal interface, create VarBase from exist variable
   VarBase(const std::string& name, std::unique_ptr<framework::Variable> var,
           VarBase* grad, bool stop_gradient)
@@ -180,6 +199,10 @@
     }
     VLOG(8) << "create varbase: " << name_ << " type: " << dtype
             << " place: " << place << "Stop gradient: " << stop_gradient_;
+
+    if (IsDebugEnabled()) {
+      name_set_.Insert(name_);
+    }
   }
 
  public:
@@ -187,6 +210,9 @@
     pre_op_ = nullptr;
     pre_op_out_idx_ = -1;
     VLOG(8) << "destruct varbase: " << name_;
+    if (IsDebugEnabled()) {
+      name_set_.Remove(name_);
+    }
   }
 
   inline void SetName(const std::string& name) { name_ = name; }
@@ -297,6 +323,9 @@
   OpBase* pre_op_;
   std::string pre_op_out_name_;
   int pre_op_out_idx_;
+
+  // A private set used to detect memory leaks
+  static ThreadSafeNameSet name_set_;
 };
 
 /* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index c4706a648ab..0d15b9a44d8 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -194,8 +194,13 @@ void BindImperative(pybind11::module *m_ptr) {
   m.def("stop_imperative_gperf_profiler",
         []() { imperative::StopProfile(); });
 
+  m.def("_is_dygraph_debug_enabled",
+        []() { return imperative::IsDebugEnabled(); });
+  m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
+
   py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
       m, "VarBase", R"DOC()DOC")
+      .def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
       .def(
           py::init<const std::string &, paddle::framework::proto::VarType::Type,
                    const std::vector<int64_t>, const paddle::platform::CPUPlace,
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 00f97389b70..70252a51e76 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -142,7 +142,7 @@ def __bootstrap__():
         'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
         'enable_parallel_graph', 'fuse_parameter_groups_size',
         'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-        'tracer_profile_fname'
+        'tracer_profile_fname', 'dygraph_debug'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 96163912971..1b2ce77014f 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -57,6 +57,8 @@ if cpuinfo.supports_avx:
         from .core_avx import _set_eager_deletion_mode
         from .core_avx import _set_fuse_parameter_group_size
         from .core_avx import _set_fuse_parameter_memory_size
+        from .core_avx import _is_dygraph_debug_enabled
+        from .core_avx import _dygraph_debug_level
     except ImportError as error:
         sys.stderr.write(
             error.__class__.__name__ +
@@ -79,6 +81,8 @@ if load_noavx:
         from .core_noavx import _set_eager_deletion_mode
         from .core_noavx import _set_fuse_parameter_group_size
         from .core_noavx import _set_fuse_parameter_memory_size
+        from .core_noavx import _is_dygraph_debug_enabled
+        from .core_noavx import _dygraph_debug_level
     except ImportError as error:
         sys.exit("Error: Can not load core_noavx.* ." +
                  error.__class__.__name__)
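Because 'dygraph_debug' is now registered in read_env_flags, the gflag can be switched on from the environment before paddle.fluid is imported. A hypothetical session (the flag name and the two core helpers come from this patch; the rest is ordinary Python):

    import os

    # Must be set before importing paddle.fluid: __bootstrap__() reads the
    # environment only once, at import time.
    os.environ['FLAGS_dygraph_debug'] = '1'

    import paddle.fluid as fluid

    print(fluid.core._is_dygraph_debug_enabled())  # True
    print(fluid.core._dygraph_debug_level())       # 1
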
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 598facce4b7..133eb6a19c2 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -14,10 +14,12 @@
 
 from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
 import contextlib
 import numpy as np
+import os
 from paddle.fluid import core
 from paddle.fluid import framework
 from .tracer import Tracer
+import logging
 
 __all__ = [
     'enabled',
@@ -136,6 +138,21 @@ def guard(place=None):
                 yield
 
 
+def _print_debug_msg():
+    if not core._is_dygraph_debug_enabled():
+        logging.warn(
+            'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
+        )
+        return
+
+    unique_name_size = len(framework.unique_name.generator.ids)
+    tracer_var_size = len(framework._dygraph_tracer()._vars)
+    alive_cpp_var_size = len(core.VarBase._alive_vars())
+    logging.warn(
+        'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
+        .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
+
+
 def to_variable(value, block=None, name=None):
     """
     This function will create a variable from ndarray
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index d28c8d3c1d2..500ab63b0e0 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -60,7 +60,7 @@ class LearningRateDecay(object):
             shape=[1],
             value=float(lr),
             dtype=self.dtype,
-            persistable=True)
+            persistable=False)
         return lr
 
     def step(self):
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index 044dc802dbf..9e3cd063092 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -79,7 +79,7 @@ def generate(key):
 
 # FIXME(zjl): The previous naming rule in static graph would
 # cause memory leak in dygraph mode. It is because the previous
-# nameing rule would use `conv_0.tmp` as the key, and in dygraph
+# naming rule would use `conv_0.tmp` as the key, and in dygraph
 # mode, `conv_i` increases as batch increases. Thus, keys would
 # increase in a way like `conv_0.tmp`, `conv_1.tmp`, ....
 # Not find a better way to fix this bug in dygraph mode. In TF,
@@ -87,7 +87,7 @@ def generate(key):
 # PyTorch, there is no variable name at all. Maybe we should
 # discard variable name in dygraph mode.
 #
-# Another concern is that save/load inference. Usually, user
+# Another concern is that save/load interfaces. Usually, user
 # would save model in static graph mode, and load it in dygraph
 # mode. Therefore, we keep the variable name of Parameter currently.
 #
-- 
GitLab
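End to end, a leak hunt with this patch might look like the sketch below. It assumes the private helper names introduced above (_print_debug_msg, VarBase._alive_vars) and is illustrative only; counts that grow steadily across iterations point to leaked variables:

    import os
    os.environ['FLAGS_dygraph_debug'] = '1'  # enable debug mode before import

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph import base

    with fluid.dygraph.guard():
        for step in range(3):
            x = fluid.dygraph.to_variable(np.ones([2, 2], dtype='float32'))
            # Logs unique_name count, tracer var count and alive C++ var count.
            base._print_debug_msg()
        # Names of every C++ VarBase still alive at this point.
        print(fluid.core.VarBase._alive_vars())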