Unverified Commit 25ab23be authored by Zeng Jinle, committed by GitHub

Fix dygraph mem leak (#18082)

* fix dygraph mem leak, test=develop

* polish msg, test=develop
Parent 1c6e5606
cc_library(imperative_flag SRCS flags.cc DEPS gflags)
if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/flags.h"
#include "gflags/gflags.h"
DEFINE_uint64(dygraph_debug, 0,
"Debug level of dygraph. This flag is not "
"open to users");
namespace paddle {
namespace imperative {
bool IsDebugEnabled() { return FLAGS_dygraph_debug != 0; }
uint64_t GetDebugLevel() { return FLAGS_dygraph_debug; }
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
namespace paddle {
namespace imperative {
extern bool IsDebugEnabled();
extern uint64_t GetDebugLevel();
} // namespace imperative
} // namespace paddle
......@@ -34,6 +34,27 @@
namespace paddle {
namespace imperative {
void ThreadSafeNameSet::Insert(const std::string& name) {
std::lock_guard<std::mutex> guard(mtx_);
set_.insert(name);
}
void ThreadSafeNameSet::Remove(const std::string& name) {
std::lock_guard<std::mutex> guard(mtx_);
auto iter = set_.find(name);
PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name);
set_.erase(iter);
}
std::vector<std::string> ThreadSafeNameSet::Names() const {
std::lock_guard<std::mutex> guard(mtx_);
return std::vector<std::string>(set_.begin(), set_.end());
}
ThreadSafeNameSet VarBase::name_set_;
std::vector<std::string> VarBase::AliveVarNames() { return name_set_.Names(); }
using framework::Variable;
namespace detail {
......
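The set above is only populated when IsDebugEnabled() returns true, so the bookkeeping costs nothing in normal runs. A minimal usage sketch from the Python side (assuming a Paddle build containing this patch, FLAGS_dygraph_debug=1 exported before the first import, and the `_alive_vars` pybind binding added further down):

```python
# Sketch: watch the count of alive C++ VarBase objects across steps.
# A count that grows without bound is the signature of the leak this
# PR fixes. Assumes FLAGS_dygraph_debug=1 was set before importing paddle.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core

with fluid.dygraph.guard():
    for step in range(3):
        x = fluid.dygraph.to_variable(np.ones([2, 2], dtype='float32'))
        print(step, len(core.VarBase._alive_vars()))
```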
......@@ -14,8 +14,11 @@
#pragma once
-#include <map>  // NOLINT
-#include <memory>  // NOLINT
+#include <cstdint>
+#include <map>  // NOLINT
+#include <memory>  // NOLINT
+#include <mutex>  // NOLINT
+#include <set>
#include <string> // NOLINT
#include <unordered_map> // NOLINT
#include <utility>
......@@ -34,6 +37,7 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/imperative/flags.h"
namespace paddle {
namespace imperative {
......@@ -108,6 +112,19 @@ class PreparedOp {
class OpBase;
class ThreadSafeNameSet {
public:
void Insert(const std::string& name);
void Remove(const std::string& name);
std::vector<std::string> Names() const;
private:
std::multiset<std::string> set_;
mutable std::mutex mtx_;
};
/* The wrapper for Variable which holds a Variable and a VarBase of its
* gradient. This object should be managed entirely by the Python interpreter.
*
......@@ -115,6 +132,8 @@ class OpBase;
*/
class VarBase {
public:
static std::vector<std::string> AliveVarNames();
// Internal interface, create VarBase from exist variable
VarBase(const std::string& name, std::unique_ptr<framework::Variable> var,
VarBase* grad, bool stop_gradient)
......@@ -180,6 +199,10 @@ class VarBase {
}
VLOG(8) << "create varbase: " << name_ << " type: " << dtype
<< " place: " << place << "Stop gradient: " << stop_gradient_;
if (IsDebugEnabled()) {
name_set_.Insert(name_);
}
}
public:
......@@ -187,6 +210,9 @@ class VarBase {
pre_op_ = nullptr;
pre_op_out_idx_ = -1;
VLOG(8) << "destruct varbase: " << name_;
if (IsDebugEnabled()) {
name_set_.Remove(name_);
}
}
inline void SetName(const std::string& name) { name_ = name; }
......@@ -297,6 +323,9 @@ class VarBase {
OpBase* pre_op_;
std::string pre_op_out_name_;
int pre_op_out_idx_;
// A private flag to check memory leak
static ThreadSafeNameSet name_set_;
};
/* The wrapper for OpDesc which holds an OpDesc and an OpDesc of its
......
......@@ -194,8 +194,13 @@ void BindImperative(pybind11::module *m_ptr) {
m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
m.def("_is_dygraph_debug_enabled",
[]() { return imperative::IsDebugEnabled(); });
m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
m, "VarBase", R"DOC()DOC")
.def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
.def(
py::init<const std::string &, paddle::framework::proto::VarType::Type,
const std::vector<int64_t>, const paddle::platform::CPUPlace,
......
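A quick sanity check of the new bindings, as a hedged sketch (the names match the `m.def` calls above; the values shown assume the flag is left unset):

```python
from paddle.fluid import core

print(core._is_dygraph_debug_enabled())  # False unless FLAGS_dygraph_debug != 0
print(core._dygraph_debug_level())       # raw value of the uint64 gflag
print(core.VarBase._alive_vars())        # expected [] while tracking is off
```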
......@@ -143,7 +143,7 @@ def __bootstrap__():
'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
'enable_parallel_graph', 'fuse_parameter_groups_size',
'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-'tracer_profile_fname'
+'tracer_profile_fname', 'dygraph_debug'
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......
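Registering 'dygraph_debug' in read_env_flags is what lets the C++ gflag be driven from the environment. A sketch of the ordering constraint (the variable must be set before paddle.fluid is first imported, since __bootstrap__ reads the environment only once, at import time):

```python
import os
os.environ['FLAGS_dygraph_debug'] = '1'  # must precede the first fluid import

import paddle.fluid as fluid  # __bootstrap__ forwards the flag to gflags here
print(fluid.core._is_dygraph_debug_enabled())  # True
```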
......@@ -57,6 +57,8 @@ if 'avx' in get_cpu_info()['flags']:
from .core_avx import _set_eager_deletion_mode
from .core_avx import _set_fuse_parameter_group_size
from .core_avx import _set_fuse_parameter_memory_size
from .core_avx import _is_dygraph_debug_enabled
from .core_avx import _dygraph_debug_level
except ImportError:
sys.stderr.write(
'WARNING: Can not import avx core. You may not build with AVX, '
......@@ -78,6 +80,8 @@ if load_noavx:
from .core_noavx import _set_eager_deletion_mode
from .core_noavx import _set_fuse_parameter_group_size
from .core_noavx import _set_fuse_parameter_memory_size
from .core_noavx import _is_dygraph_debug_enabled
from .core_noavx import _dygraph_debug_level
except ImportError as error:
sys.exit("Error: Can not load core_noavx.* ." +
error.__class__.__name__)
......
......@@ -14,10 +14,12 @@
from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
import contextlib
import numpy as np
import os
from paddle.fluid import core
from paddle.fluid import framework
from .tracer import Tracer
import logging
__all__ = [
'enabled',
......@@ -136,6 +138,21 @@ def guard(place=None):
yield
def _print_debug_msg():
if not core._is_dygraph_debug_enabled():
logging.warn(
'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
)
return
unique_name_size = len(framework.unique_name.generator.ids)
tracer_var_size = len(framework._dygraph_tracer()._vars)
alive_cpp_var_size = len(core.VarBase._alive_vars())
logging.warn(
'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
.format(unique_name_size, tracer_var_size, alive_cpp_var_size))
def to_variable(value, block=None, name=None):
"""
This function will create a variable from ndarray
......
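Putting the pieces together, `_print_debug_msg` reports the three populations that can leak: Python unique names, tracer-held variables, and alive C++ VarBase objects. A usage sketch (the helper is private, and the module path paddle.fluid.dygraph.base is assumed from this diff):

```python
import os
os.environ['FLAGS_dygraph_debug'] = '1'  # enable tracking before import

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import _print_debug_msg  # private helper

with fluid.dygraph.guard():
    for step in range(3):
        x = fluid.dygraph.to_variable(np.zeros([4], dtype='float32'))
        _print_debug_msg()  # logs unique_name / tracer / alive C++ var counts
```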
......@@ -60,7 +60,7 @@ class LearningRateDecay(object):
shape=[1],
value=float(lr),
dtype=self.dtype,
-persistable=True)
+persistable=False)
return lr
def step(self):
......
......@@ -79,7 +79,7 @@ def generate(key):
# FIXME(zjl): The previous naming rule in static graph would
# cause memory leak in dygraph mode. It is because the previous
-# nameing rule would use `conv_0.tmp` as the key, and in dygraph
+# naming rule would use `conv_0.tmp` as the key, and in dygraph
# mode, `conv_i` increases as batch increases. Thus, keys would
# increase in a way like `conv_0.tmp`, `conv_1.tmp`, ....
# Not find a better way to fix this bug in dygraph mode. In TF,
......@@ -87,7 +87,7 @@ def generate(key):
# PyTorch, there is no variable name at all. Maybe we should
# discard variable name in dygraph mode.
#
-# Another concern is that save/load inference. Usually, user
+# Another concern is that save/load interfaces. Usually, user
# would save model in static graph mode, and load it in dygraph
# mode. Therefore, we keep the variable name of Parameter currently.
#
......
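For illustration, here is a hypothetical toy generator showing the failure mode the FIXME describes. This is not Paddle code, just the shape of the bug: a key space that grows with the batch index leaves one dictionary entry per batch that is never reclaimed:

```python
# Hypothetical sketch of the leak described in the comment above.
ids = {}

def generate(key):
    ids[key] = ids.get(key, -1) + 1
    return '%s_%d' % (key, ids[key])

# Static graph mode reuses keys, so `ids` stays bounded. A dygraph naming
# rule that bakes the batch index into the key adds a fresh entry per batch:
for batch in range(1000):
    generate('conv_%d.tmp' % batch)

print(len(ids))  # 1000, and still growing: unbounded memory in long runs
```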