未验证 提交 515c42ec 编写于 作者: H Houjiang Chen 提交者: GitHub

Fix memory leak if tensor was created outside of global scope, and fix gemv bugs (#1581)

* Fix memory leak if tensor was created outside of global scope

* Fix gemv bugs
上级 9061c70b
......@@ -27,14 +27,20 @@ Scope &Scope::NewScope() const {
return *kids_.back();
}
// Create an anonymous variable owned by this scope. The pointer is
// recorded in unnamed_vars_ so the scope's destructor can release it
// (this is the bookkeeping that fixes the leak for tensors created
// outside the global scope).
Variable *Scope::Var() {
  auto *unnamed = new Variable;
  unnamed_vars_.push_back(unnamed);
  return unnamed;
}
// Create (or fetch) a named variable local to this scope.
//
// If a variable with |name| already exists locally it is returned
// unchanged. Otherwise a new Variable is allocated and registered in
// named_vars_, which owns it and deletes it in ~Scope. name_ is bound
// to the map entry's key so the referenced string lives as long as the
// variable does.
//
// NOTE(review): the scraped diff showed leftover pre-refactor lines
// inserting into the old vars_ map as well; only the named_vars_
// registration is kept, matching every other use in this file.
Variable *Scope::Var(const std::string &name) {
  auto *pvar = FindVarLocally(name);
  if (pvar != nullptr) {
    return pvar;
  }
  pvar = new Variable;
  named_vars_[name] = pvar;
  pvar->name_ = named_vars_.find(name)->first;
  return pvar;
}
......@@ -47,7 +53,7 @@ Variable *Scope::FindVar(const std::string &name) const {
}
const Scope *Scope::FindScope(const Variable *var) const {
for (auto &name_var : vars_) {
for (auto &name_var : named_vars_) {
if (name_var.second == var) {
return this;
}
......@@ -64,8 +70,8 @@ void Scope::DropKids() {
std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> known_vars;
known_vars.reserve(vars_.size());
for (auto &name_var : vars_) {
known_vars.reserve(named_vars_.size());
for (auto &name_var : named_vars_) {
known_vars.emplace_back(name_var.first);
}
return known_vars;
......@@ -79,10 +85,10 @@ void Scope::DeleteScope(Scope *scope) const {
void Scope::EraseVars(const std::vector<std::string> &var_names) {
std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) {
for (auto it = named_vars_.begin(); it != named_vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
delete it->second;
it = vars_.erase(it);
it = named_vars_.erase(it);
} else {
++it;
}
......@@ -91,21 +97,21 @@ void Scope::EraseVars(const std::vector<std::string> &var_names) {
void Scope::Rename(const std::string &origin_name,
const std::string &new_name) const {
auto origin_it = vars_.find(origin_name);
if (origin_it == vars_.end()) {
auto origin_it = named_vars_.find(origin_name);
if (origin_it == named_vars_.end()) {
return;
}
auto new_it = vars_.find(new_name);
if (new_it != vars_.end()) {
auto new_it = named_vars_.find(new_name);
if (new_it != named_vars_.end()) {
return;
}
vars_[new_name] = origin_it->second;
vars_.erase(origin_it);
named_vars_[new_name] = origin_it->second;
named_vars_.erase(origin_it);
}
Variable *Scope::FindVarLocally(const std::string &name) const {
auto it = vars_.find(name);
if (it != vars_.end()) {
auto it = named_vars_.find(name);
if (it != named_vars_.end()) {
return it->second;
}
return nullptr;
......@@ -122,7 +128,7 @@ std::vector<Variable *> Scope::VarContain(const std::string substring,
int temp = 9999;
auto len0 = substring.length();
for (auto pair : vars_) {
for (auto pair : named_vars_) {
if (pair.first.find(substring) == 0) {
v.push_back(pair.second);
auto len1 = pair.first.length();
......@@ -138,7 +144,7 @@ std::vector<Variable *> Scope::VarContain(const std::string substring,
void Scope::print_vars() {
DLOG << "====================start to print variables=================";
for (auto pair : vars_) {
for (auto pair : named_vars_) {
DLOG << pair.first;
}
DLOG << "==================complete printing variables================";
......
......@@ -32,15 +32,17 @@ class Scope {
Scope() = default;
~Scope() {
for (auto &var : vars_) {
DropKids();
// clear named variables
for (auto &var : named_vars_) {
delete var.second;
}
vars_.clear();
for (auto kid : kids_) {
delete kid;
named_vars_.clear();
// clear unnamed variables
for (auto &var : unnamed_vars_) {
delete var;
}
kids_.clear();
unnamed_vars_.clear();
DropKids();
#ifdef PADDLE_MOBILE_CL
delete cl_scope_;
......@@ -49,12 +51,12 @@ class Scope {
Scope &NewScope() const;
/// Create a variable without name if it doesn't exist.
Variable *Var();
/// Create a variable with given name if it doesn't exist.
Variable *Var(const std::string &name);
/// Create a variable with a scope-unique name.
Variable *Var(std::string *name = nullptr);
void EraseVars(const std::vector<std::string> &var_names);
/// Find a variable in the scope or any of its ancestors. Returns
......@@ -98,7 +100,8 @@ class Scope {
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const *parent) : parent_(parent) {}
mutable std::unordered_map<std::string, Variable *> vars_;
mutable std::unordered_map<std::string, Variable *> named_vars_;
mutable std::vector<Variable *> unnamed_vars_;
mutable std::list<Scope *> kids_;
Scope const *parent_{nullptr};
......
......@@ -45,13 +45,13 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
// Tensor *new_scale = new Tensor();
// Tensor *new_bias = new Tensor();
auto *new_scale = param->CreateNewScale<framework::LoDTensor>();
auto *new_bias = param->CreateNewBiase<framework::LoDTensor>();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
Variable *scale_var = param->GetScope()->Var();
Variable *bias_var = param->GetScope()->Var();
LoDTensor *new_scale = scale_var->GetMutable<LoDTensor>();
LoDTensor *new_bias = bias_var->GetMutable<LoDTensor>();
float *new_scale_ptr = new_scale->mutable_data<float>({C});
float *new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] + (bias1_ptr[i] - mean_ptr[i]) *
......
......@@ -43,10 +43,12 @@ bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam<CPU> *param) {
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
auto *new_scale = param->CreateNewScale<framework::LoDTensor>();
auto *new_bias = param->CreateNewBiase<framework::LoDTensor>();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
Variable *scale_var = param->GetScope()->Var();
Variable *bias_var = param->GetScope()->Var();
LoDTensor *new_scale = scale_var->GetMutable<LoDTensor>();
LoDTensor *new_bias = bias_var->GetMutable<LoDTensor>();
float *new_scale_ptr = new_scale->mutable_data<float>({C});
float *new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
......
......@@ -64,7 +64,9 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
) {
param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
// transform weight
param->transformed_filter_ = new framework::LoDTensor;
Variable *transformed_var = param->GetScope()->Var();
param->transformed_filter_ =
transformed_var->GetMutable<framework::LoDTensor>();
operators::math::winograd_transform_weight<8, 3>(
*param->Filter(), param->transformed_filter_);
} else if (conv3x3 && param->Groups() == 1 &&
......
......@@ -42,10 +42,12 @@ bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam<CPU> *param) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
LoDTensor *new_scale = new LoDTensor();
LoDTensor *new_bias = new LoDTensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
Variable *scale_var = param->GetScope()->Var();
Variable *bias_var = param->GetScope()->Var();
LoDTensor *new_scale = scale_var->GetMutable<LoDTensor>();
LoDTensor *new_bias = bias_var->GetMutable<LoDTensor>();
float *new_scale_ptr = new_scale->mutable_data<float>({C});
float *new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
......
......@@ -438,14 +438,14 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha,
#else
const int tid = 0;
#endif // _OPENMP
register float *thread_buf_c = buf_c + tid * M;
register const float *in0 = A + n * lda;
register const float *in1 = in0 + lda;
register const float *in2 = in1 + lda;
register const float *in3 = in2 + lda;
register float32x4_t _b = vld1q_f32(B + n);
register float32x4_t _sum0;
register int m = 0;
float *thread_buf_c = buf_c + tid * M;
const float *in0 = A + n * lda;
const float *in1 = in0 + lda;
const float *in2 = in1 + lda;
const float *in3 = in2 + lda;
float32x4_t _b = vld1q_f32(B + n);
float32x4_t _sum0;
int m = 0;
for (; m < M - 3; m += 4) {
float32x4_t _r0 = vld1q_f32(in0 + m);
float32x4_t _r1 = vld1q_f32(in1 + m);
......@@ -495,11 +495,11 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha,
#else
const int tid = 0;
#endif // _OPENMP
register float *thread_buf_c = buf_c + tid * M;
register const float *in0 = A + n * lda;
register float32x4_t _b = vld1q_dup_f32(B + n);
register float32x4_t _sum0;
register int m = 0;
float *thread_buf_c = buf_c + tid * M;
const float *in0 = A + n * lda;
float32x4_t _b = vld1q_dup_f32(B + n);
float32x4_t _sum0;
int m = 0;
for (; m < M - 3; m += 4) {
float32x4_t _r0 = vld1q_f32(in0 + m);
float32x4_t _vbuff_c = vld1q_f32(thread_buf_c + m);
......@@ -515,47 +515,39 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha,
// reduction operate for buf_c, sum to C and do left operations
// y := alpha * A' * X + beta * y
// reduction operate: sum multi-threadings result for over-all: A' * X
register float32x4_t _valpha = vdupq_n_f32(alpha);
float32x4_t _valpha = vdupq_n_f32(alpha);
if (beta == 0.f) {
#pragma omp parallel for
for (int m = 0; m < M; m += 4) {
register float32x4_t _sum0 = vld1q_f32(buf_c + m);
float32x4_t _sum0 = vld1q_f32(buf_c + m);
for (int tid = 1; tid < threads_num; ++tid) {
_sum0 += vld1q_f32(buf_c + tid * M + m);
}
vst1q_f32(C + m, _sum0 * _valpha);
}
#pragma omp parallel for
for (int m = (M & 0xfffffffc); m < M; ++m) {
register float _sum0 = *(buf_c + m);
for (register int tid = 1; tid < threads_num; ++tid) {
float _sum0 = *(buf_c + m);
for (int tid = 1; tid < threads_num; ++tid) {
_sum0 += *(buf_c + tid * M + m);
}
C[m] = _sum0 * alpha;
}
} else { // beta != 0.f
register float32x4_t _vbeta = vdupq_n_f32(beta);
float32x4_t _vbeta = vdupq_n_f32(beta);
#pragma omp parallel for
for (int m = 0; m < M; m += 4) {
register float32x4_t _sum0 = vld1q_f32(buf_c + m);
for (register int tid = 1; tid < threads_num; ++tid) {
float32x4_t _sum0 = vld1q_f32(buf_c + m);
for (int tid = 1; tid < threads_num; ++tid) {
_sum0 += vld1q_f32(buf_c + tid * M + m);
}
float32x4_t _vc = vld1q_f32(C + m);
vst1q_f32(C + m, _sum0 * _valpha + _vbeta * _vc);
}
#pragma omp parallel for
for (int m = (m & 0xfffffffc); m < M; ++m) {
register float _sum0 = *(buf_c + m);
for (register int tid = 1; tid < threads_num; ++tid) {
_sum0 += *(buf_c + tid * M + m);
}
C[m] = _sum0 * alpha + beta * C[m];
}
#pragma omp parallel for
for (int m = (m & 0xfffffffc); m < M; ++m) {
register float _sum0 = *(buf_c + m);
for (register int tid = 1; tid < threads_num; ++tid) {
for (int m = (M & 0xfffffffc); m < M; ++m) {
float _sum0 = *(buf_c + m);
for (int tid = 1; tid < threads_num; ++tid) {
_sum0 += *(buf_c + tid * M + m);
}
C[m] = _sum0 * alpha + beta * C[m];
......
......@@ -76,27 +76,11 @@ struct DtypeTensorTrait<GPU_CL> {
class OpParam {
public:
OpParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope) {
scope_pointer_ = scope;
inputs_ = inputs;
}
template <typename T>
T *CreateNewScale() {
std::string scale_key = Getkey("Scale", inputs_, 0);
auto var = scope_pointer_->Var(scale_key + "_new");
return var->GetMutable<T>();
}
template <typename T>
T *CreateNewBiase() {
std::string biase_key = Getkey("Bias", inputs_, 0);
auto var = scope_pointer_->Var(biase_key + "_new");
return var->GetMutable<T>();
}
const AttributeMap &attrs, Scope *scope)
: scope_(scope) {}
VariableNameMap inputs_;
Scope *scope_pointer_ = nullptr;
Scope *GetScope() const { return scope_; }
Scope *scope_ = nullptr;
#ifdef PADDLE_MOBILE_FPGA_KD
zynqmp::Context &context() { return context_; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册