Commit b734c439 authored by Houjiang Chen, committed by GitHub

Fix memory leak if tensor was created outside of global scope, and fix gemv bugs (#1581)

* Fix memory leak if tensor was created outside of global scope

* Fix gemv bugs
Parent 980b855d
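
The memory-leak fix targets LoDTensors that CPU kernels allocated in their Init() methods with a bare `new` (or parked under ad-hoc "<name>_new" scope variables): nothing owned them, so they were never freed when the scope tree was torn down. The commit adds an unnamed-variable list to Scope so such tensors become scope-owned. The snippet below is a minimal before/after illustration with simplified types and hypothetical function names; it is not code from this repository.

    // Simplified sketch: how an Init()-time tensor leaks vs. how the scope now owns it.
    #include <vector>

    struct LoDTensor { /* payload omitted */ };

    struct Scope {
      std::vector<LoDTensor *> unnamed_vars_;  // mirrors the new Scope::unnamed_vars_
      LoDTensor *Var() {                       // mirrors the new no-argument Scope::Var()
        auto *t = new LoDTensor;
        unnamed_vars_.push_back(t);
        return t;
      }
      ~Scope() {
        for (auto *t : unnamed_vars_) delete t;  // every unnamed tensor freed exactly once
      }
    };

    // Before: the caller gets a raw pointer with no owner -> leaks on scope teardown.
    LoDTensor *InitBefore() { return new LoDTensor; }

    // After: the tensor is registered with the scope and released in ~Scope().
    LoDTensor *InitAfter(Scope *scope) { return scope->Var(); }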
@@ -27,14 +27,20 @@ Scope &Scope::NewScope() const {
   return *kids_.back();
 }

+Variable *Scope::Var() {
+  auto *pvar = new Variable;
+  unnamed_vars_.push_back(pvar);
+  return pvar;
+}
+
 Variable *Scope::Var(const std::string &name) {
   auto *pvar = FindVarLocally(name);
   if (pvar != nullptr) {
     return pvar;
   }
   pvar = new Variable;
-  vars_[name] = pvar;
-  pvar->name_ = vars_.find(name)->first;
+  named_vars_[name] = pvar;
+  pvar->name_ = named_vars_.find(name)->first;
   return pvar;
 }
@@ -47,7 +53,7 @@ Variable *Scope::FindVar(const std::string &name) const {
 }

 const Scope *Scope::FindScope(const Variable *var) const {
-  for (auto &name_var : vars_) {
+  for (auto &name_var : named_vars_) {
     if (name_var.second == var) {
       return this;
     }
@@ -64,8 +70,8 @@ void Scope::DropKids() {

 std::vector<std::string> Scope::LocalVarNames() const {
   std::vector<std::string> known_vars;
-  known_vars.reserve(vars_.size());
-  for (auto &name_var : vars_) {
+  known_vars.reserve(named_vars_.size());
+  for (auto &name_var : named_vars_) {
     known_vars.emplace_back(name_var.first);
   }
   return known_vars;
@@ -79,10 +85,10 @@ void Scope::DeleteScope(Scope *scope) const {

 void Scope::EraseVars(const std::vector<std::string> &var_names) {
   std::set<std::string> var_set(var_names.begin(), var_names.end());
-  for (auto it = vars_.begin(); it != vars_.end();) {
+  for (auto it = named_vars_.begin(); it != named_vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
       delete it->second;
-      it = vars_.erase(it);
+      it = named_vars_.erase(it);
     } else {
       ++it;
     }
@@ -91,21 +97,21 @@ void Scope::EraseVars(const std::vector<std::string> &var_names) {

 void Scope::Rename(const std::string &origin_name,
                    const std::string &new_name) const {
-  auto origin_it = vars_.find(origin_name);
-  if (origin_it == vars_.end()) {
+  auto origin_it = named_vars_.find(origin_name);
+  if (origin_it == named_vars_.end()) {
     return;
   }
-  auto new_it = vars_.find(new_name);
-  if (new_it != vars_.end()) {
+  auto new_it = named_vars_.find(new_name);
+  if (new_it != named_vars_.end()) {
     return;
   }
-  vars_[new_name] = origin_it->second;
-  vars_.erase(origin_it);
+  named_vars_[new_name] = origin_it->second;
+  named_vars_.erase(origin_it);
 }

 Variable *Scope::FindVarLocally(const std::string &name) const {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) {
+  auto it = named_vars_.find(name);
+  if (it != named_vars_.end()) {
     return it->second;
   }
   return nullptr;
@@ -122,7 +128,7 @@ std::vector<Variable *> Scope::VarContain(const std::string substring,
   int temp = 9999;
   auto len0 = substring.length();
-  for (auto pair : vars_) {
+  for (auto pair : named_vars_) {
     if (pair.first.find(substring) == 0) {
       v.push_back(pair.second);
       auto len1 = pair.first.length();
@@ -138,7 +144,7 @@ std::vector<Variable *> Scope::VarContain(const std::string substring,

 void Scope::print_vars() {
   DLOG << "====================start to print variables=================";
-  for (auto pair : vars_) {
+  for (auto pair : named_vars_) {
     DLOG << pair.first;
   }
   DLOG << "==================complete printing variables================";
......
@@ -32,15 +32,17 @@ class Scope {
   Scope() = default;

   ~Scope() {
-    for (auto &var : vars_) {
-      DropKids();
-      delete var.second;
-    }
-    vars_.clear();
-    for (auto kid : kids_) {
-      delete kid;
-    }
-    kids_.clear();
+    // clear named variables
+    for (auto &var : named_vars_) {
+      delete var.second;
+    }
+    named_vars_.clear();
+    // clear unnamed variables
+    for (auto &var : unnamed_vars_) {
+      delete var;
+    }
+    unnamed_vars_.clear();
+    DropKids();
 #ifdef PADDLE_MOBILE_CL
     delete cl_scope_;
@@ -49,12 +51,12 @@ class Scope {

   Scope &NewScope() const;

+  /// Create a variable without name if it doesn't exist.
+  Variable *Var();
+
   /// Create a variable with given name if it doesn't exist.
   Variable *Var(const std::string &name);

-  /// Create a variable with a scope-unique name.
-  Variable *Var(std::string *name = nullptr);
-
   void EraseVars(const std::vector<std::string> &var_names);

   /// Find a variable in the scope or any of its ancestors. Returns
@@ -98,7 +100,8 @@ class Scope {
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const *parent) : parent_(parent) {}

-  mutable std::unordered_map<std::string, Variable *> vars_;
+  mutable std::unordered_map<std::string, Variable *> named_vars_;
+  mutable std::vector<Variable *> unnamed_vars_;
   mutable std::list<Scope *> kids_;
   Scope const *parent_{nullptr};
......
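
With vars_ split into named_vars_ and unnamed_vars_, ~Scope() now releases both kinds of variables itself and only then drops child scopes. A short usage sketch of the new no-argument Var() overload follows; the helper name is made up for illustration and the snippet is not a complete translation unit, but GetMutable<LoDTensor>() is the accessor already used by the kernel hunks below.

    // Illustrative usage: obtain an anonymous, scope-owned tensor.
    framework::LoDTensor *NewScopeOwnedTensor(framework::Scope *scope) {
      framework::Variable *var = scope->Var();          // registered in unnamed_vars_
      return var->GetMutable<framework::LoDTensor>();   // freed by ~Scope(), not by the caller
    }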
@@ -45,13 +45,13 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
     inv_std_ptr[i] =
         1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
   }
-  // Tensor *new_scale = new Tensor();
-  // Tensor *new_bias = new Tensor();
-  auto *new_scale = param->CreateNewScale<framework::LoDTensor>();
-  auto *new_bias = param->CreateNewBiase<framework::LoDTensor>();
-  auto new_scale_ptr = new_scale->mutable_data<float>({C});
-  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  Variable *scale_var = param->GetScope()->Var();
+  Variable *bias_var = param->GetScope()->Var();
+  LoDTensor *new_scale = scale_var->GetMutable<LoDTensor>();
+  LoDTensor *new_bias = bias_var->GetMutable<LoDTensor>();
+  float *new_scale_ptr = new_scale->mutable_data<float>({C});
+  float *new_bias_ptr = new_bias->mutable_data<float>({C});
   for (int i = 0; i < C; i++) {
     new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
     new_bias_ptr[i] = bias_ptr[i] + (bias1_ptr[i] - mean_ptr[i]) *
......
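
For reference, the Init() loops in this hunk and in the ConvBNRelu/DWConvBNRelu hunks below perform the usual conv+BN folding into per-channel factors (symbols correspond to scale_ptr, bias_ptr, bias1_ptr, mean_ptr, variance_ptr and epsilon in the code):

    // Per-channel batch-norm folding computed by these Init() loops:
    //   new_scale[c] = scale[c] / sqrt(variance[c] + epsilon)
    //   new_bias[c]  = bias[c] + (bias1[c] - mean[c]) * new_scale[c]
    //                  (ConvAddBNRelu; the trailing multiplier is cut off in the hunk
    //                   above but matches the factor used by the other kernels)
    //   new_bias[c]  = bias[c] - mean[c] * new_scale[c]   (ConvBNRelu, DWConvBNRelu)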
@@ -43,10 +43,12 @@ bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam<CPU> *param) {
         1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
   }
-  auto *new_scale = param->CreateNewScale<framework::LoDTensor>();
-  auto *new_bias = param->CreateNewBiase<framework::LoDTensor>();
-  auto new_scale_ptr = new_scale->mutable_data<float>({C});
-  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  Variable *scale_var = param->GetScope()->Var();
+  Variable *bias_var = param->GetScope()->Var();
+  LoDTensor *new_scale = scale_var->GetMutable<LoDTensor>();
+  LoDTensor *new_bias = bias_var->GetMutable<LoDTensor>();
+  float *new_scale_ptr = new_scale->mutable_data<float>({C});
+  float *new_bias_ptr = new_bias->mutable_data<float>({C});
   for (int i = 0; i < C; i++) {
     new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
     new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
......
@@ -64,7 +64,9 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
              ) {
     param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
     // transform weight
-    param->transformed_filter_ = new framework::LoDTensor;
+    Variable *transformed_var = param->GetScope()->Var();
+    param->transformed_filter_ =
+        transformed_var->GetMutable<framework::LoDTensor>();
     operators::math::winograd_transform_weight<8, 3>(
         *param->Filter(), param->transformed_filter_);
   } else if (conv3x3 && param->Groups() == 1 &&
......
@@ -42,10 +42,12 @@ bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam<CPU> *param) {
     inv_std_ptr[i] =
         1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
   }
-  LoDTensor *new_scale = new LoDTensor();
-  LoDTensor *new_bias = new LoDTensor();
-  auto new_scale_ptr = new_scale->mutable_data<float>({C});
-  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  Variable *scale_var = param->GetScope()->Var();
+  Variable *bias_var = param->GetScope()->Var();
+  LoDTensor *new_scale = scale_var->GetMutable<LoDTensor>();
+  LoDTensor *new_bias = bias_var->GetMutable<LoDTensor>();
+  float *new_scale_ptr = new_scale->mutable_data<float>({C});
+  float *new_bias_ptr = new_bias->mutable_data<float>({C});
   for (int i = 0; i < C; i++) {
     new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
     new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
......
@@ -438,14 +438,14 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha,
 #else
     const int tid = 0;
 #endif  // _OPENMP
-    register float *thread_buf_c = buf_c + tid * M;
-    register const float *in0 = A + n * lda;
-    register const float *in1 = in0 + lda;
-    register const float *in2 = in1 + lda;
-    register const float *in3 = in2 + lda;
-    register float32x4_t _b = vld1q_f32(B + n);
-    register float32x4_t _sum0;
-    register int m = 0;
+    float *thread_buf_c = buf_c + tid * M;
+    const float *in0 = A + n * lda;
+    const float *in1 = in0 + lda;
+    const float *in2 = in1 + lda;
+    const float *in3 = in2 + lda;
+    float32x4_t _b = vld1q_f32(B + n);
+    float32x4_t _sum0;
+    int m = 0;
     for (; m < M - 3; m += 4) {
       float32x4_t _r0 = vld1q_f32(in0 + m);
       float32x4_t _r1 = vld1q_f32(in1 + m);
@@ -495,11 +495,11 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha,
 #else
     const int tid = 0;
 #endif  // _OPENMP
-    register float *thread_buf_c = buf_c + tid * M;
-    register const float *in0 = A + n * lda;
-    register float32x4_t _b = vld1q_dup_f32(B + n);
-    register float32x4_t _sum0;
-    register int m = 0;
+    float *thread_buf_c = buf_c + tid * M;
+    const float *in0 = A + n * lda;
+    float32x4_t _b = vld1q_dup_f32(B + n);
+    float32x4_t _sum0;
+    int m = 0;
     for (; m < M - 3; m += 4) {
       float32x4_t _r0 = vld1q_f32(in0 + m);
       float32x4_t _vbuff_c = vld1q_f32(thread_buf_c + m);
@@ -515,47 +515,39 @@ void sgemv_trans_mx1(const int M, const int N, const float alpha,
   // reduction operate for buf_c, sum to C and do left operations
   // y := alpha * A' * X + beta * y
   // reduction operate: sum multi-threadings result for over-all: A' * X
-  register float32x4_t _valpha = vdupq_n_f32(alpha);
+  float32x4_t _valpha = vdupq_n_f32(alpha);
   if (beta == 0.f) {
 #pragma omp parallel for
     for (int m = 0; m < M; m += 4) {
-      register float32x4_t _sum0 = vld1q_f32(buf_c + m);
+      float32x4_t _sum0 = vld1q_f32(buf_c + m);
       for (int tid = 1; tid < threads_num; ++tid) {
         _sum0 += vld1q_f32(buf_c + tid * M + m);
       }
       vst1q_f32(C + m, _sum0 * _valpha);
     }
-#pragma omp parallel for
     for (int m = (M & 0xfffffffc); m < M; ++m) {
-      register float _sum0 = *(buf_c + m);
-      for (register int tid = 1; tid < threads_num; ++tid) {
+      float _sum0 = *(buf_c + m);
+      for (int tid = 1; tid < threads_num; ++tid) {
        _sum0 += *(buf_c + tid * M + m);
       }
       C[m] = _sum0 * alpha;
     }
   } else {  // beta != 0.f
-    register float32x4_t _vbeta = vdupq_n_f32(beta);
+    float32x4_t _vbeta = vdupq_n_f32(beta);
 #pragma omp parallel for
     for (int m = 0; m < M; m += 4) {
-      register float32x4_t _sum0 = vld1q_f32(buf_c + m);
-      for (register int tid = 1; tid < threads_num; ++tid) {
+      float32x4_t _sum0 = vld1q_f32(buf_c + m);
+      for (int tid = 1; tid < threads_num; ++tid) {
        _sum0 += vld1q_f32(buf_c + tid * M + m);
       }
       float32x4_t _vc = vld1q_f32(C + m);
       vst1q_f32(C + m, _sum0 * _valpha + _vbeta * _vc);
     }
-#pragma omp parallel for
-    for (int m = (m & 0xfffffffc); m < M; ++m) {
-      register float _sum0 = *(buf_c + m);
-      for (register int tid = 1; tid < threads_num; ++tid) {
-        _sum0 += *(buf_c + tid * M + m);
-      }
-      C[m] = _sum0 * alpha + beta * C[m];
-    }
-#pragma omp parallel for
-    for (int m = (m & 0xfffffffc); m < M; ++m) {
-      register float _sum0 = *(buf_c + m);
-      for (register int tid = 1; tid < threads_num; ++tid) {
+    for (int m = (M & 0xfffffffc); m < M; ++m) {
+      float _sum0 = *(buf_c + m);
+      for (int tid = 1; tid < threads_num; ++tid) {
        _sum0 += *(buf_c + tid * M + m);
       }
       C[m] = _sum0 * alpha + beta * C[m];
......
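
The gemv fixes in sgemv_trans_mx1 are: the deprecated register qualifiers are dropped, the stray #pragma omp parallel for directives on the scalar tail loops are removed, a duplicated tail block in the beta != 0 branch is deleted, and the tail index now starts at (M & 0xfffffffc) instead of being computed from a freshly declared, uninitialized m. The sketch below shows the corrected reduction in plain scalar C++; the reduce_buf_c name is made up for illustration and scalar loops stand in for the NEON intrinsics, but buf_c, threads_num, alpha and beta match the diff.

    // Sketch: combine threads_num per-thread partial results (each of length M,
    // stored back-to-back in buf_c) into C, scaling by alpha and blending with beta.
    void reduce_buf_c(const float *buf_c, float *C, int M, int threads_num,
                      float alpha, float beta) {
      const int body = M & 0xfffffffc;  // portion handled 4-wide with NEON in the real code
      for (int m = 0; m < body; ++m) {
        float sum = buf_c[m];
        for (int tid = 1; tid < threads_num; ++tid) sum += buf_c[tid * M + m];
        C[m] = (beta == 0.f) ? sum * alpha : sum * alpha + beta * C[m];
      }
      // Tail: must start at M & 0xfffffffc; the old code derived this bound from an
      // uninitialized shadowing m, so the last M % 4 outputs were effectively undefined.
      for (int m = body; m < M; ++m) {
        float sum = buf_c[m];
        for (int tid = 1; tid < threads_num; ++tid) sum += buf_c[tid * M + m];
        C[m] = (beta == 0.f) ? sum * alpha : sum * alpha + beta * C[m];
      }
    }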
@@ -76,27 +76,11 @@ struct DtypeTensorTrait<GPU_CL> {
 class OpParam {
  public:
   OpParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-          const AttributeMap &attrs, Scope *scope) {
-    scope_pointer_ = scope;
-    inputs_ = inputs;
-  }
-
-  template <typename T>
-  T *CreateNewScale() {
-    std::string scale_key = Getkey("Scale", inputs_, 0);
-    auto var = scope_pointer_->Var(scale_key + "_new");
-    return var->GetMutable<T>();
-  }
-
-  template <typename T>
-  T *CreateNewBiase() {
-    std::string biase_key = Getkey("Bias", inputs_, 0);
-    auto var = scope_pointer_->Var(biase_key + "_new");
-    return var->GetMutable<T>();
-  }
-
-  VariableNameMap inputs_;
-  Scope *scope_pointer_ = nullptr;
+          const AttributeMap &attrs, Scope *scope)
+      : scope_(scope) {}
+
+  Scope *GetScope() const { return scope_; }
+  Scope *scope_ = nullptr;

 #ifdef PADDLE_MOBILE_FPGA_KD
   zynqmp::Context &context() { return context_; }
......