Commit 9d2d3d0f authored by chonwhite

fixed memory leak

Parent 9c15846a
......@@ -14,6 +14,8 @@
#pragma once
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
......@@ -37,8 +39,18 @@ class Debugger {
}
}
// Rough per-op timing hook; tock() below is still a stub.
void tick(std::string key) {
float value = 0;
if (tick_tock_map.count(key) > 0) {
value = tick_tock_map[key];
}
tick_tock_map[key] = value;
}
void tock(std::string key) {}
private:
std::unordered_map<std::string, bool> op_config;
std::unordered_map<std::string, float> tick_tock_map;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io.hpp"
namespace paddle {
namespace zynqmp {
// FpgaIO::FpgaIO() {}
// void FpgaIO::setMutex(std::mutex* mtx) { mtx_ = mtx; }
// void FpgaIO::setConditionVariable(std::condition_variable* condition) {
// condition_ = condition;
// }
// void FpgaIO::lock() {
// if (mtx_ != nullptr && !locked_) {
// mtx_->lock();
// locked_ = true;
// }
// }
// void FpgaIO::unlock() {
// if (mtx_ != nullptr) {
// mtx_->unlock();
// condition_->notify_one();
// }
// locked_ = false;
// }
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <condition_variable>
#include <mutex>
namespace paddle {
namespace zynqmp {
class FpgaIO {
public:
static FpgaIO& get_instance() {
static FpgaIO s_instance;
return s_instance;
}
void allocData(size_t s) {
delete[] data_;  // release any previously allocated buffer to avoid leaking it
data_ = new float[s];
}
float* getData() { return data_; }
// void setMutex(std::mutex* mtx);
// void setConditionVariable(std::condition_variable* condition);
// void lock();
// void unlock();
private:
std::mutex* mtx_ = nullptr;
std::condition_variable* condition_ = nullptr;
bool locked_ = false;
float* data_ = nullptr;
FpgaIO() {}  // defined inline; the out-of-line definition in io.cpp is commented out
};
} // namespace zynqmp
} // namespace paddle
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <cmath>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
......@@ -37,10 +39,9 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
convParam_.relu = param_.relu;
// convParam_.activeParam.type = param_.activeParam.type;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -49,34 +50,54 @@ class FullyConnectedPE : public PE {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
int align = 32;
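// Treat the FC layer as a 1x1 convolution: pad chw up to a multiple of 32 so the
// input and filter channels match the aligned layout expected by the conv PE.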
int chw_aligned = ((chw + align - 1) / align) * align;
int infer_num = 1;
Shape in_shape(NCHW, {infer_num, chw_aligned, 1, 1});
aligned_input_.mutableData<float16>(FP16, in_shape);
convParam_.input = &aligned_input_;
Shape out_shape(NCHW, {infer_num, num, 1, 1});
aligned_output_.mutableData<float16>(FP16, out_shape);
convParam_.output = &aligned_output_;
int height = param_.input->shape().height();
int width = param_.input->shape().width();
int filter_channel = chw / height / width;
// int filter_channel = chw / height / width;
int channel = param_.output->shape().channel();
Shape shape(NCHW, {num, filter_channel, height, width});
Tensor* conv_filter = new Tensor();
float* new_filter_data = conv_filter->mutableData<float>(FP32, shape);
Shape shape(NCHW, {num, chw_aligned, 1, 1});
float* new_filter_data = conv_filter_.mutableData<float>(FP32, shape);
float* filter_data = param_.filter->data<float>();
memset(new_filter_data, 0, num * chw_aligned * sizeof(float));
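// Transpose the FC weights from [chw, num] to [num, chw_aligned]; the padded tail
// beyond chw stays zero thanks to the memset above.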
for (int i = 0; i < num; i++) {
for (int j = 0; j < chw; j++) {
float scale = filter_data[j * num + i];
new_filter_data[i * chw + j] = scale;
new_filter_data[i * chw_aligned + j] = scale;
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
Shape sb_shape(N, {channel});
conv_filter_.flush();
convParam_.filter = &conv_filter_;
// param_.filter->saveToFile("param_filter", true);
// conv_filter->saveToFile("conv_filter", true);
// exit(-1);
Shape sb_shape(N, {num});
float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
float* bias_data = convParam_.bias()->mutableData<float>(FP32, sb_shape);
for (int i = 0; i < channel; i++) {
for (int i = 0; i < num; i++) {
scale_data[i] = 1.0f;
bias_data[i] = param_.bias->data<float>()[i];
}
// for (int i = 0; i < num; i++) {
// scale_data[i] = 1.0f;
// bias_data[i] = param_.bias->data<float>()[i];
// }
convParam_.scale()->flush();
convParam_.bias()->flush();
......@@ -84,15 +105,197 @@ class FullyConnectedPE : public PE {
convPE_.apply();
}
bool dispatch() { return convPE_.dispatch(); }
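// Reference CPU path: computes the fully-connected layer with the original
// [chw, num] filter and derives the output quantization scale from the observed maximum.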
void cpu_compute() {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
float* filter_data = param_.filter->data<float>();
float max = 0.0f;
Tensor* input = param_.input;
Tensor* output = param_.output;
for (int n = 0; n < input->shape().num(); n++) {
float16* input_data = input->data<float16>() + n * chw;
float16* output_data =
output->data<float16>() + n * output->shape().channel();
for (int i = 0; i < num; i++) {
float sum = 0;
float bias = param_.bias->data<float>()[i];
for (int j = 0; j < chw; j++) {
float scale = filter_data[j * num + i];
float data = half_to_float(input_data[j]);
sum += scale * data;
}
float value = sum + bias;
output_data[i] = float_to_half(value);
if (max < value) {
max = value;
}
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
}
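// Debug variant of the CPU path that reads the realigned conv_filter_
// ([num, chw_aligned]) instead of the original filter.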
void cpu_compute1() {
int num = conv_filter_.shape().num();
int chw = conv_filter_.shape().channel();
// chw = 336;
float* filter_data = conv_filter_.data<float>();
float max = 0.0f;
Tensor* input = param_.input;
Tensor* output = param_.output;
for (int n = 0; n < input->shape().num(); n++) {
float16* input_data = input->data<float16>() + n * chw;
float16* output_data =
output->data<float16>() + n * output->shape().channel();
for (int i = 0; i < num; i++) {
float sum = 0;
float bias = param_.bias->data<float>()[i];
for (int j = 0; j < chw; j++) {
float scale = filter_data[i * chw + j];
float data = half_to_float(input_data[j]);
sum += scale * data;
}
float value = sum + bias;
if (std::isinf(value) || i > 321) {
std::cout << "i:" << i << " sum:" << sum << " bias:" << bias
<< std::endl;
// exit(-1);
}
output_data[i] = float_to_half(value);
if (max < value) {
max = value;
}
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
output->saveToFile("cpu_compute", true);
// exit(-1);
}
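// Experimental helper that packs each input sample into the zero-padded
// aligned_input_ buffer; only referenced from the commented-out code in dispatch() below.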
void batch_to_w() {
ConvParam& convParam_ = convPE_.param();
int channel = param_.input->shape().channel();
param_.input->invalidate();
int remainder =
aligned_input_.shape().channel() - param_.input->shape().channel();
float max = 0;
for (int n = 0; n < param_.input->shape().num(); n++) {
memset(aligned_input_.data<float16>(),
0,
aligned_input_.shape().channel() * sizeof(float16));
memcpy(
aligned_input_.data<float16>() + n * aligned_input_.shape().channel(),
param_.input->data<float16>() + n * channel,
channel * sizeof(float16));
aligned_input_.copyScaleFrom(param_.input);
aligned_input_.flush();
}
convPE_.dispatch();
}
bool dispatch() {
// batch_to_w();
// return 1;
// cpu_compute1();
// return 1;
// int num = param_.filter->shape().channel();
// if (num == 2) {
// cpu_compute();
// return 1;
// } else {
// return convPE_.dispatch();
// }
ConvParam& convParam_ = convPE_.param();
if (param_.input->shape().channel() == 321 &&
param_.output->shape().channel() == 384) {
// conv_filter_.saveToFile("conv_filter", true);
// cpu_compute1();
// return 1;
}
int channel = param_.input->shape().channel();
param_.input->invalidate();
int remainder =
aligned_input_.shape().channel() - param_.input->shape().channel();
float max = 0;
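// Dispatch the 1x1 conv once per input sample: zero-pad the sample into
// aligned_input_, run the conv PE, copy the result back into the output tensor,
// and track the largest per-sample scale.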
for (int n = 0; n < param_.input->shape().num(); n++) {
memset(aligned_input_.data<float16>(),
0,
aligned_input_.shape().channel() * sizeof(float16));
memcpy(aligned_input_.data<float16>(),
param_.input->data<float16>() + n * channel,
channel * sizeof(float16));
aligned_input_.copyScaleFrom(param_.input);
aligned_input_.flush();
if (param_.input->shape().channel() == 321 &&
param_.output->shape().channel() == 384) {
// aligned_input_.saveToFile("aligned_input_", true);
// convParam_.filter->saveToFile("conv_filter", true);
}
convPE_.dispatch();
aligned_output_.invalidate();
if (param_.input->shape().num() == 230) {
// aligned_output_.saveToFile("ao", true);
}
//
float16* src = aligned_output_.data<float16>();
float16* dst =
param_.output->data<float16>() + n * param_.output->shape().channel();
memcpy(dst, src, param_.output->shape().channel() * sizeof(float16));
if (aligned_output_.scale()[0] > max) {
max = aligned_output_.scale()[0];
}
}
param_.output->flush();
param_.output->scale()[0] = max / 127.0f;
param_.output->scale()[1] = 127.0f / max;
// param_.output->saveToFile("out", true);
// exit(-1);
// cpu_compute();
// ConvParam& convParam_ = convPE_.param();
// convParam_.scale()->saveToFile("scale", true);
return true;
}
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
Tensor aligned_input_;
Tensor aligned_output_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
Tensor conv_filter_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -29,19 +29,28 @@ class InputPE : public PE {
}
bool dispatch() {
std::cout << "input_dispatch()\n";
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor* src = input;
// std::cout << "input:" << input << std::endl;
input->flush();
// std::cout << "input_flush()\n";
Tensor half_tensor;
if (input->dataType() == DataType::FP32) {
// std::cout << "2()\n";
half_tensor.mutableData<void*>(DataType::FP16, input->shape());
// std::cout << "3()\n";
half_tensor.copyFrom(input);
// std::cout << "4()\n";
src = &half_tensor;
}
// std::cout << "5()\n";
output->mutableData<void>();
// std::cout << "6()\n";
src->alignImage(output, true);
// std::cout << "7()\n";
return true;
}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
......@@ -52,6 +53,10 @@ class OutputPE : public PE {
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
// auto max = fpga_get_memory_size_max();
// std::cout << "===== Max: ===== :: " << max << std::endl;
return true;
}
......
......@@ -103,12 +103,18 @@ class Tensor {
return reinterpret_cast<Dtype*>(ptr);
}
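// Drop the backing placeHolder_ so its memory can be reclaimed; flush() becomes a
// no-op once released is set.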
void releaseData() {
released = true;
placeHolder_.reset();
}
template <typename Dtype>
Dtype* mutableData(DataType dataType, const Shape& shape) {
if (this->shape_ != nullptr) {
delete shape_;
}
this->shape_ = new Shape(shape);
// std::cout << "enter \n";
// std::cout << "before new shape\n";
// this->shape_ = new Shape(shape);
this->shape_.reset(new Shape(shape));
// std::cout << "new shape \n";
this->dataType_ = dataType;
return mutableData<Dtype>();
}
......@@ -117,11 +123,14 @@ class Tensor {
Dtype* mutableData() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
// std::cout << "mem_size:" << memorySize << std::endl;
if (placeHolder_ != nullptr) {
// std::cout << "placeHolder_ not null"<< std::endl;
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
}
} else {
// std::cout << "placeHolder_ null"<< std::endl;
placeHolder_.reset(new PlaceHolder(memorySize));
}
return data<Dtype>();
......@@ -138,7 +147,7 @@ class Tensor {
DataType dataType() { return this->dataType_; }
Shape& shape() { return *shape_; }
Shape& shape() { return *(shape_.get()); }
bool aligned() { return this->aligned_; }
......@@ -247,15 +256,17 @@ class Tensor {
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
if (shape_ != nullptr) {
delete shape_;
}
// if (shape_ != nullptr) {
// delete shape_;
// }
this->placeHolder_ = src->placeHolder_;
this->dataType_ = src->dataType_;
this->aligned_ = src->aligned_;
this->dateLocation_ = src->dateLocation_;
this->offset = offset;
shape_ = new Shape(const_cast<Shape&>(shape));
// shape_ = new Shape(const_cast<Shape&>(shape));
shape_.reset(new Shape(shape));
}
void copyFrom(Tensor* src) {
......@@ -300,6 +311,14 @@ class Tensor {
}
void flush() {
// std::cout << "released:" << released << std::endl;
// std::cout << "placeHolder_" << placeHolder_.get() << std::endl;
if (released) {
// std::cout << "flush::" << this << std::endl;
return;
}
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
......@@ -463,18 +482,13 @@ class Tensor {
return os;
}
~Tensor() {
if (shape_ != nullptr) {
delete shape_;
shape_ = nullptr;
}
}
private:
bool released = false;
int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
std::shared_ptr<Shape> shape_;
// Shape* shape_ = nullptr;
DataType dataType_ = FP32;
bool aligned_ = false;
DataSyncStatus synchedStatus_ = Synched;
......
......@@ -69,7 +69,7 @@ std::string DDimLite::repr() const {
}
void TensorLite::ShareDataWith(const TensorLite &other) {
buffer_ = other.buffer_;
buffer_ = other.buffer_; // TODO(chonwhite) delete buffer;
dims_ = other.dims_;
zynq_tensor_ = other.zynq_tensor_;
target_ = other.target_;
......@@ -79,10 +79,10 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
}
void *TensorLite::mutable_data(size_t memory_size) {
memory_size_ = memory_size;
memory_size_ = memory_size; // TODO(chonwhite) delete buffer;
buffer_->ResetLazy(target_, memory_size_);
// throw -1;
std::cout << memory_size << std::endl;
// std::cout << memory_size << std::endl;
return buffer_->data();
}
......@@ -92,16 +92,34 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
// std::cout << "other11:: "<< &other << std::endl;
dims_ = other.dims_;
target_ = other.target_;
lod_ = other.lod_;
auto dt = zynq_tensor_->dataType();
// std::cout << "before dataType\n";
auto shape = other.zynq_tensor_->shape();
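// zynq_tensor_ is now held in a shared_ptr (see the member change below), so it is
// created lazily here.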
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
auto dt = zynq_tensor_->dataType();
// std::cout << "after dataType\n";
// std::cout << "before resize\n";
Resize(other.dims());
auto shape = other.zynq_tensor_->shape();
// std::cout << "after resize\n";
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
this->ZynqTensor()->copyFrom(other.ZynqTensor());
// std::cout << "after mutableData\n";
// std::cout << "ZynqTensor():" << this->ZynqTensor() << std::endl;
// std::cout << "other Tensor():" << other.ZynqTensor() << std::endl;
// this->ZynqTensor()->copyFrom(other.ZynqTensor());
memcpy(this->ZynqTensor()->data<void>(),
other.ZynqTensor()->data<void>(),
other.ZynqTensor()->shape().numel() * sizeof(float));
// memcpy()
// std::cout << "after copyFrom\n";
}
} // namespace lite
......
......@@ -81,6 +81,10 @@ class DDimLite {
return !(a == b);
}
~DDimLite() {
// std::cout << "free DDimLite\n";
}
private:
std::vector<value_type> data_;
};
......@@ -109,7 +113,12 @@ class TensorLite {
return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
void Resize(const DDimLite &ddim) {
// std::cout << "Resize \n";
// std::cout << "ddim:" << & ddim << std::endl;
dims_ = ddim;
// std::cout << "after Reize \n";
}
void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
const DDimLite &dims() const { return dims_; }
......@@ -142,7 +151,9 @@ class TensorLite {
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
const void *raw_data() const { return buffer_->data(); }
const void *raw_data() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
size_t data_size() const { return this->dims().production(); }
......@@ -150,7 +161,9 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
bool IsInitialized() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
// Other share data to this.
void ShareDataWith(const TensorLite &other);
......@@ -168,7 +181,7 @@ class TensorLite {
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
os << "Tensor:" << '\n';
......@@ -197,7 +210,8 @@ class TensorLite {
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
// zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
template <typename T>
void mutable_data_internal();
......@@ -206,6 +220,7 @@ class TensorLite {
template <typename T, typename R>
R *TensorLite::mutable_data() {
std::vector<int> v;
// std::cout << "mutable_data \n";
for (int i = 0; i < dims_.size(); i++) {
v.push_back(dims_[i]);
}
......@@ -228,7 +243,7 @@ R *TensorLite::mutable_data() {
break;
}
zynqmp::Shape input_shape(layout_type, v);
// std::cout << "input_shape \n";
zynqmp::DataType data_type = zynqmp::FP32;
if (typeid(T) == typeid(float)) {
data_type = zynqmp::FP32;
......@@ -236,6 +251,13 @@ R *TensorLite::mutable_data() {
if (typeid(T) == typeid(zynqmp::float16)) {
data_type = zynqmp::FP16;
}
// std::cout << "mutableData \n";
// std::cout << "zynq_tensor_:" << zynq_tensor_.get() << std::endl;
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
return zynq_tensor_->mutableData<R>(data_type, input_shape);
}
......@@ -276,6 +298,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
// TODO(chonwhite) delete this function;
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
......
......@@ -40,8 +40,8 @@ void FeedCompute::PrepareForRun() {
void FeedCompute::Run() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
pe_.param().input = x.ZynqTensor();
pe_.dispatch();
auto out_lod = param.out->mutable_lod();
*out_lod = x.lod();
......
......@@ -80,7 +80,8 @@ void mul(MulCompute* k) {
}
void MulCompute::Run() {
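// Run() now routes through the CPU mul() helper above instead of dispatching the FPGA PE.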
pe_.dispatch();
// pe_.dispatch();
mul(this);
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("mul", fc_param.output);
......