Commit dd2b700d authored by hanbuhe

merge memory leak into eb1.4.0
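The leak fixes below share one pattern: raw owning pointers (`Shape*` inside `zynqmp::Tensor`, `zynqmp::Tensor*` inside `TensorLite`) become `std::shared_ptr`, so reassignment and shared views no longer depend on manual `delete`. A minimal sketch of the ownership change (`Shape` stands in for `zynqmp::Shape`; these are not the full classes):

```cpp
#include <memory>

struct Shape { int numel = 0; };

// Before: raw owning pointer; every reassignment site must remember delete.
struct TensorBefore {
  Shape* shape_ = nullptr;
  void resize(const Shape& s) {
    delete shape_;           // forget this once and the old Shape leaks
    shape_ = new Shape(s);
  }
  ~TensorBefore() { delete shape_; }
};

// After (this commit): reset() frees the previous Shape automatically and
// the implicitly generated destructor releases the last reference.
struct TensorAfter {
  std::shared_ptr<Shape> shape_;
  void resize(const Shape& s) { shape_.reset(new Shape(s)); }
};
```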

......@@ -22,6 +22,9 @@ if (WITH_PADDLE_MOBILE)
return()
endif(WITH_PADDLE_MOBILE)
# set(CMAKE_BUILD_TYPE DEBUG)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
......@@ -14,6 +14,8 @@
#pragma once
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
......@@ -37,8 +39,18 @@ class Debugger {
}
}
void tick(std::string key) {
float value = 0;
if (tick_tock_map.count(key) > 0) {
value = tick_tock_map[key];  // keep the previously recorded value
}
tick_tock_map[key] = value;
}
void tock(std::string key) {}
private:
std::unordered_map<std::string, bool> op_config;
std::unordered_map<std::string, float> tick_tock_map;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
......
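`tock()` above is still a stub, so `tick_tock_map` never accumulates an elapsed time. A hedged sketch of what a completed tick/tock pair could look like, using `std::chrono` (an assumption; the commit itself records no timestamps):

```cpp
#include <chrono>
#include <string>
#include <unordered_map>

// Hypothetical completion: tick() stores a start time per key,
// tock() adds the elapsed milliseconds into an accumulator.
class TickTock {
 public:
  void tick(const std::string& key) { start_[key] = now_ms(); }
  void tock(const std::string& key) {
    if (start_.count(key) > 0) {
      elapsed_[key] += now_ms() - start_[key];
    }
  }
  float elapsed(const std::string& key) { return elapsed_[key]; }

 private:
  static float now_ms() {
    return std::chrono::duration<float, std::milli>(
               std::chrono::steady_clock::now().time_since_epoch())
        .count();
  }
  std::unordered_map<std::string, float> start_;
  std::unordered_map<std::string, float> elapsed_;
};
```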
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io.hpp"
namespace paddle {
namespace zynqmp {
// FpgaIO::FpgaIO() {}
// void FpgaIO::setMutex(std::mutex* mtx) { mtx_ = mtx; }
// void FpgaIO::setConditionVariable(std::condition_variable* condition) {
// condition_ = condition;
// }
// void FpgaIO::lock() {
// if (mtx_ != nullptr && !locked_) {
// mtx_->lock();
// locked_ = true;
// }
// }
// void FpgaIO::unlock() {
// if (mtx_ != nullptr) {
// mtx_->unlock();
// condition_->notify_one();
// }
// locked_ = false;
// }
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
// #include <condition_variable>
// #include <mutex>
namespace paddle {
namespace zynqmp {
class FpgaIO {
public:
static FpgaIO& get_instance() {
static FpgaIO s_instance;
return s_instance;
}
void allocData(size_t s) { delete[] data_; data_ = new float[s]; }  // delete[] avoids leaking a prior buffer
float* getData() { return data_; }
// void setMutex(std::mutex* mtx);
// void setConditionVariable(std::condition_variable* condition);
// void lock();
// void unlock();
private:
// std::mutex* mtx_ = nullptr;
// std::condition_variable* condition_ = nullptr;
// bool locked_ = false;
float* data_ = nullptr;
FpgaIO() = default;  // defined inline; the out-of-line definition in io.cpp is commented out
};
} // namespace zynqmp
} // namespace paddle
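`FpgaIO` is a Meyers singleton (the function-local `static` is initialized thread-safely since C++11) holding a single scratch buffer. A usage sketch, assuming the leak-safe `allocData` above:

```cpp
#include <cstddef>
#include "io.hpp"  // paddle::zynqmp::FpgaIO

// Usage sketch: fetch the process-wide scratch buffer, sizing it once.
float* get_scratch(size_t n) {
  auto& io = paddle::zynqmp::FpgaIO::get_instance();
  if (io.getData() == nullptr) {
    io.allocData(n);  // first caller sizes the buffer
  }
  return io.getData();  // later callers must not assume more than n floats
}
```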
......@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
int8_t* quantized_start = quantized_data + n * chw;
quantize(filter_start, quantized_start, chw, max);
filter_max.push_back(1);
quantize(filter_start, quantized_start, chw, f_max);
filter_max.push_back(f_max);
}
int8_t* hwc_data =
......
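The fix above is per-filter quantization: the old code quantized every filter against the global `max` and recorded a scale of `1`, while the new code passes each filter's own `f_max` and records it in `filter_max`. A sketch of the symmetric int8 quantization this implies (the real `quantize()` lives in `llapi/filter.cpp`; this signature is an assumption):

```cpp
#include <cmath>
#include <cstdint>

// Symmetric per-filter int8 quantization: map [-f_max, f_max] to [-127, 127].
void quantize(const float* in, int8_t* out, int len, float f_max) {
  float scale = (f_max > 0.0f) ? 127.0f / f_max : 0.0f;
  for (int i = 0; i < len; i++) {
    out[i] = static_cast<int8_t>(std::round(in[i] * scale));
  }
}
```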
......@@ -205,7 +205,7 @@ int get_device_info(const struct DeviceInfo &args) {
int perform_bypass(const struct BypassArgs &args) {
int ret = -1;
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
int max_size = 1 << 22;
float times = 1.0 * size / max_size;
int count = static_cast<int>(times);
......
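`perform_bypass` moves data in fixed-size passes; this hunk doubles the per-pass cap from 2^21 to 2^22 elements. The chunk arithmetic, worked for a 1080p three-channel image:

```cpp
#include <cstdio>

int main() {
  int size = 3 * 1920 * 1080;       // 6,220,800 elements
  int max_size = 1 << 22;           // 4,194,304 per pass (was 1 << 21)
  int count = size / max_size;      // 1 full pass
  int remainder = size % max_size;  // 2,026,496 elements in the final pass
  std::printf("%d full passes, %d remaining\n", count, remainder);
  return 0;
}
```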
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <cmath>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
......@@ -38,7 +40,6 @@ class FullyConnectedPE : public PE {
Tensor* input = param_.input;
convParam_.input = param_.input;
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.activeParam.type = param_.activeParam.type;
convParam_.groups = 1;
convParam_.strides = {1, 1};
......@@ -54,32 +55,42 @@ class FullyConnectedPE : public PE {
int height = param_.input->shape().height();
int width = param_.input->shape().width();
int filter_channel = chw / height / width;
// int filter_channel = chw / height / width;
int channel = param_.output->shape().channel();
Shape shape(NCHW, {num, filter_channel, height, width});
Tensor* conv_filter = new Tensor();
float* new_filter_data = conv_filter->mutableData<float>(FP32, shape);
Shape shape(NCHW, {num, chw_aligned, 1, 1});
float* new_filter_data = conv_filter_.mutableData<float>(FP32, shape);
float* filter_data = param_.filter->data<float>();
memset(new_filter_data, 0, num * chw_aligned * sizeof(float));
for (int i = 0; i < num; i++) {
for (int j = 0; j < chw; j++) {
float scale = filter_data[j * num + i];
new_filter_data[i * chw + j] = scale;
new_filter_data[i * chw_aligned + j] = scale;
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
Shape sb_shape(N, {channel});
conv_filter_.flush();
convParam_.filter = &conv_filter_;
// param_.filter->saveToFile("param_filter", true);
// conv_filter->saveToFile("conv_filter", true);
// exit(-1);
Shape sb_shape(N, {num});
float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
float* bias_data = convParam_.bias()->mutableData<float>(FP32, sb_shape);
for (int i = 0; i < channel; i++) {
for (int i = 0; i < num; i++) {
scale_data[i] = 1.0f;
bias_data[i] = param_.bias->data<float>()[i];
}
// for (int i = 0; i < num; i++) {
// scale_data[i] = 1.0f;
// bias_data[i] = param_.bias->data<float>()[i];
// }
convParam_.scale()->flush();
convParam_.bias()->flush();
......@@ -115,14 +126,41 @@ class FullyConnectedPE : public PE {
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
output->saveToFile("cpu_compute", true);
// exit(-1);
}
void batch_to_w() {
ConvParam& convParam_ = convPE_.param();
int channel = param_.input->shape().channel();
param_.input->invalidate();
int remainder =
aligned_input_.shape().channel() - param_.input->shape().channel();
float max = 0;
for (int n = 0; n < param_.input->shape().num(); n++) {
memset(aligned_input_.data<float16>(),
0,
aligned_input_.shape().channel() * sizeof(float16));
memcpy(
aligned_input_.data<float16>() + n * aligned_input_.shape().channel(),
param_.input->data<float16>() + n * channel,
channel * sizeof(float16));
aligned_input_.copyScaleFrom(param_.input);
aligned_input_.flush();
}
convPE_.dispatch();
}
bool dispatch() {
// int num = param_.filter->shape().channel();
// if (num == 2) {
// cpu_compute();
// batch_to_w();
// return 1;
// cpu_compute1();
// return 1;
// } else {
return convPE_.dispatch();
// }
}
......@@ -131,7 +169,10 @@ class FullyConnectedPE : public PE {
private:
FullyConnectedParam param_;
Tensor aligned_input_;
Tensor aligned_output_;
ConvPE convPE_;
Tensor conv_filter_;
};
} // namespace zynqmp
} // namespace paddle
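The rewrite above lowers the FC layer onto the convolution engine: the `chw x num` weight matrix is transposed into `num` 1x1 filters, each zero-padded from `chw` to `chw_aligned` channels (the hardware channel alignment; the exact alignment rule is not shown in this hunk). A standalone sketch of the repack loop:

```cpp
#include <vector>

// Repack a column-major FC weight (chw rows, num columns) into num 1x1
// conv filters of chw_aligned channels, zero-padding the tail.
std::vector<float> fc_to_conv_filter(const float* fc_w, int chw, int num,
                                     int chw_aligned) {
  std::vector<float> conv_w(static_cast<size_t>(num) * chw_aligned, 0.0f);
  for (int i = 0; i < num; i++) {
    for (int j = 0; j < chw; j++) {
      conv_w[i * chw_aligned + j] = fc_w[j * num + i];  // transpose
    }
  }
  return conv_w;
}
```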
......@@ -29,6 +29,7 @@ class InputPE : public PE {
}
bool dispatch() {
// std::cout << "input_dispatch()\n";
Tensor* input = param_.input;
Tensor* output = param_.output;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
......@@ -52,6 +53,12 @@ class OutputPE : public PE {
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
fpga_reset();
auto max = fpga_get_memory_size_max();
std::cout << "PL ===== Max: ===== :: " << max << std::endl;
return true;
}
......
......@@ -103,12 +103,18 @@ class Tensor {
return reinterpret_cast<Dtype*>(ptr);
}
void releaseData() {
released = true;
placeHolder_.reset();
}
template <typename Dtype>
Dtype* mutableData(DataType dataType, const Shape& shape) {
if (this->shape_ != nullptr) {
delete shape_;
}
this->shape_ = new Shape(shape);
// std::cout << "enter \n";
// std::cout << "before new shape\n";
// this->shape_ = new Shape(shape);
this->shape_.reset(new Shape(shape));
// std::cout << "new shape \n";
this->dataType_ = dataType;
return mutableData<Dtype>();
}
......@@ -117,11 +123,14 @@ class Tensor {
Dtype* mutableData() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
// std::cout << "mem_size:" << memorySize << std::endl;
if (placeHolder_ != nullptr) {
// std::cout << "placeHolder_ not null"<< std::endl;
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
}
} else {
// std::cout << "placeHolder_ null"<< std::endl;
placeHolder_.reset(new PlaceHolder(memorySize));
}
return data<Dtype>();
......@@ -138,7 +147,7 @@ class Tensor {
DataType dataType() { return this->dataType_; }
Shape& shape() { return *shape_; }
Shape& shape() { return *(shape_.get()); }
bool aligned() { return this->aligned_; }
......@@ -247,15 +256,17 @@ class Tensor {
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
if (shape_ != nullptr) {
delete shape_;
}
// if (shape_ != nullptr) {
// delete shape_;
// }
this->placeHolder_ = src->placeHolder_;
this->dataType_ = src->dataType_;
this->aligned_ = src->aligned_;
this->dateLocation_ = src->dateLocation_;
this->offset = offset;
shape_ = new Shape(const_cast<Shape&>(shape));
// shape_ = new Shape(const_cast<Shape&>(shape));
shape_.reset(new Shape(shape));
}
void copyFrom(Tensor* src) {
......@@ -300,7 +311,13 @@ class Tensor {
}
void flush() {
size_t memorySize = placeHolder_->memorySize();
if (released) {
// std::cout << "flush::" << this << std::endl;
return;
}
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
}
......@@ -451,18 +468,13 @@ class Tensor {
return os;
}
~Tensor() {
if (shape_ != nullptr) {
delete shape_;
shape_ = nullptr;
}
}
private:
bool released = false;
int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
std::shared_ptr<Shape> shape_;
// Shape* shape_ = nullptr;
DataType dataType_ = FP32;
bool aligned_ = false;
DataSyncStatus synchedStatus_ = Synched;
......
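Two leak paths close in this file: `shape_` becomes `std::shared_ptr<Shape>`, letting `mutableData`, `shareDataWith`, and the destructor drop their manual `delete`; and `flush()` now returns early once `releaseData()` has run, instead of dereferencing the reset `placeHolder_`. A condensed sketch of both fixes (types simplified; the `fpga_flush` driver call is elided):

```cpp
#include <memory>

struct Shape { size_t bytes = 0; };

struct MiniTensor {
  std::shared_ptr<float> placeHolder_;  // stands in for PlaceHolder
  std::shared_ptr<Shape> shape_;
  bool released = false;

  void shareDataWith(const MiniTensor& src, const Shape& shape) {
    placeHolder_ = src.placeHolder_;  // share the buffer by refcount
    shape_.reset(new Shape(shape));   // own a fresh view; old one is freed
  }
  void releaseData() {
    released = true;
    placeHolder_.reset();
  }
  void flush() {
    if (released) return;  // old code dereferenced the reset placeholder
    // fpga_flush(placeHolder_.get(), shape_->bytes);
  }
};
```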
......@@ -69,7 +69,7 @@ std::string DDimLite::repr() const {
}
void TensorLite::ShareDataWith(const TensorLite &other) {
buffer_ = other.buffer_;
buffer_ = other.buffer_; // TODO(chonwhite) delete buffer;
dims_ = other.dims_;
zynq_tensor_ = other.zynq_tensor_;
target_ = other.target_;
......@@ -79,10 +79,10 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
}
void *TensorLite::mutable_data(size_t memory_size) {
memory_size_ = memory_size;
memory_size_ = memory_size; // TODO(chonwhite) delete buffer;
buffer_->ResetLazy(target_, memory_size_);
// throw -1;
std::cout << memory_size << std::endl;
// std::cout << memory_size << std::endl;
return buffer_->data();
}
......@@ -92,16 +92,34 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
// std::cout << "other11:: "<< &other << std::endl;
dims_ = other.dims_;
target_ = other.target_;
lod_ = other.lod_;
auto dt = zynq_tensor_->dataType();
// std::cout << "before dataType\n";
auto shape = other.zynq_tensor_->shape();
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
auto dt = zynq_tensor_->dataType();
// std::cout << "after dataType\n";
// std::cout << "before resize\n";
Resize(other.dims());
auto shape = other.zynq_tensor_->shape();
// std::cout << "after resize\n";
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
this->ZynqTensor()->copyFrom(other.ZynqTensor());
// std::cout << "after mutableData\n";
// std::cout << "ZynqTensor():" << this->ZynqTensor() << std::endl;
// std::cout << "other Tensor():" << other.ZynqTensor() << std::endl;
// this->ZynqTensor()->copyFrom(other.ZynqTensor());
memcpy(this->ZynqTensor()->data<void>(),
other.ZynqTensor()->data<void>(),
other.ZynqTensor()->shape().numel() * sizeof(float));
// memcpy()
// std::cout << "after copyFrom\n";
}
} // namespace lite
......
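`CopyDataFrom` now tolerates a default-constructed `TensorLite`: with the raw `zynq_tensor_` pointer turned into a `shared_ptr` there is no eager allocation, hence the null check before use. Note that the byte count is `numel() * sizeof(float)` even for FP16 tensors; a dtype-aware copy would size by the element width, roughly as below (`cell_size` stands in for the KD `CellSize(DataType)` helper; an assumption, not this commit's code):

```cpp
#include <cstring>

// Hedged sketch: size the raw copy by the source element width rather
// than hard-coding sizeof(float).
inline void raw_copy(void* dst, const void* src, int numel, int cell_size) {
  std::memcpy(dst, src, static_cast<size_t>(numel) * cell_size);
}
```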
......@@ -81,6 +81,10 @@ class DDimLite {
return !(a == b);
}
~DDimLite() {
// std::cout << "free DDimLite\n";
}
private:
std::vector<value_type> data_;
};
......@@ -109,7 +113,12 @@ class TensorLite {
return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
void Resize(const DDimLite &ddim) {
// std::cout << "Resize \n";
// std::cout << "ddim:" << & ddim << std::endl;
dims_ = ddim;
// std::cout << "after Reize \n";
}
void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
const DDimLite &dims() const { return dims_; }
......@@ -142,7 +151,9 @@ class TensorLite {
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
const void *raw_data() const { return buffer_->data(); }
const void *raw_data() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
size_t data_size() const { return this->dims().production(); }
......@@ -150,7 +161,9 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
bool IsInitialized() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
// Other share data to this.
void ShareDataWith(const TensorLite &other);
......@@ -165,7 +178,10 @@ class TensorLite {
TargetType target() const { return target_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
os << "Tensor:" << '\n';
......@@ -194,7 +210,8 @@ class TensorLite {
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
// zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
template <typename T>
void mutable_data_internal();
......@@ -203,6 +220,7 @@ class TensorLite {
template <typename T, typename R>
R *TensorLite::mutable_data() {
std::vector<int> v;
// std::cout << "mutable_data \n";
for (int i = 0; i < dims_.size(); i++) {
v.push_back(dims_[i]);
}
......@@ -225,7 +243,7 @@ R *TensorLite::mutable_data() {
break;
}
zynqmp::Shape input_shape(layout_type, v);
// std::cout << "input_shape \n";
zynqmp::DataType data_type = zynqmp::FP32;
if (typeid(T) == typeid(float)) {
data_type = zynqmp::FP32;
......@@ -233,6 +251,13 @@ R *TensorLite::mutable_data() {
if (typeid(T) == typeid(zynqmp::float16)) {
data_type = zynqmp::FP16;
}
// std::cout << "mutableData \n";
// std::cout << "zynq_tensor_:" << zynq_tensor_.get() << std::endl;
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
return zynq_tensor_->mutableData<R>(data_type, input_shape);
}
......@@ -272,6 +297,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
// TODO(chonwhite) delete this function;
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
......
......@@ -59,6 +59,7 @@ void SequencePoolCompute::Run() {
for (int i = 0; i <= batch_size; i++) {
offset_new[i] = i;
}
(output->mutable_lod())->clear();
(output->mutable_lod())->push_back(offset_new);
}
......
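The added `clear()` keeps the output LoD from growing by one level on every `Run()`: `push_back` alone appends to whatever the previous inference left behind, exactly the kind of slow accumulation this commit hunts. The pattern in isolation:

```cpp
#include <cstdint>
#include <vector>

using LoD = std::vector<std::vector<uint64_t>>;

// Reset-then-set: without clear(), each call appends another LoD level.
void reset_lod(LoD* lod, const std::vector<uint64_t>& offsets) {
  lod->clear();
  lod->push_back(offsets);
}
```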
......@@ -14,7 +14,6 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
......
......@@ -40,8 +40,8 @@ void FeedCompute::PrepareForRun() {
void FeedCompute::Run() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
pe_.param().input = x.ZynqTensor();
pe_.dispatch();
auto out_lod = param.out->mutable_lod();
*out_lod = x.lod();
......
......@@ -82,6 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
host_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -80,7 +80,8 @@ void mul(MulCompute* k) {
}
void MulCompute::Run() {
pe_.dispatch();
// pe_.dispatch();
mul(this);
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("mul", fc_param.output);
......
......@@ -16,7 +16,7 @@
#include <utility>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
// #include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/host/one_hot_compute.h"
#include "lite/utils/paddle_enforce.h"
......
......@@ -15,7 +15,7 @@
#include "lite/operators/one_hot_op.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/debugger.hpp"
// #include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......