提交 35cfe592 编写于 作者: Z zhangyang

Avoid memory leaks in the FPGA track

上级 e96e3dd5
...@@ -411,6 +411,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -411,6 +411,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>(); auto out_ptr = out->data<float>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num; arg->group_num = (uint32_t)group_num;
// Either group_num or split_num = 1; // Either group_num or split_num = 1;
...@@ -421,6 +422,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -421,6 +422,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg = arg->conv_arg =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->shared_conv_arg = std::shared_ptr<ConvArgs>(arg->conv_arg, deleter);
memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs)); memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs));
arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_num = arg->split_num;
...@@ -431,11 +434,17 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -431,11 +434,17 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int n = arg->split_num; int n = arg->split_num;
arg->concat_arg.images_in = arg->concat_arg.images_in =
(half **)fpga_malloc(n * sizeof(int *)); // NOLINT static_cast<int16_t **>(fpga_malloc(n * sizeof(int *)));
arg->concat_arg.scales_in = arg->concat_arg.scales_in =
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT static_cast<float **>(fpga_malloc(n * sizeof(float *)));
arg->concat_arg.channel_num = arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT static_cast<uint32_t *>(fpga_malloc(n * sizeof(uint32_t)));
arg->vector_concat_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->concat_arg.images_in), deleter));
arg->vector_concat_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->concat_arg.scales_in), deleter));
arg->vector_concat_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->concat_arg.channel_num), deleter));
auto channel = (int)out->dims()[1]; // NOLINT auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num); int filter_num_per_div = get_filter_num_per_div(filter, group_num);
...@@ -469,6 +478,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -469,6 +478,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto filter_head = &( auto filter_head = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_arg[i].filter_address = fpga_malloc(filter_size); arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
arg->vector_conv_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size);
...@@ -477,18 +488,25 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -477,18 +488,25 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
sizeof(float); sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size); arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
arg->vector_conv_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size); fpga_flush(arg->conv_arg[i].sb_address, bs_size);
if (n > 1) { if (n > 1) {
arg->conv_arg[i].output.scale_address = arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT static_cast<float *>(fpga_malloc(2 * sizeof(float)));
arg->conv_arg[i].output.address = arg->conv_arg[i].output.address =
fpga_malloc(out->dims()[2] * fpga_malloc(out->dims()[2] *
align_to_x((int)(out->dims()[3] * // NOLINT align_to_x((int)(out->dims()[3] * // NOLINT
arg->conv_arg[i].filter_num), arg->conv_arg[i].filter_num),
IMAGE_ALIGNMENT) * IMAGE_ALIGNMENT) *
sizeof(half)); sizeof(half));
arg->vector_conv_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->conv_arg[i].output.scale_address),
deleter));
arg->vector_conv_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->conv_arg[i].output.address), deleter));
} else { } else {
arg->conv_arg[i].output.scale_address = out->scale; arg->conv_arg[i].output.scale_address = out->scale;
arg->conv_arg[i].output.address = out_ptr; arg->conv_arg[i].output.address = out_ptr;
...@@ -512,6 +530,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -512,6 +530,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
float *bs_ptr) { float *bs_ptr) {
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num; arg->group_num = (uint32_t)group_num;
arg->sub_conv_num = (uint32_t)stride_h; arg->sub_conv_num = (uint32_t)stride_h;
...@@ -554,25 +573,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -554,25 +573,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
uint32_t split_num = uint32_t split_num =
group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1; group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;
arg->split_conv_args = (SplitConvArgs *)fpga_malloc( // NOLINT
sub_conv_num * sizeof(SplitConvArgs)); // NOLINT
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
arg->split_conv_args[i].filter_num = arg->split_conv_args.push_back(std::make_shared<SplitConvArgs>());
arg->split_conv_args[i]->filter_num =
(arg->sub_conv_num) * (arg->filter_num); (arg->sub_conv_num) * (arg->filter_num);
arg->split_conv_args[i].group_num = (uint32_t)group_num; arg->split_conv_args[i]->group_num = (uint32_t)group_num;
arg->split_conv_args[i].split_num = split_num; arg->split_conv_args[i]->split_num = split_num;
arg->split_conv_args[i].conv_arg = arg->split_conv_args[i]->concat_arg.height = sub_output_height;
(ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs)); // NOLINT arg->split_conv_args[i]->concat_arg.width = sub_output_width;
arg->split_conv_args[i]->concat_arg.image_num = split_num;
arg->split_conv_args[i].concat_arg.height = sub_output_height;
arg->split_conv_args[i].concat_arg.width = sub_output_width; arg->split_conv_args[i]->conv_arg =
arg->split_conv_args[i].concat_arg.image_num = split_num; static_cast<ConvArgs *>(fpga_malloc(split_num * sizeof(ConvArgs)));
arg->split_conv_args[i].concat_arg.images_in = arg->split_conv_args[i]->concat_arg.images_in =
(half **)fpga_malloc(split_num * sizeof(half *)); // NOLINT static_cast<int16_t **>(fpga_malloc(split_num * sizeof(int16_t *)));
arg->split_conv_args[i].concat_arg.scales_in = arg->split_conv_args[i]->concat_arg.scales_in =
(float **)fpga_malloc(split_num * sizeof(float *)); // NOLINT static_cast<float **>(fpga_malloc(split_num * sizeof(float *)));
arg->split_conv_args[i].concat_arg.channel_num = arg->split_conv_args[i]->concat_arg.channel_num =
(uint32_t *)fpga_malloc(split_num * sizeof(uint32_t)); // NOLINT static_cast<uint32_t *>(fpga_malloc(split_num * sizeof(uint32_t)));
arg->split_conv_args[i]->shared_conv_arg =
std::shared_ptr<ConvArgs>(arg->split_conv_args[i]->conv_arg, deleter);
arg->split_conv_args[i]->vector_concat_space.push_back(
std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->concat_arg.images_in),
deleter));
arg->split_conv_args[i]->vector_concat_space.push_back(
std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->concat_arg.scales_in),
deleter));
arg->split_conv_args[i]->vector_concat_space.push_back(
std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->concat_arg.channel_num),
deleter));
} }
auto filter_num_per_div = auto filter_num_per_div =
...@@ -597,111 +632,132 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -597,111 +632,132 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
uint32_t out_addr_offset = 0; uint32_t out_addr_offset = 0;
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
if (sub_conv_num == 1) { if (sub_conv_num == 1) {
arg->split_conv_args[i].output.address = arg->output.address; arg->split_conv_args[i]->output.address = arg->output.address;
arg->split_conv_args[i].output.scale_address = arg->output.scale_address; arg->split_conv_args[i]->output.scale_address = arg->output.scale_address;
out_addr_offset = 0; out_addr_offset = 0;
} else { } else {
auto ptr_output = (half *)out_ptr; // NOLINT
out_addr_offset = out_addr_offset =
sizeof(half) * (sub_conv_num - 1 - i) * sizeof(int16_t) * (sub_conv_num - 1 - i) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->split_conv_args[i].output.address = (void *)(ptr_output); // NOLINT arg->split_conv_args[i]->output.address = out_ptr;
arg->split_conv_args[i]->output.scale_address =
auto ptr_output_scale = static_cast<float *>(fpga_malloc(2 * sizeof(float)));
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->split_conv_args[i]->vector_conv_space.push_back(
arg->split_conv_args[i].output.scale_address = ptr_output_scale; std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->output.scale_address),
deleter));
} }
for (int j = 0; j < split_num; ++j) { for (int j = 0; j < split_num; ++j) {
arg->split_conv_args[i].conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i].conv_arg[j].group_num = (uint32_t)group_num; arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num;
arg->split_conv_args[i].conv_arg[j].kernel.width = arg->split_conv_args[i]->conv_arg[j].kernel.width =
(uint32_t)sub_filter_width; (uint32_t)sub_filter_width;
arg->split_conv_args[i].conv_arg[j].kernel.height = arg->split_conv_args[i]->conv_arg[j].kernel.height =
(uint32_t)sub_filter_width; (uint32_t)sub_filter_width;
arg->split_conv_args[i].conv_arg[j].kernel.stride_w = 1; arg->split_conv_args[i]->conv_arg[j].kernel.stride_w = 1;
arg->split_conv_args[i].conv_arg[j].kernel.stride_h = 1; arg->split_conv_args[i]->conv_arg[j].kernel.stride_h = 1;
arg->split_conv_args[i].conv_arg[j].deconv_tx_param.deconv_en = 1; arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.deconv_en = 1;
arg->split_conv_args[i].conv_arg[j].deconv_tx_param.sub_conv_num = arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.sub_conv_num =
sub_conv_num; sub_conv_num;
arg->split_conv_args[i].conv_arg[j].deconv_tx_param.omit_size = omit_size; arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.omit_size =
arg->split_conv_args[i].conv_arg[j].deconv_tx_param.out_addr_offset = omit_size;
arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.out_addr_offset =
out_addr_offset; out_addr_offset;
arg->split_conv_args[i].conv_arg[j].image.scale_address = input->scale; arg->split_conv_args[i]->conv_arg[j].image.scale_address = input->scale;
arg->split_conv_args[i].conv_arg[j].image.channels = arg->split_conv_args[i]->conv_arg[j].image.channels =
(uint32_t)sub_channels; (uint32_t)sub_channels;
arg->split_conv_args[i].conv_arg[j].image.width = arg->split_conv_args[i]->conv_arg[j].image.width =
(uint32_t)input->dims()[3]; (uint32_t)input->dims()[3];
arg->split_conv_args[i].conv_arg[j].image.height = arg->split_conv_args[i]->conv_arg[j].image.height =
(uint32_t)input->dims()[2]; (uint32_t)input->dims()[2];
arg->split_conv_args[i].conv_arg[j].image.pad_width = (uint32_t)sub_pad; arg->split_conv_args[i]->conv_arg[j].image.pad_width = (uint32_t)sub_pad;
arg->split_conv_args[i].conv_arg[j].image.pad_height = (uint32_t)sub_pad; arg->split_conv_args[i]->conv_arg[j].image.pad_height = (uint32_t)sub_pad;
arg->split_conv_args[i].conv_arg[j].image.address = input_ptr; arg->split_conv_args[i]->conv_arg[j].image.address = input_ptr;
arg->split_conv_args[i].conv_arg[j].filter_scale_address = filter->scale; arg->split_conv_args[i]->conv_arg[j].filter_scale_address = filter->scale;
arg->split_conv_args[i].conv_arg[j].filter_num = arg->split_conv_args[i]->conv_arg[j].filter_num =
(uint32_t)(j == split_num - 1 (uint32_t)(j == split_num - 1
? sub_filter_num - (split_num - 1) * filter_num_per_div ? sub_filter_num - (split_num - 1) * filter_num_per_div
: filter_num_per_div); : filter_num_per_div);
size_t filter_size = size_t filter_size =
element_num * element_num *
align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
FILTER_NUM_ALIGNMENT) * FILTER_NUM_ALIGNMENT) *
sizeof(int8_t); sizeof(int8_t);
auto filter_head = &(( auto filter_head = &((
int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT
i * filter_sub_conv_offset]; i * filter_sub_conv_offset];
arg->split_conv_args[i].conv_arg[j].filter_address = arg->split_conv_args[i]->conv_arg[j].filter_address =
fpga_malloc(filter_size); fpga_malloc(filter_size);
memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head, arg->split_conv_args[i]->vector_conv_space.push_back(
std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->conv_arg[j].filter_address),
deleter));
memcpy(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_head,
filter_size); filter_size);
fpga_flush(arg->split_conv_args[i].conv_arg[j].filter_address, fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
filter_size); filter_size);
size_t bs_align_num = align_to_x( size_t bs_align_num = align_to_x(
arg->split_conv_args[i].conv_arg[j].filter_num, BS_NUM_ALIGNMENT); arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
size_t bs_size = 2 * bs_align_num * sizeof(float); size_t bs_size = 2 * bs_align_num * sizeof(float);
auto bs_head = &bs_ptr[j * filter_num_per_div * 2]; auto bs_head = &bs_ptr[j * filter_num_per_div * 2];
arg->split_conv_args[i].conv_arg[j].sb_address = fpga_malloc(bs_size); arg->split_conv_args[i]->conv_arg[j].sb_address = fpga_malloc(bs_size);
memcpy(arg->split_conv_args[i].conv_arg[j].sb_address, bs_head, bs_size); arg->split_conv_args[i]->vector_conv_space.push_back(
fpga_flush(arg->split_conv_args[i].conv_arg[j].sb_address, bs_size); std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->conv_arg[j].sb_address),
deleter));
memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
if (split_num == 1) { if (split_num == 1) {
arg->split_conv_args[i].conv_arg[j].output.address = arg->split_conv_args[i]->conv_arg[j].output.address =
arg->split_conv_args[i].output.address; arg->split_conv_args[i]->output.address;
arg->split_conv_args[i].conv_arg[j].output.scale_address = arg->split_conv_args[i]->conv_arg[j].output.scale_address =
arg->split_conv_args[i].output.scale_address; arg->split_conv_args[i]->output.scale_address;
} else { } else {
auto ptr_output = arg->split_conv_args[i]->conv_arg[j].output.address =
(half *)fpga_malloc(conv_output_size * sizeof(half)); // NOLINT fpga_malloc(conv_output_size * sizeof(int16_t));
arg->split_conv_args[i].conv_arg[j].output.address = arg->split_conv_args[i]->conv_arg[j].output.scale_address =
(void *)((half *)ptr_output); // NOLINT static_cast<float *>(fpga_malloc(2 * sizeof(float)));
auto ptr_output_scale = arg->split_conv_args[i]->vector_conv_space.push_back(
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT std::shared_ptr<char>(
arg->split_conv_args[i].conv_arg[j].output.scale_address = reinterpret_cast<char *>(
ptr_output_scale; arg->split_conv_args[i]->conv_arg[j].output.address),
deleter));
arg->split_conv_args[i]->vector_conv_space.push_back(
std::shared_ptr<char>(
reinterpret_cast<char *>(
arg->split_conv_args[i]->conv_arg[j].output.scale_address),
deleter));
} }
arg->split_conv_args[i].concat_arg.images_in[j] = arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
(half *)arg->split_conv_args[i].conv_arg[j].output.address; // NOLINT arg->split_conv_args[i]->conv_arg[j].output.address);
arg->split_conv_args[i].concat_arg.scales_in[j] = arg->split_conv_args[i]->concat_arg.scales_in[j] =
arg->split_conv_args[i].conv_arg[j].output.scale_address; arg->split_conv_args[i]->conv_arg[j].output.scale_address;
arg->split_conv_args[i].concat_arg.channel_num[j] = arg->split_conv_args[i]->concat_arg.channel_num[j] =
arg->split_conv_args[i].conv_arg[j].filter_num; arg->split_conv_args[i]->conv_arg[j].filter_num;
expand_conv_arg(&(arg->split_conv_args[i].conv_arg[j])); expand_conv_arg(&(arg->split_conv_args[i]->conv_arg[j]));
} }
arg->split_conv_args[i].concat_arg.image_out = arg->split_conv_args[i]->concat_arg.image_out =
arg->split_conv_args[i].output.address; arg->split_conv_args[i]->output.address;
arg->split_conv_args[i].concat_arg.scale_out = arg->split_conv_args[i]->concat_arg.scale_out =
arg->split_conv_args[i].output.scale_address; arg->split_conv_args[i]->output.scale_address;
} }
filter->reset_data_ptr(nullptr); filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr); fpga_free(bs_ptr);
...@@ -717,16 +773,16 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, ...@@ -717,16 +773,16 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
arg->relu_enabled = relu_enabled; arg->relu_enabled = relu_enabled;
arg->bias_address = bias_ptr; arg->bias_address = bias_ptr;
arg->filter_address = filter_ptr; arg->filter_address = filter_ptr;
arg->kernel.height = filter->dims()[2]; arg->kernel.height = (uint32_t)filter->dims()[2];
arg->kernel.width = filter->dims()[3]; arg->kernel.width = (uint32_t)filter->dims()[3];
arg->kernel.stride_h = stride_h; arg->kernel.stride_h = (uint32_t)stride_h;
arg->kernel.stride_w = stride_w; arg->kernel.stride_w = (uint32_t)stride_w;
arg->image.address = input_ptr; arg->image.address = input_ptr;
arg->image.channels = (uint32_t)input->dims()[1]; arg->image.channels = (uint32_t)input->dims()[1];
arg->image.height = (uint32_t)input->dims()[2]; arg->image.height = (uint32_t)input->dims()[2];
arg->image.width = (uint32_t)input->dims()[3]; arg->image.width = (uint32_t)input->dims()[3];
arg->image.pad_height = padding_h; arg->image.pad_height = (uint32_t)padding_h;
arg->image.pad_width = padding_w; arg->image.pad_width = (uint32_t)padding_w;
arg->image.scale_address = input->scale; arg->image.scale_address = input->scale;
arg->output.address = output_ptr; arg->output.address = output_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
......
...@@ -632,7 +632,7 @@ void deconv_post_process(const struct DeconvArgs &args) { ...@@ -632,7 +632,7 @@ void deconv_post_process(const struct DeconvArgs &args) {
for (int idx = 0; idx < sub_conv_n; ++idx) { for (int idx = 0; idx < sub_conv_n; ++idx) {
paddle_mobile::fpga::fpga_invalidate( paddle_mobile::fpga::fpga_invalidate(
args.split_conv_args[idx].output.address, args.split_conv_args[idx]->output.address,
align_origin_w * origin_h * sizeof(int16_t)); align_origin_w * origin_h * sizeof(int16_t));
} }
...@@ -642,7 +642,7 @@ void deconv_post_process(const struct DeconvArgs &args) { ...@@ -642,7 +642,7 @@ void deconv_post_process(const struct DeconvArgs &args) {
int hx = (hh % sub_conv_n); int hx = (hh % sub_conv_n);
auto sub_t = auto sub_t =
(int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT
.output.address); ->output.address);
int hi = (hh / sub_conv_n); int hi = (hh / sub_conv_n);
if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
...@@ -681,7 +681,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { ...@@ -681,7 +681,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
gettimeofday(&start, NULL); gettimeofday(&start, NULL);
#endif #endif
ComputeFpgaConv(args.split_conv_args[i]); ComputeFpgaConv(*args.split_conv_args[i]);
#ifdef COST_TIME_PRINT #ifdef COST_TIME_PRINT
gettimeofday(&end, NULL); gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec; dif_sec = end.tv_sec - start.tv_sec;
...@@ -699,12 +699,12 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { ...@@ -699,12 +699,12 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#endif #endif
for (int i = 0; i < sub_conv_num; i++) { for (int i = 0; i < sub_conv_num; i++) {
paddle_mobile::fpga::fpga_invalidate( paddle_mobile::fpga::fpga_invalidate(
args.split_conv_args[i].output.scale_address, 2 * sizeof(float)); args.split_conv_args[i]->output.scale_address, 2 * sizeof(float));
float ptr_scale = (args.split_conv_args[i].output.scale_address)[0]; float ptr_scale = (args.split_conv_args[i]->output.scale_address)[0];
if (ptr_scale > max_scale) { if (ptr_scale > max_scale) {
args.output.scale_address[0] = ptr_scale; args.output.scale_address[0] = ptr_scale;
args.output.scale_address[1] = args.output.scale_address[1] =
(args.split_conv_args[i].output.scale_address)[1]; (args.split_conv_args[i]->output.scale_address)[1];
} }
} }
......
...@@ -75,6 +75,9 @@ void *fpga_malloc(size_t size) { ...@@ -75,6 +75,9 @@ void *fpga_malloc(size_t size) {
} }
void fpga_free(void *ptr) { void fpga_free(void *ptr) {
if (ptr == nullptr) {
return;
}
static uint64_t counter = 0; static uint64_t counter = 0;
size_t size = 0; size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
...@@ -123,5 +126,6 @@ uint64_t vaddr_to_paddr(void *address) { ...@@ -123,5 +126,6 @@ uint64_t vaddr_to_paddr(void *address) {
return 0; return 0;
#endif #endif
} }
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <memory>
#include <vector>
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -158,7 +160,7 @@ struct ConcatArgs { ...@@ -158,7 +160,7 @@ struct ConcatArgs {
void* image_out; void* image_out;
float* scale_out; float* scale_out;
uint32_t* channel_num; uint32_t* channel_num;
uint32_t* aligned_channel_num; uint32_t* aligned_channel_num; // Not used so far. Reserved for V2.
uint32_t out_channel; uint32_t out_channel;
uint32_t height; uint32_t height;
uint32_t width; uint32_t width;
...@@ -171,6 +173,9 @@ struct SplitConvArgs { ...@@ -171,6 +173,9 @@ struct SplitConvArgs {
struct ImageOutputArgs output; struct ImageOutputArgs output;
struct ConvArgs* conv_arg; struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg; struct ConcatArgs concat_arg;
std::shared_ptr<ConvArgs> shared_conv_arg;
std::vector<std::shared_ptr<char>> vector_concat_space;
std::vector<std::shared_ptr<char>> vector_conv_space;
}; };
struct SplitArgs { struct SplitArgs {
...@@ -221,7 +226,7 @@ struct DeconvArgs { ...@@ -221,7 +226,7 @@ struct DeconvArgs {
uint32_t sub_output_width; uint32_t sub_output_width;
uint32_t sub_output_height; uint32_t sub_output_height;
struct ImageOutputArgs output; struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args; std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args;
}; };
struct DWconvArgs { struct DWconvArgs {
bool relu_enabled; bool relu_enabled;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册