提交 8bae119c 编写于 作者: H hjchen2

Merge branch 'ocr_ctc' of https://github.com/hjchen2/paddle-mobile into ocr_ctc

...@@ -81,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) { ...@@ -81,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) {
int div_capacity = filter::calc_division_capacity(chw); int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity); return filter::calc_split_num(num, div_capacity);
} }
// Returns the split number for a deconvolution filter.
//
// The per-filter element count is dims[1] * dims[2] / stride * dims[3] /
// stride and the filter count is dims[0] * stride; both are handed to
// filter::calc_division_capacity / filter::calc_split_num.
// NOTE(review): presumably dims are laid out as (N, C, H, W) and `stride`
// is the number of sub-convolutions -- confirm against callers.
int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
  auto shape = filter_tensor->dims();
  // Integer arithmetic, evaluated strictly left to right; the order of the
  // two divisions is kept because each one truncates.
  const auto elems_per_filter =
      shape[1] * shape[2] / stride * shape[3] / stride;
  const auto filter_count = shape[0] * stride;
  const int capacity = filter::calc_division_capacity(elems_per_filter);
  return filter::calc_split_num(filter_count, capacity);
}
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
auto dims = filter_tensor->dims(); auto dims = filter_tensor->dims();
...@@ -90,6 +97,15 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { ...@@ -90,6 +97,15 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
return filter::calc_num_per_div(num, group_num, div_capacity); return filter::calc_num_per_div(num, group_num, div_capacity);
} }
// Returns the number of filters per division for a deconvolution filter.
//
// Uses the same derived sizes as get_deconv_plit_num: a filter count of
// dims[0] * stride and a per-filter element count of
// dims[1] * dims[2] / stride * dims[3] / stride. The result comes from
// filter::calc_num_per_div with the given group_num.
// NOTE(review): presumably dims are laid out as (N, C, H, W) -- confirm.
int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
                                  int group_num, int stride) {
  auto shape = filter_tensor->dims();
  const auto filter_count = shape[0] * stride;
  // Left-to-right integer arithmetic; each division truncates, so the
  // operand order must stay exactly as written.
  const auto elems_per_filter =
      shape[1] * shape[2] / stride * shape[3] / stride;
  const int capacity = filter::calc_division_capacity(elems_per_filter);
  return filter::calc_num_per_div(filter_count, group_num, capacity);
}
int get_aligned_filter_element_num(int chw) { int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
} }
...@@ -448,14 +464,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -448,14 +464,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->sub_output_height = (uint32_t)sub_output_height; arg->sub_output_height = (uint32_t)sub_output_height;
arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
stride_w, (int)filter->dims()[3], padding_w); stride_w, (int)filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
auto sub_channels = (int)input->dims()[1]; arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
int sub_channels = (int)input->dims()[1];
int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
int sub_filter_num = sub_conv_num * (arg->filter_num); int sub_filter_num = sub_conv_num * (arg->filter_num);
int conv_output_size = int conv_output_size =
(align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
sub_output_height; sub_output_height;
int ouput_size = conv_output_size * sub_conv_num;
int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
int align_sub_filter_count = int align_sub_filter_count =
...@@ -464,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -464,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int align_conv_sub_filter_count = int align_conv_sub_filter_count =
align_sub_filter_count * align_sub_filter_num; align_sub_filter_count * align_sub_filter_num;
int split_num =
group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;
arg->split_conv_args =
(SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs));
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = arg->sub_conv_num * arg->filter_num; arg->split_conv_args[i].filter_num =
arg->conv_args[i].group_num = (uint32_t)group_num; (arg->sub_conv_num) * (arg->filter_num);
arg->split_conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].filter_scale_address = filter->scale; arg->split_conv_args[i].split_num = split_num;
arg->conv_args[i].relu_enabled = relu_enabled; arg->split_conv_args[i].conv_arg =
(ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs));
arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width; arg->split_conv_args[i].concat_arg.height = sub_output_height;
arg->conv_args[i].kernel.stride_w = 1; arg->split_conv_args[i].concat_arg.width = sub_output_width;
arg->conv_args[i].kernel.stride_h = 1; arg->split_conv_args[i].concat_arg.image_num = split_num;
arg->split_conv_args[i].concat_arg.images_in =
arg->conv_args[i].image.scale_address = input->scale; (half **)fpga_malloc(split_num * sizeof(half *));
arg->conv_args[i].image.channels = (uint32_t)sub_channels; arg->split_conv_args[i].concat_arg.scales_in =
arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; (float **)fpga_malloc(split_num * sizeof(float *));
arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; arg->split_conv_args[i].concat_arg.channel_num =
arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t));
arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; // arg->split_conv_args[i].concat_arg.image_out =
arg->conv_args[i].image.address = input_ptr; // fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].sb_address = bs_ptr; // arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 *
// sizeof(float));
auto filter_sub_space = }
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count,
(size_t)align_conv_sub_filter_count);
arg->conv_args[i].filter_address = filter_sub_space;
fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
int filter_num_per_div =
get_deconv_filter_num_per_div(filter, group_num, stride_w);
int element_num = get_aligned_filter_element_num(
(int)(sub_channels * sub_filter_width * sub_filter_width));
int chw = sub_channels * sub_filter_width * sub_filter_width;
int division_capacity = filter::calc_division_capacity(chw);
int num_per_div_before_alignment =
filter::calc_num_per_div(sub_filter_num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = (sub_filter_num + num_per_div_before_alignment - 1) /
num_per_div_before_alignment;
int residual = sub_filter_num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
int filter_sub_conv_offset = element_num * num_after_alignment;
for (int i = 0; i < sub_conv_num; ++i) {
if (sub_conv_num == 1) { if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr; arg->split_conv_args[i].output.address = arg->output.address;
arg->conv_args[i].output.scale_address = out->scale; arg->split_conv_args[i].output.scale_address = arg->output.scale_address;
} else { } else {
auto ptr_output = fpga_malloc(conv_output_size * sizeof(half)); auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = ptr_output; arg->split_conv_args[i].output.address = (void *)((half *)ptr_output);
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale; arg->split_conv_args[i].output.scale_address = ptr_output_scale;
} }
expand_conv_arg(&arg->conv_args[i]);
}
arg->output.address = out_ptr; for (int j = 0; j < split_num; ++j) {
arg->output.scale_address = out->scale; arg->split_conv_args[i].conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i].conv_arg[j].group_num = (uint32_t)group_num;
arg->split_conv_args[i].conv_arg[j].kernel.width =
(uint32_t)sub_filter_width;
arg->split_conv_args[i].conv_arg[j].kernel.height =
(uint32_t)sub_filter_width;
arg->split_conv_args[i].conv_arg[j].kernel.stride_w = 1;
arg->split_conv_args[i].conv_arg[j].kernel.stride_h = 1;
arg->split_conv_args[i].conv_arg[j].image.scale_address = input->scale;
arg->split_conv_args[i].conv_arg[j].image.channels =
(uint32_t)sub_channels;
arg->split_conv_args[i].conv_arg[j].image.width =
(uint32_t)input->dims()[3];
arg->split_conv_args[i].conv_arg[j].image.height =
(uint32_t)input->dims()[2];
arg->split_conv_args[i].conv_arg[j].image.pad_width = (uint32_t)sub_pad;
arg->split_conv_args[i].conv_arg[j].image.pad_height = (uint32_t)sub_pad;
arg->split_conv_args[i].conv_arg[j].image.address = input_ptr;
arg->split_conv_args[i].conv_arg[j].filter_scale_address = filter->scale;
arg->split_conv_args[i].conv_arg[j].filter_num = (uint32_t)(
j == split_num - 1
? sub_filter_num - (split_num - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
size_t filter_size =
element_num *
align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num,
FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[j * element_num * filter_num_per_div +
i * filter_sub_conv_offset];
arg->split_conv_args[i].conv_arg[j].filter_address =
fpga_malloc(filter_size);
memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head,
filter_size);
fpga_flush(arg->split_conv_args[i].conv_arg[j].filter_address,
filter_size);
{
static int test_cnt = 0;
signed char result = 0;
if (test_cnt <= 1) {
std::string filename = "deconv_split_flt" + std::to_string(test_cnt);
fpga::savefile<signed char>(
filename, arg->split_conv_args[i].conv_arg[j].filter_address,
filter_size, result);
test_cnt++;
}
}
size_t bs_align_num = align_to_x(
arg->split_conv_args[i].conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
size_t bs_size = 2 * bs_align_num * sizeof(float);
auto bs_head = &bs_ptr[j * filter_num_per_div * 2];
arg->split_conv_args[i].conv_arg[j].sb_address = fpga_malloc(bs_size);
memcpy(arg->split_conv_args[i].conv_arg[j].sb_address, bs_head, bs_size);
fpga_flush(arg->split_conv_args[i].conv_arg[j].sb_address, bs_size);
if (split_num == 1) {
arg->split_conv_args[i].conv_arg[j].output.address =
arg->split_conv_args[i].output.address;
arg->split_conv_args[i].conv_arg[j].output.scale_address =
arg->split_conv_args[i].output.scale_address;
} else {
auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->split_conv_args[i].conv_arg[j].output.address =
(void *)((half *)ptr_output);
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->split_conv_args[i].conv_arg[j].output.scale_address =
ptr_output_scale;
}
arg->split_conv_args[i].concat_arg.images_in[j] =
(half *)arg->split_conv_args[i].conv_arg[j].output.address; // NOLINT
arg->split_conv_args[i].concat_arg.scales_in[j] =
arg->split_conv_args[i].conv_arg[j].output.scale_address;
arg->split_conv_args[i].concat_arg.channel_num[j] =
arg->split_conv_args[i].conv_arg[j].filter_num;
expand_conv_arg(&(arg->split_conv_args[i].conv_arg[j]));
}
arg->split_conv_args[i].concat_arg.image_out =
arg->split_conv_args[i].output.address;
arg->split_conv_args[i].concat_arg.scale_out =
arg->split_conv_args[i].output.scale_address;
}
filter->reset_data_ptr(nullptr); filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
} // fill_deconv_arg } // fill_deconv_arg
} // namespace fpga } // namespace fpga
......
...@@ -27,7 +27,12 @@ void format_fp32_ofm(framework::Tensor* ofm_tensor); ...@@ -27,7 +27,12 @@ void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor); float filter_find_max(framework::Tensor* filter_tensor);
int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor,
int group_num, int stride);
int get_plit_num(framework::Tensor* filter_tensor); int get_plit_num(framework::Tensor* filter_tensor);
int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride);
int get_aligned_filter_element_num(int chw); int get_aligned_filter_element_num(int chw);
void format_filter(framework::Tensor* filter_tensor, float max_value, void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num); int group_num);
......
...@@ -13,15 +13,25 @@ See the License for the specific language governing permissions and ...@@ -13,15 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fpga/common/pe.h" #include "fpga/common/pe.h"
#include "common/types.h"
#include "fpga/V1/filter.h" #include "fpga/V1/filter.h"
#include "fpga/V1/image.h" #include "fpga/V1/image.h"
#include "fpga/common/config.h" #include "fpga/common/config.h"
#include "fpga/common/driver.h" #include "fpga/common/driver.h"
#ifdef COST_TIME_PRINT
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <iostream>
//#include <iostream>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
using namespace driver; // NOLINT using namespace driver; // NOLINT
using namespace std;
#define USE_RELU 1 #define USE_RELU 1
#define USE_BIAS 2 #define USE_BIAS 2
...@@ -162,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) { ...@@ -162,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
<< " group_num:" << args.group_num << " group_num:" << args.group_num
<< " split_num:" << args.split_num; << " split_num:" << args.split_num;
#endif #endif
int ret = 0;
int split_num = args.split_num; int split_num = args.split_num;
for (int i = 0; i < split_num; i++) { for (int i = 0; i < split_num; i++) {
ComputeBasicConv(args.conv_arg[i]); ret |= ComputeBasicConv(args.conv_arg[i]);
} }
if (split_num > 1) { if (split_num > 1) {
ComputeFPGAConcat(args.concat_arg); ComputeFPGAConcat(args.concat_arg);
} }
return ret;
} }
int ComputeBasicConv(const struct ConvArgs &args) { int ComputeBasicConv(const struct ConvArgs &args) {
...@@ -250,12 +262,13 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -250,12 +262,13 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq(args.driver.post_prog_full_cnt, 0xd10); reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD); reg_writeq(args.driver.cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "Conv Wait Irq Timeout!"; DLOG << "Conv Wait Irq Timeout!";
} }
DLOG << "after reg poll";
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
...@@ -289,6 +302,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -289,6 +302,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "Polling";
// return 0;
uint64_t output_scale = 0; uint64_t output_scale = 0;
uint64_t timer_cnt = 0; uint64_t timer_cnt = 0;
int ret = 0; int ret = 0;
...@@ -561,11 +576,13 @@ int PerformBypass(const struct BypassArgs &args) { ...@@ -561,11 +576,13 @@ int PerformBypass(const struct BypassArgs &args) {
reg_writeq(datalen, REG_CONVERT_LENGTH); reg_writeq(datalen, REG_CONVERT_LENGTH);
reg_writeq(cmd, REG_CONVERT_CMD); reg_writeq(cmd, REG_CONVERT_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "BYPASS Wait Irq Timeout!"; DLOG << "BYPASS Wait Irq Timeout!";
} }
DLOG << "after reg poll";
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
...@@ -619,37 +636,29 @@ void deconv_post_process(const struct DeconvArgs &args) { ...@@ -619,37 +636,29 @@ void deconv_post_process(const struct DeconvArgs &args) {
int align_deconv_row_len = align_to_x(deconv_row_len, 16); int align_deconv_row_len = align_to_x(deconv_row_len, 16);
for (int idx = 0; idx < sub_conv_n; ++idx) { for (int idx = 0; idx < sub_conv_n; ++idx) {
fpga_invalidate(args.conv_args[idx].output.address, paddle_mobile::fpga::fpga_invalidate(
align_origin_w * origin_h * sizeof(int16_t)); args.split_conv_args[idx].output.address,
align_origin_w * origin_h * sizeof(int16_t));
} }
auto ptr_deconv = (int16_t *)fpga_malloc(num * align_deconv_row_len *
deconv_h * sizeof(int16_t));
memset(ptr_deconv, 0,
num * align_deconv_row_len * deconv_h * sizeof(int16_t));
int deconv_idx = 0; int deconv_idx = 0;
for (int nn = 0; nn < num; ++nn) { for (int nn = 0; nn < num; ++nn) {
for (int hh = 0; hh < origin_h; ++hh) { for (int hh = 0; hh < origin_h; ++hh) {
int hx = (hh % sub_conv_n); int hx = (hh % sub_conv_n);
auto sub_t = auto sub_t =
(int16_t *)(args.conv_args[sub_conv_n - hx - 1].output.address); (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address);
int hi = (hh / sub_conv_n); int hi = (hh / sub_conv_n);
if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
omit_size * channel); omit_size * channel);
fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx,
fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx,
sizeof(int16_t) * deconv_row_len); sizeof(int16_t) * deconv_row_len);
deconv_idx += align_deconv_row_len; deconv_idx += align_deconv_row_len;
} }
} }
fpga_copy(args.output.address, ptr_deconv,
num * align_deconv_row_len * deconv_h * sizeof(int16_t));
fpga_flush(args.output.address, fpga_flush(args.output.address,
num * align_deconv_row_len * deconv_h * sizeof(int16_t)); num * align_deconv_row_len * deconv_h * sizeof(int16_t));
fpga_free(ptr_deconv); }
} // deconv_post_process
int ComputeFpgaDeconv(const struct DeconvArgs &args) { int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_PRINT_MODE #ifdef FPGA_PRINT_MODE
...@@ -661,32 +670,70 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { ...@@ -661,32 +670,70 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
<< " sub_conv_num:" << args.sub_conv_num; << " sub_conv_num:" << args.sub_conv_num;
DLOG << "args.output.address: " << args.output.address DLOG << "args.output.address: " << args.output.address
<< "args.output.scale_address: " << args.output.scale_address; << "args.output.scale_address: " << args.output.scale_address;
DLOG << "args.conv_args.sb_address: " << (args.conv_args)->sb_address
<< "args.conv_args.filter_address: " << (args.conv_args)->filter_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif #endif
int sub_conv_num = args.sub_conv_num; int sub_conv_num = args.sub_conv_num;
#ifdef COST_TIME_PRINT
timeval start, end;
long dif_sec, dif_usec;
#endif
for (int i = 0; i < sub_conv_num; i++) { for (int i = 0; i < sub_conv_num; i++) {
ComputeBasicConv(args.conv_args[i]); #ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
ComputeFpgaConv(args.split_conv_args[i]);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv basic_conv: " << i << " times: "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
} }
if (sub_conv_num > 1) { if (sub_conv_num > 1) {
float max_scale = -1.0f; float max_scale = -1.0f;
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
for (int i = 0; i < sub_conv_num; i++) { for (int i = 0; i < sub_conv_num; i++) {
paddle_mobile::fpga::fpga_invalidate( paddle_mobile::fpga::fpga_invalidate(
args.conv_args[i].output.scale_address, 2 * sizeof(float)); args.split_conv_args[i].output.scale_address, 2 * sizeof(float));
float ptr_scale = (args.conv_args[i].output.scale_address)[0]; float ptr_scale = (args.split_conv_args[i].output.scale_address)[0];
if (ptr_scale > max_scale) { if (ptr_scale > max_scale) {
args.output.scale_address[0] = ptr_scale; args.output.scale_address[0] = ptr_scale;
args.output.scale_address[1] = args.output.scale_address[1] =
(args.conv_args[i].output.scale_address)[1]; (args.split_conv_args[i].output.scale_address)[1];
} }
} }
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv scale "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
deconv_post_process(args); deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
} }
return 0; return 0;
......
...@@ -59,6 +59,9 @@ int close_device() { ...@@ -59,6 +59,9 @@ int close_device() {
void *fpga_malloc(size_t size) { void *fpga_malloc(size_t size) {
static uint64_t counter = 0; static uint64_t counter = 0;
if (size <= 0) {
size = 1;
}
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
auto ptr = driver::fpga_malloc_driver(size); auto ptr = driver::fpga_malloc_driver(size);
#else #else
......
...@@ -210,7 +210,7 @@ struct DeconvArgs { ...@@ -210,7 +210,7 @@ struct DeconvArgs {
uint32_t sub_output_width; uint32_t sub_output_width;
uint32_t sub_output_height; uint32_t sub_output_height;
struct ImageOutputArgs output; struct ImageOutputArgs output;
struct ConvArgs* conv_args; struct SplitConvArgs* split_conv_args;
}; };
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
......
...@@ -54,11 +54,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { ...@@ -54,11 +54,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
fpga::format_deconv_filter(filter, max_value, param->Groups(), fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]); param->Strides()[0]);
// int element_num_per_div = int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups()); fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
// deconv only support group=1 && no spilt //
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n, fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n); channel * sub_conv_n);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
......
...@@ -55,11 +55,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init( ...@@ -55,11 +55,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
fpga::format_deconv_filter(filter, max_value, param->Groups(), fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]); param->Strides()[0]);
// int element_num_per_div = int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups()); fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
// deconv only support group=1 && no spilt fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
channel * sub_conv_n); channel * sub_conv_n);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册