提交 a1cc931d 编写于 作者: J jameswu2014 提交者: qnqinan

V2-conv-hellocase pass & V1 verify-pass (#1608)

上级 64aa8f05
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
#define USE_RELU 1
#define USE_BIAS 2 #define USE_BIAS 2
void format_image(framework::Tensor *image_tensor) { void format_image(framework::Tensor *image_tensor) {
...@@ -301,7 +302,9 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -301,7 +302,9 @@ void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg; ConvArgs args = *arg;
auto fpga_bias_scale_len = auto fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num; align_to_x(args.filter_num / args.group_num, BS_NUM_ALIGNMENT) *
args.group_num;
fpga_bias_scale_len = fpga_bias_scale_len / BIAS_SCALE_DMA_NUM;
auto output_height = auto output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
...@@ -325,7 +328,7 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -325,7 +328,7 @@ void expand_conv_arg(ConvArgs *arg) {
auto output_amount_per_row = align_to_x( auto output_amount_per_row = align_to_x(
(output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num, (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num,
IMAGE_ALIGNMENT); RESULT_ALIGNMENT);
// find the opt partition strategy // find the opt partition strategy
uint64_t res_win; uint64_t res_win;
...@@ -335,10 +338,10 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -335,10 +338,10 @@ void expand_conv_arg(ConvArgs *arg) {
(args.image.channels * (args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)), (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + IMAGE_ALIGNMENT +
1) * 1) *
args.kernel.height > args.kernel.height >
2048) { 256) {
break; break;
} }
} }
...@@ -350,6 +353,7 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -350,6 +353,7 @@ void expand_conv_arg(ConvArgs *arg) {
if (((res_win % 2) != 0) && (res_win != 1)) { if (((res_win % 2) != 0) && (res_win != 1)) {
res_win = res_win - 1; res_win = res_win - 1;
} }
PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume");
res_fit = res_win; res_fit = res_win;
auto block_num = (output_width + res_fit - 1) / res_fit; auto block_num = (output_width + res_fit - 1) / res_fit;
...@@ -375,14 +379,14 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -375,14 +379,14 @@ void expand_conv_arg(ConvArgs *arg) {
align_to_x((args.image.channels * align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)), (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + IMAGE_ALIGNMENT +
1; 1;
auto image_block_len_last = auto image_block_len_last =
align_to_x( align_to_x(
(args.image.channels * (args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)), (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + IMAGE_ALIGNMENT +
1; 1;
auto image_win_cnt = block_len; auto image_win_cnt = block_len;
auto image_win_cnt_last = block_last; auto image_win_cnt_last = block_last;
...@@ -395,46 +399,85 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -395,46 +399,85 @@ void expand_conv_arg(ConvArgs *arg) {
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0; : 0;
// auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
auto cmd = 0UL | USE_BIAS; // auto cmd = 0UL | USE_BIAS;
auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) |
((args.deconv_tx_param.sub_conv_num) << 8) | ((args.deconv_tx_param.sub_conv_num) << 8) |
((args.deconv_tx_param.omit_size) << 0); ((args.deconv_tx_param.omit_size) << 0);
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
(*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
(*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) +
args.deconv_tx_param.out_addr_offset;
(*arg).driver.output_height = output_height;
(*arg).driver.output_width = output_width;
(*arg).driver.filter_per_group = filter_per_group; (*arg).driver.filter_per_group = filter_per_group;
(*arg).driver.channel_per_group = channel_per_group; (*arg).driver.channel_per_group = channel_per_group;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_one_pad_per_row = image_one_pad_per_row; (*arg).driver.image_one_pad_per_row = image_one_pad_per_row;
(*arg).driver.filter_amount_all = filter_amount_all; (*arg).driver.deconv_param = deconv_param;
(*arg).driver.output_amount_per_row = output_amount_per_row; // new
(*arg).driver.col_padding_up = args.image.pad_width * args.image.channels;
(*arg).driver.col_padding_down = image_one_pad_per_row;
(*arg).driver.row_padding_up = args.image.pad_height;
(*arg).driver.row_padding_down = args.image.pad_height + args.image.height;
(*arg).driver.image_block_amount_per_row = image_block_amount_per_row; (*arg).driver.image_block_amount_per_row = image_block_amount_per_row;
(*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel; (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
(*arg).driver.image_win_cnt = image_win_cnt;
(*arg).driver.image_win_cnt_last = image_win_cnt_last;
(*arg).driver.filter_row = args.kernel.width * args.image.channels;
(*arg).driver.filter_width = args.kernel.width;
(*arg).driver.filter_height = args.kernel.height;
(*arg).driver.skip_window = args.image.channels * args.kernel.stride_w;
(*arg).driver.stride_h = args.kernel.stride_h;
(*arg).driver.filter_amount_all = filter_amount_all;
(*arg).driver.prog_full_cnt = prog_full_cnt;
(*arg).driver.filter_align = args.filter_num / (4 * PE_COLUMN) +
(((args.filter_num % (4 * PE_COLUMN))) ? 1 : 0);
(*arg).driver.filter_num = args.filter_num;
(*arg).driver.output_width = output_width;
(*arg).driver.output_amount_per_row = output_amount_per_row;
(*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
(*arg).driver.cal_res_num = output_height / ROW_PARALLEL_NUM +
((output_height % ROW_PARALLEL_NUM) ? 1 : 0) - 1;
(*arg).driver.last_cal_res_row_num =
(output_height % (ROW_PARALLEL_NUM))
? (output_height % (ROW_PARALLEL_NUM))
: (ROW_PARALLEL_NUM);
(*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
(*arg).driver.deconv_skip_row =
ROW_PARALLEL_NUM *
args.deconv_tx_param.sub_conv_num; // paralvl*deconv_group
(*arg).driver.deconv_res_skip_row =
args.deconv_tx_param.sub_conv_num *
output_amount_per_row; // deconv_group * result_amount_per_row
(*arg).driver.deconv_ena = args.deconv_tx_param.deconv_en;
(*arg).driver.deconv_dump = args.deconv_tx_param.omit_size;
(*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) +
args.deconv_tx_param.out_addr_offset;
(*arg).driver.output_height = output_height;
(*arg).driver.result_amount_per_row_multi_para =
output_amount_per_row / RESULT_ALIGNMENT *
(args.deconv_tx_param.deconv_en ? (*arg).driver.deconv_skip_row
: ROW_PARALLEL_NUM);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
(*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
(*arg).driver.filter_amount_whole = filter_amount_all;
(*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
(*arg).driver.filters_amount_whole =
filter_amount_all * (*arg).driver.filter_align * (4 * PE_COLUMN);
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.image_hight = args.image.height;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_amount_per_row_multi_win_first = (*arg).driver.image_amount_per_row_multi_win_first =
image_amount_per_row_multi_win_first; image_amount_per_row_multi_win_first;
(*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win; (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
(*arg).driver.filter_pad_hight = args.image.pad_height;
(*arg).driver.image_block_num = image_block_num; (*arg).driver.image_block_num = image_block_num;
(*arg).driver.image_block_len = image_block_len; (*arg).driver.image_block_len = image_block_len;
(*arg).driver.image_block_len_last = image_block_len_last; (*arg).driver.image_block_len_last = image_block_len_last;
(*arg).driver.image_win_cnt = image_win_cnt;
(*arg).driver.image_win_cnt_last = image_win_cnt_last;
(*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
(*arg).driver.prog_full_cnt = prog_full_cnt;
(*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
(*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
(*arg).driver.cmd = cmd; (*arg).driver.cmd = cmd;
(*arg).driver.deconv_param = deconv_param;
} // expand_conv_arg() } // expand_conv_arg()
void expand_EW_arg(EWAddArgs *arg) { void expand_EW_arg(EWAddArgs *arg) {
EWAddArgs args = *arg; EWAddArgs args = *arg;
uint64_t cmd = 0; uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t datalen = (uint64_t)args.image0.width * uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height * (uint64_t)args.image0.height *
(uint64_t)args.image0.channels; (uint64_t)args.image0.channels;
...@@ -462,10 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) { ...@@ -462,10 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) {
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable, bool relu_enabled, int group_num, int stride_h,
int16_t leaky_relu_negative_slope, int group_num, int stride_w, int padding_h, int padding_w, float *bs_ptr) {
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<int8_t>(); auto input_ptr = input->data<int8_t>();
auto filter_ptr = filter->data<int8_t>(); auto filter_ptr = filter->data<int8_t>();
auto out_ptr = out->data<int8_t>(); auto out_ptr = out->data<int8_t>();
...@@ -473,6 +514,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -473,6 +514,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->group_num = (uint32_t)group_num; arg->group_num = (uint32_t)group_num;
// Either group_num or split_num = 1; // Either group_num or split_num = 1;
PADDLE_MOBILE_ENFORCE(group_num == 1, "group_num is not equal to 1");
arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1; arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr; arg->output.address = out_ptr;
...@@ -511,9 +553,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -511,9 +553,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter->dims()[3])); filter->dims()[3]));
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
arg->conv_arg[i].output.activation.activation_type = activation_enable; arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
arg->conv_arg[i].group_num = (uint32_t)group_num; arg->conv_arg[i].group_num = (uint32_t)group_num;
arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
...@@ -585,9 +625,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -585,9 +625,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable, bool relu_enabled, int group_num, int stride_h,
int16_t leaky_relu_negative_slope, int group_num, int stride_w, int padding_h, int padding_w,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) { float *bs_ptr) {
auto input_ptr = input->data<int8_t>(); auto input_ptr = input->data<int8_t>();
auto filter_ptr = filter->data<int8_t>(); auto filter_ptr = filter->data<int8_t>();
...@@ -713,12 +752,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -713,12 +752,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
} }
for (int j = 0; j < split_num; ++j) { for (int j = 0; j < split_num; ++j) {
arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = // arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type
activation_enable; // =
arg->split_conv_args[i] // activation_enable;
->conv_arg[j] // arg->split_conv_args[i]
.output.activation.leaky_relu_negative_slope = // ->conv_arg[j]
leaky_relu_negative_slope; // .output.activation.leaky_relu_negative_slope =
// leaky_relu_negative_slope;
arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num; arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num;
arg->split_conv_args[i]->conv_arg[j].kernel.width = arg->split_conv_args[i]->conv_arg[j].kernel.width =
...@@ -831,16 +872,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -831,16 +872,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable, bool relu_enabled, int stride_h, int stride_w,
int16_t leaky_relu_negative_slope, int stride_h, int padding_h, int padding_w, float *bias_ptr) {
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<int16_t>(); auto filter_ptr = filter->data<int16_t>();
auto input_ptr = input->data<int8_t>(); auto input_ptr = input->data<int8_t>();
auto output_ptr = out->mutable_data<int8_t>(); auto output_ptr = out->mutable_data<int8_t>();
arg->sub_conv_num = 1; arg->sub_conv_num = 1;
arg->output.activation.activation_type = activation_enable; arg->relu_enabled = relu_enabled;
arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; // arg->output.activation.activation_type = activation_enable;
arg->bias_address = bias_ptr; arg->bias_address = bias_ptr;
arg->filter_address = filter_ptr; arg->filter_address = filter_ptr;
arg->kernel.height = (uint32_t)filter->dims()[2]; arg->kernel.height = (uint32_t)filter->dims()[2];
...@@ -860,10 +899,8 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, ...@@ -860,10 +899,8 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable, bool relu_enabled, int stride_h, int stride_w,
int16_t leaky_relu_negative_slope, int stride_h, int padding_h, int padding_w, float *bias_ptr) {
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<int8_t>(); auto filter_ptr = filter->data<int8_t>();
auto input_ptr = input->data<int8_t>(); auto input_ptr = input->data<int8_t>();
...@@ -913,10 +950,11 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, ...@@ -913,10 +950,11 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>()); arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>());
arg->dw_conv_args[i]->sub_conv_num = sub_conv_num; arg->dw_conv_args[i]->sub_conv_num = sub_conv_num;
// arg->dw_conv_args[i]->relu_enabled = relu_enabled; arg->dw_conv_args[i]->relu_enabled = relu_enabled;
arg->dw_conv_args[i]->output.activation.activation_type = activation_enable; // arg->dw_conv_args[i]->output.activation.activation_type =
arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope = // activation_enable;
leaky_relu_negative_slope; // arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope =
// leaky_relu_negative_slope;
arg->dw_conv_args[i]->bias_address = bias_ptr; arg->dw_conv_args[i]->bias_address = bias_ptr;
arg->dw_conv_args[i]->filter_address = arg->dw_conv_args[i]->filter_address =
......
...@@ -48,28 +48,20 @@ void format_concat_output(framework::Tensor* out, int height, int width, ...@@ -48,28 +48,20 @@ void format_concat_output(framework::Tensor* out, int height, int width,
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable, bool relu_enabled, int group_num, int stride_h,
int16_t leaky_relu_negative_slope, int group_num, int stride_w, int padding_h, int padding_w, float* bs_ptr);
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input, void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable, bool relu_enabled, int group_num, int stride_h,
int16_t leaky_relu_negative_slope, int group_num, int stride_w, int padding_h, int padding_w, float* bs_ptr);
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable, bool relu_enabled, int stride_h, int stride_w,
int16_t leaky_relu_negative_slope, int stride_h, int padding_h, int padding_w, float* bias_ptr);
int stride_w, int padding_h, int padding_w,
float* bias_ptr);
void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input, void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable, bool relu_enabled, int stride_h, int stride_w,
int16_t leaky_relu_negative_slope, int stride_h, int padding_h, int padding_w, float* bs_ptr);
int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride); int group_num, int stride);
......
...@@ -115,6 +115,19 @@ using namespace std; // NOLINT ...@@ -115,6 +115,19 @@ using namespace std; // NOLINT
/*conv*/ /*conv*/
#define REG_CONV_CMD 0xC00 #define REG_CONV_CMD 0xC00
#define REG_CONV_REG0 0xC08
#define REG_CONV_REG1 0xC10
#define REG_CONV_REG2 0xC18
#define REG_CONV_REG3 0xC20
#define REG_CONV_REG4 0xC28
#define REG_CONV_REG5 0xC30
#define REG_CONV_REG6 0xC38
#define REG_CONV_REG7 0xC40
#define REG_CONV_REG8 0xC48
#define REG_CONV_REG9 0xC50
#define REG_CONV_REG10 0xC58
#define REG_CONV_REG11 0xC60
#define REG_CONV_IMAGE_BASE_ADDR 0xC08 #define REG_CONV_IMAGE_BASE_ADDR 0xC08
#define REG_CONV_FILTER_BASE_ADDR 0xC10 #define REG_CONV_FILTER_BASE_ADDR 0xC10
#define REG_CONV_SB_BASE_ADDR 0xC18 #define REG_CONV_SB_BASE_ADDR 0xC18
...@@ -194,7 +207,7 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) { ...@@ -194,7 +207,7 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
int ComputeBasicConv(const struct ConvArgs &args) { int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE #ifdef FPGA_PRINT_MODE
DLOG << "======Compute Basic Conv======"; DLOG << "======Compute Basic Conv======";
// DLOG << " relu_enabled:" << args.relu_enabled DLOG << " relu_enabled:" << args.relu_enabled;
DLOG << " sb_address:" << args.sb_address DLOG << " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address << " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num << " filter_num:" << args.filter_num
...@@ -218,23 +231,23 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -218,23 +231,23 @@ int ComputeBasicConv(const struct ConvArgs &args) {
int ret = 0; int ret = 0;
uint64_t output_scale = 0; uint64_t output_scale = 0;
uint64_t reg_ActivationArgs = 0; // uint64_t reg_ActivationArgs = 0;
// active function:{none,leakeyrelu,sigmoid,tanh} // active function:{none,leakeyrelu,sigmoid,tanh}
ActivationArgs active_args; // ActivationArgs active_args;
// active_args.activation_type = LEAKYRELU; // active_args.activation_type = LEAKYRELU;
active_args.activation_type = args.output.activation.activation_type; // active_args.activation_type = args.output.activation.activation_type;
active_args.leaky_relu_negative_slope = // active_args.leaky_relu_negative_slope =
args.output.activation.leaky_relu_negative_slope; // args.output.activation.leaky_relu_negative_slope;
reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
active_args.leaky_relu_negative_slope; // active_args.leaky_relu_negative_slope;
DLOG << " activation_type:" << active_args.activation_type // DLOG << " activation_type:" << active_args.activation_type
<< " leaky_relu_negative_slope:" // << " leaky_relu_negative_slope:"
<< active_args.leaky_relu_negative_slope; // << active_args.leaky_relu_negative_slope;
DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
...@@ -243,63 +256,71 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -243,63 +256,71 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
// new
reg_writeq((args.driver.row_padding_down << 45) |
(args.driver.row_padding_up << 34) |
(args.driver.col_padding_down << 17) |
args.driver.col_padding_up,
REG_CONV_REG0);
reg_writeq((args.driver.image_win_cnt_last << 50) |
(args.driver.image_win_cnt << 39) |
(args.driver.image_block_amount_per_row << 20) |
args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) |
(args.driver.filter_row << 8) |
(args.driver.filter_height << 4) | args.driver.filter_width,
REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) |
args.driver.filter_amount_all,
REG_CONV_REG3);
reg_writeq((args.driver.post_prog_full_cnt << 54) |
(args.driver.last_cal_res_row_num << 50) |
(args.driver.cal_res_num << 39) |
(args.driver.res_row_data_align4_pad << 35) |
(args.driver.output_amount_per_row << 16) |
args.driver.output_width,
REG_CONV_REG4);
reg_writeq((args.driver.deconv_dump << 40) | (args.driver.deconv_ena << 39) |
(args.driver.deconv_res_skip_row << 7) |
args.driver.deconv_skip_row,
REG_CONV_REG5);
reg_writeq((args.driver.result_amount_per_row_multi_para << 43) |
(args.driver.output_height << 32) |
args.driver.output_address_phy,
REG_CONV_REG6);
reg_writeq((args.driver.filter_amount_whole << 48) |
(args.driver.fpga_bias_scale_len << 32) |
args.driver.sb_address_phy,
REG_CONV_REG7);
reg_writeq(reg_ActivationArgs,
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_CONV_IMAGE_PIXEL);
reg_writeq( reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), (args.driver.filters_amount_whole << 32) | args.driver.filter_address_phy,
REG_CONV_FILTER_PIXEL); REG_CONV_REG8);
uint64_t output_height_fraction = reg_writeq((args.driver.image_amount_per_row << 43) |
args.driver.output_height / ROW_PARALLEL_NUM; (args.driver.image_hight << 32) |
uint64_t output_height_remainder = args.driver.image_address_phy,
args.driver.output_height % ROW_PARALLEL_NUM; REG_CONV_REG9);
reg_writeq(args.driver.output_height | (output_height_fraction << 16) |
(output_height_remainder << 26) | reg_writeq((args.driver.filter_pad_hight << 46) |
(args.driver.output_width << 32), (args.driver.image_amount_per_row_multi_win << 23) |
REG_CONV_RESULT_PIXEL); args.driver.image_amount_per_row_multi_win_first,
reg_writeq(((uint64_t)args.image.pad_height) | REG_CONV_REG10);
(((uint64_t)args.image.pad_width) << 32),
REG_CONV_PAD_PIXEL); reg_writeq((args.driver.image_block_num << 48) |
reg_writeq(((uint64_t)args.kernel.stride_h) | (args.driver.image_block_len << 24) |
(((uint64_t)args.kernel.stride_w) << 32), args.driver.image_block_len_last,
REG_CONV_STEP_PIXEL); REG_CONV_REG11);
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT
REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT
REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
reg_writeq(args.driver.image_block_num, 0xcc8);
reg_writeq(args.driver.image_block_len, 0xcd0);
reg_writeq(args.driver.image_block_len_last, 0xcd8);
reg_writeq(args.driver.image_win_cnt, 0xce0);
reg_writeq(args.driver.image_win_cnt_last, 0xce8);
reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
reg_writeq(args.driver.prog_full_cnt, 0xd08);
reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.deconv_param, 0xd18);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD); reg_writeq(args.driver.cmd, REG_CONV_CMD);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
...@@ -307,12 +328,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -307,12 +328,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
DLOG << "Conv Wait Irq Timeout!"; DLOG << "Conv Wait Irq Timeout!";
PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout"); PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout");
} }
output_scale = reg_readq(REG_SCALE_PARAMETER); DLOG << "after reg poll";
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
active_args.activation_type = NONE;
reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
...@@ -350,22 +366,22 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -350,22 +366,22 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
uint64_t reg_ActivationArgs = 0; // uint64_t reg_ActivationArgs = 0;
// active function:{none,leakeyrelu,sigmoid,tanh} // active function:{none,leakeyrelu,sigmoid,tanh}
ActivationArgs active_args; // ActivationArgs active_args;
// active_args.activation_type = LEAKYRELU; // active_args.activation_type = LEAKYRELU;
active_args.activation_type = args.output.activation.activation_type; // active_args.activation_type = args.output.activation.activation_type;
active_args.leaky_relu_negative_slope = // active_args.leaky_relu_negative_slope =
args.output.activation.leaky_relu_negative_slope; // args.output.activation.leaky_relu_negative_slope;
reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
active_args.leaky_relu_negative_slope; // active_args.leaky_relu_negative_slope;
DLOG << " activation_type:" << active_args.activation_type // DLOG << " activation_type:" << active_args.activation_type
<< " leaky_relu_negative_slope:" // << " leaky_relu_negative_slope:"
<< active_args.leaky_relu_negative_slope; // << active_args.leaky_relu_negative_slope;
DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
image_physical_address = vaddr_to_paddr_driver(args.image.address); image_physical_address = vaddr_to_paddr_driver(args.image.address);
output_physical_address = vaddr_to_paddr_driver(args.output.address); output_physical_address = vaddr_to_paddr_driver(args.output.address);
...@@ -417,10 +433,10 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -417,10 +433,10 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
return ret; return ret;
} }
reg_writeq(reg_ActivationArgs, // reg_writeq(reg_ActivationArgs,
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER); // reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq( reg_writeq(
...@@ -462,12 +478,12 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -462,12 +478,12 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG << "after reg poll"; DLOG << "after reg poll";
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER); // output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); // output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
active_args.activation_type = NONE; // active_args.activation_type = NONE;
reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
...@@ -479,7 +495,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -479,7 +495,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
int ComputeFpgaEWAdd(const struct EWAddArgs &args) { int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE #ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaEWAdd==========="; DLOG << "=============ComputeFpgaEWAdd===========";
// DLOG << " relu_enabled:" << args.relu_enabled DLOG << " relu_enabled:" << args.relu_enabled;
DLOG << " const0:" << fp16_2_fp32(int16_t(args.const0)) DLOG << " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1)); << " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address DLOG << " image0_address:" << args.image0.address
...@@ -503,17 +519,17 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -503,17 +519,17 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
int ret = 0; int ret = 0;
uint64_t output_scale = 0; uint64_t output_scale = 0;
uint64_t reg_ActivationArgs = 0; // uint64_t reg_ActivationArgs = 0;
ActivationArgs active_args; // ActivationArgs active_args;
active_args.activation_type = args.output.activation.activation_type; // active_args.activation_type = args.output.activation.activation_type;
active_args.leaky_relu_negative_slope = // active_args.leaky_relu_negative_slope =
args.output.activation.leaky_relu_negative_slope; // args.output.activation.leaky_relu_negative_slope;
reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
active_args.leaky_relu_negative_slope; // active_args.leaky_relu_negative_slope;
DLOG << " activation_type:" << active_args.activation_type // DLOG << " activation_type:" << active_args.activation_type
<< " leaky_relu_negative_slope:" // << " leaky_relu_negative_slope:"
<< active_args.leaky_relu_negative_slope; // << active_args.leaky_relu_negative_slope;
DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
...@@ -523,8 +539,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -523,8 +539,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return ret; return ret;
} }
reg_writeq(reg_ActivationArgs, // reg_writeq(reg_ActivationArgs,
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
...@@ -543,11 +559,11 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -543,11 +559,11 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!");
} }
output_scale = reg_readq(REG_SCALE_PARAMETER); // output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); // output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
active_args.activation_type = NONE; // active_args.activation_type = NONE;
reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
......
...@@ -200,10 +200,10 @@ uint64_t vaddr_to_paddr(void *address) { ...@@ -200,10 +200,10 @@ uint64_t vaddr_to_paddr(void *address) {
} }
uint32_t paddle_mobile_version() { uint32_t paddle_mobile_version() {
uint32_t v_master = 35; uint32_t v_master = 52;
uint32_t v_slave = 35; uint32_t v_slave = 52;
uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 2; uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 1;
uint32_t master = first << 24 | second << 16 | v_master << 8 | fourth_master; uint32_t master = first << 24 | second << 16 | v_master << 8 | fourth_master;
uint32_t slave = first << 24 | second << 16 | v_slave << 8 | fourth_slave; uint32_t slave = first << 24 | second << 16 | v_slave << 8 | fourth_slave;
......
...@@ -32,8 +32,12 @@ limitations under the License. */ ...@@ -32,8 +32,12 @@ limitations under the License. */
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32 #define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 #define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8) #define BS_NUM_ALIGNMENT (8)
#define BIAS_SCALE_DMA_NUM (4)
#define RESULT_ALIGNMENT (32)
#define PE_COLUMN (8)
#define ROW_PARALLEL_NUM (2)
#define BIAS_NUM_ALIGNMENT (16) #define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif #endif
namespace paddle_mobile { namespace paddle_mobile {
...@@ -89,37 +93,59 @@ struct ImageOutputArgs { ...@@ -89,37 +93,59 @@ struct ImageOutputArgs {
}; };
struct ConvDriverParam { struct ConvDriverParam {
uint64_t image_address_phy;
uint64_t filter_address_phy;
uint64_t sb_address_phy;
uint64_t output_address_phy;
uint64_t output_height;
uint64_t output_width;
uint64_t filter_per_group; uint64_t filter_per_group;
uint64_t channel_per_group; uint64_t channel_per_group;
uint64_t image_amount_per_row;
uint64_t image_one_pad_per_row; uint64_t image_one_pad_per_row;
uint64_t filter_amount_all; uint64_t deconv_param;
uint64_t output_amount_per_row;
uint64_t col_padding_up;
uint64_t col_padding_down;
uint64_t row_padding_up;
uint64_t row_padding_down;
uint64_t image_block_amount_per_row; uint64_t image_block_amount_per_row;
uint64_t filter_pad_width_mul_channel; uint64_t filter_pad_width_mul_channel;
uint64_t image_amount_per_row_multi_win_first;
uint64_t image_amount_per_row_multi_win;
uint64_t image_block_num;
uint64_t image_block_len;
uint64_t image_block_len_last;
uint64_t image_win_cnt; uint64_t image_win_cnt;
uint64_t image_win_cnt_last; uint64_t image_win_cnt_last;
uint64_t res_row_data_align4_pad; uint64_t filter_row;
uint64_t filter_width;
uint64_t filter_height;
uint64_t skip_window;
uint64_t stride_h;
uint64_t filter_amount_all;
uint64_t prog_full_cnt; uint64_t prog_full_cnt;
uint64_t filter_align;
uint64_t filter_num;
uint64_t output_width;
uint64_t output_amount_per_row;
uint64_t res_row_data_align4_pad;
uint64_t cal_res_num;
uint64_t last_cal_res_row_num;
uint64_t post_prog_full_cnt; uint64_t post_prog_full_cnt;
uint64_t deconv_skip_row; // paralvl*deconv_group
uint64_t deconv_res_skip_row; // deconv_group * result_amount_per_row
uint64_t deconv_ena;
uint64_t deconv_dump;
uint64_t output_address_phy;
uint64_t output_height;
uint64_t result_amount_per_row_multi_para;
uint64_t sb_address_phy;
uint64_t fpga_bias_scale_len; uint64_t fpga_bias_scale_len;
uint64_t cmd; uint64_t filter_amount_whole;
uint64_t filter_address_phy;
uint64_t filters_amount_whole;
uint64_t image_address_phy;
uint64_t image_hight;
uint64_t image_amount_per_row;
uint64_t image_amount_per_row_multi_win_first;
uint64_t image_amount_per_row_multi_win;
uint64_t filter_pad_hight;
uint64_t image_block_num;
uint64_t image_block_len;
uint64_t image_block_len_last;
uint64_t deconv_param; uint64_t cmd;
}; };
struct EWAddDriverParam { struct EWAddDriverParam {
...@@ -141,6 +167,7 @@ struct DeconvTxParm { ...@@ -141,6 +167,7 @@ struct DeconvTxParm {
}; };
struct ConvArgs { struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias void* sb_address; // scale and bias
void* filter_address; void* filter_address;
float* filter_scale_address; float* filter_scale_address;
...@@ -209,6 +236,7 @@ struct PoolingArgs { ...@@ -209,6 +236,7 @@ struct PoolingArgs {
}; };
struct EWAddArgs { struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1; uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1; uint32_t const1;
struct ImageInputArgs image0; struct ImageInputArgs image0;
...@@ -238,6 +266,7 @@ struct DeconvArgs { ...@@ -238,6 +266,7 @@ struct DeconvArgs {
}; };
struct DWconvArgs { struct DWconvArgs {
uint32_t sub_conv_num; uint32_t sub_conv_num;
bool relu_enabled;
void* bias_address; void* bias_address;
void* filter_address; void* filter_address;
struct KernelArgs kernel; struct KernelArgs kernel;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "framework/executor.h" #include "framework/executor.h"
#include <algorithm> #include <algorithm>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "common/enforce.h" #include "common/enforce.h"
...@@ -638,7 +639,8 @@ std::map<std::string, float> LoadQuantValFromFile(std::string filename) { ...@@ -638,7 +639,8 @@ std::map<std::string, float> LoadQuantValFromFile(std::string filename) {
std::ifstream in; std::ifstream in;
in.open(filename, std::ios::in); in.open(filename, std::ios::in);
if (!in.is_open()) { if (!in.is_open()) {
std::cout << "open File Failed." << std::endl; // std::cout << "open File Failed." << std::endl;
DLOG << "open File Failed.";
exit(-1); exit(-1);
} }
......
...@@ -22,6 +22,7 @@ namespace operators { ...@@ -22,6 +22,7 @@ namespace operators {
template <> template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE; paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0; int16_t leaky_relu_negative_slope = 0;
...@@ -34,7 +35,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { ...@@ -34,7 +35,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
...@@ -64,10 +65,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { ...@@ -64,10 +65,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
delete new_scale; delete new_scale;
......
...@@ -23,9 +23,9 @@ namespace operators { ...@@ -23,9 +23,9 @@ namespace operators {
template <> template <>
bool ConvAddBNReluKernel<FPGA, float>::Init( bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) { FusionConvAddBNReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable = bool relu_enabled = true;
paddle_mobile::fpga::LEAKYRELU; // paddle_mobile::fpga::ActivationType activation_enable =
int16_t leaky_relu_negative_slope = 0; // paddle_mobile::fpga::LEAKYRELU;
auto input = const_cast<LoDTensor *>(param->Input()); auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias(); auto bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
...@@ -34,7 +34,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -34,7 +34,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
const int groups = param->Groups(); const int groups = param->Groups();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
vector<int> paddings = param->Paddings(); vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides(); vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
...@@ -70,17 +70,17 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -70,17 +70,17 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
if (groups == channel) { if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0}; fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled,
leaky_relu_negative_slope, strides[0], strides[1], strides[0], strides[1], paddings[0], paddings[1],
paddings[0], paddings[1], new_bias_ptr); new_bias_ptr);
param->SetFpgaArgs(dwconv_arg); param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(bs_ptr); fpga::fpga_free(bs_ptr);
} else { } else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
leaky_relu_negative_slope, param->Groups(), strides[0], param->Groups(), strides[0], strides[1], paddings[0],
strides[1], paddings[0], paddings[1], bs_ptr); paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
} }
delete new_scale; delete new_scale;
......
...@@ -31,7 +31,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) { ...@@ -31,7 +31,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
...@@ -45,8 +45,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) { ...@@ -45,8 +45,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(),
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
......
...@@ -31,7 +31,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -31,7 +31,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
...@@ -45,8 +45,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -45,8 +45,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(),
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
......
...@@ -30,7 +30,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -30,7 +30,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
...@@ -56,8 +56,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -56,8 +56,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(),
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
......
...@@ -29,7 +29,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -29,7 +29,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
...@@ -58,17 +58,16 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -58,17 +58,16 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
if (groups == channel) { if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0}; fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, true,
leaky_relu_negative_slope, param->Strides()[0], param->Strides()[0], param->Strides()[1],
param->Strides()[1], param->Paddings()[0], param->Paddings()[0], param->Paddings()[1],
param->Paddings()[1], new_bias_ptr); new_bias_ptr);
param->SetFpgaArgs(dwconv_arg); param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(bs_ptr); fpga::fpga_free(bs_ptr);
} else { } else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(),
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
......
...@@ -29,7 +29,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) { ...@@ -29,7 +29,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
int channel = out->dims()[1]; int channel = out->dims()[1];
auto bs_ptr = auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
...@@ -40,8 +40,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) { ...@@ -40,8 +40,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(),
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
......
...@@ -31,7 +31,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) { ...@@ -31,7 +31,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
int channel = out->dims()[1]; int channel = out->dims()[1];
...@@ -58,8 +58,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) { ...@@ -58,8 +58,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n); sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0}; fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg); param->SetFpgaArgs(DWDeconv_arg);
...@@ -70,10 +69,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) { ...@@ -70,10 +69,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
} }
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0}; fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg); param->SetFpgaArgs(deconv_arg);
} }
return true; return true;
......
...@@ -33,7 +33,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) { ...@@ -33,7 +33,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
int channel = out->dims()[1]; int channel = out->dims()[1];
...@@ -61,8 +61,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) { ...@@ -61,8 +61,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n); sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0}; fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg); param->SetFpgaArgs(DWDeconv_arg);
...@@ -73,10 +72,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) { ...@@ -73,10 +72,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
} }
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0}; fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg); param->SetFpgaArgs(deconv_arg);
} }
return true; return true;
......
...@@ -34,7 +34,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init( ...@@ -34,7 +34,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
int channel = out->dims()[1]; int channel = out->dims()[1];
...@@ -62,8 +62,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init( ...@@ -62,8 +62,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n); sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0}; fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg); param->SetFpgaArgs(DWDeconv_arg);
...@@ -74,10 +73,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init( ...@@ -74,10 +73,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
} }
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0}; fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg); param->SetFpgaArgs(deconv_arg);
} }
return true; return true;
......
...@@ -33,7 +33,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { ...@@ -33,7 +33,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
int channel = out->dims()[1]; int channel = out->dims()[1];
...@@ -61,8 +61,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { ...@@ -61,8 +61,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n); sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0}; fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg); param->SetFpgaArgs(DWDeconv_arg);
...@@ -73,10 +72,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { ...@@ -73,10 +72,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
} }
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0}; fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg); param->SetFpgaArgs(deconv_arg);
} }
......
...@@ -34,7 +34,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init( ...@@ -34,7 +34,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
int channel = out->dims()[1]; int channel = out->dims()[1];
...@@ -57,8 +57,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init( ...@@ -57,8 +57,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n); sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0}; fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg); param->SetFpgaArgs(DWDeconv_arg);
...@@ -69,10 +68,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init( ...@@ -69,10 +68,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
} }
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0}; fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg); param->SetFpgaArgs(deconv_arg);
} }
return true; return true;
......
...@@ -35,7 +35,7 @@ bool DeconvBNReluKernel<FPGA, float>::Init( ...@@ -35,7 +35,7 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0]; float Si = input->scale[0];
float So = out->scale[0]; float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127; float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
...@@ -80,18 +80,17 @@ bool DeconvBNReluKernel<FPGA, float>::Init( ...@@ -80,18 +80,17 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n); sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0}; fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1], param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg); param->SetFpgaArgs(DWDeconv_arg);
} else { } else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0}; fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true,
leaky_relu_negative_slope, param->Groups(), param->Groups(), param->Strides()[0],
param->Strides()[0], param->Strides()[1], param->Strides()[1], param->Paddings()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg); param->SetFpgaArgs(deconv_arg);
} }
delete new_scale; delete new_scale;
......
...@@ -44,7 +44,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { ...@@ -44,7 +44,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
} }
fpga::format_image(input); fpga::format_image(input);
output->ShareDataWith(*input); output->ShareDataWith(*input);
input->external_data = nullptr;
} }
template class FeedKernel<FPGA, float>; template class FeedKernel<FPGA, float>;
......
...@@ -20,6 +20,7 @@ namespace operators { ...@@ -20,6 +20,7 @@ namespace operators {
template <> template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE; paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0; int16_t leaky_relu_negative_slope = 0;
...@@ -58,8 +59,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -58,8 +59,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
fpga::format_ofm(out); fpga::format_ofm(out);
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -20,6 +20,7 @@ namespace operators { ...@@ -20,6 +20,7 @@ namespace operators {
template <> template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU; paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0; int16_t leaky_relu_negative_slope = 0;
...@@ -58,8 +59,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -58,8 +59,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
fpga::format_ofm(out); fpga::format_ofm(out);
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册