提交 a1cc931d 编写于 作者: J jameswu2014 提交者: qnqinan

V2-conv-hellocase pass & V1 verify-pass (#1608)

上级 64aa8f05
......@@ -22,6 +22,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
#define USE_RELU 1
#define USE_BIAS 2
void format_image(framework::Tensor *image_tensor) {
......@@ -301,7 +302,9 @@ void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
auto fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
align_to_x(args.filter_num / args.group_num, BS_NUM_ALIGNMENT) *
args.group_num;
fpga_bias_scale_len = fpga_bias_scale_len / BIAS_SCALE_DMA_NUM;
auto output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
......@@ -325,7 +328,7 @@ void expand_conv_arg(ConvArgs *arg) {
auto output_amount_per_row = align_to_x(
(output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num,
IMAGE_ALIGNMENT);
RESULT_ALIGNMENT);
// find the opt partition strategy
uint64_t res_win;
......@@ -335,10 +338,10 @@ void expand_conv_arg(ConvArgs *arg) {
(args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
IMAGE_ALIGNMENT +
1) *
args.kernel.height >
2048) {
256) {
break;
}
}
......@@ -350,6 +353,7 @@ void expand_conv_arg(ConvArgs *arg) {
if (((res_win % 2) != 0) && (res_win != 1)) {
res_win = res_win - 1;
}
PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume");
res_fit = res_win;
auto block_num = (output_width + res_fit - 1) / res_fit;
......@@ -375,14 +379,14 @@ void expand_conv_arg(ConvArgs *arg) {
align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
IMAGE_ALIGNMENT +
1;
auto image_block_len_last =
align_to_x(
(args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
IMAGE_ALIGNMENT +
1;
auto image_win_cnt = block_len;
auto image_win_cnt_last = block_last;
......@@ -395,46 +399,85 @@ void expand_conv_arg(ConvArgs *arg) {
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0;
// auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
auto cmd = 0UL | USE_BIAS;
auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
// auto cmd = 0UL | USE_BIAS;
auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) |
((args.deconv_tx_param.sub_conv_num) << 8) |
((args.deconv_tx_param.omit_size) << 0);
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
(*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
(*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) +
args.deconv_tx_param.out_addr_offset;
(*arg).driver.output_height = output_height;
(*arg).driver.output_width = output_width;
(*arg).driver.filter_per_group = filter_per_group;
(*arg).driver.channel_per_group = channel_per_group;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_one_pad_per_row = image_one_pad_per_row;
(*arg).driver.filter_amount_all = filter_amount_all;
(*arg).driver.output_amount_per_row = output_amount_per_row;
(*arg).driver.deconv_param = deconv_param;
// new
(*arg).driver.col_padding_up = args.image.pad_width * args.image.channels;
(*arg).driver.col_padding_down = image_one_pad_per_row;
(*arg).driver.row_padding_up = args.image.pad_height;
(*arg).driver.row_padding_down = args.image.pad_height + args.image.height;
(*arg).driver.image_block_amount_per_row = image_block_amount_per_row;
(*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
(*arg).driver.image_win_cnt = image_win_cnt;
(*arg).driver.image_win_cnt_last = image_win_cnt_last;
(*arg).driver.filter_row = args.kernel.width * args.image.channels;
(*arg).driver.filter_width = args.kernel.width;
(*arg).driver.filter_height = args.kernel.height;
(*arg).driver.skip_window = args.image.channels * args.kernel.stride_w;
(*arg).driver.stride_h = args.kernel.stride_h;
(*arg).driver.filter_amount_all = filter_amount_all;
(*arg).driver.prog_full_cnt = prog_full_cnt;
(*arg).driver.filter_align = args.filter_num / (4 * PE_COLUMN) +
(((args.filter_num % (4 * PE_COLUMN))) ? 1 : 0);
(*arg).driver.filter_num = args.filter_num;
(*arg).driver.output_width = output_width;
(*arg).driver.output_amount_per_row = output_amount_per_row;
(*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
(*arg).driver.cal_res_num = output_height / ROW_PARALLEL_NUM +
((output_height % ROW_PARALLEL_NUM) ? 1 : 0) - 1;
(*arg).driver.last_cal_res_row_num =
(output_height % (ROW_PARALLEL_NUM))
? (output_height % (ROW_PARALLEL_NUM))
: (ROW_PARALLEL_NUM);
(*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
(*arg).driver.deconv_skip_row =
ROW_PARALLEL_NUM *
args.deconv_tx_param.sub_conv_num; // paralvl*deconv_group
(*arg).driver.deconv_res_skip_row =
args.deconv_tx_param.sub_conv_num *
output_amount_per_row; // deconv_group * result_amount_per_row
(*arg).driver.deconv_ena = args.deconv_tx_param.deconv_en;
(*arg).driver.deconv_dump = args.deconv_tx_param.omit_size;
(*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) +
args.deconv_tx_param.out_addr_offset;
(*arg).driver.output_height = output_height;
(*arg).driver.result_amount_per_row_multi_para =
output_amount_per_row / RESULT_ALIGNMENT *
(args.deconv_tx_param.deconv_en ? (*arg).driver.deconv_skip_row
: ROW_PARALLEL_NUM);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
(*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
(*arg).driver.filter_amount_whole = filter_amount_all;
(*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
(*arg).driver.filters_amount_whole =
filter_amount_all * (*arg).driver.filter_align * (4 * PE_COLUMN);
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.image_hight = args.image.height;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_amount_per_row_multi_win_first =
image_amount_per_row_multi_win_first;
(*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
(*arg).driver.filter_pad_hight = args.image.pad_height;
(*arg).driver.image_block_num = image_block_num;
(*arg).driver.image_block_len = image_block_len;
(*arg).driver.image_block_len_last = image_block_len_last;
(*arg).driver.image_win_cnt = image_win_cnt;
(*arg).driver.image_win_cnt_last = image_win_cnt_last;
(*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
(*arg).driver.prog_full_cnt = prog_full_cnt;
(*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
(*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
(*arg).driver.cmd = cmd;
(*arg).driver.deconv_param = deconv_param;
} // expand_conv_arg()
void expand_EW_arg(EWAddArgs *arg) {
EWAddArgs args = *arg;
uint64_t cmd = 0;
uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
(uint64_t)args.image0.channels;
......@@ -462,10 +505,8 @@ void expand_EW_arg(EWAddArgs *arg) {
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<int8_t>();
auto filter_ptr = filter->data<int8_t>();
auto out_ptr = out->data<int8_t>();
......@@ -473,6 +514,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->group_num = (uint32_t)group_num;
// Either group_num or split_num = 1;
PADDLE_MOBILE_ENFORCE(group_num == 1, "group_num is not equal to 1");
arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
......@@ -511,9 +553,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter->dims()[3]));
for (int i = 0; i < n; i++) {
arg->conv_arg[i].output.activation.activation_type = activation_enable;
arg->conv_arg[i].output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].group_num = (uint32_t)group_num;
arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
......@@ -585,9 +625,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<int8_t>();
auto filter_ptr = filter->data<int8_t>();
......@@ -713,12 +752,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
}
for (int j = 0; j < split_num; ++j) {
arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
activation_enable;
arg->split_conv_args[i]
->conv_arg[j]
.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
// arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type
// =
// activation_enable;
// arg->split_conv_args[i]
// ->conv_arg[j]
// .output.activation.leaky_relu_negative_slope =
// leaky_relu_negative_slope;
arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num;
arg->split_conv_args[i]->conv_arg[j].kernel.width =
......@@ -831,16 +872,14 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float *bias_ptr) {
auto filter_ptr = filter->data<int16_t>();
auto input_ptr = input->data<int8_t>();
auto output_ptr = out->mutable_data<int8_t>();
arg->sub_conv_num = 1;
arg->output.activation.activation_type = activation_enable;
arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
arg->relu_enabled = relu_enabled;
// arg->output.activation.activation_type = activation_enable;
arg->bias_address = bias_ptr;
arg->filter_address = filter_ptr;
arg->kernel.height = (uint32_t)filter->dims()[2];
......@@ -860,10 +899,8 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float *bias_ptr) {
auto filter_ptr = filter->data<int8_t>();
auto input_ptr = input->data<int8_t>();
......@@ -913,10 +950,11 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>());
arg->dw_conv_args[i]->sub_conv_num = sub_conv_num;
// arg->dw_conv_args[i]->relu_enabled = relu_enabled;
arg->dw_conv_args[i]->output.activation.activation_type = activation_enable;
arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
arg->dw_conv_args[i]->relu_enabled = relu_enabled;
// arg->dw_conv_args[i]->output.activation.activation_type =
// activation_enable;
// arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope =
// leaky_relu_negative_slope;
arg->dw_conv_args[i]->bias_address = bias_ptr;
arg->dw_conv_args[i]->filter_address =
......
......@@ -48,28 +48,20 @@ void format_concat_output(framework::Tensor* out, int height, int width,
// NOTE(review): every prototype in this group shows two parameter tails: one
// with `ActivationType activation_enable, int16_t leaky_relu_negative_slope`
// and one with `bool relu_enabled`. Presumably this is a diff view that lists
// the removed and the added lines together; confirm against the real header.

// Fills *arg for a (possibly split) convolution from the input, output and
// filter tensors, the group count, strides, paddings and the bias/scale
// pointer bs_ptr.
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
// Fills *arg for a deconvolution; the definition populates the conv_arg
// entries held in arg->split_conv_args.
void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
// Fills *arg for a depthwise convolution; the definition sets sub_conv_num to
// 1 and stores bias_ptr and the filter data pointer in *arg.
void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float* bias_ptr);
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float* bias_ptr);
// Fills *arg for a depthwise deconvolution; the definition appends DWconvArgs
// objects to arg->dw_conv_args.
void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float* bs_ptr);
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
......
......@@ -115,6 +115,19 @@ using namespace std; // NOLINT
/*conv*/
// Register offsets used by ComputeBasicConv() as the second argument of
// reg_writeq(). REG_CONV_CMD receives args.driver.cmd after the parameter
// registers have been written.
#define REG_CONV_CMD 0xC00
// REG_CONV_REG0..REG_CONV_REG11 are consecutive offsets spaced 8 apart
// (0xC08 .. 0xC60); each one receives a 64-bit word assembled from several
// ConvDriverParam members.
#define REG_CONV_REG0 0xC08
#define REG_CONV_REG1 0xC10
#define REG_CONV_REG2 0xC18
#define REG_CONV_REG3 0xC20
#define REG_CONV_REG4 0xC28
#define REG_CONV_REG5 0xC30
#define REG_CONV_REG6 0xC38
#define REG_CONV_REG7 0xC40
#define REG_CONV_REG8 0xC48
#define REG_CONV_REG9 0xC50
#define REG_CONV_REG10 0xC58
#define REG_CONV_REG11 0xC60
// NOTE(review): the three offsets below are numerically identical to
// REG_CONV_REG0 / REG_CONV_REG1 / REG_CONV_REG2 (0xC08 / 0xC10 / 0xC18).
// Confirm that the per-field layout and the packed REGn layout are never
// programmed in the same run, otherwise the writes overwrite each other.
#define REG_CONV_IMAGE_BASE_ADDR 0xC08
#define REG_CONV_FILTER_BASE_ADDR 0xC10
#define REG_CONV_SB_BASE_ADDR 0xC18
......@@ -194,7 +207,7 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "======Compute Basic Conv======";
// DLOG << " relu_enabled:" << args.relu_enabled
DLOG << " relu_enabled:" << args.relu_enabled;
DLOG << " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
......@@ -218,23 +231,23 @@ int ComputeBasicConv(const struct ConvArgs &args) {
int ret = 0;
uint64_t output_scale = 0;
uint64_t reg_ActivationArgs = 0;
// uint64_t reg_ActivationArgs = 0;
// active function: {none, leakyrelu, sigmoid, tanh}
ActivationArgs active_args;
// ActivationArgs active_args;
// active_args.activation_type = LEAKYRELU;
active_args.activation_type = args.output.activation.activation_type;
// active_args.activation_type = args.output.activation.activation_type;
active_args.leaky_relu_negative_slope =
args.output.activation.leaky_relu_negative_slope;
// active_args.leaky_relu_negative_slope =
// args.output.activation.leaky_relu_negative_slope;
reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
active_args.leaky_relu_negative_slope;
// reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
// active_args.leaky_relu_negative_slope;
DLOG << " activation_type:" << active_args.activation_type
<< " leaky_relu_negative_slope:"
<< active_args.leaky_relu_negative_slope;
DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
// DLOG << " activation_type:" << active_args.activation_type
// << " leaky_relu_negative_slope:"
// << active_args.leaky_relu_negative_slope;
// DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
......@@ -243,63 +256,71 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
// new
reg_writeq((args.driver.row_padding_down << 45) |
(args.driver.row_padding_up << 34) |
(args.driver.col_padding_down << 17) |
args.driver.col_padding_up,
REG_CONV_REG0);
reg_writeq((args.driver.image_win_cnt_last << 50) |
(args.driver.image_win_cnt << 39) |
(args.driver.image_block_amount_per_row << 20) |
args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) |
(args.driver.filter_row << 8) |
(args.driver.filter_height << 4) | args.driver.filter_width,
REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) |
args.driver.filter_amount_all,
REG_CONV_REG3);
reg_writeq((args.driver.post_prog_full_cnt << 54) |
(args.driver.last_cal_res_row_num << 50) |
(args.driver.cal_res_num << 39) |
(args.driver.res_row_data_align4_pad << 35) |
(args.driver.output_amount_per_row << 16) |
args.driver.output_width,
REG_CONV_REG4);
reg_writeq((args.driver.deconv_dump << 40) | (args.driver.deconv_ena << 39) |
(args.driver.deconv_res_skip_row << 7) |
args.driver.deconv_skip_row,
REG_CONV_REG5);
reg_writeq((args.driver.result_amount_per_row_multi_para << 43) |
(args.driver.output_height << 32) |
args.driver.output_address_phy,
REG_CONV_REG6);
reg_writeq((args.driver.filter_amount_whole << 48) |
(args.driver.fpga_bias_scale_len << 32) |
args.driver.sb_address_phy,
REG_CONV_REG7);
reg_writeq(reg_ActivationArgs,
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_CONV_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_CONV_FILTER_PIXEL);
uint64_t output_height_fraction =
args.driver.output_height / ROW_PARALLEL_NUM;
uint64_t output_height_remainder =
args.driver.output_height % ROW_PARALLEL_NUM;
reg_writeq(args.driver.output_height | (output_height_fraction << 16) |
(output_height_remainder << 26) |
(args.driver.output_width << 32),
REG_CONV_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_CONV_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_CONV_STEP_PIXEL);
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT
REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT
REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
reg_writeq(args.driver.image_block_num, 0xcc8);
reg_writeq(args.driver.image_block_len, 0xcd0);
reg_writeq(args.driver.image_block_len_last, 0xcd8);
reg_writeq(args.driver.image_win_cnt, 0xce0);
reg_writeq(args.driver.image_win_cnt_last, 0xce8);
reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
reg_writeq(args.driver.prog_full_cnt, 0xd08);
reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.deconv_param, 0xd18);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
(args.driver.filters_amount_whole << 32) | args.driver.filter_address_phy,
REG_CONV_REG8);
reg_writeq((args.driver.image_amount_per_row << 43) |
(args.driver.image_hight << 32) |
args.driver.image_address_phy,
REG_CONV_REG9);
reg_writeq((args.driver.filter_pad_hight << 46) |
(args.driver.image_amount_per_row_multi_win << 23) |
args.driver.image_amount_per_row_multi_win_first,
REG_CONV_REG10);
reg_writeq((args.driver.image_block_num << 48) |
(args.driver.image_block_len << 24) |
args.driver.image_block_len_last,
REG_CONV_REG11);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
......@@ -307,12 +328,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
DLOG << "Conv Wait Irq Timeout!";
PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout");
}
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
active_args.activation_type = NONE;
reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
DLOG << "after reg poll";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
......@@ -350,22 +366,22 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0;
uint64_t reg_ActivationArgs = 0;
// uint64_t reg_ActivationArgs = 0;
// active function: {none, leakyrelu, sigmoid, tanh}
ActivationArgs active_args;
// ActivationArgs active_args;
// active_args.activation_type = LEAKYRELU;
active_args.activation_type = args.output.activation.activation_type;
// active_args.activation_type = args.output.activation.activation_type;
active_args.leaky_relu_negative_slope =
args.output.activation.leaky_relu_negative_slope;
// active_args.leaky_relu_negative_slope =
// args.output.activation.leaky_relu_negative_slope;
reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
active_args.leaky_relu_negative_slope;
// reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
// active_args.leaky_relu_negative_slope;
DLOG << " activation_type:" << active_args.activation_type
<< " leaky_relu_negative_slope:"
<< active_args.leaky_relu_negative_slope;
DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
// DLOG << " activation_type:" << active_args.activation_type
// << " leaky_relu_negative_slope:"
// << active_args.leaky_relu_negative_slope;
// DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
image_physical_address = vaddr_to_paddr_driver(args.image.address);
output_physical_address = vaddr_to_paddr_driver(args.output.address);
......@@ -417,10 +433,10 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
return ret;
}
reg_writeq(reg_ActivationArgs,
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
// reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER);
// reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq(
......@@ -462,12 +478,12 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG << "after reg poll";
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
// output_scale = reg_readq(REG_SCALE_PARAMETER);
// output_scale = (output_scale << 32) | (output_scale >> 32);
// fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
active_args.activation_type = NONE;
reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
// active_args.activation_type = NONE;
// reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
......@@ -479,7 +495,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
// DLOG << " relu_enabled:" << args.relu_enabled
DLOG << " relu_enabled:" << args.relu_enabled;
DLOG << " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
......@@ -503,17 +519,17 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
int ret = 0;
uint64_t output_scale = 0;
uint64_t reg_ActivationArgs = 0;
ActivationArgs active_args;
active_args.activation_type = args.output.activation.activation_type;
active_args.leaky_relu_negative_slope =
args.output.activation.leaky_relu_negative_slope;
reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
active_args.leaky_relu_negative_slope;
DLOG << " activation_type:" << active_args.activation_type
<< " leaky_relu_negative_slope:"
<< active_args.leaky_relu_negative_slope;
DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
// uint64_t reg_ActivationArgs = 0;
// ActivationArgs active_args;
// active_args.activation_type = args.output.activation.activation_type;
// active_args.leaky_relu_negative_slope =
// args.output.activation.leaky_relu_negative_slope;
// reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) |
// active_args.leaky_relu_negative_slope;
// DLOG << " activation_type:" << active_args.activation_type
// << " leaky_relu_negative_slope:"
// << active_args.leaky_relu_negative_slope;
// DLOG << " reg_ActivationArgs:" << reg_ActivationArgs;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
......@@ -523,8 +539,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return ret;
}
reg_writeq(reg_ActivationArgs,
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
// reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
......@@ -543,11 +559,11 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!");
}
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
active_args.activation_type = NONE;
reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
// output_scale = reg_readq(REG_SCALE_PARAMETER);
// output_scale = (output_scale << 32) | (output_scale >> 32);
// fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
// active_args.activation_type = NONE;
// reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
......
......@@ -200,10 +200,10 @@ uint64_t vaddr_to_paddr(void *address) {
}
uint32_t paddle_mobile_version() {
uint32_t v_master = 35;
uint32_t v_slave = 35;
uint32_t v_master = 52;
uint32_t v_slave = 52;
uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 2;
uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 1;
uint32_t master = first << 24 | second << 16 | v_master << 8 | fourth_master;
uint32_t slave = first << 24 | second << 16 | v_slave << 8 | fourth_slave;
......
......@@ -32,8 +32,12 @@ limitations under the License. */
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
// Alignment applied to (filter_num / group_num) when expand_conv_arg()
// computes fpga_bias_scale_len.
#define BS_NUM_ALIGNMENT (8)
// Divisor applied to fpga_bias_scale_len in expand_conv_arg().
#define BIAS_SCALE_DMA_NUM (4)
// Alignment of output_amount_per_row, and the divisor used for
// result_amount_per_row_multi_para, in expand_conv_arg().
#define RESULT_ALIGNMENT (32)
// expand_conv_arg() rounds filter_num up to a multiple of (4 * PE_COLUMN)
// when computing filter_align.
#define PE_COLUMN (8)
// Number of output rows grouped together when expand_conv_arg() derives
// cal_res_num and last_cal_res_row_num.
// NOTE(review): ROW_PARALLEL_NUM appears twice in this listing with different
// values, (2) here and (3) below. Presumably the two lines belong to different
// versions or different #if branches; confirm which value is active.
#define ROW_PARALLEL_NUM (2)
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
namespace paddle_mobile {
......@@ -89,37 +93,59 @@ struct ImageOutputArgs {
};
// Driver-level parameters for one convolution call; every member is a 64-bit
// unsigned value.
// expand_conv_arg() fills them through (*arg).driver.<member>, and
// ComputeBasicConv() reads them as args.driver.<member>, shifting and OR-ing
// several members into each 64-bit word that it passes to reg_writeq().
// NOTE(review): several member names occur twice in this listing (for example
// image_address_phy, output_width and cmd). Presumably it shows the removed
// and the added member lists of a diff together; confirm against the real
// header, because duplicate members would not compile.
struct ConvDriverParam {
uint64_t image_address_phy;
uint64_t filter_address_phy;
uint64_t sb_address_phy;
uint64_t output_address_phy;
uint64_t output_height;
uint64_t output_width;
uint64_t filter_per_group;
uint64_t channel_per_group;
uint64_t image_amount_per_row;
uint64_t image_one_pad_per_row;
uint64_t filter_amount_all;
uint64_t output_amount_per_row;
uint64_t deconv_param;
uint64_t col_padding_up;  // image.pad_width * image.channels
uint64_t col_padding_down;  // set to image_one_pad_per_row
uint64_t row_padding_up;  // image.pad_height
uint64_t row_padding_down;  // image.pad_height + image.height
uint64_t image_block_amount_per_row;
uint64_t filter_pad_width_mul_channel;
uint64_t image_amount_per_row_multi_win_first;
uint64_t image_amount_per_row_multi_win;
uint64_t image_block_num;
uint64_t image_block_len;
uint64_t image_block_len_last;
uint64_t image_win_cnt;
uint64_t image_win_cnt_last;
uint64_t res_row_data_align4_pad;
uint64_t filter_row;  // kernel.width * image.channels
uint64_t filter_width;  // kernel.width
uint64_t filter_height;  // kernel.height
uint64_t skip_window;  // image.channels * kernel.stride_w
uint64_t stride_h;  // kernel.stride_h
uint64_t filter_amount_all;
uint64_t prog_full_cnt;
uint64_t filter_align;  // filter_num / (4 * PE_COLUMN), rounded up
uint64_t filter_num;
uint64_t output_width;
uint64_t output_amount_per_row;
uint64_t res_row_data_align4_pad;
uint64_t cal_res_num;  // output_height / ROW_PARALLEL_NUM rounded up, minus 1
// output_height % ROW_PARALLEL_NUM, or ROW_PARALLEL_NUM when that is zero
uint64_t last_cal_res_row_num;
uint64_t post_prog_full_cnt;
uint64_t deconv_skip_row; // ROW_PARALLEL_NUM * deconv_tx_param.sub_conv_num
uint64_t deconv_res_skip_row; // deconv_tx_param.sub_conv_num * output_amount_per_row
uint64_t deconv_ena;  // deconv_tx_param.deconv_en
uint64_t deconv_dump;  // deconv_tx_param.omit_size
uint64_t output_address_phy;
uint64_t output_height;
// output_amount_per_row / RESULT_ALIGNMENT multiplied by deconv_skip_row when
// deconvolution is enabled, otherwise by ROW_PARALLEL_NUM
uint64_t result_amount_per_row_multi_para;
uint64_t sb_address_phy;
uint64_t fpga_bias_scale_len;
uint64_t cmd;  // command word written to REG_CONV_CMD
uint64_t filter_amount_whole;  // set to filter_amount_all
uint64_t filter_address_phy;
uint64_t filters_amount_whole;  // filter_amount_all * filter_align * (4 * PE_COLUMN)
uint64_t image_address_phy;
uint64_t image_hight;  // image.height
uint64_t image_amount_per_row;
uint64_t image_amount_per_row_multi_win_first;
uint64_t image_amount_per_row_multi_win;
uint64_t filter_pad_hight;  // image.pad_height
uint64_t image_block_num;
uint64_t image_block_len;
uint64_t image_block_len_last;
// (deconv_en << 16) | (sub_conv_num << 8) | omit_size, from deconv_tx_param
uint64_t deconv_param;
uint64_t cmd;
};
struct EWAddDriverParam {
......@@ -141,6 +167,7 @@ struct DeconvTxParm {
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias
void* filter_address;
float* filter_scale_address;
......@@ -209,6 +236,7 @@ struct PoolingArgs {
};
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
......@@ -238,6 +266,7 @@ struct DeconvArgs {
};
struct DWconvArgs {
uint32_t sub_conv_num;
bool relu_enabled;
void* bias_address;
void* filter_address;
struct KernelArgs kernel;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "framework/executor.h"
#include <algorithm>
#include <unordered_map>
#include <utility>
#include <vector>
#include "common/enforce.h"
......@@ -638,7 +639,8 @@ std::map<std::string, float> LoadQuantValFromFile(std::string filename) {
std::ifstream in;
in.open(filename, std::ios::in);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
// std::cout << "open File Failed." << std::endl;
DLOG << "open File Failed.";
exit(-1);
}
......
......@@ -22,6 +22,7 @@ namespace operators {
template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -34,7 +35,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
......@@ -64,10 +65,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
......
......@@ -23,9 +23,9 @@ namespace operators {
template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
bool relu_enabled = true;
// paddle_mobile::fpga::ActivationType activation_enable =
// paddle_mobile::fpga::LEAKYRELU;
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
......@@ -34,7 +34,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
const int groups = param->Groups();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>();
......@@ -70,17 +70,17 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, strides[0], strides[1],
paddings[0], paddings[1], new_bias_ptr);
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled,
strides[0], strides[1], paddings[0], paddings[1],
new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(), strides[0],
strides[1], paddings[0], paddings[1], bs_ptr);
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), strides[0], strides[1], paddings[0],
paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
}
delete new_scale;
......
......@@ -31,7 +31,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
......@@ -45,8 +45,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
......
......@@ -31,7 +31,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
......@@ -45,8 +45,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
......
......@@ -30,7 +30,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -56,8 +56,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
......
......@@ -29,7 +29,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -58,17 +58,16 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], new_bias_ptr);
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, true,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1],
new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
......
......@@ -29,7 +29,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
int channel = out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
......@@ -40,8 +40,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
......
......@@ -31,7 +31,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
int channel = out->dims()[1];
......@@ -58,8 +58,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
......@@ -70,10 +69,10 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
}
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
......
......@@ -33,7 +33,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......@@ -61,8 +61,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
......@@ -73,10 +72,10 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
}
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
......
......@@ -34,7 +34,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......@@ -62,8 +62,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
......@@ -74,10 +73,10 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
}
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
......
......@@ -33,7 +33,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......@@ -61,8 +61,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
......@@ -73,10 +72,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
}
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
......
......@@ -34,7 +34,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......@@ -57,8 +57,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
......@@ -69,10 +68,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
}
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
......
......@@ -35,7 +35,7 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -80,18 +80,17 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
delete new_scale;
......
......@@ -44,7 +44,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
}
fpga::format_image(input);
output->ShareDataWith(*input);
input->external_data = nullptr;
}
template class FeedKernel<FPGA, float>;
......
......@@ -20,6 +20,7 @@ namespace operators {
template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -58,8 +59,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
fpga::format_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -20,6 +20,7 @@ namespace operators {
template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -58,8 +59,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
fpga::format_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册