未验证 提交 9437e287 编写于 作者: qnqinan's avatar qnqinan 提交者: GitHub

Merge pull request #1342 from zhangyang0701/develop

init parameters for conv&EW drivers for FPGA track close #1341
...@@ -21,6 +21,9 @@ limitations under the License. */ ...@@ -21,6 +21,9 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
#define USE_RELU 1
#define USE_BIAS 2
int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
void format_image(framework::Tensor *image_tensor) { void format_image(framework::Tensor *image_tensor) {
...@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width, ...@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out->reset_data_ptr(data_ptr); out->reset_data_ptr(data_ptr);
} }
void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height *
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT);
uint64_t fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
uint64_t output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1;
uint64_t output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num;
auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
auto channel_per_group = (uint64_t)(args.image.channels / args.group_num);
uint64_t image_row_count = ((uint64_t)args.image.width) *
((uint64_t)args.image.channels); // without align
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row =
align_to_x(image_row_count, IMAGE_ALIGNMENT) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
FILTER_ELEMENT_ALIGNMENT);
uint64_t output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT);
// find the opt partition strategy
uint64_t res_win;
uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
if ((align_to_x(
(args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
1) *
args.kernel.height >
2048) {
break;
}
}
if (res_win != output_width) {
res_win -= 1;
}
if (((res_win % 2) != 0) && (res_win != 1)) {
res_win = res_win - 1;
}
res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1);
uint64_t res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels;
uint64_t filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num;
uint64_t image_block_len =
align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
1;
uint64_t image_block_len_last =
align_to_x(
(args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
1;
uint64_t image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) {
prog_full_cnt--;
}
uint64_t post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0;
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
(*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
(*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address);
(*arg).driver.output_height = output_height;
(*arg).driver.output_width = output_width;
(*arg).driver.filter_per_group = filter_per_group;
(*arg).driver.channel_per_group = channel_per_group;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.image_one_pad_per_row = image_one_pad_per_row;
(*arg).driver.filter_amount_all = filter_amount_all;
(*arg).driver.output_amount_per_row = output_amount_per_row;
(*arg).driver.image_block_amount_per_row = image_block_amount_per_row;
(*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
(*arg).driver.image_amount_per_row_multi_win_first =
image_amount_per_row_multi_win_first;
(*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
(*arg).driver.image_block_num = image_block_num;
(*arg).driver.image_block_len = image_block_len;
(*arg).driver.image_block_len_last = image_block_len_last;
(*arg).driver.image_win_cnt = image_win_cnt;
(*arg).driver.image_win_cnt_last = image_win_cnt_last;
(*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
(*arg).driver.prog_full_cnt = prog_full_cnt;
(*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
(*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
(*arg).driver.cmd = cmd;
} // expand_conv_arg()
void expand_EW_arg(EWAddArgs *arg) {
EWAddArgs args = *arg;
uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
(uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address);
uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address);
uint64_t output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGNMENT);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height;
(*arg).driver.image0_address_phy = image0_address_phy;
(*arg).driver.image1_address_phy = image1_address_phy;
(*arg).driver.datalen = datalen;
(*arg).driver.image_image_pixel = image_image_pixel;
(*arg).driver.image_amount_per_row = image_amount_per_row;
(*arg).driver.output_address_phy = output_address_phy;
(*arg).driver.coefficient = coefficient;
(*arg).driver.cmd = cmd;
} // expand_EW_arg
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, bool relu_enabled, int group_num, int stride_h,
...@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto channel = (int)out->dims()[1]; // NOLINT auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num); int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num( int element_num = get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
arg->conv_arg[i].relu_enabled = relu_enabled; arg->conv_arg[i].relu_enabled = relu_enabled;
...@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale; arg->conv_arg[i].filter_scale_address = filter->scale;
// arg->conv_arg[i].filter_address = &(
// (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; //
// NOLINT
// arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].filter_num = (uint32_t)( arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div); : filter_num_per_div);
size_t filter_size = size_t filter_size =
element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); element_num *
align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head = auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_arg[i].filter_address = fpga_malloc(filter_size); arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); size_t bs_size = 2 *
align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size); arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
...@@ -249,11 +415,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -249,11 +415,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
if (n > 1) { if (n > 1) {
arg->conv_arg[i].output.scale_address = arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT (float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = arg->conv_arg[i].output.address = fpga_malloc(
fpga_malloc(out->dims()[2] * out->dims()[2] *
align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
IMAGE_ALIGNMENT) * IMAGE_ALIGNMENT) *
sizeof(half)); sizeof(half));
} else { } else {
arg->conv_arg[i].output.scale_address = out->scale; arg->conv_arg[i].output.scale_address = out->scale;
arg->conv_arg[i].output.address = out_ptr; arg->conv_arg[i].output.address = out_ptr;
...@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
(half *)arg->conv_arg[i].output.address; // NOLINT (half *)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
expand_conv_arg(&arg->conv_arg[i]);
} }
filter->reset_data_ptr(nullptr); filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr); fpga_free(bs_ptr);
} } // fill_split_arg
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, bool relu_enabled, int group_num, int stride_h,
...@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
auto out_ptr = out->data<float>(); auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num; arg->group_num = (uint32_t)group_num;
arg->sub_conv_num = stride_h; arg->sub_conv_num = (uint32_t)stride_h;
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num; int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1; int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w, int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
stride_w); padding_w, stride_w);
int sub_filter_width = int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w); (int)filter->dims()[3], stride_w);
int sub_output_width = deconv_filter::deconv_get_sub_out_axis( int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
input->dims()[3], sub_pad, sub_filter_width); (int)input->dims()[3], sub_pad, sub_filter_width);
int sub_output_height = deconv_filter::deconv_get_sub_out_axis( int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
input->dims()[2], sub_pad, sub_filter_width); (int)input->dims()[2], sub_pad, sub_filter_width);
arg->sub_output_width = sub_output_width; arg->sub_output_width = (uint32_t)sub_output_width;
arg->sub_output_height = sub_output_height; arg->sub_output_height = (uint32_t)sub_output_height;
arg->omit_size = arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w); stride_w, (int)filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
int sub_channels = (int32_t)input->dims()[1]; int sub_channels = (int)input->dims()[1];
int omit_size = arg->omit_size; int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
...@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
arg->conv_args[i].group_num = group_num; arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].relu_enabled = relu_enabled; arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].kernel.width = sub_filter_width; arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.height = sub_filter_width; arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1; arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image; // DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale; arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = sub_channels; arg->conv_args[i].image.channels = (uint32_t)sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.pad_width = sub_pad; arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
arg->conv_args[i].image.pad_height = sub_pad; arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
arg->conv_args[i].image.address = input_ptr; arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr; arg->conv_args[i].sb_address = (void *)bs_ptr;
char *filter_sub_space = auto filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space, fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count, (char *)filter_ptr + i * align_conv_sub_filter_count,
align_conv_sub_filter_count); (size_t)align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space); arg->conv_args[i].filter_address = (void *)(filter_sub_space);
fpga_flush(filter_sub_space, align_conv_sub_filter_count); fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
if (sub_conv_num == 1) { if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.scale_address = out->scale;
} else { } else {
half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output); arg->conv_args[i].output.address = (void *)((half *)ptr_output);
float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale; arg->conv_args[i].output.scale_address = ptr_output_scale;
} }
} }
...@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->output.address = out_ptr; arg->output.address = out_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
// fpga_free(filter_ptr); // fpga_free(filter_ptr);
} } // fill_deconv_arg
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -14,8 +14,6 @@ limitations under the License. */ ...@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#define BS_NUM_ALIGNMENT 8
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace bias_scale { namespace bias_scale {
......
...@@ -14,8 +14,6 @@ limitations under the License. */ ...@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#define BS_NUM_ALIGNMENT 8
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace deconv_bias_scale { namespace deconv_bias_scale {
......
...@@ -14,9 +14,6 @@ limitations under the License. */ ...@@ -14,9 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
......
...@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, ...@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
} }
void split_image(int16_t *image_in, float *scale_in, void **images_out, void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num, uint32_t *channel_nums, float **scales_out, int image_num,
int height, int width) { const uint32_t *channel_nums, int height, int width) {
int total_channel = 0; int total_channel = 0;
for (int i = 0; i < image_num; i++) { for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0]; scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1]; scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i]; total_channel += channel_nums[i];
} }
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
int src_offset = 0, des_offset = 0;
for (int h = 0; h < height; h++) { for (int h = 0; h < height; h++) {
int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT); for (int w = 0; w < width; w++) {
for (int i = 0; i < image_num; i++) { src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT); w * total_channel;
memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, for (int i = 0; i < image_num; i++) {
channel_nums[i] * sizeof(int16_t)); des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
src_offset += channel_nums[i]; w * channel_nums[i];
memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i];
}
} }
} }
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
}
} }
} // namespace image } // namespace image
......
...@@ -14,9 +14,8 @@ limitations under the License. */ ...@@ -14,9 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <cstdint>
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace image { namespace image {
...@@ -24,13 +23,16 @@ namespace image { ...@@ -24,13 +23,16 @@ namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width); void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw); void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width); void format_image(float** data_in, int channel, int height, int width);
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out, void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num, float* scale_out, int image_num, uint32_t* channel_num,
int height, int height, int width);
int width); // Concat featuremaps along channel direction
void split_image(int16_t* image_in, float* scale_in, void** images_out, // Split featuremap along channel direction
float** scales_out, int image_num, uint32_t* channel_nums, void split_image(int16_t* image_in, const float* scale_in, void** images_out,
int height, int width); float** scales_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image } // namespace image
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
DLOG << " out_address:" << args.output.address DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif #endif
cout << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "Conv"; int ret = 0;
// return 0; uint64_t output_scale = 0;
uint64_t timer_cnt; /*
uint64_t output_scale; uint64_t output_scale;
uint64_t image_scale; uint64_t image_scale;
uint64_t filter_scale; uint64_t filter_scale;
...@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
uint64_t sb_address_phy = 0; uint64_t sb_address_phy = 0;
uint64_t filter_address_phy = 0; uint64_t filter_address_phy = 0;
uint64_t output_address_phy = 0; uint64_t output_address_phy = 0;
int ret = 0;
fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
cout << "image_scale :" << hex << (image_scale) << endl;
cout << "filter_scale :" << hex << (filter_scale) << endl;
uint64_t filterlen = (uint64_t)args.kernel.width * uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height * (uint64_t)args.kernel.height *
(uint64_t)args.image.channels; (uint64_t)args.image.channels;
...@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
filter_address_phy = vaddr_to_paddr(args.filter_address); filter_address_phy = vaddr_to_paddr(args.filter_address);
output_address_phy = vaddr_to_paddr(args.output.address); output_address_phy = vaddr_to_paddr(args.output.address);
/*SDK刷Cache保证数据一致性*/
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
*/
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
...@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq( reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_CONV_IMAGE_PIXEL); REG_CONV_IMAGE_PIXEL);
reg_writeq( reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_CONV_FILTER_PIXEL); REG_CONV_FILTER_PIXEL);
reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL); reg_writeq(args.driver.output_height | (args.driver.output_width << 32),
REG_CONV_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) | reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32), (((uint64_t)args.image.pad_width) << 32),
REG_CONV_PAD_PIXEL); REG_CONV_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) | reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32), (((uint64_t)args.kernel.stride_w) << 32),
REG_CONV_STEP_PIXEL); REG_CONV_STEP_PIXEL);
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
reg_writeq(args.driver.image_block_num, 0xcc8);
reg_writeq(args.driver.image_block_len, 0xcd0);
reg_writeq(args.driver.image_block_len_last, 0xcd8);
reg_writeq(args.driver.image_win_cnt, 0xce0);
reg_writeq(args.driver.image_win_cnt_last, 0xce8);
reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
reg_writeq(args.driver.prog_full_cnt, 0xd08);
reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(image_block_amount_per_row, 0xca8);
reg_writeq(filter_pad_width_mul_channel, 0xcb0);
reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(image_amount_per_row_multi_win, 0xcc0);
reg_writeq(image_block_num, 0xcc8);
reg_writeq(image_block_len, 0xcd0);
reg_writeq(image_block_len_last, 0xcd8);
reg_writeq(image_win_cnt, 0xce0);
reg_writeq(image_win_cnt_last, 0xce8);
reg_writeq(res_row_data_align4_pad, 0xcf8);
reg_writeq(prog_full_cnt, 0xd08);
reg_writeq(post_prog_full_cnt, 0xd10);
reg_writeq(fpga_bias_scale_len / 4, 0xd20);
/*write scale*/
reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
reg_writeq(cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "Conv Wait Irq Timeout!"; DLOG << "Conv Wait Irq Timeout!";
} }
DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
cout << "output_scale :" << hex << (output_scale) << endl;
//*(args.output.scale_address) = output_scale;
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
...@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG << "Pooling Wait Irq Timeout!"; DLOG << "Pooling Wait Irq Timeout!";
} }
DLOG << "after reg poll"; DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
...@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "Conv";
// return 0;
int ret = 0; int ret = 0;
uint64_t output_scale = 0; uint64_t output_scale = 0;
uint64_t timer_cnt = 0; /*uint64_t timer_cnt = 0;
uint64_t image0_address_phy = 0; uint64_t image0_address_phy = 0;
uint64_t image1_address_phy = 0; uint64_t image1_address_phy = 0;
uint64_t output_address_phy = 0; uint64_t output_address_phy = 0;
...@@ -629,54 +587,44 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -629,54 +587,44 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
(uint64_t)args.image0.height * (uint64_t)args.image0.height *
(uint64_t)args.image0.channels; (uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
image0_address_phy = vaddr_to_paddr(args.image0.address); image0_address_phy = vaddr_to_paddr(args.image0.address);
image1_address_phy = vaddr_to_paddr(args.image1.address); image1_address_phy = vaddr_to_paddr(args.image1.address);
output_address_phy = vaddr_to_paddr(args.output.address); output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGN); IMAGE_ALIGN);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) | ((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height; (uint64_t)args.image0.height;*/
/*SDK刷Cache保证数据一致性*/ pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
ret = -EIO;
DLOG << "EW Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); reg_writeq(args.driver.datalen, REG_EW_DATA_LEN);
reg_writeq(datalen, REG_EW_DATA_LEN); reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL);
reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL); reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR);
reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT);
reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR); reg_writeq(args.driver.cmd, REG_EW_CMD);
reg_writeq(coefficient, REG_EW_COEFFICIENT);
reg_writeq(cmd, REG_EW_CMD);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "EW Wait Irq Timeout!"; DLOG << "EW Wait Irq Timeout!";
} }
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
...@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) { ...@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) {
DLOG << "BYPASS Wait Irq Timeout!"; DLOG << "BYPASS Wait Irq Timeout!";
} }
DLOG << "after reg poll"; DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
...@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, ...@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
*data_in = ptr_deconv; *data_in = ptr_deconv;
fpga_free(ptr_tmp); fpga_free(ptr_tmp);
} }
int ComputeFpgaDeconv(const struct DeconvArgs &args) { int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFPGADeConv==========="; DLOG << "=============ComputeFPGADeConv===========";
DLOG << " filter_num:" << args.filter_num DLOG << " filter_num:" << args.filter_num
<< " group_num:" << args.group_num << " group_num:" << args.group_num
......
...@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { ...@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
for (i = 0; i < timeout; i++) { for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) { if (val == reg_readq(reg)) {
std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
<< std::endl;
break; break;
} }
} }
...@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { ...@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
DLOG << "dest:" << dest << " src:" << src << " size:" << num; DLOG << "dest:" << dest << " src:" << src << " size:" << num;
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
// DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
// usleep(1);
*((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT
} }
......
...@@ -103,22 +103,15 @@ struct FPGA_INFO { ...@@ -103,22 +103,15 @@ struct FPGA_INFO {
extern struct FPGA_INFO g_fpgainfo; extern struct FPGA_INFO g_fpgainfo;
inline uint64_t reg_readq(uint32_t offset) { inline uint64_t reg_readq(uint32_t offset) {
// DLOG << "offset : " << offset;
uint64_t value = uint64_t value =
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset); // NOLINT offset); // NOLINT
// DLOG << "read end";
usleep(10);
return value; return value;
} }
inline void reg_writeq(uint64_t value, uint32_t offset) { inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset) = value; offset) = value;
// DLOG << "write end";
usleep(10);
} }
int open_device_driver(); int open_device_driver();
......
...@@ -20,6 +20,13 @@ limitations under the License. */ ...@@ -20,6 +20,13 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
#ifdef PADDLE_MOBILE_FPGA_V1
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#endif
enum DataType { enum DataType {
DATA_TYPE_FP32 = 1, DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0, DATA_TYPE_FP16 = 0,
...@@ -52,19 +59,70 @@ struct ImageOutputArgs { ...@@ -52,19 +59,70 @@ struct ImageOutputArgs {
float* scale_address; // output scale address; float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation uint64_t timer_cnt; // time counter for FPGA computation
}; };
#ifdef PADDLE_MOBILE_FPGA_V1
struct ConvDriverParam {
uint64_t image_address_phy;
uint64_t filter_address_phy;
uint64_t sb_address_phy;
uint64_t output_address_phy;
uint64_t output_height;
uint64_t output_width;
uint64_t filter_per_group;
uint64_t channel_per_group;
uint64_t image_amount_per_row;
uint64_t image_one_pad_per_row;
uint64_t filter_amount_all;
uint64_t output_amount_per_row;
uint64_t image_block_amount_per_row;
uint64_t filter_pad_width_mul_channel;
uint64_t image_amount_per_row_multi_win_first;
uint64_t image_amount_per_row_multi_win;
uint64_t image_block_num;
uint64_t image_block_len;
uint64_t image_block_len_last;
uint64_t image_win_cnt;
uint64_t image_win_cnt_last;
uint64_t res_row_data_align4_pad;
uint64_t prog_full_cnt;
uint64_t post_prog_full_cnt;
uint64_t fpga_bias_scale_len;
uint64_t cmd;
};
struct EWAddDriverParam {
uint64_t image0_address_phy;
uint64_t image1_address_phy;
uint64_t datalen;
uint64_t image_image_pixel;
uint64_t image_amount_per_row;
uint64_t output_address_phy;
uint64_t coefficient;
uint64_t cmd;
};
#endif
struct ConvArgs { struct ConvArgs {
bool relu_enabled; bool relu_enabled;
void* sb_address; // scale and bias void* sb_address; // scale and bias
void* filter_address; void* filter_address;
float* filter_scale_address; float* filter_scale_address;
void* free_space; // used by FPGA logic
uint32_t filter_num; uint32_t filter_num;
uint32_t group_num; uint32_t group_num;
struct KernelArgs kernel; struct KernelArgs kernel;
struct ImageInputArgs image; // input image; struct ImageInputArgs image; // input image;
struct ImageOutputArgs output; struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V2
void* free_space; // used by FPGA logic
#endif
#ifdef PADDLE_MOBILE_FPGA_V1
struct ConvDriverParam driver;
#endif
}; };
struct ConcatArgs { struct ConcatArgs {
...@@ -115,6 +173,9 @@ struct EWAddArgs { ...@@ -115,6 +173,9 @@ struct EWAddArgs {
struct ImageInputArgs image0; struct ImageInputArgs image0;
struct ImageInputArgs image1; struct ImageInputArgs image1;
struct ImageOutputArgs output; struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V1
struct EWAddDriverParam driver;
#endif
}; };
struct BypassArgs { struct BypassArgs {
...@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num); ...@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num);
int fpga_flush(void* address, size_t size); int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size); int fpga_invalidate(void* address, size_t size);
uint64_t vaddr_to_paddr(void* address);
void expand_conv_arg(ConvArgs* arg);
void expand_EW_arg(EWAddArgs* arg);
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { ...@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
ewaddArgs.image1.pad_width = 0; ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr; ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs); param->SetFpgaArgs(ewaddArgs);
return true; return true;
} }
......
...@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
ewaddArgs.image1.pad_width = 0; ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr; ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs); param->SetFpgaArgs(ewaddArgs);
return true; return true;
} }
......
...@@ -24,8 +24,12 @@ template <> ...@@ -24,8 +24,12 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto out = param->Out();
fpga::format_fp32_ofm(out);
auto float_input = new Tensor; auto float_input = new Tensor;
float_input->mutable_data<float>({1, input->dims()[1]}); float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
fpga::format_fp32_ofm(float_input); fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
...@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { ...@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args.input_data_type = fpga::DATA_TYPE_FP16; args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32; args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr; args.image.address = input_ptr;
args.image.height = 1; args.image.height = (uint32_t)input->dims()[2];
args.image.width = 1; args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>(); args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale; args.output.scale_address = float_input->scale;
...@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) { ...@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *out = param.Out(); Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate( fpga::fpga_invalidate((void *)in_x->data<float>(), // NOLINT
(void *)in_x->data<float>(), // NOLINT in_x->numel() * sizeof(float));
fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); // TODO: In general case, 0 should be squeezed before softmax input
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size()); fpga::fpga_flush(out->data<float>(), out->memory_size());
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册