diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index efcc9f742e88e9a96394a2f6a15118c918624515..15532aa9b35a8d0a05b8ca9c4623f007be93a02f 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -21,6 +21,9 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { +#define USE_RELU 1 +#define USE_BIAS 2 + int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } void format_image(framework::Tensor *image_tensor) { @@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width, out->reset_data_ptr(data_ptr); } +void expand_conv_arg(ConvArgs *arg) { + ConvArgs args = *arg; + uint64_t filterlen = (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height * + (uint64_t)args.image.channels; + filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT); + filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT); + uint64_t fpga_bias_scale_len = + align_to_x(args.filter_num / args.group_num, 8) * args.group_num; + + uint64_t output_height = + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1; + uint64_t output_width = + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1; + uint64_t output_size = + output_height * output_width * (uint64_t)args.filter_num; + + auto filter_per_group = (uint64_t)(args.filter_num / args.group_num); + auto channel_per_group = (uint64_t)(args.image.channels / args.group_num); + + uint64_t image_row_count = ((uint64_t)args.image.width) * + ((uint64_t)args.image.channels); // without align + uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); + uint64_t image_one_pad_per_row = + align_to_x(image_row_count, IMAGE_ALIGNMENT) + + ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels); + uint64_t filter_amount_all = + align_to_x(((uint64_t)args.kernel.height) * + ((uint64_t)args.kernel.width) * channel_per_group, + FILTER_ELEMENT_ALIGNMENT); + + uint64_t output_amount_per_row = + align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT); + + // find the opt partition strategy + uint64_t res_win; + uint64_t res_fit = 0; + for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { + if ((align_to_x( + (args.image.channels * + (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), + IMAGE_ALIGNMENT) / + 16 + + 1) * + args.kernel.height > + 2048) { + break; + } + } + + if (res_win != output_width) { + res_win -= 1; + } + + if (((res_win % 2) != 0) && (res_win != 1)) { + res_win = res_win - 1; + } + res_fit = res_win; + + uint64_t block_num = (output_width + res_fit - 1) / res_fit; + uint64_t block_len = res_fit; + uint64_t block_last = output_width - res_fit * (block_num - 1); + + uint64_t res_amount_per_row = output_width * args.filter_num; + uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; + + uint64_t image_block_amount_per_row = + args.kernel.stride_w * (res_fit)*args.image.channels; + uint64_t filter_pad_width_mul_channel = + args.image.pad_width * args.image.channels; + uint64_t image_amount_per_row_multi_win_first = + image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); + uint64_t image_amount_per_row_multi_win = + image_amount_per_row * (4 * args.kernel.stride_h); + + uint64_t image_block_num = block_num; + uint64_t image_block_len = + align_to_x((args.image.channels * + (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), + IMAGE_ALIGNMENT) / + 16 + + 1; + uint64_t image_block_len_last = + align_to_x( + (args.image.channels * + (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), + IMAGE_ALIGNMENT) / + 16 + + 1; + uint64_t image_win_cnt = block_len; + uint64_t image_win_cnt_last = block_last; + uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; + uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; + if (prog_full_cnt == 1023) { + prog_full_cnt--; + } + uint64_t post_prog_full_cnt = + (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) + ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) + : 0; + uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; + + (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); + (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); + (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); + (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address); + (*arg).driver.output_height = output_height; + (*arg).driver.output_width = output_width; + (*arg).driver.filter_per_group = filter_per_group; + (*arg).driver.channel_per_group = channel_per_group; + (*arg).driver.image_amount_per_row = image_amount_per_row; + (*arg).driver.image_one_pad_per_row = image_one_pad_per_row; + (*arg).driver.filter_amount_all = filter_amount_all; + (*arg).driver.output_amount_per_row = output_amount_per_row; + (*arg).driver.image_block_amount_per_row = image_block_amount_per_row; + (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel; + (*arg).driver.image_amount_per_row_multi_win_first = + image_amount_per_row_multi_win_first; + (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win; + (*arg).driver.image_block_num = image_block_num; + (*arg).driver.image_block_len = image_block_len; + (*arg).driver.image_block_len_last = image_block_len_last; + (*arg).driver.image_win_cnt = image_win_cnt; + (*arg).driver.image_win_cnt_last = image_win_cnt_last; + (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad; + (*arg).driver.prog_full_cnt = prog_full_cnt; + (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; + (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; + (*arg).driver.cmd = cmd; +} // expand_conv_arg() + +void expand_EW_arg(EWAddArgs *arg) { + EWAddArgs args = *arg; + uint64_t cmd = args.relu_enabled ? USE_RELU : 0; + uint64_t datalen = (uint64_t)args.image0.width * + (uint64_t)args.image0.height * + (uint64_t)args.image0.channels; + uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; + uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address); + uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address); + uint64_t output_address_phy = vaddr_to_paddr(args.output.address); + + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, + IMAGE_ALIGNMENT); + uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | + ((uint64_t)args.image0.width << 16) | + (uint64_t)args.image0.height; + + (*arg).driver.image0_address_phy = image0_address_phy; + (*arg).driver.image1_address_phy = image1_address_phy; + (*arg).driver.datalen = datalen; + (*arg).driver.image_image_pixel = image_image_pixel; + (*arg).driver.image_amount_per_row = image_amount_per_row; + (*arg).driver.output_address_phy = output_address_phy; + (*arg).driver.coefficient = coefficient; + (*arg).driver.cmd = cmd; + +} // expand_EW_arg + void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, bool relu_enabled, int group_num, int stride_h, @@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, auto channel = (int)out->dims()[1]; // NOLINT int filter_num_per_div = get_filter_num_per_div(filter, group_num); int element_num = get_aligned_filter_element_num( - filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3])); for (int i = 0; i < n; i++) { arg->conv_arg[i].relu_enabled = relu_enabled; @@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].filter_scale_address = filter->scale; - // arg->conv_arg[i].filter_address = &( - // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // - // NOLINT - // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); size_t filter_size = - element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); + element_num * + align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) * + sizeof(int8_t); auto filter_head = &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; arg->conv_arg[i].filter_address = fpga_malloc(filter_size); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size); - size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); + size_t bs_size = 2 * + align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) * + sizeof(float); auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; arg->conv_arg[i].sb_address = fpga_malloc(bs_size); memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); @@ -249,11 +415,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, if (n > 1) { arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_arg[i].output.address = - fpga_malloc(out->dims()[2] * - align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, - IMAGE_ALIGNMENT) * - sizeof(half)); + arg->conv_arg[i].output.address = fpga_malloc( + out->dims()[2] * + align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num), + IMAGE_ALIGNMENT) * + sizeof(half)); } else { arg->conv_arg[i].output.scale_address = out->scale; arg->conv_arg[i].output.address = out_ptr; @@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, (half *)arg->conv_arg[i].output.address; // NOLINT arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; + + expand_conv_arg(&arg->conv_arg[i]); } filter->reset_data_ptr(nullptr); fpga_free(bs_ptr); -} +} // fill_split_arg + void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, bool relu_enabled, int group_num, int stride_h, @@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, auto out_ptr = out->data(); arg->group_num = (uint32_t)group_num; - arg->sub_conv_num = stride_h; + arg->sub_conv_num = (uint32_t)stride_h; arg->filter_num = (uint32_t)filter->dims()[0]; - int sub_conv_num = arg->sub_conv_num; int sub_stride = 1; - int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w, - stride_w); - int sub_filter_width = - deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w); + int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], + padding_w, stride_w); + int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis( + (int)filter->dims()[3], stride_w); int sub_output_width = deconv_filter::deconv_get_sub_out_axis( - input->dims()[3], sub_pad, sub_filter_width); + (int)input->dims()[3], sub_pad, sub_filter_width); int sub_output_height = deconv_filter::deconv_get_sub_out_axis( - input->dims()[2], sub_pad, sub_filter_width); + (int)input->dims()[2], sub_pad, sub_filter_width); - arg->sub_output_width = sub_output_width; - arg->sub_output_height = sub_output_height; - arg->omit_size = - deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w); + arg->sub_output_width = (uint32_t)sub_output_width; + arg->sub_output_height = (uint32_t)sub_output_height; + arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( + stride_w, (int)filter->dims()[3], padding_w); arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); - int sub_channels = (int32_t)input->dims()[1]; + int sub_channels = (int)input->dims()[1]; int omit_size = arg->omit_size; int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; @@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, for (int i = 0; i < sub_conv_num; ++i) { arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); - arg->conv_args[i].group_num = group_num; + arg->conv_args[i].group_num = (uint32_t)group_num; arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].kernel.width = sub_filter_width; - arg->conv_args[i].kernel.height = sub_filter_width; + arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width; + arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width; arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_h = 1; // DeconvParam.conv_args[i].image.address = (void*)ptr_image; arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.channels = sub_channels; + arg->conv_args[i].image.channels = (uint32_t)sub_channels; arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.pad_width = sub_pad; - arg->conv_args[i].image.pad_height = sub_pad; + arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; + arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].sb_address = (void *)bs_ptr; - char *filter_sub_space = + auto filter_sub_space = (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); fpga_copy(filter_sub_space, (char *)filter_ptr + i * align_conv_sub_filter_count, - align_conv_sub_filter_count); + (size_t)align_conv_sub_filter_count); arg->conv_args[i].filter_address = (void *)(filter_sub_space); - fpga_flush(filter_sub_space, align_conv_sub_filter_count); + fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count); if (sub_conv_num == 1) { arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.scale_address = out->scale; } else { - half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); + auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); arg->conv_args[i].output.address = (void *)((half *)ptr_output); - float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); + auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); arg->conv_args[i].output.scale_address = ptr_output_scale; } } @@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->output.address = out_ptr; arg->output.scale_address = out->scale; // fpga_free(filter_ptr); -} +} // fill_deconv_arg + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/bias_scale.h b/src/fpga/V1/bias_scale.h index 2d1e44c5470dae02fde6956a3744edc2e371a87b..9ae572a556a01719d918809ab9becea2d9fa5a20 100644 --- a/src/fpga/V1/bias_scale.h +++ b/src/fpga/V1/bias_scale.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#define BS_NUM_ALIGNMENT 8 - namespace paddle_mobile { namespace fpga { namespace bias_scale { diff --git a/src/fpga/V1/deconv_bias_scale.h b/src/fpga/V1/deconv_bias_scale.h index 7b9aaff756809b43884883b3333fcbc1dd2b6adf..820c6984d439f945ea4fc5f560fb346869026003 100644 --- a/src/fpga/V1/deconv_bias_scale.h +++ b/src/fpga/V1/deconv_bias_scale.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#define BS_NUM_ALIGNMENT 8 - namespace paddle_mobile { namespace fpga { namespace deconv_bias_scale { diff --git a/src/fpga/V1/filter.h b/src/fpga/V1/filter.h index 7bcc334738be98a49fc7eb25cf7090c0f8b6b3b3..6cb35d380733b0bf64a3a782d44fd321e3f00cfa 100644 --- a/src/fpga/V1/filter.h +++ b/src/fpga/V1/filter.h @@ -14,9 +14,6 @@ limitations under the License. */ #pragma once -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 - namespace paddle_mobile { namespace fpga { namespace filter { diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index f4142ad58a273691c84db9dd585518e7edcff8a6..c79a5c3a8e7c4f47cd11c2c4af14feb69efed48d 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); } -void split_image(int16_t *image_in, float *scale_in, void **images_out, - float **scales_out, int image_num, uint32_t *channel_nums, - int height, int width) { +void split_image(int16_t *image_in, const float *scale_in, void **images_out, + float **scales_out, int image_num, + const uint32_t *channel_nums, int height, int width) { int total_channel = 0; for (int i = 0; i < image_num; i++) { scales_out[i][0] = scale_in[0]; scales_out[i][1] = scale_in[1]; total_channel += channel_nums[i]; } + int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT); + fpga_invalidate(image_in, element_num * sizeof(int16_t)); + int src_offset = 0, des_offset = 0; for (int h = 0; h < height; h++) { - int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT); - for (int i = 0; i < image_num; i++) { - int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, - channel_nums[i] * sizeof(int16_t)); - src_offset += channel_nums[i]; + for (int w = 0; w < width; w++) { + src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) + + w * total_channel; + for (int i = 0; i < image_num; i++) { + des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + + w * channel_nums[i]; + memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, + channel_nums[i] * sizeof(int16_t)); + src_offset += channel_nums[i]; + } } } + + for (int i = 0; i < image_num; i++) { + element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT); + fpga_flush(images_out[i], element_num * sizeof(int16_t)); + } } } // namespace image diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index 321967bbe233c5bec889aeb63f98dc23779b4918..f3c7b2731cb555c0c8871f6cd1d9f9df3e6429f2 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -14,9 +14,8 @@ limitations under the License. */ #pragma once -#include +#include -#define IMAGE_ALIGNMENT 16 // Aligned to 16 namespace paddle_mobile { namespace fpga { namespace image { @@ -24,13 +23,16 @@ namespace image { void convert_to_hwc(float** data_in, int channel, int height, int width); void align_element_conv(float** data_in, int height, int cw); void format_image(float** data_in, int channel, int height, int width); + +// Concat featuremaps along channel direction void concat_images(int16_t** images_in, float** scales_in, void* image_out, float* scale_out, int image_num, uint32_t* channel_num, - int height, - int width); // Concat featuremaps along channel direction -void split_image(int16_t* image_in, float* scale_in, void** images_out, - float** scales_out, int image_num, uint32_t* channel_nums, - int height, int width); + int height, int width); + +// Split featuremap along channel direction +void split_image(int16_t* image_in, const float* scale_in, void** images_out, + float** scales_out, int image_num, + const uint32_t* channel_nums, int height, int width); } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index d62f015e66c9bfb7c1ee07c349a307563a4581f2..ee18435323f92a9132cf0014ddba9bb79eb4b265 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif - cout << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - cout << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Conv"; - // return 0; - uint64_t timer_cnt; + int ret = 0; + uint64_t output_scale = 0; + /* uint64_t output_scale; uint64_t image_scale; uint64_t filter_scale; @@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { uint64_t sb_address_phy = 0; uint64_t filter_address_phy = 0; uint64_t output_address_phy = 0; - int ret = 0; + fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); - - cout << "image_scale :" << hex << (image_scale) << endl; - cout << "filter_scale :" << hex << (filter_scale) << endl; - uint64_t filterlen = (uint64_t)args.kernel.width * (uint64_t)args.kernel.height * (uint64_t)args.image.channels; @@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { filter_address_phy = vaddr_to_paddr(args.filter_address); output_address_phy = vaddr_to_paddr(args.output.address); - /*SDK刷Cache保证数据一致性*/ uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; +*/ pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { @@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR); - reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR); - reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR); - reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR); - reg_writeq( ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), REG_CONV_IMAGE_PIXEL); reg_writeq( ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), REG_CONV_FILTER_PIXEL); - reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL); + reg_writeq(args.driver.output_height | (args.driver.output_width << 32), + REG_CONV_RESULT_PIXEL); reg_writeq(((uint64_t)args.image.pad_height) | (((uint64_t)args.image.pad_width) << 32), REG_CONV_PAD_PIXEL); reg_writeq(((uint64_t)args.kernel.stride_h) | (((uint64_t)args.kernel.stride_w) << 32), REG_CONV_STEP_PIXEL); - reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); + reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE); + reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE); + + reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR); + reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR); + reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR); + reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR); + reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP); + reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP); + reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); + reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); + reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); + reg_writeq(args.driver.image_block_amount_per_row, 0xca8); + reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0); + reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8); + reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0); + reg_writeq(args.driver.image_block_num, 0xcc8); + reg_writeq(args.driver.image_block_len, 0xcd0); + reg_writeq(args.driver.image_block_len_last, 0xcd8); + reg_writeq(args.driver.image_win_cnt, 0xce0); + reg_writeq(args.driver.image_win_cnt_last, 0xce8); + reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8); + reg_writeq(args.driver.prog_full_cnt, 0xd08); + reg_writeq(args.driver.post_prog_full_cnt, 0xd10); + reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); + + reg_writeq(args.driver.cmd, REG_CONV_CMD); - reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP); - reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP); - - reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); - reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); - - reg_writeq(image_block_amount_per_row, 0xca8); - reg_writeq(filter_pad_width_mul_channel, 0xcb0); - reg_writeq(image_amount_per_row_multi_win_first, 0xcb8); - reg_writeq(image_amount_per_row_multi_win, 0xcc0); - reg_writeq(image_block_num, 0xcc8); - reg_writeq(image_block_len, 0xcd0); - reg_writeq(image_block_len_last, 0xcd8); - reg_writeq(image_win_cnt, 0xce0); - reg_writeq(image_win_cnt_last, 0xce8); - reg_writeq(res_row_data_align4_pad, 0xcf8); - reg_writeq(prog_full_cnt, 0xd08); - reg_writeq(post_prog_full_cnt, 0xd10); - reg_writeq(fpga_bias_scale_len / 4, 0xd20); - - /*write scale*/ - reg_writeq(image_scale, REG_CONV_IMAGE_SCALE); - reg_writeq(filter_scale, REG_CONV_FILTER_SCALE); - - reg_writeq(cmd, REG_CONV_CMD); - - DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; ret = -EIO; DLOG << "Conv Wait Irq Timeout!"; } - DLOG << "after reg poll"; - usleep(40); - - /*SDK 无效 Cache保证数据一致性*/ output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - cout << "output_scale :" << hex << (output_scale) << endl; - //*(args.output.scale_address) = output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { DLOG << "Pooling Wait Irq Timeout!"; } DLOG << "after reg poll"; - usleep(40); - - /*SDK 无效 Cache保证数据一致性*/ // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER); @@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Conv"; - // return 0; int ret = 0; uint64_t output_scale = 0; - uint64_t timer_cnt = 0; + /*uint64_t timer_cnt = 0; uint64_t image0_address_phy = 0; uint64_t image1_address_phy = 0; uint64_t output_address_phy = 0; @@ -629,54 +587,44 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { (uint64_t)args.image0.height * (uint64_t)args.image0.channels; uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - image0_address_phy = vaddr_to_paddr(args.image0.address); image1_address_phy = vaddr_to_paddr(args.image1.address); output_address_phy = vaddr_to_paddr(args.output.address); uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, - IMAGE_ALIGN); + align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, + IMAGE_ALIGN); uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | ((uint64_t)args.image0.width << 16) | - (uint64_t)args.image0.height; + (uint64_t)args.image0.height;*/ - /*SDK刷Cache保证数据一致性*/ + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { + ret = -EIO; + DLOG << "EW Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } - /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(datalen, REG_EW_DATA_LEN); - reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - - reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(coefficient, REG_EW_COEFFICIENT); - - reg_writeq(cmd, REG_EW_CMD); + reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); + reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); + reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); + reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); + reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); + reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); + reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); + reg_writeq(args.driver.cmd, REG_EW_CMD); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; ret = -EIO; DLOG << "EW Wait Irq Timeout!"; } - usleep(40); - /*SDK 无效 Cache保证数据一致性*/ output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); @@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) { DLOG << "BYPASS Wait Irq Timeout!"; } DLOG << "after reg poll"; - usleep(40); - /*SDK 无效 Cache保证数据一致性*/ output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); @@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, *data_in = ptr_deconv; fpga_free(ptr_tmp); } + int ComputeFpgaDeconv(const struct DeconvArgs &args) { -#ifdef FPGA_TEST_MODE +#ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFPGADeConv==========="; DLOG << " filter_num:" << args.filter_num << " group_num:" << args.group_num diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index 2f592fe45d951230427595f2f8ff5b4a148c0276..acab650cb1452c2d39cf965b56016cc3c6f394c3 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { for (i = 0; i < timeout; i++) { if (val == reg_readq(reg)) { - std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg - << std::endl; break; } } @@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { DLOG << "dest:" << dest << " src:" << src << " size:" << num; for (i = 0; i < num; i++) { - // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); - // usleep(1); *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT } diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index c204370be7ecd3aca229b2c130ec7861116a3ef7..8034bd2bf6b9c98fe14f4aa38ff1bbb41cf64b70 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -103,22 +103,15 @@ struct FPGA_INFO { extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { - // DLOG << "offset : " << offset; uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset); // NOLINT - // DLOG << "read end"; - usleep(10); - return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { - // DLOG << "offset : " << offset << ", value : " << value; *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset) = value; - // DLOG << "write end"; - usleep(10); } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index fdda65afda595d281a6f4db6f8132213f8f8d9e5..6d0c29bfbf8d07bad4398ba29baf7f3794e42ba1 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -20,6 +20,13 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { +#ifdef PADDLE_MOBILE_FPGA_V1 +#define IMAGE_ALIGNMENT 16 // Aligned to 16 +#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 +#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 +#define BS_NUM_ALIGNMENT 8 +#endif + enum DataType { DATA_TYPE_FP32 = 1, DATA_TYPE_FP16 = 0, @@ -52,19 +59,70 @@ struct ImageOutputArgs { float* scale_address; // output scale address; uint64_t timer_cnt; // time counter for FPGA computation }; +#ifdef PADDLE_MOBILE_FPGA_V1 +struct ConvDriverParam { + uint64_t image_address_phy; + uint64_t filter_address_phy; + uint64_t sb_address_phy; + uint64_t output_address_phy; + + uint64_t output_height; + uint64_t output_width; + uint64_t filter_per_group; + uint64_t channel_per_group; + + uint64_t image_amount_per_row; + uint64_t image_one_pad_per_row; + uint64_t filter_amount_all; + uint64_t output_amount_per_row; + + uint64_t image_block_amount_per_row; + uint64_t filter_pad_width_mul_channel; + uint64_t image_amount_per_row_multi_win_first; + uint64_t image_amount_per_row_multi_win; + uint64_t image_block_num; + uint64_t image_block_len; + uint64_t image_block_len_last; + uint64_t image_win_cnt; + uint64_t image_win_cnt_last; + uint64_t res_row_data_align4_pad; + uint64_t prog_full_cnt; + uint64_t post_prog_full_cnt; + uint64_t fpga_bias_scale_len; + uint64_t cmd; +}; + +struct EWAddDriverParam { + uint64_t image0_address_phy; + uint64_t image1_address_phy; + uint64_t datalen; + uint64_t image_image_pixel; + uint64_t image_amount_per_row; + uint64_t output_address_phy; + uint64_t coefficient; + uint64_t cmd; +}; +#endif struct ConvArgs { bool relu_enabled; void* sb_address; // scale and bias void* filter_address; float* filter_scale_address; - void* free_space; // used by FPGA logic uint32_t filter_num; uint32_t group_num; struct KernelArgs kernel; struct ImageInputArgs image; // input image; struct ImageOutputArgs output; + +#ifdef PADDLE_MOBILE_FPGA_V2 + void* free_space; // used by FPGA logic +#endif + +#ifdef PADDLE_MOBILE_FPGA_V1 + struct ConvDriverParam driver; +#endif }; struct ConcatArgs { @@ -115,6 +173,9 @@ struct EWAddArgs { struct ImageInputArgs image0; struct ImageInputArgs image1; struct ImageOutputArgs output; +#ifdef PADDLE_MOBILE_FPGA_V1 + struct EWAddDriverParam driver; +#endif }; struct BypassArgs { @@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num); int fpga_flush(void* address, size_t size); int fpga_invalidate(void* address, size_t size); +uint64_t vaddr_to_paddr(void* address); +void expand_conv_arg(ConvArgs* arg); +void expand_EW_arg(EWAddArgs* arg); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp index f8eeb53159411276fbab957c676a01cb31b597c8..be773412f099410b02f24b1d38d2a44d6ca77689 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -49,6 +49,7 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { ewaddArgs.image1.pad_width = 0; ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.address = out_ptr; + fpga::expand_EW_arg(&ewaddArgs); param->SetFpgaArgs(ewaddArgs); return true; } diff --git a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp index 5253d4d0d3e00190b4ed594279d9190659ec6026..541bb6126509dc7da59fa6bed5c46aff3442928b 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp @@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel::Init( ewaddArgs.image1.pad_width = 0; ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.address = out_ptr; + fpga::expand_EW_arg(&ewaddArgs); param->SetFpgaArgs(ewaddArgs); return true; } diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 37c03e2404f761f3089adb852b94bef27bec1ce9..918760bcfab0ea6c940fa35b3aebd0351f4d88cf 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -24,8 +24,12 @@ template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); auto input_ptr = input->data(); + auto out = param->Out(); + fpga::format_fp32_ofm(out); + auto float_input = new Tensor; - float_input->mutable_data({1, input->dims()[1]}); + float_input->mutable_data( + {1, input->dims()[2], input->dims()[3], input->dims()[1]}); fpga::format_fp32_ofm(float_input); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -34,8 +38,8 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { args.input_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP32; args.image.address = input_ptr; - args.image.height = 1; - args.image.width = 1; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; args.image.channels = (uint32_t)input->dims()[1]; args.output.address = float_input->data(); args.output.scale_address = float_input->scale; @@ -50,9 +54,9 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { Tensor *out = param.Out(); fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate( - (void *)in_x->data(), // NOLINT - fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); + fpga::fpga_invalidate((void *)in_x->data(), // NOLINT + in_x->numel() * sizeof(float)); + // TODO: In general case, 0 should be squeezed before softmax input math::SoftmaxFuntor()(in_x, out); fpga::fpga_flush(out->data(), out->memory_size()); }