diff --git a/CMakeLists.txt b/CMakeLists.txt index f5d68712a64b5a47657a7af9c0e6b47604893e23..4e33b715093d5ede1cb92641533fe42c2f25f831 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF) option(CPU "armv7 with neon" ON) option(GPU_MALI "mali gpu" OFF) option(GPU_CL "opencl gpu" OFF) + option(FPGA "fpga" OFF) if(FPGA) option(FPGAV1 "fpga v1" ON) @@ -144,7 +145,7 @@ if(FPGA) endforeach() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) endforeach() endif() if(FPGAV2) @@ -156,7 +157,7 @@ if(FPGA) endforeach() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) endforeach() endif() diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 15532aa9b35a8d0a05b8ca9c4623f007be93a02f..f2a16b2784f1d07d1322023470539bcd26315751 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -24,8 +24,6 @@ namespace fpga { #define USE_RELU 1 #define USE_BIAS 2 -int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } - void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; @@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) { return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); } -int get_aligned_filter_num(int num) { - return align_to_x(num, FILTER_NUM_ALIGNMENT); -} - void format_filter(framework::Tensor *filter_tensor, float max_value, int group_num) { filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT @@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width, void expand_conv_arg(ConvArgs *arg) { ConvArgs args = *arg; - uint64_t filterlen = (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height * - (uint64_t)args.image.channels; - filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT); - filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT); - uint64_t fpga_bias_scale_len = + + auto fpga_bias_scale_len = align_to_x(args.filter_num / args.group_num, 8) * args.group_num; - uint64_t output_height = + auto output_height = (args.image.height + args.image.pad_height * 2 - args.kernel.height) / args.kernel.stride_h + 1; - uint64_t output_width = + auto output_width = (args.image.width + args.image.pad_width * 2 - args.kernel.width) / args.kernel.stride_w + 1; - uint64_t output_size = - output_height * output_width * (uint64_t)args.filter_num; - - auto filter_per_group = (uint64_t)(args.filter_num / args.group_num); - auto channel_per_group = (uint64_t)(args.image.channels / args.group_num); - - uint64_t image_row_count = ((uint64_t)args.image.width) * - ((uint64_t)args.image.channels); // without align - uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x(image_row_count, IMAGE_ALIGNMENT) + - ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels); - uint64_t filter_amount_all = - align_to_x(((uint64_t)args.kernel.height) * - ((uint64_t)args.kernel.width) * channel_per_group, + + auto filter_per_group = args.filter_num / args.group_num; + auto channel_per_group = args.image.channels / args.group_num; + + auto image_row_count = args.image.width * args.image.channels; + auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); + auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) + + args.image.pad_width * args.image.channels; + auto filter_amount_all = + align_to_x(args.kernel.height * args.kernel.width * channel_per_group, FILTER_ELEMENT_ALIGNMENT); - uint64_t output_amount_per_row = - align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT); + auto output_amount_per_row = + align_to_x(output_width * args.filter_num, IMAGE_ALIGNMENT); // find the opt partition strategy uint64_t res_win; uint64_t res_fit = 0; - for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { + for (res_win = 1; res_win <= output_width; res_win++) { if ((align_to_x( (args.image.channels * (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), @@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) { } res_fit = res_win; - uint64_t block_num = (output_width + res_fit - 1) / res_fit; - uint64_t block_len = res_fit; - uint64_t block_last = output_width - res_fit * (block_num - 1); + auto block_num = (output_width + res_fit - 1) / res_fit; + auto block_len = res_fit; + auto block_last = output_width - res_fit * (block_num - 1); - uint64_t res_amount_per_row = output_width * args.filter_num; - uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; + auto res_amount_per_row = output_width * args.filter_num; + auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; - uint64_t image_block_amount_per_row = - args.kernel.stride_w * (res_fit)*args.image.channels; - uint64_t filter_pad_width_mul_channel = + auto image_block_amount_per_row = + args.kernel.stride_w * res_fit * args.image.channels; + auto filter_pad_width_mul_channel = args.image.pad_width * args.image.channels; - uint64_t image_amount_per_row_multi_win_first = + auto image_amount_per_row_multi_win_first = image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); - uint64_t image_amount_per_row_multi_win = + auto image_amount_per_row_multi_win = image_amount_per_row * (4 * args.kernel.stride_h); - uint64_t image_block_num = block_num; - uint64_t image_block_len = + auto image_block_num = block_num; + auto image_block_len = align_to_x((args.image.channels * (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), IMAGE_ALIGNMENT) / 16 + 1; - uint64_t image_block_len_last = + auto image_block_len_last = align_to_x( (args.image.channels * (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), IMAGE_ALIGNMENT) / 16 + 1; - uint64_t image_win_cnt = block_len; - uint64_t image_win_cnt_last = block_last; - uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; - uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; + auto image_win_cnt = block_len; + auto image_win_cnt_last = block_last; + auto res_row_data_align4_pad = res_amount_per_row_pad / 8; + auto prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; if (prog_full_cnt == 1023) { prog_full_cnt--; } - uint64_t post_prog_full_cnt = + auto post_prog_full_cnt = (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) : 0; - uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; + auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); @@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->sub_conv_num = (uint32_t)stride_h; arg->filter_num = (uint32_t)filter->dims()[0]; int sub_conv_num = arg->sub_conv_num; - int sub_stride = 1; int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], padding_w, stride_w); int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis( @@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, stride_w, (int)filter->dims()[3], padding_w); arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); - int sub_channels = (int)input->dims()[1]; - int omit_size = arg->omit_size; - int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; - int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; + auto sub_channels = (int)input->dims()[1]; int sub_filter_num = sub_conv_num * (arg->filter_num); int conv_output_size = (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * sub_output_height; - int ouput_size = conv_output_size * sub_conv_num; int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); int align_sub_filter_count = @@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_sub_filter_count * align_sub_filter_num; for (int i = 0; i < sub_conv_num; ++i) { - arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); + arg->conv_args[i].filter_num = arg->sub_conv_num * arg->filter_num; arg->conv_args[i].group_num = (uint32_t)group_num; arg->conv_args[i].filter_scale_address = filter->scale; @@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_h = 1; - // DeconvParam.conv_args[i].image.address = (void*)ptr_image; arg->conv_args[i].image.scale_address = input->scale; arg->conv_args[i].image.channels = (uint32_t)sub_channels; arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; @@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].sb_address = (void *)bs_ptr; + arg->conv_args[i].sb_address = bs_ptr; auto filter_sub_space = (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); fpga_copy(filter_sub_space, (char *)filter_ptr + i * align_conv_sub_filter_count, (size_t)align_conv_sub_filter_count); - arg->conv_args[i].filter_address = (void *)(filter_sub_space); + arg->conv_args[i].filter_address = filter_sub_space; fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count); if (sub_conv_num == 1) { arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.scale_address = out->scale; } else { - auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); - arg->conv_args[i].output.address = (void *)((half *)ptr_output); + auto ptr_output = fpga_malloc(conv_output_size * sizeof(half)); + arg->conv_args[i].output.address = ptr_output; auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); arg->conv_args[i].output.scale_address = ptr_output_scale; } + expand_conv_arg(&arg->conv_args[i]); } arg->output.address = out_ptr; arg->output.scale_address = out->scale; - // fpga_free(filter_ptr); + filter->reset_data_ptr(nullptr); } // fill_deconv_arg } // namespace fpga diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index dbc3051b5784d3ecdf3cd08afb15242b7208a543..712973b16dd32a389c06526802a06d78bad4fab1 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { -int get_align_image_cw(int cw); void format_image(framework::Tensor* image_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory void format_fp32_ofm(framework::Tensor* ofm_tensor); @@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); int get_plit_num(framework::Tensor* filter_tensor); int get_aligned_filter_element_num(int chw); -int get_aligned_filter_num(int num); void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num); void format_fc_filter(framework::Tensor* filter_tensor, float max_value); diff --git a/src/fpga/V1/deconv_filter.cpp b/src/fpga/V1/deconv_filter.cpp index 96634e520fd8638dd0ded1da7ca5b5ad33bd7aec..8fb3cd69fdfb10effb5769b656e19858e481f5f4 100644 --- a/src/fpga/V1/deconv_filter.cpp +++ b/src/fpga/V1/deconv_filter.cpp @@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter void deconv_inverse_filter(float** data_in, int num, int channel, int width, int height) { float* tmp = *data_in; - // float fix_range = 127;// float scale = fix_range / max; int data_size = num * channel * width * height; int hw_len = height * width; - float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); + auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); for (int i = 0; i < num; ++i) { for (int j = 0; j < channel; ++j) { for (int k = 0; k < hw_len; ++k) { @@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, } } } - *data_in = (float*)tmp_data; // + *data_in = tmp_data; fpga_free(tmp); } @@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, */ int deconv_calc_sub_pad(int filter_axis, int pad, int stride) { if (stride == 0 || ((filter_axis - pad - 1) < 0)) { - // error - return 0; + PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters"); } return (filter_axis - pad - 1) / stride; } @@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) { position. so the omit rows or columns is (stride - ) */ int deconv_get_omit(int stride, int filter_width, int pad) { - if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) || - // error - return 0; - } - int idx = 1; + PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters"); + int idx; bool flag = false; for (idx = 1; idx <= stride; ++idx) { int j = idx; @@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) { return (stride - idx); } -int deconv_get_sub_filter_num(int filter_num, int stride) { - return filter_num * stride; -} - void deconv_get_sub_filter(char** data_in, int height, int width, int sub_conv_n, int kernel_num, int channel) { char* ptr_tmp = *data_in; @@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset * sizeof(char)); // continuous space for (int i = 0; i < sub_conv_n; ++i) { - int offset = i * origin_offset; char* ptr_tmp = (ptr_ptr_data)[i]; filter::align_element(&ptr_tmp, sub_num, sub_chw); diff --git a/src/fpga/V1/deconv_filter.h b/src/fpga/V1/deconv_filter.h index e89ebe5087a3ab8ee7ef007696e5b1fd50d4173e..5fa9781933712a8506c052258dbf2f7f7e05fe37 100644 --- a/src/fpga/V1/deconv_filter.h +++ b/src/fpga/V1/deconv_filter.h @@ -21,7 +21,6 @@ namespace deconv_filter { void deconv_inverse_filter(float** data_in, int num, int channel, int width, int height); int deconv_calc_sub_pad(int filter_axis, int pad, int stride); -int deconv_get_sub_filter_num(int filter_num, int stride); int deconv_get_sub_filter_axis(int filter_axis, int stride); int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); int deconv_get_omit(int stride, int filter_width, int pad); diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index ee18435323f92a9132cf0014ddba9bb79eb4b265..4121b4f0a458728bece76bbef44201991c46a0a7 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -13,23 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/common/pe.h" -#include -#include -#include -#include "common/types.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" -using namespace std; -using namespace paddle_mobile::fpga::driver; // NOLINT namespace paddle_mobile { namespace fpga { -#define IMAGE_ALIGN 16 -#define FILTER_ALIGN 16 -#define FILTER_NUM_ALIGN 32 +using namespace driver; // NOLINT #define USE_RELU 1 #define USE_BIAS 2 @@ -51,7 +43,7 @@ namespace fpga { #define INTERRUPT_CONV 0x0004 #define INTERRUPT_POOLING 0x0008 #define INTERRUPT_EW 0x0010 -//#define INTERRUPT_RESIZE 0x0020 +//#define INTERRUPT_RESIZE 0x0020 /* Register offset */ #define REG_INTERRUPT 0x000 @@ -207,129 +199,6 @@ int ComputeBasicConv(const struct ConvArgs &args) { #ifdef PADDLE_MOBILE_ZU5 int ret = 0; uint64_t output_scale = 0; - /* - uint64_t output_scale; - uint64_t image_scale; - uint64_t filter_scale; - uint64_t image_address_phy = 0; - uint64_t sb_address_phy = 0; - uint64_t filter_address_phy = 0; - uint64_t output_address_phy = 0; - - - fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); - fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); - uint64_t filterlen = (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height * - (uint64_t)args.image.channels; - filterlen = align_to_x(filterlen, FILTER_ALIGN); - filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN); - uint64_t fpga_bias_scale_len = - align_to_x(args.filter_num / args.group_num, 8) * args.group_num; - - uint64_t output_height = - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1; - uint64_t output_width = - (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1; - uint64_t output_size = - output_height * output_width * (uint64_t)args.filter_num; - - uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num); - uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num); - - uint64_t image_row_count = ((uint64_t)args.image.width) * - ((uint64_t)args.image.channels); // without align - uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN); - uint64_t image_one_pad_per_row = - align_to_x(image_row_count, IMAGE_ALIGN) + - ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels); - uint64_t filter_amount_all = - align_to_x(((uint64_t)args.kernel.height) * - ((uint64_t)args.kernel.width) * channel_per_group, - FILTER_ALIGN); - - uint64_t output_amount_per_row = - align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN); - - // find the opt partition strategy - uint64_t res_win; - uint64_t res_fit = 0; - for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { - if ((align_to_x( - (args.image.channels * - (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), - IMAGE_ALIGN) / - 16 + - 1) * - args.kernel.height > - 2048) { - break; - } - } - - if (res_win != output_width) { - res_win -= 1; - } - - if (((res_win % 2) != 0) && (res_win != 1)) { - res_win = res_win - 1; - } - res_fit = res_win; - - uint64_t block_num = (output_width + res_fit - 1) / res_fit; - uint64_t block_len = res_fit; - uint64_t block_last = output_width - res_fit * (block_num - 1); - - uint64_t res_amount_per_row = output_width * args.filter_num; - uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; - - uint64_t image_block_amount_per_row = - args.kernel.stride_w * (res_fit)*args.image.channels; - uint64_t filter_pad_width_mul_channel = - args.image.pad_width * args.image.channels; - uint64_t image_amount_per_row_multi_win_first = - image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); - uint64_t image_amount_per_row_multi_win = - image_amount_per_row * (4 * args.kernel.stride_h); - - uint64_t image_block_num = block_num; - uint64_t image_block_len = - align_to_x((args.image.channels * - (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), - IMAGE_ALIGN) / - 16 + - 1; - uint64_t image_block_len_last = - align_to_x( - (args.image.channels * - (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), - IMAGE_ALIGN) / - 16 + - 1; - uint64_t image_win_cnt = block_len; - uint64_t image_win_cnt_last = block_last; - uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; - uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; - if (prog_full_cnt == 1023) { - prog_full_cnt--; - } - uint64_t post_prog_full_cnt = - (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) - ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) - : 0; - - image_address_phy = vaddr_to_paddr(args.image.address); - sb_address_phy = vaddr_to_paddr(args.sb_address); - filter_address_phy = vaddr_to_paddr(args.filter_address); - output_address_phy = vaddr_to_paddr(args.output.address); - - uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; -*/ - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { ret = -EIO; @@ -357,7 +226,6 @@ int ComputeBasicConv(const struct ConvArgs &args) { reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE); reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE); - reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR); reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR); reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR); @@ -381,7 +249,6 @@ int ComputeBasicConv(const struct ConvArgs &args) { reg_writeq(args.driver.prog_full_cnt, 0xd08); reg_writeq(args.driver.post_prog_full_cnt, 0xd10); reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); - reg_writeq(args.driver.cmd, REG_CONV_CMD); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { @@ -398,9 +265,9 @@ int ComputeBasicConv(const struct ConvArgs &args) { return ret; #endif - return 0; -} + +} // ComputeBasicConv int ComputeFpgaPool(const struct PoolingArgs &args) { #ifdef FPGA_PRINT_MODE @@ -422,19 +289,15 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; - // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; uint64_t cmd = 0; - uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - image_physical_address = vaddr_to_paddr(args.image.address); - output_physical_address = vaddr_to_paddr(args.output.address); - + image_physical_address = vaddr_to_paddr_driver(args.image.address); + output_physical_address = vaddr_to_paddr_driver(args.output.address); uint32_t output_height = (uint32_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / args.kernel.stride_h + @@ -443,37 +306,35 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { (args.image.width + args.image.pad_width * 2 - args.kernel.width) / args.kernel.stride_w + 1); - - uint64_t image_amount_per_row = align_to_x( - (uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, + IMAGE_ALIGNMENT); uint64_t image_one_pad_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ALIGN) + + FILTER_ELEMENT_ALIGNMENT) + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; uint64_t image_two_pad_per_row = align_to_x( ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * (uint64_t)args.image.channels, - IMAGE_ALIGN); + IMAGE_ALIGNMENT); uint64_t image_row_mul_pooling_hight = image_amount_per_row * (uint64_t)args.kernel.height; uint64_t image_row_mul_pad_hight = image_amount_per_row * (uint64_t)args.image.pad_height; uint64_t image_row_mul_step_hight = image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN); + uint64_t result_amount_align_32 = + align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, + FILTER_ELEMENT_ALIGNMENT); uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN); + (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t image_calcu_height = (uint64_t)args.kernel.height + ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | (((uint64_t)args.kernel_reciprocal)); @@ -485,50 +346,36 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { return ret; } - /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), REG_POOLING_IMAGE_PIXEL); reg_writeq( ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | (((uint64_t)args.image.pad_width) << 32), REG_POOLING_PAD_PIXEL); reg_writeq(((uint64_t)args.kernel.stride_h) | (((uint64_t)args.kernel.stride_w) << 32), REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - - /*SDK刷Cache保证数据一致性*/ - reg_writeq(cmd, REG_POOLING_CMD); DLOG << "before reg poll"; @@ -549,7 +396,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { return ret; #endif return 0; -} + +} // ComputeFpgaPool int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef FPGA_PRINT_MODE @@ -577,27 +425,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef PADDLE_MOBILE_ZU5 int ret = 0; uint64_t output_scale = 0; - /*uint64_t timer_cnt = 0; - uint64_t image0_address_phy = 0; - uint64_t image1_address_phy = 0; - uint64_t output_address_phy = 0; - - uint64_t cmd = args.relu_enabled ? USE_RELU : 0; - uint64_t datalen = (uint64_t)args.image0.width * - (uint64_t)args.image0.height * - (uint64_t)args.image0.channels; - uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; - image0_address_phy = vaddr_to_paddr(args.image0.address); - image1_address_phy = vaddr_to_paddr(args.image1.address); - output_address_phy = vaddr_to_paddr(args.output.address); - - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, - IMAGE_ALIGN); - uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | - ((uint64_t)args.image0.width << 16) | - (uint64_t)args.image0.height;*/ - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { ret = -EIO; @@ -631,7 +458,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; #endif return 0; -} + +} // ComputeFpgaEWAdd int PerformBypass(const struct BypassArgs &args) { #ifdef FPGA_PRINT_MODE @@ -651,9 +479,6 @@ int PerformBypass(const struct BypassArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Bypass"; - // return 0; - struct fpga_pe *pe; uint64_t output_scale = 0; uint64_t timer_cnt = 0; uint64_t cmd = 0; @@ -662,15 +487,12 @@ int PerformBypass(const struct BypassArgs &args) { uint64_t output_address_phy = 0; uint8_t data_cell_in = 0; uint8_t data_cell_out = 0; - int ret = 0; - datalen = (uint64_t)args.image.width * (uint64_t)args.image.height * (uint64_t)args.image.channels; datalen = align_to_x(datalen, 16); - - input_address_phy = vaddr_to_paddr(args.image.address); - output_address_phy = vaddr_to_paddr(args.output.address); + input_address_phy = vaddr_to_paddr_driver(args.image.address); + output_address_phy = vaddr_to_paddr_driver(args.output.address); DLOG << "input_phy:" << input_address_phy; DLOG << "output_phy:" << output_address_phy; @@ -733,36 +555,29 @@ int PerformBypass(const struct BypassArgs &args) { return ret; } - /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR); reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR); reg_writeq(datalen, REG_CONVERT_LENGTH); - - /*SDK刷Cache保证数据一致性*/ reg_writeq(cmd, REG_CONVERT_CMD); - DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; ret = -EIO; DLOG << "BYPASS Wait Irq Timeout!"; } - DLOG << "after reg poll"; output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif - return 0; -} + +} // PerformBypass int ComputeFPGAConcat(const struct ConcatArgs &args) { #ifdef FPGA_PRINT_MODE @@ -776,7 +591,7 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { DLOG << " " << i << "th: "; DLOG << " channel_num:" << args.channel_num[i] - // << " aligned_channel_num:" << args.aligned_channel_num[i] + //<< " aligned_channel_num:" << args.aligned_channel_num[i] << " image_address:" << args.images_in[i] << " image_scale_address:" << args.scales_in[i]; } @@ -786,10 +601,15 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { args.scale_out, args.image_num, args.channel_num, args.height, args.width); return 0; -} - -void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, - int sub_height, int sub_width, int omit_size) { +} // ComputeFPGAConcat + +void deconv_post_process(const struct DeconvArgs &args) { + int sub_conv_n = args.sub_conv_num; + int sub_height = args.sub_output_height; + int sub_width = args.sub_output_width; + int omit_size = args.omit_size; + int channel = args.filter_num; + int num = 1; int origin_h = sub_height * sub_conv_n; int origin_w = sub_width * sub_conv_n; int align_origin_w = align_to_x(origin_w * channel, 16); @@ -797,60 +617,68 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, int deconv_w = origin_w - 2 * omit_size; int deconv_row_len = deconv_w * channel; int align_deconv_row_len = align_to_x(deconv_row_len, 16); - half *ptr_tmp = *data_in; - half *ptr_deconv = - (half *)fpga_malloc(num * align_deconv_row_len * deconv_h * sizeof(half)); - memset(ptr_deconv, 0, num * align_deconv_row_len * deconv_h * sizeof(half)); + + for (int idx = 0; idx < sub_conv_n; ++idx) { + fpga_invalidate(args.conv_args[idx].output.address, + align_origin_w * origin_h * sizeof(int16_t)); + } + + auto ptr_deconv = (int16_t *)fpga_malloc(num * align_deconv_row_len * + deconv_h * sizeof(int16_t)); + memset(ptr_deconv, 0, + num * align_deconv_row_len * deconv_h * sizeof(int16_t)); int deconv_idx = 0; for (int nn = 0; nn < num; ++nn) { for (int hh = 0; hh < origin_h; ++hh) { int hx = (hh % sub_conv_n); - half *sub_t = ptr_tmp + hx * sub_height * align_origin_w; // sub(hx,:); - + auto sub_t = + (int16_t *)(args.conv_args[sub_conv_n - hx - 1].output.address); int hi = (hh / sub_conv_n); - if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; - - // for (int ww = 0; ww < origin_w; ++ww){ - - // if((ww < omit_size) )// || (ww >= (origin_w-omit_size)) - // continue; - int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + omit_size * channel); fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx, - sizeof(half) * deconv_row_len); + sizeof(int16_t) * deconv_row_len); deconv_idx += align_deconv_row_len; - //} } } + fpga_copy(args.output.address, ptr_deconv, + num * align_deconv_row_len * deconv_h * sizeof(int16_t)); + fpga_flush(args.output.address, + num * align_deconv_row_len * deconv_h * sizeof(int16_t)); + fpga_free(ptr_deconv); - *data_in = ptr_deconv; - fpga_free(ptr_tmp); -} +} // deconv_post_process int ComputeFpgaDeconv(const struct DeconvArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFPGADeConv==========="; DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num + << " group_num:" << args.group_num << "omit_size:" << args.omit_size + << "sub_output_width: " << args.sub_output_width + << "sub_output_height: " << args.sub_output_height << " sub_conv_num:" << args.sub_conv_num; + DLOG << "args.output.address: " << args.output.address + << "args.output.scale_address: " << args.output.scale_address; + DLOG << "args.conv_args.sb_address: " << (args.conv_args)->sb_address + + << "args.conv_args.filter_address: " << (args.conv_args)->filter_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; #endif int sub_conv_num = args.sub_conv_num; - for (int i = 0; i < sub_conv_num; i++) { - //#if CPU_SIMULATE - - //#else ComputeBasicConv(args.conv_args[i]); - //#endif } if (sub_conv_num > 1) { - float max_scale = -1.0; + float max_scale = -1.0f; for (int i = 0; i < sub_conv_num; i++) { + paddle_mobile::fpga::fpga_invalidate( + args.conv_args[i].output.scale_address, 2 * sizeof(float)); float ptr_scale = (args.conv_args[i].output.scale_address)[0]; if (ptr_scale > max_scale) { args.output.scale_address[0] = ptr_scale; @@ -858,12 +686,11 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { (args.conv_args[i].output.scale_address)[1]; } } - deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1, - args.filter_num, (args.sub_output_height), - (args.sub_output_width), args.omit_size); + deconv_post_process(args); } + return 0; -} +} // ComputeFpgaDeconv int ComputeFPGASplit(const struct SplitArgs &args) { #ifdef FPGA_PRINT_MODE @@ -883,6 +710,7 @@ int ComputeFPGASplit(const struct SplitArgs &args) { args.scales_out, args.image_num, args.out_channel_nums, args.height, args.width); return 0; -} +} // ComputeFPGASplit + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index acab650cb1452c2d39cf965b56016cc3c6f394c3..18a310b09cad4a741eb83453a09f3c94d4f0db05 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); unsigned int nr = (unsigned int)_nr; int ret = 0; - DLOG << size; - DLOG << _nr; - DLOG << nr; - uint64_t a_size = FPGA_PAGE_SIZE * nr; DLOG << a_size; @@ -283,7 +279,7 @@ int fpga_memory_add() { return 0; } -uint64_t vaddr_to_paddr(void *address) { +uint64_t vaddr_to_paddr_driver(void *address) { uint64_t paddr = 0; auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { @@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) { g_fpgainfo.fpga_addr2size_map.erase(iter); munmap(ptr, size); } else { - DLOG << "Invalid pointer"; + DLOG << "Invalid pointer" << ptr; } } @@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) { g_fpgainfo.fpga_addr2size_map.erase(iter); munmap(ptr, size); - p_addr = vaddr_to_paddr(ptr); + p_addr = vaddr_to_paddr_driver(ptr); pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; /*clear bitmap*/ @@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) { g_fpgainfo.fpga_vaddr2paddr_map.erase(iter); } } else { - DLOG << "Invalid pointer"; + DLOG << "Invalid pointer" << ptr; } } @@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) { struct MemoryCacheArgs args; uint64_t p_addr; - p_addr = vaddr_to_paddr(address); + p_addr = vaddr_to_paddr_driver(address); args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; @@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) { struct MemoryCacheArgs args; uint64_t p_addr; - p_addr = vaddr_to_paddr(address); + p_addr = vaddr_to_paddr_driver(address); args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index 8034bd2bf6b9c98fe14f4aa38ff1bbb41cf64b70..4fa83b776e7b3df5df5e536de91093fd18ca67a1 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -31,8 +31,8 @@ namespace driver { #define FPGA_REG_PHY_ADDR 0xa0000000 #define FPGA_REG_SIZE 0x1000 -#define FPGA_MEM_PHY_ADDR 0x20000000 -#define FPGA_MEM_SIZE 0x20000000 +#define FPGA_MEM_PHY_ADDR 0x40000000 +#define FPGA_MEM_SIZE 0x80000000 #define FPGA_PAGE_SIZE (16UL * 1024UL) @@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size); void fpga_free_driver(void *ptr); -void fpga_copy_driver(void *dest, const void *src, size_t num); - int fpga_flush_driver(void *address, size_t size); int fpga_invalidate_driver(void *address, size_t size); -/*pe*/ - -uint64_t vaddr_to_paddr(void *address); +uint64_t vaddr_to_paddr_driver(void *address); int fpga_regpoll(uint64_t reg, uint64_t val, int time); diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index 6d0c29bfbf8d07bad4398ba29baf7f3794e42ba1..af6f18eb57b0e710359235790529c15ce4e82917 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -37,6 +37,18 @@ enum LayoutType { LAYOUT_HWC = 0, }; +enum ActivationType { + NONE = 0, + LEAKYRELU = 1, + SIGMOID = 2, + TANH = 3, +}; + +struct ActivationArgs { + enum ActivationType activation_type; + int16_t leaky_relu_negative_slope; +}; + struct KernelArgs { uint32_t width; uint32_t height; @@ -58,7 +70,10 @@ struct ImageOutputArgs { void* address; // output result address; float* scale_address; // output scale address; uint64_t timer_cnt; // time counter for FPGA computation + struct ActivationArgs + activation; // To select activation and specify (Leaky)Relu parameter. }; + #ifdef PADDLE_MOBILE_FPGA_V1 struct ConvDriverParam { uint64_t image_address_phy; @@ -198,7 +213,11 @@ struct DeconvArgs { struct ConvArgs* conv_args; }; -static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; +// } +static inline uint32_t align_to_x(int64_t num, int64_t x) { + return ((uint32_t)(num + x) - 1) / (uint32_t)x * (uint32_t)x; +} int16_t fp32_2_fp16(float fp32_num); float fp16_2_fp32(int16_t fp16_num); diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp index 4505401f434c320003e8122a3a0e197441ae8921..48e84707fabb4ccd0618da672b82c5380d9533ba 100644 --- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -14,7 +14,6 @@ limitations under the License. */ #ifdef TRANSPOSE2_OP #include "operators/kernel/transpose2_kernel.h" -#include "operators/kernel/central-arm-func/transpose2_arm_func.h" namespace paddle_mobile { namespace operators {