diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index efcc9f742e88e9a96394a2f6a15118c918624515..15532aa9b35a8d0a05b8ca9c4623f007be93a02f 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -21,6 +21,9 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { +#define USE_RELU 1 +#define USE_BIAS 2 + int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } void format_image(framework::Tensor *image_tensor) { @@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width, out->reset_data_ptr(data_ptr); } +void expand_conv_arg(ConvArgs *arg) { + ConvArgs args = *arg; + uint64_t filterlen = (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height * + (uint64_t)args.image.channels; + filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT); + filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT); + uint64_t fpga_bias_scale_len = + align_to_x(args.filter_num / args.group_num, 8) * args.group_num; + + uint64_t output_height = + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1; + uint64_t output_width = + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1; + uint64_t output_size = + output_height * output_width * (uint64_t)args.filter_num; + + auto filter_per_group = (uint64_t)(args.filter_num / args.group_num); + auto channel_per_group = (uint64_t)(args.image.channels / args.group_num); + + uint64_t image_row_count = ((uint64_t)args.image.width) * + ((uint64_t)args.image.channels); // without align + uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); + uint64_t image_one_pad_per_row = + align_to_x(image_row_count, IMAGE_ALIGNMENT) + + ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels); + uint64_t filter_amount_all = + align_to_x(((uint64_t)args.kernel.height) * + ((uint64_t)args.kernel.width) * channel_per_group, + FILTER_ELEMENT_ALIGNMENT); + + uint64_t output_amount_per_row = + align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT); + + // find the opt partition strategy + uint64_t res_win; + uint64_t res_fit = 0; + for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { + if ((align_to_x( + (args.image.channels * + (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), + IMAGE_ALIGNMENT) / + 16 + + 1) * + args.kernel.height > + 2048) { + break; + } + } + + if (res_win != output_width) { + res_win -= 1; + } + + if (((res_win % 2) != 0) && (res_win != 1)) { + res_win = res_win - 1; + } + res_fit = res_win; + + uint64_t block_num = (output_width + res_fit - 1) / res_fit; + uint64_t block_len = res_fit; + uint64_t block_last = output_width - res_fit * (block_num - 1); + + uint64_t res_amount_per_row = output_width * args.filter_num; + uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; + + uint64_t image_block_amount_per_row = + args.kernel.stride_w * (res_fit)*args.image.channels; + uint64_t filter_pad_width_mul_channel = + args.image.pad_width * args.image.channels; + uint64_t image_amount_per_row_multi_win_first = + image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); + uint64_t image_amount_per_row_multi_win = + image_amount_per_row * (4 * args.kernel.stride_h); + + uint64_t image_block_num = block_num; + uint64_t image_block_len = + align_to_x((args.image.channels * + (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), + IMAGE_ALIGNMENT) / + 16 + + 1; 
+ uint64_t image_block_len_last = + align_to_x( + (args.image.channels * + (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), + IMAGE_ALIGNMENT) / + 16 + + 1; + uint64_t image_win_cnt = block_len; + uint64_t image_win_cnt_last = block_last; + uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; + uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; + if (prog_full_cnt == 1023) { + prog_full_cnt--; + } + uint64_t post_prog_full_cnt = + (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) + ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) + : 0; + uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; + + (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); + (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); + (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); + (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address); + (*arg).driver.output_height = output_height; + (*arg).driver.output_width = output_width; + (*arg).driver.filter_per_group = filter_per_group; + (*arg).driver.channel_per_group = channel_per_group; + (*arg).driver.image_amount_per_row = image_amount_per_row; + (*arg).driver.image_one_pad_per_row = image_one_pad_per_row; + (*arg).driver.filter_amount_all = filter_amount_all; + (*arg).driver.output_amount_per_row = output_amount_per_row; + (*arg).driver.image_block_amount_per_row = image_block_amount_per_row; + (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel; + (*arg).driver.image_amount_per_row_multi_win_first = + image_amount_per_row_multi_win_first; + (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win; + (*arg).driver.image_block_num = image_block_num; + (*arg).driver.image_block_len = image_block_len; + (*arg).driver.image_block_len_last = image_block_len_last; + (*arg).driver.image_win_cnt = image_win_cnt; + (*arg).driver.image_win_cnt_last = image_win_cnt_last; + (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad; + (*arg).driver.prog_full_cnt = prog_full_cnt; + (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; + (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; + (*arg).driver.cmd = cmd; +} // expand_conv_arg() + +void expand_EW_arg(EWAddArgs *arg) { + EWAddArgs args = *arg; + uint64_t cmd = args.relu_enabled ? 
USE_RELU : 0; + uint64_t datalen = (uint64_t)args.image0.width * + (uint64_t)args.image0.height * + (uint64_t)args.image0.channels; + uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; + uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address); + uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address); + uint64_t output_address_phy = vaddr_to_paddr(args.output.address); + + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, + IMAGE_ALIGNMENT); + uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | + ((uint64_t)args.image0.width << 16) | + (uint64_t)args.image0.height; + + (*arg).driver.image0_address_phy = image0_address_phy; + (*arg).driver.image1_address_phy = image1_address_phy; + (*arg).driver.datalen = datalen; + (*arg).driver.image_image_pixel = image_image_pixel; + (*arg).driver.image_amount_per_row = image_amount_per_row; + (*arg).driver.output_address_phy = output_address_phy; + (*arg).driver.coefficient = coefficient; + (*arg).driver.cmd = cmd; + +} // expand_EW_arg + void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, bool relu_enabled, int group_num, int stride_h, @@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, auto channel = (int)out->dims()[1]; // NOLINT int filter_num_per_div = get_filter_num_per_div(filter, group_num); int element_num = get_aligned_filter_element_num( - filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3])); for (int i = 0; i < n; i++) { arg->conv_arg[i].relu_enabled = relu_enabled; @@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].filter_scale_address = filter->scale; - // arg->conv_arg[i].filter_address = &( - // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // - // NOLINT - // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); size_t filter_size = - element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); + element_num * + align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) * + sizeof(int8_t); auto filter_head = &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; arg->conv_arg[i].filter_address = fpga_malloc(filter_size); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size); - size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); + size_t bs_size = 2 * + align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) * + sizeof(float); auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; arg->conv_arg[i].sb_address = fpga_malloc(bs_size); memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); @@ -249,11 +415,11 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, if (n > 1) { arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_arg[i].output.address = - fpga_malloc(out->dims()[2] * - align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, - IMAGE_ALIGNMENT) * - sizeof(half)); + arg->conv_arg[i].output.address = fpga_malloc( + out->dims()[2] * + align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num), + IMAGE_ALIGNMENT) * + sizeof(half)); } else { arg->conv_arg[i].output.scale_address = out->scale; arg->conv_arg[i].output.address = out_ptr; @@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, (half *)arg->conv_arg[i].output.address; // NOLINT arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; + + expand_conv_arg(&arg->conv_arg[i]); } filter->reset_data_ptr(nullptr); fpga_free(bs_ptr); -} +} // fill_split_arg + void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::Tensor *out, framework::Tensor *filter, bool relu_enabled, int group_num, int stride_h, @@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, auto out_ptr = out->data(); arg->group_num = (uint32_t)group_num; - arg->sub_conv_num = stride_h; + arg->sub_conv_num = (uint32_t)stride_h; arg->filter_num = (uint32_t)filter->dims()[0]; - int sub_conv_num = arg->sub_conv_num; int sub_stride = 1; - int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w, - stride_w); - int sub_filter_width = - deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w); + int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], + padding_w, stride_w); + int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis( + (int)filter->dims()[3], stride_w); int sub_output_width = deconv_filter::deconv_get_sub_out_axis( - input->dims()[3], sub_pad, sub_filter_width); + (int)input->dims()[3], sub_pad, sub_filter_width); int sub_output_height = deconv_filter::deconv_get_sub_out_axis( - input->dims()[2], sub_pad, sub_filter_width); + (int)input->dims()[2], sub_pad, sub_filter_width); - arg->sub_output_width = sub_output_width; - arg->sub_output_height = sub_output_height; - arg->omit_size = - deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w); + arg->sub_output_width = (uint32_t)sub_output_width; + arg->sub_output_height = (uint32_t)sub_output_height; + arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( + stride_w, (int)filter->dims()[3], padding_w); arg->conv_args = (ConvArgs 
*)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); - int sub_channels = (int32_t)input->dims()[1]; + int sub_channels = (int)input->dims()[1]; int omit_size = arg->omit_size; int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; @@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, for (int i = 0; i < sub_conv_num; ++i) { arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); - arg->conv_args[i].group_num = group_num; + arg->conv_args[i].group_num = (uint32_t)group_num; arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].kernel.width = sub_filter_width; - arg->conv_args[i].kernel.height = sub_filter_width; + arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width; + arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width; arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_h = 1; // DeconvParam.conv_args[i].image.address = (void*)ptr_image; arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.channels = sub_channels; + arg->conv_args[i].image.channels = (uint32_t)sub_channels; arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.pad_width = sub_pad; - arg->conv_args[i].image.pad_height = sub_pad; + arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; + arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].sb_address = (void *)bs_ptr; - char *filter_sub_space = + auto filter_sub_space = (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); fpga_copy(filter_sub_space, (char *)filter_ptr + i * align_conv_sub_filter_count, - align_conv_sub_filter_count); + (size_t)align_conv_sub_filter_count); arg->conv_args[i].filter_address = (void *)(filter_sub_space); - fpga_flush(filter_sub_space, align_conv_sub_filter_count); + fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count); if (sub_conv_num == 1) { arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.scale_address = out->scale; } else { - half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); + auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); arg->conv_args[i].output.address = (void *)((half *)ptr_output); - float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); + auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); arg->conv_args[i].output.scale_address = ptr_output_scale; } } @@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->output.address = out_ptr; arg->output.scale_address = out->scale; // fpga_free(filter_ptr); -} +} // fill_deconv_arg + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/bias_scale.h b/src/fpga/V1/bias_scale.h index 2d1e44c5470dae02fde6956a3744edc2e371a87b..9ae572a556a01719d918809ab9becea2d9fa5a20 100644 --- a/src/fpga/V1/bias_scale.h +++ b/src/fpga/V1/bias_scale.h @@ -14,8 +14,6 @@ limitations under the License. 
*/ #pragma once -#define BS_NUM_ALIGNMENT 8 - namespace paddle_mobile { namespace fpga { namespace bias_scale { diff --git a/src/fpga/V1/deconv_bias_scale.h b/src/fpga/V1/deconv_bias_scale.h index 7b9aaff756809b43884883b3333fcbc1dd2b6adf..820c6984d439f945ea4fc5f560fb346869026003 100644 --- a/src/fpga/V1/deconv_bias_scale.h +++ b/src/fpga/V1/deconv_bias_scale.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#define BS_NUM_ALIGNMENT 8 - namespace paddle_mobile { namespace fpga { namespace deconv_bias_scale { diff --git a/src/fpga/V1/filter.h b/src/fpga/V1/filter.h index 7bcc334738be98a49fc7eb25cf7090c0f8b6b3b3..6cb35d380733b0bf64a3a782d44fd321e3f00cfa 100644 --- a/src/fpga/V1/filter.h +++ b/src/fpga/V1/filter.h @@ -14,9 +14,6 @@ limitations under the License. */ #pragma once -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 - namespace paddle_mobile { namespace fpga { namespace filter { diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index f4142ad58a273691c84db9dd585518e7edcff8a6..c79a5c3a8e7c4f47cd11c2c4af14feb69efed48d 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); } -void split_image(int16_t *image_in, float *scale_in, void **images_out, - float **scales_out, int image_num, uint32_t *channel_nums, - int height, int width) { +void split_image(int16_t *image_in, const float *scale_in, void **images_out, + float **scales_out, int image_num, + const uint32_t *channel_nums, int height, int width) { int total_channel = 0; for (int i = 0; i < image_num; i++) { scales_out[i][0] = scale_in[0]; scales_out[i][1] = scale_in[1]; total_channel += channel_nums[i]; } + int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT); + fpga_invalidate(image_in, element_num * sizeof(int16_t)); + int src_offset = 0, des_offset = 0; for (int h = 0; h < height; h++) { - int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT); - for (int i = 0; i < image_num; i++) { - int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, - channel_nums[i] * sizeof(int16_t)); - src_offset += channel_nums[i]; + for (int w = 0; w < width; w++) { + src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) + + w * total_channel; + for (int i = 0; i < image_num; i++) { + des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + + w * channel_nums[i]; + memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, + channel_nums[i] * sizeof(int16_t)); + src_offset += channel_nums[i]; + } } } + + for (int i = 0; i < image_num; i++) { + element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT); + fpga_flush(images_out[i], element_num * sizeof(int16_t)); + } } } // namespace image diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index 321967bbe233c5bec889aeb63f98dc23779b4918..f3c7b2731cb555c0c8871f6cd1d9f9df3e6429f2 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -14,9 +14,8 @@ limitations under the License. 
*/ #pragma once -#include +#include -#define IMAGE_ALIGNMENT 16 // Aligned to 16 namespace paddle_mobile { namespace fpga { namespace image { @@ -24,13 +23,16 @@ namespace image { void convert_to_hwc(float** data_in, int channel, int height, int width); void align_element_conv(float** data_in, int height, int cw); void format_image(float** data_in, int channel, int height, int width); + +// Concat featuremaps along channel direction void concat_images(int16_t** images_in, float** scales_in, void* image_out, float* scale_out, int image_num, uint32_t* channel_num, - int height, - int width); // Concat featuremaps along channel direction -void split_image(int16_t* image_in, float* scale_in, void** images_out, - float** scales_out, int image_num, uint32_t* channel_nums, - int height, int width); + int height, int width); + +// Split featuremap along channel direction +void split_image(int16_t* image_in, const float* scale_in, void** images_out, + float** scales_out, int image_num, + const uint32_t* channel_nums, int height, int width); } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index d62f015e66c9bfb7c1ee07c349a307563a4581f2..ee18435323f92a9132cf0014ddba9bb79eb4b265 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif - cout << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - cout << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Conv"; - // return 0; - uint64_t timer_cnt; + int ret = 0; + uint64_t output_scale = 0; + /* uint64_t output_scale; uint64_t image_scale; uint64_t filter_scale; @@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { uint64_t sb_address_phy = 0; uint64_t filter_address_phy = 0; uint64_t output_address_phy = 0; - int ret = 0; + fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); - - cout << "image_scale :" << hex << (image_scale) << endl; - cout << "filter_scale :" << hex << (filter_scale) << endl; - uint64_t filterlen = (uint64_t)args.kernel.width * (uint64_t)args.kernel.height * (uint64_t)args.image.channels; @@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { filter_address_phy = vaddr_to_paddr(args.filter_address); output_address_phy = vaddr_to_paddr(args.output.address); - /*SDK刷Cache保证数据一致性*/ uint64_t cmd = 0UL | (args.relu_enabled ? 
USE_RELU : 0) | USE_BIAS; +*/ pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { @@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR); - reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR); - reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR); - reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR); - reg_writeq( ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), REG_CONV_IMAGE_PIXEL); reg_writeq( ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), REG_CONV_FILTER_PIXEL); - reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL); + reg_writeq(args.driver.output_height | (args.driver.output_width << 32), + REG_CONV_RESULT_PIXEL); reg_writeq(((uint64_t)args.image.pad_height) | (((uint64_t)args.image.pad_width) << 32), REG_CONV_PAD_PIXEL); reg_writeq(((uint64_t)args.kernel.stride_h) | (((uint64_t)args.kernel.stride_w) << 32), REG_CONV_STEP_PIXEL); - reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); + reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE); + reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE); + + reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR); + reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR); + reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR); + reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR); + reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP); + reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP); + reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); + reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); + reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); + reg_writeq(args.driver.image_block_amount_per_row, 0xca8); + reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0); + reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8); + reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0); + reg_writeq(args.driver.image_block_num, 0xcc8); + reg_writeq(args.driver.image_block_len, 0xcd0); + reg_writeq(args.driver.image_block_len_last, 0xcd8); + reg_writeq(args.driver.image_win_cnt, 0xce0); + reg_writeq(args.driver.image_win_cnt_last, 0xce8); + reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8); + reg_writeq(args.driver.prog_full_cnt, 0xd08); + reg_writeq(args.driver.post_prog_full_cnt, 0xd10); + reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); + + reg_writeq(args.driver.cmd, REG_CONV_CMD); - reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP); - reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP); - - reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); - reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); - - reg_writeq(image_block_amount_per_row, 0xca8); - reg_writeq(filter_pad_width_mul_channel, 0xcb0); - 
reg_writeq(image_amount_per_row_multi_win_first, 0xcb8); - reg_writeq(image_amount_per_row_multi_win, 0xcc0); - reg_writeq(image_block_num, 0xcc8); - reg_writeq(image_block_len, 0xcd0); - reg_writeq(image_block_len_last, 0xcd8); - reg_writeq(image_win_cnt, 0xce0); - reg_writeq(image_win_cnt_last, 0xce8); - reg_writeq(res_row_data_align4_pad, 0xcf8); - reg_writeq(prog_full_cnt, 0xd08); - reg_writeq(post_prog_full_cnt, 0xd10); - reg_writeq(fpga_bias_scale_len / 4, 0xd20); - - /*write scale*/ - reg_writeq(image_scale, REG_CONV_IMAGE_SCALE); - reg_writeq(filter_scale, REG_CONV_FILTER_SCALE); - - reg_writeq(cmd, REG_CONV_CMD); - - DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; ret = -EIO; DLOG << "Conv Wait Irq Timeout!"; } - DLOG << "after reg poll"; - usleep(40); - - /*SDK 无效 Cache保证数据一致性*/ output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - cout << "output_scale :" << hex << (output_scale) << endl; - //*(args.output.scale_address) = output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { DLOG << "Pooling Wait Irq Timeout!"; } DLOG << "after reg poll"; - usleep(40); - - /*SDK 无效 Cache保证数据一致性*/ // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER); @@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Conv"; - // return 0; int ret = 0; uint64_t output_scale = 0; - uint64_t timer_cnt = 0; + /*uint64_t timer_cnt = 0; uint64_t image0_address_phy = 0; uint64_t image1_address_phy = 0; uint64_t output_address_phy = 0; @@ -629,54 +587,44 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { (uint64_t)args.image0.height * (uint64_t)args.image0.channels; uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - image0_address_phy = vaddr_to_paddr(args.image0.address); image1_address_phy = vaddr_to_paddr(args.image1.address); output_address_phy = vaddr_to_paddr(args.output.address); uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, - IMAGE_ALIGN); + align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, + IMAGE_ALIGN); uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | ((uint64_t)args.image0.width << 16) | - (uint64_t)args.image0.height; + (uint64_t)args.image0.height;*/ - /*SDK刷Cache保证数据一致性*/ + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { + ret = -EIO; + DLOG << "EW Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } - /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(datalen, REG_EW_DATA_LEN); - reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - - 
reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(coefficient, REG_EW_COEFFICIENT); - - reg_writeq(cmd, REG_EW_CMD); + reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); + reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); + reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); + reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); + reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); + reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); + reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); + reg_writeq(args.driver.cmd, REG_EW_CMD); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; ret = -EIO; DLOG << "EW Wait Irq Timeout!"; } - usleep(40); - /*SDK 无效 Cache保证数据一致性*/ output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); @@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) { DLOG << "BYPASS Wait Irq Timeout!"; } DLOG << "after reg poll"; - usleep(40); - /*SDK 无效 Cache保证数据一致性*/ output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); @@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, *data_in = ptr_deconv; fpga_free(ptr_tmp); } + int ComputeFpgaDeconv(const struct DeconvArgs &args) { -#ifdef FPGA_TEST_MODE +#ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFPGADeConv==========="; DLOG << " filter_num:" << args.filter_num << " group_num:" << args.group_num diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index 2f592fe45d951230427595f2f8ff5b4a148c0276..acab650cb1452c2d39cf965b56016cc3c6f394c3 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { for (i = 0; i < timeout; i++) { if (val == reg_readq(reg)) { - std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg - << std::endl; break; } } @@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { DLOG << "dest:" << dest << " src:" << src << " size:" << num; for (i = 0; i < num; i++) { - // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); - // usleep(1); *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT } diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index c204370be7ecd3aca229b2c130ec7861116a3ef7..8034bd2bf6b9c98fe14f4aa38ff1bbb41cf64b70 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -103,22 +103,15 @@ struct FPGA_INFO { extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { - // DLOG << "offset : " << offset; uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset); // NOLINT - // DLOG << "read end"; - usleep(10); - return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { - // DLOG << "offset : " << offset << ", value : " << value; *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset) = value; - // DLOG << "write 
end"; - usleep(10); } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index fdda65afda595d281a6f4db6f8132213f8f8d9e5..6d0c29bfbf8d07bad4398ba29baf7f3794e42ba1 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -20,6 +20,13 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { +#ifdef PADDLE_MOBILE_FPGA_V1 +#define IMAGE_ALIGNMENT 16 // Aligned to 16 +#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 +#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 +#define BS_NUM_ALIGNMENT 8 +#endif + enum DataType { DATA_TYPE_FP32 = 1, DATA_TYPE_FP16 = 0, @@ -52,19 +59,70 @@ struct ImageOutputArgs { float* scale_address; // output scale address; uint64_t timer_cnt; // time counter for FPGA computation }; +#ifdef PADDLE_MOBILE_FPGA_V1 +struct ConvDriverParam { + uint64_t image_address_phy; + uint64_t filter_address_phy; + uint64_t sb_address_phy; + uint64_t output_address_phy; + + uint64_t output_height; + uint64_t output_width; + uint64_t filter_per_group; + uint64_t channel_per_group; + + uint64_t image_amount_per_row; + uint64_t image_one_pad_per_row; + uint64_t filter_amount_all; + uint64_t output_amount_per_row; + + uint64_t image_block_amount_per_row; + uint64_t filter_pad_width_mul_channel; + uint64_t image_amount_per_row_multi_win_first; + uint64_t image_amount_per_row_multi_win; + uint64_t image_block_num; + uint64_t image_block_len; + uint64_t image_block_len_last; + uint64_t image_win_cnt; + uint64_t image_win_cnt_last; + uint64_t res_row_data_align4_pad; + uint64_t prog_full_cnt; + uint64_t post_prog_full_cnt; + uint64_t fpga_bias_scale_len; + uint64_t cmd; +}; + +struct EWAddDriverParam { + uint64_t image0_address_phy; + uint64_t image1_address_phy; + uint64_t datalen; + uint64_t image_image_pixel; + uint64_t image_amount_per_row; + uint64_t output_address_phy; + uint64_t coefficient; + uint64_t cmd; +}; +#endif struct ConvArgs { bool relu_enabled; void* sb_address; // scale and bias void* filter_address; float* filter_scale_address; - void* free_space; // used by FPGA logic uint32_t filter_num; uint32_t group_num; struct KernelArgs kernel; struct ImageInputArgs image; // input image; struct ImageOutputArgs output; + +#ifdef PADDLE_MOBILE_FPGA_V2 + void* free_space; // used by FPGA logic +#endif + +#ifdef PADDLE_MOBILE_FPGA_V1 + struct ConvDriverParam driver; +#endif }; struct ConcatArgs { @@ -115,6 +173,9 @@ struct EWAddArgs { struct ImageInputArgs image0; struct ImageInputArgs image1; struct ImageOutputArgs output; +#ifdef PADDLE_MOBILE_FPGA_V1 + struct EWAddDriverParam driver; +#endif }; struct BypassArgs { @@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num); int fpga_flush(void* address, size_t size); int fpga_invalidate(void* address, size_t size); +uint64_t vaddr_to_paddr(void* address); +void expand_conv_arg(ConvArgs* arg); +void expand_EW_arg(EWAddArgs* arg); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index 8cdf6cb01afa85239bfd0d48bbce02790ba5250d..3c6a6f151f9b05ad0b69b40298ee5a47797d70af 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -27,7 +27,11 @@ bool ConcatKernel::Init(ConcatParam *param) { template <> void ConcatKernel::Compute(const ConcatParam ¶m) { - ConcatCompute(param); + if (param.Inputs()[0]->type() == typeid(int8_t)) { + 
ConcatCompute<int8_t>(param);
+  } else {
+    ConcatCompute<float>(param);
+  }
   param.Out()->set_lod(param.Inputs()[0]->lod());
 }
 
diff --git a/src/operators/kernel/central-arm-func/concat_arm_func.h b/src/operators/kernel/central-arm-func/concat_arm_func.h
index 57a22aafa5e0bc75c1041c379c2229deaa310ffe..4b22857302d11e4a7861282b3088ebe23bea0537 100644
--- a/src/operators/kernel/central-arm-func/concat_arm_func.h
+++ b/src/operators/kernel/central-arm-func/concat_arm_func.h
@@ -57,8 +57,8 @@ template <typename P>
 void ConcatCompute(const ConcatParam<CPU> &param) {
   auto inputs = param.Inputs();
   auto *out = param.Out();
-  int64_t axis = param.Axis();
-  out->mutable_data<float>();
+  int axis = param.Axis();
+  out->mutable_data<P>();
 
   /// Sometimes direct copies will be faster, this maybe need deeply analysis.
   if (axis == 0 && inputs.size() < 10) {
@@ -66,12 +66,12 @@ void ConcatCompute(const ConcatParam<CPU> &param) {
     for (auto *in : inputs) {
       auto in_stride = framework::stride_numel(in->dims());
       auto out_stride = framework::stride_numel(out->dims());
-      auto dst = out->data<float>() + output_offset;
-      auto src = in->data<float>();
+      auto dst = out->data<P>() + output_offset;
+      auto src = in->data<P>();
       PADDLE_MOBILE_ENFORCE(
           in_stride.size() == out_stride.size(),
           "src and dst tensor should have the same dims size.");
-      memory::Copy(dst, src, sizeof(float) * in_stride[0]);
+      memory::Copy(dst, src, sizeof(P) * in_stride[0]);
       output_offset += in_stride[0];
     }
   } else {
@@ -79,8 +79,8 @@ void ConcatCompute(const ConcatParam<CPU> &param) {
     for (int j = 0; j < inputs.size(); ++j) {
       inputs_concat[j] = *inputs[j];
     }
-    ConcatFunctor<float> concat_functor;
-    concat_functor(inputs_concat, static_cast<int>(axis), out);
+    ConcatFunctor<P> concat_functor;
+    concat_functor(inputs_concat, axis, out);
   }
 }
 
diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
index f8eeb53159411276fbab957c676a01cb31b597c8..be773412f099410b02f24b1d38d2a44d6ca77689 100644
--- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp
@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
   ewaddArgs.image1.pad_width = 0;
   ewaddArgs.output.scale_address = out->scale;
   ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
   param->SetFpgaArgs(ewaddArgs);
   return true;
 }
diff --git a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
index 5253d4d0d3e00190b4ed594279d9190659ec6026..541bb6126509dc7da59fa6bed5c46aff3442928b 100644
--- a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp
@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   ewaddArgs.image1.pad_width = 0;
   ewaddArgs.output.scale_address = out->scale;
   ewaddArgs.output.address = out_ptr;
+  fpga::expand_EW_arg(&ewaddArgs);
   param->SetFpgaArgs(ewaddArgs);
   return true;
 }
diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
index 37c03e2404f761f3089adb852b94bef27bec1ce9..918760bcfab0ea6c940fa35b3aebd0351f4d88cf 100644
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -24,8 +24,12 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<Tensor *>(param->InputX());
   auto input_ptr = input->data<float>();
+  auto out = param->Out();
+  fpga::format_fp32_ofm(out);
+
   auto float_input = new Tensor;
-  float_input->mutable_data<float>({1, input->dims()[1]});
+  float_input->mutable_data<float>(
+      {1, input->dims()[2], input->dims()[3], input->dims()[1]});
   fpga::format_fp32_ofm(float_input);
 
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   args.input_data_type = fpga::DATA_TYPE_FP16;
   args.output_data_type = fpga::DATA_TYPE_FP32;
   args.image.address = input_ptr;
-  args.image.height = 1;
-  args.image.width = 1;
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
   args.image.channels = (uint32_t)input->dims()[1];
   args.output.address = float_input->data<float>();
   args.output.scale_address = float_input->scale;
@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   Tensor *out = param.Out();
   fpga::PerformBypass(param.FpgaArgs());
-  fpga::fpga_invalidate(
-      (void *)in_x->data<float>(),  // NOLINT
-      fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
+  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
+                        in_x->numel() * sizeof(float));
 
+  // TODO: In general case, 0 should be squeezed before softmax input
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
   fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp
index 1a347a9c37a96f3c31506d0b45f95e05b64292ff..88ec06be6f1b5197669f7c580d935bb9d2475c5a 100644
--- a/test/operators/test_concat_op.cpp
+++ b/test/operators/test_concat_op.cpp
@@ -12,76 +12,125 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ +#include +#include +#include +#include "../test_helper.h" #include "../test_include.h" #include "operators/concat_op.h" +namespace paddle_mobile { +using framework::AttributeMap; +using framework::DDim; +using framework::LoDTensor; +using framework::Scope; +using framework::make_ddim; + +template +void concat(const std::vector &input, LoDTensor &output, int axis) { + int num = input.size(); + + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + + // computation + auto output_data = output.data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = input[j].data(); + for (int k = 0; k < out_rows; ++k) { + memcpy(output_data + k * out_cols + col_idx, input_data + k * col_len, + sizeof(T) * col_len); + } + col_idx += col_len; + } +} + +template +int TestConcatOP() { + DDim inputA_shape = make_ddim({10, 4, 2, 2}); + DDim inputB_shape = make_ddim({20, 4, 2, 2}); + DDim inputC_shape = make_ddim({30, 4, 2, 2}); + DDim inputD_shape = make_ddim({40, 4, 2, 2}); + DDim output_shape = make_ddim({100, 4, 2, 2}); + int axis_v = 0; + VariableNameMap inputs; + VariableNameMap outputs; + std::vector input_tensors; + auto scope = std::make_shared(); + inputs["X"] = + std::vector({"inputA", "inputB", "inputC", "inputD"}); + outputs["Out"] = std::vector({"output"}); + + auto inputA_var = scope.get()->Var("inputA"); + auto inputA = inputA_var->template GetMutable(); + SetupTensor(inputA, inputA_shape, -127, 127); + input_tensors.push_back(std::move(*inputA)); + + auto inputB_var = scope.get()->Var("inputB"); + auto inputB = inputB_var->template GetMutable(); + SetupTensor(inputB, inputB_shape, -127, 127); + input_tensors.push_back(std::move(*inputB)); + + auto inputC_var = scope.get()->Var("inputC"); + auto inputC = inputC_var->template GetMutable(); + SetupTensor(inputC, inputC_shape, -127, 127); + input_tensors.push_back(std::move(*inputC)); + + auto inputD_var = scope.get()->Var("inputD"); + auto inputD = inputD_var->template GetMutable(); + SetupTensor(inputD, inputD_shape, -127, 127); + input_tensors.push_back(std::move(*inputD)); + + auto output_var = scope.get()->Var("output"); + AttributeMap attrs; + attrs["axis"].Set(axis_v); + + auto *op = new operators::ConcatOp("concat", inputs, outputs, + attrs, scope); + op->InferShape(); + op->Run(); + auto output = output_var->template Get(); + const T *output_data = output->data(); + LoDTensor output_cmp; + output_cmp.mutable_data(output_shape); + concat(input_tensors, output_cmp, axis_v); + const T *output_cmp_data = output_cmp.data(); + // compare + int eq = 0; + int neq = 0; + for (int i = 0; i < output->numel(); ++i) { + PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], + "The execution of test_concat_op is failed!"); + if (output_data[i] == output_cmp_data[i]) { + ++eq; + } else { + ++neq; + } + } + std::cout << "eq = " << eq << ", neq = " << neq << std::endl; + + delete op; + return 0; +} +} // namespace paddle_mobile + int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(g_googlenet); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "concat"); - - // 1. 
input_tensors; - vector input_tensors; - - Tensor input1; - auto input1_data = CreateInput(&input1, {4, 10, 2, 2}, 0, 1); - input_tensors.push_back(input1); - Tensor input2; - auto input2_data = CreateInput(&input2, {4, 20, 2, 2}, 0, 1); - input_tensors.push_back(input2); - Tensor input3; - auto input3_data = CreateInput(&input3, {4, 30, 2, 2}, 0, 1); - input_tensors.push_back(input3); - Tensor input4; - auto input4_data = CreateInput(&input4, {4, 40, 2, 2}, 0, 1); - input_tensors.push_back(input4); - // 2. input_names - vector input_names({ - "conv2d_3.tmp_1", - "conv2d_5.tmp_1", - "conv2d_7.tmp_1", - "conv2d_8.tmp_1", - }); - - // 3. output_names - vector output_names({"concat_0.tmp_0"}); - - // 4. out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2}); - out_ddims.push_back(out_ddim); - - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); - - auto output0_data = output[0]->data(); - - // 5. test one example. - int input_n = 1; - int input_c = 2; - int input_h = 0; - int input_w = 1; - int stride0 = input3.numel() / input3.dims()[0]; - int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1]; - int stride2 = input3.dims()[3]; - /// inputx1 (4,10,2,2), - /// inputx2 (4,20,2,2), - /// inputx3 (4,30,2,2), - /// inputx4 (4,40,2,2), - /// axis = 1 - /// output (4,100,2,2) - int input_index = - input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w; - int output_index = input_n * 100 * 2 * 2 + - (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 + - input_h * 2 + input_w; - - DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index]; - DLOG << " output [1,32,0,1] = " << output0_data[output_index]; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); + paddle_mobile::TestConcatOP(); + paddle_mobile::TestConcatOP(); return 0; } diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp index 97bf233155a7229caee68b67c6ff7b1314ec0c6e..e4778db815e63301cfd7f62af2e2fd07d7400d69 100644 --- a/test/operators/test_fusion_fc_op.cpp +++ b/test/operators/test_fusion_fc_op.cpp @@ -18,6 +18,9 @@ limitations under the License. */ #include "../test_include.h" #include "framework/operator.h" #include "operators/fusion_fc_op.h" +#ifdef FUSION_FC_INT8_OP +#include "operators/fusion_fc_int8_op.h" +#endif #define a(i, j) a[(i)*lda + (j)] #define b(i, j) b[(i)*ldb + (j)]
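
As a quick reference for the row-alignment and output-size arithmetic that the new expand_conv_arg() precomputes into ConvDriverParam, here is a minimal standalone sketch. align_to_x() and the alignment constant mirror the helper and the IMAGE_ALIGNMENT macro moved into fpga_common.h; the convolution dimensions in main() are hypothetical example values chosen for illustration, not taken from this patch.

// Standalone sketch of the alignment math used by expand_conv_arg().
// The input/kernel sizes below are made-up example values.
#include <cstdint>
#include <iostream>

static constexpr uint64_t kImageAlignment = 16;  // mirrors IMAGE_ALIGNMENT

// Round n up to the next multiple of x.
static uint64_t align_to_x(uint64_t n, uint64_t x) { return (n + x - 1) / x * x; }

int main() {
  // Hypothetical 3x3, stride-1, pad-1 convolution on a 32x32x64 feature map
  // producing 128 output channels.
  uint64_t image_h = 32, image_w = 32, channels = 64;
  uint64_t kernel_h = 3, kernel_w = 3, stride_h = 1, stride_w = 1, pad = 1;
  uint64_t filter_num = 128;

  // Same formulas as expand_conv_arg():
  uint64_t output_height = (image_h + 2 * pad - kernel_h) / stride_h + 1;
  uint64_t output_width = (image_w + 2 * pad - kernel_w) / stride_w + 1;
  uint64_t image_amount_per_row = align_to_x(image_w * channels, kImageAlignment);
  uint64_t output_amount_per_row =
      align_to_x(output_width * filter_num, kImageAlignment);

  std::cout << "output " << output_height << "x" << output_width
            << ", image_amount_per_row " << image_amount_per_row
            << ", output_amount_per_row " << output_amount_per_row << std::endl;
  return 0;
}

Precomputing these derived values once and storing them in ConvDriverParam / EWAddDriverParam is what lets ComputeBasicConv and ComputeFpgaEWAdd in pe.cpp shrink to a sequence of reg_writeq calls on args.driver fields, as shown in the pe.cpp hunks above.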