From 6dc4074221470139ff7fa960536e3454842944c4 Mon Sep 17 00:00:00 2001 From: qinan Date: Mon, 29 Apr 2019 10:20:52 +0800 Subject: [PATCH] update kernel and related files for static quantization in FPGA v2 track fixed#1584 (#1585) * update concat and split kernel and related files in FPGA v2(v3) track * update * update * update kernel and related files in FPGA v2 track * update * update --- src/fpga/V2/api.cpp | 153 ++++++-------- src/fpga/V2/api.h | 2 + src/fpga/V2/image.cpp | 50 +++-- src/fpga/V2/image.h | 5 +- src/fpga/V2/pe.cpp | 5 +- src/fpga/common/fpga_common.h | 23 +- src/framework/executor.cpp | 13 +- .../kernel/fpga/V2/concat_kernel.cpp | 6 +- .../kernel/fpga/V2/conv_add_bn_kernel.cpp | 5 +- .../fpga/V2/conv_add_bn_relu_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_add_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_add_relu_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_bn_kernel.cpp | 5 +- .../kernel/fpga/V2/conv_bn_relu_kernel.cpp | 4 +- src/operators/kernel/fpga/V2/conv_kernel.cpp | 4 +- .../kernel/fpga/V2/conv_transpose_kernel.cpp | 13 +- .../kernel/fpga/V2/deconv_add_bn_kernel.cpp | 3 +- .../fpga/V2/deconv_add_bn_relu_kernel.cpp | 4 +- .../kernel/fpga/V2/deconv_add_kernel.cpp | 3 +- .../kernel/fpga/V2/deconv_add_relu_kernel.cpp | 9 +- .../kernel/fpga/V2/deconv_bn_relu_kernel.cpp | 8 +- .../kernel/fpga/V2/elementwise_add_kernel.cpp | 196 ++++-------------- .../fpga/V2/elementwise_add_relu_kernel.cpp | 14 +- .../kernel/fpga/V2/fusion_fc_kernel.cpp | 9 +- .../kernel/fpga/V2/fusion_fc_relu_kernel.cpp | 9 +- src/operators/kernel/fpga/V2/pool_kernel.cpp | 6 +- .../kernel/fpga/V2/reshape2_kernel.cpp | 8 +- src/operators/kernel/fpga/V2/slice_kernel.cpp | 8 +- src/operators/kernel/fpga/V2/split_kernel.cpp | 4 +- 29 files changed, 194 insertions(+), 390 deletions(-) diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 9d91cf45b2..f2562f9c53 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -22,79 +22,85 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { -#define USE_RELU 1 #define USE_BIAS 2 void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; - kTypeId_t input_type = image_tensor->type(); - if (input_type == type_id()) { - auto data_ptr = image_tensor->data(); - auto external_ptr = reinterpret_cast(image_tensor->external_data); - float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; - - image::format_image(&p_data, channel, height, width); - if (p_data != data_ptr && external_ptr == nullptr) { - image_tensor->reset_data_ptr(p_data); - } - } else { - auto data_ptr = image_tensor->data(); - auto external_ptr = reinterpret_cast(image_tensor->external_data); - int8_t *p_data = external_ptr == nullptr ? data_ptr : external_ptr; + auto data_ptr = image_tensor->data(); + auto external_ptr = reinterpret_cast(image_tensor->external_data); + int8_t *p_data = external_ptr == nullptr ? 
data_ptr : external_ptr; - image::format_image(&p_data, channel, height, width); - if (p_data != data_ptr && external_ptr == nullptr) { - image_tensor->reset_data_ptr(p_data); - } + image::format_image(&p_data, channel, height, width); + if (p_data != data_ptr && external_ptr == nullptr) { + image_tensor->reset_data_ptr(p_data); } } void format_ofm(framework::Tensor *ofm_tensor) { if (ofm_tensor->type() == type_id()) { format_fp32_ofm(ofm_tensor); - } else { + } else if (ofm_tensor->type() == type_id()) { format_fp16_ofm(ofm_tensor); + } else { + format_int8_ofm(ofm_tensor); } + format_int8_ofm(ofm_tensor); } -void format_fp16_ofm(framework::Tensor *ofm_tensor) { + +void format_int8_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; if (dims.size() == 4) { auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0]; memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * - sizeof(half); + sizeof(int8_t); } else if (dims.size() == 2) { - memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t); } else { DLOG << "Wrong ofm dimension"; } auto p = fpga_malloc(memory_size); - // memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(half); + ofm_tensor->set_type(type_id().hash_code()); + ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t); fpga::fpga_flush(p, memory_size); } -void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { - // auto dims = ofm_tensor->dims(); +void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { size_t memory_size = 0; if (dims.size() == 4) { auto channel = dims[1], height = dims[2], width = dims[3]; memory_size = - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + ofm_tensor->reset_data_ptr(p); + ofm_tensor->set_type(type_id().hash_code()); + ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t); + fpga::fpga_flush(p, memory_size); +} + +void format_fp16_ofm(framework::Tensor *ofm_tensor) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0]; + memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * + sizeof(half); } else if (dims.size() == 2) { memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); } else { DLOG << "Wrong ofm dimension"; } auto p = fpga_malloc(memory_size); - // memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); ofm_tensor->set_type(type_id().hash_code()); ofm_tensor->fpga_data_num = memory_size / sizeof(half); - fpga::fpga_flush(p, memory_size); } void format_fp32_ofm(framework::Tensor *ofm_tensor) { @@ -110,7 +116,6 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) { DLOG << "Wrong ofm dimension"; } auto p = fpga_malloc(memory_size); - // memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); ofm_tensor->set_type(type_id().hash_code()); ofm_tensor->fpga_data_num = memory_size / sizeof(float); @@ -269,11 +274,11 @@ void format_concat_output(framework::Tensor *out, int height, int width, } sum_cw = align_to_x(width * 
sum_channel, IMAGE_ALIGNMENT); - auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half)); + auto data_ptr = fpga_malloc(height * sum_cw * sizeof(int8_t)); auto ddim = framework::make_ddim({1, sum_channel, height, width}); out->Resize(ddim); out->reset_data_ptr(data_ptr); - out->set_type(type_id().hash_code()); + out->set_type(type_id().hash_code()); } void format_conv_data(framework::Tensor *filter_tensor, framework::Tensor *ofm_tensor, float **bs_ptr, @@ -283,7 +288,7 @@ void format_conv_data(framework::Tensor *filter_tensor, int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group); fpga::format_bias_scale_array(bs_ptr, element_num_per_div, ofm_tensor->dims()[1]); - fpga::format_fp16_ofm(ofm_tensor); + fpga::format_ofm(ofm_tensor); } void format_deconv_data(framework::Tensor *filter_tensor, framework::Tensor *ofm_tensor, float **bs_ptr, @@ -294,7 +299,7 @@ void format_deconv_data(framework::Tensor *filter_tensor, int element_num_per_div = get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n); format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n); - format_fp16_ofm(ofm_tensor); + format_ofm(ofm_tensor); } void format_dwconv_data(framework::Tensor *filter_tensor, @@ -303,7 +308,7 @@ void format_dwconv_data(framework::Tensor *filter_tensor, auto channel = ofm_tensor->dims()[1]; format_dwconv_filter(filter_tensor, scale_ptr); format_bias_array(bias_ptr, channel); - format_fp16_ofm(ofm_tensor); + format_ofm(ofm_tensor); } void format_DWDeconv_data(framework::Tensor *filter_tensor, framework::Tensor *ofm_tensor, float **bs_ptr, @@ -314,7 +319,7 @@ void format_DWDeconv_data(framework::Tensor *filter_tensor, filter_tensor, (reinterpret_cast(*bs_ptr) + sub_conv_n * channel), sub_conv_n); format_bias_array(bs_ptr, channel); - format_fp16_ofm(ofm_tensor); + format_ofm(ofm_tensor); } void expand_conv_arg(ConvArgs *arg) { ConvArgs args = *arg; @@ -486,9 +491,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto filter_ptr = filter->data(); - auto out_ptr = out->data(); + auto out_ptr = out->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -512,7 +517,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, int n = arg->split_num; arg->concat_arg.images_in = - static_cast(fpga_malloc(n * sizeof(int *))); + static_cast(fpga_malloc(n * sizeof(int *))); arg->concat_arg.scales_in = static_cast(fpga_malloc(n * sizeof(float *))); arg->concat_arg.channel_num = @@ -531,7 +536,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, filter->dims()[3])); for (int i = 0; i < n; i++) { - // arg->conv_arg[i].relu_enabled = relu_enabled; arg->conv_arg[i].output.activation.activation_type = activation_enable; arg->conv_arg[i].output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; @@ -563,18 +567,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, reinterpret_cast(arg->conv_arg[i].filter_address), deleter)); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size); - // for test - // { - // static int cnt = 0; - // if(cnt == 4){ - // int8_t result = 0; - // std::string str = "fc_filter"; - // fpga::savefile(str, arg->conv_arg[i].filter_address, - // 
filter_size, result); - // - // } - // cnt++; - //} size_t bs_size = 2 * align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) * @@ -585,18 +577,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, reinterpret_cast(arg->conv_arg[i].sb_address), deleter)); memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); fpga_flush(arg->conv_arg[i].sb_address, bs_size); - // for test - /*{ - static int cnt = 0; - if(cnt == 4){ - float result = 0; - std::string str = "fc_bs"; - fpga::savefile(str, arg->conv_arg[i].sb_address, bs_size/4, -result); - - } - cnt++; -}*/ if (n > 1) { arg->conv_arg[i].output.scale_address = @@ -606,7 +586,7 @@ result); align_to_x((int)(out->dims()[3] * // NOLINT arg->conv_arg[i].filter_num), IMAGE_ALIGNMENT) * - sizeof(half)); + sizeof(int8_t)); arg->vector_conv_space.push_back(std::shared_ptr( reinterpret_cast(arg->conv_arg[i].output.scale_address), deleter)); @@ -618,7 +598,7 @@ result); } arg->concat_arg.images_in[i] = - (half *)arg->conv_arg[i].output.address; // NOLINT + (int8_t *)arg->conv_arg[i].output.address; // NOLINT arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; @@ -634,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto filter_ptr = filter->data(); auto deleter = [](void *p) { fpga_free(p); }; @@ -665,11 +645,11 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width}); - fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + fpga::format_int8_ofm(out, dims_out_new); + auto out_ptr = out->data(); arg->output.address = - (half *)out_ptr + // NOLINT - omit_size * sizeof(half) * + (int8_t *)out_ptr + // NOLINT + omit_size * sizeof(int8_t) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -692,7 +672,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i]->conv_arg = static_cast(fpga_malloc(split_num * sizeof(ConvArgs))); arg->split_conv_args[i]->concat_arg.images_in = - static_cast(fpga_malloc(split_num * sizeof(int16_t *))); + static_cast(fpga_malloc(split_num * sizeof(int8_t *))); arg->split_conv_args[i]->concat_arg.scales_in = static_cast(fpga_malloc(split_num * sizeof(float *))); arg->split_conv_args[i]->concat_arg.channel_num = @@ -744,7 +724,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } else { out_addr_offset = - sizeof(int16_t) * (sub_conv_num - 1 - i) * + sizeof(int8_t) * (sub_conv_num - 1 - i) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->split_conv_args[i]->output.address = out_ptr; @@ -841,7 +821,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i]->output.scale_address; } else { arg->split_conv_args[i]->conv_arg[j].output.address = - fpga_malloc(conv_output_size * sizeof(int16_t)); + fpga_malloc(conv_output_size * sizeof(int8_t)); arg->split_conv_args[i]->conv_arg[j].output.scale_address = static_cast(fpga_malloc(2 * sizeof(float))); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -855,7 +835,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, 
framework::Tensor *input, arg->split_conv_args[i]->conv_arg[j].output.scale_address), deleter)); } - arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( + arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( arg->split_conv_args[i]->conv_arg[j].output.address); arg->split_conv_args[i]->concat_arg.scales_in[j] = arg->split_conv_args[i]->conv_arg[j].output.scale_address; @@ -885,10 +865,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto input_ptr = input->data(); + auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; - // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; arg->bias_address = bias_ptr; @@ -915,7 +894,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, int stride_w, int padding_h, int padding_w, float *bias_ptr) { auto filter_ptr = filter->data(); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto deleter = [](void *p) { fpga_free(p); }; @@ -949,15 +928,9 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, real_out_height, real_out_width}); - fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + fpga::format_int8_ofm(out, dims_out_new); + auto out_ptr = out->data(); - /*====For Addition - arg->output.address = - (half *)out_ptr + // NOLINT - omit_size * sizeof(half) * - (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - */ arg->output.address = out_ptr; arg->output.scale_address = out->scale; @@ -1002,7 +975,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, fpga_malloc(sub_output_height * align_to_x(sub_output_width * sub_channels * sub_conv_num, IMAGE_ALIGNMENT) * - sizeof(int16_t)); + sizeof(int8_t)); arg->dw_conv_args[i]->output.scale_address = static_cast(fpga_malloc(2 * sizeof(float))); arg->vector_dw_conv_space.push_back(std::shared_ptr( diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index 33a5d3d33f..01d900602f 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -24,6 +24,8 @@ namespace fpga { void format_image(framework::Tensor* image_tensor); void format_ofm(framework::Tensor* ofm_tensor); +void format_int8_ofm(framework::Tensor* ofm_tensor); +void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); void format_fp32_ofm(framework::Tensor* ofm_tensor); diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp index 928526e2b9..8cbcc74c83 100644 --- a/src/fpga/V2/image.cpp +++ b/src/fpga/V2/image.cpp @@ -55,7 +55,7 @@ void convert_to_chw(float **data_in, int channel, int height, int width, *data_in = data_tmp; } -void concat_images(int16_t **images_in, float **scales_in, void *image_out, +void concat_images(int8_t **images_in, float **scales_in, void *image_out, float *scale_out, int image_num, uint32_t *channel_num, int height, int width) { int i = 0; @@ -66,17 +66,29 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, int align_each_in_area_cw = 0; int align_each_out_area_cw_differ = 0; int tmp_channel = 0; - 
scale_out[0] = 0.0; - scale_out[1] = 0.0; + float Ck = 0.0f; + float So = scale_out[0]; + auto images_in_tmp = + (int8_t **)fpga::fpga_malloc(image_num * sizeof(int8_t *)); // NOLINT + for (i = 0; i < image_num; i++) { + images_in_tmp[i] = reinterpret_cast(fpga::fpga_malloc( + height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * + sizeof(int8_t))); + } for (i = 0; i < image_num; i++) { each_out_line_channel += channel_num[i]; - scale_out[0] = std::max(*scale_out, scales_in[i][0]); + float Si_k = scales_in[i][0]; + Ck = Si_k / So; fpga_invalidate(images_in[i], height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * - sizeof(int16_t)); + sizeof(int8_t)); + for (j = 0; + j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); + j++) { + images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5); + } } - scale_out[1] = 1 / scale_out[0]; align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); align_each_out_area_cw_differ = @@ -87,31 +99,27 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, for (i = 0; i < image_num; i++) { align_each_in_area_cw = align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)image_out + tmp_channel + // NOLINT - k * align_each_out_area_cw_differ, - images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, - channel_num[i] * sizeof(int16_t)); + memcpy( + (int16_t *)image_out + tmp_channel + // NOLINT + k * align_each_out_area_cw_differ, + images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw, + channel_num[i] * sizeof(int8_t)); tmp_channel += channel_num[i]; } } } - - fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); + fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); } -void split_image(int16_t *image_in, const float *scale_in, void **images_out, - float **scales_out, int image_num, +void split_image(int8_t *image_in, void **images_out, int image_num, const uint32_t *channel_nums, int height, int width) { int total_channel = 0; for (int i = 0; i < image_num; i++) { - scales_out[i][0] = scale_in[0]; - scales_out[i][1] = scale_in[1]; total_channel += channel_nums[i]; } int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT); - fpga_invalidate(image_in, element_num * sizeof(int16_t)); - + fpga_invalidate(image_in, element_num * sizeof(int8_t)); int src_offset = 0, des_offset = 0; for (int h = 0; h < height; h++) { for (int w = 0; w < width; w++) { @@ -120,8 +128,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out, for (int i = 0; i < image_num; i++) { des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + w * channel_nums[i]; - memcpy(reinterpret_cast(images_out[i]) + des_offset, - image_in + src_offset, channel_nums[i] * sizeof(int16_t)); + memcpy(reinterpret_cast(images_out[i]) + des_offset, + image_in + src_offset, channel_nums[i] * sizeof(int8_t)); src_offset += channel_nums[i]; } } @@ -129,7 +137,7 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out, for (int i = 0; i < image_num; i++) { element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT); - fpga_flush(images_out[i], element_num * sizeof(int16_t)); + fpga_flush(images_out[i], element_num * sizeof(int8_t)); } } diff --git a/src/fpga/V2/image.h b/src/fpga/V2/image.h index f5dc6ffe3e..f5fe649391 100644 --- a/src/fpga/V2/image.h +++ b/src/fpga/V2/image.h @@ -63,13 +63,12 @@ void format_image(T** data_in, int channel, int height, int 
width) { align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T)); } // Concat featuremaps along channel direction -void concat_images(int16_t** images_in, float** scales_in, void* image_out, +void concat_images(int8_t** images_in, float** scales_in, void* image_out, float* scale_out, int image_num, uint32_t* channel_num, int height, int width); // Split featuremap along channel direction -void split_image(int16_t* image_in, const float* scale_in, void** images_out, - float** scales_out, int image_num, +void split_image(int8_t* image_in, void** images_out, int image_num, const uint32_t* channel_nums, int height, int width); } // namespace image } // namespace fpga diff --git a/src/fpga/V2/pe.cpp b/src/fpga/V2/pe.cpp index 0503a51910..5bcaa9321b 100644 --- a/src/fpga/V2/pe.cpp +++ b/src/fpga/V2/pe.cpp @@ -907,9 +907,8 @@ int ComputeFPGASplit(const struct SplitArgs &args) { << " image_scale_address:" << args.scales_out[i]; } #endif - image::split_image(args.image_in, args.scale_in, args.images_out, - args.scales_out, args.image_num, args.out_channel_nums, - args.height, args.width); + image::split_image(args.image_in, args.images_out, args.image_num, + args.out_channel_nums, args.height, args.width); return 0; } // ComputeFPGASplit int ComputeDWConv(const struct DWconvArgs &args) { diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index 8711f239f5..a1532c6c87 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -88,8 +88,6 @@ struct ImageOutputArgs { activation; // To select activation and specify (Leaky)Relu parameter. }; -// #ifdef PADDLE_MOBILE_FPGA_V1 -#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2)) struct ConvDriverParam { uint64_t image_address_phy; uint64_t filter_address_phy; @@ -141,10 +139,8 @@ struct DeconvTxParm { uint32_t deconv_en; uint32_t out_addr_offset; }; -#endif struct ConvArgs { - // bool relu_enabled; void* sb_address; // scale and bias void* filter_address; float* filter_scale_address; @@ -155,16 +151,17 @@ struct ConvArgs { struct ImageInputArgs image; // input image; struct ImageOutputArgs output; -// #ifdef PADDLE_MOBILE_FPGA_V1 -#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2)) struct DeconvTxParm deconv_tx_param; struct ConvDriverParam driver; -#endif }; struct ConcatArgs { uint32_t image_num; +#ifdef PADDLE_MOBILE_FPGA_V2 + int8_t** images_in; +#else int16_t** images_in; +#endif float** scales_in; void* image_out; float* scale_out; @@ -189,7 +186,11 @@ struct SplitConvArgs { struct SplitArgs { uint32_t image_num; +#ifdef PADDLE_MOBILE_FPGA_V2 + int8_t* image_in; +#else int16_t* image_in; +#endif float* scale_in; void** images_out; float** scales_out; @@ -214,12 +215,7 @@ struct EWAddArgs { struct ImageInputArgs image0; struct ImageInputArgs image1; struct ImageOutputArgs output; - std::vector image_in_quantVal; - std::vector image_out_quantVal; -// #ifdef PADDLE_MOBILE_FPGA_V1 -#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2)) struct EWAddDriverParam driver; -#endif }; struct BypassArgs { @@ -243,7 +239,6 @@ struct DeconvArgs { }; struct DWconvArgs { uint32_t sub_conv_num; - // bool relu_enabled; void* bias_address; void* filter_address; struct KernelArgs kernel; @@ -264,8 +259,6 @@ struct DWDeconvArgs { std::vector> vector_dw_conv_space; }; -// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; -// } static inline uint32_t align_to_x(int64_t num, int64_t x) { return ((uint32_t)(num + x) - 1) / 
(uint32_t)x * (uint32_t)x; } diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 1779a88439..5975d80f9b 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -94,7 +94,6 @@ Executor::Executor(const Program &program, } else { InitMemory(); } - int count = 0; for (auto &op_handler : ops_of_block0_) { DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type(); @@ -319,7 +318,11 @@ bool Executor::varInputMemory( const std::shared_ptr &var_desc, Variable *var) const { #ifdef PADDLE_MOBILE_FPGA framework::LoDTensor *tensor = var->template GetMutable(); +#ifdef PADDLE_MOBILE_FPGA_V2 + tensor->init(type_id().hash_code()); +#else tensor->init(type_id().hash_code()); +#endif return true; #endif @@ -677,8 +680,8 @@ void Executor::InitQuantMemory() { for (int i = 0; i < count; i++) { auto tensor = GetTensorByName(inputs_vars[i]); tensor->scale[0] = quantValList[inputs_vars[i]]; - std::cout << "input variance name : " << inputs_vars[i] - << ", scale value : " << tensor->scale[0] << std::endl; + DLOG << "input variance name : " << inputs_vars[i] + << ", scale value : " << tensor->scale[0]; } } auto output_keys = op->GetOutKeys(); @@ -689,8 +692,8 @@ void Executor::InitQuantMemory() { for (int i = 0; i < count; i++) { auto tensor = GetTensorByName(outputs_vars[i]); tensor->scale[0] = quantValList[outputs_vars[i]]; - std::cout << "output variance name : " << outputs_vars[i] - << ", scale value : " << tensor->scale[0] << std::endl; + DLOG << "output variance name : " << outputs_vars[i] + << ", scale value : " << tensor->scale[0]; } } } diff --git a/src/operators/kernel/fpga/V2/concat_kernel.cpp b/src/operators/kernel/fpga/V2/concat_kernel.cpp index 7690f41ad3..716531fcab 100644 --- a/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -25,7 +25,7 @@ bool ConcatKernel::Init(ConcatParam *param) { auto out = param->Out(); auto image_num = inputs.size(); auto images_in = - (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT + (int8_t **)fpga::fpga_malloc(image_num * sizeof(int8_t *)); // NOLINT auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT auto channel_num = @@ -38,7 +38,7 @@ bool ConcatKernel::Init(ConcatParam *param) { PADDLE_MOBILE_ENFORCE( input->dims()[2] == height && input->dims()[3] == width, "Image height & width should be unified"); - images_in[i] = input->data(); + images_in[i] = input->data(); channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT scales_in[i] = input->scale; } @@ -48,7 +48,7 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.image_num = image_num; concatArgs.images_in = images_in; concatArgs.scales_in = scales_in; - concatArgs.image_out = out->data(); + concatArgs.image_out = out->data(); concatArgs.scale_out = out->scale; concatArgs.channel_num = channel_num; concatArgs.height = height; diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp index 4c8cfa54dd..9289339123 100644 --- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp @@ -22,7 +22,6 @@ namespace operators { template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -35,7 +34,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { auto out = param->Output(); 
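
The scale variables set up just below (Si, So, Sf) carry the pattern this patch repeats in every conv/deconv/fc kernel: Sf becomes fpga::filter_find_max(filter) / 127 instead of the raw filter maximum, and the bias/scale array then folds Si, So and Sf together so the accumulator is requantized directly onto the output's int8 grid. A minimal standalone sketch of that folding; the function name is illustrative, and new_scale/new_bias stand in for the BN-folded per-channel parameters computed above it in each kernel:

#include <cstddef>
#include <vector>

// Sketch only: mirrors the bs_ptr layout used by the V2 kernels, where the
// first `channel` entries hold biases and the second half holds multipliers.
std::vector<float> fold_bias_scale(const std::vector<float> &new_scale,
                                   const std::vector<float> &new_bias,
                                   float Si, float So, float Sf) {
  const std::size_t channel = new_scale.size();
  std::vector<float> bs(2 * channel);
  for (std::size_t i = 0; i < channel; i++) {
    // Multiplier taking int8 input x int8 filter products to int8 output.
    bs[i + channel] = new_scale[i] * Si / So * Sf / 127.0f;
    // Bias re-expressed on the output's int8 grid.
    bs[i] = new_bias[i] * 127.0f / So;
  }
  return bs;
}
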
float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -59,8 +58,6 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - // bs_ptr[i + channel] = new_scale_ptr[i]; - // bs_ptr[i] = new_bias_ptr[i]; bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; } diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp index d16ec56d70..b9708edc28 100644 --- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp @@ -23,7 +23,6 @@ namespace operators { template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -35,7 +34,7 @@ bool ConvAddBNReluKernel::Init( const int groups = param->Groups(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; vector paddings = param->Paddings(); vector strides = param->Strides(); auto bn_mean_ptr = param->InputMean()->data(); @@ -60,8 +59,6 @@ bool ConvAddBNReluKernel::Init( static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - // bs_ptr[i + channel] = new_scale_ptr[i]; - // bs_ptr[i] = new_bias_ptr[i]; bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; if (groups == channel) { diff --git a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp index 0651b36d62..2557dc262e 100644 --- a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp @@ -21,7 +21,6 @@ namespace operators { template <> bool ConvAddKernel::Init(FusionConvAddParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -32,7 +31,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); @@ -40,8 +39,6 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { - // bs_ptr[i + channel] = 1; - // bs_ptr[i] = bias_ptr[i]; bs_ptr[i + channel] = Si / So * Sf / 127.0; bs_ptr[i] = bias_ptr[i] * 127.0 / So; } diff --git a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp index 46315aa3df..793a3de414 100644 --- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp @@ -21,7 +21,6 @@ namespace operators { template <> bool ConvAddReluKernel::Init(FusionConvAddReluParam 
*param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -32,7 +31,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); @@ -40,8 +39,6 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { - // bs_ptr[i + channel] = 1; - // bs_ptr[i] = bias_ptr[i]; bs_ptr[i + channel] = Si / So * Sf / 127.0; bs_ptr[i] = bias_ptr[i] * 127.0 / So; } diff --git a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp index 431a9f9ac5..74615b6bdd 100644 --- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp @@ -22,7 +22,6 @@ namespace operators { template <> bool ConvBNKernel::Init(FusionConvBNParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -31,7 +30,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -51,8 +50,6 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - // bs_ptr[i + channel] = new_scale_ptr[i]; - // bs_ptr[i] = new_bias_ptr[i]; bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; } diff --git a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp index 856b23ac38..340b6b2344 100644 --- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp @@ -29,7 +29,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -48,8 +48,6 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - // bs_ptr[i + channel] = new_scale_ptr[i]; - // bs_ptr[i] = new_bias_ptr[i]; bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; if (groups == channel) { diff --git a/src/operators/kernel/fpga/V2/conv_kernel.cpp b/src/operators/kernel/fpga/V2/conv_kernel.cpp index c981c38b23..0bc70a9472 100644 --- a/src/operators/kernel/fpga/V2/conv_kernel.cpp +++ 
b/src/operators/kernel/fpga/V2/conv_kernel.cpp @@ -29,13 +29,11 @@ bool ConvKernel::Init(ConvParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; int channel = out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { - // bs_ptr[i + channel] = 1; - // bs_ptr[i] = 0; bs_ptr[i + channel] = Si / So * Sf / 127.0; bs_ptr[i] = 0; } diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp index 76889b0dd9..acc19401c8 100644 --- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp +++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp @@ -23,21 +23,16 @@ namespace operators { template <> bool ConvTransposeKernel::Init(ConvTransposeParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; auto input = const_cast(param->Input()); - // const Tensor *bias = param->Bias(); - // auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; - // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - // "Output channel should be equal to bias number"); int channel = out->dims()[1]; int sub_conv_n = param->Strides()[0]; @@ -46,7 +41,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = 0; // bias_ptr[i % (channel)]; + bs_ptr[i] = 0; } PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], @@ -58,7 +53,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { if (param->Groups() == channel) { for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = Si / So; - bs_ptr[i] = 0; // bias_ptr[i % (channel)]; + bs_ptr[i] = 0; } fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); @@ -71,7 +66,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { } else { for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = 0; // bias_ptr[i % (channel)]; + bs_ptr[i] = 0; } fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp index 5e3417f8c6..614974b2ac 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp @@ -23,7 +23,6 @@ namespace operators { template <> bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -34,7 +33,7 @@ bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = 
out->dims()[1]; diff --git a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp index 2913a628dd..972dbdef63 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp @@ -24,7 +24,6 @@ namespace operators { template <> bool DeconvAddBNReluKernel::Init( FusionDeconvAddBNReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -35,7 +34,7 @@ bool DeconvAddBNReluKernel::Init( auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; @@ -87,7 +86,6 @@ bool DeconvAddBNReluKernel::Init( template <> void DeconvAddBNReluKernel::Compute( const FusionDeconvAddBNReluParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); if (param.Groups() == param.Output()->dims()[1]) { fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); } else { diff --git a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp index dcafcbea9c..9ce3319762 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp @@ -23,7 +23,6 @@ namespace operators { template <> bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -34,7 +33,7 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; diff --git a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp index 1364b4b5aa..50ae9764ea 100644 --- a/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp @@ -24,7 +24,6 @@ namespace operators { template <> bool DeconvAddReluKernel::Init( FusionDeconvAddReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -35,7 +34,7 @@ bool DeconvAddReluKernel::Init( auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; @@ -44,11 +43,6 @@ bool DeconvAddReluKernel::Init( auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT sizeof(float)); // NOLINT - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], "stride_width should be equal to stride_height "); PADDLE_MOBILE_ENFORCE(filter->dims()[2] == 
filter->dims()[3], @@ -87,7 +81,6 @@ bool DeconvAddReluKernel::Init( template <> void DeconvAddReluKernel::Compute( const FusionDeconvAddReluParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); if (param.Groups() == param.Output()->dims()[1]) { fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); } else { diff --git a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp index 6aae1ea729..a1e69f57b9 100644 --- a/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp @@ -25,7 +25,6 @@ namespace operators { template <> bool DeconvBNReluKernel::Init( FusionDeconvBNReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -36,7 +35,7 @@ bool DeconvBNReluKernel::Init( auto out = param->Output(); float Si = input->scale[0]; float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); + float Sf = fpga::filter_find_max(filter) / 127; auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -59,10 +58,6 @@ bool DeconvBNReluKernel::Init( int sub_conv_n = param->Strides()[0]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT sizeof(float)); // NOLINT - // for (int i = 0; i < channel * sub_conv_n; i++) { - // bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel]; - // bs_ptr[i] = new_bias_ptr[i % (channel)]; - // } if (param->Groups() == channel) { for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel] * Si / So; @@ -107,7 +102,6 @@ bool DeconvBNReluKernel::Init( template <> void DeconvBNReluKernel::Compute( const FusionDeconvBNReluParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); if (param.Groups() == param.Output()->dims()[1]) { fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); } else { diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp index 145d7851f0..c4441d7cf5 100644 --- a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -25,170 +25,50 @@ template <> bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - if (input_y->type() != type_id()) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto *input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - float Si_1 = input_x->scale[0]; - float Si_2 = input_y->scale[0]; - float So = out->scale[0]; - float C1 = Si_1 / So; - float C2 = Si_2 / So; - fpga::EWAddArgs ewaddArgs = {0}; - // ewaddArgs.relu_enabled = relu_enabled; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - ewaddArgs.const0 = 0x3c00; // =1 - ewaddArgs.const1 = 0x3c00; // =1 - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = 
(uint32_t)input_x->dims()[3]; - ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); - } else { - param->float_input_x.Resize(param->InputX()->dims()); - param->float_input_x.init(type_id().hash_code()); - fpga::format_fp32_ofm(&(param->float_input_x)); - - param->float_out.Resize(param->InputX()->dims()); - param->float_out.mutable_data(param->InputX()->dims()); - fpga::format_fp32_ofm(&(param->float_out)); - - fpga::format_fp16_ofm(out); - } + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto *input_x = const_cast(param->InputX()); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + float Si_1 = input_x->scale[0]; + float Si_2 = input_y->scale[0]; + float So = out->scale[0]; + float C1 = Si_1 / So; + float C2 = Si_2 / So; + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.output.activation.activation_type = activation_enable; + ewaddArgs.output.activation.leaky_relu_negative_slope = + leaky_relu_negative_slope; + ewaddArgs.const0 = fpga::fp32_2_fp16(C1); + ewaddArgs.const1 = fpga::fp32_2_fp16(C2); + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + fpga::expand_EW_arg(&ewaddArgs); + param->SetFpgaArgs(ewaddArgs); return true; } -inline void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { - auto input_x = param.float_input_x; - auto input_y = param.InputY(); - auto Out = param.float_out; - int axis = param.Axis(); - - const auto &x_dims = input_x.dims(); - const auto &y_dims = input_y->dims(); - /// axis = -1 represent the last dimensions. - axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); - size_t batch = 1; - size_t channels = 1; - size_t elementwise_num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - const float *bias_data = input_y->data(); - const float *input_data = input_x.data(); - float *output_data = Out.mutable_data(); - - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - size_t offset = (i * channels + j) * elementwise_num; - const float *input = input_data + offset; - const float bias = bias_data[j]; - float *output = output_data + offset; - // DLOG << "output address: "<< output; - for (int k = 0; k < elementwise_num; ++k) { - output[k] = input[k] + bias; - // DLOG << "output[" << k << "]= " << output[k] ; - } - } - } -} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) { - auto input_y = const_cast(param.InputY()); - if (input_y->type() != type_id()) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); - } else { - auto input_x = const_cast(param.InputX()); - auto intput_x_float = const_cast(&(param.float_input_x)); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input_x->data(); - args.image.channels = (uint32_t)(input_x->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = intput_x_float->data(); - args.output.scale_address = intput_x_float->scale; - - // fpga::fpga_flush(input_x->data(),input_x->fpga_data_num * - // sizeof(half)); - fpga::PerformBypass(args); - fpga::fpga_invalidate(args.output.address, - input_x->fpga_data_num * sizeof(float)); - - // just for test - /* { - static int cnt = 0; - if(cnt == 0){ - std::string str= "first_bypass_data"; - float rslt = 0.0f; - fpga::savefile(str, args.output.address, input_x->fpga_data_num, - rslt); cnt++; - } - }*/ - ElementwiseAddCompute(param); - - auto out_float = const_cast(&(param.float_out)); - DLOG << "out float: " << out_float->data(); - fpga::fpga_flush(out_float->data(), - input_x->fpga_data_num * sizeof(float)); - // just for test - /*{ - static int cnt = 0; - if(cnt == 0){ - std::string str= "ew_output_data"; - float rslt = 0.0f; - - fpga::savefile(str, out_float->data(), input_x->fpga_data_num, - rslt); cnt++; - } - }*/ - auto Out = param.Out(); - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = out_float->data(); - args.image.channels = (uint32_t)(input_x->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = Out->data(); - args.output.scale_address = Out->scale; - fpga::PerformBypass(args); - } + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp index 44266049a2..4c477b83af 100644 --- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ 
b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -21,29 +21,27 @@ namespace operators { template <> bool ElementwiseAddReluKernel::Init( ElementwiseAddReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); float Si_1 = input_x->scale[0]; float Si_2 = input_y->scale[0]; float So = out->scale[0]; float C1 = Si_1 / So; float C2 = Si_2 / So; fpga::EWAddArgs ewaddArgs = {0}; - // ewaddArgs.relu_enabled = relu_enabled; ewaddArgs.output.activation.activation_type = activation_enable; ewaddArgs.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; - ewaddArgs.const0 = 0x3c00; // =1 - ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.const0 = fpga::fp32_2_fp16(C1); + ewaddArgs.const1 = fpga::fp32_2_fp16(C2); ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; ewaddArgs.image0.scale_address = input_x->scale; diff --git a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp index 1f85beb532..6f4e096112 100644 --- a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp @@ -20,7 +20,6 @@ namespace operators { template <> bool FusionFcKernel::Init(FusionFcParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; @@ -30,17 +29,13 @@ bool FusionFcKernel::Init(FusionFcParam *param) { auto input_z_ptr = input_z->data(); auto out = param->Out(); float Si = input_x->scale[0]; - float Sf = filter->scale[0]; + float Sf = fpga::filter_find_max(filter) / 127; float So = out->scale[0]; - // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - // "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { - // bs_ptr[i + channel] = 1; - // bs_ptr[i] = input_z_ptr[i]; bs_ptr[i + channel] = Si / So * Sf / 127.0f; bs_ptr[i] = input_z_ptr[i] * 127.0f / So; } @@ -60,7 +55,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); + fpga::format_ofm(out); fpga::SplitConvArgs conv_arg = {0}; fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, diff --git a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp index 0ccec45195..bc4fc96829 100644 --- a/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp @@ -20,7 +20,6 @@ namespace operators { template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - // bool relu_enabled = false; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ 
-30,17 +29,13 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   auto input_z_ptr = input_z->data<float>();
   auto out = param->Out();
   float Si = input_x->scale[0];
-  float Sf = filter->scale[0];
+  float Sf = fpga::filter_find_max(filter) / 127;
   float So = out->scale[0];
-  // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
-  //                       "Image channel should be equal to weight number");
   int channel = (uint32_t)out->dims()[1];
   auto bs_ptr =
       (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
   for (int i = 0; i < channel; i++) {
-    // bs_ptr[i + channel] = 1;
-    // bs_ptr[i] = input_z_ptr[i];
     bs_ptr[i + channel] = Si / So * Sf / 127.0f;
     bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
   }
@@ -60,7 +55,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
-  fpga::format_fp16_ofm(out);
+  fpga::format_ofm(out);
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
diff --git a/src/operators/kernel/fpga/V2/pool_kernel.cpp b/src/operators/kernel/fpga/V2/pool_kernel.cpp
index 60bd3786aa..aafc86d888 100644
--- a/src/operators/kernel/fpga/V2/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/pool_kernel.cpp
@@ -41,9 +41,9 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
     return true;
   }
 
-  auto input_ptr = input->data<half>();
-  fpga::format_fp16_ofm(output);
-  auto output_ptr = output->mutable_data<half>();
+  auto input_ptr = input->data<int8_t>();
+  fpga::format_ofm(output);
+  auto output_ptr = output->mutable_data<int8_t>();
 
   float Si = input->scale[0];
   float So = output->scale[0];
diff --git a/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
index 647ecb5a65..b1df9372a6 100644
--- a/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
@@ -48,8 +48,8 @@ bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
 
 void reshape(LoDTensor *input, LoDTensor *output) {
   // Subscript r means after reshape
-  auto input_ptr = input->data<half>();
-  auto output_ptr = output->data<half>();
+  auto input_ptr = input->data<int8_t>();
+  auto output_ptr = output->data<int8_t>();
   output->scale[0] = input->scale[0];
   output->scale[1] = input->scale[1];
@@ -67,7 +67,7 @@ void reshape(LoDTensor *input, LoDTensor *output) {
   auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
   auto HWr = Hr * Wr;
 
-  fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half));
+  fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(int8_t));
 
   int offset_align = 0;
   int offset_r = 0, offset_align_r = 0;
@@ -89,7 +89,7 @@ void reshape(LoDTensor *input, LoDTensor *output) {
     }
   }
 
-  fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half));
+  fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(int8_t));
 }
 
 template <>
diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp
index 2fd6ef542e..a841290400 100644
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -22,7 +22,7 @@ namespace operators {
 template <>
 bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
   auto output = param->output_;
-  fpga::format_fp16_ofm(output);
+  fpga::format_ofm(output);
   DLOG << "input: " << param->input_;
   DLOG << "output: " << param->output_;
   if (param->input_->type() != type_id()) {
@@ -40,8 +40,8 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
   auto output = param.output_;
   int HW = input->dims()[2] * input->dims()[3];
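
The Compute hunk that continues below copies a [start, end) channel window for each of the HW spatial positions; with the switch to int8 each element is now one byte, hence sizeof(half) becoming sizeof(int8_t). A standalone sketch of the same copy under the channel-innermost layout these kernels use, with hypothetical in/out buffers:

#include <cstdint>
#include <cstring>

// Sketch only: one contiguous run of `len` channel values per position.
void slice_channels(const int8_t *in, int8_t *out, int HW, int channel,
                    int start, int end) {
  const int len = end - start;
  for (int i = 0; i < HW; i++) {
    std::memcpy(out + i * len, in + i * channel + start,
                len * sizeof(int8_t));
  }
}
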
   int channel = input->dims()[1];
-  auto input_ptr = input->data<half>();
-  auto output_ptr = output->data<half>();
+  auto input_ptr = input->data<int8_t>();
+  auto output_ptr = output->data<int8_t>();
   output->scale[0] = input->scale[0];
   output->scale[1] = input->scale[1];
@@ -52,7 +52,7 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
   start = start > channel ? channel : start;
   end = end > channel ? channel : end;
   int len = end - start;
-  size_t size = len * sizeof(half);
+  size_t size = len * sizeof(int8_t);
 
   for (int i = 0; i < HW; i++) {
     memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
diff --git a/src/operators/kernel/fpga/V2/split_kernel.cpp b/src/operators/kernel/fpga/V2/split_kernel.cpp
index 584cb41fb3..ccfe918963 100644
--- a/src/operators/kernel/fpga/V2/split_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/split_kernel.cpp
@@ -38,7 +38,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
   for (int i = 0; i < image_num; i++) {
     fpga::format_fp16_ofm(outs[i]);
     DLOG << "output: " << outs[i];
-    images_out[i] = outs[i]->mutable_data<half>();
+    images_out[i] = outs[i]->mutable_data<int8_t>();
     scales_out[i] = outs[i]->scale;
     out_channels[i] = (uint32_t)sections[i];
   }
@@ -47,7 +47,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
   fpga::SplitArgs arg = {0};
   arg.image_num = image_num;
-  arg.image_in = in->data<half>();
+  arg.image_in = in->data<int8_t>();
   arg.scale_in = in->scale;
   arg.images_out = images_out;
   arg.scales_out = scales_out;
-- 
GitLab
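
A note on the requantization this patch introduces in image::concat_images (src/fpga/V2/image.cpp): instead of taking the maximum of the input scales, each int8 input k is rescaled into the already-known output scale So through Ck = Si_k / So before its channels are interleaved. The same scale-ratio idea appears in the elementwise-add kernels, where const0/const1 change from the hard-coded fp16 1.0 (0x3c00) to fpga::fp32_2_fp16(C1) and fpga::fp32_2_fp16(C2) with C1 = Si_1 / So and C2 = Si_2 / So. A standalone sketch of the per-element rescale; names are illustrative, and std::lround with clamping is used here where the patch itself adds 0.5 and truncates:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch only: map one int8 value from input scale Si_k to output scale So.
int8_t requantize(int8_t v, float Si_k, float So) {
  const float Ck = Si_k / So;
  const long r = std::lround(v * Ck);  // symmetric rounding, unlike x + 0.5
  return static_cast<int8_t>(std::max(-128L, std::min(127L, r)));
}
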