diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index d1014ff87a86efeeefec731ebac05a8a30abe3b1..47acd275fa644f7c6d51c34a547c814531fd88c5 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -14,11 +14,9 @@ limitations under the License. */ #include "api.h" #include -#include -#include #include #include -#include +#include #include "bias_scale.h" #include "filter.h" #include "image.h" @@ -48,6 +46,7 @@ int open_device() { // memory management; void *fpga_malloc(size_t size) { + DLOG << size << " bytes allocated"; #ifdef PADDLE_MOBILE_OS_LINUX return reinterpret_cast<void *>( mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); @@ -68,6 +67,20 @@ void fpga_copy(void *dest, const void *src, size_t num) { memcpy(dest, src, num); } +int fpga_flush(void *address, size_t size) { + struct MemoryCacheArgs args; + args.address = address; + args.size = size; + return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); +} + +int fpga_invalidate(void *address, size_t size) { + struct MemoryCacheArgs args; + args.address = address; + args.size = size; + return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); +} + int ComputeFpgaConv(const struct WrapperConvArgs &args) { #ifdef FPGA_TEST_MODE /*DLOG << " relu_enabled:" << args.relu_enabled @@ -145,8 +158,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { } int PerformBypass(const struct BypassArgs &args) { #ifdef FPGA_TEST_MODE - DLOG << " layout_type:" << args.layout_type - << " convert_type:" << args.convert_type; + DLOG << " input_type:" << args.input_data_type + << " input_layout_type:" << args.input_layout_type; DLOG << " image_address:" << args.image.address << " image_scale_address:" << args.image.scale_address << " image_channels:" << args.image.channels @@ -181,10 +194,19 @@ void format_image(framework::Tensor *image_tensor) { void format_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); - auto channel = dims[1], height = dims[2], width = dims[3]; - size_t memory_size = - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); - ofm_tensor->reset_data_ptr(fpga_malloc(memory_size)); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); } float filter_find_max(framework::Tensor *filter_tensor) { @@ -200,7 +222,7 @@ int get_plit_num(framework::Tensor *filter_tensor) { return filter::calc_split_num(num, div_capacity); } -int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) { +int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { auto dims = filter_tensor->dims(); auto chw = dims[1] * dims[2] * dims[3]; auto num = dims[0]; @@ -279,7 +301,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->concat_arg.image_out = out_ptr; const int channel = (int)out->dims()[1]; - int element_num_per_div = fpga::get_element_num_per_div(filter, group_num); + int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num); int element_num = fpga::get_aligned_filter_element_num( filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); @@ -297,12 +319,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->conv_args[i].image.scale_address =
input->scale; arg->conv_args[i].image.pad_height = (uint32_t)padding_h; arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - arg->conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - arg->conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + arg->conv_args[i].filter_scale_address = filter->scale; + arg->conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; arg->conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * filter_num_per_div) : filter_num_per_div); if (n > 1) { arg->conv_args[i].output.scale_address =
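Note: fpga_flush and fpga_invalidate are the two cache-maintenance directions for the shared-memory arena that fpga_malloc mmaps: flush writes dirty CPU cache lines back before the FPGA reads a buffer, and invalidate discards stale lines before the CPU reads FPGA output back. A minimal round-trip sketch (the buffer names and the job-launch step are placeholders, not part of this patch):

  #include <cstring>
  #include "fpga/api.h"
  using namespace paddle_mobile;

  void roundtrip(const void *host_src, size_t len) {
    void *buf = fpga::fpga_malloc(len);
    memcpy(buf, host_src, len);        // CPU fills the shared buffer...
    fpga::fpga_flush(buf, len);        // ...then pushes it out of the cache hierarchy
    // ... submit an FPGA job that reads buf and writes its result back into buf ...
    fpga::fpga_invalidate(buf, len);   // drop stale cached copies before the CPU reads
    fpga::fpga_free(buf);
  }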
diff --git a/src/fpga/api.h b/src/fpga/api.h index 096f847170501784f0ee74b5a98ca91349587cfc..9d17e05d6cbfeeb8abac1e06c731510fed2ee65d 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -25,23 +25,14 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { -int open_device(); -int close_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dst, const void* src, size_t num); - -enum DataConvertType { - DATA_NO_CONVERT = 0, - DATA_FP32_TO_FP16 = 1, - DATA_FP16_TO_FP32 = 2, +enum DataType { + DATA_TYPE_FP32 = 1, + DATA_TYPE_FP16 = 0, }; -enum LayoutConvertType { - LAYOUT_NO_CONVERT = 0, - LAYOUT_CHW_TO_HWC = 1, - LAYOUT_HWC_TO_CHW = 2, +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, }; struct VersionArgs { @@ -122,16 +113,18 @@ struct PoolingArgs { struct EWAddArgs { bool relu_enabled; - float const0; // output0 = const0 x input0 + const1 x input1; - float const1; + uint32_t const0; // output0 = const0 x input0 + const1 x input1; + uint32_t const1; struct ImageInputArgs image0; struct ImageInputArgs image1; struct ImageOutputArgs output; }; struct BypassArgs { - enum DataConvertType convert_type; - enum LayoutConvertType layout_type; + enum DataType input_data_type; + enum DataType output_data_type; + enum LayoutType input_layout_type; + enum LayoutType output_layout_type; struct ImageInputArgs image; struct ImageOutputArgs output; }; @@ -141,6 +134,16 @@ struct FpgaRegWriteArgs { uint64_t value; }; +struct FpgaRegReadArgs { + uint64_t address; + uint64_t value; +}; + +struct MemoryCacheArgs { + void* address; + size_t size; +}; + #define IOCTL_FPGA_MAGIC 'FPGA' #define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) @@ -148,6 +151,8 @@ struct FpgaRegWriteArgs { #define IOCTL_SEPARATOR_0 10 #define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) #define IOCTL_SEPARATOR_1 20 @@ -184,6 +189,15 @@ enum FPGA_ERR_TYPE { //============================== API ============================= +int open_device(); +int close_device(); + +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dst, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + int PerformBypass(const struct BypassArgs& args); int ComputeFpgaConv(const struct WrapperConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); @@ -196,7 +210,7 @@ void format_image(framework::Tensor* image_tensor); void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory
float filter_find_max(framework::Tensor* filter_tensor); -int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num); +int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); int get_plit_num(framework::Tensor* filter_tensor); int get_aligned_filter_element_num(int chw); int get_aligned_filter_num(int num); diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp index a1b0c8577b9100f69f823a39e9e136c46b7e09ff..3e5c3419a0c35b5c7c81b0ee1fd89a58838b5a26 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/bias_scale.cpp @@ -79,6 +79,7 @@ void format_bias_scale_array(float **bias_scale_array, int element_num_after_division = align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); interleave(bias_scale_array, div_num * element_num_after_division); + fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); } } // namespace bias_scale diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index 5f1a16d2339f3859f4cd85408c965d8d2634a55f..3b09ede10d10f605e69d06df2e148dd463e94d5b 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -101,7 +101,6 @@ void align_element(char **data_in, int num, int chw) { int j = 0; int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); if (align_chw != chw) { - printf("align %d \n", align_chw); char *tmp = *data_in; char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char)); @@ -207,6 +206,8 @@ void format_filter(float **data_in, int num, int channel, int height, int width, align_num(quantize_data, num_per_div_before_alignment, num, chw); reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); + fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * + num_after_alignment * sizeof(char)); } } // namespace filter diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp index 872abcd7c2dd6b16ab8ec8077e9afa6ec60c10d4..0603d164dfa88eb5620ebf588c610ea25a78be5f 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/image.cpp @@ -38,7 +38,6 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) { } void align_element_conv(float **data_in, int height, int cw) { - int i = 0; int h = 0; int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); if (align_cw != cw) { @@ -60,6 +59,8 @@ void align_element_conv(float **data_in, int height, int cw) { void format_image(float **data_in, int channel, int height, int width) { convert_to_hwc(data_in, channel, height, width); align_element_conv(data_in, height, channel * width); + fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height * + sizeof(float)); } void concat_images(int16_t **images_in, float **scales_in, void *image_out, @@ -77,6 +78,10 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, for (i = 0; i < image_num; i++) { each_out_line_channel += channel_num[i]; *scale_out = std::max(*scale_out, scales_in[i][0]); + fpga_invalidate(images_in[i], + height * + align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * + sizeof(int16_t)); } align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); @@ -97,6 +102,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, } } } + + fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); } } // namespace image
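Note: every flush/invalidate size in bias_scale.cpp, filter.cpp and image.cpp is computed with align_to_x, so the cache operation covers the padded rows the hardware actually touches, not just the logical tensor. A sketch of the arithmetic (align_to_x itself is defined elsewhere in src/fpga; this is the usual round-up form it is assumed to have):

  // Presumed shape of the alignment helper used in the sizes above:
  inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
  // format_image therefore flushes
  //   align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(float)
  // bytes: with channel * width = 30 and an alignment of 16, each row pads to 32 floats.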
diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index e1f8fdf63ff508d9afc59e2230406c46f2c9e4d0..7cfdaa56aedcfdafb0e0de5e7fe9d1897a5794d4 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -56,8 +56,11 @@ class FeedOp : public framework::OperatorBase { auto output_ptr = output->mutable_data(); fpga::BypassArgs args; - args.convert_type = fpga::DATA_FP32_TO_FP16; - args.layout_type = fpga::LAYOUT_NO_CONVERT; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; args.image.address = (void *)input_ptr; args.image.channels = input->dims()[1]; args.image.height = input->dims()[2]; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 84b9d6b0ddd9a1577ee37d095cabed2a8a2fe5a2..58d1717dce08fc9065449a657d68c4c3756c300f 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -23,7 +23,7 @@ template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bool relu_enabled = false; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); + auto bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); @@ -62,7 +62,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { fpga::format_filter(filter, max_value, param->Groups()); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_ofm(out); @@ -80,7 +80,6 @@ void ConvAddBNKernel::Compute( const FusionConvAddBNParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvAddBNKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index e38ae9240534b17e97d7ee1c68bffb25a8aedf71..00bfa9101b6d5c464fb6603d8fde13ce2885a630 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -24,7 +24,6 @@ bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { bool relu_enabled = true; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); @@ -58,14 +57,12 @@ bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, @@ -80,7 +77,6 @@ void ConvAddBNReluKernel::Compute( const FusionConvAddBNReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvAddBNReluKernel; } // namespace operators } // namespace paddle_mobile
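Note: all four conv-fusion kernels in this patch converge on the same Init() skeleton; collected in one place it reads roughly as below (a sketch with template arguments elided as in the surrounding diff; the stride/padding accessors vary per param type):

  float max_value = fpga::filter_find_max(filter);            // int8 quantization range
  fpga::format_filter(filter, max_value, param->Groups());    // quantize, realign, flush
  int element_num_per_div = fpga::get_filter_num_per_div(filter, param->Groups());
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_ofm(out);                                      // allocate + zero fp16 output
  fpga::WrapperConvArgs conv_arg;
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, group, stride_h,
                      stride_w, padding_h, padding_w, bs_ptr);
  param->SetFpgaArgs(conv_arg);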
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 31f28df5103942750758040ab983e2c0298a8cfd..71f0420b6ab264fd893c7e818e3cf9ac0f9341e5 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -23,7 +23,6 @@ template <> bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { bool relu_enabled = true; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); @@ -40,14 +39,12 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, @@ -62,7 +59,6 @@ void ConvAddReluKernel::Compute( const FusionConvAddReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvAddReluKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 8818e98c376ab4e33d399bdf429e5b01928672e2..007561911231cfe25c199c0a9bd7238c58dc85e8 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -24,7 +24,6 @@ template <> bool ConvBNKernel::Init(FusionConvBNParam *param) { bool relu_enabled = false; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); @@ -55,14 +54,12 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, @@ -77,7 +74,6 @@ void ConvBNKernel::Compute( const FusionConvBNParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvBNKernel; } // namespace operators } // namespace paddle_mobile
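Note: what distinguishes the BN-fusing kernels is only how bs_ptr is populated before format_bias_scale_array: batch norm is folded into a per-channel scale/bias pair. A sketch of the identity they rely on (which half of the 2 * channel array holds scale vs. bias is decided later by interleave() in bias_scale.cpp; epsilon, bn_scale_ptr, bn_var_ptr, bn_bias_ptr and bn_mean_ptr come from the param accessors above):

  // y = gamma * (x - mean) / sqrt(var + eps) + beta  ==  new_scale * x + new_bias
  for (int i = 0; i < channel; i++) {
    float new_scale = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon);
    bs_ptr[i + channel] = new_scale;
    bs_ptr[i] = bn_bias_ptr[i] - bn_mean_ptr[i] * new_scale;
  }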
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 8fe4425a23de2b4b16b241bf65d893d10132cc2e..4c62888b95c08b0198255124ebeff5a265274871 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -23,7 +23,6 @@ template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bool relu_enabled = true; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); @@ -52,27 +51,12 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::WrapperConvArgs convArgs; - convArgs.group_num = (uint32_t)param->Groups(); - convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); - convArgs.filter_num = (uint32_t)filter->dims()[0]; - convArgs.output.address = out_ptr; - convArgs.output.scale_address = out->scale; - convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( - convArgs.split_num * sizeof(fpga::ConvArgs)); - param->SetFpgaArgs(convArgs); - - int element_num = fpga::get_aligned_filter_element_num( - filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, @@ -87,7 +71,6 @@ void ConvBNReluKernel::Compute( const FusionConvBNReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvBNReluKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/dropout_kernel.cpp index 3a4dd216d481322a9228cfd247bf6f0d0098177e..b0981c4254060996a16f4ae5beabb7c22edd6d34 100644 --- a/src/operators/kernel/fpga/dropout_kernel.cpp +++ b/src/operators/kernel/fpga/dropout_kernel.cpp @@ -27,13 +27,7 @@ bool DropoutKernel::Init(DropoutParam *param) { template <> void DropoutKernel::Compute( - const DropoutParam &param) const { - // auto *input_x = param.InputX(); - // auto *out = param.Out(); - // auto input_x_ptr = input_x->data(); - // auto out_ptr = out->mutable_data(); - // out_ptr = const_cast(input_x_ptr); -} + const DropoutParam &param) const {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 48d7425fcb7a3c630165fe4a7d26875a4f4a0a9d..5b4c95af9f8844ac2242887d9d0233ab1b83460d 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -21,7 +21,6 @@ template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { bool relu_enabled = true; auto input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); auto filter = const_cast(param->InputY()); auto input_z = param->InputZ(); auto input_z_ptr = input_z->data(); @@ -47,12 +46,10 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, 1); - auto filter_ptr = filter->data(); - int element_num_per_div = fpga::get_element_num_per_div(filter, 1); + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - - auto out_ptr = out->mutable_data(); + fpga::format_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
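Note: both fully-connected kernels reuse the conv path: the weight matrix is reshaped into a 4-D filter so fill_conv_arg and ComputeFpgaConv can treat an FC layer as a 1x1-stride, group-1 convolution. The Resize seen in both kernels, in context (a sketch; the height/width factoring of the K dimension comes from the input feature map):

  int num = (int)out->dims()[1];               // output neurons become the filter count
  int filter_channel = chw / height / width;   // remaining factoring of K = C * H * W
  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
  fpga::format_filter(filter, fpga::filter_find_max(filter), 1);  // group = 1 for FC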
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index ccc6009700c98f1f94835a7e21a83de1faade1f0..5681fcc7a7108bce971d0aa82733dbc7595e29cc 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -22,7 +22,6 @@ template <> bool FusionFcKernel::Init(FusionFcParam *param) { bool relu_enabled = false; auto input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); auto filter = const_cast(param->InputY()); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); @@ -48,12 +47,10 @@ filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, 1); - auto filter_ptr = filter->data(); - int element_num_per_div = fpga::get_element_num_per_div(filter, 1); + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - - auto out_ptr = out->mutable_data(); + fpga::format_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index d3df951dbc340814d766f76e8720c3aaef2f3539..82cb88b1d7c141ab94563e74a693119b328920fc 100644 --- a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -50,9 +50,7 @@ bool PoolKernel::Init(PoolParam *param) { template <> void PoolKernel::Compute(const PoolParam &param) const { -#ifdef PADDLE_MOBILE_FPGA fpga::ComputeFpgaPool(param.FpgaArgs()); -#endif } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index 20c86a5c73bc9c35b8f8fd430013bb97d269fb4a..fd84cb8e46c974d23816c0dd4c99a545d996c409 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -25,30 +25,41 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { const Tensor *input = param->InputX(); - auto input_ptr = input->data(); - auto output = param->Out(); - auto output_ptr = output->mutable_data(); + auto output_ptr = param->Out(); + Tensor *floatInput = new Tensor(*input); fpga::BypassArgs args; - args.convert_type = fpga::DATA_FP16_TO_FP32; - args.layout_type = fpga::LAYOUT_NO_CONVERT; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; args.image.address = (void *)(input_ptr); args.image.height = (uint32_t)input->dims()[0]; args.image.width = (uint32_t)input->dims()[1]; args.image.channels = 1; - args.output.address = output_ptr; - param->SetFpgaArgs(args); + args.output.address = (void *)floatInput->mutable_data(); + param->SetFloatInput(floatInput); + param->SetFpgaArgs(args); return true; } template <> void SoftmaxKernel::Compute( const SoftmaxParam &param) const { - // SoftmaxCompute(param); + DLOG << "======================================= FPGA SoftMAX " + "==============================================="; + const Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + fpga::fpga_flush((void *)in_x->data(), in_x->memory_size()); + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate(out->data(), out->memory_size()); + + auto x_dims = in_x->dims(); + out->Resize(x_dims); + math::SoftmaxFuntor()(in_x, out); } -template class SoftmaxKernel; } // namespace operators } // namespace paddle_mobile
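Note: with the old convert_type enum gone, a conversion is now expressed by giving the bypass different endpoint types. Softmax programs the read-back direction; a standalone sketch of that fp16 HWC -> fp32 CHW conversion (src_fp16, dst_fp32, h, w and c are placeholders; feed_op programs the mirror image, fp32 CHW -> fp16 HWC):

  fpga::BypassArgs args;
  args.input_data_type = fpga::DATA_TYPE_FP16;   // device-side half floats...
  args.output_data_type = fpga::DATA_TYPE_FP32;  // ...become CPU-side floats
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.image.address = src_fp16;
  args.image.height = h;
  args.image.width = w;
  args.image.channels = c;
  args.output.address = dst_fp32;
  fpga::PerformBypass(args);                     // then fpga_invalidate before the CPU reads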
diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 4065f7d9c4934bce8285ea99fe4f14c4e2cc990c..090ccdf24e214fc86b8a4032df228d50caa65ef9 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -74,7 +74,7 @@ class Im2ColFunctor { const int isize = im_height; bool pad1 = padding[0] > 0; bool pad2 = - (pad1 && + (pad1 && padding[1] && (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0)); int fill = isize % 2; if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 6ef9fb2a8252e82014ebebc22f82066eeb324c0d..14269817ededd097c4c9ade20be5ee773c02d692 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -36,13 +36,35 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int N = dim_out[1]; int K = (!trans_a) ? dim_a[1] : dim_a[0]; + if (trans_a) { + int numel = matrix_a.numel(); + int m = matrix_a.dims()[0]; + int n = matrix_a.dims()[1]; + float *tmp = (float *)(matrix_a.data()); + float *a = static_cast<float *>( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + int index = 0; + for (int j = 0; j < n; j++) { + for (int i = 0; i < m; i++) { + a[index++] = tmp[i * n + j]; + } + } +#ifdef _OPENMP + Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); +#else + Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); +#endif + } else { #ifdef _OPENMP - Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), - N, beta, matrix_out->data(), N, relu, bias); + Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, bias); #else - Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N, relu, bias); + Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu, bias); #endif + } } template <>
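Note: Sgemm and Sgemm_omp carry no transpose flag, so the new trans_a branch materializes A^T before calling into GEMM. The copy in isolation:

  // A is m x n row-major; a becomes A^T (n x m) so GEMM consumes plain row-major data.
  int index = 0;
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < m; i++) {
      a[index++] = tmp[i * n + j];  // column j of A becomes row j of a
    }
  }
  // `a` comes from paddle_mobile::memory::Alloc and is not freed in the lines shown.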
diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index 05d3017f635a040a52d2cc377c8f384dbbd8086c..f8b52c59f5689461ef9b4171b9e33c0d49529eed 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -31,186 +31,43 @@ using std::min; using std::vector; void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { #if __ARM_NEON - const int batch_size = input->dims()[0]; + const int batch_size = static_cast<int>(input->dims()[0]); + const int input_channel = static_cast<int>(input->dims()[1]); - const int h_in = input->dims()[2]; + const int input_height = static_cast<int>(input->dims()[2]); + const int input_width = static_cast<int>(input->dims()[3]); + const int output_height = static_cast<int>(output->dims()[2]); + const int output_width = static_cast<int>(output->dims()[3]); - const int w_in = input->dims()[3]; - - const int output_channels = output->dims()[1]; + const int hxw = input_height * input_width; - const int h_out = output->dims()[2]; - const int w_out = output->dims()[3]; - const int outputdata_channel_stride = h_out * w_out; - const int inputdata_channel_stride = h_in * w_in; - const int input_batch_stride = output_channels * inputdata_channel_stride; - const int output_batch_stride = output_channels * outputdata_channel_stride; - float *out_data = output->data(); - const float *input_data = input->data(); + const int l = input_height; const float coef = 1.0 / 9.0; - for (int k = 0; k < batch_size; ++k) { -#pragma omp parallel for - for (int c = 0; c < output_channels; ++c) { - const float *input_seg = input_data + c * inputdata_channel_stride; - float *output_seg = out_data + c * outputdata_channel_stride; - // four corner point - output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] + - input_seg[w_in + 1]) * - coef; - output_seg[w_out - 1] = - (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - 2] + - input_seg[2 * w_in - 1]) * - coef; - output_seg[(h_out - 1) * w_out] = - (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] + - input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) * - coef; - output_seg[h_out * w_out - 1] = - (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] + - input_seg[(h_in - 1) * w_in - 1] + - input_seg[(h_in - 1) * w_in - 2]) * - coef; - // left side & right side - for (int i = 1; i < h_in - 1; ++i) { - output_seg[i * w_out] = - (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] + - input_seg[i * w_in] + input_seg[i * w_in + 1] + - input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) * - coef; - output_seg[i * w_out + w_out - 1] = - (input_seg[i * w_in - w_in + w_in - 2] + - input_seg[i * w_in - w_in + 1 + w_in - 2] + - input_seg[i * w_in + w_in - 2] + - input_seg[i * w_in + 1 + w_in - 2] + - input_seg[i * w_in + w_in + w_in - 2] + - input_seg[i * w_in + w_in + 1 + w_in - 2]) * - coef; - } - // top 1 row & bottom 1 row - const float *input_tmp = input_seg; - - float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, sum, out0; - float32x4_t v_coef = vdupq_n_f32(coef); - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + w_in); - const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; - in4 = vld1q_f32(input_tmp_end); - in6 = vld1q_f32(input_tmp_end + w_in); - int c_mid = w_out - 2; - auto output_ptr = output_seg + 1; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + w_in + 4); - - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - - vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); - - in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + w_in + 4); - - tmp0 = vextq_f32(in4, in5, 1); - tmp1 = vextq_f32(in4, in5, 2); - tmp2 = vextq_f32(in6, in7, 1); - tmp3 = vextq_f32(in6, in7, 2); - - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - - vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); - - // can optimize to each 8 stride.
- input_tmp += 4; - input_tmp_end += 4; - output_ptr += 4; - in0 = in1; - in2 = in3; - in4 = in5; - in6 = in7; - } - // top right remain - float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); - float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); - - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 2); - tmp3 = vextq_f32(in2, pad1, 2); - - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - out0 = vmulq_f32(sum, v_coef); - - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, out0, 2); - } - } - - // bottom_right remain - float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); - float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); - - tmp0 = vextq_f32(in4, pad2, 1); - tmp1 = vextq_f32(in4, pad2, 2); - tmp2 = vextq_f32(in6, pad3, 2); - tmp3 = vextq_f32(in6, pad3, 2); - - sum = vaddq_f32(in4, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in6); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - out0 = vmulq_f32(sum, v_coef); + const float coef1 = 1.0 / 6.0; + const float coef2 = 1.0 / 4.0; - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); - } - } - // mid - for (int j = 0; j < h_out - 2; ++j) { - output_ptr = output_seg + w_out * (j + 1) + 1; - input_tmp = input_seg + j * w_in; + float32x4_t v_coef = vdupq_n_f32(coef); + float32x4_t v_coef1 = vdupq_n_f32(coef1); - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + w_in); - in4 = vld1q_f32(input_tmp + 2 * w_in); - c_mid = w_out - 2; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + w_in + 4); - in5 = vld1q_f32(input_tmp + 2 * w_in + 4); + for (int b = 0; b < batch_size; b++) { +#pragma omp parallel for + for (int c = 0; c < input_channel; c++) { + const float *input_data = input->data() + c * hxw; + float *output_data = output->data() + c * hxw; + + for (int i = 1; i < output_height - 1; i++) { + float *output_ptr; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, out0; + for (int m = 1; m < output_width - 4; m += 4) { + output_ptr = output_data + i * output_width + m; + in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1); + in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3); + in2 = vld1q_f32(input_data + i * input_width + m - 1); + in3 = vld1q_f32(input_data + i * input_width + m + 3); + in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1); + in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); @@ -219,63 +76,383 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { tmp4 = vextq_f32(in4, in5, 1); tmp5 = vextq_f32(in4, in5, 2); - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - sum = vaddq_f32(sum, in4); - sum = vaddq_f32(sum, tmp4); - sum = vaddq_f32(sum, tmp5); - - out0 = vmulq_f32(sum, v_coef); - vst1q_f32(output_ptr, out0); - output_ptr += 4; - input_tmp += 4; - in0 = in1; - in2 = 
in3; - in4 = in5; + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + out0 = vaddq_f32(out0, in4); + out0 = vaddq_f32(out0, tmp4); + out0 = vaddq_f32(out0, tmp5); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef)); + } + int m; + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { } - // mid remain - float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); - float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); - float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 1); - tmp3 = vextq_f32(in2, pad1, 2); - tmp4 = vextq_f32(in4, pad2, 1); - tmp5 = vextq_f32(in4, pad2, 2); + for (int j = m; j < output_width - 1; j++) { + output_data[i * output_width + j] = + input_data[(i - 1) * input_width + j - 1] + + input_data[(i - 1) * input_width + j] + + input_data[(i - 1) * input_width + j + 1] + + input_data[(i)*input_width + j - 1] + + input_data[(i)*input_width + j] + + input_data[(i)*input_width + j + 1] + + input_data[(i + 1) * input_width + j - 1] + + input_data[(i + 1) * input_width + j] + + input_data[(i + 1) * input_width + j + 1]; + output_data[i * output_width + j] = + output_data[i * output_width + j] * coef; + } + } - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - sum = vaddq_f32(sum, in4); - sum = vaddq_f32(sum, tmp4); - sum = vaddq_f32(sum, tmp5); - out0 = vmulq_f32(sum, v_coef); + output_data[0] = + input_data[0] + input_data[1] + input_data[l] + input_data[l + 1]; + output_data[l - 1] = input_data[l - 2] + input_data[l - 1] + + input_data[2 * l - 2] + input_data[2 * l - 1]; + output_data[(l - 1) * l] = + input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] + + input_data[(l - 1) * l] + input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = input_data[(l - 2) * (l + 1)] + + input_data[(l - 2) * (l + 1) + 1] + + input_data[l * l - 2] + input_data[l * l - 1]; + output_data[0] = output_data[0] * coef2; + output_data[l - 1] = output_data[l - 1] * coef2; + output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2; + output_data[l * l - 1] = output_data[l * l - 1] * coef2; + + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] + + input_data[i * l] + input_data[i * l + 1] + + input_data[i * l + l] + input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = + input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] + + input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] + + input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l]; + output_data[i * l] = output_data[i * l] * coef1; + output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1; + } - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, out0, 2); - } - } + int m; + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = output_data + m; + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + m - 1); + in1 = vld1q_f32(input_data + m + 3); + in2 = vld1q_f32(input_data + input_width + m - 1); + in3 = vld1q_f32(input_data + input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, 
in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); + } + + for (m = 1; (m + 3) < output_width - 1; m += 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[j] = input_data[j - 1] + input_data[j] + input_data[j + 1] + + input_data[input_width + j - 1] + + input_data[input_width + j] + + input_data[input_width + j + 1]; + output_data[j] = output_data[j] * coef1; + } + + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = + output_data + (output_height - 1) * output_width + m; + + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1); + in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3); + in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1); + in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); + } + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[(output_height - 1) * input_width + j] = + input_data[(output_height - 2) * input_width + j - 1] + + input_data[(output_height - 2) * input_width + j] + + input_data[(output_height - 2) * input_width + j + 1] + + input_data[(output_height - 1) * input_width + j - 1] + + input_data[(output_height - 1) * input_width + j] + + input_data[(output_height - 1) * input_width + j + 1]; + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] * coef1; } - // input_data += inputdata_channel_stride; - // out_data += outputdata_channel_stride; } - input_data += input_batch_stride; - out_data += output_batch_stride; } + +// const int batch_size = input->dims()[0]; +// +// const int h_in = input->dims()[2]; +// +// const int w_in = input->dims()[3]; +// +// const int output_channels = output->dims()[1]; +// +// const int h_out = output->dims()[2]; +// const int w_out = output->dims()[3]; +// const int outputdata_channel_stride = h_out * w_out; +// const int inputdata_channel_stride = h_in * w_in; +// const int input_batch_stride = output_channels * inputdata_channel_stride; +// const int output_batch_stride = output_channels * +// outputdata_channel_stride; float *out_data = output->data(); const +// float *input_data = input->data(); +// +// const float coef = 1.0 / 9.0; +// for (int k = 0; k < batch_size; ++k) { +//#pragma omp parallel for +// for (int c = 0; c < output_channels; ++c) { +// const float *input_seg = input_data + c * inputdata_channel_stride; +// float *output_seg = out_data + c * outputdata_channel_stride; +// // four corner point +// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] + +// input_seg[w_in + 1]) * +// coef; +// output_seg[w_out - 1] = +// (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - +// 2] + +// input_seg[2 * w_in - 1]) * +// coef; +// output_seg[(h_out - 1) * w_out] = +// (input_seg[(h_in - 2) * w_in] + 
input_seg[(h_in - 2) * w_in + 1] + +// input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) +// * +// coef; +// output_seg[h_out * w_out - 1] = +// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] + +// input_seg[(h_in - 1) * w_in - 1] + +// input_seg[(h_in - 1) * w_in - 2]) * +// coef; +// // left side & right side +// for (int i = 1; i < h_in - 1; ++i) { +// output_seg[i * w_out] = +// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] + +// input_seg[i * w_in] + input_seg[i * w_in + 1] + +// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) * +// coef; +// output_seg[i * w_out + w_out - 1] = +// (input_seg[i * w_in - w_in + w_in - 2] + +// input_seg[i * w_in - w_in + 1 + w_in - 2] + +// input_seg[i * w_in + w_in - 2] + +// input_seg[i * w_in + 1 + w_in - 2] + +// input_seg[i * w_in + w_in + w_in - 2] + +// input_seg[i * w_in + w_in + 1 + w_in - 2]) * +// coef; +// } +// // top 1 row & bottom 1 row +// const float *input_tmp = input_seg; +// +// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, +// tmp3, tmp4, tmp5, sum, out0; +// float32x4_t v_coef = vdupq_n_f32(coef); +// in0 = vld1q_f32(input_tmp); +// in2 = vld1q_f32(input_tmp + w_in); +// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; +// in4 = vld1q_f32(input_tmp_end); +// in6 = vld1q_f32(input_tmp_end + w_in); +// int c_mid = w_out - 2; +// auto output_ptr = output_seg + 1; +// for (; c_mid > 3; c_mid -= 4) { +// in1 = vld1q_f32(input_tmp + 4); +// in3 = vld1q_f32(input_tmp + w_in + 4); +// +// tmp0 = vextq_f32(in0, in1, 1); +// tmp1 = vextq_f32(in0, in1, 2); +// +// tmp2 = vextq_f32(in2, in3, 1); +// tmp3 = vextq_f32(in2, in3, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// +// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); +// +// in5 = vld1q_f32(input_tmp_end + 4); +// in7 = vld1q_f32(input_tmp_end + w_in + 4); +// +// tmp0 = vextq_f32(in4, in5, 1); +// tmp1 = vextq_f32(in4, in5, 2); +// tmp2 = vextq_f32(in6, in7, 1); +// tmp3 = vextq_f32(in6, in7, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// +// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); +// +// // can optimize to each 8 stride. 
+// input_tmp += 4; +// input_tmp_end += 4; +// output_ptr += 4; +// in0 = in1; +// in2 = in3; +// in4 = in5; +// in6 = in7; +// } +// // top right remain +// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); +// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); +// +// tmp0 = vextq_f32(in0, pad0, 1); +// tmp1 = vextq_f32(in0, pad0, 2); +// tmp2 = vextq_f32(in2, pad1, 2); +// tmp3 = vextq_f32(in2, pad1, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; ++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + i, out0, 2); +// } +// } +// +// // bottom_right remain +// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); +// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); +// +// tmp0 = vextq_f32(in4, pad2, 1); +// tmp1 = vextq_f32(in4, pad2, 2); +// tmp2 = vextq_f32(in6, pad3, 2); +// tmp3 = vextq_f32(in6, pad3, 2); +// +// sum = vaddq_f32(in4, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in6); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; ++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); +// } +// } +// // mid +// for (int j = 0; j < h_out - 2; ++j) { +// output_ptr = output_seg + w_out * (j + 1) + 1; +// input_tmp = input_seg + j * w_in; +// +// in0 = vld1q_f32(input_tmp); +// in2 = vld1q_f32(input_tmp + w_in); +// in4 = vld1q_f32(input_tmp + 2 * w_in); +// c_mid = w_out - 2; +// for (; c_mid > 3; c_mid -= 4) { +// in1 = vld1q_f32(input_tmp + 4); +// in3 = vld1q_f32(input_tmp + w_in + 4); +// in5 = vld1q_f32(input_tmp + 2 * w_in + 4); +// +// tmp0 = vextq_f32(in0, in1, 1); +// tmp1 = vextq_f32(in0, in1, 2); +// tmp2 = vextq_f32(in2, in3, 1); +// tmp3 = vextq_f32(in2, in3, 2); +// tmp4 = vextq_f32(in4, in5, 1); +// tmp5 = vextq_f32(in4, in5, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// sum = vaddq_f32(sum, in4); +// sum = vaddq_f32(sum, tmp4); +// sum = vaddq_f32(sum, tmp5); +// +// out0 = vmulq_f32(sum, v_coef); +// vst1q_f32(output_ptr, out0); +// output_ptr += 4; +// input_tmp += 4; +// in0 = in1; +// in2 = in3; +// in4 = in5; +// } +// // mid remain +// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); +// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); +// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); +// +// tmp0 = vextq_f32(in0, pad0, 1); +// tmp1 = vextq_f32(in0, pad0, 2); +// tmp2 = vextq_f32(in2, pad1, 1); +// tmp3 = vextq_f32(in2, pad1, 2); +// tmp4 = vextq_f32(in4, pad2, 1); +// tmp5 = vextq_f32(in4, pad2, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// sum = vaddq_f32(sum, in4); +// sum = vaddq_f32(sum, tmp4); +// sum = vaddq_f32(sum, tmp5); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; 
++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + i, out0, 2); +// } +// } +// } +// // input_data += inputdata_channel_stride; +// // out_data += outputdata_channel_stride; +// } +// input_data += input_batch_stride; +// out_data += output_batch_stride; +// } #endif } @@ -662,6 +839,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, wstart = max(wstart, 0); hend = min(hend, input_height); wend = min(wend, input_width); + const float *pos1 = input_seg + hstart * input_width + wstart; const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; const float *pos3 = input_seg + (hstart + 2) * input_width + wstart; @@ -674,7 +852,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, sum += input_seg[h * input_width + w]; } } - output_seg[ph * output_width + pw] = sum / 9.0; + output_seg[ph * output_width + pw] = + sum / ((hend - hstart) * (wend - wstart) * 1.0); } else { #if __aarch64__ #else
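Note: the two pooling fixes agree with each other: Pool3x3Avgs1p1 hard-codes coef2 = 1/4 at the four corners, coef1 = 1/6 along the edges and coef = 1/9 in the interior, and Pool3x3Avg now derives the same divisor from the clamped window instead of always dividing by 9:

  // True average over the part of the 3x3 window that lies inside the image:
  int divisor = (hend - hstart) * (wend - wstart);  // 4 at a corner, 6 on an edge, 9 inside
  output_seg[ph * output_width + pw] = sum / (divisor * 1.0);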
diff --git a/src/operators/op_param.h b/src/operators/op_param.h index aaf6657a71917f9f09b7b9e4ea45b5ea789122d5..c462d7f3192297c3e37c6e7da262780fe86abf26 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -795,7 +795,7 @@ class SoftmaxParam : public OpParam { fpga::BypassArgs fpga_bypass_args; public: - RType *FloatInput() { + RType *FloatInput() const { return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); } void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } diff --git a/test/fpga/test_format_data.cpp b/test/fpga/test_format_data.cpp index 0fa3c23d2af6220959d434a6805adc9a7ae984a5..a7b2e393ceae3cc55e4c453a5e4738e7ebff6883 100644 --- a/test/fpga/test_format_data.cpp +++ b/test/fpga/test_format_data.cpp @@ -22,7 +22,7 @@ namespace fpga = paddle_mobile::fpga; using std::cout; using std::endl; -int main() { +void test_format_image() { std::vector<int> dims{1, 1, 3, 3}; std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9}; frame::DDim ddim = frame::make_ddim(dims); @@ -44,6 +44,50 @@ int main() { cout << endl; auto dd = image.dims(); cout << dims[0] << dims[1] << dims[2] << dims[3] << endl; +} + +void test_fill_conv_arg() { Tensor input, out, filter; DLOG << "Setup input"; SetupTensor<float>(&input, {1, 250, 32, 30}, static_cast<float>(0), static_cast<float>(1)); DLOG << "Setup filter"; SetupTensor<float>(&filter, {1001, 250, 3, 3}, static_cast<float>(0), static_cast<float>(1)); DLOG << "Setup output"; SetupTensor<float>(&out, {1, 1001, 32, 30}, static_cast<float>(0), static_cast<float>(1)); auto bs_ptr = (float *)fpga::fpga_malloc(2 * 1001 * sizeof(float)); DLOG << "find max"; float max_value = fpga::filter_find_max(&filter); DLOG << "format filter"; fpga::format_filter(&filter, max_value, 1); DLOG << "format bs_ptr"; int element_num_per_div = fpga::get_filter_num_per_div(&filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001); DLOG << "format ofm"; fpga::format_ofm(&out); DLOG << "Build arg"; fpga::WrapperConvArgs arg; fpga::fill_conv_arg(&arg, &input, &out, &filter, true, 1, 1, 1, 1, 1, bs_ptr); DLOG << "splitNum: " << arg.split_num << " group_num:" << arg.group_num << " filter_num:" << arg.filter_num; for (int i = 0; i < arg.split_num; i++) { DLOG << arg.conv_args[i].filter_num << " " << arg.conv_args[i].sb_address << " " << arg.conv_args[i].filter_address << " " << arg.conv_args[i].filter_scale_address; } } +int main() { + test_format_image(); + test_fill_conv_arg(); return 0; }
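Note: a cheap consistency check one could append to test_fill_conv_arg (hypothetical, not part of the patch): summing the per-split counts should cover all 1001 filters, with the last split rounded up by get_aligned_filter_num.

  uint32_t covered = 0;
  for (int i = 0; i < arg.split_num; i++) covered += arg.conv_args[i].filter_num;
  // covered >= arg.filter_num; any surplus is alignment padding in the last split.
  DLOG << "filters covered by splits: " << covered << " of " << arg.filter_num;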