diff --git a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp index 9c70d1e2c899567f523f98fc87963e73ab3fa6a1..1a256eb733a11892c72ef4a12a84c78b914d87e6 100644 --- a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp @@ -78,6 +78,10 @@ void ConvAddBNReluKernel::Compute( case ConvParam::EXEC_GEMM_FLOAT: GemmConv(param); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: + SlidingwindowConv3x3(param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp index 5a44b083a37b19637c053655e23196385d432971..3ac1315ba9d0df36725ad6937594a3a8ddf82bf4 100644 --- a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp @@ -32,10 +32,8 @@ template <> void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); + DepthwiseConv3x3(param); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); @@ -46,6 +44,10 @@ void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { case ConvParam::EXEC_GEMM_FLOAT: GemmConv(param); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: + SlidingwindowConv3x3(param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp index a9efae96e94afa24b48ed46214ff1fdd8ec50d83..104bb6d8b227455594ab34a37dabdb978553aac1 100644 --- a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp @@ -45,6 +45,10 @@ void ConvAddReluKernel::Compute( case ConvParam::EXEC_GEMM_FLOAT: GemmConv(param); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: + SlidingwindowConv3x3(param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp index 0f52df8b18004da8327caa24ffcd0c599c4f0680..ceb1cf5144212f2e0e791b70d8a36ed3b7a62700 100644 --- a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp @@ -76,6 +76,10 @@ void ConvBNAddReluKernel::Compute( case ConvParam::EXEC_GEMM_FLOAT: GemmConv(param); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: + SlidingwindowConv3x3(param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp index 1be0c943976dc5e77a7aa867095a923d9d1093ab..eafb9f763108b28d627f14f9a9d04e4378de4423 100644 --- 
a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp @@ -75,6 +75,10 @@ void ConvBNReluKernel::Compute( case ConvParam::EXEC_GEMM_FLOAT: GemmConv(param); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: + SlidingwindowConv3x3(param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/arm/convolution/conv_common.cpp b/src/operators/kernel/arm/convolution/conv_common.cpp index 2a3a5e17e1d9da8db3ee30924c066bf195ddb97e..361d315a59aa8ace4e964a25514a8ebbf165717d 100644 --- a/src/operators/kernel/arm/convolution/conv_common.cpp +++ b/src/operators/kernel/arm/convolution/conv_common.cpp @@ -57,8 +57,8 @@ void InitBaseConvKernel(ConvParam *param) { param->Dilations()[0] == param->Dilations()[1] && param->Strides()[0] == 1 && param->Dilations()[0] == 1 #if 1 - && (param->Input()->dims()[1] >= 4 || - param->Output()->dims()[1] >= 16) + && (param->Input()->dims()[1] >= 8 && + param->Output()->dims()[1] >= 8) #endif ) { param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; @@ -66,6 +66,26 @@ void InitBaseConvKernel(ConvParam *param) { param->transformed_filter_ = new framework::LoDTensor; operators::math::winograd_transform_weight<8, 3>( *param->Filter(), param->transformed_filter_); + } else if (conv3x3 && !depth3x3 && + param->Strides()[0] == param->Strides()[1] && + param->Dilations()[0] == param->Dilations()[1] && + param->Strides()[0] == 1 && param->Dilations()[0] == 1 +#if 1 + && (param->Input()->dims()[2] >= 48 && + param->Output()->dims()[1] <= 24) +#endif + ) { + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT; + } else if (conv3x3 && !depth3x3 && + param->Strides()[0] == param->Strides()[1] && + param->Dilations()[0] == param->Dilations()[1] && + param->Strides()[0] == 2 && param->Dilations()[0] == 1 +#if 1 + && (param->Input()->dims()[2] >= 48 && + param->Output()->dims()[1] <= 24) +#endif + ) { + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT; } else { param->ExecMode() = ConvParam::EXEC_GEMM_FLOAT; } diff --git a/src/operators/kernel/arm/convolution/conv_kernel.cpp b/src/operators/kernel/arm/convolution/conv_kernel.cpp index a819aa50216f06387e24d864fed07674f621b9eb..18d1e7e640eb30d027df1e9519bd4f4419142aa1 100644 --- a/src/operators/kernel/arm/convolution/conv_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_kernel.cpp @@ -54,6 +54,10 @@ void ConvKernel::Compute(const ConvParam ¶m) { case ConvParam::EXEC_GEMM_FLOAT: GemmConv(param); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: + SlidingwindowConv3x3(param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.cpp b/src/operators/kernel/central-arm-func/conv_arm_func.cpp index 2c3166720652a77d3b628d2e5fd5d227a1a7fc33..dd41df59f303dfce1a6b9eb598f6dd34d6b014d7 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.cpp +++ b/src/operators/kernel/central-arm-func/conv_arm_func.cpp @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/pad.h" +#include "operators/math/slidingwindow_conv3x3.h" #include "operators/math/vol2col.h" #include "operators/math/winograd/winograd_transform.h" #include "operators/op_param.h" @@ -232,10 +233,29 @@ void DepthwiseConv5x5(const ConvParam ¶m) { } } +template +void SlidingwindowConv3x3(const ConvParam ¶m) { + const Tensor *input = param.Input(); + const Tensor *filter = param.Filter(); + const std::vector &paddings = param.Paddings(); + const std::vector &strides = param.Strides(); + Tensor *output = param.Output(); + output->mutable_data(); + + if (strides[0] == 1) { + math::SlidingwindowConv3x3s1(input, filter, paddings, output); + } else if (strides[0] == 2) { + math::SlidingwindowConv3x3s2(input, filter, paddings, output); + } else { + GemmConv(param); + } +} + template void GemmConv(const ConvParam ¶m); template void WinogradConv3x3<8, 3>(const ConvParam ¶m); template void DepthwiseConv3x3(const ConvParam ¶m); template void DepthwiseConv5x5(const ConvParam ¶m); +template void SlidingwindowConv3x3(const ConvParam ¶m); #ifndef __aarch64__ template void GemmConv(const ConvParam ¶m); diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index 52bcbbb7c6f76e7e68da4c8a10271bb1bac35adf..2fa06f7cee1ffdf97448964da04e95ddeb27aedf 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -41,6 +41,9 @@ void DepthwiseConv3x3(const ConvParam ¶m); template void DepthwiseConv5x5(const ConvParam ¶m); +template +void SlidingwindowConv3x3(const ConvParam ¶m); + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 772c68059ddb85958279639626bfb9e2b36fb91b..09c135ff5cdff0755dc41d96f90d4a3e3b345c27 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -300,7 +300,7 @@ static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { template static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold, - float eta) { + float eta, int post_nms_num = 100) { int64_t num_boxes = bbox->dims()[0]; // 4: [xmin ymin xmax ymax] int64_t box_size = bbox->dims()[1]; @@ -314,7 +314,7 @@ static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold, int selected_num = 0; T adaptive_threshold = nms_threshold; const T *bbox_data = bbox->data(); - while (sorted_indices.size() != 0) { + while ((sorted_indices.size() != 0) && (selected_num < post_nms_num)) { int idx = sorted_indices.back().second; bool flag = true; for (int kept_idx : selected_indices) { @@ -397,17 +397,19 @@ std::pair ProposalForOneImage( return std::make_pair(bbox_sel, scores_filter); } - Tensor keep_nms = NMS(&bbox_sel, &scores_filter, nms_thresh, eta); + // Tensor keep_nms = NMS(&bbox_sel, &scores_filter, nms_thresh, eta); + Tensor keep_nms = + NMS(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); } - // proposals.mutable_data({keep_nms.numel(), 4});//original - // scores_sel.mutable_data({keep_nms.numel(), 1});//original + proposals.mutable_data({keep_nms.numel(), 4}); // original + scores_sel.mutable_data({keep_nms.numel(), 1}); // original - proposals.mutable_data({post_nms_top_n, 4}); 
// wong - scores_sel.mutable_data({post_nms_top_n, 1}); // wong + // proposals.mutable_data({post_nms_top_n, 4}); // wong + // scores_sel.mutable_data({post_nms_top_n, 1}); // wong CPUGather(bbox_sel, keep_nms, &proposals); CPUGather(scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp index 170d245c0212c06b8a25243a79c4f1bd25d314c4..7e0852ca4b25ff3ffea31136cea0065495d57dc6 100644 --- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PSROI_POOL_OP #include -#include #include #include "operators/kernel/detection_kernel.h" @@ -72,16 +71,72 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { return true; } +/* + template + void PSROIPoolingForward( + const Dtype* bottom_data, + const int height, const int width, const int input_channel, + Dtype* top_data, + const int pooled_height, const int pooled_width, const int output_channel, + const Dtype* bottom_rois, + const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h, + const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind) + { + + int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw)* Bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * Bin_size_w + roi_start_w); + + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + float32x4_t sum_pixels_low_c= vdupq_n_f32(0); + float32x4_t sum_pixels_high_c= vdupq_n_f32(0); + + if(!is_empty){ + Dtype bin_area = (hend - hstart) * (wend - wstart); + float rev_bin_area = 1 / bin_area; + float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area); + //static_cast(bin_area) float pixels_c[output_channel]; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int pixel_offset = (h * width + w) * input_channel; + for(int output_c = 0; output_c < output_channel; output_c++){ + int input_channel_offset = output_c * pooled_height * + pooled_width; int input_bias = pixel_offset + input_channel_offset + ph * + pooled_width + pw; pixels_c[output_c] = bottom_data[input_bias]; + } + float32x4_t pixel_low_c = vld1q_f32(pixels_c); + float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4); + sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c); + sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c); + } + } + sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area); + sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area); + } + + int output_index_base = (ph * pooled_width + pw) * output_channel; + top_data += output_index_base; + vst1q_f32(top_data, sum_pixels_low_c); + top_data += 4; + vst1q_f32(top_data, sum_pixels_high_c); + }*/ + template -void PSROIPooling(const Dtype* bottom_data, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const Dtype* bottom_rois, - const int output_dim, const int group_size, Dtype* top_data, - int index, int nid, const Dtype Bin_size_h, - const Dtype Bin_size_w, const Dtype roi_start_h, - const Dtype roi_start_w, const int ctop, const int ph, - const int roi_batch_ind) { - 
int pw = index; +void PSROIPoolingForward(const Dtype* bottom_data, const int height, + const int width, const int input_channel, + Dtype* top_data, const int pooled_height, + const int pooled_width, const int output_channel, + const Dtype* bottom_rois, const Dtype Bin_size_h, + const Dtype Bin_size_w, const Dtype roi_start_h, + const Dtype roi_start_w, const int pw, const int ph, + const int roi_batch_ind) { int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); int wstart = floor(static_cast(pw) * Bin_size_w + roi_start_w); int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); @@ -94,60 +149,35 @@ void PSROIPooling(const Dtype* bottom_data, const int channels, wend = std::min(std::max(wend, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); - int c = (ctop * group_size + ph) * group_size + pw; - - Dtype bin_area = (hend - hstart) * (wend - wstart); - bottom_data += (roi_batch_ind * channels + c) * height * width; - Dtype out_sum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int bottom_index = h * width + w; - out_sum += bottom_data[bottom_index]; - } - } - - top_data[nid + index] = is_empty ? 0. : out_sum / bin_area; -} - -void convert_to_chw(float** data_in, int channel, int height, int width, - int num) { - float* data_in_tmp = *data_in; - float* data_tmp = reinterpret_cast( - fpga::fpga_malloc(channel * height * width * sizeof(float))); // NOLINT - int64_t amount_per_side = width * height; - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + n * height * width * channel + c * amount_per_side + - width * h + w) = *((*data_in)++); + float sum_pixels_c[output_channel] = {0}; + float pixels_c[output_channel] = {0}; + if (!is_empty) { + Dtype bin_area = (hend - hstart) * (wend - wstart); + float rec_bin_area = 1 / bin_area; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int pixel_offset = (h * width + w) * input_channel; + for (int output_c = 0; output_c < output_channel; output_c++) { + int input_channel_offset = output_c * pooled_height * pooled_width; + int input_bias = + pixel_offset + input_channel_offset + ph * pooled_width + pw; + pixels_c[output_c] = bottom_data[input_bias]; } - } - } - } - *data_in = data_tmp; - fpga::fpga_free(data_in_tmp); -} -void convert_to_hwc(float** data_in, int channel, int height, int width, - int num) { - float* data_in_tmp = *data_in; - float* data_tmp = reinterpret_cast( - fpga::fpga_malloc(num * channel * height * width * sizeof(float))); - int64_t amount_per_row = width * channel; - for (int n = 0; n < num; n++) { - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * channel * height * width + offset_height + - w * channel + c) = *((*data_in)++); + for (int output_c = 0; output_c < output_channel; output_c++) { + sum_pixels_c[output_c] += pixels_c[output_c]; } } } + for (int output_c = 0; output_c < output_channel; output_c++) { + sum_pixels_c[output_c] *= rec_bin_area; + } } - *data_in = data_tmp; - fpga::fpga_free(data_in_tmp); + + int output_index_base = (ph * pooled_width + pw) * output_channel; + top_data += output_index_base; + memcpy(top_data, sum_pixels_c, output_channel * 4); } template <> @@ -174,14 +204,15 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { int rois_num = rois->dims()[0]; 
auto data_nhwc = in->mutable_data(); - fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1); + + // fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); framework::DDim dims_out_new = framework::make_ddim( {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); - float* input_data = data_nhwc; // in->data(); - // shared_ptr input_data(data_nhwc); + const float* input_data = data_nhwc; // in->data(); framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); auto rois_batch_id_data = rois_batch_id_list.mutable_data(); @@ -203,18 +234,19 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { "output_channels x pooled_height x pooled_width"); // calculate batch id index for each roi according to LoD - // for (int n = 0; n < rois_batch_size; ++n) { - // for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - // rois_batch_id_data[i] = n; - // } - //} + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } auto output_data = out->mutable_data(); auto input_rois = rois->data(); - // calculate psroipooling, parallel processing can be implemented per ROI for (int n = 0; n < rois_num; ++n) { - // [start, end) interval for spatial sampling auto offset_input_rois = input_rois + n * 4; + auto offset_output_data = + output_data + pooled_height * pooled_width * output_channels * n; + auto roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; auto roi_start_h = @@ -232,27 +264,18 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto bin_size_h = roi_height / static_cast(pooled_height); auto bin_size_w = roi_width / static_cast(pooled_width); - int roi_batch_ind = 0; // rois_batch_id_data[n]; - // std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl; - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < pooled_height; ph++) { - int index = pooled_width; - int nid = n * output_channels * pooled_height * pooled_width + - c * pooled_width * pooled_height + ph * pooled_width; - for (int idx = 0; idx < index; idx++) { - PSROIPooling(input_data, input_channels, height, width, - pooled_height, pooled_width, input_rois, - output_channels, pooled_height, output_data, idx, - nid, bin_size_h, bin_size_w, roi_start_h, - roi_start_w, c, ph, roi_batch_ind); - } + int roi_batch_ind = rois_batch_id_data[n]; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + PSROIPoolingForward(input_data, height, width, input_channels, + offset_output_data, pooled_height, + pooled_width, output_channels, input_rois, + bin_size_h, bin_size_w, roi_start_h, + roi_start_w, pw, ph, roi_batch_ind); } } } - fpga::fpga_free(input_data); - fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height, - pooled_width, rois_num); - out->reset_data_ptr(output_data); } } // namespace operators diff --git a/src/operators/math/slidingwindow_conv3x3.cpp b/src/operators/math/slidingwindow_conv3x3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76a79c07740d435e545121378a8c5739c76517c6 --- /dev/null +++ b/src/operators/math/slidingwindow_conv3x3.cpp @@ -0,0 +1,3710 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/math/slidingwindow_conv3x3.h" +#include +#if __ARM_NEON +#include +#endif +#ifdef _OPENMP +#include +#endif + +namespace paddle_mobile { +namespace operators { +namespace math { +template <> +void SlidingwindowConv3x3s1(const framework::Tensor *input, + const framework::Tensor *filter, + const std::vector &paddings, + framework::Tensor *output) { + const int batch = input->dims()[0]; + const int input_ch = input->dims()[1]; + const int input_h = input->dims()[2]; + const int input_w = input->dims()[3]; + const int output_ch = output->dims()[1]; + const int output_h = output->dims()[2]; + const int output_w = output->dims()[3]; + const int padding_h = paddings[0]; + const int padding_w = paddings[1]; + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + const float *filter_data = filter->data(); + + const int in_ch_size = input_h * input_w; + const int in_batch_size = input_ch * in_ch_size; + const int out_ch_size = output_h * output_w; + const int out_batch_size = output_ch * out_ch_size; + const int out_size = batch * out_batch_size; + const int filter_ch_size = 9; + const int pad_filter_ch_size = (2 * padding_h + 3) * (2 * padding_w + 3); + const int pad_filter_start = + 2 * padding_h * (2 * padding_w + 3) + 2 * padding_w; + const int pad_filter_w = 3 + padding_w * 2; + bool if_nopadding = false; + +#if __ARM_NEON + float *out_ptr = output_data; + int remain = out_size & 0x3; + float32x4_t _zero = vdupq_n_f32(0.0); + + for (int i = 0; i < out_size; i += 4) { + vst1q_f32(out_ptr, _zero); + out_ptr += 4; + } + switch (remain) { + case 1: + vst1q_lane_f32(out_ptr, _zero, 0); + break; + case 2: + vst1_f32(out_ptr, vget_low_f32(_zero)); + break; + case 3: + vst1_f32(out_ptr, vget_low_f32(_zero)); + vst1q_lane_f32(out_ptr + 2, _zero, 0); + break; + } +#else +#pragma omp parallel for + for (int i = 0; i < out_size; ++i) { + output_data[i] = 0; + } +#endif + if (padding_h == 0 && padding_w == 0) { + if_nopadding = true; + } + + for (int b = 0; b < batch; ++b) { +#pragma omp parallel for + for (int o_c = 0; o_c < output_ch - 1; o_c += 2) { + bool issamefilter; + const float *f1; + const float *f1_c2; + const float *in_ptr1, *in_ptr2, *in_ptr3, *in_ptr4; + const float *pad_filter0, *pad_filter1, *pad_filter2, *pad_filter3; + const float *pad_filter0_c2, *pad_filter1_c2, *pad_filter2_c2, + *pad_filter3_c2; + float pad_filter_arr[pad_filter_ch_size]; + float pad_filter_arr_c2[pad_filter_ch_size]; + + float *output_data_ch; + float *output_data_ch_2; + const float *input_data_ch; + const float *filter_data_ch; + const float *filter_data_ch_c2; + + filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; + filter_data_ch_c2 = filter_data + (o_c + 1) * filter_ch_size * input_ch; + + input_data_ch = input_data; + output_data_ch = output_data + o_c * out_ch_size; + output_data_ch_2 = output_data + (o_c + 1) * out_ch_size; + + for (int i_c = 0; i_c < input_ch; ++i_c) { + f1 = filter_data_ch; + f1_c2 = filter_data_ch_c2; + + if (!if_nopadding) { + memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); + memset(pad_filter_arr_c2, 0.f, 
sizeof(pad_filter_arr_c2)); + for (int i = 0; i < 9; i++) { + int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + + padding_w * (2 * padding_h + 1); + pad_filter_arr[j] = filter_data_ch[i]; + pad_filter_arr_c2[j] = filter_data_ch_c2[i]; + } + pad_filter1 = pad_filter_arr; + pad_filter1 += pad_filter_start; + pad_filter0 = pad_filter1 - pad_filter_w; + pad_filter2 = pad_filter1 + pad_filter_w; + pad_filter3 = pad_filter2 + pad_filter_w; + + pad_filter1_c2 = pad_filter_arr_c2; + pad_filter1_c2 += pad_filter_start; + pad_filter0_c2 = pad_filter1_c2 - pad_filter_w; + pad_filter2_c2 = pad_filter1_c2 + pad_filter_w; + pad_filter3_c2 = pad_filter2_c2 + pad_filter_w; + } else { + pad_filter1 = filter_data_ch; + pad_filter2 = pad_filter1 + 3; + pad_filter3 = pad_filter2 + 3; + + pad_filter1_c2 = filter_data_ch_c2; + pad_filter2_c2 = pad_filter1_c2 + 3; + pad_filter3_c2 = pad_filter2_c2 + 3; + } + float *out_ptr1, *out_ptr2; + float *out_ptr1_c2, *out_ptr2_c2; + + out_ptr1 = output_data_ch; + out_ptr2 = out_ptr1 + output_w; + out_ptr1_c2 = output_data_ch_2; + out_ptr2_c2 = out_ptr1_c2 + output_w; + + in_ptr1 = input_data_ch; + in_ptr2 = in_ptr1 + input_w; + in_ptr3 = in_ptr2 + input_w; + in_ptr4 = in_ptr3 + input_w; + + int o_h = 0; + for (; o_h < output_h - 1; o_h = o_h + 2) { + if (!if_nopadding && + (o_h < padding_h || o_h > output_h - padding_h - 2)) { + issamefilter = false; + } else { + issamefilter = true; + } + int o_w = 0; + // pad left + for (; o_w < padding_w; ++o_w) { + float sum1 = 0; + float sum2 = 0; + float sum1_c2 = 0; + float sum2_c2 = 0; + + if (issamefilter) { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); + float32x4_t _sum2_c2 = vmulq_f32(_in_ptr2, _pad_filter1_c2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); + + float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); + _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr4, _pad_filter3_c2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ss2_2 = + vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); + 
sum2 += vget_lane_f32(_ssss1_ssss2, 1); + sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; + sum2 += in_ptr4[0] * pad_filter3[0]; + sum2 += in_ptr4[1] * pad_filter3[1]; + sum2 += in_ptr4[2] * pad_filter3[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; + + sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; + sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; + sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; + sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; + sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; + sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; + sum2_c2 += in_ptr4[0] * pad_filter3_c2[0]; + sum2_c2 += in_ptr4[1] * pad_filter3_c2[1]; + sum2_c2 += in_ptr4[2] * pad_filter3_c2[2]; +#endif + } else { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); + float32x4_t _pad_filter0_c2 = vld1q_f32(pad_filter0_c2); + + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); + float32x4_t _sum2_c2 = vmulq_f32(_in_ptr1, _pad_filter0_c2); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr2, _pad_filter1_c2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ss2_2 = + vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, 
_ss2_2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); + sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr1[0] * pad_filter0[0]; + sum2 += in_ptr1[1] * pad_filter0[1]; + sum2 += in_ptr1[2] * pad_filter0[2]; + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; + + sum2_c2 += in_ptr1[0] * pad_filter0_c2[0]; + sum2_c2 += in_ptr1[1] * pad_filter0_c2[1]; + sum2_c2 += in_ptr1[2] * pad_filter0_c2[2]; + sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; + sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; + sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; + sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; + sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; + sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; +#endif + } + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + pad_filter0_c2--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + *out_ptr2 += sum2; + *out_ptr1_c2 += sum1_c2; + *out_ptr2_c2 += sum2_c2; + + out_ptr1++; + out_ptr2++; + out_ptr1_c2++; + out_ptr2_c2++; + } + // valid +#if __ARM_NEON +#if __aarch64__ + if (issamefilter) { + int loop = (output_w - 2 * padding_w) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "prfm pldl1keep, [%[f1], #256] \n\t" + "prfm pldl1keep, [%[f1_c2], #256] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + "ld1 {v2.4s, v3.4s}, [%[f1_c2]], #32 \n\t" + "ld1 {v4.s}[0], [%[f1]] \n\t" + + "sub %[f1],%[f1], #32 \n\t" + "ld1 {v4.s}[1], [%[f1_c2]] \n\t" + "sub %[f1_c2],%[f1_c2], #32 \n\t" + + "prfm pldl1keep, [%[in_ptr1], #192] \n\t" + "prfm pldl1keep, [%[in_ptr4], #192] \n\t" + + "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" + "add %[in_ptr1],%[in_ptr1], #16 \n\t" + + "ld1 {v6.d}[1], [%[in_ptr4]] \n\t" + "add %[in_ptr4],%[in_ptr4], #8 \n\t" + "ld1 {v7.4s}, [%[in_ptr4]] \n\t" + "add %[in_ptr4],%[in_ptr4], #8 \n\t" + + "0: \n\t" + // load out_ptr + "prfm pldl1keep, [%[out_ptr1], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t" + "prfm pldl1keep, [%[out_ptr2], #128] \n\t" + "prfm pldl1keep, [%[out_ptr2_c2], #128] \n\t" + + "ld1 {v12.4s}, [%[out_ptr1]] \n\t" + "ld1 {v13.4s}, [%[out_ptr1_c2]] \n\t" + "ld1 {v14.4s}, [%[out_ptr2]] \n\t" + "ld1 {v15.4s}, [%[out_ptr2_c2]] \n\t" + + // in_ptr1 and in_ptr4 multiply + "ext v8.16b, v5.16b, v6.16b, #4 \n\t" + "fmla v12.4s, v5.4s, v0.s[0] \n\t" + "fmla v13.4s, v5.4s, v2.s[0] \n\t" + + "ext v9.16b, v6.16b, v7.16b, #8 \n\t" 
+ "fmla v14.4s, v7.4s, v4.s[0] \n\t" + "fmla v15.4s, v7.4s, v4.s[1] \n\t" + + "ext v10.16b, v5.16b, v6.16b, #8 \n\t" + "fmla v12.4s, v8.4s, v0.s[1] \n\t" + "fmla v13.4s, v8.4s, v2.s[1] \n\t" + + "ext v11.16b, v6.16b, v7.16b, #12 \n\t" + "fmla v14.4s, v9.4s, v1.s[2] \n\t" + "fmla v15.4s, v9.4s, v3.s[2] \n\t" + + "ld1 {v5.4s, v6.4s}, [%[in_ptr2]] \n\t" + "fmla v12.4s, v10.4s, v0.s[2] \n\t" + "fmla v13.4s, v10.4s, v2.s[2] \n\t" + + "add %[in_ptr2],%[in_ptr2], #16 \n\t" + "fmla v14.4s, v11.4s, v1.s[3] \n\t" + "fmla v15.4s, v11.4s, v3.s[3] \n\t" + + // in_ptr2 multiply + "ext v8.16b, v5.16b, v6.16b, #4 \n\t" + "fmla v12.4s, v5.4s, v0.s[3] \n\t" + "fmla v13.4s, v5.4s, v2.s[3] \n\t" + + "fmla v14.4s, v5.4s, v0.s[0] \n\t" + "fmla v15.4s, v5.4s, v2.s[0] \n\t" + + "ext v9.16b, v5.16b, v6.16b, #8 \n\t" + "fmla v12.4s, v8.4s, v1.s[0] \n\t" + "fmla v13.4s, v8.4s, v3.s[0] \n\t" + + "ld1 {v6.d}[1], [%[in_ptr3]] \n\t" + "add %[in_ptr3],%[in_ptr3], #8 \n\t" + "fmla v14.4s, v8.4s, v0.s[1] \n\t" + "fmla v15.4s, v8.4s, v2.s[1] \n\t" + + "ld1 {v7.4s}, [%[in_ptr3]] \n\t" + "add %[in_ptr3],%[in_ptr3], #8 \n\t" + + "fmla v12.4s, v9.4s, v1.s[1] \n\t" + "fmla v13.4s, v9.4s, v3.s[1] \n\t" + + "ext v10.16b, v6.16b, v7.16b, #8 \n\t" + "fmla v14.4s, v9.4s, v0.s[2] \n\t" + "fmla v15.4s, v9.4s, v2.s[2] \n\t" + + // in_ptr3 multiply + "fmla v12.4s, v7.4s, v4.s[0] \n\t" + "fmla v13.4s, v7.4s, v4.s[1] \n\t" + + "ext v11.16b, v6.16b, v7.16b, #12 \n\t" + "fmla v14.4s, v7.4s, v1.s[1] \n\t" + "fmla v15.4s, v7.4s, v3.s[1] \n\t" + + "fmla v12.4s, v10.4s, v1.s[2] \n\t" + "fmla v13.4s, v10.4s, v3.s[2] \n\t" + + "fmla v14.4s, v10.4s, v0.s[3] \n\t" + "fmla v15.4s, v10.4s, v2.s[3] \n\t" + + "fmla v12.4s, v11.4s, v1.s[3] \n\t" + "fmla v13.4s, v11.4s, v3.s[3] \n\t" + + "prfm pldl1keep, [%[in_ptr1], #192] \n\t" + "fmla v14.4s, v11.4s, v1.s[0] \n\t" + "fmla v15.4s, v11.4s, v3.s[0] \n\t" + + // store out_ptr + "prfm pldl1keep, [%[in_ptr4], #192] \n\t" + "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" + "add %[in_ptr1],%[in_ptr1], #16 \n\t" + "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" + + "ld1 {v6.d}[1], [%[in_ptr4]] \n\t" + "add %[in_ptr4],%[in_ptr4], #8 \n\t" + "st1 {v13.4s}, [%[out_ptr1_c2]], #16 \n\t" + + "ld1 {v7.4s}, [%[in_ptr4]] \n\t" + "add %[in_ptr4],%[in_ptr4], #8 \n\t" + "st1 {v14.4s}, [%[out_ptr2]], #16 \n\t" + + "subs %[loop],%[loop], #1 \n\t" + "st1 {v15.4s}, [%[out_ptr2_c2]], #16 \n\t" + + // cycle + "bne 0b \n\t" + "sub %[in_ptr1],%[in_ptr1], #16 \n\t" + "sub %[in_ptr4],%[in_ptr4], #16 \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), + [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + [in_ptr4] "+r"(in_ptr4) + : [f1] "r"(f1), [f1_c2] "r"(f1_c2) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + } + } + if (!if_nopadding && o_w == output_w - padding_w) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + pad_filter0_c2--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + + in_ptr1--; + in_ptr2--; + in_ptr3--; + in_ptr4--; + } +#else + if (issamefilter) { + int loop = (output_w - 2 * padding_w) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "pld [%[f1], #256] \n\t" + "pld [%[f1_c2], #256] \n\t" + + "vld1.f32 {d0-d3}, [%[f1]] \n\t" + "add %[f1], #32 \n\t" + "vld1.f32 {d4-d7}, [%[f1_c2]] \n\t" + "add %[f1_c2], #32 \n\t" + + "vld1.f32 {d8[0]}, [%[f1]] \n\t" + "sub %[f1], #32 \n\t" + 
"vld1.f32 {d8[1]}, [%[f1_c2]] \n\t" + "sub %[f1_c2], #32 \n\t" + + "pld [%[in_ptr1], #192] \n\t" + "pld [%[in_ptr4], #192] \n\t" + + "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], #16 \n\t" + + "vld1.f32 {d13-d15}, [%[in_ptr4]] \n\t" + "add %[in_ptr4], #16 \n\t" + + "0: \n\t" + // load out_ptr + "pld [%[out_ptr1], #128] \n\t" + "pld [%[out_ptr1_c2], #128] \n\t" + "pld [%[out_ptr2], #128] \n\t" + "pld [%[out_ptr2_c2], #128] \n\t" + + "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" + "vld1.f32 {d26, d27}, [%[out_ptr1_c2]] \n\t" + "vld1.f32 {d28, d29}, [%[out_ptr2]] \n\t" + "vld1.f32 {d30, d31}, [%[out_ptr2_c2]] \n\t" + + // in_ptr1 + in_ptr4 multiply + "vext.32 q8, q5, q6, #1 \n\t" + "vmla.f32 q12, q5, d0[0] \n\t" + "vmla.f32 q13, q5, d4[0] \n\t" + + "vext.32 q9, q6, q7, #2 \n\t" + "vmla.f32 q14, q7, d8[0] \n\t" + "vmla.f32 q15, q7, d8[1] \n\t" + + "vext.32 q10, q5, q6, #2 \n\t" + "vmla.f32 q12, q8, d0[1] \n\t" + "vmla.f32 q13, q8, d4[1] \n\t" + + "vext.32 q11, q6, q7, #3 \n\t" + "vmla.f32 q14, q9, d3[0] \n\t" + "vmla.f32 q15, q9, d7[0] \n\t" + + "vld1.f32 {d10-d12}, [%[in_ptr2]] \n\t" + "add %[in_ptr2], #16 \n\t" + "vmla.f32 q12, q10, d1[0] \n\t" + "vmla.f32 q13, q10, d5[0] \n\t" + + "vmla.f32 q14, q11, d3[1] \n\t" + "vmla.f32 q15, q11, d7[1] \n\t" + + // in_ptr2 multiply + "vext.32 q8, q5, q6, #1 \n\t" + "vmla.f32 q12, q5, d1[1] \n\t" + "vmla.f32 q13, q5, d5[1] \n\t" + + "vmla.f32 q14, q5, d0[0] \n\t" + "vmla.f32 q15, q5, d4[0] \n\t" + + "vext.32 q9, q5, q6, #2 \n\t" + "vmla.f32 q12, q8, d2[0] \n\t" + "vmla.f32 q13, q8, d6[0] \n\t" + + "vld1.f32 {d13-d15}, [%[in_ptr3]] \n\t" + "add %[in_ptr3], #16 \n\t" + "vmla.f32 q14, q8, d0[1] \n\t" + "vmla.f32 q15, q8, d4[1] \n\t" + + "vmla.f32 q12, q9, d2[1] \n\t" + "vmla.f32 q13, q9, d6[1] \n\t" + + "vmla.f32 q14, q9, d1[0] \n\t" + "vmla.f32 q15, q9, d5[0] \n\t" + + // in_ptr3 multiply + "vext.32 q10, q6, q7, #2 \n\t" + "vmla.f32 q12, q7, d8[0] \n\t" + "vmla.f32 q13, q7, d8[1] \n\t" + "vmla.f32 q14, q7, d2[1] \n\t" + "vmla.f32 q15, q7, d6[1] \n\t" + + "vext.32 q11, q6, q7, #3 \n\t" + "vmla.f32 q12, q10, d3[0] \n\t" + "vmla.f32 q13, q10, d7[0] \n\t" + "vmla.f32 q14, q10, d1[1] \n\t" + "vmla.f32 q15, q10, d5[1] \n\t" + + "vmla.f32 q12, q11, d3[1] \n\t" + "vmla.f32 q13, q11, d7[1] \n\t" + "vmla.f32 q14, q11, d2[0] \n\t" + "vmla.f32 q15, q11, d6[0] \n\t" + + // store out_ptr + "pld [%[in_ptr1], #192] \n\t" + + "pld [%[in_ptr4], #192] \n\t" + "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], #16 \n\t" + + "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" + + "vst1.f32 {d26, d27}, [%[out_ptr1_c2]]! \n\t" + "vld1.f32 {d13-d15}, [%[in_ptr4]] \n\t" + + "add %[in_ptr4], #16 \n\t" + "vst1.f32 {d28, d29}, [%[out_ptr2]]! \n\t" + + "subs %[loop], #1 \n\t" + "vst1.f32 {d30, d31}, [%[out_ptr2_c2]]! 
\n\t" + + // cycle + "bne 0b \n\t" + "sub %[in_ptr1], #16 \n\t" + "sub %[in_ptr4], #16 \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), + [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + [in_ptr4] "+r"(in_ptr4) + : [f1] "r"(f1), [f1_c2] "r"(f1_c2) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } + } + if (!if_nopadding && o_w == output_w - padding_w) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + pad_filter0_c2--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + + in_ptr1--; + in_ptr2--; + in_ptr3--; + in_ptr4--; + } +#endif //__aarch64__ +#endif // __ARM_NEON + + // remain output_width + for (; o_w < output_w; ++o_w) { + float sum1 = 0; + float sum2 = 0; + float sum1_c2 = 0; + float sum2_c2 = 0; + + if (issamefilter) { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); + float32x4_t _sum2_c2 = vmulq_f32(_in_ptr2, _pad_filter1_c2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); + + float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); + _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr4, _pad_filter3_c2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ss2_2 = + vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); + sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += 
in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; + sum2 += in_ptr4[0] * pad_filter3[0]; + sum2 += in_ptr4[1] * pad_filter3[1]; + sum2 += in_ptr4[2] * pad_filter3[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; + + sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; + sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; + sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; + sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; + sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; + sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; + sum2_c2 += in_ptr4[0] * pad_filter3_c2[0]; + sum2_c2 += in_ptr4[1] * pad_filter3_c2[1]; + sum2_c2 += in_ptr4[2] * pad_filter3_c2[2]; +#endif + } else { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); + float32x4_t _pad_filter0_c2 = vld1q_f32(pad_filter0_c2); + + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); + float32x4_t _sum2_c2 = vmulq_f32(_in_ptr1, _pad_filter0_c2); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr2, _pad_filter1_c2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ss2_2 = + vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); + sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr1[0] * pad_filter0[0]; + sum2 += in_ptr1[1] * 
pad_filter0[1]; + sum2 += in_ptr1[2] * pad_filter0[2]; + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; + + sum2_c2 += in_ptr1[0] * pad_filter0_c2[0]; + sum2_c2 += in_ptr1[1] * pad_filter0_c2[1]; + sum2_c2 += in_ptr1[2] * pad_filter0_c2[2]; + sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; + sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; + sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; + sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; + sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; + sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; +#endif + } + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + pad_filter0_c2--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + *out_ptr2 += sum2; + *out_ptr1_c2 += sum1_c2; + *out_ptr2_c2 += sum2_c2; + + out_ptr1++; + out_ptr2++; + out_ptr1_c2++; + out_ptr2_c2++; + } + if (if_nopadding) { + in_ptr1 += 2 + input_w; + in_ptr2 += 2 + input_w; + in_ptr3 += 2 + input_w; + in_ptr4 += 2 + input_w; + } else if (o_h == padding_h - 1 || o_h == output_h - padding_h - 2) { + in_ptr1 += 3; + in_ptr2 += 3; + in_ptr3 += 3; + in_ptr4 += 3; + + pad_filter0 -= 2; + pad_filter1 -= 2; + pad_filter2 -= 2; + pad_filter3 -= 2; + + pad_filter0_c2 -= 2; + pad_filter1_c2 -= 2; + pad_filter2_c2 -= 2; + pad_filter3_c2 -= 2; + + } else if (issamefilter) { + in_ptr1 += 3 + input_w; + in_ptr2 += 3 + input_w; + in_ptr3 += 3 + input_w; + in_ptr4 += 3 + input_w; + + pad_filter0 += 2 * padding_w + 1; + pad_filter1 += 2 * padding_w + 1; + pad_filter2 += 2 * padding_w + 1; + pad_filter3 += 2 * padding_w + 1; + + pad_filter0_c2 += 2 * padding_w + 1; + pad_filter1_c2 += 2 * padding_w + 1; + pad_filter2_c2 += 2 * padding_w + 1; + pad_filter3_c2 += 2 * padding_w + 1; + + } else { + pad_filter0 -= 3 + 2 * padding_w + 2; + pad_filter1 -= 3 + 2 * padding_w + 2; + pad_filter2 -= 3 + 2 * padding_w + 2; + pad_filter3 -= 3 + 2 * padding_w + 2; + + pad_filter0_c2 -= 3 + 2 * padding_w + 2; + pad_filter1_c2 -= 3 + 2 * padding_w + 2; + pad_filter2_c2 -= 3 + 2 * padding_w + 2; + pad_filter3_c2 -= 3 + 2 * padding_w + 2; + + in_ptr1 -= input_w - 3; + in_ptr2 -= input_w - 3; + in_ptr3 -= input_w - 3; + in_ptr4 -= input_w - 3; + } + out_ptr1 += output_w; + out_ptr2 += output_w; + out_ptr1_c2 += output_w; + out_ptr2_c2 += output_w; + } + // remain output_height + for (; o_h < output_h; ++o_h) { + int o_w = 0; + // pad left + for (; o_w < padding_w; ++o_w) { + float sum1 = 0; + float sum1_c2 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = 
vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); + + sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); + sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; +#endif + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + pad_filter0_c2--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + *out_ptr1_c2 += sum1_c2; + + out_ptr1++; + out_ptr1_c2++; + } +// valid +#if __ARM_NEON +#if __aarch64__ + if (if_nopadding) { + int loop = (output_w - 2 * padding_w) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "prfm pldl1keep, [%[f1], #256] \n\t" + "prfm pldl1keep, [%[f1_c2], #256] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]] \n\t" + "add %[f1], %[f1], #32 \n\t" + "ld1 {v2.4s, v3.4s}, [%[f1_c2]] \n\t" + "add %[f1_c2], %[f1_c2], #32 \n\t" + + "ld1 {v4.s}[0], [%[f1]] \n\t" + "sub %[f1],%[f1], #32 \n\t" + "ld1 {v4.s}[1], [%[f1_c2]] \n\t" + "sub %[f1_c2],%[f1_c2], #32 \n\t" + + "0: \n\t" + // load out_ptr + "prfm pldl1keep, [%[out_ptr1], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t" + + "ld1 {v12.4s}, [%[out_ptr1]] \n\t" + "ld1 {v13.4s}, [%[out_ptr1_c2]] \n\t" + + // in_ptr1 multiply + "prfm pldl1keep, [%[in_ptr1], #192] \n\t" + "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" + "add %[in_ptr1],%[in_ptr1], #16 \n\t" + + "ext v8.16b, v5.16b, v6.16b, #4 \n\t" + "fmla v12.4s, v5.4s, v0.s[0] \n\t" + "fmla v13.4s, v5.4s, v2.s[0] \n\t" + + "ext v10.16b, v5.16b, v6.16b, #8 \n\t" + "fmla v12.4s, v8.4s, v0.s[1] \n\t" + "fmla v13.4s, v8.4s, v2.s[1] \n\t" + + "ld1 {v5.4s, v6.4s}, [%[in_ptr2]] \n\t" + "add %[in_ptr2],%[in_ptr2], #16 \n\t" + "fmla v12.4s, v10.4s, v0.s[2] \n\t" + "fmla v13.4s, v10.4s, v2.s[2] \n\t" + + // in_ptr2 multiply + "ext v8.16b, v5.16b, v6.16b, #4 \n\t" + "fmla v12.4s, v5.4s, v0.s[3] \n\t" + "fmla v13.4s, v5.4s, v2.s[3] \n\t" + + "ext v9.16b, v5.16b, v6.16b, #8 \n\t" + "fmla v12.4s, v8.4s, v1.s[0] \n\t" + "fmla v13.4s, v8.4s, v3.s[0] 
\n\t" + + "ld1 {v6.d}[1], [%[in_ptr3]] \n\t" + "add %[in_ptr3],%[in_ptr3], #8 \n\t" + "ld1 {v7.4s}, [%[in_ptr3]] \n\t" + "add %[in_ptr3],%[in_ptr3], #8 \n\t" + + "fmla v12.4s, v9.4s, v1.s[1] \n\t" + "fmla v13.4s, v9.4s, v3.s[1] \n\t" + + // in_ptr3 multiply + "ext v10.16b, v6.16b, v7.16b, #8 \n\t" + "fmla v12.4s, v7.4s, v4.s[0] \n\t" + "fmla v13.4s, v7.4s, v4.s[1] \n\t" + + "ext v11.16b, v6.16b, v7.16b, #12 \n\t" + "fmla v12.4s, v10.4s, v1.s[2] \n\t" + "fmla v13.4s, v10.4s, v3.s[2] \n\t" + + "fmla v12.4s, v11.4s, v1.s[3] \n\t" + "fmla v13.4s, v11.4s, v3.s[3] \n\t" + + // store out_ptr + "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" + "st1 {v13.4s}, [%[out_ptr1_c2]], #16 \n\t" + + // cycle + "subs %[loop],%[loop], #1 \n\t" + "bne 0b \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), + [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + [in_ptr4] "+r"(in_ptr4) + : [f1] "r"(f1), [f1_c2] "r"(f1_c2) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13"); + } + } +#else + if (if_nopadding) { + int loop = (output_w - 2 * padding_w) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "pld [%[f1], #256] \n\t" + "pld [%[f1_c2], #256] \n\t" + + "vld1.f32 {d0-d3}, [%[f1]] \n\t" + "add %[f1], #32 \n\t" + "vld1.f32 {d4-d7}, [%[f1_c2]] \n\t" + "add %[f1_c2], #32 \n\t" + + "vld1.f32 {d8[0]}, [%[f1]] \n\t" + "sub %[f1], #32 \n\t" + "vld1.f32 {d8[1]}, [%[f1_c2]] \n\t" + "sub %[f1_c2], #32 \n\t" + + "0: \n\t" + // load out_ptr + "pld [%[out_ptr1], #128] \n\t" + "pld [%[out_ptr1_c2], #128] \n\t" + + "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" + "vld1.f32 {d26, d27}, [%[out_ptr1_c2]] \n\t" + + // in_ptr1 multiply + "pld [%[in_ptr1], #128] \n\t" + + "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], #16 \n\t" + "vext.32 q8, q5, q6, #1 \n\t" + + "pld [%[in_ptr2], #128] \n\t" + "vmla.f32 q12, q5, d0[0] \n\t" + "vmla.f32 q13, q5, d4[0] \n\t" + + "vext.32 q10, q5, q6, #2 \n\t" + "vld1.f32 {d10-d12}, [%[in_ptr2]] \n\t" + "add %[in_ptr2], #16 \n\t" + "vmla.f32 q12, q8, d0[1] \n\t" + "vmla.f32 q13, q8, d4[1] \n\t" + + "vmla.f32 q12, q10, d1[0] \n\t" + "vmla.f32 q13, q10, d5[0] \n\t" + + // in_ptr2 multiply + "vext.32 q8, q5, q6, #1 \n\t" + "pld [%[in_ptr3], #128] \n\t" + "vmla.f32 q12, q5, d1[1] \n\t" + "vmla.f32 q13, q5, d5[1] \n\t" + + "vext.32 q9, q5, q6, #2 \n\t" + "vld1.f32 {d13-d15}, [%[in_ptr3]] \n\t" + "add %[in_ptr3], #16 \n\t" + "vmla.f32 q12, q8, d2[0] \n\t" + "vmla.f32 q13, q8, d6[0] \n\t" + + "vmla.f32 q12, q9, d2[1] \n\t" + "vmla.f32 q13, q9, d6[1] \n\t" + + // in_ptr3 multiply + "vext.32 q10, q6, q7, #2 \n\t" + "vmla.f32 q12, q7, d8[0] \n\t" + "vmla.f32 q13, q7, d8[1] \n\t" + + "vext.32 q11, q6, q7, #3 \n\t" + "vmla.f32 q12, q10, d3[0] \n\t" + "vmla.f32 q13, q10, d7[0] \n\t" + + "vmla.f32 q12, q11, d3[1] \n\t" + "vmla.f32 q13, q11, d7[1] \n\t" + + // store out_ptr + "subs %[loop], #1 \n\t" + "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" + "vst1.f32 {d26, d27}, [%[out_ptr1_c2]]! 
\n\t" + + // cycle + "bne 0b \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), + [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + [in_ptr4] "+r"(in_ptr4) + : [f1] "r"(f1), [f1_c2] "r"(f1_c2) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "q11", "q12", "q13"); + } + } + +#endif //__aarch64__ +#endif // __ARM_NEON + + // remain output_width + for (; o_w < output_w; ++o_w) { + float sum1 = 0; + float sum1_c2 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + + float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); + sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); + sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; +#endif + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + pad_filter0_c2--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + *out_ptr1_c2 += sum1_c2; + + out_ptr1++; + out_ptr1_c2++; + } + out_ptr1 += output_w; + out_ptr1_c2 += output_w; + } + filter_data_ch += filter_ch_size; + filter_data_ch_c2 += filter_ch_size; + input_data_ch += in_ch_size; + } + } + + int out_ch_remain_start = output_ch - output_ch % 2; + // remain output_channel + for (int o_c = out_ch_remain_start; o_c < output_ch; ++o_c) { + bool issamefilter; + const float *in_ptr1, *in_ptr2, *in_ptr3, *in_ptr4; + const float *f1; + const float *pad_filter0, *pad_filter1, *pad_filter2, *pad_filter3; + float pad_filter_arr[pad_filter_ch_size]; + float *output_data_ch; + const float 
*input_data_ch; + const float *filter_data_ch; + + input_data_ch = input_data; + output_data_ch = output_data + o_c * out_ch_size; + filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; + + for (int i_c = 0; i_c < input_ch; ++i_c) { + f1 = filter_data_ch; + if (!if_nopadding) { + memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); + for (int i = 0; i < 9; ++i) { + int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + + padding_w * (2 * padding_h + 1); + pad_filter_arr[j] = filter_data_ch[i]; + } + pad_filter1 = pad_filter_arr; + pad_filter1 += pad_filter_start; + pad_filter0 = pad_filter1 - pad_filter_w; + pad_filter2 = pad_filter1 + pad_filter_w; + pad_filter3 = pad_filter2 + pad_filter_w; + + } else { + pad_filter1 = filter_data_ch; + pad_filter2 = pad_filter1 + 3; + pad_filter3 = pad_filter2 + 3; + } + float *out_ptr1, *out_ptr2; + out_ptr1 = output_data_ch; + out_ptr2 = out_ptr1 + output_w; + + in_ptr1 = input_data_ch; + in_ptr2 = in_ptr1 + input_w; + in_ptr3 = in_ptr2 + input_w; + in_ptr4 = in_ptr3 + input_w; + + int o_h = 0; + for (; o_h < output_h - 1; o_h = o_h + 2) { + if (!if_nopadding && + (o_h < padding_h || o_h > output_h - padding_h - 2)) { + issamefilter = false; + } else { + issamefilter = true; + } + int o_w = 0; + // pad left + for (; o_w < padding_w; ++o_w) { + float sum1 = 0; + float sum2 = 0; + + if (issamefilter) { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + + float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); + _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; + sum2 += in_ptr4[0] * pad_filter3[0]; + sum2 += in_ptr4[1] * pad_filter3[1]; + sum2 += in_ptr4[2] * pad_filter3[2]; +#endif + } else { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); + + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + + 
_sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr1[0] * pad_filter0[0]; + sum2 += in_ptr1[1] * pad_filter0[1]; + sum2 += in_ptr1[2] * pad_filter0[2]; + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; +#endif + } + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + *out_ptr2 += sum2; + + out_ptr1++; + out_ptr2++; + } + // valid +#if __ARM_NEON +#if __aarch64__ + if (issamefilter) { + int loop = (output_w - 2 * padding_w) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "prfm pldl1keep, [%[f1], #256] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]] \n\t" + "add %[f1], %[f1], #32 \n\t" + + "ld1 {v4.s}[0], [%[f1]] \n\t" + "sub %[f1],%[f1], #32 \n\t" + + "0: \n\t" + // load out_ptr + "prfm pldl1keep, [%[out_ptr1], #128] \n\t" + "prfm pldl1keep, [%[out_ptr2], #128] \n\t" + + "ld1 {v12.4s}, [%[out_ptr1]] \n\t" + "ld1 {v14.4s}, [%[out_ptr2]] \n\t" + + // in_ptr1 + in_ptr4 multiply + "prfm pldl1keep, [%[in_ptr1], #192] \n\t" + "prfm pldl1keep, [%[in_ptr4], #192] \n\t" + + "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" + "add %[in_ptr1],%[in_ptr1], #16 \n\t" + + "ld1 {v6.d}[1], [%[in_ptr4]] \n\t" + "add %[in_ptr4],%[in_ptr4], #8 \n\t" + "ld1 {v7.4s}, [%[in_ptr4]] \n\t" + "add %[in_ptr4],%[in_ptr4], #8 \n\t" + + "ext v8.16b, v5.16b, v6.16b, #4 \n\t" + "fmla v12.4s, v5.4s, v0.s[0] \n\t" + + "ext v9.16b, v6.16b, v7.16b, #8 \n\t" + "fmla v14.4s, v7.4s, v4.s[0] \n\t" + + "ext v10.16b, v5.16b, v6.16b, #8 \n\t" + "fmla v12.4s, v8.4s, v0.s[1] \n\t" + + "ext v11.16b, v6.16b, v7.16b, #12 \n\t" + "fmla v14.4s, v9.4s, v1.s[2] \n\t" + + "ld1 {v5.4s, v6.4s}, [%[in_ptr2]] \n\t" + "add %[in_ptr2],%[in_ptr2], #16 \n\t" + + "fmla v12.4s, v10.4s, v0.s[2] \n\t" + "fmla v14.4s, v11.4s, v1.s[3] \n\t" + + // in_ptr2 multiply + "ext v8.16b, v5.16b, v6.16b, #4 \n\t" + "fmla v12.4s, v5.4s, v0.s[3] \n\t" + "fmla v14.4s, v5.4s, v0.s[0] \n\t" + + "ext v9.16b, v5.16b, v6.16b, #8 \n\t" + "fmla v12.4s, v8.4s, v1.s[0] \n\t" + "fmla v14.4s, v8.4s, v0.s[1] \n\t" + + "ld1 {v6.d}[1], [%[in_ptr3]] \n\t" + "add %[in_ptr3],%[in_ptr3], #8 \n\t" + "ld1 {v7.4s}, [%[in_ptr3]] \n\t" + + "add %[in_ptr3],%[in_ptr3], #8 \n\t" + "fmla v12.4s, v9.4s, v1.s[1] \n\t" + "fmla v14.4s, v9.4s, v0.s[2] 
\n\t" + + // in_ptr3 multiply + "ext v10.16b, v6.16b, v7.16b, #8 \n\t" + "fmla v12.4s, v7.4s, v4.s[0] \n\t" + "fmla v14.4s, v7.4s, v1.s[1] \n\t" + + "ext v11.16b, v6.16b, v7.16b, #12 \n\t" + "fmla v12.4s, v10.4s, v1.s[2] \n\t" + "fmla v14.4s, v10.4s, v0.s[3] \n\t" + + "fmla v12.4s, v11.4s, v1.s[3] \n\t" + "fmla v14.4s, v11.4s, v1.s[0] \n\t" + + // store out_ptr + "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" + "st1 {v14.4s}, [%[out_ptr2]], #16 \n\t" + + // cycle + "subs %[loop],%[loop], #1 \n\t" + "bne 0b \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr2] "+r"(out_ptr2), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + [in_ptr4] "+r"(in_ptr4) + : [f1] "r"(f1) + : "cc", "memory", "v0", "v1", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v14"); + } + } + if (!if_nopadding && o_w == output_w - padding_w) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + in_ptr1--; + in_ptr2--; + in_ptr3--; + in_ptr4--; + } +#else + if (issamefilter) { + int loop = (output_w - 2 * padding_w) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "pld [%[f1], #256] \n\t" + "vld1.f32 {d0-d3}, [%[f1]] \n\t" + "add %[f1], #32 \n\t" + + "vld1.f32 {d8[0]}, [%[f1]] \n\t" + "sub %[f1], #32 \n\t" + + "0: \n\t" + // load out_ptr + "pld [%[out_ptr1], #128] \n\t" + "pld [%[out_ptr2], #128] \n\t" + + "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" + "vld1.f32 {d28, d29}, [%[out_ptr2]] \n\t" + + // in_ptr1 + in_ptr4 multiply + "pld [%[in_ptr1], #192] \n\t" + "pld [%[in_ptr4], #192] \n\t" + + "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], #16 \n\t" + + "vld1.f32 {d13-d15}, [%[in_ptr4]] \n\t" + "add %[in_ptr4], #16 \n\t" + + "vext.32 q8, q5, q6, #1 \n\t" + "vmla.f32 q12, q5, d0[0] \n\t" + + "vext.32 q9, q6, q7, #2 \n\t" + "vmla.f32 q14, q7, d8[0] \n\t" + + "vext.32 q10, q5, q6, #2 \n\t" + "vmla.f32 q12, q8, d0[1] \n\t" + + "vext.32 q11, q6, q7, #3 \n\t" + "vmla.f32 q14, q9, d3[0] \n\t" + + "vld1.f32 {d10-d12}, [%[in_ptr2]] \n\t" + "add %[in_ptr2], #16 \n\t" + + "vmla.f32 q12, q10, d1[0] \n\t" + "vmla.f32 q14, q11, d3[1] \n\t" + + // in_ptr2 multiply + "vext.32 q8, q5, q6, #1 \n\t" + "vmla.f32 q12, q5, d1[1] \n\t" + "vmla.f32 q14, q5, d0[0] \n\t" + + "vext.32 q9, q5, q6, #2 \n\t" + "vmla.f32 q12, q8, d2[0] \n\t" + "vmla.f32 q14, q8, d0[1] \n\t" + + "vld1.f32 {d13-d15}, [%[in_ptr3]] \n\t" + "add %[in_ptr3], #16 \n\t" + + "vmla.f32 q12, q9, d2[1] \n\t" + "vmla.f32 q14, q9, d1[0] \n\t" + + // in_ptr3 multiply + "vext.32 q10, q6, q7, #2 \n\t" + "vmla.f32 q12, q7, d8[0] \n\t" + "vmla.f32 q14, q7, d2[1] \n\t" + + "vext.32 q11, q6, q7, #3 \n\t" + "vmla.f32 q12, q10, d3[0] \n\t" + "vmla.f32 q14, q10, d1[1] \n\t" + + "vmla.f32 q12, q11, d3[1] \n\t" + "vmla.f32 q14, q11, d2[0] \n\t" + + // store out_ptr + "subs %[loop], #1 \n\t" + "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" + "vst1.f32 {d28, d29}, [%[out_ptr2]]! 
\n\t" + + // cycle + "bne 0b \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr2] "+r"(out_ptr2), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + [in_ptr4] "+r"(in_ptr4) + : [f1] "r"(f1) + : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q14"); + } + } + if (!if_nopadding && o_w == output_w - padding_w) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + in_ptr1--; + in_ptr2--; + in_ptr3--; + in_ptr4--; + } +#endif //__aarch64__ +#endif // __ARM_NEON + + // remain output_width + for (; o_w < output_w; ++o_w) { + float sum1 = 0; + float sum2 = 0; + + if (issamefilter) { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + + float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); + _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; + sum2 += in_ptr4[0] * pad_filter3[0]; + sum2 += in_ptr4[1] * pad_filter3[1]; + sum2 += in_ptr4[2] * pad_filter3[2]; +#endif + } else { +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); + + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum2 = vsetq_lane_f32(sum2, _sum2, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss2 = + vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); + + sum1 += vget_lane_f32(_ssss1_ssss2, 0); + sum2 += vget_lane_f32(_ssss1_ssss2, 1); +#else + 
sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum2 += in_ptr1[0] * pad_filter0[0]; + sum2 += in_ptr1[1] * pad_filter0[1]; + sum2 += in_ptr1[2] * pad_filter0[2]; + sum2 += in_ptr2[0] * pad_filter1[0]; + sum2 += in_ptr2[1] * pad_filter1[1]; + sum2 += in_ptr2[2] * pad_filter1[2]; + sum2 += in_ptr3[0] * pad_filter2[0]; + sum2 += in_ptr3[1] * pad_filter2[1]; + sum2 += in_ptr3[2] * pad_filter2[2]; +#endif + } + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + *out_ptr2 += sum2; + + out_ptr1++; + out_ptr2++; + } + if (if_nopadding) { + in_ptr1 += 2 + input_w; + in_ptr2 += 2 + input_w; + in_ptr3 += 2 + input_w; + in_ptr4 += 2 + input_w; + } else if (o_h == padding_h - 1 || o_h == output_h - padding_h - 2) { + in_ptr1 += 3; + in_ptr2 += 3; + in_ptr3 += 3; + in_ptr4 += 3; + + pad_filter0 -= 2; + pad_filter1 -= 2; + pad_filter2 -= 2; + pad_filter3 -= 2; + + } else if (issamefilter) { + in_ptr1 += 3 + input_w; + in_ptr2 += 3 + input_w; + in_ptr3 += 3 + input_w; + in_ptr4 += 3 + input_w; + + pad_filter0 += 2 * padding_w + 1; + pad_filter1 += 2 * padding_w + 1; + pad_filter2 += 2 * padding_w + 1; + pad_filter3 += 2 * padding_w + 1; + + } else { + pad_filter0 -= 3 + 2 * padding_w + 2; + pad_filter1 -= 3 + 2 * padding_w + 2; + pad_filter2 -= 3 + 2 * padding_w + 2; + pad_filter3 -= 3 + 2 * padding_w + 2; + + in_ptr1 -= input_w - 3; + in_ptr2 -= input_w - 3; + in_ptr3 -= input_w - 3; + in_ptr4 -= input_w - 3; + } + out_ptr1 += output_w; + out_ptr2 += output_w; + } + + // remain output_height + for (; o_h < output_h; ++o_h) { + for (int o_w = 0; o_w < output_w; ++o_w) { + float sum1 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1); + sum1 += vget_lane_f32(_ssss1_ssss1, 0); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; +#endif + if (!if_nopadding && + (o_w < padding_w || o_w > output_w - padding_w - 2)) { + pad_filter0--; + pad_filter1--; + pad_filter2--; + pad_filter3--; + + } else { + in_ptr1++; + in_ptr2++; + in_ptr3++; + in_ptr4++; + } + *out_ptr1 += sum1; + out_ptr1++; + } + out_ptr1 += output_w; + } + filter_data_ch += filter_ch_size; + input_data_ch += in_ch_size; + } + } + input_data += in_batch_size; + output_data += 
out_batch_size;
+  }
+}
+
+template <>
+void SlidingwindowConv3x3s2(const framework::Tensor *input,
+                            const framework::Tensor *filter,
+                            const std::vector<int> &paddings,
+                            framework::Tensor *output) {
+  const int batch = input->dims()[0];
+  const int input_ch = input->dims()[1];
+  const int input_h = input->dims()[2];
+  const int input_w = input->dims()[3];
+  const int output_ch = output->dims()[1];
+  const int output_h = output->dims()[2];
+  const int output_w = output->dims()[3];
+  const int padding_h = paddings[0];
+  const int padding_w = paddings[1];
+
+  const float *input_data = input->data<float>();
+  float *output_data = output->mutable_data<float>();
+  const float *filter_data = filter->data<float>();
+
+  const int in_ch_size = input_h * input_w;
+  const int in_batch_size = input_ch * in_ch_size;
+  const int out_ch_size = output_h * output_w;
+  const int out_batch_size = output_ch * out_ch_size;
+  const int out_size = batch * out_batch_size;
+  const int filter_ch_size = 9;
+  const int pad_filter_ch_size = (2 * padding_h + 3) * (2 * padding_w + 3);
+  const int pad_filter_start =
+      2 * padding_h * (2 * padding_w + 3) + 2 * padding_w;
+  const int pad_filter_w = 3 + padding_w * 2;
+
+  bool if_nopadding = false;
+  const bool if_exact_in_w = (input_w + 2 * padding_w - 3) % 2 == 0;
+  const bool if_exact_in_h = (input_h + 2 * padding_h - 3) % 2 == 0;
+  const bool if_odd_pad_w = padding_w % 2 == 1;
+  const bool if_odd_pad_h = padding_h % 2 == 1;
+
+  int valid_w_start = padding_w >> 1;
+  int valid_h_start = padding_h >> 1;
+  int valid_w_end = output_w - valid_w_start - 2;
+  int valid_h_end = output_h - valid_h_start - 2;
+  const int remain_stride_w = input_w + 2 * padding_w - 2 * output_w;
+#if __ARM_NEON
+  // Zero-initialize the output tensor four floats at a time; the final
+  // one to three elements are cleared by the switch on the remainder below.
+  float *out_ptr = output_data;
+  int remain = out_size & 0x3;
+  float32x4_t _zero = vdupq_n_f32(0.0);
+
+  for (int i = 0; i < out_size - 3; i += 4) {
+    vst1q_f32(out_ptr, _zero);
+    out_ptr += 4;
+  }
+  switch (remain) {
+    case 1:
+      vst1q_lane_f32(out_ptr, _zero, 0);
+      break;
+    case 2:
+      vst1_f32(out_ptr, vget_low_f32(_zero));
+      break;
+    case 3:
+      vst1_f32(out_ptr, vget_low_f32(_zero));
+      vst1q_lane_f32(out_ptr + 2, _zero, 0);
+      break;
+  }
+#else
+#pragma omp parallel for
+  for (int i = 0; i < out_size; ++i) {
+    output_data[i] = 0;
+  }
+#endif
+
+  if (padding_h == 0 && padding_w == 0) {
+    if_nopadding = true;
+    valid_w_start = -1;
+    valid_h_start = -1;
+    valid_w_end = output_w;
+    valid_h_end = output_h;
+  }
+
+  for (int b = 0; b < batch; ++b) {
+#pragma omp parallel for
+    for (int o_c = 0; o_c < output_ch - 7; o_c += 8) {
+      const float *f1;
+      const float *in_ptr1, *in_ptr2, *in_ptr3;
+      const float *pad_filter1, *pad_filter2, *pad_filter3;
+      const float *pad_filter1_c2, *pad_filter2_c2, *pad_filter3_c2;
+      const float *pad_filter1_c3, *pad_filter2_c3, *pad_filter3_c3;
+      const float *pad_filter1_c4, *pad_filter2_c4, *pad_filter3_c4;
+      const float *pad_filter1_c5, *pad_filter2_c5, *pad_filter3_c5;
+      const float *pad_filter1_c6, *pad_filter2_c6, *pad_filter3_c6;
+      const float *pad_filter1_c7, *pad_filter2_c7, *pad_filter3_c7;
+      const float *pad_filter1_c8, *pad_filter2_c8, *pad_filter3_c8;
+
+      float reform_filter_arr[72];
+      float pad_filter_arr[pad_filter_ch_size];
+      float pad_filter_arr_c2[pad_filter_ch_size];
+      float pad_filter_arr_c3[pad_filter_ch_size];
+      float pad_filter_arr_c4[pad_filter_ch_size];
+      float pad_filter_arr_c5[pad_filter_ch_size];
+      float pad_filter_arr_c6[pad_filter_ch_size];
+      float pad_filter_arr_c7[pad_filter_ch_size];
+      float pad_filter_arr_c8[pad_filter_ch_size];
+
+      float *output_data_ch;
+      float
*output_data_ch_2; + float *output_data_ch_3; + float *output_data_ch_4; + float *output_data_ch_5; + float *output_data_ch_6; + float *output_data_ch_7; + float *output_data_ch_8; + + const float *input_data_ch; + const float *filter_data_ch; + const float *filter_data_ch_c2; + const float *filter_data_ch_c3; + const float *filter_data_ch_c4; + const float *filter_data_ch_c5; + const float *filter_data_ch_c6; + const float *filter_data_ch_c7; + const float *filter_data_ch_c8; + + filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; + filter_data_ch_c2 = filter_data + (o_c + 1) * filter_ch_size * input_ch; + filter_data_ch_c3 = filter_data + (o_c + 2) * filter_ch_size * input_ch; + filter_data_ch_c4 = filter_data + (o_c + 3) * filter_ch_size * input_ch; + filter_data_ch_c5 = filter_data + (o_c + 4) * filter_ch_size * input_ch; + filter_data_ch_c6 = filter_data + (o_c + 5) * filter_ch_size * input_ch; + filter_data_ch_c7 = filter_data + (o_c + 6) * filter_ch_size * input_ch; + filter_data_ch_c8 = filter_data + (o_c + 7) * filter_ch_size * input_ch; + + input_data_ch = input_data; + output_data_ch = output_data + o_c * out_ch_size; + output_data_ch_2 = output_data + (o_c + 1) * out_ch_size; + output_data_ch_3 = output_data + (o_c + 2) * out_ch_size; + output_data_ch_4 = output_data + (o_c + 3) * out_ch_size; + output_data_ch_5 = output_data + (o_c + 4) * out_ch_size; + output_data_ch_6 = output_data + (o_c + 5) * out_ch_size; + output_data_ch_7 = output_data + (o_c + 6) * out_ch_size; + output_data_ch_8 = output_data + (o_c + 7) * out_ch_size; + + for (int i_c = 0; i_c < input_ch; ++i_c) { + int k = 0; + for (int i = 0; i < 9; ++i) { + for (int j = 0; j < 8; ++j) { + reform_filter_arr[k++] = filter_data_ch[i + input_ch * 9 * j]; + } + } + + f1 = reform_filter_arr; + + if (!if_nopadding) { + memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); + memset(pad_filter_arr_c2, 0.f, sizeof(pad_filter_arr_c2)); + memset(pad_filter_arr_c3, 0.f, sizeof(pad_filter_arr_c3)); + memset(pad_filter_arr_c4, 0.f, sizeof(pad_filter_arr_c4)); + memset(pad_filter_arr_c5, 0.f, sizeof(pad_filter_arr_c5)); + memset(pad_filter_arr_c6, 0.f, sizeof(pad_filter_arr_c6)); + memset(pad_filter_arr_c7, 0.f, sizeof(pad_filter_arr_c7)); + memset(pad_filter_arr_c8, 0.f, sizeof(pad_filter_arr_c8)); + + for (int i = 0; i < 9; ++i) { + int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + + padding_w * (2 * padding_h + 1); + pad_filter_arr[j] = filter_data_ch[i]; + pad_filter_arr_c2[j] = filter_data_ch_c2[i]; + pad_filter_arr_c3[j] = filter_data_ch_c3[i]; + pad_filter_arr_c4[j] = filter_data_ch_c4[i]; + pad_filter_arr_c5[j] = filter_data_ch_c5[i]; + pad_filter_arr_c6[j] = filter_data_ch_c6[i]; + pad_filter_arr_c7[j] = filter_data_ch_c7[i]; + pad_filter_arr_c8[j] = filter_data_ch_c8[i]; + } + + pad_filter1 = pad_filter_arr; + pad_filter1 += pad_filter_start; + pad_filter2 = pad_filter1 + pad_filter_w; + pad_filter3 = pad_filter2 + pad_filter_w; + + pad_filter1_c2 = pad_filter_arr_c2; + pad_filter1_c2 += pad_filter_start; + pad_filter2_c2 = pad_filter1_c2 + pad_filter_w; + pad_filter3_c2 = pad_filter2_c2 + pad_filter_w; + + pad_filter1_c3 = pad_filter_arr_c3; + pad_filter1_c3 += pad_filter_start; + pad_filter2_c3 = pad_filter1_c3 + pad_filter_w; + pad_filter3_c3 = pad_filter2_c3 + pad_filter_w; + + pad_filter1_c4 = pad_filter_arr_c4; + pad_filter1_c4 += pad_filter_start; + pad_filter2_c4 = pad_filter1_c4 + pad_filter_w; + pad_filter3_c4 = pad_filter2_c4 + pad_filter_w; + + pad_filter1_c5 = pad_filter_arr_c5; + 
pad_filter1_c5 += pad_filter_start; + pad_filter2_c5 = pad_filter1_c5 + pad_filter_w; + pad_filter3_c5 = pad_filter2_c5 + pad_filter_w; + + pad_filter1_c6 = pad_filter_arr_c6; + pad_filter1_c6 += pad_filter_start; + pad_filter2_c6 = pad_filter1_c6 + pad_filter_w; + pad_filter3_c6 = pad_filter2_c6 + pad_filter_w; + + pad_filter1_c7 = pad_filter_arr_c7; + pad_filter1_c7 += pad_filter_start; + pad_filter2_c7 = pad_filter1_c7 + pad_filter_w; + pad_filter3_c7 = pad_filter2_c7 + pad_filter_w; + + pad_filter1_c8 = pad_filter_arr_c8; + pad_filter1_c8 += pad_filter_start; + pad_filter2_c8 = pad_filter1_c8 + pad_filter_w; + pad_filter3_c8 = pad_filter2_c8 + pad_filter_w; + } else { + pad_filter1 = filter_data_ch; + pad_filter2 = pad_filter1 + 3; + pad_filter3 = pad_filter2 + 3; + + pad_filter1_c2 = filter_data_ch_c2; + pad_filter2_c2 = pad_filter1_c2 + 3; + pad_filter3_c2 = pad_filter2_c2 + 3; + + pad_filter1_c3 = filter_data_ch_c3; + pad_filter2_c3 = pad_filter1_c3 + 3; + pad_filter3_c3 = pad_filter2_c3 + 3; + + pad_filter1_c4 = filter_data_ch_c4; + pad_filter2_c4 = pad_filter1_c4 + 3; + pad_filter3_c4 = pad_filter2_c4 + 3; + + pad_filter1_c5 = filter_data_ch_c5; + pad_filter2_c5 = pad_filter1_c5 + 3; + pad_filter3_c5 = pad_filter2_c5 + 3; + + pad_filter1_c6 = filter_data_ch_c6; + pad_filter2_c6 = pad_filter1_c6 + 3; + pad_filter3_c6 = pad_filter2_c6 + 3; + + pad_filter1_c7 = filter_data_ch_c7; + pad_filter2_c7 = pad_filter1_c7 + 3; + pad_filter3_c7 = pad_filter2_c7 + 3; + + pad_filter1_c8 = filter_data_ch_c8; + pad_filter2_c8 = pad_filter1_c8 + 3; + pad_filter3_c8 = pad_filter2_c8 + 3; + } + float *out_ptr1; + float *out_ptr1_c2; + float *out_ptr1_c3; + float *out_ptr1_c4; + float *out_ptr1_c5; + float *out_ptr1_c6; + float *out_ptr1_c7; + float *out_ptr1_c8; + + out_ptr1 = output_data_ch; + out_ptr1_c2 = output_data_ch_2; + out_ptr1_c3 = output_data_ch_3; + out_ptr1_c4 = output_data_ch_4; + out_ptr1_c5 = output_data_ch_5; + out_ptr1_c6 = output_data_ch_6; + out_ptr1_c7 = output_data_ch_7; + out_ptr1_c8 = output_data_ch_8; + + in_ptr1 = input_data_ch; + in_ptr2 = in_ptr1 + input_w; + in_ptr3 = in_ptr2 + input_w; + + int o_h = 0; + + for (; o_h < output_h; ++o_h) { + int o_w = 0; + + // pad left + for (; o_w <= valid_w_start; ++o_w) { + float sum1 = 0; + float sum1_c2 = 0; + float sum1_c3 = 0; + float sum1_c4 = 0; + float sum1_c5 = 0; + float sum1_c6 = 0; + float sum1_c7 = 0; + float sum1_c8 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _pad_filter1_c3 = vld1q_f32(pad_filter1_c3); + float32x4_t _pad_filter1_c4 = vld1q_f32(pad_filter1_c4); + float32x4_t _pad_filter1_c5 = vld1q_f32(pad_filter1_c5); + float32x4_t _pad_filter1_c6 = vld1q_f32(pad_filter1_c6); + float32x4_t _pad_filter1_c7 = vld1q_f32(pad_filter1_c7); + float32x4_t _pad_filter1_c8 = vld1q_f32(pad_filter1_c8); + + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + float32x4_t _sum1_c3 = vmulq_f32(_in_ptr1, _pad_filter1_c3); + float32x4_t _sum1_c4 = vmulq_f32(_in_ptr1, _pad_filter1_c4); + float32x4_t _sum1_c5 = vmulq_f32(_in_ptr1, _pad_filter1_c5); + float32x4_t _sum1_c6 = vmulq_f32(_in_ptr1, _pad_filter1_c6); + float32x4_t _sum1_c7 = vmulq_f32(_in_ptr1, _pad_filter1_c7); + float32x4_t _sum1_c8 = vmulq_f32(_in_ptr1, _pad_filter1_c8); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = 
vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + float32x4_t _pad_filter2_c3 = vld1q_f32(pad_filter2_c3); + float32x4_t _pad_filter2_c4 = vld1q_f32(pad_filter2_c4); + float32x4_t _pad_filter2_c5 = vld1q_f32(pad_filter2_c5); + float32x4_t _pad_filter2_c6 = vld1q_f32(pad_filter2_c6); + float32x4_t _pad_filter2_c7 = vld1q_f32(pad_filter2_c7); + float32x4_t _pad_filter2_c8 = vld1q_f32(pad_filter2_c8); + + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr2, _pad_filter2_c3); + _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr2, _pad_filter2_c4); + _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr2, _pad_filter2_c5); + _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr2, _pad_filter2_c6); + _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr2, _pad_filter2_c7); + _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr2, _pad_filter2_c8); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + float32x4_t _pad_filter3_c3 = vld1q_f32(pad_filter3_c3); + float32x4_t _pad_filter3_c4 = vld1q_f32(pad_filter3_c4); + float32x4_t _pad_filter3_c5 = vld1q_f32(pad_filter3_c5); + float32x4_t _pad_filter3_c6 = vld1q_f32(pad_filter3_c6); + float32x4_t _pad_filter3_c7 = vld1q_f32(pad_filter3_c7); + float32x4_t _pad_filter3_c8 = vld1q_f32(pad_filter3_c8); + + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr3, _pad_filter3_c3); + _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr3, _pad_filter3_c4); + _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr3, _pad_filter3_c5); + _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr3, _pad_filter3_c6); + _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr3, _pad_filter3_c7); + _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr3, _pad_filter3_c8); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + _sum1_c3 = vsetq_lane_f32(sum1_c3, _sum1_c3, 3); + _sum1_c4 = vsetq_lane_f32(sum1_c4, _sum1_c4, 3); + _sum1_c5 = vsetq_lane_f32(sum1_c5, _sum1_c5, 3); + _sum1_c6 = vsetq_lane_f32(sum1_c6, _sum1_c6, 3); + _sum1_c7 = vsetq_lane_f32(sum1_c7, _sum1_c7, 3); + _sum1_c8 = vsetq_lane_f32(sum1_c8, _sum1_c8, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ss1_3 = + vadd_f32(vget_low_f32(_sum1_c3), vget_high_f32(_sum1_c3)); + float32x2_t _ss1_4 = + vadd_f32(vget_low_f32(_sum1_c4), vget_high_f32(_sum1_c4)); + float32x2_t _ss1_5 = + vadd_f32(vget_low_f32(_sum1_c5), vget_high_f32(_sum1_c5)); + float32x2_t _ss1_6 = + vadd_f32(vget_low_f32(_sum1_c6), vget_high_f32(_sum1_c6)); + float32x2_t _ss1_7 = + vadd_f32(vget_low_f32(_sum1_c7), vget_high_f32(_sum1_c7)); + float32x2_t _ss1_8 = + vadd_f32(vget_low_f32(_sum1_c8), vget_high_f32(_sum1_c8)); + + float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); + float32x2_t _ssss1_3_ssss1_4 = vpadd_f32(_ss1_3, _ss1_4); + float32x2_t _ssss1_5_ssss1_6 = vpadd_f32(_ss1_5, _ss1_6); + float32x2_t _ssss1_7_ssss1_8 = vpadd_f32(_ss1_7, _ss1_8); + + sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); + sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); + sum1_c3 += vget_lane_f32(_ssss1_3_ssss1_4, 0); + sum1_c4 += vget_lane_f32(_ssss1_3_ssss1_4, 1); + sum1_c5 += vget_lane_f32(_ssss1_5_ssss1_6, 0); + sum1_c6 += vget_lane_f32(_ssss1_5_ssss1_6, 1); + sum1_c7 += vget_lane_f32(_ssss1_7_ssss1_8, 0); 
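+                  // Horizontal reduction: lane 3 of each _sum vector was
+                  // overwritten with the running scalar (still zero here), so
+                  // the vadd/vpadd pair collapses the three window products
+                  // per channel; lane 0 of _ssss1_7_ssss1_8 is channel c7's
+                  // sum (read above), lane 1 is channel c8's (read below).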
+ sum1_c8 += vget_lane_f32(_ssss1_7_ssss1_8, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; + + sum1_c3 += in_ptr1[0] * pad_filter1_c3[0]; + sum1_c3 += in_ptr1[1] * pad_filter1_c3[1]; + sum1_c3 += in_ptr1[2] * pad_filter1_c3[2]; + sum1_c3 += in_ptr2[0] * pad_filter2_c3[0]; + sum1_c3 += in_ptr2[1] * pad_filter2_c3[1]; + sum1_c3 += in_ptr2[2] * pad_filter2_c3[2]; + sum1_c3 += in_ptr3[0] * pad_filter3_c3[0]; + sum1_c3 += in_ptr3[1] * pad_filter3_c3[1]; + sum1_c3 += in_ptr3[2] * pad_filter3_c3[2]; + + sum1_c4 += in_ptr1[0] * pad_filter1_c4[0]; + sum1_c4 += in_ptr1[1] * pad_filter1_c4[1]; + sum1_c4 += in_ptr1[2] * pad_filter1_c4[2]; + sum1_c4 += in_ptr2[0] * pad_filter2_c4[0]; + sum1_c4 += in_ptr2[1] * pad_filter2_c4[1]; + sum1_c4 += in_ptr2[2] * pad_filter2_c4[2]; + sum1_c4 += in_ptr3[0] * pad_filter3_c4[0]; + sum1_c4 += in_ptr3[1] * pad_filter3_c4[1]; + sum1_c4 += in_ptr3[2] * pad_filter3_c4[2]; + + sum1_c5 += in_ptr1[0] * pad_filter1_c5[0]; + sum1_c5 += in_ptr1[1] * pad_filter1_c5[1]; + sum1_c5 += in_ptr1[2] * pad_filter1_c5[2]; + sum1_c5 += in_ptr2[0] * pad_filter2_c5[0]; + sum1_c5 += in_ptr2[1] * pad_filter2_c5[1]; + sum1_c5 += in_ptr2[2] * pad_filter2_c5[2]; + sum1_c5 += in_ptr3[0] * pad_filter3_c5[0]; + sum1_c5 += in_ptr3[1] * pad_filter3_c5[1]; + sum1_c5 += in_ptr3[2] * pad_filter3_c5[2]; + + sum1_c6 += in_ptr1[0] * pad_filter1_c6[0]; + sum1_c6 += in_ptr1[1] * pad_filter1_c6[1]; + sum1_c6 += in_ptr1[2] * pad_filter1_c6[2]; + sum1_c6 += in_ptr2[0] * pad_filter2_c6[0]; + sum1_c6 += in_ptr2[1] * pad_filter2_c6[1]; + sum1_c6 += in_ptr2[2] * pad_filter2_c6[2]; + sum1_c6 += in_ptr3[0] * pad_filter3_c6[0]; + sum1_c6 += in_ptr3[1] * pad_filter3_c6[1]; + sum1_c6 += in_ptr3[2] * pad_filter3_c6[2]; + + sum1_c7 += in_ptr1[0] * pad_filter1_c7[0]; + sum1_c7 += in_ptr1[1] * pad_filter1_c7[1]; + sum1_c7 += in_ptr1[2] * pad_filter1_c7[2]; + sum1_c7 += in_ptr2[0] * pad_filter2_c7[0]; + sum1_c7 += in_ptr2[1] * pad_filter2_c7[1]; + sum1_c7 += in_ptr2[2] * pad_filter2_c7[2]; + sum1_c7 += in_ptr3[0] * pad_filter3_c7[0]; + sum1_c7 += in_ptr3[1] * pad_filter3_c7[1]; + sum1_c7 += in_ptr3[2] * pad_filter3_c7[2]; + + sum1_c8 += in_ptr1[0] * pad_filter1_c8[0]; + sum1_c8 += in_ptr1[1] * pad_filter1_c8[1]; + sum1_c8 += in_ptr1[2] * pad_filter1_c8[2]; + sum1_c8 += in_ptr2[0] * pad_filter2_c8[0]; + sum1_c8 += in_ptr2[1] * pad_filter2_c8[1]; + sum1_c8 += in_ptr2[2] * pad_filter2_c8[2]; + sum1_c8 += in_ptr3[0] * pad_filter3_c8[0]; + sum1_c8 += in_ptr3[1] * pad_filter3_c8[1]; + sum1_c8 += in_ptr3[2] * pad_filter3_c8[2]; +#endif + if (if_nopadding) { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + + } else if (input_w > 3 && + (if_odd_pad_w && o_w == valid_w_start || + o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || + o_w == valid_w_end + 1 && !if_odd_pad_w && + !if_exact_in_w)) { + pad_filter1--; + 
pad_filter2--; + pad_filter3--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + + pad_filter1_c3--; + pad_filter2_c3--; + pad_filter3_c3--; + pad_filter1_c4--; + pad_filter2_c4--; + pad_filter3_c4--; + + pad_filter1_c5--; + pad_filter2_c5--; + pad_filter3_c5--; + pad_filter1_c6--; + pad_filter2_c6--; + pad_filter3_c6--; + + pad_filter1_c7--; + pad_filter2_c7--; + pad_filter3_c7--; + pad_filter1_c8--; + pad_filter2_c8--; + pad_filter3_c8--; + + in_ptr1++; + in_ptr2++; + in_ptr3++; + + } else if (input_w <= 3 || o_w < valid_w_start || + o_w > valid_w_end) { + pad_filter1 -= 2; + pad_filter2 -= 2; + pad_filter3 -= 2; + pad_filter1_c2 -= 2; + pad_filter2_c2 -= 2; + pad_filter3_c2 -= 2; + + pad_filter1_c3 -= 2; + pad_filter2_c3 -= 2; + pad_filter3_c3 -= 2; + pad_filter1_c4 -= 2; + pad_filter2_c4 -= 2; + pad_filter3_c4 -= 2; + + pad_filter1_c5 -= 2; + pad_filter2_c5 -= 2; + pad_filter3_c5 -= 2; + pad_filter1_c6 -= 2; + pad_filter2_c6 -= 2; + pad_filter3_c6 -= 2; + + pad_filter1_c7 -= 2; + pad_filter2_c7 -= 2; + pad_filter3_c7 -= 2; + pad_filter1_c8 -= 2; + pad_filter2_c8 -= 2; + pad_filter3_c8 -= 2; + } else { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } + *out_ptr1 += sum1; + *out_ptr1_c2 += sum1_c2; + *out_ptr1_c3 += sum1_c3; + *out_ptr1_c4 += sum1_c4; + *out_ptr1_c5 += sum1_c5; + *out_ptr1_c6 += sum1_c6; + *out_ptr1_c7 += sum1_c7; + *out_ptr1_c8 += sum1_c8; + + out_ptr1++; + out_ptr1_c2++; + out_ptr1_c3++; + out_ptr1_c4++; + out_ptr1_c5++; + out_ptr1_c6++; + out_ptr1_c7++; + out_ptr1_c8++; + } + // valid +#if __ARM_NEON +#if __aarch64__ + if (o_h > valid_h_start && o_h <= valid_h_end) { + int loop = (valid_w_end - valid_w_start - 1) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + + "prfm pldl1keep, [%[f1], #256] \n\t" + "prfm pldl1keep, [%[in_ptr1], #288] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + "ld2 {v4.4s, v5.4s}, [%[in_ptr1]], #32 \n\t" + "ld2 {v6.4s, v7.4s}, [%[in_ptr1]] \n\t" + "0: \n\t" + // load out_ptr + "prfm pldl1keep, [%[out_ptr1], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c3], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c4], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c5], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c6], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c7], #128] \n\t" + "prfm pldl1keep, [%[out_ptr1_c8], #128] \n\t" + + "ld1 {v8.4s}, [%[out_ptr1]] \n\t" + "ld1 {v9.4s}, [%[out_ptr1_c2]] \n\t" + "ld1 {v10.4s}, [%[out_ptr1_c3]] \n\t" + "ld1 {v11.4s}, [%[out_ptr1_c4]] \n\t" + "ld1 {v12.4s}, [%[out_ptr1_c5]] \n\t" + "ld1 {v13.4s}, [%[out_ptr1_c6]] \n\t" + "ld1 {v14.4s}, [%[out_ptr1_c7]] \n\t" + "ld1 {v15.4s}, [%[out_ptr1_c8]] \n\t" + + // in_ptr1 multiply + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" + "fmla v8.4s, v4.4s, v0.s[0] \n\t" + "fmla v9.4s, v4.4s, v0.s[1] \n\t" + "fmla v10.4s, v4.4s, v0.s[2] \n\t" + "fmla v11.4s, v4.4s, v0.s[3] \n\t" + + "fmla v12.4s, v4.4s, v1.s[0] \n\t" + "fmla v13.4s, v4.4s, v1.s[1] \n\t" + "fmla v14.4s, v4.4s, v1.s[2] \n\t" + "fmla v15.4s, v4.4s, v1.s[3] \n\t" + + "ext v7.16b, v4.16b, v6.16b, #4 \n\t" + "fmla v8.4s, v5.4s, v2.s[0] \n\t" + "fmla v9.4s, v5.4s, v2.s[1] \n\t" + "fmla v10.4s, v5.4s, v2.s[2] \n\t" + "fmla v11.4s, v5.4s, v2.s[3] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + "fmla v12.4s, v5.4s, v3.s[0] \n\t" + "fmla v13.4s, v5.4s, v3.s[1] \n\t" + "fmla v14.4s, v5.4s, v3.s[2] \n\t" + "fmla v15.4s, v5.4s, v3.s[3] \n\t" + + "prfm pldl1keep, [%[in_ptr2], #288] \n\t" + 
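+                  // Preload the second input row: ld2 de-interleaves even and
+                  // odd columns, so the stride-2 taps of the 3x3 filter become
+                  // plain vector multiplies on v4 (even) and v5 (odd).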
"ld2 {v4.4s, v5.4s}, [%[in_ptr2]], #32 \n\t" + "fmla v8.4s, v7.4s, v0.s[0] \n\t" + "fmla v9.4s, v7.4s, v0.s[1] \n\t" + "fmla v10.4s, v7.4s, v0.s[2] \n\t" + "fmla v11.4s, v7.4s, v0.s[3] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" + + "fmla v12.4s, v7.4s, v1.s[0] \n\t" + "fmla v13.4s, v7.4s, v1.s[1] \n\t" + "fmla v14.4s, v7.4s, v1.s[2] \n\t" + "fmla v15.4s, v7.4s, v1.s[3] \n\t" + + // in_ptr2 multiply + "ld2 {v6.4s, v7.4s}, [%[in_ptr2]] \n\t" + "fmla v8.4s, v4.4s, v2.s[0] \n\t" + "fmla v9.4s, v4.4s, v2.s[1] \n\t" + "fmla v10.4s, v4.4s, v2.s[2] \n\t" + "fmla v11.4s, v4.4s, v2.s[3] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + "fmla v12.4s, v4.4s, v3.s[0] \n\t" + "fmla v13.4s, v4.4s, v3.s[1] \n\t" + "fmla v14.4s, v4.4s, v3.s[2] \n\t" + "fmla v15.4s, v4.4s, v3.s[3] \n\t" + + "ext v7.16b, v4.16b, v6.16b, #4 \n\t" + "fmla v8.4s, v5.4s, v0.s[0] \n\t" + "fmla v9.4s, v5.4s, v0.s[1] \n\t" + "fmla v10.4s, v5.4s, v0.s[2] \n\t" + "fmla v11.4s, v5.4s, v0.s[3] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" + "fmla v12.4s, v5.4s, v1.s[0] \n\t" + "fmla v13.4s, v5.4s, v1.s[1] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "prfm pldl1keep, [%[in_ptr3], #288] \n\t" + "fmla v14.4s, v5.4s, v1.s[2] \n\t" + "fmla v15.4s, v5.4s, v1.s[3] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + "ld2 {v4.4s, v5.4s}, [%[in_ptr3]], #32 \n\t" + "fmla v8.4s, v7.4s, v2.s[0] \n\t" + "fmla v9.4s, v7.4s, v2.s[1] \n\t" + "fmla v10.4s, v7.4s, v2.s[2] \n\t" + "fmla v11.4s, v7.4s, v2.s[3] \n\t" + + "fmla v12.4s, v7.4s, v3.s[0] \n\t" + "fmla v13.4s, v7.4s, v3.s[1] \n\t" + "fmla v14.4s, v7.4s, v3.s[2] \n\t" + "fmla v15.4s, v7.4s, v3.s[3] \n\t" + + // in_ptr3 multiply + "ld2 {v6.4s, v7.4s}, [%[in_ptr3]] \n\t" + "fmla v8.4s, v4.4s, v0.s[0] \n\t" + "fmla v9.4s, v4.4s, v0.s[1] \n\t" + "fmla v10.4s, v4.4s, v0.s[2] \n\t" + "fmla v11.4s, v4.4s, v0.s[3] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" + "fmla v12.4s, v4.4s, v1.s[0] \n\t" + "fmla v13.4s, v4.4s, v1.s[1] \n\t" + "fmla v14.4s, v4.4s, v1.s[2] \n\t" + "fmla v15.4s, v4.4s, v1.s[3] \n\t" + + "ext v7.16b, v4.16b, v6.16b, #4 \n\t" + "fmla v8.4s, v5.4s, v2.s[0] \n\t" + "fmla v9.4s, v5.4s, v2.s[1] \n\t" + "fmla v10.4s, v5.4s, v2.s[2] \n\t" + "fmla v11.4s, v5.4s, v2.s[3] \n\t" + + "prfm pldl1keep, [%[f1], #256] \n\t" + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + "fmla v12.4s, v5.4s, v3.s[0] \n\t" + "fmla v13.4s, v5.4s, v3.s[1] \n\t" + "fmla v14.4s, v5.4s, v3.s[2] \n\t" + "fmla v15.4s, v5.4s, v3.s[3] \n\t" + + "sub %[f1], %[f1], #288 \n\t" + "fmla v8.4s, v7.4s, v0.s[0] \n\t" + "fmla v9.4s, v7.4s, v0.s[1] \n\t" + "fmla v10.4s, v7.4s, v0.s[2] \n\t" + "fmla v11.4s, v7.4s, v0.s[3] \n\t" + + "fmla v12.4s, v7.4s, v1.s[0] \n\t" + "fmla v13.4s, v7.4s, v1.s[1] \n\t" + "fmla v14.4s, v7.4s, v1.s[2] \n\t" + "fmla v15.4s, v7.4s, v1.s[3] \n\t" + + // store out_ptr + "prfm pldl1keep, [%[f1], #256] \n\t" + "prfm pldl1keep, [%[in_ptr1], #288] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" + + "ld2 {v4.4s, v5.4s}, [%[in_ptr1]], #32 \n\t" + "st1 {v8.4s}, [%[out_ptr1]], #16 \n\t" + "st1 {v9.4s}, [%[out_ptr1_c2]], #16 \n\t" + + "st1 {v10.4s}, [%[out_ptr1_c3]], #16 \n\t" + "st1 {v11.4s}, [%[out_ptr1_c4]], #16 \n\t" + + "st1 {v12.4s}, [%[out_ptr1_c5]], #16 \n\t" + "st1 {v13.4s}, [%[out_ptr1_c6]], #16 \n\t" + + "ld2 {v6.4s, v7.4s}, [%[in_ptr1]] \n\t" + "st1 {v14.4s}, [%[out_ptr1_c7]], #16 \n\t" + "subs %[loop], %[loop], #1 \n\t" + "st1 
{v15.4s}, [%[out_ptr1_c8]], #16 \n\t" + + // cycle + "bne 0b \n\t" + "sub %[f1], %[in_ptr1], #32 \n\t" + "sub %[in_ptr1], %[in_ptr1], #32 \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr1_c2] "+r"(out_ptr1_c2), + [out_ptr1_c3] "+r"(out_ptr1_c3), + [out_ptr1_c4] "+r"(out_ptr1_c4), + [out_ptr1_c5] "+r"(out_ptr1_c5), + [out_ptr1_c6] "+r"(out_ptr1_c6), + [out_ptr1_c7] "+r"(out_ptr1_c7), + [out_ptr1_c8] "+r"(out_ptr1_c8), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3) + : [f1] "r"(f1) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + } + } +#else + if (o_h > valid_h_start && o_h <= valid_h_end) { + int loop = (valid_w_end - valid_w_start - 1) >> 2; + o_w += loop * 4; + int in_stride = (input_w - 8) * 4; + + if (loop > 0) { + asm volatile( + + "pld [%[f1], #256] \n\t" + "pld [%[in_ptr1], #288] \n\t" + + "vld1.f32 {d0-d3}, [%[f1]]! \n\t" + "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" + "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], %[in_stride] \n\t" + + "0: \n\t" + // load out_ptr + "pld [%[out_ptr1], #128] \n\t" + "pld [%[out_ptr1_c2], #128] \n\t" + "pld [%[out_ptr1_c3], #128] \n\t" + "pld [%[out_ptr1_c4], #128] \n\t" + "pld [%[out_ptr1_c5], #128] \n\t" + "pld [%[out_ptr1_c6], #128] \n\t" + "pld [%[out_ptr1_c7], #128] \n\t" + "pld [%[out_ptr1_c8], #128] \n\t" + + "vld1.f32 {d16, d17}, [%[out_ptr1]] \n\t" + "vld1.f32 {d18, d19}, [%[out_ptr1_c2]] \n\t" + "vld1.f32 {d20, d21}, [%[out_ptr1_c3]] \n\t" + "vld1.f32 {d22, d23}, [%[out_ptr1_c4]] \n\t" + "vld1.f32 {d24, d25}, [%[out_ptr1_c5]] \n\t" + "vld1.f32 {d26, d27}, [%[out_ptr1_c6]] \n\t" + "vld1.f32 {d28, d29}, [%[out_ptr1_c7]] \n\t" + "vld1.f32 {d30, d31}, [%[out_ptr1_c8]] \n\t" + + // in_ptr1 multiply + "pld [%[f1], #256] \n\t" + "vld1.f32 {d4-d7}, [%[f1]]! \n\t" + "vmla.f32 q8, q4, d0[0] \n\t" + "vmla.f32 q9, q4, d0[1] \n\t" + + "vmla.f32 q10, q4, d1[0] \n\t" + "vmla.f32 q11, q4, d1[1] \n\t" + + "vmla.f32 q12, q4, d2[0] \n\t" + "vmla.f32 q13, q4, d2[1] \n\t" + + "pld [%[f1], #256] \n\t" + "vmla.f32 q14, q4, d3[0] \n\t" + "vmla.f32 q15, q4, d3[1] \n\t" + + "vld1.f32 {d0-d3}, [%[f1]]! \n\t" + "vmla.f32 q8, q5, d4[0] \n\t" + "vmla.f32 q9, q5, d4[1] \n\t" + + "vext.32 q7, q4, q6, #1 \n\t" + "vmla.f32 q10, q5, d5[0] \n\t" + "vmla.f32 q11, q5, d5[1] \n\t" + + "vmla.f32 q12, q5, d6[0] \n\t" + "vmla.f32 q13, q5, d6[1] \n\t" + + "pld [%[in_ptr1], #288] \n\t" + "vmla.f32 q14, q5, d7[0] \n\t" + "vmla.f32 q15, q5, d7[1] \n\t" + + "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" + "vmla.f32 q8, q7, d0[0] \n\t" + "vmla.f32 q9, q7, d0[1] \n\t" + + "pld [%[f1], #256] \n\t" + "vld1.f32 {d4-d7}, [%[f1]]! \n\t" + "vmla.f32 q10, q7, d1[0] \n\t" + "vmla.f32 q11, q7, d1[1] \n\t" + + "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], %[in_stride] \n\t" + "vmla.f32 q12, q7, d2[0] \n\t" + "vmla.f32 q13, q7, d2[1] \n\t" + + "pld [%[f1], #256] \n\t" + "vmla.f32 q14, q7, d3[0] \n\t" + "vmla.f32 q15, q7, d3[1] \n\t" + + // in_ptr2 multiply + "vld1.f32 {d0-d3}, [%[f1]]! \n\t" + "vmla.f32 q8, q4, d4[0] \n\t" + "vmla.f32 q9, q4, d4[1] \n\t" + + "vmla.f32 q10, q4, d5[0] \n\t" + "vmla.f32 q11, q4, d5[1] \n\t" + + "vmla.f32 q12, q4, d6[0] \n\t" + "vmla.f32 q13, q4, d6[1] \n\t" + + "pld [%[f1], #256] \n\t" + "vmla.f32 q14, q4, d7[0] \n\t" + "vmla.f32 q15, q4, d7[1] \n\t" + + "vld1.f32 {d4-d7}, [%[f1]]! 
\n\t" + "vmla.f32 q8, q5, d0[0] \n\t" + "vmla.f32 q9, q5, d0[1] \n\t" + + "vext.32 q7, q4, q6, #1 \n\t" + "vmla.f32 q10, q5, d1[0] \n\t" + "vmla.f32 q11, q5, d1[1] \n\t" + + "vmla.f32 q12, q5, d2[0] \n\t" + "vmla.f32 q13, q5, d2[1] \n\t" + + "pld [%[in_ptr1], #288] \n\t" + "vmla.f32 q14, q5, d3[0] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" + "vmla.f32 q8, q7, d4[0] \n\t" + "vmla.f32 q9, q7, d4[1] \n\t" + + "pld [%[f1], #256] \n\t" + "vld1.f32 {d0-d3}, [%[f1]]! \n\t" + "vmla.f32 q10, q7, d5[0] \n\t" + "vmla.f32 q11, q7, d5[1] \n\t" + + "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" + "sub %[in_ptr1], %[in_stride] \n\t" + "sub %[in_ptr1], %[in_stride] \n\t" + "vmla.f32 q12, q7, d6[0] \n\t" + "vmla.f32 q13, q7, d6[1] \n\t" + + "sub %[in_ptr1], #64 \n\t" + "pld [%[f1], #256] \n\t" + "vmla.f32 q14, q7, d7[0] \n\t" + "vmla.f32 q15, q7, d7[1] \n\t" + + // in_ptr3 multiply + "vld1.f32 {d4-d7}, [%[f1]]! \n\t" + "vmla.f32 q8, q4, d0[0] \n\t" + "vmla.f32 q9, q4, d0[1] \n\t" + + "vmla.f32 q10, q4, d1[0] \n\t" + "vmla.f32 q11, q4, d1[1] \n\t" + + "vmla.f32 q12, q4, d2[0] \n\t" + "vmla.f32 q13, q4, d2[1] \n\t" + + "pld [%[f1], #256] \n\t" + "vmla.f32 q14, q4, d3[0] \n\t" + "vmla.f32 q15, q4, d3[1] \n\t" + + "vld1.f32 {d0-d3}, [%[f1]]! \n\t" + "vmla.f32 q8, q5, d4[0] \n\t" + "vmla.f32 q9, q5, d4[1] \n\t" + + "vext.32 q7, q4, q6, #1 \n\t" + "vmla.f32 q10, q5, d5[0] \n\t" + "vmla.f32 q11, q5, d5[1] \n\t" + + "vmla.f32 q12, q5, d6[0] \n\t" + "vmla.f32 q13, q5, d6[1] \n\t" + + "vmla.f32 q14, q5, d7[0] \n\t" + "vmla.f32 q15, q5, d7[1] \n\t" + + "sub %[f1], %[f1], #288 \n\t" + "vmla.f32 q8, q7, d0[0] \n\t" + "vmla.f32 q9, q7, d0[1] \n\t" + + "vmla.f32 q10, q7, d1[0] \n\t" + "vmla.f32 q11, q7, d1[1] \n\t" + + "vmla.f32 q12, q7, d2[0] \n\t" + "vmla.f32 q13, q7, d2[1] \n\t" + + "vmla.f32 q14, q7, d3[0] \n\t" + "vmla.f32 q15, q7, d3[1] \n\t" + + // store out_ptr + "pld [%[f1], #256] \n\t" + "vld1.f32 {d0-d3}, [%[f1]]! \n\t" + + "pld [%[in_ptr1], #288] \n\t" + "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" + "vst1.f32 {d16, d17}, [%[out_ptr1]]! \n\t" + "vst1.f32 {d18, d19}, [%[out_ptr1_c2]]! \n\t" + + "vst1.f32 {d20, d21}, [%[out_ptr1_c3]]! \n\t" + "vst1.f32 {d22, d23}, [%[out_ptr1_c4]]! \n\t" + + "vst1.f32 {d24, d25}, [%[out_ptr1_c5]]! \n\t" + "vst1.f32 {d26, d27}, [%[out_ptr1_c6]]! \n\t" + + "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" + "add %[in_ptr1], %[in_stride] \n\t" + "vst1.f32 {d28, d29}, [%[out_ptr1_c7]]! \n\t" + + "subs %[loop], #1 \n\t" + "vst1.f32 {d30, d31}, [%[out_ptr1_c8]]! 
\n\t" + + // cycle + "bne 0b \n\t" + "sub %[f1], %[f1], #32 \n\t" + "sub %[in_ptr1], %[in_ptr1], #32 \n\t" + "sub %[in_ptr1], %[in_stride] \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [out_ptr1_c2] "+r"(out_ptr1_c2), + [out_ptr1_c3] "+r"(out_ptr1_c3), + [out_ptr1_c4] "+r"(out_ptr1_c4), + [out_ptr1_c5] "+r"(out_ptr1_c5), + [out_ptr1_c6] "+r"(out_ptr1_c6), + [out_ptr1_c7] "+r"(out_ptr1_c7), + [out_ptr1_c8] "+r"(out_ptr1_c8), [in_ptr1] "+r"(in_ptr1) + : [f1] "r"(f1), [in_stride] "r"(in_stride) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + + in_ptr2 = in_ptr1 + input_w; + in_ptr3 = in_ptr2 + input_w; + } + } +#endif //__aarch64__ +#endif // __ARM_NEON + + // remain output_width + for (; o_w < output_w; ++o_w) { + float sum1 = 0; + float sum1_c2 = 0; + float sum1_c3 = 0; + float sum1_c4 = 0; + float sum1_c5 = 0; + float sum1_c6 = 0; + float sum1_c7 = 0; + float sum1_c8 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); + float32x4_t _pad_filter1_c3 = vld1q_f32(pad_filter1_c3); + float32x4_t _pad_filter1_c4 = vld1q_f32(pad_filter1_c4); + float32x4_t _pad_filter1_c5 = vld1q_f32(pad_filter1_c5); + float32x4_t _pad_filter1_c6 = vld1q_f32(pad_filter1_c6); + float32x4_t _pad_filter1_c7 = vld1q_f32(pad_filter1_c7); + float32x4_t _pad_filter1_c8 = vld1q_f32(pad_filter1_c8); + + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); + float32x4_t _sum1_c3 = vmulq_f32(_in_ptr1, _pad_filter1_c3); + float32x4_t _sum1_c4 = vmulq_f32(_in_ptr1, _pad_filter1_c4); + float32x4_t _sum1_c5 = vmulq_f32(_in_ptr1, _pad_filter1_c5); + float32x4_t _sum1_c6 = vmulq_f32(_in_ptr1, _pad_filter1_c6); + float32x4_t _sum1_c7 = vmulq_f32(_in_ptr1, _pad_filter1_c7); + float32x4_t _sum1_c8 = vmulq_f32(_in_ptr1, _pad_filter1_c8); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); + float32x4_t _pad_filter2_c3 = vld1q_f32(pad_filter2_c3); + float32x4_t _pad_filter2_c4 = vld1q_f32(pad_filter2_c4); + float32x4_t _pad_filter2_c5 = vld1q_f32(pad_filter2_c5); + float32x4_t _pad_filter2_c6 = vld1q_f32(pad_filter2_c6); + float32x4_t _pad_filter2_c7 = vld1q_f32(pad_filter2_c7); + float32x4_t _pad_filter2_c8 = vld1q_f32(pad_filter2_c8); + + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); + _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr2, _pad_filter2_c3); + _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr2, _pad_filter2_c4); + _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr2, _pad_filter2_c5); + _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr2, _pad_filter2_c6); + _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr2, _pad_filter2_c7); + _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr2, _pad_filter2_c8); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); + float32x4_t _pad_filter3_c3 = vld1q_f32(pad_filter3_c3); + float32x4_t _pad_filter3_c4 = vld1q_f32(pad_filter3_c4); + float32x4_t _pad_filter3_c5 = vld1q_f32(pad_filter3_c5); + float32x4_t _pad_filter3_c6 = vld1q_f32(pad_filter3_c6); + float32x4_t _pad_filter3_c7 = vld1q_f32(pad_filter3_c7); + float32x4_t _pad_filter3_c8 = vld1q_f32(pad_filter3_c8); + + _sum1 = vmlaq_f32(_sum1, _in_ptr3, 
_pad_filter3); + _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); + _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr3, _pad_filter3_c3); + _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr3, _pad_filter3_c4); + _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr3, _pad_filter3_c5); + _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr3, _pad_filter3_c6); + _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr3, _pad_filter3_c7); + _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr3, _pad_filter3_c8); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); + _sum1_c3 = vsetq_lane_f32(sum1_c3, _sum1_c3, 3); + _sum1_c4 = vsetq_lane_f32(sum1_c4, _sum1_c4, 3); + _sum1_c5 = vsetq_lane_f32(sum1_c5, _sum1_c5, 3); + _sum1_c6 = vsetq_lane_f32(sum1_c6, _sum1_c6, 3); + _sum1_c7 = vsetq_lane_f32(sum1_c7, _sum1_c7, 3); + _sum1_c8 = vsetq_lane_f32(sum1_c8, _sum1_c8, 3); + + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss1_2 = + vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); + float32x2_t _ss1_3 = + vadd_f32(vget_low_f32(_sum1_c3), vget_high_f32(_sum1_c3)); + float32x2_t _ss1_4 = + vadd_f32(vget_low_f32(_sum1_c4), vget_high_f32(_sum1_c4)); + float32x2_t _ss1_5 = + vadd_f32(vget_low_f32(_sum1_c5), vget_high_f32(_sum1_c5)); + float32x2_t _ss1_6 = + vadd_f32(vget_low_f32(_sum1_c6), vget_high_f32(_sum1_c6)); + float32x2_t _ss1_7 = + vadd_f32(vget_low_f32(_sum1_c7), vget_high_f32(_sum1_c7)); + float32x2_t _ss1_8 = + vadd_f32(vget_low_f32(_sum1_c8), vget_high_f32(_sum1_c8)); + + float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); + float32x2_t _ssss1_3_ssss1_4 = vpadd_f32(_ss1_3, _ss1_4); + float32x2_t _ssss1_5_ssss1_6 = vpadd_f32(_ss1_5, _ss1_6); + float32x2_t _ssss1_7_ssss1_8 = vpadd_f32(_ss1_7, _ss1_8); + + sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); + sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); + sum1_c3 += vget_lane_f32(_ssss1_3_ssss1_4, 0); + sum1_c4 += vget_lane_f32(_ssss1_3_ssss1_4, 1); + sum1_c5 += vget_lane_f32(_ssss1_5_ssss1_6, 0); + sum1_c6 += vget_lane_f32(_ssss1_5_ssss1_6, 1); + sum1_c7 += vget_lane_f32(_ssss1_7_ssss1_8, 0); + sum1_c8 += vget_lane_f32(_ssss1_7_ssss1_8, 1); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; + + sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; + sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; + sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; + sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; + sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; + sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; + sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; + sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; + sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; + + sum1_c3 += in_ptr1[0] * pad_filter1_c3[0]; + sum1_c3 += in_ptr1[1] * pad_filter1_c3[1]; + sum1_c3 += in_ptr1[2] * pad_filter1_c3[2]; + sum1_c3 += in_ptr2[0] * pad_filter2_c3[0]; + sum1_c3 += in_ptr2[1] * pad_filter2_c3[1]; + sum1_c3 += in_ptr2[2] * pad_filter2_c3[2]; + sum1_c3 += in_ptr3[0] * pad_filter3_c3[0]; + sum1_c3 += in_ptr3[1] * pad_filter3_c3[1]; + sum1_c3 += in_ptr3[2] * pad_filter3_c3[2]; + + sum1_c4 += in_ptr1[0] * pad_filter1_c4[0]; + sum1_c4 += in_ptr1[1] * pad_filter1_c4[1]; + sum1_c4 += in_ptr1[2] * pad_filter1_c4[2]; + sum1_c4 += in_ptr2[0] * pad_filter2_c4[0]; + sum1_c4 += in_ptr2[1] * pad_filter2_c4[1]; + sum1_c4 += 
in_ptr2[2] * pad_filter2_c4[2]; + sum1_c4 += in_ptr3[0] * pad_filter3_c4[0]; + sum1_c4 += in_ptr3[1] * pad_filter3_c4[1]; + sum1_c4 += in_ptr3[2] * pad_filter3_c4[2]; + + sum1_c5 += in_ptr1[0] * pad_filter1_c5[0]; + sum1_c5 += in_ptr1[1] * pad_filter1_c5[1]; + sum1_c5 += in_ptr1[2] * pad_filter1_c5[2]; + sum1_c5 += in_ptr2[0] * pad_filter2_c5[0]; + sum1_c5 += in_ptr2[1] * pad_filter2_c5[1]; + sum1_c5 += in_ptr2[2] * pad_filter2_c5[2]; + sum1_c5 += in_ptr3[0] * pad_filter3_c5[0]; + sum1_c5 += in_ptr3[1] * pad_filter3_c5[1]; + sum1_c5 += in_ptr3[2] * pad_filter3_c5[2]; + + sum1_c6 += in_ptr1[0] * pad_filter1_c6[0]; + sum1_c6 += in_ptr1[1] * pad_filter1_c6[1]; + sum1_c6 += in_ptr1[2] * pad_filter1_c6[2]; + sum1_c6 += in_ptr2[0] * pad_filter2_c6[0]; + sum1_c6 += in_ptr2[1] * pad_filter2_c6[1]; + sum1_c6 += in_ptr2[2] * pad_filter2_c6[2]; + sum1_c6 += in_ptr3[0] * pad_filter3_c6[0]; + sum1_c6 += in_ptr3[1] * pad_filter3_c6[1]; + sum1_c6 += in_ptr3[2] * pad_filter3_c6[2]; + + sum1_c7 += in_ptr1[0] * pad_filter1_c7[0]; + sum1_c7 += in_ptr1[1] * pad_filter1_c7[1]; + sum1_c7 += in_ptr1[2] * pad_filter1_c7[2]; + sum1_c7 += in_ptr2[0] * pad_filter2_c7[0]; + sum1_c7 += in_ptr2[1] * pad_filter2_c7[1]; + sum1_c7 += in_ptr2[2] * pad_filter2_c7[2]; + sum1_c7 += in_ptr3[0] * pad_filter3_c7[0]; + sum1_c7 += in_ptr3[1] * pad_filter3_c7[1]; + sum1_c7 += in_ptr3[2] * pad_filter3_c7[2]; + + sum1_c8 += in_ptr1[0] * pad_filter1_c8[0]; + sum1_c8 += in_ptr1[1] * pad_filter1_c8[1]; + sum1_c8 += in_ptr1[2] * pad_filter1_c8[2]; + sum1_c8 += in_ptr2[0] * pad_filter2_c8[0]; + sum1_c8 += in_ptr2[1] * pad_filter2_c8[1]; + sum1_c8 += in_ptr2[2] * pad_filter2_c8[2]; + sum1_c8 += in_ptr3[0] * pad_filter3_c8[0]; + sum1_c8 += in_ptr3[1] * pad_filter3_c8[1]; + sum1_c8 += in_ptr3[2] * pad_filter3_c8[2]; +#endif + if (if_nopadding) { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } else if (input_w > 3 && + (if_odd_pad_w && o_w == valid_w_start || + o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || + o_w == valid_w_end + 1 && !if_odd_pad_w && + !if_exact_in_w)) { + pad_filter1--; + pad_filter2--; + pad_filter3--; + pad_filter1_c2--; + pad_filter2_c2--; + pad_filter3_c2--; + + pad_filter1_c3--; + pad_filter2_c3--; + pad_filter3_c3--; + pad_filter1_c4--; + pad_filter2_c4--; + pad_filter3_c4--; + + pad_filter1_c5--; + pad_filter2_c5--; + pad_filter3_c5--; + pad_filter1_c6--; + pad_filter2_c6--; + pad_filter3_c6--; + + pad_filter1_c7--; + pad_filter2_c7--; + pad_filter3_c7--; + pad_filter1_c8--; + pad_filter2_c8--; + pad_filter3_c8--; + + in_ptr1++; + in_ptr2++; + in_ptr3++; + } else if (input_w <= 3 || o_w < valid_w_start || + o_w > valid_w_end) { + pad_filter1 -= 2; + pad_filter2 -= 2; + pad_filter3 -= 2; + pad_filter1_c2 -= 2; + pad_filter2_c2 -= 2; + pad_filter3_c2 -= 2; + + pad_filter1_c3 -= 2; + pad_filter2_c3 -= 2; + pad_filter3_c3 -= 2; + pad_filter1_c4 -= 2; + pad_filter2_c4 -= 2; + pad_filter3_c4 -= 2; + + pad_filter1_c5 -= 2; + pad_filter2_c5 -= 2; + pad_filter3_c5 -= 2; + pad_filter1_c6 -= 2; + pad_filter2_c6 -= 2; + pad_filter3_c6 -= 2; + + pad_filter1_c7 -= 2; + pad_filter2_c7 -= 2; + pad_filter3_c7 -= 2; + pad_filter1_c8 -= 2; + pad_filter2_c8 -= 2; + pad_filter3_c8 -= 2; + } else { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } + *out_ptr1 += sum1; + *out_ptr1_c2 += sum1_c2; + *out_ptr1_c3 += sum1_c3; + *out_ptr1_c4 += sum1_c4; + *out_ptr1_c5 += sum1_c5; + *out_ptr1_c6 += sum1_c6; + *out_ptr1_c7 += sum1_c7; + *out_ptr1_c8 += sum1_c8; + + out_ptr1++; + out_ptr1_c2++; + out_ptr1_c3++; + out_ptr1_c4++; + 
out_ptr1_c5++; + out_ptr1_c6++; + out_ptr1_c7++; + out_ptr1_c8++; + } + if (if_nopadding) { + in_ptr1 += remain_stride_w + input_w; + in_ptr2 += remain_stride_w + input_w; + in_ptr3 += remain_stride_w + input_w; + + } else if (input_h > 3 && + (if_odd_pad_h && o_h == valid_h_start || + o_h == valid_h_end && if_odd_pad_h && if_exact_in_h || + o_h == valid_h_end + 1 && !if_odd_pad_h && + !if_exact_in_h)) { + in_ptr1 += 3; + in_ptr2 += 3; + in_ptr3 += 3; + + pad_filter1 -= remain_stride_w; + pad_filter2 -= remain_stride_w; + pad_filter3 -= remain_stride_w; + pad_filter1_c2 -= remain_stride_w; + pad_filter2_c2 -= remain_stride_w; + pad_filter3_c2 -= remain_stride_w; + + pad_filter1_c3 -= remain_stride_w; + pad_filter2_c3 -= remain_stride_w; + pad_filter3_c3 -= remain_stride_w; + pad_filter1_c4 -= remain_stride_w; + pad_filter2_c4 -= remain_stride_w; + pad_filter3_c4 -= remain_stride_w; + + pad_filter1_c5 -= remain_stride_w; + pad_filter2_c5 -= remain_stride_w; + pad_filter3_c5 -= remain_stride_w; + pad_filter1_c6 -= remain_stride_w; + pad_filter2_c6 -= remain_stride_w; + pad_filter3_c6 -= remain_stride_w; + + pad_filter1_c7 -= remain_stride_w; + pad_filter2_c7 -= remain_stride_w; + pad_filter3_c7 -= remain_stride_w; + pad_filter1_c8 -= remain_stride_w; + pad_filter2_c8 -= remain_stride_w; + pad_filter3_c8 -= remain_stride_w; + } else if (input_h <= 3 || o_h < valid_h_start || o_h > valid_h_end) { + in_ptr1 -= input_w - 3; + in_ptr2 -= input_w - 3; + in_ptr3 -= input_w - 3; + + pad_filter1 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter1_c2 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c2 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c2 -= 3 + 2 * padding_w + remain_stride_w; + + pad_filter1_c3 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c3 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c3 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter1_c4 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c4 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c4 -= 3 + 2 * padding_w + remain_stride_w; + + pad_filter1_c5 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c5 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c5 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter1_c6 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c6 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c6 -= 3 + 2 * padding_w + remain_stride_w; + + pad_filter1_c7 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c7 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c7 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter1_c8 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter2_c8 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3_c8 -= 3 + 2 * padding_w + remain_stride_w; + } else { + pad_filter1 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3 += 3 + 2 * padding_w - remain_stride_w; + pad_filter1_c2 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c2 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c2 += 3 + 2 * padding_w - remain_stride_w; + + pad_filter1_c3 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c3 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c3 += 3 + 2 * padding_w - remain_stride_w; + pad_filter1_c4 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c4 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c4 += 3 + 2 * padding_w - remain_stride_w; + + 
pad_filter1_c5 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c5 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c5 += 3 + 2 * padding_w - remain_stride_w; + pad_filter1_c6 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c6 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c6 += 3 + 2 * padding_w - remain_stride_w; + + pad_filter1_c7 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c7 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c7 += 3 + 2 * padding_w - remain_stride_w; + pad_filter1_c8 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2_c8 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3_c8 += 3 + 2 * padding_w - remain_stride_w; + + in_ptr1 += input_w + 3; + in_ptr2 += input_w + 3; + in_ptr3 += input_w + 3; + } + } + + filter_data_ch += filter_ch_size; + filter_data_ch_c2 += filter_ch_size; + filter_data_ch_c3 += filter_ch_size; + filter_data_ch_c4 += filter_ch_size; + filter_data_ch_c5 += filter_ch_size; + filter_data_ch_c6 += filter_ch_size; + filter_data_ch_c7 += filter_ch_size; + filter_data_ch_c8 += filter_ch_size; + input_data_ch += in_ch_size; + } + } + + int out_ch_remain_start = output_ch - output_ch % 8; + + // remain output_channel +#pragma omp parallel for + for (int o_c = out_ch_remain_start; o_c < output_ch; ++o_c) { + const float *f1, *f9; + const float *in_ptr1, *in_ptr2, *in_ptr3; + const float *pad_filter1, *pad_filter2, *pad_filter3; + float pad_filter_arr[pad_filter_ch_size]; + float *output_data_ch; + const float *input_data_ch; + const float *filter_data_ch; + + filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; + input_data_ch = input_data; + output_data_ch = output_data + o_c * out_ch_size; + + for (int i_c = 0; i_c < input_ch; ++i_c) { + f1 = filter_data_ch; + f9 = f1 + 8; + + if (!if_nopadding) { + memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); + for (int i = 0; i < 9; ++i) { + int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + + padding_w * (2 * padding_h + 1); + pad_filter_arr[j] = filter_data_ch[i]; + } + pad_filter1 = pad_filter_arr; + pad_filter1 += pad_filter_start; + pad_filter2 = pad_filter1 + pad_filter_w; + pad_filter3 = pad_filter2 + pad_filter_w; + } else { + pad_filter1 = filter_data_ch; + pad_filter2 = pad_filter1 + 3; + pad_filter3 = pad_filter2 + 3; + } + + float *out_ptr1; + out_ptr1 = output_data_ch; + in_ptr1 = input_data_ch; + in_ptr2 = in_ptr1 + input_w; + in_ptr3 = in_ptr2 + input_w; + + int o_h = 0; + for (; o_h < output_h; ++o_h) { + int o_w = 0; + + // pad left + for (; o_w <= valid_w_start; ++o_w) { + float sum1 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1); + sum1 += vget_lane_f32(_ssss1_ssss1, 0); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * 
pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; +#endif + if (if_nopadding) { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } else if (input_w > 3 && + (if_odd_pad_w && o_w == valid_w_start || + o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || + o_w == valid_w_end + 1 && !if_odd_pad_w && + !if_exact_in_w)) { + pad_filter1--; + pad_filter2--; + pad_filter3--; + in_ptr1++; + in_ptr2++; + in_ptr3++; + + } else if (input_w <= 3 || o_w < valid_w_start || + o_w > valid_w_end) { + pad_filter1 -= 2; + pad_filter2 -= 2; + pad_filter3 -= 2; + } else { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } + *out_ptr1 += sum1; + out_ptr1++; + } + // valid +#if __ARM_NEON +#if __aarch64__ + if (o_h > valid_h_start && o_h < valid_h_end) { + int loop = (valid_w_end - valid_w_start - 1) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "prfm pldl1keep, [%[f1], #256] \n\t" + "prfm pldl1keep, [%[f9], #256] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[f1]] \n\t" + "ld1 {v4.s}[0], [%[f9]] \n\t" + + "0: \n\t" + // load out_ptr + "prfm pldl1keep, [%[out_ptr1], #128] \n\t" + "ld1 {v12.4s}, [%[out_ptr1]] \n\t" + + // in_ptr1 multiply + "prfm pldl1keep, [%[in_ptr1], #256] \n\t" + "ld2 {v5.4s, v6.4s}, [%[in_ptr1]], #32 \n\t" + "ld2 {v7.4s, v8.4s}, [%[in_ptr1]] \n\t" + + "fmla v12.4s, v5.4s, v0.s[0] \n\t" + "fmla v14.4s, v5.4s, v2.s[0] \n\t" + + "ext v8.16b, v5.16b, v7.16b, #4 \n\t" + "fmul v13.4s, v6.4s, v0.s[1] \n\t" + "fmla v12.4s, v8.4s, v0.s[2] \n\t" + + "ld2 {v5.4s, v6.4s}, [%[in_ptr2]], #32 \n\t" + "ld2 {v7.4s, v8.4s}, [%[in_ptr2]] \n\t" + + // in_ptr2 multiply + "fmla v13.4s, v5.4s, v0.s[3] \n\t" + "ext v8.16b, v5.16b, v7.16b, #4 \n\t" + "fmla v12.4s, v6.4s, v1.s[0] \n\t" + + "fmla v13.4s, v8.4s, v1.s[1] \n\t" + "ld2 {v5.4s, v6.4s}, [%[in_ptr3]], #32 \n\t" + "ld2 {v7.4s, v8.4s}, [%[in_ptr3]] \n\t" + + // in_ptr3 multiply + "fmla v12.4s, v5.4s, v1.s[2] \n\t" + "ext v8.16b, v5.16b, v7.16b, #4 \n\t" + + "fmla v13.4s, v6.4s, v1.s[3] \n\t" + "fmla v12.4s, v8.4s, v4.s[0] \n\t" + + // store out_ptr + "fadd v12.4s, v12.4s, v13.4s \n\t" + "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" + + // cycle + "subs %[loop], %[loop], #1 \n\t" + "bne 0b \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [in_ptr1] "+r"(in_ptr1), [in_ptr2] "+r"(in_ptr2), + [in_ptr3] "+r"(in_ptr3) + : [f1] "r"(f1), [f9] "r"(f9) + : "cc", "memory", "v0", "v1", "v4", "v5", "v6", "v7", "v8", + "v12", "v13"); + } + } +#else + if (o_h > valid_h_start && o_h < valid_h_end) { + int loop = (valid_w_end - valid_w_start - 1) >> 2; + o_w += loop * 4; + + if (loop > 0) { + asm volatile( + "pld [%[f1], #256] \n\t" + "pld [%[f9], #256] \n\t" + + "vld1.f32 {d0-d3}, [%[f1]] \n\t" + "vld1.f32 {d8[0]}, [%[f9]] \n\t" + + "pld [%[in_ptr1], #256] \n\t" + "vld2.f32 {d10-d13}, [%[in_ptr1]]! \n\t" + "vld2.f32 {d14, d15}, [%[in_ptr1]] \n\t" + + "0: \n\t" + // load out_ptr + "pld [%[out_ptr1], #128] \n\t" + "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" + + // in_ptr1 multiply + "pld [%[in_ptr2], #256] \n\t" + "vld2.f32 {d4-d7}, [%[in_ptr2]]! \n\t" + + "vmla.f32 q12, q5, d0[0] \n\t" + "vld2.f32 {d20, d21}, [%[in_ptr2]] \n\t" + "vext.32 q8, q5, q7, #1 \n\t" + + "pld [%[in_ptr3], #256] \n\t" + "vmul.f32 q13, q6, d0[1] \n\t" + + "vld2.f32 {d10-d13}, [%[in_ptr3]]! 
\n\t" + "vmul.f32 q14, q8, d1[0] \n\t" + "vld2.f32 {d14, d15}, [%[in_ptr3]] \n\t" + + // in_ptr2 multiply + "vmul.f32 q15, q2, d1[1] \n\t" + "vext.32 q8, q2, q10, #1 \n\t" + + "vmla.f32 q12, q3, d2[0] \n\t" + "vmla.f32 q13, q8, d2[1] \n\t" + + // in_ptr3 multiply + "vmla.f32 q14, q5, d3[0] \n\t" + "vext.32 q8, q5, q7, #1 \n\t" + + "pld [%[in_ptr1], #256] \n\t" + "vmla.f32 q15, q6, d3[1] \n\t" + + "vld2.f32 {d10-d13}, [%[in_ptr1]]! \n\t" + "vmla.f32 q13, q8, d8[0] \n\t" + + // store out_ptr + "vld2.f32 {d14, d15}, [%[in_ptr1]] \n\t" + "vadd.f32 q12, q12, q13 \n\t" + "subs %[loop], #1 \n\t" + + "vadd.f32 q14, q14, q15 \n\t" + "vadd.f32 q12, q12, q14 \n\t" + "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" + + // cycle + "bne 0b \n\t" + "subs %[in_ptr1], %[in_ptr1], #32 \n\t" + + : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), + [in_ptr1] "+r"(in_ptr1), [in_ptr2] "+r"(in_ptr2), + [in_ptr3] "+r"(in_ptr3) + : [f1] "r"(f1), [f9] "r"(f9) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q10", "q12", "q13", "q14", "q15"); + } + } +#endif //__aarch64__ +#endif // __ARM_NEON + out_ptr1 -= 4; + out_ptr1 += 4; + + // remain output_width + for (; o_w < output_w; ++o_w) { + float sum1 = 0; +#if __ARM_NEON + float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); + float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); + float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); + + float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); + float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); + _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); + + float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); + float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); + _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); + + _sum1 = vsetq_lane_f32(sum1, _sum1, 3); + float32x2_t _ss1 = + vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1); + sum1 += vget_lane_f32(_ssss1_ssss1, 0); +#else + sum1 += in_ptr1[0] * pad_filter1[0]; + sum1 += in_ptr1[1] * pad_filter1[1]; + sum1 += in_ptr1[2] * pad_filter1[2]; + sum1 += in_ptr2[0] * pad_filter2[0]; + sum1 += in_ptr2[1] * pad_filter2[1]; + sum1 += in_ptr2[2] * pad_filter2[2]; + sum1 += in_ptr3[0] * pad_filter3[0]; + sum1 += in_ptr3[1] * pad_filter3[1]; + sum1 += in_ptr3[2] * pad_filter3[2]; +#endif + if (if_nopadding) { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } else if (input_w > 3 && + (if_odd_pad_w && o_w == valid_w_start || + o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || + o_w == valid_w_end + 1 && !if_odd_pad_w && + !if_exact_in_w)) { + pad_filter1--; + pad_filter2--; + pad_filter3--; + + in_ptr1++; + in_ptr2++; + in_ptr3++; + + } else if (input_w <= 3 || o_w < valid_w_start || + o_w > valid_w_end) { + pad_filter1 -= 2; + pad_filter2 -= 2; + pad_filter3 -= 2; + } else { + in_ptr1 += 2; + in_ptr2 += 2; + in_ptr3 += 2; + } + *out_ptr1 += sum1; + out_ptr1++; + } + if (if_nopadding) { + in_ptr1 += remain_stride_w + input_w; + in_ptr2 += remain_stride_w + input_w; + in_ptr3 += remain_stride_w + input_w; + } else if (input_h > 3 && + (if_odd_pad_h && o_h == valid_h_start || + o_h == valid_h_end && if_odd_pad_h && if_exact_in_h || + o_h == valid_h_end + 1 && !if_odd_pad_h && + !if_exact_in_h)) { + in_ptr1 += 3; + in_ptr2 += 3; + in_ptr3 += 3; + + pad_filter1 -= remain_stride_w; + pad_filter2 -= remain_stride_w; + pad_filter3 -= remain_stride_w; + + } else if (input_h <= 3 || o_h < valid_h_start || o_h > valid_h_end) { + in_ptr1 -= input_w - 3; + in_ptr2 -= input_w - 3; + in_ptr3 -= input_w - 3; + + pad_filter1 -= 3 + 2 * padding_w + 
remain_stride_w; + pad_filter2 -= 3 + 2 * padding_w + remain_stride_w; + pad_filter3 -= 3 + 2 * padding_w + remain_stride_w; + } else { + pad_filter1 += 3 + 2 * padding_w - remain_stride_w; + pad_filter2 += 3 + 2 * padding_w - remain_stride_w; + pad_filter3 += 3 + 2 * padding_w - remain_stride_w; + + in_ptr1 += input_w + 3; + in_ptr2 += input_w + 3; + in_ptr3 += input_w + 3; + } + } + filter_data_ch += filter_ch_size; + input_data_ch += in_ch_size; + } + } + input_data += in_batch_size; + output_data += out_batch_size; + } +} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/slidingwindow_conv3x3.h b/src/operators/math/slidingwindow_conv3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..cbcdbc170f1c01866fe402447e07b6ab189a535b --- /dev/null +++ b/src/operators/math/slidingwindow_conv3x3.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <algorithm> +#include <vector> +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace math { +template <typename Itype, typename Otype> +void SlidingwindowConv3x3s1(const framework::Tensor *input, + const framework::Tensor *filter, + const std::vector<int> &paddings, + framework::Tensor *output); + +template <typename Itype, typename Otype> +void SlidingwindowConv3x3s2(const framework::Tensor *input, + const framework::Tensor *filter, + const std::vector<int> &paddings, + framework::Tensor *output); + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index ead4de0514ad4a552d469d3ce34ec298f63343e7..645c288a35408d99537f68e7da7f7b3e9b546409 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -476,6 +476,8 @@ class ConvParam : public OpParam { EXEC_GEMM_INT8, EXEC_DEPTHWISE3x3_INT8, EXEC_DEPTHWISE5x5_INT8, + EXEC_SLIDINGWINDOW3x3S1_FLOAT, + EXEC_SLIDINGWINDOW3x3S2_FLOAT, }; ExecMode &ExecMode() const { return exec_mode_; } diff --git a/test/fpga/test_marker.cpp b/test/fpga/test_marker.cpp index 6a266773e4e9924ba52d3ced522d8e2821e003f5..e0977b57f07980aaf573abf2a0a8834b36740f56 100644 --- a/test/fpga/test_marker.cpp +++ b/test/fpga/test_marker.cpp @@ -12,17 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include +#ifndef PADDLE_MOBILE_FPGA +#define PADDLE_MOBILE_FPGA +#endif + #include "../test_helper.h" #include "../test_include.h" - #ifdef PADDLE_MOBILE_FPGA_V1 #include "fpga/V1/api.h" #endif #ifdef PADDLE_MOBILE_FPGA_V2 #include "fpga/V2/api.h" #endif -#include + +#include +#include +#include "../../src/io/paddle_inference_api.h" + +using namespace paddle_mobile; // NOLINT +using namespace paddle_mobile::fpga; // NOLINT + +static const char *g_image = "../models/marker/marker1/image.bin"; +static const char *g_model = "../models/marker/marker1/model"; +static const char *g_param = "../models/marker/marker1/params"; void readStream(std::string filename, char *buf) { std::ifstream in; @@ -36,132 +48,78 @@ void readStream(std::string filename, char *buf) { auto length = in.tellg(); // report location (this is the length) in.seekg(0, std::ios::beg); // go back to the beginning in.read(buf, length); - DLOG << length; in.close(); } -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int num, int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + n * amount_per_side * channel + c * amount_per_side + - width * h + w) = *((*data_in)++); - } - } - } - } +PaddleMobileConfig GetConfig() { + PaddleMobileConfig config; + config.precision = PaddleMobileConfig::FP32; + config.device = PaddleMobileConfig::kFPGA; + config.prog_file = g_model; + config.param_file = g_param; + config.thread_num = 1; + config.batch_size = 1; + config.optimize = true; + config.lod_mode = true; + config.quantification = false; + return config; } -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum, bool use_chw) { - // bool use_chw = true; - if (input_tensor.dims().size() != 4) return; - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - int n = (input_tensor.dims())[0]; - auto data_ptr = input_tensor.get_data(); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - auto data_tmp = data_ptr_16; - if (use_chw) { - data_tmp = - reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); - convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); - } - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? 
stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - if (data_tmp != data_ptr_16) { - free(data_tmp); +int main() { + open_device(); + + PaddleMobileConfig config = GetConfig(); + auto predictor = + CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(config); + + std::cout << "Finishing loading model" << std::endl; + + float img_info[3] = {432, 1280, 1.0f}; + int img_length = 432 * 1280 * 3; + auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float))); + readStream(g_image, reinterpret_cast<char *>(img)); + + std::cout << "Finishing initializing data" << std::endl; + struct PaddleTensor t_img_info, t_img; + t_img.dtypeid = typeid(float); + t_img_info.layout = LAYOUT_HWC; + t_img_info.shape = std::vector<int>({1, 3}); + t_img_info.name = "Image information"; + t_img_info.data.Reset(img_info, 3 * sizeof(float)); + + t_img.dtypeid = typeid(float); + t_img.layout = LAYOUT_HWC; + t_img.shape = std::vector<int>({1, 432, 1280, 3}); + t_img.name = "Image information"; + t_img.data.Reset(img, img_length * sizeof(float)); + predictor->FeedPaddleTensors({t_img_info, t_img}); + + std::cout << "Finishing feeding data " << std::endl; + + predictor->Predict_From_To(0, -1); + std::cout << "Finishing predicting " << std::endl; + + std::vector<PaddleTensor> v; // No need to initialize v + predictor->FetchPaddleTensors(&v); // Old data in v will be cleared + for (int i = 0; i < v.size(); ++i) { + auto p = reinterpret_cast<float *>(v[i].data.data()); + int len = v[i].data.length(); + float result = 0.0f; + std::string str = "fetch" + std::to_string(i); + fpga::savefile(str, p, len, result); } -} -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ?
stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} + std::cout << "Finish getting vector values" << std::endl; -void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, - bool use_chw) { - static int i = 0; - if (input_tensor.numel() == 0) { - return; - } - if (input_tensor.type() == typeid(float)) { - DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); - dump_stride_float(filename, input_tensor, dumpnum); - } else { - DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); - dump_stride_half(filename, input_tensor, dumpnum, use_chw); - } - DLOG << "dump input address: " << input_tensor.get_data(); -} + //////////////////////////////////////////////////// -static const char *g_marker_combine = "../models/marker/model"; -static const char *g_image_src_float = "../models/marker/model/input_0.bin"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - - // if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", - // std::string(g_rfcn_combine) + "/params", true, false, - // 1, true)) { - if (paddle_mobile.Load(std::string(g_marker_combine), true)) { - float img_info[3] = {720, 1280, 800.0f / 960.0f}; - auto img = reinterpret_cast( - fpga::fpga_malloc(720 * 1280 * 3 * sizeof(float))); - readStream(g_image_src_float, reinterpret_cast(img)); - - std::vector v(3, nullptr); - paddle_mobile.FeedData({img}); - paddle_mobile.Predict_To(-1); - - for (int i = 47; i < 52; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "marker_" + std::to_string(i); - // if(i != 58) - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(float)); - // tensor_ptr->numel() * sizeof(float)); - - dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), - true); // 20);//tensor_ptr->numel()); - - /* float result = 0; - std::string str = "softmax_input_data"; - float* data = - static_cast(fpga::fpga_malloc(tensor_ptr->numel() * - sizeof(float))); str = "softmax_output_data"; auto output_ptr = - static_cast((*tensor_ptr).get_data()); for (int idx = 0; idx < - tensor_ptr->numel(); ++idx) - { - data[idx] = fpga::fp16_2_fp32(output_ptr[idx]); - } - fpga::savefile(str,data, tensor_ptr->numel(), result ); */ - } - - // paddle_mobile.GetResults(&v); - DLOG << "Computation done"; - fpga::fpga_free(img); - } + // PaddleTensor tensor; + // predictor->GetPaddleTensor("fetch2", &tensor); + // for (int i = 0; i < post_nms; i++) { + // auto p = reinterpret_cast(tensor.data.data()); + // std::cout << p[+i] << std::endl; + // } return 0; } diff --git a/test/fpga/test_marker_api.cpp b/test/fpga/test_marker_api.cpp index e5b2995676b7b2dad5f32ae51b4b6220fda4506d..2b25f8f6d1d9e247d10a9338e60b9c19a5a9c68d 100644 --- a/test/fpga/test_marker_api.cpp +++ b/test/fpga/test_marker_api.cpp @@ -15,12 +15,15 @@ limitations under the License. 
*/ #ifndef PADDLE_MOBILE_FPGA #define PADDLE_MOBILE_FPGA #endif +#include +#include #include +#include #include #include "../../src/io/paddle_inference_api.h" -using namespace paddle_mobile; -using namespace paddle_mobile::fpga; +using namespace paddle_mobile; // NOLINT +using namespace paddle_mobile::fpga; // NOLINT static const char *g_image = "../models/marker/model/image.bin"; static const char *g_model = "../models/marker/model/model"; @@ -136,44 +139,6 @@ PaddleMobileConfig GetConfig1() { int main() { open_device(); - - PaddleMobileConfig config1 = GetConfig1(); - auto predictor1 = - CreatePaddlePredictor(config1); - - std::cout << "Finishing loading model" << std::endl; - for (int i = 0; i < 1; ++i) { - int img_length1 = 144 * 14 * 14; - auto img1 = - reinterpret_cast(fpga_malloc(img_length1 * sizeof(float))); - readStream(g_image1, reinterpret_cast(img1)); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img1; - - t_img1.dtypeid = typeid(float); - t_img1.layout = LAYOUT_HWC; - t_img1.shape = std::vector({1, 14, 14, 144}); - t_img1.name = "Image information"; - t_img1.data.Reset(img1, img_length1 * sizeof(float)); - predictor1->FeedPaddleTensors({t_img1}); - - std::cout << "Finishing feeding data " << std::endl; - - predictor1->Predict_From_To(0, -1); - std::cout << "Finishing predicting " << std::endl; - - std::vector v1; // No need to initialize v - predictor1->FetchPaddleTensors(&v1); // Old data in v will be cleared - std::cout << "Output number is " << v1.size() << std::endl; - for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) { - std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum); - dump_stride(dumpName, v1[fetchNum]); - } - } - ///////////////////////////////////// - PaddleMobileConfig config = GetConfig(); auto predictor = CreatePaddlePredictorPredict_From_To(0, -1); + gettimeofday(&end11, NULL); + dif_sec = end11.tv_sec - start11.tv_sec; + dif_usec = end11.tv_usec - start11.tv_usec; + std::cout << "marker1 total" + << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us" + << std::endl; std::cout << "Finishing predicting " << std::endl; std::vector v; // No need to initialize v @@ -217,5 +191,48 @@ int main() { std::string dumpName = "marker_api_fetch_" + std::to_string(fetchNum); dump_stride(dumpName, v[fetchNum]); } + + PaddleMobileConfig config1 = GetConfig1(); + auto predictor1 = + CreatePaddlePredictor(config1); + + std::cout << "Finishing loading model" << std::endl; + for (int i = 0; i < 1; ++i) { + int img_length1 = 144 * 14 * 14; + auto img1 = + reinterpret_cast(fpga_malloc(img_length1 * sizeof(float))); + readStream(g_image1, reinterpret_cast(img1)); + + std::cout << "Finishing initializing data" << std::endl; + struct PaddleTensor t_img1; + + t_img1.dtypeid = typeid(float); + t_img1.layout = LAYOUT_HWC; + t_img1.shape = std::vector({1, 14, 14, 144}); + t_img1.name = "Image information"; + t_img1.data.Reset(img1, img_length1 * sizeof(float)); + predictor1->FeedPaddleTensors({t_img1}); + + std::cout << "Finishing feeding data " << std::endl; + + gettimeofday(&start11, NULL); + predictor1->Predict_From_To(0, -1); + gettimeofday(&end11, NULL); + dif_sec = end11.tv_sec - start11.tv_sec; + dif_usec = end11.tv_usec - start11.tv_usec; + std::cout << "marker2 total" + << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us" + << std::endl; + std::cout << "Finishing predicting " << std::endl; + + std::vector v1; // No need to initialize v + predictor1->FetchPaddleTensors(&v1); // Old data in v will be 
cleared + std::cout << "Output number is " << v1.size() << std::endl; + for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) { + std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum); + dump_stride(dumpName, v1[fetchNum]); + } + } return 0; }
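
Note on verifying the new kernels: the NEON-intrinsic and inline-assembly paths shown above (which, judging from the ld2 deinterleaving loads and the in_ptr += 2 stepping, belong to the stride-2 path dispatched as EXEC_SLIDINGWINDOW3x3S2_FLOAT) all compute the same arithmetic: each output element is the sum, over input channels, of a 3x3 window of the zero-padded input multiplied by the corresponding filter taps, with the window advancing two columns per output column. The scalar sketch below restates that contract in plain C++ as a cross-check on small shapes; it deliberately omits the padded-filter pointer bookkeeping, the 8-output-channel blocking, and the OpenMP parallelism, and its name, signature, and NCHW/OIHW layout assumptions are illustrative only, not part of the paddle-mobile API.

// Naive 3x3, stride-2, zero-padded convolution (assumed NCHW input, OIHW filter).
// Reference only; not the optimized SlidingwindowConv3x3s2 implementation.
static void NaiveConv3x3S2(const float *input, const float *filter, float *output,
                           int input_ch, int input_h, int input_w,
                           int output_ch, int padding_h, int padding_w) {
  const int output_h = (input_h + 2 * padding_h - 3) / 2 + 1;
  const int output_w = (input_w + 2 * padding_w - 3) / 2 + 1;
  for (int oc = 0; oc < output_ch; ++oc) {
    for (int oh = 0; oh < output_h; ++oh) {
      for (int ow = 0; ow < output_w; ++ow) {
        float sum = 0.f;
        for (int ic = 0; ic < input_ch; ++ic) {
          const float *in = input + ic * input_h * input_w;
          const float *f = filter + (oc * input_ch + ic) * 9;  // 3x3 taps
          for (int kh = 0; kh < 3; ++kh) {
            for (int kw = 0; kw < 3; ++kw) {
              const int ih = oh * 2 - padding_h + kh;
              const int iw = ow * 2 - padding_w + kw;
              // Zero padding: skip taps that fall outside the input plane.
              if (ih < 0 || ih >= input_h || iw < 0 || iw >= input_w) continue;
              sum += in[ih * input_w + iw] * f[kh * 3 + kw];
            }
          }
        }
        output[(oc * output_h + oh) * output_w + ow] = sum;
      }
    }
  }
}

Comparing this reference against the output produced through ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT on random inputs, within a small tolerance since the vectorized code reassociates the floating-point additions, is a cheap regression check whenever the assembly paths are modified.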