diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index 0fafe210a573bc23bbcba907b4059465e83791da..95a46816b9f35046388e9a14e659e02822871ecd 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -57,6 +57,8 @@ bool ConvKernel::Init(ConvParam *param) {
              param->Strides()[0] == 2 && param->Paddings()[0] == 1 &&
              param->Paddings()[0] == param->Paddings()[1]) {
     param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2P1_FLOAT;
+  } else if (depth3x3) {
+    param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT;
 #ifndef __aarch64__
   } else if (depth5x5 && param->Strides()[0] == param->Strides()[1] &&
              param->Strides()[0] == 1) {
@@ -106,6 +108,10 @@ void ConvKernel::Compute(const ConvParam &param) {
       math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
                                  nullptr, false, false);
       break;
+    case ConvParam::EXEC_DEPTHWISE3x3_FLOAT:
+      math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                             param.Filter(), nullptr, param.Output(), false);
+      break;
 #ifndef __aarch64__
     case ConvParam::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5(param);
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index 86a3c7a9694e8d686f41911ea4af784a33c2cd0a..b1556a3a771231fd62e8cadda2d9d7d40721856a 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -35,6 +35,7 @@ inline void GemmConv(const ConvParam &param) {
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
   output->mutable_data<float>();
+  int groups = param.Groups();
 
   const std::vector<int> strides = param.Strides();
   const std::vector<int> paddings = param.Paddings();
@@ -90,8 +91,8 @@
       Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
 
       if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
+        // col_matrix.ShareDataWith(in_slice);
+        col_matrix = in_slice;
         col_matrix.Resize(col_matrix_shape);
       } else if (data_dim == 2U) {
         // im2col
@@ -107,6 +108,7 @@
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+
       math::MatMul(filter_slice, false, col_matrix, false,
                    static_cast<float>(1), &out_slice, static_cast<float>(0),
                    false,
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 869a61089621e8ed436944c26bc3cffc78159f46..e41829761bcd6ac87a73c6378ec36a17458aff56 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2971,48 +2971,7 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
 
 // C = A * B
 void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
-  int nc1 = n / 16;
-  int _nc1 = n % 16;
-  int nc2 = _nc1 / 4;
-  int nc3 = 16 - 4 * (_nc1 % 4);
-
-  asm volatile(
-      "subs %[nc1], %[nc1], #1 \n\t"
-      "blt end_nc1_%= \n\t"
-      "loop_nc1_%=: \n\t"
-
-      "vld1.32 {q0, q1}, [%[c]]! \n\t"
-      "vst1.32 {q0, q1}, [%[C]]! \n\t"
-
-      "vld1.32 {q2, q3}, [%[c]]! \n\t"
-      "vst1.32 {q2, q3}, [%[C]]! \n\t"
-
-      "subs %[nc1], %[nc1], #1 \n\t"
-      "bge loop_nc1_%= \n\t"
-      "end_nc1_%=: \n\t"
-
-      "subs %[nc2], %[nc2], #1 \n\t"
-      "blt end_nc2_%= \n\t"
-      "loop_nc2_%=: \n\t"
-
-      "vld1.32 {q4}, [%[c]]! \n\t"
-      "vst1.32 {q4}, [%[C]]! \n\t"
-
-      "subs %[nc2], %[nc2], #1 \n\t"
-      "bge loop_nc2_%= \n\t"
-      "end_nc2_%=: \n\t"
-
-      "cmp %[nc3], #16 \n\t"
-      "beq end_nc3_%= \n\t"
-      "sub %[c], %[c], %[nc3] \n\t"
-      "sub %[C], %[C], %[nc3] \n\t"
-      "vld1.32 {q5}, [%[c]]! \n\t"
-      "vst1.32 {q5}, [%[C]]! \n\t"
-      "end_nc3_%=: \n\t"
-
-      :
-      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
+  memcpy(C, c, n * sizeof(float));
 }
 
 // C = alpha * A * B + beta * C
diff --git a/test/operators/test_conv_op.cpp b/test/operators/test_conv_op.cpp
index d612d3a3aa8bf3cf743c40a9cf8d5bf36e3cdf8b..b183b0918cf89ab292262e7aec3920a1684b4202 100644
--- a/test/operators/test_conv_op.cpp
+++ b/test/operators/test_conv_op.cpp
@@ -228,39 +228,43 @@ int TestAll(const int in_channels, const int in_height, const int in_width,
   std::cerr << "in_channels=" << in_channels << ", in_height=" << in_height
             << ", in_width=" << in_width << ", out_channels=" << out_channels
             << ", groups=" << groups << std::endl;
-  //  // kernel = 3, pad = 0, stride = 1
-  //  std::cerr << "float, kernel=3, pad=0, stride=1" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 0, 1>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //  // kernel = 3, pad = 1, stride = 1
-  //  std::cerr << "float, kernel=3, pad=1, stride=1" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 1, 1>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //  // kernel = 3, pad = 2, stride = 1
-  //  std::cerr << "float, kernel=3, pad=2, stride=1" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 2, 1>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //  // kernel = 3, pad = 5, stride = 1
-  //  std::cerr << "float, kernel=3, pad=5, stride=1" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 5, 1>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //
-  //  // kernel = 3, pad = 0, stride = 2
-  //  std::cerr << "float, kernel=3, pad=0, stride=2" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 0, 2>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //  // kernel = 3, pad = 1, stride = 2
-  //  std::cerr << "float, kernel=3, pad=1, stride=2" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 1, 2>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //  // kernel = 3, pad = 2, stride = 2
-  //  std::cerr << "float, kernel=3, pad=2, stride=2" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 2, 2>(
-  //      in_channels, in_height, in_width, out_channels, groups);
-  //  // kernel = 3, pad = 5, stride = 2
-  //  std::cerr << "float, kernel=3, pad=5, stride=2" << std::endl;
-  //  paddle_mobile::TestConvOp<float, float, 3, 5, 2>(
-  //      in_channels, in_height, in_width, out_channels, groups);
+  std::cerr << "float, kernel=1, pad=0, stride=1" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 1, 0, 1>(
+      in_channels, in_height, in_width, out_channels, groups);
+
+  // kernel = 3, pad = 0, stride = 1
+  std::cerr << "float, kernel=3, pad=0, stride=1" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 0, 1>(
+      in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 1, stride = 1
+  std::cerr << "float, kernel=3, pad=1, stride=1" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 1, 1>(
+      in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 2, stride = 1
+  std::cerr << "float, kernel=3, pad=2, stride=1" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 2, 1>(
+      in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 5, stride = 1
+  std::cerr << "float, kernel=3, pad=5, stride=1" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 5, 1>(
+      in_channels, in_height, in_width, out_channels, groups);
+
+  // kernel = 3, pad = 0, stride = 2
+  std::cerr << "float, kernel=3, pad=0, stride=2" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 0, 2>(
+      in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 1, stride = 2
+  std::cerr << "float, kernel=3, pad=1, stride=2" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 1, 2>(
+      in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 2, stride = 2
+  std::cerr << "float, kernel=3, pad=2, stride=2" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 2, 2>(
+      in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 5, stride = 2
+  std::cerr << "float, kernel=3, pad=5, stride=2" << std::endl;
+  paddle_mobile::TestConvOp<float, float, 3, 5, 2>(
+      in_channels, in_height, in_width, out_channels, groups);
 
 #ifndef __aarch64__
   // kernel = 3, pad = 0, stride = 1
@@ -338,6 +342,7 @@ int TestAll(const int in_channels, const int in_height, const int in_width,
 }
 
 int main() {
+  TestAll(16, 10, 10, 16, 16);
   TestAll(1, 5, 5, 1, 1);
   TestAll(1, 5, 5, 10, 1);
   TestAll(10, 5, 5, 10, 10);
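
The new `EXEC_DEPTHWISE3x3_FLOAT` branch makes `ConvKernel::Init` route every remaining depthwise 3x3 configuration (any stride/padding combination not caught by the specialized s1p1/s2p0/s2p1 kernels) to the generic `math::DepthwiseConv3x3`. As a reading aid, here is a minimal scalar sketch of the computation that routine performs; the `DepthwiseConv3x3Ref` name, raw-pointer signature, and NCHW layout are illustrative assumptions, not paddle-mobile's actual Tensor-based API:

```cpp
#include <cstdio>
#include <vector>

// Scalar reference for a depthwise 3x3 convolution over one NCHW image.
// Hedged sketch only: the real math::DepthwiseConv3x3 takes Tensors and
// is NEON-optimized.
static void DepthwiseConv3x3Ref(const float *input, int channels, int in_h,
                                int in_w, const float *filter,  // C x 3 x 3
                                float *output, int stride, int pad) {
  const int out_h = (in_h + 2 * pad - 3) / stride + 1;
  const int out_w = (in_w + 2 * pad - 3) / stride + 1;
  for (int c = 0; c < channels; ++c) {
    const float *in_c = input + c * in_h * in_w;
    const float *w_c = filter + c * 9;  // one 3x3 filter per channel
    float *out_c = output + c * out_h * out_w;
    for (int oh = 0; oh < out_h; ++oh) {
      for (int ow = 0; ow < out_w; ++ow) {
        float acc = 0.f;
        for (int kh = 0; kh < 3; ++kh) {
          for (int kw = 0; kw < 3; ++kw) {
            const int ih = oh * stride - pad + kh;
            const int iw = ow * stride - pad + kw;
            if (ih >= 0 && ih < in_h && iw >= 0 && iw < in_w) {
              acc += in_c[ih * in_w + iw] * w_c[kh * 3 + kw];  // zero padding
            }
          }
        }
        out_c[oh * out_w + ow] = acc;
      }
    }
  }
}

int main() {
  const int c = 2, h = 5, w = 5, stride = 2, pad = 1;
  const int oh = (h + 2 * pad - 3) / stride + 1;
  const int ow = (w + 2 * pad - 3) / stride + 1;
  std::vector<float> in(c * h * w, 1.f), flt(c * 9, 1.f), out(c * oh * ow);
  DepthwiseConv3x3Ref(in.data(), c, h, w, flt.data(), out.data(), stride, pad);
  std::printf("out[0] = %.1f\n", out[0]);  // corner with pad=1: 4 taps -> 4.0
  return 0;
}
```

Each input channel is convolved with its own 3x3 filter (groups == channels), which is why this case is worth dispatching away from the generic im2col + GEMM path.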
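
In `GemmConv`, the `!is_expand` fast path fires when im2col is the identity: a 1x1 filter with unit stride, unit dilation, and zero padding. The diff drops the double aliasing through the scratch `col` tensor in favor of assigning `in_slice` directly to `col_matrix`, which, assuming `Tensor`'s copy assignment shares the underlying buffer rather than deep-copying, still performs no data movement before the metadata-only `Resize`. The new `kernel=1, pad=0, stride=1` test case and the `TestAll(16, 10, 10, 16, 16)` call exercise exactly this path. Below is a hedged sketch of the standard predicate behind the `is_expand` flag; its body is not part of this diff, so take the exact form as an assumption:

```cpp
#include <vector>

// im2col is the identity exactly when the filter is 1x1 with unit stride,
// unit dilation, and zero padding; "expansion" is needed in every other case.
static bool IsExpand(const std::vector<int> &filter_hw,  // {kh, kw}
                     const std::vector<int> &strides,
                     const std::vector<int> &paddings,
                     const std::vector<int> &dilations) {
  const bool filter_1 = filter_hw[0] == 1 && filter_hw[1] == 1;
  const bool strides_1 = strides[0] == 1 && strides[1] == 1;
  const bool padding_0 = paddings[0] == 0 && paddings[1] == 0;
  const bool dilation_1 = dilations[0] == 1 && dilations[1] == 1;
  return !(filter_1 && strides_1 && padding_0 && dilation_1);
}

int main() {
  // 1x1/s1/p0/d1 takes the fast path: the input slice is used as col_matrix.
  return IsExpand({1, 1}, {1, 1}, {0, 0}, {1, 1}) ? 1 : 0;
}
```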
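
The `Gemm::VecWriteBasic` rewrite is behavior-preserving: the deleted NEON block only ever streamed `n` contiguous floats from `c` to `C` in 16-, 4-, and tail-sized chunks, and the `ldc` argument was never used. A single `memcpy` expresses the same contract, and it also sidesteps the tail path, which rewound both pointers by `nc3` bytes to re-copy an overlapping 4-float chunk and so appears to under-run the buffers when `n < 4`. A standalone sanity harness under that reading (names here are local to the harness, not paddle-mobile's):

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Scalar reference with the same contract as VecWriteBasic: copy n
// contiguous floats from c to C (ldc is irrelevant for this routine).
static void VecWriteBasicRef(int n, const float *c, float *C) {
  for (int i = 0; i < n; ++i) C[i] = c[i];
}

int main() {
  // Cover the old 16-wide, 4-wide, and tail regimes, including n < 4.
  for (int n : {1, 3, 4, 15, 16, 17, 64, 100}) {
    std::vector<float> src(n), a(n), b(n);
    for (int i = 0; i < n; ++i) src[i] = 0.5f * i;
    VecWriteBasicRef(n, src.data(), a.data());
    std::memcpy(b.data(), src.data(), n * sizeof(float));  // new implementation
    assert(std::memcmp(a.data(), b.data(), n * sizeof(float)) == 0);
  }
  return 0;
}
```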