diff --git a/CMakeLists.txt b/CMakeLists.txt
index e87a256e51f429729c8a5d53baa2ec838ebb1594..f72778ea50c96f2c2fca13909fb7768d3c5b6fb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
 
 if(IS_IOS)
-  set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-mfpu=neon -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
 else()
   set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 endif()
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index 5db676564e190bf40e8af437ba68aee80b5a5af3..7e353c29b80279f895ad6d0150b31eb1703d97d4 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/depthwise_conv_3x3.h"
-#ifdef __ARM_NEON
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif
 #include <vector>
@@ -23,7 +23,6 @@ namespace math {
 void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                       vector<int> paddings, const Tensor *filter, Tensor *bias,
                       Tensor *output, bool if_bias) {
-#ifdef __ARM_NEON
   const int batch_size = input->dims()[0];
 
   const int input_height = input->dims()[2];
@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
             }
 
           } else {
-#if defined(ARMV17)
+#if __ARM_NEON
+#if __aarch64__
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+
+            const float32x4_t v_filter1 = vld1q_f32(filter1);
+            const float32x4_t v_filter2 = vld1q_f32(filter2);
+            const float32x4_t v_filter3 = vld1q_f32(filter3);
+            float32x4_t mula = vmulq_f32(data1, v_filter1);
+            mula = vmlaq_f32(mula, data2, v_filter2);
+            mula = vmlaq_f32(mula, data3, v_filter3);
+            float32x2_t res = vpadd_f32(
+                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
+            res = vpadd_f32(res, res);
+            if (if_bias) {
+              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
+            } else {
+              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+            }
+#else
             asm volatile(
 
                 "vld1.32  {q1}, [%[pos1]]        \n\t"
@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                 [filter2] "r"(filter2), [filter3] "r"(filter3),
                 [output_ptr] "r"(output_ptr), [zero] "r"(zero)
                 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+#endif  // __aarch64__
 #else
-            const float32x4_t data1 = vld1q_f32(pos1);
-            const float32x4_t data2 = vld1q_f32(pos2);
-            const float32x4_t data3 = vld1q_f32(pos3);
-            const float32x4_t v_filter1 = vld1q_f32(filter1);
-            const float32x4_t v_filter2 = vld1q_f32(filter2);
-            const float32x4_t v_filter3 = vld1q_f32(filter3);
-            float32x4_t mula = vmulq_f32(data1, v_filter1);
-            mula = vmlaq_f32(mula, data2, v_filter2);
-            mula = vmlaq_f32(mula, data3, v_filter3);
-            float32x2_t res = vpadd_f32(
-                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
-            res = vpadd_f32(res, res);
-            if (if_bias) {
-              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
-            } else {
-              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
-            }
-#endif
+#endif  // __ARM_NEON
           }
         }
       }
@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
     input_data += input_batch_stride;
     output_data += output_batch_stride;
   }
-#endif
 }
 
 void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
                           Tensor *output, Tensor *bias, bool if_bias) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
 
   const int batch_size = input->dims()[0];
 
@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
 
 void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                             Tensor *output, Tensor bias, bool if_bias) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                      Tensor *output, const Tensor *new_scale,
                                      const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 22a351866dd62d8bd93b3e97970af54180b878b7..733511c37652f315b64683fad6fa5f4ed5d06c91 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "operators/math/gemm.h"
 #include "common/log.h"
 #include "memory/t_malloc.h"
-#ifndef X86
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif
 #ifdef _OPENMP
@@ -136,6 +136,10 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
   for (int j = 0; j < n - n_tail; j += NR) {
     for (int i = 0; i < k; ++i) {
       b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+
+#else
       asm volatile(
           "pld        [%[b0]]            \n\t"
           "vld1.32    {q0, q1}, [%[b0]]  \n\t"
@@ -143,6 +147,10 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
           : [buffer] "+r"(buffer)
           : [b0] "r"(b0)
           : "memory", "q0", "q0");
+#endif  // __aarch64__
+#else
+
+#endif  // __ARM_NEON
     }
   }
   if (n_tail != 0) {
@@ -206,7 +214,9 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
   }
 }
 
-#if defined(IOS)
+#if __ARM_NEON
+#if __aarch64__
+
 void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
   // init C
   float32x4_t cv0 = vdupq_n_f32(0.0);
@@ -255,9 +265,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
     }
   }
 }
-}  // namespace math
-#elif defined(ARMV7)
+#else
+
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
@@ -328,151 +338,6 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
       "q10", "q11", "q12", "q13");
 }
 
-#else
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-  float *c0, *c1, *c2, *c3;
-  c0 = c;
-  c1 = c + ldc;
-  c2 = c + 2 * ldc;
-  c3 = c + 3 * ldc;
-  for (int p = 0; p < k; p += 1) {
-    // first row
-    c0[0] += a[0] * b[0];
-    c0[1] += a[0] * b[1];
-    c0[2] += a[0] * b[2];
-    c0[3] += a[0] * b[3];
-
-    // second row
-    c1[0] += a[1] * b[0];
-    c1[1] += a[1] * b[1];
-    c1[2] += a[1] * b[2];
-    c1[3] += a[1] * b[3];
-
-    // third row
-    c2[0] += a[2] * b[0];
-    c2[1] += a[2] * b[1];
-    c2[2] += a[2] * b[2];
-    c2[3] += a[2] * b[3];
-
-    // fourth row
-    c3[0] += a[3] * b[0];
-    c3[1] += a[3] * b[1];
-    c3[2] += a[3] * b[2];
-    c3[3] += a[3] * b[3];
-
-    a += 4;
-    b += 4;
-  }
-}
-
-#endif
-
-// 32位 float 矩阵乘法
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
-
-  KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
-
-  // make sure MC is multiple of 4, and NC is multiple of 8
-  int mblock_num = (m + MC - 1) / MC;
-  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-
-  int nblock_num = (n + NC - 1) / NC;
-  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
-  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-
-  packedA = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-  packedB = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
-
-  int mc, nc;
-  for (int j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-    for (int i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
-                  relu);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
-
-  KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
-
-  // make sure MC is multiple of 4, and NC is multiple of 8
-  int mblock_num = (m + MC - 1) / MC;
-  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-
-  int nblock_num = (n + NC - 1) / NC;
-  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
-  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-
-  packedA = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-  packedB = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
-
-  int mc, nc;
-  for (int j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-    for (int i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
-                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc,
                   bool relu) {
@@ -1699,6 +1564,153 @@ void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
       "q12", "q13", "q14");
 }
 
+#endif  // __aarch64__
+#else
+
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+  float *c0, *c1, *c2, *c3;
+  c0 = c;
+  c1 = c + ldc;
+  c2 = c + 2 * ldc;
+  c3 = c + 3 * ldc;
+  for (int p = 0; p < k; p += 1) {
+    // first row
+    c0[0] += a[0] * b[0];
+    c0[1] += a[0] * b[1];
+    c0[2] += a[0] * b[2];
+    c0[3] += a[0] * b[3];
+
+    // second row
+    c1[0] += a[1] * b[0];
+    c1[1] += a[1] * b[1];
+    c1[2] += a[1] * b[2];
+    c1[3] += a[1] * b[3];
+
+    // third row
+    c2[0] += a[2] * b[0];
+    c2[1] += a[2] * b[1];
+    c2[2] += a[2] * b[2];
+    c2[3] += a[2] * b[3];
+
+    // fourth row
+    c3[0] += a[3] * b[0];
+    c3[1] += a[3] * b[1];
+    c3[2] += a[3] * b[2];
+    c3[3] += a[3] * b[3];
+
+    a += 4;
+    b += 4;
+  }
+}
+
+#endif  // __ARM_NEON
+
+// 32位 float 矩阵乘法
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
+  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
+  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
+  int L1 = 30 * 1024;
+  int L2 = 1 * 1024 * 1024;
+
+  KC = k;
+  MC = L2 / (2 * KC * sizeof(float));
+  NC = MC;
+
+  // make sure MC is multiple of 4, and NC is multiple of 8
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + 4 - 1) / 4 * 4;
+  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + 8 - 1) / 8 * 8;
+  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    for (int i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
+                  relu);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias) {
+  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
+  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
+  int L1 = 30 * 1024;
+  int L2 = 1 * 1024 * 1024;
+
+  KC = k;
+  MC = L2 / (2 * KC * sizeof(float));
+  NC = MC;
+
+  // make sure MC is multiple of 4, and NC is multiple of 8
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + 4 - 1) / 4 * 4;
+  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + 8 - 1) / 8 * 8;
+  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    for (int i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
+                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
-}  // namespace paddle_mobile
diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake
index a8735adc8d853a5825a23f1ddf129d0a95199275..a4f58c37de510f700645ba7184b280f6eb1e11c2 100644
--- a/tools/ios-cmake/ios.toolchain.cmake
+++ b/tools/ios-cmake/ios.toolchain.cmake
@@ -159,7 +159,7 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
 
 # set the architecture for iOS
 if (${IOS_PLATFORM} STREQUAL "OS")
-    set (IOS_ARCH armv7 armv7s arm64)
+    set (IOS_ARCH armv7 armv7s)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
     set (IOS_ARCH i386)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")