diff --git a/CMakeLists.txt b/CMakeLists.txt
index e87a256e51f429729c8a5d53baa2ec838ebb1594..f72778ea50c96f2c2fca13909fb7768d3c5b6fb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
 
 if(IS_IOS)
-  set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-mfpu=neon -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
 else()
   set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 endif()
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index 5db676564e190bf40e8af437ba68aee80b5a5af3..7e353c29b80279f895ad6d0150b31eb1703d97d4 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/depthwise_conv_3x3.h"
-#ifdef __ARM_NEON
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif
 #include <vector>
@@ -23,7 +23,6 @@ namespace math {
 void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                       vector<int> paddings, const Tensor *filter, Tensor *bias,
                       Tensor *output, bool if_bias) {
-#ifdef __ARM_NEON
   const int batch_size = input->dims()[0];
 
   const int input_height = input->dims()[2];
@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
             }
 
           } else {
-#if defined(ARMV17)
+#if __ARM_NEON
+#if __aarch64__
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+
+            const float32x4_t v_filter1 = vld1q_f32(filter1);
+            const float32x4_t v_filter2 = vld1q_f32(filter2);
+            const float32x4_t v_filter3 = vld1q_f32(filter3);
+            float32x4_t mula = vmulq_f32(data1, v_filter1);
+            mula = vmlaq_f32(mula, data2, v_filter2);
+            mula = vmlaq_f32(mula, data3, v_filter3);
+            float32x2_t res = vpadd_f32(
+                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
+            res = vpadd_f32(res, res);
+            if (if_bias) {
+              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
+            } else {
+              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+            }
+#else
             asm volatile(
 
                 "vld1.32  {q1}, [%[pos1]]        \n\t"
@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                 [filter2] "r"(filter2), [filter3] "r"(filter3),
                 [output_ptr] "r"(output_ptr), [zero] "r"(zero)
                 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+#endif  // __aarch64__
 #else
-            const float32x4_t data1 = vld1q_f32(pos1);
-            const float32x4_t data2 = vld1q_f32(pos2);
-            const float32x4_t data3 = vld1q_f32(pos3);
-            const float32x4_t v_filter1 = vld1q_f32(filter1);
-            const float32x4_t v_filter2 = vld1q_f32(filter2);
-            const float32x4_t v_filter3 = vld1q_f32(filter3);
-            float32x4_t mula = vmulq_f32(data1, v_filter1);
-            mula = vmlaq_f32(mula, data2, v_filter2);
-            mula = vmlaq_f32(mula, data3, v_filter3);
-            float32x2_t res = vpadd_f32(
-                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
-            res = vpadd_f32(res, res);
-            if (if_bias) {
-              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
-            } else {
-              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
-            }
-#endif
+#endif  // __ARM_NEON
           }
         }
       }
@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
     input_data += input_batch_stride;
     output_data += output_batch_stride;
   }
-#endif
 }
 
 void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
                           Tensor *output, Tensor *bias, bool if_bias) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
 
   const int batch_size = input->dims()[0];
 
@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
 
 void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                             Tensor *output, Tensor bias, bool if_bias) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                      Tensor *output, const Tensor *new_scale,
                                      const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 22a351866dd62d8bd93b3e97970af54180b878b7..733511c37652f315b64683fad6fa5f4ed5d06c91 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "operators/math/gemm.h"
 #include "common/log.h"
 #include "memory/t_malloc.h"
-#ifndef X86
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif
 #ifdef _OPENMP
@@ -136,6 +136,10 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
   for (int j = 0; j < n - n_tail; j += NR) {
     for (int i = 0; i < k; ++i) {
       b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+
+#else
       asm volatile(
           "pld        [%[b0]]            \n\t"
           "vld1.32    {q0, q1}, [%[b0]]  \n\t"
@@ -143,6 +147,10 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
           : [buffer] "+r"(buffer)
           : [b0] "r"(b0)
           : "memory", "q0", "q0");
+#endif  // __aarch64__
+#else
+
+#endif  // __ARM_NEON
     }
   }
   if (n_tail != 0) {
@@ -206,7 +214,9 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
   }
 }
 
-#if defined(IOS)
+#if __ARM_NEON
+#if __aarch64__
+
 void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
   // init C
   float32x4_t cv0 = vdupq_n_f32(0.0);
@@ -255,9 +265,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
     }
   }
 }
-}  // namespace math
-#elif defined(ARMV7)
+#else
+
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
@@ -328,151 +338,6 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
       "q10", "q11", "q12", "q13");
 }
 
-#else
-void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-  float *c0, *c1, *c2, *c3;
-  c0 = c;
-  c1 = c + ldc;
-  c2 = c + 2 * ldc;
-  c3 = c + 3 * ldc;
-  for (int p = 0; p < k; p += 1) {
-    // first row
-    c0[0] += a[0] * b[0];
-    c0[1] += a[0] * b[1];
-    c0[2] += a[0] * b[2];
-    c0[3] += a[0] * b[3];
-
-    // second row
-    c1[0] += a[1] * b[0];
-    c1[1] += a[1] * b[1];
-    c1[2] += a[1] * b[2];
-    c1[3] += a[1] * b[3];
-
-    // third row
-    c2[0] += a[2] * b[0];
-    c2[1] += a[2] * b[1];
-    c2[2] += a[2] * b[2];
-    c2[3] += a[2] * b[3];
-
-    // fourth row
-    c3[0] += a[3] * b[0];
-    c3[1] += a[3] * b[1];
-    c3[2] += a[3] * b[2];
-    c3[3] += a[3] * b[3];
-
-    a += 4;
-    b += 4;
-  }
-}
-
-#endif
-
-// 32位 float 矩阵乘法
-void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
-
-  KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
-
-  // make sure MC is multiple of 4, and NC is multiple of 8
-  int mblock_num = (m + MC - 1) / MC;
-  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-
-  int nblock_num = (n + NC - 1) / NC;
-  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
-  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-
-  packedA = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-  packedB = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
-
-  int mc, nc;
-  for (int j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-    for (int i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
-                  relu);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-
-void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 bool relu, float *new_scale, float *new_bias) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
-
-  KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
-
-  // make sure MC is multiple of 4, and NC is multiple of 8
-  int mblock_num = (m + MC - 1) / MC;
-  MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-
-  int nblock_num = (n + NC - 1) / NC;
-  NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
-  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-
-  packedA = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
-  packedB = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
-  packedC = static_cast<float *>(
-      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
-  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
-
-  for (int l = 0; l < KC; ++l) {
-    zero[l] = 0;
-  }
-
-  int mc, nc;
-  for (int j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-    for (int i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
-                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
-    }
-  }
-
-  paddle_mobile::memory::Free(packedA);
-  paddle_mobile::memory::Free(packedB);
-  paddle_mobile::memory::Free(packedC);
-  paddle_mobile::memory::Free(zero);
-}
-
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc,
                   bool relu) {
@@ -1699,6 +1564,153 @@ void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
       "q12", "q13", "q14");
 }
 
+#endif  // __aarch64__
+#else
+
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
+  float *c0, *c1, *c2, *c3;
+  c0 = c;
+  c1 = c + ldc;
+  c2 = c + 2 * ldc;
+  c3 = c + 3 * ldc;
+  for (int p = 0; p < k; p += 1) {
+    // first row
+    c0[0] += a[0] * b[0];
+    c0[1] += a[0] * b[1];
+    c0[2] += a[0] * b[2];
+    c0[3] += a[0] * b[3];
+
+    // second row
+    c1[0] += a[1] * b[0];
+    c1[1] += a[1] * b[1];
+    c1[2] += a[1] * b[2];
+    c1[3] += a[1] * b[3];
+
+    // third row
+    c2[0] += a[2] * b[0];
+    c2[1] += a[2] * b[1];
+    c2[2] += a[2] * b[2];
+    c2[3] += a[2] * b[3];
+
+    // fourth row
+    c3[0] += a[3] * b[0];
+    c3[1] += a[3] * b[1];
+    c3[2] += a[3] * b[2];
+    c3[3] += a[3] * b[3];
+
+    a += 4;
+    b += 4;
+  }
+}
+
+#endif  // __ARM_NEON
+
+// 32位 float 矩阵乘法
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
+  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
+  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
+  int L1 = 30 * 1024;
+  int L2 = 1 * 1024 * 1024;
+
+  KC = k;
+  MC = L2 / (2 * KC * sizeof(float));
+  NC = MC;
+
+  // make sure MC is multiple of 4, and NC is multiple of 8
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + 4 - 1) / 4 * 4;
+  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + 8 - 1) / 8 * 8;
+  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    for (int i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
+                  relu);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias) {
+  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
+  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
+  int L1 = 30 * 1024;
+  int L2 = 1 * 1024 * 1024;
+
+  KC = k;
+  MC = L2 / (2 * KC * sizeof(float));
+  NC = MC;
+
+  // make sure MC is multiple of 4, and NC is multiple of 8
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + 4 - 1) / 4 * 4;
+  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + 8 - 1) / 8 * 8;
+  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    for (int i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
+                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
-}  // namespace paddle_mobile
diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake
index a8735adc8d853a5825a23f1ddf129d0a95199275..a4f58c37de510f700645ba7184b280f6eb1e11c2 100644
--- a/tools/ios-cmake/ios.toolchain.cmake
+++ b/tools/ios-cmake/ios.toolchain.cmake
@@ -159,7 +159,7 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
 
 # set the architecture for iOS
 if (${IOS_PLATFORM} STREQUAL "OS")
-    set (IOS_ARCH armv7 armv7s arm64)
+    set (IOS_ARCH armv7 armv7s)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
     set (IOS_ARCH i386)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")