Unverified commit 08f6217b, authored by Xiaoyang LI, committed by GitHub

cherry-pick: fix bias quantize error && fix clang build error, test=develop (#2065)

Parent 66d7398e
@@ -165,6 +165,11 @@ function(lite_cc_binary TARGET)
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
# strip binary target to reduce size
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND "${CMAKE_STRIP}" -s
"${TARGET}"
COMMENT "Strip debug symbols from the final executable.")
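# NOTE: when cross-compiling, ${CMAKE_STRIP} is expected to come from the
# toolchain file (e.g. the Android NDK's), so the strip binary matches the
# target ABI.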
# collect targets need to compile for lite
if (NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET})
@@ -207,6 +212,11 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND "${CMAKE_STRIP}" -s
"${TARGET}"
COMMENT "Strip debug symbols from the final executable.")
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
file(APPEND ${offline_test_registry_file} "${TARGET}\n")
......
@@ -68,7 +68,6 @@ class LITE_API Predictor {
GenRuntimeProgram();
}
program_->Run();
LOG(INFO) << "running";
}
// Get offset-th col of feed inputs.
......
@@ -58,8 +58,9 @@ void TestModel(const std::vector<Place>& valid_places,
std::vector<std::vector<float>> results;
// i = 1
// ground truth result from fluid
results.emplace_back(std::vector<float>(
{0.000227548, 0.000262385, 0.000260347, 0.000293865, 0.00025008}));
{0.0002451055, 0.0002585023, 0.0002659616, 0.0002823}));
auto* out = predictor.GetOutput(0);
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
......
@@ -6,6 +6,17 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
return()
endif()
set(script_dir ${CMAKE_CURRENT_SOURCE_DIR}/../../../tools/)
message(STATUS "generating arm dotprod code")
find_package(PythonInterp REQUIRED)
execute_process(COMMAND ${PYTHON_EXECUTABLE} ${script_dir}/convert_arm_sdot_to_machine_code.py
"--input_file=${CMAKE_CURRENT_SOURCE_DIR}/dotprod/__gemm_sdot_meta__.h"
"--output_file=${CMAKE_CURRENT_SOURCE_DIR}/dotprod/gemm_sdot.h"
RESULT_VARIABLE gen_code_ret)
if (NOT ${gen_code_ret} STREQUAL "0")
message(FATAL_ERROR "generating dotprod code quit with error: ${gen_code_ret}")
endif ()
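# Rationale (presumably the clang build fix named in the commit title): the
# Python script rewrites ARMv8.2 sdot instructions into raw ".word"
# machine-code encodings, so the kernels still assemble with toolchains whose
# assembler (e.g. clang's integrated assembler) lacks dotprod support.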
set(HAS_ARM_MATH_LIB_DIR OFF)
# will search name as "libmath_arm.${os}.${abi}.${lang}.a"
if(ARM_MATH_LIB_DIR AND EXISTS "${ARM_MATH_LIB_DIR}")
@@ -50,6 +61,25 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
funcs.cc
packed_sgemm.cc
sgemm.cc
gemm_prepacked_int8.cc
gemm_s8.cc
sgemv.cc
gemv_arm_int8.cc
conv3x3s1_direct_fp32.cc
conv3x3s2_direct_fp32.cc
conv3x3s1_depthwise_fp32.cc
conv3x3s2_depthwise_fp32.cc
conv3x3s1_direct_int8.cc
conv3x3s2_direct_int8.cc
conv3x3s1_depthwise_int8.cc
conv3x3s2_depthwise_int8.cc
conv5x5s1_depthwise_int8.cc
conv5x5s1_depthwise_fp32.cc
conv5x5s2_depthwise_fp32.cc
conv_depthwise_3x3p0.cc
conv_depthwise_3x3p1.cc
conv_winograd_3x3.cc
conv_impl.cc
softmax.cc
scale.cc
pooling.cc
@@ -57,32 +87,13 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
lrn.cc
decode_bboxes.cc
concat.cc
sgemv.cc
type_trans.cc
box_coder.cc
conv_impl.cc
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
conv_direct.cc
conv_depthwise_3x3_int8.cc
conv_depthwise_5x5s1_int8.cc
conv_depthwise_3x3p0.cc
conv_depthwise_3x3p1.cc
conv_depthwise_5x5s1.cc
conv_depthwise_5x5s2.cc
conv_depthwise.cc
conv_gemmlike.cc
conv_winograd_3x3.cc
conv_winograd.cc
split.cc
shuffle_channel.cc
activation.cc
yolo_box.cc
dropout.cc
gemm_prepacked_int8.cc
gemv_arm_int8.cc
conv3x3s1_direct_int8.cc
conv3x3s2_direct_int8.cc
power.cc
interpolate.cc
argmax.cc
......
(2 collapsed file diffs not shown.)
@@ -51,12 +51,7 @@ void conv_3x3s1_direct_fp32(const float* i_data,
const int win_round = wout_round + 2;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
// if (param.activation_param.has_active) {
// if (param.activation_param.active == Active_relu &&
// fabs(param.activation_param.negative_slope) < 1e-6f) {
// flag_relu = true;
// }
// }
int hout_r_block = (l2_size - 2 * win_round * ic) /
(win_round * ic + hout_c_block * wout_round * threads);
hout_r_block = hout_r_block > oh ? oh : hout_r_block;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
int threads = ctx->threads();
const int pad_h = param.paddings[0];
const int pad_w = param.paddings[1];
const int out_c_block = 4;
const int out_h_kernel = 1;
const int out_w_kernel = 4;
const int win_ext = ow * 2 + 1;
const int ow_round = ROUNDUP(ow, 4);
const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size =
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
/// get workspace
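// workspace layout: | zero row: win_round | write-back scratch: ow_round |
// per-thread pre-packed input: threads * prein_size |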
auto ptr_zero = ctx->workspace_data<float>();
memset(ptr_zero, 0, sizeof(float) * win_round);
float* ptr_write = ptr_zero + win_round;
int size_in_channel = win * ih;
int size_out_channel = ow * oh;
int ws = -pad_w;
int we = ws + win_round;
int hs = -pad_h;
int he = hs + hin_round;
int w_loop = ow_round / 4;
auto remain = w_loop * 4 - ow;
bool flag_remain = remain > 0;
remain = 4 - remain;
remain = remain > 0 ? remain : 0;
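// after the clamp, `remain` holds the number of valid output columns in the
// last 4-wide block; flag_remain marks that block as partial (see flag_mask
// below)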
int row_len = win_round * out_c_block;
for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel;
#pragma omp parallel for num_threads(threads)
for (int c = 0; c < oc; c += out_c_block) {
#ifdef ARM_WITH_OMP
float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
#else
float* pre_din = ptr_write + ow_round;
#endif
/// const array size
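// prepack_input_nxwc4_dw gathers channels [c, c + 4) over rows [hs, he) and
// cols [ws, we) into a channel-interleaved (c4) buffer, substituting ptr_zero
// for out-of-range padding rows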
prepack_input_nxwc4_dw(
din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
const float* weight_c = weights + c * 9; // kernel_w * kernel_h
float* dout_c00 = dout_batch + c * size_out_channel;
float bias_local[4] = {0, 0, 0, 0};
if (flag_bias) {
bias_local[0] = bias[c];
bias_local[1] = bias[c + 1];
bias_local[2] = bias[c + 2];
bias_local[3] = bias[c + 3];
}
#ifdef __aarch64__
float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
#endif
for (int h = 0; h < oh; h += out_h_kernel) {
float* outc0 = dout_c00 + h * ow;
float* outc1 = outc0 + size_out_channel;
float* outc2 = outc1 + size_out_channel;
float* outc3 = outc2 + size_out_channel;
const float* inr0 = pre_din + h * 2 * row_len;
const float* inr1 = inr0 + row_len;
const float* inr2 = inr1 + row_len;
if (c + out_c_block > oc) {
// intentional fall-through: every output channel beyond oc is redirected
// to the ptr_write scratch buffer
switch (c + out_c_block - oc) {
case 3:
outc1 = ptr_write;  // fall through
case 2:
outc2 = ptr_write;  // fall through
case 1:
outc3 = ptr_write;
default:
break;
}
}
auto c0 = outc0;
auto c1 = outc1;
auto c2 = outc2;
auto c3 = outc3;
float pre_out[16];
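// pre_out is a 4x4 staging tile: for a partial last block the asm stores
// here instead of the real output rows, and only the valid `remain` columns
// are copied out in the flag_mask epilogue below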
for (int w = 0; w < w_loop; ++w) {
bool flag_mask = (w == w_loop - 1) && flag_remain;
if (flag_mask) {
c0 = outc0;
c1 = outc1;
c2 = outc2;
c3 = outc3;
outc0 = pre_out;
outc1 = pre_out + 4;
outc2 = pre_out + 8;
outc3 = pre_out + 12;
}
// clang-format off
#ifdef __aarch64__
asm volatile(
"ldr q8, [%[bias]]\n" /* load bias */
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/
"and v19.16b, v8.16b, v8.16b\n"
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/
"and v20.16b, v8.16b, v8.16b\n"
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/
"and v21.16b, v8.16b, v8.16b\n"
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/
"and v22.16b, v8.16b, v8.16b\n"
"ldr q8, [%[inr0]]\n" /* load input r0*/
/* r0 mul w0-w2, get out */
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else
asm volatile(
/* fill with bias */
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */
/* load weights */
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/
"vand.i32 q12, q8, q8\n"
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/
"vand.i32 q13, q8, q8\n"
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/
"vand.i32 q14, q8, q8\n"
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/
"vand.i32 q15, q8, q8\n"
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/
/* mul r0 with w0, w1, w2 */
"vmla.f32 q12, q9, q0 @ w0 * inr0\n"
"vmla.f32 q13, q9, q2 @ w0 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */
"vmla.f32 q14, q9, q4 @ w0 * inr4\n"
"vmla.f32 q15, q9, q6 @ w0 * inr6\n"
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif // __aarch64__
// clang-format on
if (flag_mask) {
for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i];
c1[i] = pre_out[i + 4];
c2[i] = pre_out[i + 8];
c3[i] = pre_out[i + 12];
}
}
}
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
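As a cross-check for the NEON kernel above, here is a minimal scalar sketch of the same computation. It is not part of the patch: it assumes plain NCHW tensors with unpacked per-channel 3x3 weights (the kernel itself consumes c4-packed input and weights), and the helper name is hypothetical.

#include <algorithm>

// Hypothetical scalar reference for conv_3x3s2_depthwise_fp32 (stride 2,
// zero padding, one 3x3 filter per channel, optional bias and fused relu).
static void depthwise_3x3s2_ref(const float* in, float* out,
                                int bs, int ch, int ih, int iw,
                                int oh, int ow, int pad_h, int pad_w,
                                const float* weights, const float* bias,
                                bool relu) {
  for (int n = 0; n < bs; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* src = in + (n * ch + c) * ih * iw;
      const float* wc = weights + c * 9;  // 3x3 kernel for this channel
      float* dst = out + (n * ch + c) * oh * ow;
      for (int oy = 0; oy < oh; ++oy) {
        for (int ox = 0; ox < ow; ++ox) {
          float sum = bias ? bias[c] : 0.f;
          for (int ky = 0; ky < 3; ++ky) {
            int iy = oy * 2 - pad_h + ky;
            if (iy < 0 || iy >= ih) continue;  // zero-padded row
            for (int kx = 0; kx < 3; ++kx) {
              int ix = ox * 2 - pad_w + kx;
              if (ix < 0 || ix >= iw) continue;  // zero-padded col
              sum += src[iy * iw + ix] * wc[ky * 3 + kx];
            }
          }
          dst[oy * ow + ox] = relu ? std::max(sum, 0.f) : sum;
        }
      }
    }
  }
}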
(1 collapsed file diff not shown.)
@@ -51,12 +51,7 @@ void conv_3x3s2_direct_fp32(const float* i_data,
const int win_round = wout_round * 2 /*stride_w*/ + 1;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
// if (param.activation_param.has_active) {
// if (param.activation_param.active == Active_relu &&
// fabs(param.activation_param.negative_slope) < 1e-6f) {
// flag_relu = true;
// }
// }
//! get h block
//! win_round * ic * hin_r_block + wout_round * hout_c_block * hout_r_block
//! * threads = l2_size
......
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/conv_depthwise.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_depthwise.h"
namespace paddle {
namespace lite {
@@ -5073,7 +5073,7 @@ void conv_depthwise_5x5s1_small_impl(const float* din,
int w_in_new = w_in + 2 * pad_new;
int h_out_new = h_out - 2 * pad_0;
int w_out_new = w_out - 2 * pad_0;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int pad_cnt = pad_0 >> 2;
@@ -5320,7 +5320,7 @@ void conv_depthwise_5x5s1_small_relu_impl(const float* din,
int pad_0 = pad - pad_new;
int h_in_new = h_in + 2 * pad_new;
int w_in_new = w_in + 2 * pad_new;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int h_out_new = h_out - 2 * pad_0;
@@ -9177,7 +9177,7 @@ void conv_depthwise_5x5s1_small_impl(const float* din,
int w_in_new = w_in + 2 * pad_new;
int h_out_new = h_out - 2 * pad_0;
int w_out_new = w_out - 2 * pad_0;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int pad_cnt = pad_0 >> 2;
@@ -9359,7 +9359,7 @@ void conv_depthwise_5x5s1_small_relu_impl(const float* din,
int w_in_new = w_in + 2 * pad_new;
int h_out_new = h_out - 2 * pad_0;
int w_out_new = w_out - 2 * pad_0;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int pad_cnt = pad_0 >> 2;
@@ -9523,21 +9523,21 @@ void conv_depthwise_5x5s1_small_relu_impl(const float* din,
}
#endif // __aarch64__
void conv_depthwise_5x5s1(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_5x5s1_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (win < 4) {
if (flag_relu) {
conv_depthwise_5x5s1_small_relu_impl(din,
......
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/conv_depthwise.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_depthwise.h"
namespace paddle {
namespace lite {
@@ -80,21 +80,21 @@ void conv_depthwise_5x5s2p2_relu_s(const float* din,
bool flag_relu,
ARMContext* ctx);
void conv_depthwise_5x5s2(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_5x5s2_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (pad == 2) {
if (win >= 9) {
if (flag_relu) {
......
@@ -16,20 +16,16 @@
#include <cmath>
#include <vector>
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <PrecisionType Ptype>
class DepthwiseConv
: public ImplBase<TARGET(kARM), Ptype, operators::ConvParam> {
public:
typedef void (*conv_dw_impl)(const float* i_data,
void conv_3x3s1_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
@@ -37,62 +33,125 @@ class DepthwiseConv
int ow,
int ic,
int ih,
int kw,
const float* w_data,
const float* b_data,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
DepthwiseConv() = default;
~DepthwiseConv() {}
ARMContext* ctx);
virtual bool init(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
virtual bool create(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
virtual bool run(const operators::ConvParam& param);
private:
conv_dw_impl impl_{nullptr};
};
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx);
template <PrecisionType Ptype_out>
class DepthwiseConvInt8
: public ImplBase<TARGET(kARM), PRECISION(kInt8), operators::ConvParam> {
public:
typedef void (*conv_dw_int8_impl)(const int8_t* i_data,
int32_t* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int kw,
const int8_t* w_data,
const int32_t* b_data,
const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx,
PrecisionType out_type,
const float* scale);
void conv_depthwise_3x3p0_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
DepthwiseConvInt8() = default;
~DepthwiseConvInt8() {}
void conv_depthwise_3x3p1_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
virtual bool init(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
template <typename Dtype>
void conv_depthwise_3x3s1_int8(Dtype* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
virtual bool create(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
virtual bool run(const operators::ConvParam& param);
void conv_depthwise_5x5s1_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
private:
conv_dw_int8_impl impl_{nullptr};
std::vector<float> w_scale_;
Tensor tmp_int32_out_;
};
void conv_depthwise_5x5s2_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
} // namespace math
} // namespace arm
......
@@ -128,21 +128,21 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3p0(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_3x3p0_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (stride == 1) {
if (flag_relu) {
if (w_in > 5) {
......
@@ -128,21 +128,21 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3p1(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_3x3p1_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (stride == 1) {
if (flag_relu) {
if (w_in > 4) {
......
(6 collapsed file diffs not shown.)
@@ -102,7 +102,7 @@ void conv_winograd3x3(const float* din,
//! dot mul
//! transpose input, convert from ch_in * tile_h * tile_w * 64 to
//! 64 * ch_in * tile_h * tile_w
int hblock = get_hblock(ctx->arch());
int hblock = get_hblock(ctx);
int m_round = hblock * ((chout + hblock - 1) / hblock);
int stride_a = m_round * chin;
int stride_b = chin * size_tile;
......
(2 collapsed file diffs not shown.)
@@ -15,7 +15,6 @@
#pragma once
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/tensor.h"
namespace paddle {
@@ -34,7 +33,7 @@ const int NBLOCK_INT8_OTH = 16;
const int MBLOCK_INT8_DOT = 8;
const int NBLOCK_INT8_DOT = 12;
inline int get_hblock_int8(const ARMContext* ctx) {
inline int get_hblock_int8(ARMContext* ctx) {
#ifdef WITH_ARM_DOTPROD
if (ctx->has_dot()) {
return MBLOCK_INT8_DOT;
@@ -51,7 +50,7 @@ inline int get_hblock_int8(const ARMContext* ctx) {
const int MBLOCK_INT8_OTH = 4;
const int NBLOCK_INT8_OTH = 8;
inline int get_hblock_int8(const ARMContext* ctx) { return 4; }
inline int get_hblock_int8(ARMContext* ctx) { return 4; }
#endif // __aarch64__
void prepackA_int8(void* out,
@@ -75,7 +74,7 @@ void prepackA_int8(TensorLite* tout,
template <typename dtype>
void gemm_prepack_int8(const int8_t* A_packed,
const int8_t* B,
const int* bias,
const float* bias,
dtype* C,
int M,
int N,
@@ -87,7 +86,6 @@ void gemm_prepack_int8(const int8_t* A_packed,
ARMContext* ctx);
#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b))
} // namespace math
} // namespace arm
} // namespace lite
......
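Reviewer note: the `bias` parameter of `gemm_prepack_int8` changing from `const int*` to `const float*` (and likewise for `gemv_int8` below) appears to be the "bias quantize error" fix named in the commit title: bias is now passed as float, presumably so it can be applied after scaling rather than re-quantized to int32.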
(2 collapsed file diffs not shown.)
@@ -14,7 +14,7 @@
#pragma once
#include <cmath>
#include "lite/core/device_info.h"
#include "lite/core/context.h"
namespace paddle {
namespace lite {
@@ -30,9 +30,10 @@ bool gemv_int8(const int8_t* A,
int M,
int N,
const float* scale,
bool is_bias = false,
const int* bias = nullptr,
bool is_relu = false);
bool is_bias,
const float* bias,
bool is_relu,
const ARMContext* ctx);
} // namespace math
} // namespace arm
......
(16 collapsed file diffs not shown.)
add_subdirectory(kernels)
add_subdirectory(math)
(11 collapsed file diffs not shown.)