Commit 03e21542 authored by 李寅

Optimize gemv u8 and refactor ops.

Parent 5efbfbff
......@@ -11,6 +11,7 @@ load(
"//mace:mace.bzl",
"if_android",
"if_neon_enabled",
"if_neon_enabled_str",
"if_openmp_enabled",
"if_android_armv7",
"if_hexagon_enabled",
......@@ -41,8 +42,8 @@ cc_library(
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
"//mace/public",
"//mace/ops",
"//mace/public",
],
alwayslink = 1,
)
......@@ -59,8 +60,8 @@ cc_binary(
linkshared = 1,
linkstatic = 0,
deps = [
"//mace/libmace:mace_version_script.lds",
"//mace/libmace",
"//mace/libmace:mace_version_script.lds",
],
)
......@@ -81,6 +82,8 @@ genrule(
srcs = [
"//mace/codegen:generated_version",
"//mace/core",
"//mace/ops:common",
"//mace/ops:ref_kernels",
"//mace/ops:internal_ops",
"//mace/ops",
"//mace/libmace",
......@@ -88,13 +91,20 @@ genrule(
"//mace/proto:mace_cc",
"@com_google_protobuf//:protobuf_lite",
] + if_opencl_enabled([
"//mace/ops:opencl_kernels",
"//mace/codegen:generated_opencl",
]) + if_neon_enabled([
"//mace/ops:arm_neon_kernels",
]),
outs = ["libmace.a"],
cmd = "tmp_mri_file=$$(mktemp mace-static-lib-mri.XXXXXXXXXX);" +
"mri_stream=$$(python $(location //mace/python/tools:archive_static_lib) " +
"$(locations //mace/codegen:generated_version) " +
"$(locations //mace/core:core) " +
"$(locations //mace/ops:common) " +
"$(locations //mace/ops:ref_kernels) " +
if_neon_enabled_str("$(locations //mace/ops:arm_neon_kernels) ") +
if_opencl_enabled_str("$(locations //mace/ops:opencl_kernels) ") +
"$(locations //mace/ops:internal_ops) " +
"$(locations //mace/ops:ops) " +
"$(locations //mace/libmace:libmace) " +
......
......@@ -17,7 +17,7 @@
#include <string>
#include <vector>
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace mace {
......
......@@ -26,7 +26,7 @@
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/ops_registry.h"
#include "mace/ops/transpose.h"
#include "mace/ops/common/transpose.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......
......@@ -42,6 +42,12 @@ def if_neon_enabled(a):
"//conditions:default": [],
})
def if_neon_enabled_str(a):
return select({
"//mace:neon_enabled": a,
"//conditions:default": "",
})
def if_hexagon_enabled(a):
return select({
"//mace:hexagon_enabled": a,
......
......@@ -18,61 +18,323 @@ load(
)
cc_library(
name = "internal_ops",
name = "common",
srcs = glob(
[
"*.cc",
"arm/*.cc",
"common/*.cc",
],
),
hdrs = glob(
[
"common/*.h",
],
),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
"//mace/core",
],
)
cc_library(
name = "testing",
srcs = glob(
[
"testing/*.cc",
],
),
hdrs = glob(
[
"testing/*.h",
],
),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
"//mace/core",
"@gtest//:gtest",
],
)
cc_library(
name = "ref_kernels",
srcs = glob(
[
"ref/*.cc",
],
),
hdrs = glob(
[
"ref/*.h",
],
),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
":common",
"//mace/core",
],
)
# After the refactoring, all ARM NEON kernels go here.
# They can be shipped as a submodule for use in other products.
cc_library(
name = "arm_neon_kernels",
srcs = glob(
[
"arm/fp32/*.cc",
],
exclude = [
"*_test.cc",
"*_benchmark.cc",
"arm/*_test.cc",
"ops_registry.cc",
"ops_test_util.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"quantize.cc",
"quantization_util.cc",
"arm/fp32/*_test.cc",
],
) + if_quantize_enabled(glob(
[
"arm/q8/*.cc",
],
) + if_opencl_enabled(glob(
exclude = [
"arm/q8/*_test.cc",
],
)),
hdrs = glob(
[
"arm/fp32/*.h",
],
) + if_quantize_enabled(glob(
[
"arm/q8/*.h",
],
)),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
":common",
"//mace/core",
],
)
# After the refactoring, all GPU OpenCL kernels go here.
# They can be shipped as a submodule for use in other products.
cc_library(
name = "opencl_kernels",
srcs = glob(
[
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"opencl/**/*.cc",
"buffer_transform.cc",
"lstm_cell.cc",
],
exclude = [
"opencl/*_test.cc",
],
)) + if_quantize_enabled(glob(
),
hdrs = glob(
[
"opencl/*.h",
"opencl/**/*.h",
],
),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
":common",
"//mace/core",
],
)
cc_library(
name = "arm_neon_kernels_test",
srcs = glob(
[
"arm/fp32/*_test.cc",
],
) + if_quantize_enabled(glob(
[
"arm/q8/*_test.cc",
],
)),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
":arm_neon_kernels",
":ref_kernels",
":testing",
"@gtest//:gtest",
],
alwayslink = 1,
)
cc_library(
name = "opencl_kernels_test",
srcs = glob(
[
"opencl/*_test.cc",
"opencl/**/*_test.cc",
],
),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
":opencl_kernels",
":ref_kernels",
":testing",
"@gtest//:gtest",
],
alwayslink = 1,
)
cc_library(
name = "internal_ops",
srcs = glob(
[
"*.cc",
"arm/*.cc", # remove it after refactor
],
exclude = [
"*_test.cc",
"*_benchmark.cc",
"ops_registry.cc",
"ops_test_util.cc",
"lstm_cell.cc", # TODO: move it into opencl
"buffer_transform.cc", # TODO: move it into opencl
"quantize.cc",
"quantization_util.cc",
"arm/*_test.cc", # remove it after refactor
],
)),
) + if_quantize_enabled(
glob(
[
"quantize.cc",
"quantization_util.cc",
],
),
),
hdrs = glob(
[
"*.h",
"arm/*.h",
"arm/*.h", # remove it after refactor
],
exclude = [
"ops_registry.h",
"ops_test_util.h",
"fixpoint.h",
"gemmlowp_util.h",
"arm/fixpoint_*.h",
"quantization_util.h",
],
) + if_opencl_enabled(glob([
"opencl/*.h",
"opencl/image/*.h",
"opencl/buffer/*.h",
])) + if_quantize_enabled(glob([
) + if_quantize_enabled(glob([
"fixpoint.h",
"gemmlowp_util.h",
"arm/fixpoint_*.h",
"quantization_util.h",
])),
copts = [
......@@ -85,7 +347,6 @@ cc_library(
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -96,21 +357,30 @@ cc_library(
]),
linkopts = if_android(["-lm"]),
deps = [
":ref_kernels",
"//mace/core",
] + if_quantize_enabled([
"@tflite",
"@gemmlowp",
]) + if_neon_enabled([
":arm_neon_kernels",
]) + if_opencl_enabled([
":opencl_kernels",
]),
)
cc_library(
name = "ops",
srcs = [
"ops_registry.cc",
],
hdrs = [
"ops_registry.h",
],
srcs = glob(
[
"ops_registry.cc",
],
),
hdrs = glob(
[
"ops_registry.h",
],
),
copts = [
"-Werror",
"-Wextra",
......@@ -121,7 +391,6 @@ cc_library(
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -161,6 +430,7 @@ cc_library(
]),
deps = [
"ops",
"testing",
"@gtest",
],
)
......@@ -171,8 +441,8 @@ cc_test(
srcs = glob(
[
"*_test.cc",
"arm/*_test.cc",
"opencl/*_test.cc",
"arm/*_test.cc", # remove it after refactor
"ops_test_util.cc",
],
exclude = [
"fixpoint_test.cc",
......@@ -203,9 +473,14 @@ cc_test(
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"test",
":ops",
":test",
"@gtest//:gtest_main",
],
] + if_neon_enabled([
":arm_neon_kernels_test",
]) + if_opencl_enabled([
":opencl_kernels_test",
]),
)
cc_test(
......@@ -233,7 +508,7 @@ cc_test(
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"test",
":ops",
"//mace/benchmark:statistics",
"//mace/core:test_benchmark_main",
"//third_party/eigen3",
......
......@@ -20,22 +20,13 @@
#include <string>
#include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/arm/activation_neon.h"
#include "mace/utils/logging.h"
namespace mace {
namespace ops {
enum ActivationType {
NOOP = 0,
RELU = 1,
RELUX = 2,
PRELU = 3,
TANH = 4,
SIGMOID = 5,
LEAKYRELU = 6,
};
inline ActivationType StringToActivationType(const std::string type) {
if (type == "RELU") {
return ActivationType::RELU;
......
# Notes for Contributors and Roadmap
We are going to refactor and optimize the ARM-related kernels one step at a time.
The code will be organized so that the kernels for each data type are kept separate.
This way they are independent of each other, can be linked and shipped as a submodule,
and we avoid writing macro boilerplate. We do not use a unified header file per kernel
because we do not want to force developers to use exactly the same interface for kernels
of different data types at this level, although doing so is recommended when convenient.
A reference implementation is kept directly in the `ops` directory and serves as the
non-NEON kernels; for simplicity it is not separated by data type.
Although the interface is kept flexible and can be defined according to demand,
input and output parameters should be of `Tensor` type instead of raw pointers, since a
`Tensor` carries more information that a kernel might use.
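As a rough illustration (the kernel name and path below are hypothetical; the shape of the interface follows the `arm::fp32::Gemv` class added in this commit), a per-data-type kernel looks like this:

```cpp
// mace/ops/arm/fp32/some_kernel.h (illustrative path)
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

class SomeKernel {
 public:
  // Inputs and outputs are Tensor pointers rather than raw buffers, so the
  // kernel can query shapes, scales and zero points when it needs them.
  MaceStatus Compute(const OpContext *context,
                     const Tensor *input,
                     Tensor *output);
};

}  // namespace fp32
}  // namespace arm
}  // namespace ops
}  // namespace mace
```

The same class name can be reused under `arm/q8` (possibly templated over the output type, as `q8::Gemv` is) without any shared header being required.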
......@@ -15,7 +15,6 @@
#include <algorithm>
#include "mace/ops/arm/conv_winograd.h"
#include "mace/ops/gemm.h"
namespace mace {
namespace ops {
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FIXPOINT_GEMM_H_
#define MACE_OPS_ARM_FIXPOINT_GEMM_H_
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
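// AArch32 has no across-lanes add instruction, so the AArch64 vaddvq_u32 is
// emulated below by summing the four 32-bit lanes.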
#define vaddvq_u32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#endif
namespace mace {
namespace ops {
template<typename INPUT_TYPE, typename OUTPUT_TYPE>
void FixPointGemv(const INPUT_TYPE *lhs,
const INPUT_TYPE *rhs,
const int lhs_zero_point,
const int rhs_zero_point,
const index_t lhs_height,
const index_t lhs_width,
OUTPUT_TYPE *result);
template<>
void FixPointGemv<uint8_t, int32_t>(const uint8_t *lhs,
const uint8_t *rhs,
const int lhs_zero_point,
const int rhs_zero_point,
const index_t lhs_height,
const index_t lhs_width,
int32_t *result) {
int32_t zero_point_dot = lhs_zero_point * rhs_zero_point * lhs_width;
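// Expanding sum_i (lhs_i - lhs_zp) * (rhs_i - rhs_zp) gives
//   dot(lhs, rhs) - rhs_zp * sum(lhs) - lhs_zp * sum(rhs) + lhs_zp * rhs_zp * width,
// so the raw uint8 dot product can be corrected with these sums afterwards.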
uint32_t sum_rhs = 0;
for (index_t i = 0; i < lhs_width; ++i) {
sum_rhs += rhs[i];
}
#pragma omp parallel for
for (index_t h = 0; h < lhs_height; ++h) {
const uint8_t *lhs_ptr = lhs + h * lhs_width;
const uint8_t *rhs_ptr = rhs;
int32_t *ret_ptr = result + h;
uint32_t dot = 0;
uint32_t sum_lhs = 0;
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
uint32x4_t vo0_high_u32, vo0_low_u32, vo1_high_u32, vo1_low_u32;
vo0_high_u32 = vdupq_n_u32(0);
vo0_low_u32 = vdupq_n_u32(0);
vo1_high_u32 = vdupq_n_u32(0);
vo1_low_u32 = vdupq_n_u32(0);
uint32x4_t sum_lhs_low_u32, sum_lhs_high_u32;
sum_lhs_low_u32 = vdupq_n_u32(0);
sum_lhs_high_u32 = vdupq_n_u32(0);
for (; w <= lhs_width - 16; w += 16) {
uint8x8_t vl0_u8, vl1_u8;
uint8x8_t vr0_u8, vr1_u8;
uint16x8_t vl0_u16, vl1_u16;
uint16x8_t vr0_u16, vr1_u16;
vl0_u8 = vld1_u8(lhs_ptr);
vl1_u8 = vld1_u8(lhs_ptr + 8);
vr0_u8 = vld1_u8(rhs_ptr);
vr1_u8 = vld1_u8(rhs_ptr + 8);
vl0_u16 = vmovl_u8(vl0_u8);
vl1_u16 = vmovl_u8(vl1_u8);
vr0_u16 = vmovl_u8(vr0_u8);
vr1_u16 = vmovl_u8(vr1_u8);
vo0_high_u32 = vmlal_u16(vo0_high_u32,
vget_high_u16(vl0_u16),
vget_high_u16(vr0_u16));
vo0_low_u32 = vmlal_u16(vo0_low_u32,
vget_low_u16(vl0_u16),
vget_low_u16(vr0_u16));
vo1_high_u32 = vmlal_u16(vo1_high_u32,
vget_high_u16(vl1_u16),
vget_high_u16(vr1_u16));
vo1_low_u32 = vmlal_u16(vo1_low_u32,
vget_low_u16(vl1_u16),
vget_low_u16(vr1_u16));
// sum_lhs could be precomputed if lhs is constant, but in this case
// the computation is not the bottleneck.
sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16),
vget_high_u16(vl1_u16));
sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16),
vget_low_u16(vl1_u16));
lhs_ptr += 16;
rhs_ptr += 16;
}
vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32);
vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32);
vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32);
dot += vaddvq_u32(vo0_low_u32);
sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32);
sum_lhs = vaddvq_u32(sum_lhs_low_u32);
#endif // MACE_ENABLE_NEON
for (; w < lhs_width; ++w) {
dot += (*lhs_ptr) * (*rhs_ptr);
sum_lhs += (*lhs_ptr);
++lhs_ptr;
++rhs_ptr;
}
int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point
+ zero_point_dot;
*ret_ptr = ret;
} // h
}
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FIXPOINT_GEMM_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/fp32/gemv.h"
#include <arm_neon.h>
#include <algorithm>
#if !defined(__aarch64__)
#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#endif
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
MaceStatus Gemv::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const index_t h_block_size = 4;
const index_t h_block_count = RoundUpDiv(lhs_height, h_block_size);
const index_t w_block_size = 8;
const index_t w_block_count = lhs_width / w_block_size;
const index_t w_remain = lhs_width - w_block_size * w_block_count;
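// Blocking scheme: four rows (h_block_size) and eight columns (w_block_size)
// are processed per NEON step; leftover columns (w_remain) and row blocks
// shorter than four fall back to the narrower paths below.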
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t h_block_idx = 0; h_block_idx < h_block_count; ++h_block_idx) {
// TODO(liyin): these could be hoisted outside the loop,
// but OpenMP limits the parameter count
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
const float *bias_data = nullptr;
if (bias) {
bias_data = bias->data<float>();
}
float *output_data = output->mutable_data<float>();
const float
*lhs_ptr = lhs_data
+ static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ lhs_width * h_block_idx * h_block_size;
const float *rhs_ptr = rhs_data + b * lhs_width;
float
*ret_ptr = output_data + b * lhs_height + h_block_idx * h_block_size;
const index_t h_block_len =
std::min(h_block_size, lhs_height - h_block_idx * h_block_size);
const index_t h_offset = h_block_idx * h_block_size;
if (h_block_len == 4) {
float32x4_t vo0 = vdupq_n_f32(0);
float32x4_t vo1 = vdupq_n_f32(0);
float32x4_t vo2 = vdupq_n_f32(0);
float32x4_t vo3 = vdupq_n_f32(0);
index_t r_w_block_count = w_block_count;
// Just to keep the compiler happy.
MACE_UNUSED(r_w_block_count);
// Register layout: (4x8) x (8,1)
//
// +----+
// |d16 |
// | . |
// Rhs +----+
// |d17 |
// | . |
// +----+
// |d18 |
// | . |
// +----+
// |d19 |
// | . |
// +----+
//
// | |
//
// Lhs | |
//
// +------+------+----+-----+ - - - - +----+
// | d0 . | d1 .| d2 .| d3 .| |vo0 |
// | d4 . | d5 .| d6 .| d7 .| |vo1 |
// | d8 . | d9 .| d10.| d11.| |vo2 |
// | d12. | d13.| d14.| d15.| |vo3 |
// +------+-----+-----+-----+ - - - - +----+
//
// Accumulator
//
#if not defined(__aarch64__)
asm volatile(
"cmp %[r_w_block_count], #0\n"
"beq 0f\n"
"lsl r5, %[lhs_width], #2\n"
"mov r0, %[rhs_ptr]\n"
"mov r1, %[lhs_ptr]\n"
"add r2, r1, r5\n"
"add r3, r2, r5\n"
"add r4, r3, r5\n"
// prologue
"vld1.f32 {d16-d17}, [r0]!\n"
"vld1.f32 {d18-d19}, [r0]!\n"
"vld1.f32 {d0-d1}, [r1]!\n"
"vld1.f32 {d2-d3}, [r1]!\n"
"vld1.f32 {d4-d5}, [r2]!\n"
"vld1.f32 {d6-d7}, [r2]!\n"
"vld1.f32 {d8-d9}, [r3]!\n"
"vld1.f32 {d10-d11}, [r3]!\n"
"vld1.f32 {d12-d13}, [r4]!\n"
"vld1.f32 {d14-d15}, [r4]!\n"
"subs %[r_w_block_count], #1\n"
"beq 1f\n"
"2: \n"
"vmla.f32 %q[vo0], q0, q8\n"
"vmla.f32 %q[vo1], q2, q8\n"
"vmla.f32 %q[vo2], q4, q8\n"
"vmla.f32 %q[vo3], q6, q8\n"
"vmla.f32 %q[vo0], q1, q9\n"
"vmla.f32 %q[vo1], q3, q9\n"
"vmla.f32 %q[vo2], q5, q9\n"
"vmla.f32 %q[vo3], q7, q9\n"
"subs %[r_w_block_count], #1\n"
"vld1.f32 {d0-d1}, [r1]!\n"
"vld1.f32 {d4-d5}, [r2]!\n"
"vld1.f32 {d8-d9}, [r3]!\n"
"vld1.f32 {d12-d13}, [r4]!\n"
"vld1.f32 {d16-d17}, [r0]!\n"
"vld1.f32 {d2-d3}, [r1]!\n"
"vld1.f32 {d6-d7}, [r2]!\n"
"vld1.f32 {d10-d11}, [r3]!\n"
"vld1.f32 {d14-d15}, [r4]!\n"
"vld1.f32 {d18-d19}, [r0]!\n"
"bne 2b\n"
// epilogue
"1:\n"
"vmla.f32 %q[vo0], q0, q8\n"
"vmla.f32 %q[vo1], q2, q8\n"
"vmla.f32 %q[vo2], q4, q8\n"
"vmla.f32 %q[vo3], q6, q8\n"
"vmla.f32 %q[vo0], q1, q9\n"
"vmla.f32 %q[vo1], q3, q9\n"
"vmla.f32 %q[vo2], q5, q9\n"
"vmla.f32 %q[vo3], q7, q9\n"
"0:\n"
: // outputs
[vo0] "+w"(vo0),
[vo1] "+w"(vo1),
[vo2] "+w"(vo2),
[vo3] "+w"(vo3),
[r_w_block_count] "+r"(r_w_block_count)
: // inputs
[lhs_ptr] "r"(lhs_ptr), [rhs_ptr] "r"(rhs_ptr),
[lhs_width] "r"(lhs_width)
: // clobbers
"cc", "memory", "r0", "r1", "r2", "r3", "r4", "r5",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
"d21");
lhs_ptr += w_block_count * w_block_size;
rhs_ptr += w_block_count * w_block_size;
#else
for (index_t w_block_index = 0; w_block_index < w_block_count;
++w_block_index) {
float32x4_t vr0 = vld1q_f32(rhs_ptr);
float32x4_t vr0n = vld1q_f32(rhs_ptr + 4);
float32x4_t vl0 = vld1q_f32(lhs_ptr);
float32x4_t vl0n = vld1q_f32(lhs_ptr + 4);
vo0 = vmlaq_f32(vo0, vl0, vr0);
vo0 = vmlaq_f32(vo0, vl0n, vr0n);
const float *lhs_ptr1 = lhs_ptr + lhs_width;
float32x4_t vl1 = vld1q_f32(lhs_ptr1);
float32x4_t vl1n = vld1q_f32(lhs_ptr1 + 4);
vo1 = vmlaq_f32(vo1, vl1, vr0);
vo1 = vmlaq_f32(vo1, vl1n, vr0n);
const float *lhs_ptr2 = lhs_ptr1 + lhs_width;
float32x4_t vl2 = vld1q_f32(lhs_ptr2);
float32x4_t vl2n = vld1q_f32(lhs_ptr2 + 4);
vo2 = vmlaq_f32(vo2, vl2, vr0);
vo2 = vmlaq_f32(vo2, vl2n, vr0n);
const float *lhs_ptr3 = lhs_ptr2 + lhs_width;
float32x4_t vl3 = vld1q_f32(lhs_ptr3);
float32x4_t vl3n = vld1q_f32(lhs_ptr3 + 4);
vo3 = vmlaq_f32(vo3, vl3, vr0);
vo3 = vmlaq_f32(vo3, vl3n, vr0n);
lhs_ptr += 8;
rhs_ptr += 8;
}
#endif // __aarch64__
float32x4_t vo = {
vaddvq_f32(vo0),
vaddvq_f32(vo1),
vaddvq_f32(vo2),
vaddvq_f32(vo3)
};
for (index_t w = 0; w < w_remain; ++w) {
vo[0] += lhs_ptr[0] * rhs_ptr[0];
vo[1] += lhs_ptr[lhs_width] * rhs_ptr[0];
vo[2] += lhs_ptr[lhs_width * 2] * rhs_ptr[0];
vo[3] += lhs_ptr[lhs_width * 3] * rhs_ptr[0];
++lhs_ptr;
++rhs_ptr;
}
float32x4_t vbias = vdupq_n_f32(0);
if (bias) {
vbias = vld1q_f32(bias_data + h_offset);
}
vo = vaddq_f32(vo, vbias);
vst1q_f32(ret_ptr, vo);
} else { // h_block_len < 4
// TODO(liyin): handle the 1, 2 and 3 row cases separately to accelerate this path
const float *tmp_lhs_ptr = lhs_ptr;
const float *tmp_rhs_ptr = rhs_ptr;
for (index_t h = 0; h < h_block_len; ++h) {
lhs_ptr = tmp_lhs_ptr + h * lhs_width;
rhs_ptr = tmp_rhs_ptr;
float32x4_t vo0 = vdupq_n_f32(0);
for (index_t w = 0; w < w_block_count; ++w) {
float32x4_t vr0 = vld1q_f32(rhs_ptr);
float32x4_t vr0n = vld1q_f32(rhs_ptr + 4);
float32x4_t vl0 = vld1q_f32(lhs_ptr);
float32x4_t vl0n = vld1q_f32(lhs_ptr + 4);
vo0 = vmlaq_f32(vo0, vl0, vr0);
vo0 = vmlaq_f32(vo0, vl0n, vr0n);
lhs_ptr += 8;
rhs_ptr += 8;
} // w
float s0 = vaddvq_f32(vo0) + (bias ? bias_data[h_offset + h] : 0);
for (index_t w = 0; w < w_remain; ++w) {
s0 += lhs_ptr[0] * rhs_ptr[0];
++lhs_ptr;
++rhs_ptr;
} // w
ret_ptr[h] = s0;
} // h
} // if
} // h_block_idx
} // b
return MaceStatus::MACE_SUCCESS;
}
#if defined(vaddvq_f32)
#undef vaddvq_f32
#endif
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_GEMV_H_
#define MACE_OPS_ARM_FP32_GEMV_H_
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Gemv {
public:
Gemv() {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_GEMV_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/gemv.h"
#include "mace/ops/ref/gemv.h"
#include "mace/ops/testing/test_utils.h"
namespace mace {
namespace ops {
namespace test {
void TestGemvFloat32(const index_t batch,
const index_t height,
const index_t width,
const bool lhs_batched) {
Tensor lhs(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor rhs(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor bias(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
lhs.Resize({lhs_batched ? batch : 1, height, width});
rhs.Resize({batch, width});
bias.Resize({height});
output.Resize({batch, height});
{
Tensor::MappingGuard lhs_guard(&lhs);
Tensor::MappingGuard rhs_guard(&rhs);
Tensor::MappingGuard bias_guard(&bias);
float *lhs_data = lhs.mutable_data<float>();
float *rhs_data = rhs.mutable_data<float>();
float *bias_data = bias.mutable_data<float>();
GenerateRandomRealTypeData<float>(lhs.shape(), lhs_data);
GenerateRandomRealTypeData<float>(rhs.shape(), rhs_data);
GenerateRandomRealTypeData<float>(bias.shape(), bias_data);
}
::mace::ops::arm::fp32::Gemv gemv;
gemv.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT);
expected_output.Resize({batch, height});
::mace::ops::ref::Gemv<float> gemv_ref;
gemv_ref.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output);
const float *output_data = output.data<float>();
const float *expected_data = expected_output.data<float>();
for (index_t i = 0; i < output.size(); ++i) {
EXPECT_NEAR(expected_data[i], output_data[i], 0.001);
}
}
TEST(ArmGemv, TestGemvFloat32) {
TestGemvFloat32(1, 16, 4, true);
TestGemvFloat32(1, 16, 256, true);
TestGemvFloat32(2, 16, 256, true);
TestGemvFloat32(3, 63, 257, true);
TestGemvFloat32(1, 16, 4, false);
TestGemvFloat32(1, 16, 256, false);
TestGemvFloat32(2, 16, 256, false);
TestGemvFloat32(3, 63, 257, false);
}
} // namespace test
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/q8/gemv.h"
#include <arm_neon.h>
#include <algorithm>
#include "mace/utils/utils.h"
#include "mace/utils/quantize.h"
#if !defined(__aarch64__)
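// AArch32 fallbacks: emulate the AArch64-only vmlal_high_s16 and the
// across-lanes add vaddvq_s32 that the kernel uses below.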
#define vmlal_high_s16(c, a, b) vmlal_s16(c, vget_high_s16(a), vget_high_s16(b))
#define vaddvq_s32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#endif
namespace mace {
namespace ops {
namespace arm {
namespace q8 {
template<typename OUTPUT_TYPE>
MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output) {
MACE_UNUSED(context);
bool is_output_type_uint8 =
DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8;
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
float output_multiplier_float = 0.0;
int32_t output_multiplier = 0;
int32_t output_shift = 0;
if (is_output_type_uint8) {
MACE_CHECK(output->scale() > 0, "output scale must not be zero");
output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
GetOutputMultiplierAndShift(lhs->scale(),
rhs->scale(),
output->scale(),
&output_multiplier,
&output_shift);
}
const index_t h_block_size = 4;
const index_t h_block_count = RoundUpDiv(lhs_height, h_block_size);
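// Blocking scheme: four output rows per iteration, 16 uint8 values per NEON
// step. Zero points are removed with widening u8 -> 16-bit subtracts, and the
// int16 products are accumulated into int32 lanes.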
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t h_block_idx = 0; h_block_idx < h_block_count; ++h_block_idx) {
// TODO(liyin): these could be hoisted outside the loop,
// but OpenMP limits the parameter count
const index_t w_block_size = 16;
const index_t w_block_count = lhs_width / w_block_size;
const index_t w_remain = lhs_width - w_block_size * w_block_count;
uint8_t lhs_zero_point = static_cast<uint8_t>(lhs->zero_point());
uint8_t rhs_zero_point = static_cast<uint8_t>(rhs->zero_point());
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
OUTPUT_TYPE *output_data = output->mutable_data<OUTPUT_TYPE>();
int32x4_t voutput_multiplier = vdupq_n_s32(output_multiplier);
int32x4_t voutput_shift_left = vdupq_n_s32(-output_shift);
uint8x8_t
vlhs_zero_point = vdup_n_u8(lhs_zero_point);
uint8x8_t
vrhs_zero_point = vdup_n_u8(rhs_zero_point);
const uint8_t
*lhs_ptr = lhs_data
+ static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ lhs_width * h_block_idx * h_block_size;
const uint8_t *rhs_ptr = rhs_data + b * lhs_width;
OUTPUT_TYPE
*ret_ptr = output_data + b * lhs_height + h_block_idx * h_block_size;
const index_t h_block_len =
std::min(h_block_size, lhs_height - h_block_idx * h_block_size);
const index_t h_offset = h_block_idx * h_block_size;
if (h_block_len == 4) {
int32x4_t vo0 = vdupq_n_s32(0);
int32x4_t vo1 = vdupq_n_s32(0);
int32x4_t vo2 = vdupq_n_s32(0);
int32x4_t vo3 = vdupq_n_s32(0);
index_t r_w_block_count = w_block_count;
// Just to keep the compiler happy.
MACE_UNUSED(r_w_block_count);
// Register layout: (4x16) x (16x1)
//
// +----+
// |d16 |
// | . |
// | . |
// | . |
// Rhs +----+
// |d17 |
// | . |
// | . |
// | . |
// +----+
// |d18 |
// | . |
// | . |
// | . |
// +----+
// |d19 |
// | . |
// | . |
// | . |
// +----+
//
// | |
//
// Lhs | |
//
// +--------+--------+--------+--------+ - - - - +----+
// | d0 ... | d1 ... | d2 ... | d3 ... | |vo0 |
// | d4 ... | d5 ... | d6 ... | d7 ... | |vo1 |
// | d8 ... | d9 ... | d10... | d11... | |vo2 |
// | d12... | d13... | d14... | d15... | |vo3 |
// +--------+--------+--------+--------+ - - - - +----+
//
// Accumulator
//
#if not defined(__aarch64__)
asm volatile(
"cmp %[r_w_block_count], #0\n"
"beq 0f\n"
"mov r0, %[rhs_ptr]\n"
"mov r1, %[lhs_ptr]\n"
"add r2, r1, %[lhs_width]\n"
"add r3, r2, %[lhs_width]\n"
"add r4, r3, %[lhs_width]\n"
"vdup.u8 d20, %[rhs_zero_point]\n"
"vdup.u8 d21, %[lhs_zero_point]\n"
// prologue
"vld1.8 d16, [r0]!\n"
"vld1.8 d18, [r0]!\n"
"vld1.8 d0, [r1]!\n"
"vld1.8 d2, [r1]!\n"
"vld1.8 d4, [r2]!\n"
"vld1.8 d6, [r2]!\n"
"vld1.8 d8, [r3]!\n"
"vld1.8 d10, [r3]!\n"
"vld1.8 d12, [r4]!\n"
"vld1.8 d14, [r4]!\n"
"subs %[r_w_block_count], #1\n"
"beq 1f\n"
"2: \n"
"vsubl.u8 q8, d16, d20\n"
"vsubl.u8 q9, d18, d20\n"
"vsubl.u8 q0, d0, d21\n"
"vsubl.u8 q1, d2, d21\n"
"vsubl.u8 q2, d4, d21\n"
"vsubl.u8 q3, d6, d21\n"
"vsubl.u8 q4, d8, d21\n"
"vsubl.u8 q5, d10, d21\n"
"vsubl.u8 q6, d12, d21\n"
"vsubl.u8 q7, d14, d21\n"
"vmlal.s16 %q[vo0], d0, d16\n"
"vmlal.s16 %q[vo1], d4, d16\n"
"vmlal.s16 %q[vo2], d8, d16\n"
"vmlal.s16 %q[vo3], d12, d16\n"
"vld1.8 d0, [r1]!\n"
"vld1.8 d4, [r2]!\n"
"vld1.8 d8, [r3]!\n"
"vld1.8 d12, [r4]!\n"
"vld1.8 d16, [r0]!\n"
"vmlal.s16 %q[vo0], d2, d18\n"
"vmlal.s16 %q[vo1], d6, d18\n"
"vmlal.s16 %q[vo2], d10, d18\n"
"vmlal.s16 %q[vo3], d14, d18\n"
"vld1.8 d2, [r1]!\n"
"vld1.8 d6, [r2]!\n"
"vld1.8 d10, [r3]!\n"
"vld1.8 d14, [r4]!\n"
"vld1.8 d18, [r0]!\n"
"vmlal.s16 %q[vo0], d1, d17\n"
"vmlal.s16 %q[vo1], d5, d17\n"
"vmlal.s16 %q[vo2], d9, d17\n"
"vmlal.s16 %q[vo3], d13, d17\n"
"subs %[r_w_block_count], #1\n"
"vmlal.s16 %q[vo0], d3, d19\n"
"vmlal.s16 %q[vo1], d7, d19\n"
"vmlal.s16 %q[vo2], d11, d19\n"
"vmlal.s16 %q[vo3], d15, d19\n"
"bne 2b\n"
// epilogue
"1:\n"
"vsubl.u8 q8, d16, d20\n"
"vsubl.u8 q9, d18, d20\n"
"vsubl.u8 q0, d0, d21\n"
"vsubl.u8 q1, d2, d21\n"
"vsubl.u8 q2, d4, d21\n"
"vsubl.u8 q3, d6, d21\n"
"vsubl.u8 q4, d8, d21\n"
"vsubl.u8 q5, d10, d21\n"
"vsubl.u8 q6, d12, d21\n"
"vsubl.u8 q7, d14, d21\n"
"vmlal.s16 %q[vo0], d0, d16\n"
"vmlal.s16 %q[vo1], d4, d16\n"
"vmlal.s16 %q[vo2], d8, d16\n"
"vmlal.s16 %q[vo3], d12, d16\n"
"vmlal.s16 %q[vo0], d1, d17\n"
"vmlal.s16 %q[vo1], d5, d17\n"
"vmlal.s16 %q[vo2], d9, d17\n"
"vmlal.s16 %q[vo3], d13, d17\n"
"vmlal.s16 %q[vo0], d2, d18\n"
"vmlal.s16 %q[vo1], d6, d18\n"
"vmlal.s16 %q[vo2], d10, d18\n"
"vmlal.s16 %q[vo3], d14, d18\n"
"vmlal.s16 %q[vo0], d3, d19\n"
"vmlal.s16 %q[vo1], d7, d19\n"
"vmlal.s16 %q[vo2], d11, d19\n"
"vmlal.s16 %q[vo3], d15, d19\n"
"0:\n"
: // outputs
[vo0] "+w"(vo0),
[vo1] "+w"(vo1),
[vo2] "+w"(vo2),
[vo3] "+w"(vo3),
[r_w_block_count] "+r"(r_w_block_count)
: // inputs
[lhs_ptr] "r"(lhs_ptr), [rhs_ptr] "r"(rhs_ptr),
[lhs_width] "r"(lhs_width),
[lhs_zero_point] "r"(lhs_zero_point),
[rhs_zero_point] "r"(rhs_zero_point)
: // clobbers
"cc", "memory", "r0", "r1", "r2", "r3", "r4",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
"d21");
lhs_ptr += w_block_count * w_block_size;
rhs_ptr += w_block_count * w_block_size;
#else
for (index_t w_block_index = 0; w_block_index < w_block_count;
++w_block_index) {
uint8x8_t vr0 = vld1_u8(rhs_ptr);
int16x8_t
vxr0 = vreinterpretq_s16_u16(vsubl_u8(vr0, vrhs_zero_point));
uint8x8_t vr0n = vld1_u8(rhs_ptr + 8);
int16x8_t
vxr0n = vreinterpretq_s16_u16(vsubl_u8(vr0n, vrhs_zero_point));
uint8x8_t vl0 = vld1_u8(lhs_ptr);
int16x8_t
vxl0 = vreinterpretq_s16_u16(vsubl_u8(vl0, vlhs_zero_point));
uint8x8_t vl0n = vld1_u8(lhs_ptr + 8);
int16x8_t
vxl0n = vreinterpretq_s16_u16(vsubl_u8(vl0n, vlhs_zero_point));
vo0 = vmlal_s16(vo0, vget_low_s16(vxl0), vget_low_s16(vxr0));
vo0 = vmlal_high_s16(vo0, vxl0, vxr0);
vo0 = vmlal_s16(vo0, vget_low_s16(vxl0n), vget_low_s16(vxr0n));
vo0 = vmlal_high_s16(vo0, vxl0n, vxr0n);
const uint8_t *lhs_ptr1 = lhs_ptr + lhs_width;
uint8x8_t vl1 = vld1_u8(lhs_ptr1);
int16x8_t
vxl1 = vreinterpretq_s16_u16(vsubl_u8(vl1, vlhs_zero_point));
uint8x8_t vl1n = vld1_u8(lhs_ptr1 + 8);
int16x8_t
vxl1n = vreinterpretq_s16_u16(vsubl_u8(vl1n, vlhs_zero_point));
vo1 = vmlal_s16(vo1, vget_low_s16(vxl1), vget_low_s16(vxr0));
vo1 = vmlal_high_s16(vo1, vxl1, vxr0);
vo1 = vmlal_s16(vo1, vget_low_s16(vxl1n), vget_low_s16(vxr0n));
vo1 = vmlal_high_s16(vo1, vxl1n, vxr0n);
const uint8_t *lhs_ptr2 = lhs_ptr1 + lhs_width;
uint8x8_t vl2 = vld1_u8(lhs_ptr2);
int16x8_t
vxl2 = vreinterpretq_s16_u16(vsubl_u8(vl2, vlhs_zero_point));
uint8x8_t vl2n = vld1_u8(lhs_ptr2 + 8);
int16x8_t
vxl2n = vreinterpretq_s16_u16(vsubl_u8(vl2n, vlhs_zero_point));
vo2 = vmlal_s16(vo2, vget_low_s16(vxl2), vget_low_s16(vxr0));
vo2 = vmlal_high_s16(vo2, vxl2, vxr0);
vo2 = vmlal_s16(vo2, vget_low_s16(vxl2n), vget_low_s16(vxr0n));
vo2 = vmlal_high_s16(vo2, vxl2n, vxr0n);
const uint8_t *lhs_ptr3 = lhs_ptr2 + lhs_width;
uint8x8_t vl3 = vld1_u8(lhs_ptr3);
int16x8_t
vxl3 = vreinterpretq_s16_u16(vsubl_u8(vl3, vlhs_zero_point));
uint8x8_t vl3n = vld1_u8(lhs_ptr3 + 8);
int16x8_t
vxl3n = vreinterpretq_s16_u16(vsubl_u8(vl3n, vlhs_zero_point));
vo3 = vmlal_s16(vo3, vget_low_s16(vxl3), vget_low_s16(vxr0));
vo3 = vmlal_high_s16(vo3, vxl3, vxr0);
vo3 = vmlal_s16(vo3, vget_low_s16(vxl3n), vget_low_s16(vxr0n));
vo3 = vmlal_high_s16(vo3, vxl3n, vxr0n);
lhs_ptr += 16;
rhs_ptr += 16;
}
#endif // __aarch64__
int32x4_t vo = {vaddvq_s32(vo0),
vaddvq_s32(vo1),
vaddvq_s32(vo2),
vaddvq_s32(vo3)};
for (index_t w = 0; w < w_remain; ++w) {
vo[0] +=
(lhs_ptr[0] - lhs_zero_point) * (rhs_ptr[0] - rhs_zero_point);
vo[1] += (lhs_ptr[lhs_width] - lhs_zero_point)
* (rhs_ptr[0] - rhs_zero_point);
vo[2] += (lhs_ptr[lhs_width * 2] - lhs_zero_point)
* (rhs_ptr[0] - rhs_zero_point);
vo[3] += (lhs_ptr[lhs_width * 3] - lhs_zero_point)
* (rhs_ptr[0] - rhs_zero_point);
++lhs_ptr;
++rhs_ptr;
}
int32x4_t vbias = vdupq_n_s32(0);
if (bias) {
vbias = vld1q_s32(bias_data + h_offset);
}
vo = vaddq_s32(vo, vbias);
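// Requantize the int32 accumulators to uint8: output_multiplier and
// output_shift encode lhs_scale * rhs_scale / output_scale as a fixed-point
// multiplier plus a right shift; vqrdmulhq applies the multiplier, and the
// fixup + vrshlq performs a rounding arithmetic shift by output_shift.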
if (is_output_type_uint8) {
int32x4_t vo_mul = vqrdmulhq_s32(vo, voutput_multiplier);
int32x4_t
fixup = vshrq_n_s32(vandq_s32(vo_mul, voutput_shift_left), 31);
int32x4_t fixed_up_x = vqaddq_s32(vo_mul, fixup);
int32x4_t
vo_rescale_int32 = vrshlq_s32(fixed_up_x, voutput_shift_left);
int16x4_t vo_rescale_int16 = vqmovn_s32(vo_rescale_int32);
uint8x8_t vo_rescale_uint8 =
vqmovun_s16(vcombine_s16(vo_rescale_int16, vo_rescale_int16));
ret_ptr[0] = vo_rescale_uint8[0];
ret_ptr[1] = vo_rescale_uint8[1];
ret_ptr[2] = vo_rescale_uint8[2];
ret_ptr[3] = vo_rescale_uint8[3];
} else {
ret_ptr[0] = vo[0];
ret_ptr[1] = vo[1];
ret_ptr[2] = vo[2];
ret_ptr[3] = vo[3];
}
} else { // h_block_len < 4
// TODO(liyin): handle the 1, 2 and 3 row cases separately to accelerate this path
const uint8_t *tmp_lhs_ptr = lhs_ptr;
const uint8_t *tmp_rhs_ptr = rhs_ptr;
for (index_t h = 0; h < h_block_len; ++h) {
lhs_ptr = tmp_lhs_ptr + h * lhs_width;
rhs_ptr = tmp_rhs_ptr;
int32x4_t vo0 = vdupq_n_s32(0);
for (index_t w = 0; w < w_block_count; ++w) {
uint8x8_t vr0 = vld1_u8(rhs_ptr);
int16x8_t
vxr0 = vreinterpretq_s16_u16(vsubl_u8(vr0, vrhs_zero_point));
uint8x8_t vr0n = vld1_u8(rhs_ptr + 8);
int16x8_t
vxr0n = vreinterpretq_s16_u16(vsubl_u8(vr0n, vrhs_zero_point));
uint8x8_t vl0 = vld1_u8(lhs_ptr);
int16x8_t
vxl0 = vreinterpretq_s16_u16(vsubl_u8(vl0, vlhs_zero_point));
uint8x8_t vl0n = vld1_u8(lhs_ptr + 8);
int16x8_t
vxl0n = vreinterpretq_s16_u16(vsubl_u8(vl0n, vlhs_zero_point));
vo0 = vmlal_s16(vo0, vget_low_s16(vxl0), vget_low_s16(vxr0));
vo0 = vmlal_high_s16(vo0, vxl0, vxr0);
vo0 = vmlal_s16(vo0, vget_low_s16(vxl0n), vget_low_s16(vxr0n));
vo0 = vmlal_high_s16(vo0, vxl0n, vxr0n);
lhs_ptr += 16;
rhs_ptr += 16;
} // w
int32_t s0 = vaddvq_s32(vo0) + (bias ? bias_data[h_offset + h] : 0);
for (index_t w = 0; w < w_remain; ++w) {
s0 += (lhs_ptr[0] - lhs_zero_point) * (rhs_ptr[0] - rhs_zero_point);
++lhs_ptr;
++rhs_ptr;
} // w
if (is_output_type_uint8) {
ret_ptr[h] =
Saturate<uint8_t>(std::roundf(s0 * output_multiplier_float));
} else {
ret_ptr[h] = s0;
}
} // h
} // if
} // h_block_idx
} // b
return MaceStatus::MACE_SUCCESS;
}
template
class Gemv<uint8_t>;
template
class Gemv<int32_t>;
} // namespace q8
} // namespace arm
} // namespace ops
} // namespace mace
#if defined(vmlal_high_s16)
#undef vmlal_high_s16
#undef vaddvq_s32
#endif
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_Q8_GEMV_H_
#define MACE_OPS_ARM_Q8_GEMV_H_
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
namespace mace {
namespace ops {
namespace arm {
namespace q8 {
template<typename OUTPUT_TYPE>
class Gemv {
public:
Gemv() {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output);
};
} // namespace q8
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_Q8_GEMV_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/q8/gemv.h"
#include "mace/ops/ref/gemv.h"
#include "mace/ops/testing/test_utils.h"
namespace mace {
namespace ops {
namespace test {
void TestGemvInt32(const index_t batch,
const index_t height,
const index_t width,
const bool lhs_batched) {
Tensor lhs(GetCPUAllocator(), DataType::DT_UINT8);
Tensor rhs(GetCPUAllocator(), DataType::DT_UINT8);
Tensor bias(GetCPUAllocator(), DataType::DT_INT32);
Tensor output(GetCPUAllocator(), DataType::DT_INT32);
lhs.SetScale(0.5);
rhs.SetScale(0.3);
lhs.SetZeroPoint(0);
rhs.SetZeroPoint(0);
lhs.Resize({lhs_batched ? batch : 1, height, width});
rhs.Resize({batch, width});
bias.Resize({height});
output.Resize({batch, height});
{
Tensor::MappingGuard lhs_guard(&lhs);
Tensor::MappingGuard rhs_guard(&rhs);
Tensor::MappingGuard bias_guard(&bias);
uint8_t *lhs_data = lhs.mutable_data<uint8_t>();
uint8_t *rhs_data = rhs.mutable_data<uint8_t>();
int32_t *bias_data = bias.mutable_data<int32_t>();
GenerateRandomIntTypeData<uint8_t>(lhs.shape(), lhs_data);
GenerateRandomIntTypeData<uint8_t>(rhs.shape(), rhs_data);
GenerateRandomIntTypeData<int32_t>(bias.shape(), bias_data);
}
mace::ops::arm::q8::Gemv<int32_t> gemv;
gemv.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32);
expected_output.Resize({batch, height});
mace::ops::ref::Gemv<int32_t> gemv_ref;
gemv_ref.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output);
const int32_t *output_data = output.data<int32_t>();
const int32_t *expected_data = expected_output.data<int32_t>();
for (index_t i = 0; i < output.size(); ++i) {
EXPECT_EQ(expected_data[i], output_data[i]);
}
}
void TestGemvUint8(const index_t batch,
const index_t height,
const index_t width,
const bool lhs_batched) {
Tensor lhs(GetCPUAllocator(), DataType::DT_UINT8);
Tensor rhs(GetCPUAllocator(), DataType::DT_UINT8);
Tensor bias(GetCPUAllocator(), DataType::DT_INT32);
Tensor output(GetCPUAllocator(), DataType::DT_UINT8);
lhs.SetScale(0.5);
rhs.SetScale(0.3);
output.SetScale(0.6);
lhs.SetZeroPoint(23);
rhs.SetZeroPoint(45);
output.SetZeroPoint(57);
lhs.Resize({batch, height, width});
rhs.Resize({batch, width});
bias.Resize({height});
output.Resize({batch, height});
{
Tensor::MappingGuard lhs_guard(&lhs);
Tensor::MappingGuard rhs_guard(&rhs);
Tensor::MappingGuard bias_guard(&bias);
uint8_t *lhs_data = lhs.mutable_data<uint8_t>();
uint8_t *rhs_data = rhs.mutable_data<uint8_t>();
int32_t *bias_data = bias.mutable_data<int32_t>();
GenerateRandomIntTypeData<uint8_t>(lhs.shape(), lhs_data);
GenerateRandomIntTypeData<uint8_t>(rhs.shape(), rhs_data);
GenerateRandomIntTypeData<int32_t>(bias.shape(), bias_data);
}
mace::ops::arm::q8::Gemv<uint8_t> gemv;
gemv.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32);
expected_output.SetScale(0.6);
expected_output.SetZeroPoint(57);
expected_output.Resize({batch, height});
mace::ops::ref::Gemv<uint8_t> gemv_ref;
gemv_ref.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output);
const uint8_t *output_data = output.data<uint8_t>();
const uint8_t *expected_data = expected_output.data<uint8_t>();
for (index_t i = 0; i < output.size(); ++i) {
EXPECT_EQ(expected_data[i], output_data[i]);
}
}
TEST(ArmGemv, TestGemvInt32) {
TestGemvInt32(1, 16, 4, true);
TestGemvInt32(1, 16, 256, true);
TestGemvInt32(2, 16, 256, true);
TestGemvInt32(3, 63, 257, true);
TestGemvInt32(1, 16, 4, false);
TestGemvInt32(1, 16, 256, false);
TestGemvInt32(2, 16, 256, false);
TestGemvInt32(3, 63, 257, false);
}
TEST(ArmGemv, TestGemvUint8) {
TestGemvUint8(1, 16, 4, true);
TestGemvUint8(1, 16, 256, true);
TestGemvUint8(2, 16, 256, true);
TestGemvUint8(3, 63, 257, true);
TestGemvUint8(1, 16, 4, false);
TestGemvUint8(1, 16, 256, false);
TestGemvUint8(2, 16, 256, false);
TestGemvUint8(3, 63, 257, false);
}
} // namespace test
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_COMMON_ACTIVATION_TYPE_H_
#define MACE_OPS_COMMON_ACTIVATION_TYPE_H_
namespace mace {
namespace ops {
enum ActivationType {
NOOP = 0,
RELU = 1,
RELUX = 2,
PRELU = 3,
TANH = 4,
SIGMOID = 5,
LEAKYRELU = 6,
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_COMMON_ACTIVATION_TYPE_H_
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include <algorithm>
#include <cmath>
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_CONV_POOL_2D_UTIL_H_
#define MACE_OPS_CONV_POOL_2D_UTIL_H_
#ifndef MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_
#define MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_
#include "mace/core/tensor.h"
......@@ -116,4 +116,4 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input,
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONV_POOL_2D_UTIL_H_
#endif // MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/common/transpose.h"
#include <algorithm>
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/utils/logging.h"
namespace mace {
namespace ops {
namespace {
void TransposeNHWCToNCHWC3(const float *input,
float *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
#pragma omp parallel for
for (index_t h = 0; h < height; ++h) {
index_t in_offset = h * width * 3;
index_t out_offset = h * width;
#if defined(MACE_ENABLE_NEON)
index_t w;
for (w = 0; w + 3 < width; w += 4) {
float32x4x3_t vi = vld3q_f32(input + in_offset);
vst1q_f32(output + out_offset, vi.val[0]);
vst1q_f32(output + out_offset + image_size, vi.val[1]);
vst1q_f32(output + out_offset + image_size * 2, vi.val[2]);
in_offset += 12;
out_offset += 4;
}
for (; w < width; ++w) {
for (index_t c = 0; c < 3; ++c) {
output[h * width + image_size * c + w] =
input[h * width * 3 + w * 3 + c];
}
}
#else
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < 3; ++c) {
output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c];
}
}
#endif
}
}
void TransposeNCHWToNHWCC2(const float *input,
float *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
#pragma omp parallel for
for (index_t h = 0; h < height; ++h) {
index_t in_offset = h * width;
index_t out_offset = h * width * 2;
#if defined(MACE_ENABLE_NEON)
index_t w;
for (w = 0; w + 3 < width; w += 4) {
float32x4_t vi0 = vld1q_f32(input + in_offset);
float32x4_t vi1 = vld1q_f32(input + in_offset + image_size);
float32x4x2_t vi = {vi0, vi1};
vst2q_f32(output + out_offset, vi);
in_offset += 4;
out_offset += 8;
}
for (; w < width; ++w) {
for (index_t c = 0; c < 2; ++c) {
output[h * width * 2 + w * 2 + c] =
input[h * width + image_size * c + w];
}
}
#else
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < 2; ++c) {
output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w];
}
}
#endif
}
}
} // namespace
MaceStatus Transpose(const float *input,
const std::vector<int64_t> &input_shape,
const std::vector<int> &dst_dims,
float *output) {
MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
(input_shape.size() == 4 && dst_dims.size() == 4),
"Only support 2D or 4D transpose");
std::vector<index_t> output_shape;
for (size_t i = 0; i < dst_dims.size(); ++i) {
output_shape.push_back(input_shape[dst_dims[i]]);
}
if (input_shape.size() == 2) {
MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need to transform");
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t stride_i = height;
index_t stride_j = width;
index_t tile_size = height > 512 || width > 512 ? 64 : 32;
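// Cache-blocked transpose: copy tile_size x tile_size blocks so both the
// row-major reads and the column-major writes stay cache-friendly.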
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < height; i += tile_size) {
for (index_t j = 0; j < width; j += tile_size) {
index_t end_i = std::min(i + tile_size, height);
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
output[tile_j * stride_i + tile_i] =
input[tile_i * stride_j + tile_j];
}
}
}
}
} else if (input_shape.size() == 4) {
std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
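// Specialized paths: NHWC -> NCHW with 3 channels and NCHW -> NHWC with 2
// channels use NEON de-interleaving loads / interleaving stores; {0, 2, 1, 3}
// swaps H and W by copying whole per-pixel channel vectors; anything else
// falls through to the generic index-permutation loop.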
if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) {
for (index_t b = 0; b < input_shape[0]; ++b) {
TransposeNHWCToNCHWC3(input + b * batch_size,
output + b * batch_size,
input_shape[1],
input_shape[2]);
}
} else if (dst_dims == transpose_order_from_NCHW_to_NHWC
&& input_shape[1] == 2) {
for (index_t b = 0; b < input_shape[0]; ++b) {
TransposeNCHWToNHWCC2(input + b * batch_size,
output + b * batch_size,
input_shape[2],
input_shape[3]);
}
} else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channel = input_shape[3];
index_t channel_raw_size = channel * sizeof(float);
index_t stride_i = height;
index_t stride_j = width;
index_t tile_size = std::max(static_cast<index_t>(1),
static_cast<index_t>(std::sqrt(
8 * 1024 / channel)));
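// The tile edge is chosen so one tile (tile_size^2 * channel floats, roughly
// 32 KB) stays cache-resident.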
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < height; i += tile_size) {
for (index_t j = 0; j < width; j += tile_size) {
index_t end_i = std::min(i + tile_size, height);
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
memcpy(output + (tile_j * stride_i + tile_i) * channel,
input + (tile_i * stride_j + tile_j) * channel,
channel_raw_size);
}
}
}
}
} else {
std::vector<index_t>
in_stride{input_shape[1] * input_shape[2] * input_shape[3],
input_shape[2] * input_shape[3], input_shape[3], 1};
std::vector<index_t>
out_stride{output_shape[1] * output_shape[2] * output_shape[3],
output_shape[2] * output_shape[3], output_shape[3], 1};
std::vector<index_t> idim(4, 0);
std::vector<index_t> odim(4, 0);
for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
idim[dst_dims[0]] = odim[0];
idim[dst_dims[1]] = odim[1];
idim[dst_dims[2]] = odim[2];
idim[dst_dims[3]] = odim[3];
output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
+ odim[2] * out_stride[2] + odim[3]] =
input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+ idim[2] * in_stride[2] + idim[3]];
}
}
}
}
}
} else {
MACE_NOT_IMPLEMENTED;
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace ops
} // namespace mace
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_TRANSPOSE_H_
#define MACE_OPS_TRANSPOSE_H_
#ifndef MACE_OPS_COMMON_TRANSPOSE_H_
#define MACE_OPS_COMMON_TRANSPOSE_H_
#include <vector>
......@@ -30,4 +30,4 @@ MaceStatus Transpose(const float *input,
} // namespace ops
} // namespace mace
#endif // MACE_OPS_TRANSPOSE_H_
#endif // MACE_OPS_COMMON_TRANSPOSE_H_
......@@ -30,7 +30,7 @@
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/ops/arm/conv_winograd.h"
#include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_QUANTIZE
......
......@@ -16,7 +16,7 @@
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -15,7 +15,7 @@
#include <fstream>
#include <vector>
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -18,7 +18,7 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......
......@@ -22,7 +22,7 @@
#include "mace/core/operator.h"
#include "mace/core/types.h"
#include "mace/ops/activation.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......
......@@ -16,7 +16,7 @@
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -16,7 +16,7 @@
#include <vector>
#include "mace/ops/deconv_2d.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -16,7 +16,7 @@
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -20,13 +20,19 @@
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/gemm.h"
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp32/gemv.h"
#ifdef MACE_ENABLE_QUANTIZE
#include "mace/ops/gemmlowp_util.h"
#include "mace/ops/quantization_util.h"
#include "mace/ops/arm/q8/gemv.h"
#endif // MACE_ENABLE_QUANTIZE
#else
#include "mace/ops/ref/gemv.h"
#endif // MACE_ENABLE_NEON
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/fully_connected.h"
......@@ -41,10 +47,10 @@ class FullyConnectedOpBase : public Operation {
: Operation(context),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)) {}
"leakyrelu_coefficient", 0.0f)) {}
protected:
const ActivationType activation_;
const float relux_max_limit_;
......@@ -54,10 +60,10 @@ class FullyConnectedOpBase : public Operation {
MACE_OP_OUTPUT_TAGS(OUTPUT);
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class FullyConnectedOp;
template <>
template<>
class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
......@@ -84,38 +90,37 @@ class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase {
}
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t N = output->dim(0);
const index_t batch = output->dim(0);
const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3);
const index_t output_size = weight->dim(0);
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_weight(weight);
gemv_.Compute(context,
weight,
input,
bias,
batch,
output_size,
input_size,
false,
output);
Tensor::MappingGuard guard_output(output);
const float *input_ptr = input->data<float>();
const float *weight_ptr = weight->data<float>();
float *output_ptr = output->mutable_data<float>();
Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr);
if (bias) {
Tensor::MappingGuard guard_bias(bias);
const float *bias_ptr = bias == nullptr ? nullptr : bias->data<float>();
for (int i = 0; i < N; ++i) {
for (int j = 0; j < output_size; ++j) {
output_ptr[j + i * output_size] += bias_ptr[j];
}
}
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_, leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
private:
#ifdef MACE_ENABLE_NEON
arm::fp32::Gemv gemv_;
#else
ref::Gemv<float> gemv_;
#endif // MACE_ENABLE_NEON
};
#ifdef MACE_ENABLE_QUANTIZE
template <>
template<>
class FullyConnectedOp<DeviceType::CPU, uint8_t>
: public FullyConnectedOpBase {
public:
......@@ -145,44 +150,28 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const int N = static_cast<int>(output->dim(0));
const int batch = static_cast<int>(output->dim(0));
const int input_size =
static_cast<int>(weight->dim(1) * weight->dim(2) * weight->dim(3));
const int output_size = static_cast<int>(weight->dim(0));
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_weight(weight);
Tensor::MappingGuard guard_output(output);
auto input_ptr = input->data<uint8_t>();
auto weight_ptr = weight->data<uint8_t>();
auto output_ptr = output->mutable_data<uint8_t>();
auto bias_ptr = GetBiasData(bias,
input->scale(),
weight->scale(),
output_size,
&bias_);
gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
weight_matrix(weight_ptr, output_size, input_size);
gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor>
input_matrix(input_ptr, input_size, N);
gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
output_matrix(output_ptr, output_size, N);
const auto &output_pipeline = GemmlowpOutputPipeline::Make(
bias_ptr, output_size, weight->scale(), input->scale(), output->scale(),
output->zero_point());
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
gemm_context, weight_matrix, input_matrix, &output_matrix,
-weight->zero_point(), -input->zero_point(), output_pipeline);
gemv_.Compute(context,
weight,
input,
bias,
batch,
output_size,
input_size,
false,
output);
return MaceStatus::MACE_SUCCESS;
}
private:
std::vector<int32_t> bias_;
#ifdef MACE_ENABLE_NEON
::mace::ops::arm::q8::Gemv<uint8_t> gemv_;
#else
ref::Gemv<uint8_t> gemv_;
#endif // MACE_ENABLE_NEON
};
#endif // MACE_ENABLE_QUANTIZE
......
This diff has been collapsed.
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_GEMM_H_
#define MACE_OPS_GEMM_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
// Gemm function does fast matrix-matrix multiplications with batch.
// Gemv function does fast matrix-vector multiplications with batch.
namespace mace {
namespace ops {
// Gemm calculates A[batch, height, K] dot B[batch, K, width] within each batch,
// and output to C[batch, height, width].
// height, K, width correspond to matrix dimension size after transpose (if any)
void Gemm(const float *A,
const float *B,
const index_t batch,
const index_t height,
const index_t K,
const index_t width,
float *C,
const bool transpose_a = false,
const bool transpose_b = false);
void GemmRef(const float *A,
const float *B,
const index_t batch,
const index_t height,
const index_t K,
const index_t width,
float *C,
const bool transpose_a = false,
const bool transpose_b = false);
// Gemv calculates M[height, width] dot V[batch, width] within each batch of V,
// and outputs to out[batch, height].
void Gemv(const float *m_ptr,
const float *v_ptr,
const index_t batch,
const index_t width,
const index_t height,
float *out_ptr);
void GemvRef(const float *m_ptr,
const float *v_ptr,
const index_t batch,
const index_t width,
const index_t height,
float *out_ptr);
void Transpose(const float *src,
index_t height,
index_t width,
index_t stride_w,
float *dst);
} // namespace ops
} // namespace mace
#endif // MACE_OPS_GEMM_H_
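The legacy interface declared above (which this commit begins to retire in favor of the kernels under mace/ops/arm and mace/ops/ref) is driven directly on raw float buffers. The following is a minimal sketch of a call, with illustrative sizes only; it assumes nothing beyond the declarations shown in this header.
#include <vector>
#include "mace/ops/gemm.h"
void LegacyGemmExample() {
  const mace::index_t batch = 1, height = 2, K = 3, width = 4;
  // A: [batch, height, K], B: [batch, K, width], C: [batch, height, width].
  std::vector<float> A(batch * height * K, 1.0f);
  std::vector<float> B(batch * K * width, 1.0f);
  std::vector<float> C(batch * height * width, 0.0f);
  // No transpose on either operand (the trailing bools default to false).
  mace::ops::Gemm(A.data(), B.data(), batch, height, K, width, C.data());
  // With all-ones inputs every C element is the dot product of K ones, i.e. 3.0f.
}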
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <random>
#include <vector>
#include "mace/core/types.h"
#include "mace/ops/gemm.h"
#include "mace/ops/sgemm.h"
namespace mace {
namespace {
void GemmTest(index_t batch,
index_t N,
index_t K,
index_t M,
bool transpose_a,
bool transpose_b) {
std::unique_ptr<float[]> A(new float[batch * N * K]);
std::unique_ptr<float[]> B(new float[batch * K * M]);
std::unique_ptr<float[]> C(new float[batch * N * M]);
std::unique_ptr<float[]> C_ref(new float[batch * N * M]);
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + batch * N * K,
[&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * K * M,
[&gen, &nd] { return nd(gen); });
ops::Gemm(A.get(), B.get(), batch, N, K, M, C.get(), transpose_a,
transpose_b);
ops::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
transpose_b);
for (int i = 0; i < batch * N * M; ++i) {
EXPECT_NEAR(C_ref[i], C[i], 0.1);
}
}
void GemvTest(index_t batch, index_t N, index_t M) {
std::unique_ptr<float[]> A(new float[N * M]);
std::unique_ptr<float[]> B(new float[batch * M]);
std::unique_ptr<float[]> C(new float[batch * N]);
std::unique_ptr<float[]> C_ref(new float[batch * N]);
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + N * M, [&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * M, [&gen, &nd] { return nd(gen); });
ops::Gemv(A.get(), B.get(), batch, M, N, C.get());
ops::GemvRef(A.get(), B.get(), batch, M, N, C_ref.get());
for (int i = 0; i < batch * N; ++i) {
EXPECT_NEAR(C_ref[i], C[i], 0.1);
}
}
void SGemmTest(index_t batch,
index_t N,
index_t K,
index_t M,
bool transpose_a,
bool transpose_b) {
std::unique_ptr<float[]> A(new float[batch * N * K]);
std::unique_ptr<float[]> B(new float[batch * K * M]);
std::unique_ptr<float[]> C(new float[batch * N * M]);
std::unique_ptr<float[]> C_ref(new float[batch * N * M]);
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(A.get(), A.get() + batch * N * K,
[&gen, &nd] { return nd(gen); });
std::generate(B.get(), B.get() + batch * K * M,
[&gen, &nd] { return nd(gen); });
ops::GemmRef(A.get(), B.get(), batch, N, K, M, C_ref.get(), transpose_a,
transpose_b);
ops::MatrixMap<const float> matrix_a;
ops::MatrixMap<const float> matrix_b;
if (!transpose_a) {
matrix_a =
ops::MatrixMap<const float>(batch,
N,
K,
ops::RowMajor,
A.get());
} else {
matrix_a =
ops::MatrixMap<const float>(batch,
K,
N,
ops::RowMajor,
A.get());
matrix_a = matrix_a.transpose();
}
if (!transpose_b) {
matrix_b =
ops::MatrixMap<const float>(batch,
K,
M,
ops::RowMajor,
B.get());
} else {
matrix_b =
ops::MatrixMap<const float>(batch,
M,
K,
ops::RowMajor,
B.get());
matrix_b = matrix_b.transpose();
}
ops::MatrixMap<float> matrix_c(batch, N, M, ops::RowMajor, C.get());
ops::SGemm sgemm;
sgemm(matrix_a, matrix_b, &matrix_c);
for (int i = 0; i < N * M; ++i) {
EXPECT_NEAR(C_ref[i], C[i], 0.1);
}
}
} // namespace
TEST(GEMMTest, AlignedWithoutBatch) {
GemmTest(1, 1, 64, 128, false, false);
GemmTest(1, 2, 64, 128, false, true);
GemmTest(1, 3, 64, 128, true, false);
GemmTest(1, 4, 64, 128, true, true);
GemmTest(1, 5, 64, 128, false, false);
GemmTest(1, 6, 64, 128, false, true);
GemmTest(1, 7, 64, 128, true, false);
GemmTest(1, 17, 64, 128, true, true);
GemmTest(1, 256, 128, 4096, false, false);
GemmTest(1, 256, 128, 4104, false, false);
}
TEST(GEMMTest, UnalignedWithoutBatch) {
GemmTest(1, 1, 63, 127, false, false);
GemmTest(1, 2, 63, 127, false, true);
GemmTest(1, 3, 63, 127, true, false);
GemmTest(1, 4, 63, 127, true, true);
GemmTest(1, 5, 63, 127, false, false);
GemmTest(1, 6, 63, 127, false, true);
GemmTest(1, 7, 63, 127, true, false);
GemmTest(1, 17, 63, 127, true, true);
}
TEST(GEMMTest, UnalignedWithBatch) {
GemmTest(3, 1, 63, 127, false, false);
GemmTest(3, 2, 63, 127, false, true);
GemmTest(3, 3, 63, 127, true, false);
GemmTest(3, 4, 63, 127, true, true);
GemmTest(3, 5, 63, 127, false, false);
GemmTest(3, 6, 63, 127, false, true);
GemmTest(3, 7, 63, 127, true, false);
GemmTest(3, 17, 63, 127, true, true);
}
TEST(GEMMTest, gemv) {
GemvTest(1, 17, 63);
GemvTest(3, 17, 63);
}
namespace {
void TestSGemmTranspose(index_t batch, index_t N, index_t K, index_t M) {
SGemmTest(batch, N, K, M, false, false);
SGemmTest(batch, N, K, M, true, false);
SGemmTest(batch, N, K, M, false, true);
SGemmTest(batch, N, K, M, true, true);
}
}  // namespace
TEST(SGEMMTest, UnalignedWithoutBatch) {
std::vector<index_t> tests{1, 5, 14, 31, 47};
for (index_t N : tests) {
for (index_t K : tests) {
for (index_t M : tests) {
TestSGemmTranspose(1, N, K, M);
TestSGemmTranspose(16, N, K, M);
}
}
}
}
} // namespace mace
......@@ -14,7 +14,7 @@
#include "mace/core/operator.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "mace/ops/ops_test_util.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......
......@@ -21,13 +21,23 @@
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/ops/gemm.h"
#include "mace/ops/sgemm.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp32/gemv.h"
#ifdef MACE_ENABLE_QUANTIZE
#include "mace/ops/arm/q8/gemv.h"
#endif // MACE_ENABLE_QUANTIZE
#else
#include "mace/ops/ref/gemv.h"
#endif // MACE_ENABLE_NEON
#ifdef MACE_ENABLE_QUANTIZE
#include "mace/ops/gemmlowp_util.h"
#include "mace/ops/arm/fixpoint_gemm.h"
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
......@@ -106,7 +116,6 @@ class MatMulOp<CPU, float> : public MatMulOpBase {
}
batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
......@@ -141,11 +150,17 @@ class MatMulOp<CPU, float> : public MatMulOpBase {
B->is_weight(),
c_ptr_base,
context->device()->scratch_buffer());
return MaceStatus::MACE_SUCCESS;
}
private:
SGemm sgemm_;
#ifdef MACE_ENABLE_NEON
arm::fp32::Gemv gemv_;
#else
ref::Gemv<float> gemv_;
#endif // MACE_ENABLE_NEON
};
#ifdef MACE_ENABLE_QUANTIZE
......@@ -163,38 +178,54 @@ class MatMulFixpointImpl<AOrder, BOrder, uint8_t> {
const index_t K,
const index_t width,
Tensor *C) {
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
Tensor::MappingGuard guardc(C);
auto a_ptr_base = A->data<uint8_t>();
auto b_ptr_base = B->data<uint8_t>();
auto c_ptr_base = C->mutable_data<uint8_t>();
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
index_t a_size = height * K;
index_t b_size = K * width;
index_t c_size = height * width;
const auto &output_pipeline = GemmlowpOutputPipeline::MakeNoBias(
A->scale(), B->scale(), C->scale(), C->zero_point());
for (index_t i = 0; i < batch; ++i) {
gemmlowp::MatrixMap<const uint8_t, AOrder>
a_matrix(a_ptr_base + i * a_size, height, K);
gemmlowp::MatrixMap<const uint8_t, BOrder>
b_matrix(b_ptr_base + i * b_size, K, width);
gemmlowp::MatrixMap <uint8_t, gemmlowp::MapOrder::RowMajor>
c_matrix(c_ptr_base + i * c_size, height, width);
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
-B->zero_point(), output_pipeline);
#if defined(MACE_ENABLE_NEON)
if (width == 1 && AOrder == gemmlowp::MapOrder::RowMajor) {
gemv_kernel_.Compute(context, A, B, nullptr, batch, height, K, true, C);
} else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) {
gemv_kernel_.Compute(context, B, A, nullptr, batch, width, K, true, C);
} else {
#endif // MACE_ENABLE_NEON
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
Tensor::MappingGuard guardc(C);
auto a_ptr_base = A->data<uint8_t>();
auto b_ptr_base = B->data<uint8_t>();
auto c_ptr_base = C->mutable_data<uint8_t>();
auto gemm_context =
context->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
index_t a_size = height * K;
index_t b_size = K * width;
index_t c_size = height * width;
const auto &output_pipeline = GemmlowpOutputPipeline::MakeNoBias(
A->scale(), B->scale(), C->scale(), C->zero_point());
for (index_t i = 0; i < batch; ++i) {
gemmlowp::MatrixMap<const uint8_t, AOrder>
a_matrix(a_ptr_base + i * a_size, height, K);
gemmlowp::MatrixMap<const uint8_t, BOrder>
b_matrix(b_ptr_base + i * b_size, K, width);
gemmlowp::MatrixMap <uint8_t, gemmlowp::MapOrder::RowMajor>
c_matrix(c_ptr_base + i * c_size, height, width);
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
-B->zero_point(), output_pipeline);
}
}
#if defined(MACE_ENABLE_NEON)
}
private:
arm::q8::Gemv<uint8_t> gemv_kernel_;
#endif // MACE_ENABLE_NEON
};
template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
......@@ -207,38 +238,24 @@ class MatMulFixpointImpl<AOrder, BOrder, int32_t> {
const index_t K,
const index_t width,
Tensor *C) {
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
Tensor::MappingGuard guardc(C);
auto a_ptr_base = A->data<uint8_t>();
auto b_ptr_base = B->data<uint8_t>();
auto c_ptr_base = C->mutable_data<int32_t>();
C->SetScale(A->scale() * B->scale());
C->SetZeroPoint(0);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
#if defined(MACE_ENABLE_NEON)
if (width == 1 && AOrder == gemmlowp::MapOrder::RowMajor) {
// gemv
for (index_t i = 0; i < batch; ++i) {
FixPointGemv(a_ptr_base + i * height * K,
b_ptr_base + i * K,
A->zero_point(),
B->zero_point(),
height,
K,
c_ptr_base + i * height);
}
gemv_kernel_.Compute(context, A, B, nullptr, batch, height, K, true, C);
} else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) {
// gevm
for (index_t i = 0; i < batch; ++i) {
FixPointGemv(b_ptr_base + i * K * width,
a_ptr_base + i * K,
B->zero_point(),
A->zero_point(),
width,
K,
c_ptr_base + i * width);
}
gemv_kernel_.Compute(context, B, A, nullptr, batch, width, K, true, C);
} else {
#endif // MACE_ENABLE_NEON
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
Tensor::MappingGuard guardc(C);
auto a_ptr_base = A->data<uint8_t>();
auto b_ptr_base = B->data<uint8_t>();
auto c_ptr_base = C->mutable_data<int32_t>();
auto
gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
......@@ -264,9 +281,12 @@ class MatMulFixpointImpl<AOrder, BOrder, int32_t> {
}
}
C->SetScale(A->scale() * B->scale());
C->SetZeroPoint(0);
#if defined(MACE_ENABLE_NEON)
}
private:
arm::q8::Gemv<int32_t> gemv_kernel_;
#endif // MACE_ENABLE_NEON
};
template <>
......
......@@ -21,7 +21,6 @@
#include "public/gemmlowp.h"
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/gemm.h"
#include "mace/ops/sgemm.h"
#include "mace/ops/ops_test_util.h"
......@@ -96,19 +95,6 @@ namespace test {
namespace {
// Matmul with (m, k) x (k, n)
void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
std::vector<float> rhs(k * n);
std::vector<float> result(m * n);
// warm up
Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
mace::testing::StartTiming();
while (iters--) {
Gemm(lhs.data(), rhs.data(), 1, m, k, n, result.data());
}
}
void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
......@@ -234,7 +220,6 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC)
#define MACE_BM_MATMUL(M, K, N) \
MACE_BM_MATMUL_FUNC(M, K, N, Mace, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
......@@ -307,6 +292,7 @@ void MatMulBenchmark(
.Input("A")
.Input("B")
.Output("Output")
.OutputType({DT_INT32})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -408,7 +394,7 @@ void MatMulTransposeBenchmark(
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
MACE_BM_MATMUL_OP(1, 128, 128, 49);
MACE_BM_MATMUL_OP(1, 30000, 256, 1);
MACE_BM_MATMUL_OP(2, 128, 128, 49);
MACE_BM_MATMUL_OP(3, 128, 128, 49);
MACE_BM_MATMUL_OP(4, 128, 128, 49);
......
......@@ -23,7 +23,7 @@ namespace test {
class MatMulOpTest : public OpsTestBase {};
namespace {
template <DeviceType D>
template<DeviceType D>
void Simple(const std::vector<index_t> &A_shape,
const std::vector<float> &A_value,
const std::vector<index_t> &B_shape,
......@@ -55,12 +55,12 @@ TEST_F(MatMulOpTest, SimpleCPU) {
Simple<DeviceType::CPU>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2},
{1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64});
Simple<DeviceType::CPU>(
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
{1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
{1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
......@@ -289,9 +289,6 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
QuantOutputInt32({2}, 253, 300, 1, false, false);
}
// TODO(liyin): test transpose after implementing gpu runtime
// now transpose test is in kernels_test
} // namespace test
} // namespace ops
} // namespace mace
......@@ -14,7 +14,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......
......@@ -14,7 +14,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......
......@@ -23,7 +23,7 @@
#include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/transpose.h"
#include "mace/ops/common/transpose.h"
namespace mace {
namespace ops {
......
......@@ -18,7 +18,7 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
class OpContext;
......
......@@ -17,8 +17,8 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
......
......@@ -18,7 +18,7 @@
#include <string>
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
namespace mace {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......
......@@ -14,7 +14,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......
......@@ -14,7 +14,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/utils/utils.h"
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/opencl/helper.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/utils/utils.h"
namespace mace {
......
......@@ -14,8 +14,8 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/op_context.h"
#include "mace/ops/activation.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/opencl/helper.h"
#include "mace/utils/utils.h"
......
......@@ -18,7 +18,7 @@
#include <vector>
#include "mace/ops/pooling.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
......
......@@ -36,6 +36,7 @@
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
#include "mace/utils/quantize.h"
#include "mace/ops/testing/test_utils.h"
namespace mace {
namespace ops {
......@@ -422,230 +423,6 @@ class OpsTestBase : public ::testing::Test {
}
};
template <typename T>
void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> *res,
bool positive = true) {
MACE_CHECK_NOTNULL(res);
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>());
res->resize(size);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
return half_float::half_cast<half>(positive ? std::abs(nd(gen))
: nd(gen));
});
} else {
std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
});
}
}
template <typename T>
void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::vector<T> *res,
const T a = 0,
const T b = std::numeric_limits<T>::max()) {
MACE_CHECK_NOTNULL(res);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> nd(a, b);
index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>());
res->resize(size);
std::generate(res->begin(), res->end(), [&gen, &nd] { return nd(gen); });
}
template <typename T>
std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
std::vector<T> dest;
dest.reserve(src.size());
for (float f : src) {
dest.push_back(static_cast<T>(f));
}
return std::move(dest);
}
inline bool IsSameSize(const Tensor &x, const Tensor &y) {
if (x.dim_size() != y.dim_size()) return false;
for (int d = 0; d < x.dim_size(); ++d) {
if (x.dim(d) != y.dim(d)) return false;
}
return true;
}
inline std::string ShapeToString(const Tensor &x) {
std::stringstream stream;
for (int i = 0; i < x.dim_size(); i++) {
if (i > 0) stream << ",";
int64_t dim = x.dim(i);
if (dim < 0) {
stream << "?";
} else {
stream << dim;
}
}
stream << "]";
return std::string(stream.str());
}
template <typename T>
struct is_floating_point_type {
static const bool value = std::is_same<T, float>::value ||
std::is_same<T, double>::value ||
std::is_same<T, half>::value;
};
template <typename T>
inline void ExpectEqual(const T &a, const T &b) {
EXPECT_EQ(a, b);
}
template <>
inline void ExpectEqual<float>(const float &a, const float &b) {
EXPECT_FLOAT_EQ(a, b);
}
template <>
inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b);
}
inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]";
}
template <typename EXP_TYPE,
typename RES_TYPE,
bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector;
// Partial specialization for float and double.
template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a[i], b[i]);
}
}
static void Near(const Tensor &x,
const Tensor &y,
const double rel_err,
const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
if (x.dim_size() == 4) {
for (int n = 0; n < x.dim(0); ++n) {
for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) {
const double error = abs_err + rel_err * std::abs(*a);
EXPECT_NEAR(*a, *b, error) << "with index = [" << n << ", " << h
<< ", " << w << ", " << c << "]";
a++;
b++;
}
}
}
}
} else {
for (int i = 0; i < x.size(); ++i) {
const double error = abs_err + rel_err * std::abs(a[i]);
EXPECT_NEAR(a[i], b[i], error) << "a = " << a << " b = " << b
<< " index = " << i;
}
}
}
};
template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, false> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a[i], b[i]);
}
}
static void Near(const Tensor &x,
const Tensor &y,
const double rel_err,
const double abs_err) {
MACE_UNUSED(rel_err);
MACE_UNUSED(abs_err);
Equal(x, y);
}
};
template <typename T>
void ExpectTensorNear(const Tensor &x,
const Tensor &y,
const double rel_err = 1e-5,
const double abs_err = 1e-8) {
Expector<T, T>::Near(x, y, rel_err, abs_err);
}
template <typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x,
const Tensor &y,
const double rel_err = 1e-5,
const double abs_err = 1e-8) {
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, rel_err, abs_err);
}
template <typename T>
void ExpectTensorSimilar(const Tensor &x,
const Tensor &y,
const double abs_err = 1e-5) {
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y);
auto x_data = x.data<T>();
auto y_data = y.data<T>();
double dot_product = 0.0, x_norm = 0.0, y_norm = 0.0;
for (index_t i = 0; i < x.size(); i++) {
dot_product += x_data[i] * y_data[i];
x_norm += x_data[i] * x_data[i];
y_norm += y_data[i] * y_data[i];
}
double similarity = dot_product / (sqrt(x_norm) * sqrt(y_norm));
EXPECT_NEAR(1.0, similarity, abs_err);
}
} // namespace test
} // namespace ops
} // namespace mace
......
......@@ -27,7 +27,7 @@
#include "mace/core/operator.h"
#include "mace/core/tensor.h"
#include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pooling.h"
#include "mace/ops/opencl/buffer/pooling.h"
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/pooling.h"
#include "mace/ops/ops_test_util.h"
......
......@@ -15,7 +15,7 @@
#include <vector>
#include "mace/ops/pooling.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
This diff has been collapsed.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_GEMV_H_
#define MACE_OPS_REF_GEMV_H_
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
namespace mace {
namespace ops {
namespace ref {
template<typename OUTPUT_TYPE>
class Gemv {
public:
Gemv() {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output);
};
template<>
class Gemv<float> {
public:
Gemv() {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output);
};
#if defined(MACE_ENABLE_QUANTIZE)
template<>
class Gemv<uint8_t> {
public:
Gemv() {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output);
};
template<>
class Gemv<int32_t> {
public:
Gemv() {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
Tensor *output);
};
#endif // MACE_ENABLE_QUANTIZE
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_GEMV_H_
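The new code paths in matmul.cc and fully_connected.cc above all funnel into a Gemv::Compute with this signature. As a rough mental model of that contract (a hand-written stand-in, not the actual body of mace/ops/ref/gemv.cc), the float case boils down to a batched matrix-vector product with optional bias, assuming row-major lhs of shape [lhs_height, lhs_width] per batch, rhs of shape [batch, lhs_width], and output of shape [batch, lhs_height]:
#include <cstdint>
// Simplified reference gemv on raw buffers; layout assumptions as stated above.
void RefGemvSketch(const float *lhs, const float *rhs, const float *bias,
                   int64_t batch, int64_t lhs_height, int64_t lhs_width,
                   bool lhs_batched, float *output) {
  for (int64_t b = 0; b < batch; ++b) {
    // When lhs_batched is false the same matrix is reused for every batch,
    // which is how FullyConnected passes its weight tensor.
    const float *lhs_base =
        lhs + (lhs_batched ? b * lhs_height * lhs_width : 0);
    const float *rhs_base = rhs + b * lhs_width;
    for (int64_t h = 0; h < lhs_height; ++h) {
      float sum = bias ? bias[h] : 0.0f;
      for (int64_t w = 0; w < lhs_width; ++w) {
        sum += lhs_base[h * lhs_width + w] * rhs_base[w];
      }
      output[b * lhs_height + h] = sum;
    }
  }
}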
This diff has been collapsed.
This diff has been collapsed.
......@@ -31,6 +31,13 @@ class QuantizeStat(object):
if not enhance or samples <= 1:
res[tensor_name] = (tensor_min, tensor_max)
else:
"""
Enhancement mode:
This policy eliminates outliers that cause long-tail
statistical range. We try to reduce as much range as it could
while retaining more samples. d(range)/d(sample_quantile) is
used to measure this qualitatively.
"""
tensor_mins = np.sort(tensor_ranges[tensor_name][0])
tensor_maxs = np.sort(tensor_ranges[tensor_name][1])[::-1]
cur_min_idx = 0
......
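The enhancement policy described in the docstring above can be pictured with a small standalone sketch (written here in C++ for consistency with the rest of this section; the actual implementation is the Python code above, and the min_gain threshold below is a made-up parameter): sort the per-sample minima ascending and maxima descending, then keep dropping whichever tail sample shrinks the overall range the most, as long as the shrink per discarded sample stays significant relative to the current range.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>
// Illustrative long-tail trimming of per-sample (min, max) statistics.
std::pair<float, float> TrimLongTailRange(std::vector<float> mins,
                                          std::vector<float> maxs,
                                          float min_gain = 0.05f) {
  std::sort(mins.begin(), mins.end());                         // ascending
  std::sort(maxs.begin(), maxs.end(), std::greater<float>());  // descending
  std::size_t lo = 0;
  std::size_t hi = 0;
  while (lo + 1 < mins.size() && hi + 1 < maxs.size()) {
    const float range = maxs[hi] - mins[lo];
    const float gain_min = mins[lo + 1] - mins[lo];  // shrink from dropping one min
    const float gain_max = maxs[hi] - maxs[hi + 1];  // shrink from dropping one max
    const float best = std::max(gain_min, gain_max);
    // Stop once the marginal shrink per discarded sample becomes small,
    // i.e. an approximation of d(range)/d(sample_quantile) flattening out.
    if (best < min_gain * range) break;
    if (gain_min >= gain_max) {
      ++lo;
    } else {
      ++hi;
    }
  }
  return {mins[lo], maxs[hi]};
}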
......@@ -21,7 +21,7 @@
#include <string>
#include <vector>
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
#include "mace/public/mace.h"
......