diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel
index 7217567d9147eaf938c9e424f3ea77cda2b4417b..1d651c1b3274bab30a7adf30f9f63be2c57dca7f 100644
--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -124,6 +124,7 @@ cc_library(
     srcs = glob(
         [
            "arm/fp32/*.cc",
+            "arm/fp16/gemv.h",
         ],
         exclude = [
             "arm/fp32/*_test.cc",
diff --git a/mace/ops/arm/fp16_gemm.h b/mace/ops/arm/fp16/gemv.h
similarity index 97%
rename from mace/ops/arm/fp16_gemm.h
rename to mace/ops/arm/fp16/gemv.h
index 878080684f288692250fff0d8f8bc64f7cea7f1d..8e7e2a3c91438303f6724b740f16739db2ed5ebc 100644
--- a/mace/ops/arm/fp16_gemm.h
+++ b/mace/ops/arm/fp16/gemv.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_ARM_FP16_GEMM_H_
-#define MACE_OPS_ARM_FP16_GEMM_H_
+#ifndef MACE_OPS_ARM_FP16_GEMV_H_
+#define MACE_OPS_ARM_FP16_GEMV_H_
 
 #include "mace/core/types.h"
 
@@ -117,4 +117,4 @@ void FP16Gemv(const float16_t *m_ptr,
 }  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_OPS_ARM_FP16_GEMM_H_
+#endif  // MACE_OPS_ARM_FP16_GEMV_H_
diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc
index 3799eafd05274196665e34f165809cab6c5c72d8..592d25ae724ed8a93191049a31097a4e95c91d2a 100644
--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -45,7 +45,7 @@
 #include "mace/ops/opencl/image/matmul.h"
 #endif  // MACE_ENABLE_OPENCL
 #ifdef MACE_ENABLE_NEON
-#include "mace/ops/arm/fp16_gemm.h"
+#include "mace/ops/arm/fp16/gemv.h"
 #endif
 
 namespace mace {
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 087c824e389434f05fb0577146dabfa5df82fe55..c0d5af05c172cd108286fa3ec1c0f25ee776531a 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -396,15 +396,13 @@ void MatMulTransposeBenchmark(
   }                                                                          \
   MACE_BENCHMARK(MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
 
-#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#ifdef MACE_ENABLE_QUANTIZE
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)                     \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);       \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);   \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
 #else
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)                     \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);       \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
 #endif
 
 MACE_BM_MATMUL_OP(1, 30000, 256, 1);
@@ -427,6 +425,21 @@ MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 49);
 MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 961);
 MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 3969);
 
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#define MACE_BM_MATMUL_TRANPOSE_FP16(N, H, C, W) \
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);
+
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 30000);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 256);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 2048);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 256);
+
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 30000);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 512);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 2048);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 512);
+#endif  // MACE_ENABLE_NEON
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc
index f88ac39435e328ad2a4ada6b3c41a73558fdb791..d0432bb0b958ae6ee452b976b5c403e4bb4c04ba 100644
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -330,6 +330,69 @@ void QuantOutputInt32(const std::vector<index_t> &batch,
 }
 }  // namespace
 
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+namespace {
+void FloatOutput16(const std::vector<index_t> &batch,
+                   const index_t rows,
+                   const index_t depth,
+                   const index_t cols,
+                   const bool transpose_lhs,
+                   const bool transpose_rhs,
+                   const bool lhs_batched = true,
+                   const bool rhs_batched = true) {
+  // Construct graph
+  OpsTestNet net;
+
+  index_t lhs_rows = transpose_lhs ? depth : rows;
+  index_t lhs_cols = transpose_lhs ? rows : depth;
+  index_t rhs_rows = transpose_rhs ? cols : depth;
+  index_t rhs_cols = transpose_rhs ? depth : cols;
+  std::vector<index_t> lhs_shape = {lhs_rows, lhs_cols};
+  std::vector<index_t> rhs_shape = {rhs_rows, rhs_cols};
+  if (lhs_batched) {
+    lhs_shape.insert(lhs_shape.begin(), batch.begin(), batch.end());
+  }
+  if (rhs_batched) {
+    rhs_shape.insert(rhs_shape.begin(), batch.begin(), batch.end());
+  }
+  net.AddRandomInput<CPU, float>("A", lhs_shape);
+  net.AddRandomInput<CPU, float>("B", rhs_shape);
+
+  OpDefBuilder("MatMul", "MatMulTest")
+      .Input("A")
+      .AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
+      .Input("B")
+      .AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
+      .Output("Output")
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+
+  OpDefBuilder("Cast", "CastTest")
+      .Input("B")
+      .Output("HalveB")
+      .OutputType({DT_FLOAT16})
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("MatMul", "Float16MatMulTest")
+      .Input("A")
+      .AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
+      .Input("HalveB")
+      .AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
+      .Output("Float16Output")
+      .AddIntArg("T", DT_FLOAT16)
+      .OutputType({DT_FLOAT})
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                             *net.GetTensor("Float16Output"), 0.01);
+}
+}  // namespace
+#endif  // MACE_ENABLE_NEON
+
 TEST_F(MatMulOpTest, QuantOutputUint8) {
   QuantOutputUint8({1}, 64, 128, 32, false, false);
   QuantOutputUint8({1}, 64, 32, 128, false, false);
@@ -381,6 +444,19 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
   QuantOutputInt32({2, 3}, 31, 61, 67, true, true, false, true);
 }
 
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+TEST_F(MatMulOpTest, FloatOutput16) {
+  FloatOutput16({1}, 1, 512, 30745, false, true, false, false);
+  FloatOutput16({1}, 1, 256, 30000, false, true, false, false);
+  FloatOutput16({1}, 1, 256, 2048, false, true, false, false);
+  FloatOutput16({1}, 1, 2048, 256, false, true, false, false);
+
+  FloatOutput16({1}, 1, 512, 30000, false, true, false, false);
+  FloatOutput16({1}, 1, 512, 512, false, true, false, false);
+  FloatOutput16({1}, 1, 512, 2048, false, true, false, false);
+  FloatOutput16({1}, 1, 2048, 512, false, true, false, false);
+}
+#endif  // MACE_ENABLE_NEON
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py
index 05caccb3dc5284ffe0fa742e7491dd773bdef557..58658dd81d90b7b9110706338ae7328214ada19b 100644
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -140,7 +140,6 @@ def main(unused_args):
     option.winograd = FLAGS.winograd
     option.quantize = FLAGS.quantize
     option.quantize_range_file = FLAGS.quantize_range_file
-    option.fp16_matmul_file = FLAGS.fp16_matmul_file
     option.change_concat_ranges = FLAGS.change_concat_ranges
     option.cl_mem_type = FLAGS.cl_mem_type
     option.device = device_type_map[FLAGS.runtime]
@@ -385,11 +384,6 @@ def parse_args():
         type=str,
         default="",
         help="file path of quantize range for each tensor")
-    parser.add_argument(
"--fp16_matmul_file", - type=str, - default="", - help="file path of matmul names for fp16") parser.add_argument( "--change_concat_ranges", type=str2bool, diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index ff01f59790c9be6a57d766ae183deb5ac5754a64..3f09745b36e44acd567df14b19624dfa54cdc12e 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -391,7 +391,6 @@ class ConverterOption(object): self._winograd = 0 self._quantize = False self._quantize_range_file = "" - self._fp16_matmul_file = "" self._change_concat_ranges = False self._transformer_option = None self._cl_mem_type = "" @@ -432,10 +431,6 @@ class ConverterOption(object): def quantize_range_file(self): return self._quantize_range_file - @property - def fp16_matmul_file(self): - return self._fp16_matmul_file - @property def transformer_option(self): return self._transformer_option @@ -488,10 +483,6 @@ class ConverterOption(object): def quantize_range_file(self, quantize_range_file): self._quantize_range_file = quantize_range_file - @fp16_matmul_file.setter - def fp16_matmul_file(self, fp16_matmul_file): - self._fp16_matmul_file = fp16_matmul_file - @change_concat_ranges.setter def change_concat_ranges(self, change_concat_ranges): self._change_concat_ranges = change_concat_ranges diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index cb095643de1c973a210fc8d3fc700c0ddd5a02eb..71bcc3555459562f3d0010cff43e4935975fb079 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -1905,25 +1905,14 @@ class Transformer(base_converter.ConverterInterface): if self._option.device != DeviceType.CPU.value: return - if self._option.fp16_matmul_file: - with open(self._option.fp16_matmul_file) as f: - lines = f.readlines() - specific_matmul_names = [x.strip() for x in lines] - print('Convert matmul weights to fp16 for:') - for name in specific_matmul_names: - print('\t%s' % name) - else: - specific_matmul_names = None - print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa + print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa for op in self._model.op: if op.type != MaceOp.MatMul.name: continue - if specific_matmul_names is not None and str(op.name) not in specific_matmul_names: # noqa - continue - if specific_matmul_names is None and op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa + if op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa continue - if specific_matmul_names is None and op.input[0] in self._consts and op.input[1] in self._consts: # noqa + if op.input[0] in self._consts and op.input[1] in self._consts: continue # Matmul fp16 Op only support fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] now! 
diff --git a/tools/common.py b/tools/common.py
index a5724ae254f96753c421fe95aef73cbb6ed9d276..0884319ff9f369c0d05271141e16935cdbf57a56 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -416,7 +416,6 @@ class YAMLKeyword(object):
     docker_image_tag = 'docker_image_tag'
     dockerfile_path = 'dockerfile_path'
     dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
-    fp16_matmul_file = 'fp16_matmul_file'
 
 
 ################################
diff --git a/tools/converter.py b/tools/converter.py
index c2a946ab32bea11f12640f08ed8b2d13b40b225e..a5df88a9cecd8493b26b6462b33a9aaff729f99b 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -745,7 +745,6 @@ def convert_model(configs, cl_mem_type):
             model_config[YAMLKeyword.winograd],
             model_config[YAMLKeyword.quantize],
             quantize_range_file_path,
-            model_config.get(YAMLKeyword.fp16_matmul_file, ""),
             model_config[YAMLKeyword.change_concat_ranges],
             model_config[YAMLKeyword.obfuscate],
             configs[YAMLKeyword.model_graph_format],
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 5c14336f697829db1aa5e563c3a0014f464a8188..3b98c7a691bf6a047bdc91bfd4c90cc36d336d4e 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -501,7 +501,6 @@ def gen_model_code(model_codegen_dir,
                    winograd,
                    quantize,
                    quantize_range_file,
-                   fp16_matmul_file,
                    change_concat_ranges,
                    obfuscate,
                    model_graph_format,
@@ -540,7 +539,6 @@ def gen_model_code(model_codegen_dir,
         "--winograd=%s" % winograd,
         "--quantize=%s" % quantize,
         "--quantize_range_file=%s" % quantize_range_file,
-        "--fp16_matmul_file=%s" % fp16_matmul_file,
         "--change_concat_ranges=%s" % change_concat_ranges,
         "--obfuscate=%s" % obfuscate,
         "--output_dir=%s" % model_codegen_dir,