Commit 7b147bbd authored by liukai6

update fp16

Parent 92f6d362
@@ -124,6 +124,7 @@ cc_library(
srcs = glob(
[
"arm/fp32/*.cc",
"arm/fp16/gemv.h",
],
exclude = [
"arm/fp32/*_test.cc",
......
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP16_GEMM_H_
#define MACE_OPS_ARM_FP16_GEMM_H_
#ifndef MACE_OPS_ARM_FP16_GEMV_H_
#define MACE_OPS_ARM_FP16_GEMV_H_
#include "mace/core/types.h"
@@ -117,4 +117,4 @@ void FP16Gemv<float16_t, float, float>(const float16_t *m_ptr,
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP16_GEMM_H_
#endif // MACE_OPS_ARM_FP16_GEMV_H_
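
For context on the renamed header: below is a minimal scalar sketch of the computation FP16Gemv<float16_t, float, float> performs, namely an fp16 weight matrix multiplied by an fp32 vector with fp32 accumulation. Only the first parameter of the real declaration is visible in this hunk, so the remaining parameter names and the body are assumptions for illustration, not the committed NEON kernel.

#include <arm_neon.h>  // float16_t typedef; assumes an AArch64 toolchain with fp16 support

// Scalar reference for what FP16Gemv<float16_t, float, float> computes:
// out[h] = sum over w of m[h][w] * v[w]; weights stored as fp16, math done in fp32.
// Parameter names beyond m_ptr are hypothetical.
void FP16GemvRef(const float16_t *m_ptr,  // height x width fp16 weight matrix
                 const float *v_ptr,      // fp32 vector of length width
                 int height,
                 int width,
                 float *out_ptr) {        // fp32 result of length height
  for (int h = 0; h < height; ++h) {
    const float16_t *row = m_ptr + h * width;
    float sum = 0.f;
    for (int w = 0; w < width; ++w) {
      // float16_t promotes to float for the multiply, so accumulation stays in fp32.
      sum += static_cast<float>(row[w]) * v_ptr[w];
    }
    out_ptr[h] = sum;
  }
}

The committed kernel presumably vectorizes this inner loop with NEON fp16 loads, but that detail lies outside this hunk.
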
@@ -45,7 +45,7 @@
#include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp16_gemm.h"
#include "mace/ops/arm/fp16/gemv.h"
#endif
namespace mace {
......
@@ -396,15 +396,13 @@ void MatMulTransposeBenchmark(
} \
MACE_BENCHMARK(MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#ifdef MACE_ENABLE_QUANTIZE
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
#else
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
#endif
MACE_BM_MATMUL_OP(1, 30000, 256, 1);
@@ -427,6 +425,21 @@ MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 49);
MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 961);
MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 3969);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#define MACE_BM_MATMUL_TRANPOSE_FP16(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 30000);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 256);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 2048);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 256);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 30000);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 512);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 2048);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 512);
#endif // MACE_ENABLE_NEON
} // namespace test
} // namespace ops
} // namespace mace
@@ -330,6 +330,69 @@ void QuantOutputInt32(const std::vector<index_t> &batch,
}
} // namespace
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
namespace {
void FloatOutput16(const std::vector<index_t> &batch,
const index_t rows,
const index_t depth,
const index_t cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool lhs_batched = true,
const bool rhs_batched = true) {
// Construct graph
OpsTestNet net;
index_t lhs_rows = transpose_lhs ? depth : rows;
index_t lhs_cols = transpose_lhs ? rows : depth;
index_t rhs_rows = transpose_rhs ? cols : depth;
index_t rhs_cols = transpose_rhs ? depth: cols;
std::vector<index_t> lhs_shape = {lhs_rows, lhs_cols};
std::vector<index_t> rhs_shape = {rhs_rows, rhs_cols};
if (lhs_batched) {
lhs_shape.insert(lhs_shape.begin(), batch.begin(), batch.end());
}
if (rhs_batched) {
rhs_shape.insert(rhs_shape.begin(), batch.begin(), batch.end());
}
net.AddRandomInput<CPU, float>("A", lhs_shape);
net.AddRandomInput<CPU, float>("B", rhs_shape);
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
.Input("B")
.AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
.Output("Output")
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
OpDefBuilder("Cast", "CastTest")
.Input("B")
.Output("HalveB")
.OutputType({DT_FLOAT16})
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("MatMul", "Float16MatMulTest")
.Input("A")
.AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
.Input("HalveB")
.AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
.Output("Float16Output")
.AddIntArg("T", DT_FLOAT16)
.OutputType({DT_FLOAT})
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("Float16Output"), 0.01);
}
} // namespace
#endif // MACE_ENABLE_NEON
TEST_F(MatMulOpTest, QuantOutputUint8) {
QuantOutputUint8({1}, 64, 128, 32, false, false);
QuantOutputUint8({1}, 64, 32, 128, false, false);
@@ -381,6 +444,19 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
QuantOutputInt32({2, 3}, 31, 61, 67, true, true, false, true);
}
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
TEST_F(MatMulOpTest, FloatOutput16) {
FloatOutput16({1}, 1, 512, 30745, false, true, false, false);
FloatOutput16({1}, 1, 256, 30000, false, true, false, false);
FloatOutput16({1}, 1, 256, 2048, false, true, false, false);
FloatOutput16({1}, 1, 2048, 256, false, true, false, false);
FloatOutput16({1}, 1, 512, 30000, false, true, false, false);
FloatOutput16({1}, 1, 512, 512, false, true, false, false);
FloatOutput16({1}, 1, 512, 2048, false, true, false, false);
FloatOutput16({1}, 1, 2048, 512, false, true, false, false);
}
#endif // MACE_ENABLE_NEON
} // namespace test
} // namespace ops
} // namespace mace
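
To make the numerical check in FloatOutput16 concrete, here is a self-contained sketch of what the test compares: the same matrix product computed once with fp32 weights and once with the weights rounded to fp16 (mirroring the Cast op in the test graph), accumulated in fp32 in both cases. The cosine-style similarity below is only an illustrative stand-in; the exact metric inside ExpectTensorSimilar is not part of this hunk.

#include <arm_neon.h>  // float16_t typedef; assumes an AArch64 toolchain, as above

#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  const int depth = 256, cols = 2048;  // one of the shapes exercised by the tests above
  std::mt19937 rng(0);
  std::uniform_real_distribution<float> dist(-1.f, 1.f);

  std::vector<float> a(depth), b(depth * cols);
  for (auto &v : a) v = dist(rng);
  for (auto &v : b) v = dist(rng);

  // fp16 copy of the weights, mimicking the Cast op that produces "HalveB".
  std::vector<float16_t> b_half(b.begin(), b.end());

  double dot = 0.0, norm_ref = 0.0, norm_half = 0.0;
  for (int c = 0; c < cols; ++c) {
    float ref = 0.f, half = 0.f;
    for (int d = 0; d < depth; ++d) {
      ref += a[d] * b[d * cols + c];
      half += a[d] * static_cast<float>(b_half[d * cols + c]);  // fp32 accumulation
    }
    dot += double(ref) * half;
    norm_ref += double(ref) * ref;
    norm_half += double(half) * half;
  }
  // With fp16 storage and fp32 accumulation the similarity stays very close to 1,
  // which is why the 0.01 tolerance used by the test is comfortable for these shapes.
  std::printf("similarity: %f\n", dot / (std::sqrt(norm_ref) * std::sqrt(norm_half)));
  return 0;
}
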
@@ -140,7 +140,6 @@ def main(unused_args):
option.winograd = FLAGS.winograd
option.quantize = FLAGS.quantize
option.quantize_range_file = FLAGS.quantize_range_file
option.fp16_matmul_file = FLAGS.fp16_matmul_file
option.change_concat_ranges = FLAGS.change_concat_ranges
option.cl_mem_type = FLAGS.cl_mem_type
option.device = device_type_map[FLAGS.runtime]
@@ -385,11 +384,6 @@ def parse_args():
type=str,
default="",
help="file path of quantize range for each tensor")
parser.add_argument(
"--fp16_matmul_file",
type=str,
default="",
help="file path of matmul names for fp16")
parser.add_argument(
"--change_concat_ranges",
type=str2bool,
......
@@ -391,7 +391,6 @@ class ConverterOption(object):
self._winograd = 0
self._quantize = False
self._quantize_range_file = ""
self._fp16_matmul_file = ""
self._change_concat_ranges = False
self._transformer_option = None
self._cl_mem_type = ""
@@ -432,10 +431,6 @@
def quantize_range_file(self):
return self._quantize_range_file
@property
def fp16_matmul_file(self):
return self._fp16_matmul_file
@property
def transformer_option(self):
return self._transformer_option
@@ -488,10 +483,6 @@
def quantize_range_file(self, quantize_range_file):
self._quantize_range_file = quantize_range_file
@fp16_matmul_file.setter
def fp16_matmul_file(self, fp16_matmul_file):
self._fp16_matmul_file = fp16_matmul_file
@change_concat_ranges.setter
def change_concat_ranges(self, change_concat_ranges):
self._change_concat_ranges = change_concat_ranges
......
@@ -1905,25 +1905,14 @@ class Transformer(base_converter.ConverterInterface):
if self._option.device != DeviceType.CPU.value:
return
if self._option.fp16_matmul_file:
with open(self._option.fp16_matmul_file) as f:
lines = f.readlines()
specific_matmul_names = [x.strip() for x in lines]
print('Convert matmul weights to fp16 for:')
for name in specific_matmul_names:
print('\t%s' % name)
else:
specific_matmul_names = None
print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa
print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa
for op in self._model.op:
if op.type != MaceOp.MatMul.name:
continue
if specific_matmul_names is not None and str(op.name) not in specific_matmul_names: # noqa
continue
if specific_matmul_names is None and op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa
if op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa
continue
if specific_matmul_names is None and op.input[0] in self._consts and op.input[1] in self._consts: # noqa
if op.input[0] in self._consts and op.input[1] in self._consts:
continue
# Matmul fp16 Op only support fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] now! # noqa
......
@@ -416,7 +416,6 @@ class YAMLKeyword(object):
docker_image_tag = 'docker_image_tag'
dockerfile_path = 'dockerfile_path'
dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
fp16_matmul_file = 'fp16_matmul_file'
################################
......
@@ -745,7 +745,6 @@ def convert_model(configs, cl_mem_type):
model_config[YAMLKeyword.winograd],
model_config[YAMLKeyword.quantize],
quantize_range_file_path,
model_config.get(YAMLKeyword.fp16_matmul_file, ""),
model_config[YAMLKeyword.change_concat_ranges],
model_config[YAMLKeyword.obfuscate],
configs[YAMLKeyword.model_graph_format],
......
@@ -501,7 +501,6 @@ def gen_model_code(model_codegen_dir,
winograd,
quantize,
quantize_range_file,
fp16_matmul_file,
change_concat_ranges,
obfuscate,
model_graph_format,
@@ -540,7 +539,6 @@
"--winograd=%s" % winograd,
"--quantize=%s" % quantize,
"--quantize_range_file=%s" % quantize_range_file,
"--fp16_matmul_file=%s" % fp16_matmul_file,
"--change_concat_ranges=%s" % change_concat_ranges,
"--obfuscate=%s" % obfuscate,
"--output_dir=%s" % model_codegen_dir,
......