diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel
index 7217567d9147eaf938c9e424f3ea77cda2b4417b..1d651c1b3274bab30a7adf30f9f63be2c57dca7f 100644
--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -124,6 +124,7 @@ cc_library(
     srcs = glob(
         [
            "arm/fp32/*.cc",
+            "arm/fp16/gemv.h",
         ],
         exclude = [
             "arm/fp32/*_test.cc",
diff --git a/mace/ops/arm/fp16_gemm.h b/mace/ops/arm/fp16/gemv.h
similarity index 97%
rename from mace/ops/arm/fp16_gemm.h
rename to mace/ops/arm/fp16/gemv.h
index 878080684f288692250fff0d8f8bc64f7cea7f1d..8e7e2a3c91438303f6724b740f16739db2ed5ebc 100644
--- a/mace/ops/arm/fp16_gemm.h
+++ b/mace/ops/arm/fp16/gemv.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_ARM_FP16_GEMM_H_
-#define MACE_OPS_ARM_FP16_GEMM_H_
+#ifndef MACE_OPS_ARM_FP16_GEMV_H_
+#define MACE_OPS_ARM_FP16_GEMV_H_
 
 #include "mace/core/types.h"
 
@@ -117,4 +117,4 @@ void FP16Gemv(const float16_t *m_ptr,
 }  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_OPS_ARM_FP16_GEMM_H_
+#endif  // MACE_OPS_ARM_FP16_GEMV_H_
diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc
index 3799eafd05274196665e34f165809cab6c5c72d8..592d25ae724ed8a93191049a31097a4e95c91d2a 100644
--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -45,7 +45,7 @@
 #include "mace/ops/opencl/image/matmul.h"
 #endif  // MACE_ENABLE_OPENCL
 #ifdef MACE_ENABLE_NEON
-#include "mace/ops/arm/fp16_gemm.h"
+#include "mace/ops/arm/fp16/gemv.h"
 #endif
 
 namespace mace {
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 087c824e389434f05fb0577146dabfa5df82fe55..c0d5af05c172cd108286fa3ec1c0f25ee776531a 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -396,15 +396,13 @@ void MatMulTransposeBenchmark(
   }                                                                          \
   MACE_BENCHMARK(MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
 
-#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#ifdef MACE_ENABLE_QUANTIZE
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)                     \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);       \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);   \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
 #else
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)                     \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);       \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
 #endif
 
 MACE_BM_MATMUL_OP(1, 30000, 256, 1);
@@ -427,6 +425,21 @@ MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 49);
 MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 961);
 MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 3969);
 
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#define MACE_BM_MATMUL_TRANPOSE_FP16(N, H, C, W) \
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);
+
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 30000);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 256);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 2048);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 256);
+
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 30000);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 512);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 2048);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 512);
+#endif  // MACE_ENABLE_NEON
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc
index f88ac39435e328ad2a4ada6b3c41a73558fdb791..d0432bb0b958ae6ee452b976b5c403e4bb4c04ba 100644
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -330,6 +330,69 @@ void QuantOutputInt32(const std::vector<index_t> &batch,
 }
 }  // namespace
 
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+namespace {
+void FloatOutput16(const std::vector<index_t> &batch,
+                   const index_t rows,
+                   const index_t depth,
+                   const index_t cols,
+                   const bool transpose_lhs,
+                   const bool transpose_rhs,
+                   const bool lhs_batched = true,
+                   const bool rhs_batched = true) {
+  // Construct graph
+  OpsTestNet net;
+
+  index_t lhs_rows = transpose_lhs ? depth : rows;
+  index_t lhs_cols = transpose_lhs ? rows : depth;
+  index_t rhs_rows = transpose_rhs ? cols : depth;
+  index_t rhs_cols = transpose_rhs ? depth : cols;
+  std::vector<index_t> lhs_shape = {lhs_rows, lhs_cols};
+  std::vector<index_t> rhs_shape = {rhs_rows, rhs_cols};
+  if (lhs_batched) {
+    lhs_shape.insert(lhs_shape.begin(), batch.begin(), batch.end());
+  }
+  if (rhs_batched) {
+    rhs_shape.insert(rhs_shape.begin(), batch.begin(), batch.end());
+  }
+  net.AddRandomInput<CPU, float>("A", lhs_shape);
+  net.AddRandomInput<CPU, float>("B", rhs_shape);
+
+  OpDefBuilder("MatMul", "MatMulTest")
+      .Input("A")
+      .AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
+      .Input("B")
+      .AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
+      .Output("Output")
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+
+  OpDefBuilder("Cast", "CastTest")
+      .Input("B")
+      .Output("HalveB")
+      .OutputType({DT_FLOAT16})
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("MatMul", "Float16MatMulTest")
+      .Input("A")
+      .AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
+      .Input("HalveB")
+      .AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
+      .Output("Float16Output")
+      .AddIntArg("T", DT_FLOAT16)
+      .OutputType({DT_FLOAT})
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                             *net.GetTensor("Float16Output"), 0.01);
+}
+}  // namespace
+#endif  // MACE_ENABLE_NEON
+
 TEST_F(MatMulOpTest, QuantOutputUint8) {
   QuantOutputUint8({1}, 64, 128, 32, false, false);
   QuantOutputUint8({1}, 64, 32, 128, false, false);
@@ -381,6 +444,19 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
   QuantOutputInt32({2, 3}, 31, 61, 67, true, true, false, true);
 }
 
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+TEST_F(MatMulOpTest, FloatOutput16) {
+  FloatOutput16({1}, 1, 512, 30745, false, true, false, false);
+  FloatOutput16({1}, 1, 256, 30000, false, true, false, false);
+  FloatOutput16({1}, 1, 256, 2048, false, true, false, false);
+  FloatOutput16({1}, 1, 2048, 256, false, true, false, false);
+
+  FloatOutput16({1}, 1, 512, 30000, false, true, false, false);
+  FloatOutput16({1}, 1, 512, 512, false, true, false, false);
+  FloatOutput16({1}, 1, 512, 2048, false, true, false, false);
+  FloatOutput16({1}, 1, 2048, 512, false, true, false, false);
+}
+#endif  // MACE_ENABLE_NEON
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py
index 05caccb3dc5284ffe0fa742e7491dd773bdef557..58658dd81d90b7b9110706338ae7328214ada19b 100644
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -140,7 +140,6 @@ def main(unused_args):
     option.winograd = FLAGS.winograd
     option.quantize = FLAGS.quantize
     option.quantize_range_file = FLAGS.quantize_range_file
-    option.fp16_matmul_file = FLAGS.fp16_matmul_file
     option.change_concat_ranges = FLAGS.change_concat_ranges
     option.cl_mem_type = FLAGS.cl_mem_type
     option.device = device_type_map[FLAGS.runtime]
@@ -385,11 +384,6 @@ def parse_args():
         type=str,
         default="",
         help="file path of quantize range for each tensor")
-    parser.add_argument(
"--fp16_matmul_file", - type=str, - default="", - help="file path of matmul names for fp16") parser.add_argument( "--change_concat_ranges", type=str2bool, diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index ff01f59790c9be6a57d766ae183deb5ac5754a64..3f09745b36e44acd567df14b19624dfa54cdc12e 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -391,7 +391,6 @@ class ConverterOption(object): self._winograd = 0 self._quantize = False self._quantize_range_file = "" - self._fp16_matmul_file = "" self._change_concat_ranges = False self._transformer_option = None self._cl_mem_type = "" @@ -432,10 +431,6 @@ class ConverterOption(object): def quantize_range_file(self): return self._quantize_range_file - @property - def fp16_matmul_file(self): - return self._fp16_matmul_file - @property def transformer_option(self): return self._transformer_option @@ -488,10 +483,6 @@ class ConverterOption(object): def quantize_range_file(self, quantize_range_file): self._quantize_range_file = quantize_range_file - @fp16_matmul_file.setter - def fp16_matmul_file(self, fp16_matmul_file): - self._fp16_matmul_file = fp16_matmul_file - @change_concat_ranges.setter def change_concat_ranges(self, change_concat_ranges): self._change_concat_ranges = change_concat_ranges diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index cb095643de1c973a210fc8d3fc700c0ddd5a02eb..71bcc3555459562f3d0010cff43e4935975fb079 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -1905,25 +1905,14 @@ class Transformer(base_converter.ConverterInterface): if self._option.device != DeviceType.CPU.value: return - if self._option.fp16_matmul_file: - with open(self._option.fp16_matmul_file) as f: - lines = f.readlines() - specific_matmul_names = [x.strip() for x in lines] - print('Convert matmul weights to fp16 for:') - for name in specific_matmul_names: - print('\t%s' % name) - else: - specific_matmul_names = None - print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa + print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa for op in self._model.op: if op.type != MaceOp.MatMul.name: continue - if specific_matmul_names is not None and str(op.name) not in specific_matmul_names: # noqa - continue - if specific_matmul_names is None and op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa + if op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa continue - if specific_matmul_names is None and op.input[0] in self._consts and op.input[1] in self._consts: # noqa + if op.input[0] in self._consts and op.input[1] in self._consts: continue # Matmul fp16 Op only support fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] now! 
diff --git a/tools/common.py b/tools/common.py
index a5724ae254f96753c421fe95aef73cbb6ed9d276..0884319ff9f369c0d05271141e16935cdbf57a56 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -416,7 +416,6 @@ class YAMLKeyword(object):
     docker_image_tag = 'docker_image_tag'
     dockerfile_path = 'dockerfile_path'
     dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
-    fp16_matmul_file = 'fp16_matmul_file'
 
 
 ################################
diff --git a/tools/converter.py b/tools/converter.py
index c2a946ab32bea11f12640f08ed8b2d13b40b225e..a5df88a9cecd8493b26b6462b33a9aaff729f99b 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -745,7 +745,6 @@ def convert_model(configs, cl_mem_type):
             model_config[YAMLKeyword.winograd],
             model_config[YAMLKeyword.quantize],
             quantize_range_file_path,
-            model_config.get(YAMLKeyword.fp16_matmul_file, ""),
             model_config[YAMLKeyword.change_concat_ranges],
             model_config[YAMLKeyword.obfuscate],
             configs[YAMLKeyword.model_graph_format],
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 5c14336f697829db1aa5e563c3a0014f464a8188..3b98c7a691bf6a047bdc91bfd4c90cc36d336d4e 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -501,7 +501,6 @@ def gen_model_code(model_codegen_dir,
                    winograd,
                    quantize,
                    quantize_range_file,
-                   fp16_matmul_file,
                    change_concat_ranges,
                    obfuscate,
                    model_graph_format,
@@ -540,7 +539,6 @@ def gen_model_code(model_codegen_dir,
         "--winograd=%s" % winograd,
         "--quantize=%s" % quantize,
         "--quantize_range_file=%s" % quantize_range_file,
-        "--fp16_matmul_file=%s" % fp16_matmul_file,
         "--change_concat_ranges=%s" % change_concat_ranges,
         "--obfuscate=%s" % obfuscate,
         "--output_dir=%s" % model_codegen_dir,