From 0c5aab6750e0236babe0aed871af29bb4b9e22bb Mon Sep 17 00:00:00 2001
From: yulianfei
Date: Thu, 29 Nov 2018 21:17:09 +0800
Subject: [PATCH] Reformat fp16 gemv, add FP16_MATMUL_WEIGHT

Summary: Reformat fp16 gemv, add FP16_MATMUL_WEIGHT

Differential Revision: https://phabricator.d.xiaomi.net/D144134
---
 mace/core/BUILD.bazel                         |   2 +-
 mace/core/tensor.h                            |  31 +++--
 mace/core/types.cc                            |   4 +
 mace/core/types.h                             |   6 +
 mace/ops/BUILD.bazel                          |  12 +-
 mace/ops/arm/fp16_gemm.h                      | 122 ++++++++++++++++++
 mace/ops/matmul.cc                            |  96 ++++++++++++++
 mace/ops/matmul_benchmark.cc                  |  25 ++--
 mace/ops/ops_test_util.h                      |   3 +-
 mace/proto/mace.proto                         |   1 +
 mace/python/tools/converter.py                |   6 +
 .../tools/converter_tool/base_converter.py    |  10 ++
 .../tools/converter_tool/transformer.py       |  65 ++++++++++
 mace/python/tools/model_saver.py              |   9 +-
 tools/common.py                               |   1 +
 tools/converter.py                            |   1 +
 tools/sh_commands.py                          |   2 +
 17 files changed, 362 insertions(+), 34 deletions(-)
 create mode 100644 mace/ops/arm/fp16_gemm.h

diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel
index 628a156d..7497e5b4 100644
--- a/mace/core/BUILD.bazel
+++ b/mace/core/BUILD.bazel
@@ -71,7 +71,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]),
     linkopts = ["-ldl"],
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index dc6c8f62..3f0585b8 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -46,28 +46,28 @@ namespace mace {
       break;                                                       \
   }

-#ifdef MACE_ENABLE_OPENCL
-#define MACE_TYPE_ENUM_SWITCH(                                               \
-    TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS)           \
-  switch (TYPE_ENUM) {                                                       \
-    MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))                             \
-    MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS))                            \
-    MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS))                          \
-    MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS))                          \
-    case DT_INVALID:                                                         \
-      INVALID_STATEMENTS;                                                    \
-      break;                                                                 \
-    default:                                                                 \
-      DEFAULT_STATEMENTS;                                                    \
-      break;                                                                 \
-  }
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)                          \
+  MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS))
+#else
+#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
+#endif
+
+#if MACE_ENABLE_OPENCL
+#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)                        \
+  MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
 #else
+#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)
+#endif
+
 #define MACE_TYPE_ENUM_SWITCH(                                               \
     TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS)           \
   switch (TYPE_ENUM) {                                                       \
     MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS))                            \
     MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS))                          \
     MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS))                          \
+    MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)                              \
+    MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)                            \
     case DT_INVALID:                                                         \
       INVALID_STATEMENTS;                                                    \
       break;                                                                 \
@@ -75,7 +75,6 @@ namespace mace {
       DEFAULT_STATEMENTS;                                                    \
       break;                                                                 \
   }
-#endif

 // `TYPE_ENUM` will be converted to template `T` in `STATEMENTS`
 #define MACE_RUN_WITH_TYPE_ENUM(TYPE_ENUM, STATEMENTS)                       \
diff --git a/mace/core/types.cc b/mace/core/types.cc
index 9b09d1d0..2b5393a6 100644
--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -47,6 +47,10 @@ size_t GetEnumTypeSize(const DataType dt) {
       return sizeof(float);
     case DT_HALF:
       return sizeof(half);
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+    case DT_FLOAT16:
+      return sizeof(float16_t);
+#endif
     case DT_UINT8:
       return sizeof(uint8_t);
     case DT_INT32:
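Note (not part of the patch): after this change MACE carries two distinct 16-bit float types. DT_HALF still maps to the software `half` type from include/half.hpp and is only switched in when OpenCL is enabled, while the new DT_FLOAT16 maps to the hardware `float16_t` from <arm_neon.h> and is only available on Android NEON builds. A minimal sketch of what the type/enum mapping below provides, assuming those guards hold:

    // Sketch only; DataTypeToEnum and GetEnumTypeSize are the existing
    // helpers from mace/core/types.h that the patch extends.
    static_assert(sizeof(float16_t) == 2, "hardware fp16 is 2 bytes");
    DataType dt = DataTypeToEnum<float16_t>::value;  // == DT_FLOAT16
    size_t bytes = GetEnumTypeSize(dt);              // == 2
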
diff --git a/mace/core/types.h b/mace/core/types.h
index 8dde57fd..f965f286 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -17,6 +17,9 @@
 #include <cstdint>
 #include <string>
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+#include <arm_neon.h>
+#endif

 #include "mace/proto/mace.pb.h"
 #include "include/half.hpp"
@@ -51,6 +54,9 @@ struct EnumToDataType;
   };

 MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF);
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16);
+#endif
 MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
 MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
 MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel
index 5d2d2cb2..37fec862 100644
--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -324,7 +324,8 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
+    ]) + if_android_armv7([
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -368,7 +369,8 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
+    ]) + if_android_armv7([
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -398,7 +400,7 @@ cc_library(
     ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
     ]) + if_android_armv7([
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
@@ -439,7 +441,7 @@ cc_test(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -476,7 +478,7 @@ cc_test(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
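Note (not part of the patch): on armv7, -mfpu=neon-fp16 is what makes the half-precision conversion instructions available; plain -mfpu=neon does not include them, and AArch64 has them unconditionally. A minimal sketch of the conversion this flag unlocks, using the same intrinsics as the GEMV kernel that follows:

    #include <arm_neon.h>
    // Widen four fp16 values to four fp32 lanes; on armv7 this requires
    // -mfpu=neon-fp16, as set in the BUILD files above.
    float32x4_t LoadFp16AsFp32(const float16_t *p) {
      return vcvt_f32_f16(vld1_f16(p));
    }
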
diff --git a/mace/ops/arm/fp16_gemm.h b/mace/ops/arm/fp16_gemm.h
new file mode 100644
index 00000000..860e259f
--- /dev/null
+++ b/mace/ops/arm/fp16_gemm.h
@@ -0,0 +1,122 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP16_GEMM_H_
+#define MACE_OPS_ARM_FP16_GEMM_H_
+
+#include "mace/core/types.h"
+
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+#include <arm_neon.h>
+#endif
+
+#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
+#if defined(__ANDROID__) && defined(ANDROID)
+#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
+#endif
+#endif
+
+namespace mace {
+namespace ops {
+
+template<typename INPUT_TYPE_LEFT,
+         typename INPUT_TYPE_RIGHT,
+         typename OUTPUT_TYPE>
+void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr,
+              const INPUT_TYPE_RIGHT *v_ptr,
+              const index_t height,
+              const index_t width,
+              OUTPUT_TYPE *result);
+
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+template<>
+void FP16Gemv<float16_t, float, float>(const float16_t *m_ptr,
+                                       const float *v_ptr,
+                                       const index_t height,
+                                       const index_t width,
+                                       float *out_ptr) {
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    const float16_t *m_ptr0 = m_ptr + h * width;
+    const float *v_ptr0 = v_ptr;
+    float *out_ptr0 = out_ptr + h;
+    float sum0 = 0;
+
+    float32x4_t vm0, vm1, vm2, vm3;
+    float32x4_t vv0, vv1, vv2, vv3;
+    float32x4_t vsum0 = vdupq_n_f32(0.f);
+    float32x4_t vsum1 = vdupq_n_f32(0.f);
+    float32x4_t vsum2 = vdupq_n_f32(0.f);
+    float32x4_t vsum3 = vdupq_n_f32(0.f);
+
+    index_t w;
+    for (w = 0; w + 15 < width; w += 16) {
+      vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
+      vv0 = vld1q_f32(v_ptr0);
+      vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
+      vv1 = vld1q_f32(v_ptr0 + 4);
+      vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8));
+      vv2 = vld1q_f32(v_ptr0 + 8);
+      vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12));
+      vv3 = vld1q_f32(v_ptr0 + 12);
+
+      vsum0 = vmlaq_f32(vsum0, vm0, vv0);
+      vsum1 = vmlaq_f32(vsum1, vm1, vv1);
+      vsum2 = vmlaq_f32(vsum2, vm2, vv2);
+      vsum3 = vmlaq_f32(vsum3, vm3, vv3);
+
+      m_ptr0 += 16;
+      v_ptr0 += 16;
+    }
+
+    for (; w + 7 < width; w += 8) {
+      vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
+      vv0 = vld1q_f32(v_ptr0);
+      vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
+      vv1 = vld1q_f32(v_ptr0 + 4);
+
+      vsum0 = vmlaq_f32(vsum0, vm0, vv0);
+      vsum1 = vmlaq_f32(vsum1, vm1, vv1);
+
+      m_ptr0 += 8;
+      v_ptr0 += 8;
+    }
+
+    for (; w + 3 < width; w += 4) {
+      vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
+      vv0 = vld1q_f32(v_ptr0);
+      vsum0 = vmlaq_f32(vsum0, vm0, vv0);
+
+      m_ptr0 += 4;
+      v_ptr0 += 4;
+    }
+    vsum0 += vsum1;
+    vsum2 += vsum3;
+    vsum0 += vsum2;
+    sum0 = vaddvq_f32(vsum0);
+
+    for (; w < width; ++w) {
+      sum0 += m_ptr0[0] * v_ptr0[0];
+      m_ptr0++;
+      v_ptr0++;
+    }
+    *out_ptr0++ = sum0;
+  }
+}
+#endif
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP16_GEMM_H_
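Note (not part of the patch): the specialization above widens each fp16 lane to fp32 with vcvt_f32_f16, accumulates into four independent fp32 registers to hide multiply-accumulate latency, then reduces with vaddvq_f32 (emulated on armv7 by the macro above). A plain C++ reference of the same contract (out[h] = dot(M[h, :], v)), useful as a check against the NEON path; the names here are illustrative, only index_t comes from mace/core/types.h:

    template <typename LhsT, typename RhsT, typename OutT>
    void ReferenceGemv(const LhsT *m, const RhsT *v,
                       index_t height, index_t width, OutT *out) {
      for (index_t h = 0; h < height; ++h) {
        float acc = 0.f;  // accumulate in fp32, matching the NEON kernel
        for (index_t w = 0; w < width; ++w) {
          acc += static_cast<float>(m[h * width + w]) *
                 static_cast<float>(v[w]);
        }
        out[h] = acc;
      }
    }
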
diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc
index b662ce2e..98d8c30c 100644
--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -44,6 +44,9 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/matmul.h"
 #endif  // MACE_ENABLE_OPENCL
+#ifdef MACE_ENABLE_NEON
+#include "mace/ops/arm/fp16_gemm.h"
+#endif

 namespace mace {
 namespace ops {
@@ -510,6 +513,86 @@ class MatMulOp : public MatMulOpBase {
 };
 #endif  // MACE_ENABLE_OPENCL

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+template <>
+class MatMulOp<DeviceType::CPU, float16_t> : public MatMulOpBase {
+ public:
+  explicit MatMulOp(OpConstructContext *context)
+      : MatMulOpBase(context) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_CHECK_NOTNULL(context);
+    Validate();
+    const Tensor *A = this->Input(INPUT_A);
+    const Tensor *B = this->Input(INPUT_B);
+    Tensor *C = this->Output(OUTPUT);
+
+    index_t batch;
+    index_t height;
+    index_t K;
+    index_t width;
+
+    index_t rank = A->dim_size();
+    height = A->dim(rank - 2);
+    K = A->dim(rank - 1);
+    if (transpose_a_) {
+      std::swap(height, K);
+    }
+    if (transpose_b_) {
+      width = B->dim(rank - 2);
+    } else {
+      width = B->dim(rank - 1);
+    }
+    batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
+                            std::multiplies<index_t>());
+
+    std::vector<index_t> c_shape = A->shape();
+    c_shape[rank - 2] = height;
+    c_shape[rank - 1] = width;
+
+    MACE_RETURN_IF_ERROR(C->Resize(c_shape));
+
+    Tensor::MappingGuard guarda(A);
+    Tensor::MappingGuard guardb(B);
+    Tensor::MappingGuard guardc(C);
+    auto *c_ptr_base = C->mutable_data<float>();
+
+    MACE_CHECK(batch == 1, "matmul fp16 only support batch = 1 now");
+
+    if (width == 1 && !transpose_a_ && A->dtype() == DT_FLOAT16 &&
+        B->dtype() == DT_FLOAT) {
+      auto *a_ptr_base = A->data<float16_t>();
+      auto *b_ptr_base = B->data<float>();
+      FP16Gemv(a_ptr_base,
+               b_ptr_base,
+               height,
+               K,
+               c_ptr_base);
+      return MaceStatus::MACE_SUCCESS;
+    } else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT &&
+               B->dtype() == DT_FLOAT16) {
+      auto *b_ptr_base = B->data<float16_t>();
+      auto *a_ptr_base = A->data<float>();
+      FP16Gemv(b_ptr_base,
+               a_ptr_base,
+               width,
+               K,
+               c_ptr_base);
+      return MaceStatus::MACE_SUCCESS;
+    } else {
+      LOG(INFO) << "Matmul fp16 gemv args: " << height << " " << width << " "
+                << transpose_a_ << " " << transpose_b_;
+      LOG(FATAL) << "Matmul fp16 Op only support fp32[1,k]·fp16[w,k]T or"
+                    " fp16[w,k]·fp32[k,1] now!";
+      return MaceStatus::MACE_INVALID_ARGS;
+    }
+  }
+
+ private:
+};
+#endif  // MACE_ENABLE_NEON
+
+
 void RegisterMatMul(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                    DeviceType::CPU, float);
@@ -518,6 +601,19 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                    DeviceType::CPU, uint8_t);
 #endif  // MACE_ENABLE_QUANTIZE
+
+#ifdef MACE_ENABLE_OPENCL
+  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
+                   DeviceType::GPU, float);
+
+  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
+                   DeviceType::GPU, half);
+#endif  // MACE_ENABLE_OPENCL
+
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) && defined(ANDROID)
+  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
+                   DeviceType::CPU, float16_t);
+#endif  // MACE_ENABLE_NEON
 }

 }  // namespace ops
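Note (not part of the patch): the float16_t specialization accepts exactly two shapes, and both normalize to the same GEMV call with the fp16 tensor as the matrix and the fp32 tensor as the vector. A sketch of the dispatch rule implemented above, written as a hypothetical helper (the function name is not part of MACE):

    bool DispatchableToFp16Gemv(index_t batch, index_t height, index_t width,
                                bool transpose_a, bool transpose_b,
                                DataType a_type, DataType b_type) {
      if (batch != 1) return false;  // enforced by MACE_CHECK above
      // case 1: A(fp16)[height, K] * B(fp32)[K, 1]
      bool a_is_fp16_weight = (width == 1) && !transpose_a &&
                              a_type == DT_FLOAT16 && b_type == DT_FLOAT;
      // case 2: A(fp32)[1, K] * B(fp16)[width, K]^T
      bool b_is_fp16_weight = (height == 1) && transpose_b &&
                              a_type == DT_FLOAT && b_type == DT_FLOAT16;
      return a_is_fp16_weight || b_is_fp16_weight;
    }

Anything else falls through to LOG(FATAL) at runtime.
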
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 2ab46bac..e87dc91b 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -263,8 +263,13 @@ void MatMulBenchmark(
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput<D, T>("A", {batch, height, channels});
-  net.AddRandomInput<D, T>("B", {batch, channels, out_width});
+  if (DataTypeToEnum<T>::value == DT_FLOAT16) {
+    net.AddRandomInput<D, float16_t>("A", {batch, height, channels});
+    net.AddRandomInput<D, float>("B", {batch, channels, out_width});
+  } else {
+    net.AddRandomInput<D, T>("A", {batch, height, channels});
+    net.AddRandomInput<D, T>("B", {batch, channels, out_width});
+  }
   net.GetTensor("A")->SetIsWeight(true);
   net.GetTensor("B")->SetIsWeight(true);
   if (DataTypeToEnum<T>::value == DT_UINT8) {
@@ -305,8 +310,13 @@ void MatMulTransposeBenchmark(
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput<D, T>("A", {batch, height, channels});
-  net.AddRandomInput<D, T>("B", {batch, out_width, channels});
+  if (DataTypeToEnum<T>::value == DT_FLOAT16) {
+    net.AddRandomInput<D, float>("A", {batch, height, channels});
+    net.AddRandomInput<D, float16_t>("B", {batch, out_width, channels});
+  } else {
+    net.AddRandomInput<D, T>("A", {batch, height, channels});
+    net.AddRandomInput<D, T>("B", {batch, out_width, channels});
+  }
   net.GetTensor("A")->SetIsWeight(true);
   net.GetTensor("B")->SetIsWeight(true);
   if (DataTypeToEnum<T>::value == DT_UINT8) {
@@ -381,11 +391,8 @@ void MatMulTransposeBenchmark(
 #ifdef MACE_ENABLE_QUANTIZE
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)                    \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);      \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU)
-#else
-#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)                    \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU)
-#endif
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);  \
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);

 MACE_BM_MATMUL_OP(1, 30000, 256, 1);
 MACE_BM_MATMUL_OP(1, 128, 256, 128);
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index bdc67037..cc761699 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -162,7 +162,8 @@ class OpsTestNet {
     std::random_device rd;
     std::mt19937 gen(rd());
     std::normal_distribution<float> nd(0, 1);
-    if (DataTypeToEnum<T>::value == DT_HALF) {
+    if (DataTypeToEnum<T>::value == DT_HALF ||
+        DataTypeToEnum<T>::value == DT_FLOAT16) {
       std::generate(
           input_data, input_data + input->size(),
           [&gen, &nd, positive, truncate, truncate_min, truncate_max] {
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index d3b564fc..540b1b1a 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -13,6 +13,7 @@ enum DataType {
   DT_UINT8 = 2;
   DT_HALF = 3;
   DT_INT32 = 4;
+  DT_FLOAT16 = 5;
 }

 enum MemoryType {
diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py
index 58658dd8..05caccb3 100644
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -140,6 +140,7 @@ def main(unused_args):
     option.winograd = FLAGS.winograd
     option.quantize = FLAGS.quantize
     option.quantize_range_file = FLAGS.quantize_range_file
+    option.fp16_matmul_file = FLAGS.fp16_matmul_file
     option.change_concat_ranges = FLAGS.change_concat_ranges
     option.cl_mem_type = FLAGS.cl_mem_type
     option.device = device_type_map[FLAGS.runtime]
@@ -384,6 +385,11 @@ def parse_args():
         type=str,
         default="",
         help="file path of quantize range for each tensor")
+    parser.add_argument(
+        "--fp16_matmul_file",
+        type=str,
+        default="",
+        help="file path of matmul names for fp16")
     parser.add_argument(
         "--change_concat_ranges",
         type=str2bool,
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index 61e65bae..871e0293 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -314,6 +314,7 @@ class TransformerRule(Enum):
     TRANSFORM_CHANNEL_SHUFFLE = 38
     UPDATE_DATA_FORMAT = 39
     QUANTIZE_SPECIFIC_OPS_ONLY = 40
+    FP16_MATMUL_WEIGHT = 41


 class ConverterInterface(object):
@@ -389,6 +390,7 @@ class ConverterOption(object):
         self._winograd = 0
         self._quantize = False
         self._quantize_range_file = ""
+        self._fp16_matmul_file = ""
         self._change_concat_ranges = False
         self._transformer_option = None
         self._cl_mem_type = ""
@@ -429,6 +431,10 @@ class ConverterOption(object):
     def quantize_range_file(self):
         return self._quantize_range_file

+    @property
+    def fp16_matmul_file(self):
+        return self._fp16_matmul_file
+
     @property
     def transformer_option(self):
         return self._transformer_option
@@ -481,6 +487,10 @@ class ConverterOption(object):
     def quantize_range_file(self, quantize_range_file):
         self._quantize_range_file = quantize_range_file

+    @fp16_matmul_file.setter
+    def fp16_matmul_file(self, fp16_matmul_file):
+        self._fp16_matmul_file = fp16_matmul_file
+
     @change_concat_ranges.setter
     def change_concat_ranges(self, change_concat_ranges):
         self._change_concat_ranges = change_concat_ranges
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 536a88e1..7e032735 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -106,6 +106,8 @@ class Transformer(base_converter.ConverterInterface):
                 self.transform_channel_shuffle,
             TransformerRule.QUANTIZE_SPECIFIC_OPS_ONLY:
                 self.quantize_specific_ops_only,
+            TransformerRule.FP16_MATMUL_WEIGHT:
+                self.fp16_matmul_weight,
         }

         self._option = option
@@ -1853,6 +1855,69 @@ class Transformer(base_converter.ConverterInterface):
                        op.quantize_info[i].minval,
                        op.quantize_info[i].maxval))

+    def fp16_matmul_weight(self):
+        if self._option.device != DeviceType.CPU.value:
+            return
+
+        if self._option.fp16_matmul_file:
+            with open(self._option.fp16_matmul_file) as f:
+                lines = f.readlines()
+                specific_matmul_names = [x.strip() for x in lines]
+            print('Convert matmul weights to fp16 for:')
+            for name in specific_matmul_names:
+                print('\t%s' % name)
+        else:
+            specific_matmul_names = None
+            print('Convert matmul weights to fp16 for specific matmul: activation + weights')  # noqa
+
+        for op in self._model.op:
+            if op.type != MaceOp.MatMul.name:
+                continue
+            if specific_matmul_names is not None and str(op.name) not in specific_matmul_names:  # noqa
+                continue
+            if specific_matmul_names is None and op.input[0] not in self._consts and op.input[1] not in self._consts:  # noqa
+                continue
+            if specific_matmul_names is None and op.input[0] in self._consts and op.input[1] in self._consts:  # noqa
+                continue
+
+            # Matmul fp16 Op only support fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] now!  # noqa
+
+            transpose_a_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_a_str)  # noqa
+            transpose_b_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str)  # noqa
+            transpose_a = transpose_a_arg is not None and transpose_a_arg.i == 1  # noqa
+            transpose_b = transpose_b_arg is not None and transpose_b_arg.i == 1  # noqa
+
+            left_tensor = op.input[0]
+            right_tensor = op.input[1]
+            left_shape = self.get_tensor_shape(left_tensor)
+            right_shape = self.get_tensor_shape(right_tensor)
+
+            height = left_shape[-1] if transpose_a else left_shape[-2]
+            width = right_shape[-2] if transpose_b else right_shape[-1]
+            batch = reduce(lambda x, y: x * y, left_shape[: -2], 1)
+
+            if batch != 1:
+                continue
+
+            if left_tensor in self._consts:
+                if width != 1 or transpose_a:
+                    continue
+                const_tensor = self._consts[left_tensor]
+            else:
+                if height != 1 or not transpose_b:
+                    continue
+                const_tensor = self._consts[right_tensor]
+
+            print('Convert Matmul Weights to fp16: %s' % op.name)
+
+            const_tensor.data_type = mace_pb2.DT_FLOAT16
+            data_type_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_op_data_type_str)  # noqa
+            if data_type_arg is None:
+                data_type_arg = op.arg.add()
+                data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT16
+            op.output_type.extend([mace_pb2.DT_FLOAT])
+
     def add_opencl_informations(self):
         print("Add OpenCL informations")
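Note (not part of the patch): fp16_matmul_weight runs in two modes. With fp16_matmul_file set, only the MatMul ops named in the file are converted; without it, the pass auto-selects MatMuls whose inputs are exactly one constant and one activation, with batch 1 and a shape the CPU float16_t MatMulOp accepts. Either way, only the constant side is re-tagged DT_FLOAT16 while the op's output stays DT_FLOAT. The list file is plain text, one op name per line, e.g. (names hypothetical):

    embedding/MatMul
    logits/MatMul
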
diff --git a/mace/python/tools/model_saver.py b/mace/python/tools/model_saver.py
index c062dfad..270ac8e4 100644
--- a/mace/python/tools/model_saver.py
+++ b/mace/python/tools/model_saver.py
@@ -117,6 +117,9 @@ class TensorInfo:
         elif tensor.data_type == mace_pb2.DT_UINT8:
             self.data = bytearray(
                 np.array(tensor.int32_data).astype(np.uint8).tolist())
+        elif tensor.data_type == mace_pb2.DT_FLOAT16:
+            self.data = bytearray(
+                np.array(tensor.float_data).astype(np.float16).tobytes())
         else:
             raise Exception('Tensor data type %s not supported' %
                             tensor.data_type)
@@ -139,7 +142,8 @@ def update_tensor_infos(net_def, data_type):
             offset += padding

         if tensor.data_type == mace_pb2.DT_FLOAT \
-                or tensor.data_type == mace_pb2.DT_HALF:
+                or tensor.data_type == mace_pb2.DT_HALF \
+                or tensor.data_type == mace_pb2.DT_FLOAT16:
             tensor.data_size = len(tensor.float_data)
         elif tensor.data_type == mace_pb2.DT_INT32:
             tensor.data_size = len(tensor.int32_data)
@@ -178,7 +182,8 @@ def save_model_data(net_def, model_tag, output_dir):
 def save_model_to_proto(net_def, model_tag, output_dir):
     for tensor in net_def.tensors:
         if tensor.data_type == mace_pb2.DT_FLOAT \
-                or tensor.data_type == mace_pb2.DT_HALF:
+                or tensor.data_type == mace_pb2.DT_HALF \
+                or tensor.data_type == mace_pb2.DT_FLOAT16:
             del tensor.float_data[:]
         elif tensor.data_type == mace_pb2.DT_INT32:
             del tensor.int32_data[:]
diff --git a/tools/common.py b/tools/common.py
index 0884319f..a5724ae2 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -416,6 +416,7 @@ class YAMLKeyword(object):
     docker_image_tag = 'docker_image_tag'
     dockerfile_path = 'dockerfile_path'
     dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
+    fp16_matmul_file = 'fp16_matmul_file'


 ################################
diff --git a/tools/converter.py b/tools/converter.py
index a5df88a9..c2a946ab 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -745,6 +745,7 @@ def convert_model(configs, cl_mem_type):
             model_config[YAMLKeyword.winograd],
             model_config[YAMLKeyword.quantize],
             quantize_range_file_path,
+            model_config.get(YAMLKeyword.fp16_matmul_file, ""),
             model_config[YAMLKeyword.change_concat_ranges],
             model_config[YAMLKeyword.obfuscate],
             configs[YAMLKeyword.model_graph_format],
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 3b98c7a6..5c14336f 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -501,6 +501,7 @@ def gen_model_code(model_codegen_dir,
                    winograd,
                    quantize,
                    quantize_range_file,
+                   fp16_matmul_file,
                    change_concat_ranges,
                    obfuscate,
                    model_graph_format,
@@ -539,6 +540,7 @@ def gen_model_code(model_codegen_dir,
         "--winograd=%s" % winograd,
         "--quantize=%s" % quantize,
         "--quantize_range_file=%s" % quantize_range_file,
+        "--fp16_matmul_file=%s" % fp16_matmul_file,
         "--change_concat_ranges=%s" % change_concat_ranges,
         "--obfuscate=%s" % obfuscate,
         "--output_dir=%s" % model_codegen_dir,
--
GitLab
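Note (not part of the patch): end to end, the feature is driven from the deployment YAML. Adding fp16_matmul_file under a model entry (model name and path below are hypothetical) routes the path through tools/converter.py and tools/sh_commands.py to the model converter's --fp16_matmul_file flag, which enables the FP16_MATMUL_WEIGHT transform above; the named weights are then serialized as np.float16 bytes by model_saver.py:

    models:
      my_model:
        ...
        fp16_matmul_file: /path/to/fp16_matmul_names.txt

At runtime the converted MatMul dispatches to the NEON FP16Gemv kernel on Android CPU builds.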