diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel
index 628a156d03041157d6639912c31505a6feda57e8..7497e5b4687b0bab58d5b0c8dce918db431609e3 100644
--- a/mace/core/BUILD.bazel
+++ b/mace/core/BUILD.bazel
@@ -71,7 +71,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]),
     linkopts = ["-ldl"],
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index dc6c8f62d09cf52d2149c18e0ff9239856cbc2ac..f4bd2a3748a0ba30bdd34756e547932e0e4bd3d5 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -46,28 +46,28 @@ namespace mace {
    break;                                                         \
  }

-#ifdef MACE_ENABLE_OPENCL
-#define MACE_TYPE_ENUM_SWITCH(                                     \
-    TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS) \
-  switch (TYPE_ENUM) {                                             \
-    MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))                   \
-    MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS))                  \
-    MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS))                \
-    MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS))                \
-    case DT_INVALID:                                               \
-      INVALID_STATEMENTS;                                          \
-      break;                                                       \
-    default:                                                       \
-      DEFAULT_STATEMENTS;                                          \
-      break;                                                       \
-  }
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)                \
+  MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS))
+#else
+#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
+#endif
+
+#ifdef MACE_ENABLE_OPENCL
+#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)              \
+  MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
 #else
+#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)
+#endif
+
 #define MACE_TYPE_ENUM_SWITCH(                                     \
     TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS) \
   switch (TYPE_ENUM) {                                             \
     MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS))                  \
     MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS))                \
     MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS))                \
+    MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)                    \
+    MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)                  \
     case DT_INVALID:                                               \
       INVALID_STATEMENTS;                                          \
       break;                                                       \
@@ -75,7 +75,6 @@ namespace mace {
       DEFAULT_STATEMENTS;                                          \
       break;                                                       \
   }
-#endif

 // `TYPE_ENUM` will be converted to template `T` in `STATEMENTS`
 #define MACE_RUN_WITH_TYPE_ENUM(TYPE_ENUM, STATEMENTS)             \
diff --git a/mace/core/types.cc b/mace/core/types.cc
index 9b09d1d073fbbb6401c40926c82f793f7e9022c2..3e4225391d32c65c94e73673510de03a6b0750c7 100644
--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -47,6 +47,10 @@ size_t GetEnumTypeSize(const DataType dt) {
       return sizeof(float);
     case DT_HALF:
       return sizeof(half);
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+    case DT_FLOAT16:
+      return sizeof(float16_t);
+#endif
     case DT_UINT8:
       return sizeof(uint8_t);
     case DT_INT32:
diff --git a/mace/core/types.h b/mace/core/types.h
index 8dde57fd48d4bfd29405b28bfdcbc05a67d0c897..f2fbad3099adb500ab763de6b93dfed59d33bc76 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -17,6 +17,9 @@
 #include <cstdint>
 #include <string>

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#include <arm_neon.h>
+#endif
 #include "mace/proto/mace.pb.h"
 #include "include/half.hpp"
@@ -51,6 +54,9 @@ struct EnumToDataType;
 };

 MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF);
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16);
+#endif
 MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
 MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
 MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel
index 5d2d2cb26668c6ac304c38fbbe14c8e95da96303..1d651c1b3274bab30a7adf30f9f63be2c57dca7f 100644
--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -38,7 +38,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -61,7 +61,20 @@ cc_library(
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ],
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon-fp16",
+        "-mfloat-abi=softfp",
+    ]) + if_opencl_enabled([
+        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
+    ]) + if_hexagon_enabled([
+        "-DMACE_ENABLE_HEXAGON",
+    ]),
     deps = [
         "//mace/core",
         "@gtest",
@@ -89,7 +102,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -111,6 +124,7 @@ cc_library(
     srcs = glob(
         [
             "arm/fp32/*.cc",
+            "arm/fp16/gemv.h",
         ],
         exclude = [
             "arm/fp32/*_test.cc",
@@ -141,7 +155,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -186,7 +200,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -221,7 +235,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -256,7 +270,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -324,7 +338,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -368,7 +382,7 @@ cc_library(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -398,7 +412,7 @@ cc_library(
     ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
     ]) + if_android_armv7([
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
@@ -439,7 +453,7 @@ cc_test(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
@@ -476,7 +490,7 @@ cc_test(
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
     ]) + if_android_armv7([
-        "-mfpu=neon",
+        "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
diff --git a/mace/ops/arm/fp16/gemv.h b/mace/ops/arm/fp16/gemv.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e7e2a3c91438303f6724b740f16739db2ed5ebc
--- /dev/null
+++ b/mace/ops/arm/fp16/gemv.h
@@ -0,0 +1,120 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP16_GEMV_H_
+#define MACE_OPS_ARM_FP16_GEMV_H_
+
+#include "mace/core/types.h"
+
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#include <arm_neon.h>
+#endif
+
+#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) && defined(__ANDROID__)
+#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
+#endif
+
+namespace mace {
+namespace ops {
+
+template<typename INPUT_TYPE_LEFT,
+         typename INPUT_TYPE_RIGHT,
+         typename OUTPUT_TYPE>
+void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr,
+              const INPUT_TYPE_RIGHT *v_ptr,
+              const index_t height,
+              const index_t width,
+              OUTPUT_TYPE *result);
+
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+template<>
+void FP16Gemv<float16_t, float, float>(const float16_t *m_ptr,
+                                       const float *v_ptr,
+                                       const index_t height,
+                                       const index_t width,
+                                       float *out_ptr) {
+#pragma omp parallel for
+  for (index_t h = 0; h < height; ++h) {
+    const float16_t *m_ptr0 = m_ptr + h * width;
+    const float *v_ptr0 = v_ptr;
+    float *out_ptr0 = out_ptr + h;
+    float sum0 = 0;
+
+    float32x4_t vm0, vm1, vm2, vm3;
+    float32x4_t vv0, vv1, vv2, vv3;
+    float32x4_t vsum0 = vdupq_n_f32(0.f);
+    float32x4_t vsum1 = vdupq_n_f32(0.f);
+    float32x4_t vsum2 = vdupq_n_f32(0.f);
+    float32x4_t vsum3 = vdupq_n_f32(0.f);
+
+    index_t w;
+    for (w = 0; w + 15 < width; w += 16) {
+      vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
+      vv0 = vld1q_f32(v_ptr0);
+      vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
+      vv1 = vld1q_f32(v_ptr0 + 4);
+      vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8));
+      vv2 = vld1q_f32(v_ptr0 + 8);
+      vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12));
+      vv3 = vld1q_f32(v_ptr0 + 12);
+
+      vsum0 = vmlaq_f32(vsum0, vm0, vv0);
+      vsum1 = vmlaq_f32(vsum1, vm1, vv1);
+      vsum2 = vmlaq_f32(vsum2, vm2, vv2);
+      vsum3 = vmlaq_f32(vsum3, vm3, vv3);
+
+      m_ptr0 += 16;
+      v_ptr0 += 16;
+    }
+
+    for (; w + 7 < width; w += 8) {
+      vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
+      vv0 = vld1q_f32(v_ptr0);
+      vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
+      vv1 = vld1q_f32(v_ptr0 + 4);
+
+      vsum0 = vmlaq_f32(vsum0, vm0, vv0);
+      vsum1 = vmlaq_f32(vsum1, vm1, vv1);
+
+      m_ptr0 += 8;
+      v_ptr0 += 8;
+    }
+
+    for (; w + 3 < width; w += 4) {
+      vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
+      vv0 = vld1q_f32(v_ptr0);
+      vsum0 = vmlaq_f32(vsum0, vm0, vv0);
+
+      m_ptr0 += 4;
+      v_ptr0 += 4;
+    }
+    vsum0 += vsum1;
+    vsum2 += vsum3;
+    vsum0 += vsum2;
+    sum0 = vaddvq_f32(vsum0);
+
+    for (; w < width; ++w) {
+      sum0 += m_ptr0[0] * v_ptr0[0];
+      m_ptr0++;
+      v_ptr0++;
+    }
+    *out_ptr0++ = sum0;
+  }
+}
+#endif
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP16_GEMV_H_
diff --git a/mace/ops/cast.cc b/mace/ops/cast.cc
index 9604bb90edf88a5d057e17f967a6e23447831d35..940959a93f0333033e26a0825f28cf0f735f1bb3 100644
--- a/mace/ops/cast.cc
+++ b/mace/ops/cast.cc
@@ -14,6 +14,10 @@

 #include "mace/core/operator.h"

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#include <arm_neon.h>
+#endif
+
 namespace mace {
 namespace ops {

@@ -55,6 +59,10 @@ void RegisterCast(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
   MACE_REGISTER_OP(op_registry, "Cast", CastOp,
                    DeviceType::CPU, int32_t);
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  MACE_REGISTER_OP(op_registry, "Cast", CastOp,
+                   DeviceType::CPU, float16_t);
+#endif
 }

 }  // namespace ops
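
Note on the kernel above: FP16Gemv computes a plain row-wise dot product, result[h] = sum over w of m[h][w] * v[w]; the NEON path only batches the half-to-float widening (vcvt_f32_f16) and the multiply-accumulates (vmlaq_f32) four lanes at a time into four independent accumulators. A minimal scalar model of the same arithmetic, in portable C++ so it can be checked off-device (FP16GemvRef and the sizes in main are illustrative, not MACE APIs):

// Scalar reference for the row-by-row gemv that FP16Gemv vectorizes.
// float stands in for float16_t here, so this models the arithmetic only,
// not the half-precision storage.
#include <cstdio>
#include <vector>

// result[h] = dot(m[h, 0..width), v[0..width)) for each row h.
void FP16GemvRef(const float *m_ptr, const float *v_ptr,
                 int height, int width, float *result) {
  for (int h = 0; h < height; ++h) {
    float sum = 0.f;
    for (int w = 0; w < width; ++w) {
      sum += m_ptr[h * width + w] * v_ptr[w];
    }
    result[h] = sum;
  }
}

int main() {
  const int height = 3, width = 5;
  std::vector<float> m(height * width, 0.5f), v(width, 2.0f), out(height);
  FP16GemvRef(m.data(), v.data(), height, width, out.data());
  for (float o : out) std::printf("%f\n", o);  // each row sums to 5.0
  return 0;
}

The four separate vsum accumulators in the NEON version break the dependency chain between consecutive vmlaq_f32 ops; the final scalar loop mirrors the kernel's `for (; w < width; ++w)` cleanup for widths that are not a multiple of four.
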
diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc
index 0c0551cd396af2279f47b245c371df4989143a98..2114290b66ff8d2d256bc7e9dcce02b298331112 100644
--- a/mace/ops/gather.cc
+++ b/mace/ops/gather.cc
@@ -93,6 +93,10 @@ void RegisterGather(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
                    DeviceType::CPU, uint8_t);
 #endif  // MACE_ENABLE_QUANTIZE
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
+                   DeviceType::CPU, float16_t);
+#endif
 }

 }  // namespace ops
diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc
index b662ce2ee97859051d1c34553d1519dc5939c99f..592d25ae724ed8a93191049a31097a4e95c91d2a 100644
--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -44,6 +44,9 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/matmul.h"
 #endif  // MACE_ENABLE_OPENCL
+#ifdef MACE_ENABLE_NEON
+#include "mace/ops/arm/fp16/gemv.h"
+#endif

 namespace mace {
 namespace ops {

@@ -510,6 +513,86 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
 };
 #endif  // MACE_ENABLE_OPENCL

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+template <>
+class MatMulOp<DeviceType::CPU, float16_t> : public MatMulOpBase {
+ public:
+  explicit MatMulOp(OpConstructContext *context)
+      : MatMulOpBase(context) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_CHECK_NOTNULL(context);
+    Validate();
+    const Tensor *A = this->Input(INPUT_A);
+    const Tensor *B = this->Input(INPUT_B);
+    Tensor *C = this->Output(OUTPUT);
+
+    index_t batch;
+    index_t height;
+    index_t K;
+    index_t width;
+
+    index_t rank = A->dim_size();
+    height = A->dim(rank - 2);
+    K = A->dim(rank - 1);
+    if (transpose_a_) {
+      std::swap(height, K);
+    }
+    if (transpose_b_) {
+      width = B->dim(rank - 2);
+    } else {
+      width = B->dim(rank - 1);
+    }
+    batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
+                            std::multiplies<index_t>());
+
+    std::vector<index_t> c_shape = A->shape();
+    c_shape[rank - 2] = height;
+    c_shape[rank - 1] = width;
+
+    MACE_RETURN_IF_ERROR(C->Resize(c_shape));
+
+    Tensor::MappingGuard guarda(A);
+    Tensor::MappingGuard guardb(B);
+    Tensor::MappingGuard guardc(C);
+    auto *c_ptr_base = C->mutable_data<float>();
+
+    MACE_CHECK(batch == 1, "matmul fp16 only supports batch == 1 for now");
+
+    if (width == 1 && !transpose_a_ && A->dtype() == DT_FLOAT16 &&
+        B->dtype() == DT_FLOAT) {
+      auto *a_ptr_base = A->data<float16_t>();
+      auto *b_ptr_base = B->data<float>();
+      FP16Gemv(a_ptr_base,
+               b_ptr_base,
+               height,
+               K,
+               c_ptr_base);
+      return MaceStatus::MACE_SUCCESS;
+    } else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT &&
+               B->dtype() == DT_FLOAT16) {
+      auto *b_ptr_base = B->data<float16_t>();
+      auto *a_ptr_base = A->data<float>();
+      FP16Gemv(b_ptr_base,
+               a_ptr_base,
+               width,
+               K,
+               c_ptr_base);
+      return MaceStatus::MACE_SUCCESS;
+    } else {
+      LOG(INFO) << "MatMul fp16 gemv args: " << height << " " << width << " "
+                << transpose_a_ << " " << transpose_b_;
+      LOG(FATAL) << "MatMul fp16 Op only supports fp32[1,k]·fp16[w,k]T or"
+                    " fp16[w,k]·fp32[k,1] for now!";
+      return MaceStatus::MACE_INVALID_ARGS;
+    }
+  }
+
+ private:
+};
+#endif  // MACE_ENABLE_NEON
+
+
 void RegisterMatMul(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                    DeviceType::CPU, float);
@@ -518,6 +601,19 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                    DeviceType::CPU, uint8_t);
 #endif  // MACE_ENABLE_QUANTIZE
+
+#ifdef MACE_ENABLE_OPENCL
+  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
+                   DeviceType::GPU, float);
+
+  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
+                   DeviceType::GPU, half);
+#endif  // MACE_ENABLE_OPENCL
+
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
+                   DeviceType::CPU, float16_t);
+#endif  // MACE_ENABLE_NEON
 }

 }  // namespace ops
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 2ab46bac2a09c9e3d95cf9980c006fef284d138a..c0d5af05c172cd108286fa3ec1c0f25ee776531a 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -263,8 +263,17 @@ void MatMulBenchmark(
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput<D, T>("A", {batch, height, channels});
-  net.AddRandomInput<D, T>("B", {batch, channels, out_width});
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  if (DataTypeToEnum<T>::value == DT_FLOAT16) {
+    net.AddRandomInput<D, float16_t>("A", {batch, height, channels});
+    net.AddRandomInput<D, float>("B", {batch, channels, out_width});
+  } else {
+#endif
+    net.AddRandomInput<D, T>("A", {batch, height, channels});
+    net.AddRandomInput<D, T>("B", {batch, channels, out_width});
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  }
+#endif
   net.GetTensor("A")->SetIsWeight(true);
   net.GetTensor("B")->SetIsWeight(true);
   if (DataTypeToEnum<T>::value == DT_UINT8) {
@@ -305,8 +314,17 @@ void MatMulTransposeBenchmark(
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput<D, T>("A", {batch, height, channels});
-  net.AddRandomInput<D, T>("B", {batch, out_width, channels});
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  if (DataTypeToEnum<T>::value == DT_FLOAT16) {
+    net.AddRandomInput<D, float>("A", {batch, height, channels});
+    net.AddRandomInput<D, float16_t>("B", {batch, out_width, channels});
+  } else {
+#endif
+    net.AddRandomInput<D, T>("A", {batch, height, channels});
+    net.AddRandomInput<D, T>("B", {batch, out_width, channels});
+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+  }
+#endif
   net.GetTensor("A")->SetIsWeight(true);
   net.GetTensor("B")->SetIsWeight(true);
   if (DataTypeToEnum<T>::value == DT_UINT8) {
@@ -381,10 +399,10 @@ void MatMulTransposeBenchmark(
 #ifdef MACE_ENABLE_QUANTIZE
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)               \
   MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU)
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
 #else
 #define MACE_BM_MATMUL_TRANPOSE(N, H, C, W)               \
-  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU)
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
 #endif

 MACE_BM_MATMUL_OP(1, 30000, 256, 1);
@@ -407,6 +425,21 @@ MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 49);
 MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 961);
 MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 3969);

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+#define MACE_BM_MATMUL_TRANPOSE_FP16(N, H, C, W) \
+  MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);
+
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 30000);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 256);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 2048);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 256);
+
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 30000);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 512);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 2048);
+MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 512);
+#endif  // MACE_ENABLE_NEON
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc
index f88ac39435e328ad2a4ada6b3c41a73558fdb791..d0432bb0b958ae6ee452b976b5c403e4bb4c04ba 100644
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -330,6 +330,69 @@ void QuantOutputInt32(const std::vector<index_t> &batch,
 }
 }  // namespace

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+namespace {
+void FloatOutput16(const std::vector<index_t> &batch,
+                   const index_t rows,
+                   const index_t depth,
+                   const index_t cols,
+                   const bool transpose_lhs,
+                   const bool transpose_rhs,
+                   const bool lhs_batched = true,
+                   const bool rhs_batched = true) {
+  // Construct graph
+  OpsTestNet net;
+
+  index_t lhs_rows = transpose_lhs ? depth : rows;
+  index_t lhs_cols = transpose_lhs ? rows : depth;
+  index_t rhs_rows = transpose_rhs ? cols : depth;
+  index_t rhs_cols = transpose_rhs ? depth : cols;
+  std::vector<index_t> lhs_shape = {lhs_rows, lhs_cols};
+  std::vector<index_t> rhs_shape = {rhs_rows, rhs_cols};
+  if (lhs_batched) {
+    lhs_shape.insert(lhs_shape.begin(), batch.begin(), batch.end());
+  }
+  if (rhs_batched) {
+    rhs_shape.insert(rhs_shape.begin(), batch.begin(), batch.end());
+  }
+  net.AddRandomInput<DeviceType::CPU, float>("A", lhs_shape);
+  net.AddRandomInput<DeviceType::CPU, float>("B", rhs_shape);
+
+  OpDefBuilder("MatMul", "MatMulTest")
+      .Input("A")
+      .AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
+      .Input("B")
+      .AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
+      .Output("Output")
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+
+  OpDefBuilder("Cast", "CastTest")
+      .Input("B")
+      .Output("HalveB")
+      .OutputType({DT_FLOAT16})
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("MatMul", "Float16MatMulTest")
+      .Input("A")
+      .AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
+      .Input("HalveB")
+      .AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
+      .Output("Float16Output")
+      .AddIntArg("T", DT_FLOAT16)
+      .OutputType({DT_FLOAT})
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                             *net.GetTensor("Float16Output"), 0.01);
+}
+}  // namespace
+#endif  // MACE_ENABLE_NEON
+
 TEST_F(MatMulOpTest, QuantOutputUint8) {
   QuantOutputUint8({1}, 64, 128, 32, false, false);
   QuantOutputUint8({1}, 64, 32, 128, false, false);
@@ -381,6 +444,19 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
   QuantOutputInt32({2, 3}, 31, 61, 67, true, true, false, true);
 }

+#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
+TEST_F(MatMulOpTest, FloatOutput16) {
+  FloatOutput16({1}, 1, 512, 30745, false, true, false, false);
+  FloatOutput16({1}, 1, 256, 30000, false, true, false, false);
+  FloatOutput16({1}, 1, 256, 2048, false, true, false, false);
+  FloatOutput16({1}, 1, 2048, 256, false, true, false, false);
+
+  FloatOutput16({1}, 1, 512, 30000, false, true, false, false);
+  FloatOutput16({1}, 1, 512, 512, false, true, false, false);
+  FloatOutput16({1}, 1, 512, 2048, false, true, false, false);
+  FloatOutput16({1}, 1, 2048, 512, false, true, false, false);
+}
+#endif  // MACE_ENABLE_NEON
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index bdc67037c4dd3fc897757dc3d1c95ab0f6e4267d..cc761699c974991a7c6a90ddda35a8e7f229da3b 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -162,7 +162,8 @@ class OpsTestNet {
     std::random_device rd;
     std::mt19937 gen(rd());
     std::normal_distribution<float> nd(0, 1);
-    if (DataTypeToEnum<T>::value == DT_HALF) {
+    if (DataTypeToEnum<T>::value == DT_HALF ||
+        DataTypeToEnum<T>::value == DT_FLOAT16) {
       std::generate(
           input_data, input_data + input->size(),
           [&gen, &nd, positive, truncate, truncate_min, truncate_max] {
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index d3b564fc6a9de2b7b79f9c73df53b3fa9e310788..540b1b1a234d5ba946b0d4e41b04b9c52192c0d1 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -13,6 +13,7 @@ enum DataType {
   DT_UINT8 = 2;
   DT_HALF = 3;
   DT_INT32 = 4;
+  DT_FLOAT16 = 5;
 }

 enum MemoryType {
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index 92c31e4163202f1de3dd5fd2c8bd4257cf4babbb..4302af0753d024de808ed0b9a8b847271396c48a 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -314,6 +314,8 @@ class TransformerRule(Enum):
     TRANSFORM_CHANNEL_SHUFFLE = 38
     UPDATE_DATA_FORMAT = 39
     QUANTIZE_SPECIFIC_OPS_ONLY = 40
+    FP16_MATMUL_WEIGHT = 41
+    FP16_GATHER_WEIGHT = 42


 class ConverterInterface(object):
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 380c0051d764ff23200d1ff5a72cb3a516205ecf..e6cfa534d24514b4c345ea0f9fa76434cb33682e 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -106,6 +106,10 @@ class Transformer(base_converter.ConverterInterface):
                 self.transform_channel_shuffle,
             TransformerRule.QUANTIZE_SPECIFIC_OPS_ONLY:
                 self.quantize_specific_ops_only,
+            TransformerRule.FP16_MATMUL_WEIGHT:
+                self.fp16_matmul_weight,
+            TransformerRule.FP16_GATHER_WEIGHT:
+                self.fp16_gather_weight,
         }

         self._option = option
@@ -1853,6 +1857,102 @@ class Transformer(base_converter.ConverterInterface):
                         op.quantize_info[i].minval,
                         op.quantize_info[i].maxval))

+    def fp16_gather_weight(self):
+        for op in self._model.op:
+            if op.type != MaceOp.Gather.name:
+                continue
+            if op.input[0] not in self._consts:
+                raise KeyError("Not in const tensor: " + str(op.input[0]))
+
+            const_tensor = self._consts[op.input[0]]
+            if const_tensor.data_type == mace_pb2.DT_FLOAT16:
+                print(str(const_tensor.name) + " is already float16")
+                continue
+
+            print("FP16 Embedding Lookup Weights: %s" % const_tensor.name)
+
+            op_outputs = [x for x in op.output]
+            new_gather_name = op.name + '_fp16'
+            new_gather_output_name = new_gather_name + ":0"
+            dehalve_name = op.name
+
+            # fp16 weights
+            const_tensor.data_type = mace_pb2.DT_FLOAT16
+
+            # change gather
+            op.name = new_gather_name
+            op.output[:] = [new_gather_output_name]
+            data_type_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_op_data_type_str)  # noqa
+            if data_type_arg is None:
+                data_type_arg = op.arg.add()
+                data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT16
+
+            # add dehalve
+            dehalve_op = self._model.op.add()
+            dehalve_op.name = dehalve_name
+            dehalve_op.type = MaceOp.Cast.name
+            dehalve_op.input.extend([new_gather_output_name])
+            dehalve_op.output.extend(op_outputs)
+            dehalve_op.output_shape.extend(op.output_shape)
+            dehalve_op.output_type.extend([mace_pb2.DT_FLOAT])
+            data_type_arg = dehalve_op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT16
+
+    def fp16_matmul_weight(self):
+        if self._option.device != DeviceType.CPU.value:
+            return
+
+        print('Convert matmul weights to fp16 for specific matmul: activation + weights')  # noqa
+
+        for op in self._model.op:
+            if op.type != MaceOp.MatMul.name:
+                continue
+            if op.input[0] not in self._consts and op.input[1] not in self._consts:  # noqa
+                continue
+            if op.input[0] in self._consts and op.input[1] in self._consts:
+                continue
+
+            # MatMul fp16 only supports fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] for now  # noqa
+
+            transpose_a_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_a_str)  # noqa
+            transpose_b_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str)  # noqa
+            transpose_a = transpose_a_arg is not None and transpose_a_arg.i == 1  # noqa
+            transpose_b = transpose_b_arg is not None and transpose_b_arg.i == 1  # noqa
+
+            left_tensor = op.input[0]
+            right_tensor = op.input[1]
+            left_shape = self.get_tensor_shape(left_tensor)
+            right_shape = self.get_tensor_shape(right_tensor)
+
+            height = left_shape[-1] if transpose_a else left_shape[-2]
+            width = right_shape[-2] if transpose_b else right_shape[-1]
+            batch = reduce(lambda x, y: x * y, left_shape[:-2], 1)
+
+            if batch != 1:
+                continue
+
+            if left_tensor in self._consts:
+                if width != 1 or transpose_a:
+                    continue
+                const_tensor = self._consts[left_tensor]
+            else:
+                if height != 1 or not transpose_b:
+                    continue
+                const_tensor = self._consts[right_tensor]
+
+            print('Convert MatMul weights to fp16: %s' % op.name)
+
+            const_tensor.data_type = mace_pb2.DT_FLOAT16
+            data_type_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_op_data_type_str)  # noqa
+            if data_type_arg is None:
+                data_type_arg = op.arg.add()
+                data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT16
+            op.output_type.extend([mace_pb2.DT_FLOAT])
+
     def add_opencl_informations(self):
         print("Add OpenCL informations")
@@ -2052,10 +2152,7 @@ class Transformer(base_converter.ConverterInterface):
                 data_type_arg = quantize_op.arg.add()
                 data_type_arg.name = MaceKeyword.mace_non_zero
-                if non_zero:
-                    data_type_arg.i = 1
-                else:
-                    data_type_arg.i = 0
+                data_type_arg.i = 0

                 find_range_arg = quantize_op.arg.add()
                 find_range_arg.name = \
@@ -2064,8 +2161,6 @@ class Transformer(base_converter.ConverterInterface):

                 quantized_inputs_names[-1] = quantize_output_name

-                non_zero = False
-
                 del op.input[:]
                 op.input.extend(quantized_inputs_names)
diff --git a/mace/python/tools/model_saver.py b/mace/python/tools/model_saver.py
index c062dfad864f1f226269419d7d870d191a9b9b4c..270ac8e4896fda3e85538102baa8a7850d829789 100644
--- a/mace/python/tools/model_saver.py
+++ b/mace/python/tools/model_saver.py
@@ -117,6 +117,9 @@ class TensorInfo:
         elif tensor.data_type == mace_pb2.DT_UINT8:
             self.data = bytearray(
                 np.array(tensor.int32_data).astype(np.uint8).tolist())
+        elif tensor.data_type == mace_pb2.DT_FLOAT16:
+            self.data = bytearray(
+                np.array(tensor.float_data).astype(np.float16).tobytes())
         else:
             raise Exception('Tensor data type %s not supported' %
                             tensor.data_type)
@@ -139,7 +142,8 @@ def update_tensor_infos(net_def, data_type):
             offset += padding

         if tensor.data_type == mace_pb2.DT_FLOAT \
-                or tensor.data_type == mace_pb2.DT_HALF:
+                or tensor.data_type == mace_pb2.DT_HALF \
+                or tensor.data_type == mace_pb2.DT_FLOAT16:
             tensor.data_size = len(tensor.float_data)
         elif tensor.data_type == mace_pb2.DT_INT32:
             tensor.data_size = len(tensor.int32_data)
@@ -178,7 +182,8 @@ def save_model_data(net_def, model_tag, output_dir):
 def save_model_to_proto(net_def, model_tag, output_dir):
     for tensor in net_def.tensors:
         if tensor.data_type == mace_pb2.DT_FLOAT \
-                or tensor.data_type == mace_pb2.DT_HALF:
+                or tensor.data_type == mace_pb2.DT_HALF \
+                or tensor.data_type == mace_pb2.DT_FLOAT16:
             del tensor.float_data[:]
         elif tensor.data_type == mace_pb2.DT_INT32:
             del tensor.int32_data[:]
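
Note on the DT_FLOAT16 storage format: model_saver.py serializes these tensors with astype(np.float16).tobytes(), i.e. raw IEEE 754 binary16 values, which the NEON kernel widens at load time with vcvt_f32_f16. A hedged sketch of that decoding in portable C++, assuming the standard binary16 layout (HalfBitsToFloat is illustrative, not part of MACE):

// Decode one IEEE 754 binary16 bit pattern into a float.
// Layout: 1 sign bit, 5 exponent bits (bias 15), 10 fraction bits.
#include <cstdint>
#include <cstdio>
#include <cmath>

float HalfBitsToFloat(uint16_t h) {
  const uint32_t sign = (h >> 15) & 0x1;
  const uint32_t exp = (h >> 10) & 0x1f;
  const uint32_t frac = h & 0x3ff;
  float value;
  if (exp == 0) {            // zero / subnormal: frac * 2^-24
    value = std::ldexp(static_cast<float>(frac), -24);
  } else if (exp == 0x1f) {  // inf / NaN
    value = frac ? NAN : INFINITY;
  } else {                   // normal: (1024 + frac) * 2^(exp - 25)
    value = std::ldexp(static_cast<float>(frac | 0x400),
                       static_cast<int>(exp) - 25);
  }
  return sign ? -value : value;
}

int main() {
  std::printf("%f\n", HalfBitsToFloat(0x3c00));  // 1.0
  std::printf("%f\n", HalfBitsToFloat(0xc000));  // -2.0
  std::printf("%f\n", HalfBitsToFloat(0x3555));  // ~0.333
  return 0;
}

Relative to float32 this narrows the dynamic range (max finite value 65504) and precision, but halves weight storage and memory traffic, which is the trade the fp16_matmul_weight and fp16_gather_weight rules make for large output and embedding layers.
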