diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ddf861c069ee1fad7904967fe993a8e6cbccd02..c284a9ca82ca65f46a272e3ce588f61c571eb2c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF) option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF) option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF) option(MACE_ENABLE_BFLOAT16 "whether to enable bfloat16 support" OFF) +option(MACE_ENABLE_FP16 "whether to enable armv8.2 fp16 support" OFF) option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF) option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF) option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON) @@ -121,6 +122,10 @@ if(MACE_ENABLE_BFLOAT16) add_definitions(-DMACE_ENABLE_BFLOAT16) endif(MACE_ENABLE_BFLOAT16) +if(MACE_ENABLE_FP16) + add_definitions(-DMACE_ENABLE_FP16) +endif(MACE_ENABLE_FP16) + if(MACE_ENABLE_OBFUSCATE) add_definitions(-DMACE_OBFUSCATE_LITERALS) endif(MACE_ENABLE_OBFUSCATE) diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 8d8c50dfe07bbe69bf52505a5e481ce53e79aa4b..d05a1921959cdc7727b135c7137a2022c216e4ab 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -85,7 +85,7 @@ in one deployment file. * - runtime - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32. + - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32, fp16_fp16] for CPU, default is fp16_fp32. * - input_data_types - [optional] The input data type for specific op(eg. 
gather), which can be [int32, float32], default to float32. * - input_data_formats @@ -584,9 +584,10 @@ Therefore, the default storage type for a regular model in MACE is half. However if the model is very sensitive to accuracy, storage type can be changed to float. In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``, -for CPU it can also be changed to ``bf16_fp32``. +for CPU it can also be changed to ``bf16_fp32`` and ``fp16_fp16``(``fp16_fp16`` can only be used on armv8.2 or higher version). -For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float. +For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float, +and ``fp16_fp16`` means that the weights are saved in half and actual inference is in half. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. diff --git a/docs/user_guide/advanced_usage_cmake.rst b/docs/user_guide/advanced_usage_cmake.rst index a3d2d489bf77437b4bd1800f2c773d995e3b9afc..62e5f555ce3a007bac1c29e0c44ffaaf103ea86d 100644 --- a/docs/user_guide/advanced_usage_cmake.rst +++ b/docs/user_guide/advanced_usage_cmake.rst @@ -63,7 +63,7 @@ There are many advanced options supported. * - runtime - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32. + - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32, fp16_fp16] for CPU, default is fp16_fp32. 
* - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. * - input_data_formats @@ -439,9 +439,12 @@ Therefore, the default storage type for a regular model in MACE is half. However if the model is very sensitive to accuracy, storage type can be changed to float. In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``, -for CPU it can also be changed to ``bf16_fp32``. +for CPU it can also be changed to ``bf16_fp32`` and ``fp16_fp16``(``fp16_fp16`` can only be used on armv8.2 or higher version). + +For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float, +while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float, +and ``fp16_fp16`` means that the weights are saved in half and actual inference is in half. -For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. 
diff --git a/mace/BUILD.bazel b/mace/BUILD.bazel index 3dbccfa5db560e6a21d80263ac7c9938dfe29160..7dcbcd65ed1685b5416612198e5ebcb8aec6bd9c 100644 --- a/mace/BUILD.bazel +++ b/mace/BUILD.bazel @@ -132,6 +132,14 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "fp16_enabled", + define_values = { + "fp16": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "rpcmem_enabled", define_values = { diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel index f418e8143489a323747bf2a33f938bd0fe6df52d..24b27b76151762a543e891383708f94b98d08621 100644 --- a/mace/core/BUILD.bazel +++ b/mace/core/BUILD.bazel @@ -10,6 +10,7 @@ load( "if_android_armv7", "if_apu_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_hexagon_enabled", "if_hexagon_or_hta_enabled", "if_hta_enabled", @@ -86,6 +87,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/mace/core/fp16.h b/mace/core/fp16.h new file mode 100644 index 0000000000000000000000000000000000000000..6ab4a5d6ccb45c1732426fa761d50364a9cb9350 --- /dev/null +++ b/mace/core/fp16.h @@ -0,0 +1,97 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_CORE_FP16_H_ +#define MACE_CORE_FP16_H_ + +#ifdef MACE_ENABLE_FP16 + +#include <arm_neon.h> +#include <algorithm> +#include <cmath> +#include <iostream> + + +namespace std { +inline float fabs(const float16_t &value) { + return fabs(static_cast<float>(value)); +} + +inline float abs(const float16_t &value) { + return abs(static_cast<float>(value)); +} + +inline float sqrt(const float16_t &value) { + return sqrt(static_cast<float>(value)); +} + +inline float log(const float16_t &value) { + return log(static_cast<float>(value)); +} + +inline float tanh(const float16_t &value) { + return tanh(static_cast<float>(value)); +} + +inline float exp(const float16_t &value) { + return exp(static_cast<float>(value)); +} + +inline int ceil(const float16_t &value) { + return ceil(static_cast<float>(value)); +} + +inline int floor(const float16_t &value) { + return floor(static_cast<float>(value)); +} + +inline float max(const float16_t &a, const float &b) { + return max(static_cast<float>(a), b); +} + +inline float max(const float &a, const float16_t &b) { + return max(a, static_cast<float>(b)); +} + +inline float min(const float16_t &a, const float &b) { + return min(static_cast<float>(a), b); +} + +inline float min(const float &a, const float16_t &b) { + return min(a, static_cast<float>(b)); +} + +inline float pow(const float16_t &a, const float16_t &b) { + return pow(static_cast<float>(a), static_cast<float>(b)); +} + +inline float pow(const float16_t &a, const float &b) { + return pow(static_cast<float>(a), b); +} + +inline float pow(const float &a, const float16_t &b) { + return pow(a, static_cast<float>(b)); +} + +inline ostream &operator<<(ostream &ss, // NOLINT + const float16_t &value) { + return ss << static_cast<float>(value); +} + +} // namespace std + + +#endif // MACE_ENABLE_FP16 + +#endif // MACE_CORE_FP16_H_ diff --git a/mace/core/registry/op_delegator_registry.h b/mace/core/registry/op_delegator_registry.h index 0f72c78ddca2a52bd0c7b48ff4f58b776ab36e3f..0fdc5de77d7f080d3e5a2dac4ba8f98bf73ced6c 100644 --- a/mace/core/registry/op_delegator_registry.h +++ b/mace/core/registry/op_delegator_registry.h @@ -22,6 
+22,7 @@ #include #include "mace/core/bfloat16.h" +#include "mace/core/fp16.h" #include "mace/core/ops/op_delegator.h" #include "mace/core/types.h" #include "mace/proto/mace.pb.h" @@ -101,6 +102,15 @@ class OpDelegatorRegistry { #endif // MACE_ENABLE_BFLOAT16 #endif // MACE_REGISTER_BF16_DELEGATOR +#ifndef MACE_REGISTER_FP16_DELEGATOR +#ifdef MACE_ENABLE_FP16 +#define MACE_REGISTER_FP16_DELEGATOR(registry, class_name, param_name, key) \ + MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) +#else +#define MACE_REGISTER_FP16_DELEGATOR(registry, class_name, param_name, key) +#endif // MACE_ENABLE_FP16 +#endif // MACE_REGISTER_FP16_DELEGATOR + #ifndef MACE_DEFINE_DELEGATOR_CREATOR #define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \ static std::unique_ptr Create( \ diff --git a/mace/core/registry/ops_registry.h b/mace/core/registry/ops_registry.h index fff4d6e14b5f55c7b378cd1cb9ee47c88812bae1..5a534c34c952ef6dca1546d32ba803d4e3b9c998 100644 --- a/mace/core/registry/ops_registry.h +++ b/mace/core/registry/ops_registry.h @@ -23,6 +23,7 @@ #include #include "mace/core/bfloat16.h" +#include "mace/core/fp16.h" #include "mace/core/types.h" #include "mace/core/ops/operator.h" #include "mace/core/ops/op_condition_builder.h" @@ -102,6 +103,27 @@ class OpRegistry { #endif // MACE_ENABLE_BFLOAT16 #endif // MACE_REGISTER_BF16_OP_BY_CLASS +#ifndef MACE_REGISTER_FP16_OP +#ifdef MACE_ENABLE_FP16 +#define MACE_REGISTER_FP16_OP(op_registry, op_type, class_name, device) \ + MACE_REGISTER_OP(op_registry, op_type, class_name, device, float16_t) +#else +#define MACE_REGISTER_FP16_OP(op_registry, op_type, class_name, device) +#endif // MACE_ENABLE_FP16 +#endif // MACE_REGISTER_FP16_OP + +#ifndef MACE_REGISTER_FP16_OP_BY_CLASS +#ifdef MACE_ENABLE_FP16 +#define MACE_REGISTER_FP16_OP_BY_CLASS(op_registry, op_type, \ + class_name, device) \ + MACE_REGISTER_OP_BY_CLASS(op_registry, op_type, \ + class_name, device, float16_t) +#else +#define MACE_REGISTER_FP16_OP_BY_CLASS(op_registry, 
op_type, \ + class_name, device) +#endif // MACE_ENABLE_FP16 +#endif // MACE_REGISTER_FP16_OP_BY_CLASS + #ifdef MACE_ENABLE_OPENCL #define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \ op_registry->Register( \ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 1340f188f6088e0e0f816ac7b42a73d998d7ae6f..d5ba530be41c402d21f91ca1e8ea8b371693af88 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -46,7 +46,8 @@ namespace mace { break; \ } -#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) +#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) \ + || defined(MACE_ENABLE_FP16) #define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \ MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS)) #else @@ -60,6 +61,13 @@ namespace mace { #define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +#define MACE_TYPE_ENUM_SWITCH_CASE_FP16(STATEMENTS) \ + MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS)) +#else +#define MACE_TYPE_ENUM_SWITCH_CASE_FP16(STATEMENTS) +#endif // MACE_ENABLE_FP16 + #if MACE_ENABLE_OPENCL #define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \ MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS)) diff --git a/mace/core/types.cc b/mace/core/types.cc index 1decd9054110c433d1eea475488eac5f49aa52f9..1a33cabc2c347d4514b078c35a093b43fcc35bf0 100644 --- a/mace/core/types.cc +++ b/mace/core/types.cc @@ -26,6 +26,7 @@ bool DataTypeCanUseMemcpy(DataType dt) { case DT_UINT8: case DT_INT32: case DT_BFLOAT16: + case DT_FLOAT16: return true; default: return false; @@ -38,7 +39,8 @@ std::string DataTypeToString(const DataType dt) { {DT_HALF, "DT_HALF"}, {DT_UINT8, "DT_UINT8"}, {DT_INT32, "DT_INT32"}, - {DT_BFLOAT16, "DT_BFLOAT16"}}; + {DT_BFLOAT16, "DT_BFLOAT16"}, + {DT_FLOAT16, "DT_FLOAT16"}}; MACE_CHECK(dt != DT_INVALID, "Not support Invalid data type"); return dtype_string_map[dt]; } @@ -49,7 +51,8 @@ size_t GetEnumTypeSize(const DataType dt) { return sizeof(float); case DT_HALF: return sizeof(half); 
-#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) +#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) || \ + defined(MACE_ENABLE_FP16) case DT_FLOAT16: return sizeof(float16_t); #endif diff --git a/mace/core/types.h b/mace/core/types.h index 4871c66b87ca6a92b812a50d988f0f8b6e0aa07b..d83e257112fd203a8bffc4ac42da4ab70731454c 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -55,7 +55,8 @@ struct EnumToDataType; }; MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF); -#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) +#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) \ + || defined(MACE_ENABLE_FP16) MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16); #endif #ifdef MACE_ENABLE_BFLOAT16 diff --git a/mace/libmace/BUILD.bazel b/mace/libmace/BUILD.bazel index f7935811cbb64234737c997a9b8b1ef09ffe9a3e..8ad3222bd77cac6878642467532712e6387dcfbe 100644 --- a/mace/libmace/BUILD.bazel +++ b/mace/libmace/BUILD.bazel @@ -13,6 +13,7 @@ load( "if_android_armv7", "if_apu_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_darwin", "if_hexagon_enabled", "if_hta_enabled", @@ -44,6 +45,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 3085b6556de96c58eea100f8daa097420e366f9c..77e1daa7453388b6b6b09872e600c9295d6093d3 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -846,9 +846,8 @@ MaceStatus MaceEngine::Impl::TransposeInput( } else { LOG(FATAL) << "Invalid net data type: " << net_data_type_; } -#ifdef MACE_ENABLE_BFLOAT16 // todo(lichao): add float16 macro - } else if (input_dt == DataType::DT_FLOAT16 || - input_dt == DataType::DT_BFLOAT16) { +#ifdef MACE_ENABLE_BFLOAT16 + } else if (input_dt == DataType::DT_BFLOAT16) { auto *input_data = input_tensor->mutable_data(); return 
ops::Transpose(thread_pool_.get(), input.second.data().get(), @@ -856,6 +855,16 @@ MaceStatus MaceEngine::Impl::TransposeInput( dst_dims, input_data); #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_FLOAT16) { + auto *input_data = input_tensor->mutable_data(); + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); +#endif // MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); return ops::Transpose(thread_pool_.get(), @@ -882,15 +891,23 @@ MaceStatus MaceEngine::Impl::TransposeInput( } else { LOG(FATAL) << "Invalid net data type: " << net_data_type_; } -#ifdef MACE_ENABLE_BFLOAT16 // todo(lichao): add float16 macro - } else if (input_dt == DataType::DT_FLOAT16 || - input_dt == DataType::DT_BFLOAT16) { +#ifdef MACE_ENABLE_BFLOAT16 + } else if (input_dt == DataType::DT_BFLOAT16) { auto input_data = input_tensor->mutable_data(); const float *data = input.second.data().get(); for (index_t i = 0; i < input_tensor->size(); ++i) { input_data[i] = data[i]; } #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_FLOAT16) { + auto input_data = input_tensor->mutable_data(); + const float *data = input.second.data().get(); + for (index_t i = 0; i < input_tensor->size(); ++i) { + input_data[i] = data[i]; + } +#endif // MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); memcpy(input_data, input.second.data().get(), @@ -963,6 +980,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput( dst_dims, output->second.data().get()); #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (output_dt == DataType::DT_FLOAT16) { + auto output_data = output_tensor->data(); + return ops::Transpose(thread_pool_.get(), + output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); +#endif // 
MACE_ENABLE_FP16 } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; return MaceStatus::MACE_INVALID_ARGS; @@ -993,6 +1019,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput( data[i] = output_data[i]; } #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (output_dt == DataType::DT_FLOAT16) { + const auto *output_data = output_tensor->data(); + float *data = output->second.data().get(); + for (index_t i = 0; i < output_tensor->size(); ++i) { + data[i] = output_data[i]; + } +#endif // MACE_ENABLE_FP16 } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; } diff --git a/mace/mace.bzl b/mace/mace.bzl index 007b8b655cc4a3a67b7d7ae4e5d3e9aa4ce5d63c..16a01f8f534438586c05dc21c77a5f4bbb0a8422 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -109,6 +109,12 @@ def if_bfloat16_enabled(a): "//conditions:default": [], }) +def if_fp16_enabled(a): + return select({ + "//mace:fp16_enabled": a, + "//conditions:default": [], + }) + def if_rpcmem_enabled(a, default_value = []): return select({ "//mace:rpcmem_enabled": a, diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 73a3cc20de17fb8ce9d7f3bc91125cee03fd26bd..fe2bede01b10177a0020e8c8540d2de48bd9e170 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -11,6 +11,7 @@ load( "if_android", "if_android_armv7", "if_bfloat16_enabled", + "if_fp16_enabled", "if_hexagon_enabled", "if_neon_enabled", "if_opencl_enabled", @@ -46,6 +47,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -85,6 +89,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -112,6 +119,10 @@ cc_library( [ "arm/bf16/*.cc", ], + 
)) + if_fp16_enabled(glob( + [ + "arm/fp16/*.cc", + ], )), hdrs = glob( [ @@ -126,6 +137,10 @@ cc_library( [ "arm/bf16/*.h", ], + )) + if_fp16_enabled(glob( + [ + "arm/fp16/*.h", + ], )), copts = [ "-Werror", @@ -142,6 +157,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -225,6 +243,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -265,6 +286,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), diff --git a/mace/ops/CMakeLists.txt b/mace/ops/CMakeLists.txt index c2f18a70b76ade280d2222f0d3d31ca3e6e1c7af..9665d948cdc840723d1b82cd8928c1cc65478da0 100644 --- a/mace/ops/CMakeLists.txt +++ b/mace/ops/CMakeLists.txt @@ -14,6 +14,9 @@ file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS file(GLOB OPS_ARM_NEON_BF16_KERNELS_SRCS arm/bf16/*.cc ) +file(GLOB OPS_ARM_NEON_FP16_KERNELS_SRCS + arm/fp16/*.cc +) file(GLOB OPS_ARM_NEON_Q8_KERNELS_SRCS arm/q8/*.cc ) @@ -45,6 +48,9 @@ if(MACE_ENABLE_NEON) if(MACE_ENABLE_BFLOAT16) set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BF16_KERNELS_SRCS}) endif(MACE_ENABLE_BFLOAT16) + if(MACE_ENABLE_FP16) + set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP16_KERNELS_SRCS}) + endif(MACE_ENABLE_FP16) endif(MACE_ENABLE_NEON) if(MACE_ENABLE_OPENCL) diff --git a/mace/ops/arm/base/common_neon.h b/mace/ops/arm/base/common_neon.h index 5b6d7899ec6260e53cb08d188c237f0be32287dc..41c96ac3ec52ad9acdfd1d1c8d2a4221c5dbb995 100644 --- a/mace/ops/arm/base/common_neon.h +++ b/mace/ops/arm/base/common_neon.h @@ -152,6 +152,40 @@ inline void vst1o(float *ptr, 
float32x8_t v) { vst1q_f32(ptr + 4, v.val[1]); } +#if defined(MACE_ENABLE_FP16) + +// load of 4D vector +inline float16x4_t vld1(const float16_t *ptr) { + return vld1_f16(ptr); +} + +// store of 4D vector +inline void vst1(float16_t *ptr, float16x4_t v) { + vst1_f16(ptr, v); +} + +// load of 8D vector +inline float16x8_t vld1q(const float16_t *ptr) { + return vld1q_f16(ptr); +} + +// load of 2 8D vectors and perform de-interleaving +inline float16x8x2_t vld2q(const float16_t *ptr) { + return vld2q_f16(ptr); +} + +// store of 8D vector +inline void vst1q(float16_t *ptr, const float16x8_t v) { + vst1q_f16(ptr, v); +} + +// store of 2 8D vectors and perform interleaving +inline void vst2q(float16_t *ptr, const float16x8x2_t v) { + vst2q_f16(ptr, v); +} + +#endif // MACE_ENABLE_FP16 + #if defined(MACE_ENABLE_BFLOAT16) // load of 2D vector diff --git a/mace/ops/arm/base/conv_2d_1x1.cc b/mace/ops/arm/base/conv_2d_1x1.cc index 40ca524e905c09e120288167bac660bedd7db16b..d53bf1d8247847503782f4d2c875ee1849c7be09 100644 --- a/mace/ops/arm/base/conv_2d_1x1.cc +++ b/mace/ops/arm/base/conv_2d_1x1.cc @@ -101,6 +101,10 @@ void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) { registry, Conv2dK1x1, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, BFloat16, ImplType::NEON, K1x1)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2dK1x1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K1x1)); } } // namespace arm diff --git a/mace/ops/arm/base/conv_2d_3x3.cc b/mace/ops/arm/base/conv_2d_3x3.cc index cf232a77f4e5d7512efe61d3ec82f8a20d08f554..8c21b8d64014a33e694cd77a2d4fb78f977a447d 100644 --- a/mace/ops/arm/base/conv_2d_3x3.cc +++ b/mace/ops/arm/base/conv_2d_3x3.cc @@ -36,6 +36,14 @@ void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { registry, Conv2dK3x3S2, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, BFloat16, ImplType::NEON, K3x3S2)); + 
MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2dK3x3S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S1)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2dK3x3S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S2)); } } // namespace arm diff --git a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc index e17245e612923bd619cd53863f1c3ac60c34aca7..778386f144bafd457e83f3651eea37cd63780fff 100644 --- a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc @@ -20,6 +20,16 @@ namespace mace { namespace ops { namespace arm { +extern template +MaceStatus DepthwiseConv2dK3x3S1::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data); + +extern template +MaceStatus DepthwiseConv2dK3x3S2::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data); + namespace { template void DepthwiseConv2d3x3Pixel(const T *in_base, @@ -464,6 +474,16 @@ void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { delegator::DepthwiseConv2dParam, MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, BFloat16, ImplType::NEON, K3x3S2)); + MACE_REGISTER_FP16_DELEGATOR( + registry, DepthwiseConv2dK3x3S1, + delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S1)); + MACE_REGISTER_FP16_DELEGATOR( + registry, DepthwiseConv2dK3x3S2, + delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S2)); } } // namespace arm diff --git a/mace/ops/arm/base/depthwise_conv_2d_3x3.h b/mace/ops/arm/base/depthwise_conv_2d_3x3.h index c9edf26e9318de5d4a0baa5eb88a11f97c840e04..236ad5907e565c00f8b4b05eca27a4e4bcc18da2 
100644 --- a/mace/ops/arm/base/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.h @@ -27,6 +27,7 @@ namespace mace { namespace ops { namespace arm { + template class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN { public: diff --git a/mace/ops/arm/base/gemm.cc b/mace/ops/arm/base/gemm.cc index 6274311ad9b74a6429e7bbe4fa358af38af43ed2..cba09ecb6f93a57aa48581c03b7c8df871feb250 100644 --- a/mace/ops/arm/base/gemm.cc +++ b/mace/ops/arm/base/gemm.cc @@ -23,6 +23,24 @@ namespace mace { namespace ops { namespace arm { +extern template void Gemm::Pack8x4( + const MatrixMap &matrix, + MatrixMajor dst_major, float16_t *packed_matrix); +extern template void Gemm::Unpack8x8( + const float16_t *packed_output, MatrixMap *output); +extern template void Gemm::PackLhs( + const MatrixMap &lhs, float16_t *packed_lhs); +extern template void Gemm::PackRhs( + const MatrixMap &rhs, float16_t *packed_rhs); +extern template void Gemm::UnpackOutput( + const float16_t *packed_output, MatrixMap *output); +extern template MaceStatus Gemm::Compute( + const OpContext *context, const Tensor *lhs, const Tensor *rhs, + const index_t batch, const index_t rows, const index_t cols, + const index_t depth, const MatrixMajor lhs_major, + const MatrixMajor rhs_major, const MatrixMajor output_major, + const bool lhs_batched, const bool rhs_batched, Tensor *output); + template void Gemm::Pack4x4(const MatrixMap &matrix, MatrixMajor dst_major, T *packed_matrix) { @@ -681,9 +699,9 @@ MaceStatus Gemm::Compute( depth_padded, packed_output_data_block); MatrixMap output_block = output_matrix.block(start_row, - start_col, - row_block_len, - col_block_len); + start_col, + row_block_len, + col_block_len); UnpackOutput(packed_output_data_block, &output_block); } // col_block_idx } // row_block_idx @@ -701,6 +719,10 @@ void RegisterGemmDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Gemm, delegator::GemmParam, MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, 
BFloat16, ImplType::NEON)); + + MACE_REGISTER_FP16_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::NEON)); } } // namespace arm diff --git a/mace/ops/arm/fp16/conv_2d_3x3.cc b/mace/ops/arm/fp16/conv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..f22efb66a47d865fea5a519acb113c992169351e --- /dev/null +++ b/mace/ops/arm/fp16/conv_2d_3x3.cc @@ -0,0 +1,367 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "mace/ops/arm/base/conv_2d_3x3.h" +#include "mace/ops/delegator/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { + +template<> +MaceStatus Conv2dK3x3S1::DoCompute( + const ConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 1 < p.out_channels) { + float16_t *out_ptr0_base = + output_data + b * p.out_batch_size + m * p.out_image_size; + float16_t *out_ptr1_base = + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { + const float16_t *in_ptr0 = + input_data + b * p.in_batch_size + c * p.in_image_size; + const float16_t + *filter_ptr0 = filter_data + m * p.in_channels * 9 + c * 9; + + float16_t *out_ptr1 = out_ptr1_base; + const float16_t *in_ptr1 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; + const float16_t *in_ptr2 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; + const float16_t *in_ptr3 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 3 * p.in_width; + const float16_t *filter_ptr1 = + filter_data + (m + 1) * p.in_channels * 9 + c * 9; + + float16_t *out_ptr0 = out_ptr0_base; + + // load filter (2 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + float16x8_t vf10, vf11; + vf00 = vld1q_f16(filter_ptr0); + vf01 = vld1q_f16(filter_ptr0 + 8); + + vf10 = vld1q_f16(filter_ptr1); + vf11 = vld1q_f16(filter_ptr1 + 8); + + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 8) { + // input (4 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02; // reg count: 14 + float16x8_t vi10, vi11, vi12; + float16x8_t vi20, 
vi21, vi22; + float16x8_t vi30, vi31, vi32; + float16x8_t vo20, vo30; // tmp use + + // output (4 outch x 2 height x 8 width): vo_outch_height + float16x8_t vo00, vo01; + float16x8_t vo10, vo11; + + // load input + vi00 = vld1q_f16(in_ptr0); + vo00 = vld1q_f16(in_ptr0 + 8); // reuse vo00: vi0n + vi10 = vld1q_f16(in_ptr1); + vo10 = vld1q_f16(in_ptr1 + 8); + vi20 = vld1q_f16(in_ptr2); + vo20 = vld1q_f16(in_ptr2 + 8); + vi30 = vld1q_f16(in_ptr3); + vo30 = vld1q_f16(in_ptr3 + 8); + + vi01 = vextq_f16(vi00, vo00, 1); + vi02 = vextq_f16(vi00, vo00, 2); + vi11 = vextq_f16(vi10, vo10, 1); + vi12 = vextq_f16(vi10, vo10, 2); + vi21 = vextq_f16(vi20, vo20, 1); + vi22 = vextq_f16(vi20, vo20, 2); + vi31 = vextq_f16(vi30, vo30, 1); + vi32 = vextq_f16(vi30, vo30, 2); + + // load ouptut + vo00 = vld1q_f16(out_ptr0); + vo01 = vld1q_f16(out_ptr0 + p.out_width); + vo10 = vld1q_f16(out_ptr1); + vo11 = vld1q_f16(out_ptr1 + p.out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f16(vo00, vi00, vf00, 0); // reg count: 18 + vo00 = vfmaq_laneq_f16(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f16(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f16(vo00, vi10, vf00, 3); + vo00 = vfmaq_laneq_f16(vo00, vi11, vf00, 4); + vo00 = vfmaq_laneq_f16(vo00, vi12, vf00, 5); + vo00 = vfmaq_laneq_f16(vo00, vi20, vf00, 6); + vo00 = vfmaq_laneq_f16(vo00, vi21, vf00, 7); + vo00 = vfmaq_laneq_f16(vo00, vi22, vf01, 0); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f16(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f16(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f16(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f16(vo01, vi20, vf00, 3); + vo01 = vfmaq_laneq_f16(vo01, vi21, vf00, 4); + vo01 = vfmaq_laneq_f16(vo01, vi22, vf00, 5); + vo01 = vfmaq_laneq_f16(vo01, vi30, vf00, 6); + vo01 = vfmaq_laneq_f16(vo01, vi31, vf00, 7); + vo01 = vfmaq_laneq_f16(vo01, vi32, vf01, 0); + + // outch 1, height 0 + vo10 = vfmaq_laneq_f16(vo10, vi00, vf10, 0); + vo10 = vfmaq_laneq_f16(vo10, vi01, vf10, 1); + vo10 = vfmaq_laneq_f16(vo10, vi02, 
vf10, 2); + vo10 = vfmaq_laneq_f16(vo10, vi10, vf10, 3); + vo10 = vfmaq_laneq_f16(vo10, vi11, vf10, 4); + vo10 = vfmaq_laneq_f16(vo10, vi12, vf10, 5); + vo10 = vfmaq_laneq_f16(vo10, vi20, vf10, 6); + vo10 = vfmaq_laneq_f16(vo10, vi21, vf10, 7); + vo10 = vfmaq_laneq_f16(vo10, vi22, vf11, 0); + + // outch 1, height 1 + vo11 = vfmaq_laneq_f16(vo11, vi10, vf10, 0); + vo11 = vfmaq_laneq_f16(vo11, vi11, vf10, 1); + vo11 = vfmaq_laneq_f16(vo11, vi12, vf10, 2); + vo11 = vfmaq_laneq_f16(vo11, vi20, vf10, 3); + vo11 = vfmaq_laneq_f16(vo11, vi21, vf10, 4); + vo11 = vfmaq_laneq_f16(vo11, vi22, vf10, 5); + vo11 = vfmaq_laneq_f16(vo11, vi30, vf10, 6); + vo11 = vfmaq_laneq_f16(vo11, vi31, vf10, 7); + vo11 = vfmaq_laneq_f16(vo11, vi32, vf11, 0); + + vst1q_f16(out_ptr0, vo00); + vst1q_f16(out_ptr0 + p.out_width, vo01); + vst1q_f16(out_ptr1, vo10); + vst1q_f16(out_ptr1 + p.out_width, vo11); + + in_ptr0 += 8; + in_ptr1 += 8; + in_ptr2 += 8; + in_ptr3 += 8; + + out_ptr0 += 8; + out_ptr1 += 8; + } // w + + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; + + out_ptr0 += p.out_width; + out_ptr1 += p.out_width; + } // h + } // c + } else { + for (index_t mm = m; mm < p.out_channels; ++mm) { + float16_t *out_ptr0_base = + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { + const float16_t *in_ptr0 = + input_data + b * p.in_batch_size + c * p.in_image_size; + const float16_t *in_ptr1 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; + const float16_t *in_ptr2 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; + const float16_t *in_ptr3 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 3 * p.in_width; + const float16_t + *filter_ptr0 = filter_data + mm * p.in_channels * 9 + c * 9; + + float16_t *out_ptr0 = out_ptr0_base; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t 
vf00, vf01; + vf00 = vld1q_f16(filter_ptr0); + vf01 = vld1q_f16(filter_ptr0 + 8); + + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 8) { + // input (4 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02, vi0n; + float16x8_t vi10, vi11, vi12, vi1n; + float16x8_t vi20, vi21, vi22, vi2n; + float16x8_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 8 width): vo_outch_height + float16x8_t vo00, vo01; + + // load input + vi00 = vld1q_f16(in_ptr0); + vi0n = vld1q_f16(in_ptr0 + 8); + vi10 = vld1q_f16(in_ptr1); + vi1n = vld1q_f16(in_ptr1 + 8); + vi20 = vld1q_f16(in_ptr2); + vi2n = vld1q_f16(in_ptr2 + 8); + vi30 = vld1q_f16(in_ptr3); + vi3n = vld1q_f16(in_ptr3 + 8); + + vi01 = vextq_f16(vi00, vi0n, 1); + vi02 = vextq_f16(vi00, vi0n, 2); + vi11 = vextq_f16(vi10, vi1n, 1); + vi12 = vextq_f16(vi10, vi1n, 2); + vi21 = vextq_f16(vi20, vi2n, 1); + vi22 = vextq_f16(vi20, vi2n, 2); + vi31 = vextq_f16(vi30, vi3n, 1); + vi32 = vextq_f16(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f16(out_ptr0); + vo01 = vld1q_f16(out_ptr0 + p.out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f16(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f16(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f16(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f16(vo00, vi10, vf00, 3); + vo00 = vfmaq_laneq_f16(vo00, vi11, vf00, 4); + vo00 = vfmaq_laneq_f16(vo00, vi12, vf00, 5); + vo00 = vfmaq_laneq_f16(vo00, vi20, vf00, 6); + vo00 = vfmaq_laneq_f16(vo00, vi21, vf00, 7); + vo00 = vfmaq_laneq_f16(vo00, vi22, vf01, 0); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f16(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f16(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f16(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f16(vo01, vi20, vf00, 3); + vo01 = vfmaq_laneq_f16(vo01, vi21, vf00, 4); + vo01 = vfmaq_laneq_f16(vo01, vi22, vf00, 5); + vo01 = vfmaq_laneq_f16(vo01, vi30, vf00, 6); + vo01 = vfmaq_laneq_f16(vo01, vi31, vf00, 7); + vo01 = 
vfmaq_laneq_f16(vo01, vi32, vf01, 0); + + vst1q_f16(out_ptr0, vo00); + vst1q_f16(out_ptr0 + p.out_width, vo01); + + in_ptr0 += 8; + in_ptr1 += 8; + in_ptr2 += 8; + in_ptr3 += 8; + + out_ptr0 += 8; + } // w + + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; + + out_ptr0 += p.out_width; + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, p.batch, 1, 0, p.out_channels, 2); + + return MaceStatus::MACE_SUCCESS; +} + +template<> +MaceStatus Conv2dK3x3S2::DoCompute( + const ConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t c = 0; c < p.in_channels; ++c) { + const float16_t + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; + const float16_t *filter_ptr = + filter_data + m * p.in_channels * 9 + c * 9; + float16_t *out_base = + output_data + b * p.out_batch_size + m * p.out_image_size; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr); + vf01 = vld1q_f16(filter_ptr + 8); + + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 7 < p.out_width; w += 8) { + float16x8x2_t vi0, vi1, vi2; + float16x8_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02; + float16x8_t vi10, vi11, vi12; + float16x8_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 8 width): vo + float16x8_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * p.in_width + in_w; + vi0 = vld2q_f16(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f16(in_base + in_offset + p.in_width); + vi2 = vld2q_f16(in_base + in_offset + 2 * 
p.in_width); + + vi0n = vld1q_f16(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f16(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f16(in_base + in_offset + 2 * p.in_width + 8); + + // load ouptut + index_t out_offset = h * p.out_width + w; + vo = vld1q_f16(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f16(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f16(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f16(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f16(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f16(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f16(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f16(vo, vi10, vf00, 3); + vo = vfmaq_laneq_f16(vo, vi11, vf00, 4); + vo = vfmaq_laneq_f16(vo, vi12, vf00, 5); + vo = vfmaq_laneq_f16(vo, vi20, vf00, 6); + vo = vfmaq_laneq_f16(vo, vi21, vf00, 7); + vo = vfmaq_laneq_f16(vo, vi22, vf01, 0); + + vst1q_f16(out_base + out_offset, vo); + } // w + } // h + } // c + } // m + } // b + }, 0, p.batch, 1, 0, p.out_channels, 1); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..7455bce8f4c1a833de1e08b3af708a4d63ee1f2b --- /dev/null +++ b/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc @@ -0,0 +1,410 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +template +void DepthwiseConv2d3x3Pixel(const float16_t *in_base, + const float16_t *filter, + const index_t out_h, + const index_t out_w, + const index_t in_h_start, + const index_t in_w_start, + const index_t out_width, + const index_t in_height, + const index_t in_width, + float16_t *out_base) { + const index_t filter_width = 3; + float sum = 0.0f; + + index_t in_h = in_h_start; + const float16_t *in = in_base + in_h * in_width; + const float16_t *filter_ptr = filter; + if (in_h >= 0 && in_h < in_height) { + index_t in_w = in_w_start; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[0]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[1]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[2]; + } + } + in_h++; + in += in_width; + filter_ptr += filter_width; + if (in_h >= 0 && in_h < in_height) { + index_t in_w = in_w_start; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[0]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[1]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[2]; + } + } + in_h++; + in += in_width; + filter_ptr += filter_width; + if (in_h >= 0 && in_h < in_height) { + index_t in_w = in_w_start; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[0]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * 
filter_ptr[1]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[2]; + } + } + out_base[out_h * out_width + out_w] = static_cast(sum); +} + +template<> +MaceStatus DepthwiseConv2dK3x3S1::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + const index_t c = m / p.multiplier; + const index_t multi_index = m % p.multiplier; + auto filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9; + auto in_base = input_data + b * p.in_batch_size + c * p.in_image_size; + auto out_base = output_data + b * p.out_batch_size + + m * p.out_image_size; + index_t h, w; + + // top + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr); + vf01 = vld1q_f16(filter_ptr + 8); + + for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) { + // left + for (w = 0; w < p.valid_w_start; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + + for (w = p.valid_w_start; w + 7 < p.valid_w_stop; w += 8) { + // input (4 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02, vi0n; + float16x8_t vi10, vi11, vi12, vi1n; + float16x8_t vi20, vi21, vi22, vi2n; + float16x8_t vi30, 
vi31, vi32, vi3n; + + // output (1 outch x 2 height x 8 width): vo_outch_height + float16x8_t vo00, vo01; + + // load input + index_t in_h = h - p.pad_top; + index_t in_w = w - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; + vi00 = vld1q_f16(in_base + in_offset); + vi0n = vld1q_f16(in_base + in_offset + 8); + vi10 = vld1q_f16(in_base + in_offset + p.in_width); + vi1n = vld1q_f16(in_base + in_offset + p.in_width + 8); + vi20 = vld1q_f16(in_base + in_offset + 2 * p.in_width); + vi2n = vld1q_f16(in_base + in_offset + 2 * p.in_width + 8); + vi30 = vld1q_f16(in_base + in_offset + 3 * p.in_width); + vi3n = vld1q_f16(in_base + in_offset + 3 * p.in_width + 8); + + vi01 = vextq_f16(vi00, vi0n, 1); + vi02 = vextq_f16(vi00, vi0n, 2); + vi11 = vextq_f16(vi10, vi1n, 1); + vi12 = vextq_f16(vi10, vi1n, 2); + vi21 = vextq_f16(vi20, vi2n, 1); + vi22 = vextq_f16(vi20, vi2n, 2); + vi31 = vextq_f16(vi30, vi3n, 1); + vi32 = vextq_f16(vi30, vi3n, 2); + + // load ouptut + index_t out_offset = h * p.out_width + w; + vo00 = vld1q_f16(out_base + out_offset); + vo01 = vld1q_f16(out_base + out_offset + p.out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f16(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f16(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f16(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f16(vo00, vi10, vf00, 3); + vo00 = vfmaq_laneq_f16(vo00, vi11, vf00, 4); + vo00 = vfmaq_laneq_f16(vo00, vi12, vf00, 5); + vo00 = vfmaq_laneq_f16(vo00, vi20, vf00, 6); + vo00 = vfmaq_laneq_f16(vo00, vi21, vf00, 7); + vo00 = vfmaq_laneq_f16(vo00, vi22, vf01, 0); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f16(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f16(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f16(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f16(vo01, vi20, vf00, 3); + vo01 = vfmaq_laneq_f16(vo01, vi21, vf00, 4); + vo01 = vfmaq_laneq_f16(vo01, vi22, vf00, 5); + vo01 = vfmaq_laneq_f16(vo01, vi30, vf00, 6); + vo01 = vfmaq_laneq_f16(vo01, vi31, vf00, 7); + vo01 = vfmaq_laneq_f16(vo01, 
vi32, vf01, 0); + vst1q_f16(out_base + out_offset, vo00); + vst1q_f16(out_base + out_offset + p.out_width, vo01); + } // w + + // right + for (; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } // h + + // bottom + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + } // m + } // b + }, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool + + return MaceStatus::MACE_SUCCESS; +} + +template<> +MaceStatus DepthwiseConv2dK3x3S2::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D( + [=](index_t start0, index_t end0, index_t step0, index_t start1, + index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t c = m / p.multiplier; + index_t multi_index = m % p.multiplier; + auto filter_ptr = filter_data + multi_index * p.in_channels * 9 + + c * 9; + auto in_base = input_data + b * p.in_batch_size + + c * p.in_image_size; + auto out_base = output_data + b * p.out_batch_size + + m * p.out_image_size; + index_t h, w; + + // top + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr); + vf01 = vld1q_f16(filter_ptr 
+ 8); + + for (h = p.valid_h_start; h < p.valid_h_stop; ++h) { + // left + for (w = 0; w < p.valid_w_start; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + + for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 8) { + float16x8x2_t vi0, vi1, vi2; + float16x8_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02; + float16x8_t vi10, vi11, vi12; + float16x8_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 8 width): vo + float16x8_t vo; + + // load input + index_t in_h = h * 2 - p.pad_top; + index_t in_w = w * 2 - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; + vi0 = vld2q_f16(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f16(in_base + in_offset + p.in_width); + vi2 = vld2q_f16(in_base + in_offset + 2 * p.in_width); + + vi0n = vld1q_f16(in_base + in_offset + 16); // [8.9.10.11] + vi1n = vld1q_f16(in_base + in_offset + p.in_width + 16); + vi2n = vld1q_f16(in_base + in_offset + 2 * p.in_width + 16); + + // load ouptut + index_t out_offset = h * p.out_width + w; + vo = vld1q_f16(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f16(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f16(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f16(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f16(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f16(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f16(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f16(vo, vi10, vf00, 3); + vo = vfmaq_laneq_f16(vo, vi11, vf00, 4); + vo = vfmaq_laneq_f16(vo, vi12, vf00, 5); + vo = vfmaq_laneq_f16(vo, vi20, vf00, 6); + vo = vfmaq_laneq_f16(vo, vi21, vf00, 7); + vo = vfmaq_laneq_f16(vo, vi22, vf01, 0); + vst1q_f16(out_base + out_offset, vo); + } // w + + // right + for (; w < p.out_width; ++w) { + 
DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } // h + + // bottom + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + } // m + } // b + }, + 0, p.batch, 1, 0, p.out_channels, 1); + + return MaceStatus::MACE_SUCCESS; +} + + +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 26b4d02a9c9ddad721ceccffa3888df71ab3e309..b56832deb9914b72afc22a4e762d2a458694ac42 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -507,7 +507,7 @@ class Conv2dOp : public ConvPool2dOpBase { void RegisterConv2D(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU); - + MACE_REGISTER_FP16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, uint8_t); diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 3f0d7d64e1b45876072d1a19fcd007dc3a356b9f..96c3f38dfcdd01efbf40790d587ba52d46f5834d 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -406,6 +406,8 @@ void RegisterDepthwiseConv2d(OpRegistry *op_registry) { DepthwiseConv2dOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 
96c4e85fb378d9c03a556d1bf9f10aeb465d5995..3b95410eefa2fd21a36c2714b081199bfc4905d2 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -518,6 +518,8 @@ void RegisterPooling(OpRegistry *op_registry) { DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Pooling", PoolingOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, diff --git a/mace/ops/ref/activation.cc b/mace/ops/ref/activation.cc index c940270949c7c455ce5fc989e7c6ae9cba013e7e..cb3ec86ea41bafcae74d69b1dd08d95e4d312e48 100644 --- a/mace/ops/ref/activation.cc +++ b/mace/ops/ref/activation.cc @@ -130,6 +130,10 @@ void RegisterActivationDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Activation, delegator::ActivationParam, MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc index eb65f80cce18ac1f6bc93d35a22c0dce6610fa8d..19552f171b41a2d556b86600130fd8095f2d3f3a 100644 --- a/mace/ops/ref/bias_add.cc +++ b/mace/ops/ref/bias_add.cc @@ -152,6 +152,9 @@ void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, BiasAdd, DelegatorParam, MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index 0bfec1ed80464ec26fb7569aa806a4af0cca49a3..992b2272ab51f252b7bf2a8d3c08683a8d4fafc7 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -131,6 +131,9 @@ void 
RegisterConv2dDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Conv2d, delegator::Conv2dParam, MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2d, delegator::Conv2dParam, + MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/depthwise_conv_2d.cc b/mace/ops/ref/depthwise_conv_2d.cc index 19aa73efd60b62f6d58bfdb6fa3b5f8c5042322f..a52aa02e63d076ae23e997a336457ecfd54ca527 100644 --- a/mace/ops/ref/depthwise_conv_2d.cc +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -137,6 +137,10 @@ void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry) { registry, DepthwiseConv2d, delegator::DepthwiseConv2dParam, MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, DepthwiseConv2d, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, + float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/gemm.cc b/mace/ops/ref/gemm.cc index 21a939a0e7cd53b56f9b58c50db9af0c0aa79615..321db87a16b4b54856a3444fc16472545f02531f 100644 --- a/mace/ops/ref/gemm.cc +++ b/mace/ops/ref/gemm.cc @@ -156,6 +156,9 @@ void RegisterGemmDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Gemm, delegator::GemmParam, MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/gemv.cc b/mace/ops/ref/gemv.cc index 05b04f3ce986918bea2ed1cc2702b54924990422..dfdd02c54497a08b03381222b991fc0b9f343df5 100644 --- a/mace/ops/ref/gemv.cc +++ b/mace/ops/ref/gemv.cc @@ -92,6 +92,9 @@ void RegisterGemvDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Gemv, DelegatorParam, 
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index c233cba936a9d8360d1a9c94f14e2b4e648e0009..762db324fcb4a5f972a2bae1df95d080b651c87b 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -526,6 +526,8 @@ void RegisterSoftmax(OpRegistry *op_registry) { DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Softmax", SoftmaxOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 75afc92620b61fc278fdb97259e949dfbcde5c8c..b2d5d91c74a4b188de8fce3a6ba360b6665525e5 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -81,6 +81,7 @@ class SqueezeOp : public SqueezeOpRaw { void RegisterSqueeze(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE diff --git a/test/ccbenchmark/BUILD.bazel b/test/ccbenchmark/BUILD.bazel index ea5357ca0f702dea9fc4aa1b25fd5ea0e676da57..3bfe8f488487edcda11d7e5061e603f6c6c4ee21 100644 --- a/test/ccbenchmark/BUILD.bazel +++ b/test/ccbenchmark/BUILD.bazel @@ -11,6 +11,7 @@ load( "if_hexagon_enabled", "if_neon_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_opencl_enabled", "if_quantize_enabled", ) @@ -61,6 +62,9 @@ cc_test( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + 
"-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), diff --git a/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc b/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc index 84ad04d3874a6af18f667cc1795bb627291c3fba..7c31764f7edbd022343177197bad3600199fbdf3 100644 --- a/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc +++ b/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc @@ -181,6 +181,12 @@ void Conv2d(int iters, #else #define MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +#define MACE_BM_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float16_t, CPU) +#else +#define MACE_BM_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) +#endif // MACE_ENABLE_FP16 #ifdef MACE_ENABLE_OPENCL #define MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \ MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ @@ -193,6 +199,7 @@ void Conv2d(int iters, MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ MACE_BM_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \ MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \ + MACE_BM_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \ MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC) // Filter sizes and data alignments diff --git a/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc b/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc index 3abf50679cda6ac776642efb39dc93428fbd7eaf..f159a78385e3eaa25e5c2a65a8a25e8bb279d8ea 100644 --- a/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc +++ b/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc @@ -140,6 +140,12 @@ void DepthwiseConv2d(int iters, #else #define MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M) #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +#define MACE_BM_DEPTHWISE_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, P, M) \ + 
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float16_t, CPU) +#else +#define MACE_BM_DEPTHWISE_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, P, M) +#endif // MACE_ENABLE_FP16 #ifdef MACE_ENABLE_OPENCL #define MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M) \ MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ @@ -152,6 +158,7 @@ void DepthwiseConv2d(int iters, MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ MACE_BM_DEPTHWISE_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, P, M); \ MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M); \ + MACE_BM_DEPTHWISE_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, P, M); \ MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M) MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); diff --git a/test/ccunit/BUILD.bazel b/test/ccunit/BUILD.bazel index 50d12d4e45ae4e4b290b24714276d10e3a1bc586..22882feec30cbfcd16fec93f0a4fc9ed0dcfa2c3 100644 --- a/test/ccunit/BUILD.bazel +++ b/test/ccunit/BUILD.bazel @@ -12,6 +12,7 @@ load( "if_hta_enabled", "if_neon_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_opencl_enabled", "if_quantize_enabled", ) @@ -42,6 +43,10 @@ cc_test( [ "mace/ops/arm/bf16/*.cc", ] + )) + if_fp16_enabled(glob( + [ + "mace/ops/arm/fp16/*.cc", + ] )) + if_opencl_enabled(glob( [ "mace/ops/opencl/*.cc", @@ -66,6 +71,9 @@ cc_test( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/test/ccunit/mace/ops/arm/arm82/gemm_test.cc b/test/ccunit/mace/ops/arm/arm82/gemm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd296b85b7f2ebc6135a3778a0512e2d44db526d --- /dev/null +++ b/test/ccunit/mace/ops/arm/arm82/gemm_test.cc @@ -0,0 +1,106 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/delegator/gemm.h" + +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/ops_test_util.h" +#include "mace/ops/testing/test_utils.h" + +namespace mace { +namespace ops { +namespace test { + +void TestGemmFloat16(const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched) { + Tensor lhs(GetCPUAllocator(), DT_FLOAT16); + Tensor rhs(GetCPUAllocator(), DT_FLOAT16); + Tensor output(GetCPUAllocator(), DT_FLOAT16); + lhs.Resize({lhs_batched ? batch : 1, rows, depth}); + rhs.Resize({rhs_batched ? 
batch : 1, depth, cols}); + output.Resize({batch, rows, cols}); + { + Tensor::MappingGuard lhs_guard(&lhs); + Tensor::MappingGuard rhs_guard(&rhs); + auto lhs_data = lhs.mutable_data(); + auto rhs_data = rhs.mutable_data(); + auto output_data = output.mutable_data(); + GenerateRandomRealTypeData(lhs.shape(), lhs_data); + GenerateRandomRealTypeData(rhs.shape(), rhs_data); + GenerateRandomRealTypeData(output.shape(), output_data); + } + + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpsTestNet net; + OpContext context(net.ws(), &cpu_device); + std::unique_ptr gemm = delegator::Gemm::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::NEON), + delegator::GemmParam()); + gemm->Compute(&context, &lhs, &rhs, batch, rows, cols, depth, lhs_major, + rhs_major, output_major, lhs_batched, rhs_batched, &output); + + Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT16); + expected_output.Resize({batch, rows, cols}); + std::unique_ptr gemm_ref = delegator::Gemm::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::REF), + delegator::GemmParam()); + gemm_ref->Compute(&context, &lhs, &rhs, batch, rows, cols, depth, lhs_major, + rhs_major, output_major, lhs_batched, rhs_batched, + &expected_output); + + ExpectTensorSimilar(expected_output, output, 1e-4); +} + +TEST(ArmGemm, TestGemmFP16) { + TestGemmFloat16(1, 47, 69, 37, RowMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, RowMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, RowMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, RowMajor, ColMajor, ColMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, ColMajor, RowMajor, 
true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, ColMajor, ColMajor, true, true); + + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, RowMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, RowMajor, ColMajor, ColMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, ColMajor, ColMajor, true, true); + + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, RowMajor, true, false); + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, RowMajor, false, true); + + TestGemmFloat16(16, 31, 61, 67, RowMajor, ColMajor, RowMajor, true, true); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/test/ccunit/mace/ops/conv_2d_test.cc b/test/ccunit/mace/ops/conv_2d_test.cc index 6d04d05e682f53e18f8f6710777c23e46a007dce..aebca907b936648742c446a2e45b564a6f38d156 100644 --- a/test/ccunit/mace/ops/conv_2d_test.cc +++ b/test/ccunit/mace/ops/conv_2d_test.cc @@ -1435,6 +1435,74 @@ TEST_F(Conv2dOpTest, BFloat16) { TestBFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {3, 3}); } #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +namespace { +void TestFloat16(const index_t batch, + const index_t out_channels, + const index_t in_channels, + const index_t in_height, + const index_t in_width, + const index_t k_height, + const index_t k_width, + enum Padding padding_type, + const std::vector &strides) { + OpsTestNet net; + net.AddRandomInput( + "Input", {batch, in_channels, in_height, in_width}); + net.AddRandomInput( + "Filter", {out_channels, in_channels, k_height, k_width}, true); + net.AddRandomInput("Bias", {out_channels}, true); + net.Cast("Input", "FP16Input"); + 
net.Cast("Filter", "FP16Filter"); + net.Cast("Bias", "FP16Bias"); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + OpDefBuilder("Conv2D", "FP16Conv2dTest") + .Input("FP16Input") + .Input("FP16Filter") + .Input("FP16Bias") + .Output("FP16Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT16)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + net.Cast("FP16Output", "CastOutput"); + + ExpectTensorSimilar(*net.GetOutput("Output"), + *net.GetTensor("CastOutput"), 1e-4); +} +} // namespace + +TEST_F(Conv2dOpTest, float16_t) { + TestFloat16(1, 128, 64, 32, 32, 1, 1, VALID, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, VALID, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, FULL, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, SAME, {2, 2}); + TestFloat16(1, 129, 63, 33, 31, 3, 3, SAME, {1, 1}); + TestFloat16(9, 128, 64, 32, 32, 3, 3, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 1, 5, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 5, 5, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 5, 1, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {2, 2}); + TestFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {3, 3}); +} +#endif // MACE_ENABLE_FP16 } // namespace test } // namespace ops } // namespace mace diff --git a/test/ccunit/mace/ops/depthwise_conv2d_test.cc b/test/ccunit/mace/ops/depthwise_conv2d_test.cc index f3585fcdec64601cea63ccd3a0f6eb1249e6ca17..30fc56f5f8f708798e612c7511333e7a976d2f8b 100644 --- a/test/ccunit/mace/ops/depthwise_conv2d_test.cc +++ 
b/test/ccunit/mace/ops/depthwise_conv2d_test.cc @@ -560,6 +560,72 @@ TEST_F(DepthwiseConv2dOpTest, BFloat16) { #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +namespace { +void TestFloat16(const index_t batch, + const index_t multiplier, + const index_t in_channels, + const index_t in_height, + const index_t in_width, + const index_t k_height, + const index_t k_width, + enum Padding padding_type, + const std::vector &strides) { + OpsTestNet net; + const index_t out_channels = multiplier * in_channels; + net.AddRandomInput( + "Input", {batch, in_channels, in_height, in_width}, false, false); + net.AddRandomInput( + "Filter", {multiplier, in_channels, k_height, k_width}, true, false); + net.AddRandomInput("Bias", {out_channels}, true); + net.Cast("Input", "FP16Input"); + net.Cast("Filter", "FP16Filter"); + net.Cast("Bias", "FP16Bias"); + + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + OpDefBuilder("DepthwiseConv2d", "FP16DepthwiseConv2DTest") + .Input("FP16Input") + .Input("FP16Filter") + .Input("FP16Bias") + .Output("FP16Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT16)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + net.Cast("FP16Output", "CastOutput"); + + ExpectTensorSimilar(*net.GetOutput("Output"), + *net.GetTensor("CastOutput"), 1e-4); +} +} // namespace + +TEST_F(DepthwiseConv2dOpTest, float16_t) { + TestFloat16(1, 1, 1024, 7, 7, 3, 3, VALID, {1, 1}); + TestFloat16(1, 1, 1024, 7, 7, 3, 3, SAME, {1, 1}); + TestFloat16(1, 1, 1024, 7, 7, 3, 3, FULL, {1, 1}); + TestFloat16(1, 2, 1024, 7, 7, 3, 3, SAME, {1, 1}); + TestFloat16(1, 2, 1024, 7, 7, 3, 3, SAME, 
{2, 2}); + TestFloat16(1, 1, 512, 14, 14, 3, 3, SAME, {1, 1}); + TestFloat16(1, 1, 512, 14, 13, 5, 5, SAME, {2, 2}); + TestFloat16(1, 1, 256, 28, 28, 3, 3, SAME, {1, 1}); + TestFloat16(1, 1, 128, 56, 56, 3, 3, SAME, {2, 2}); + TestFloat16(3, 1, 128, 56, 56, 3, 3, SAME, {2, 2}); +} +#endif // MACE_ENABLE_FP16 } // namespace test } // namespace ops } // namespace mace diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 7526314b034cba5470908aca35c7cff0d8a79e34..857ab2cf170078fb314e8f99d15002c5ef3556f0 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -100,6 +100,11 @@ def parse_args(): type=str2bool, default=True, help="Whether to use bfloat16") + parser.add_argument( + "--enable_fp16", + type=str2bool, + default=False, + help="Whether to use armv8.2") parser.add_argument( "--enable_rpcmem", type=str2bool, @@ -180,6 +185,7 @@ def main(unused_args): enable_neon=FLAGS.enable_neon, enable_quantize=FLAGS.enable_quantize, enable_bfloat16=FLAGS.enable_bfloat16, + enable_fp16=FLAGS.enable_fp16, enable_rpcmem=FLAGS.enable_rpcmem, enable_hta=FLAGS.enable_hta, address_sanitizer=FLAGS.address_sanitizer, diff --git a/tools/converter.py b/tools/converter.py index e7a5f05d11ef15d041ba5e6b4d8aab6bc71c7667..d15156eec72aa5ff471f9c8eb9ad4513431438a1 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -89,6 +89,7 @@ FPDataTypeStrs = [ "fp16_fp32", "fp32_fp32", "bf16_fp32", + "fp16_fp16", ] FPDataType = Enum('GPUDataType', [(ele, ele) for ele in FPDataTypeStrs], @@ -184,6 +185,15 @@ def bfloat16_enabled(configs): return False +def fp16_enabled(configs): + for model_name in configs[YAMLKeyword.models]: + model_config = configs[YAMLKeyword.models][model_name] + dtype = model_config.get(YAMLKeyword.data_type, FPDataType.fp16_fp32) + if dtype == FPDataType.fp16_fp16: + return True + return False + + def hexagon_enabled(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: @@ -765,6 +775,7 @@ def build_model_lib(configs, 
address_sanitizer, debug_mode): enable_opencl=opencl_enabled(configs), enable_quantize=quantize_enabled(configs), enable_bfloat16=bfloat16_enabled(configs), + enable_fp16=fp16_enabled(configs), address_sanitizer=address_sanitizer, symbol_hidden=get_symbol_hidden_mode(debug_mode), debug_mode=debug_mode @@ -927,6 +938,7 @@ def build_mace_run(configs, target_abi, toolchain, enable_opencl=opencl_enabled(configs), enable_quantize=quantize_enabled(configs), enable_bfloat16=bfloat16_enabled(configs), + enable_fp16=fp16_enabled(configs), address_sanitizer=address_sanitizer, symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type), debug_mode=debug_mode, diff --git a/tools/python/utils/config_parser.py b/tools/python/utils/config_parser.py index 5521d4e8e81dc42c8dfffd2f213a6031320fac91..f48acb0097a6039cc5af738a84f24ffe66d25d49 100644 --- a/tools/python/utils/config_parser.py +++ b/tools/python/utils/config_parser.py @@ -182,6 +182,8 @@ def parse_internal_data_type(str): return mace_pb2.DT_FLOAT elif str == 'bf16_fp32': return mace_pb2.DT_BFLOAT16 + elif str == 'fp16_fp16': + return mace_pb2.DT_FLOAT16 else: return mace_pb2.DT_HALF diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 9eec6289ec6ef5c98194f2df38893904c44ebd49..1a7d2b97d08818008dc37cb2c420fda54e4f282c 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -271,6 +271,7 @@ def bazel_build(target, enable_opencl=True, enable_quantize=True, enable_bfloat16=False, + enable_fp16=False, enable_rpcmem=True, address_sanitizer=False, symbol_hidden=True, @@ -305,6 +306,8 @@ def bazel_build(target, "--define", "bfloat16=%s" % str(enable_bfloat16).lower(), "--define", + "fp16=%s" % str(enable_fp16).lower(), + "--define", "rpcmem=%s" % str(enable_rpcmem).lower(), "--define", "hexagon=%s" % str(enable_hexagon).lower(),