From 9221eee951fd4bfe3e9d36024c02a576be33f147 Mon Sep 17 00:00:00 2001 From: lichao18 Date: Wed, 14 Oct 2020 18:23:10 +0800 Subject: [PATCH] Add armv8.2 fp16 for MobileNet --- CMakeLists.txt | 5 + docs/user_guide/advanced_usage.rst | 7 +- docs/user_guide/advanced_usage_cmake.rst | 9 +- mace/BUILD.bazel | 8 + mace/core/BUILD.bazel | 4 + mace/core/fp16.h | 97 +++++ mace/core/registry/op_delegator_registry.h | 10 + mace/core/registry/ops_registry.h | 22 + mace/core/tensor.h | 10 +- mace/core/types.cc | 7 +- mace/core/types.h | 3 +- mace/libmace/BUILD.bazel | 4 + mace/libmace/mace.cc | 46 +- mace/mace.bzl | 6 + mace/ops/BUILD.bazel | 24 + mace/ops/CMakeLists.txt | 6 + mace/ops/arm/base/common_neon.h | 34 ++ mace/ops/arm/base/conv_2d_1x1.cc | 4 + mace/ops/arm/base/conv_2d_3x3.cc | 8 + mace/ops/arm/base/depthwise_conv_2d_3x3.cc | 20 + mace/ops/arm/base/depthwise_conv_2d_3x3.h | 1 + mace/ops/arm/base/gemm.cc | 28 +- mace/ops/arm/fp16/conv_2d_3x3.cc | 367 ++++++++++++++++ mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc | 410 ++++++++++++++++++ mace/ops/conv_2d.cc | 2 +- mace/ops/depthwise_conv2d.cc | 2 + mace/ops/pooling.cc | 2 + mace/ops/ref/activation.cc | 4 + mace/ops/ref/bias_add.cc | 3 + mace/ops/ref/conv_2d.cc | 3 + mace/ops/ref/depthwise_conv_2d.cc | 4 + mace/ops/ref/gemm.cc | 3 + mace/ops/ref/gemv.cc | 3 + mace/ops/softmax.cc | 2 + mace/ops/squeeze.cc | 1 + test/ccbenchmark/BUILD.bazel | 4 + .../ccbenchmark/mace/ops/conv_2d_benchmark.cc | 7 + .../mace/ops/depthwise_conv2d_benchmark.cc | 7 + test/ccunit/BUILD.bazel | 8 + test/ccunit/mace/ops/arm/arm82/gemm_test.cc | 106 +++++ test/ccunit/mace/ops/conv_2d_test.cc | 68 +++ test/ccunit/mace/ops/depthwise_conv2d_test.cc | 66 +++ tools/bazel_adb_run.py | 6 + tools/converter.py | 12 + tools/python/utils/config_parser.py | 2 + tools/sh_commands.py | 3 + 46 files changed, 1438 insertions(+), 20 deletions(-) create mode 100644 mace/core/fp16.h create mode 100644 mace/ops/arm/fp16/conv_2d_3x3.cc create mode 100644 mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc create mode 100644 test/ccunit/mace/ops/arm/arm82/gemm_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ddf861c..c284a9ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF) option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF) option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF) option(MACE_ENABLE_BFLOAT16 "whether to enable bfloat16 support" OFF) +option(MACE_ENABLE_FP16 "whether to enable armv8.2 fp16 support" OFF) option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF) option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF) option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON) @@ -121,6 +122,10 @@ if(MACE_ENABLE_BFLOAT16) add_definitions(-DMACE_ENABLE_BFLOAT16) endif(MACE_ENABLE_BFLOAT16) +if(MACE_ENABLE_FP16) + add_definitions(-DMACE_ENABLE_FP16) +endif(MACE_ENABLE_FP16) + if(MACE_ENABLE_OBFUSCATE) add_definitions(-DMACE_OBFUSCATE_LITERALS) endif(MACE_ENABLE_OBFUSCATE) diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 8d8c50df..d05a1921 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -85,7 +85,7 @@ in one deployment file. * - runtime - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32. + - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32, fp16_fp16] for CPU, default is fp16_fp32. * - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. * - input_data_formats @@ -584,9 +584,10 @@ Therefore, the default storage type for a regular model in MACE is half. However if the model is very sensitive to accuracy, storage type can be changed to float. In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``, -for CPU it can also be changed to ``bf16_fp32``. +for CPU it can also be changed to ``bf16_fp32`` and ``fp16_fp16``(``fp16_fp16`` can only be used on armv8.2 or higher version). -For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float. +For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float,85G +and ``fp16_fp16`` means that the weights are saved in half and actual inference is in half. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. diff --git a/docs/user_guide/advanced_usage_cmake.rst b/docs/user_guide/advanced_usage_cmake.rst index a3d2d489..62e5f555 100644 --- a/docs/user_guide/advanced_usage_cmake.rst +++ b/docs/user_guide/advanced_usage_cmake.rst @@ -63,7 +63,7 @@ There are many advanced options supported. * - runtime - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32. + - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32, fp16_fp16] for CPU, default is fp16_fp32. * - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. * - input_data_formats @@ -439,9 +439,12 @@ Therefore, the default storage type for a regular model in MACE is half. However if the model is very sensitive to accuracy, storage type can be changed to float. In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``, -for CPU it can also be changed to ``bf16_fp32``. +for CPU it can also be changed to ``bf16_fp32`` and ``fp16_fp16``(``fp16_fp16`` can only be used on armv8.2 or higher version). + +For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float, +while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float, +and ``fp16_fp16`` means that the weights are saved in half and actual inference is in half. -For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. diff --git a/mace/BUILD.bazel b/mace/BUILD.bazel index 3dbccfa5..7dcbcd65 100644 --- a/mace/BUILD.bazel +++ b/mace/BUILD.bazel @@ -132,6 +132,14 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "fp16_enabled", + define_values = { + "fp16": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "rpcmem_enabled", define_values = { diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel index f418e814..24b27b76 100644 --- a/mace/core/BUILD.bazel +++ b/mace/core/BUILD.bazel @@ -10,6 +10,7 @@ load( "if_android_armv7", "if_apu_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_hexagon_enabled", "if_hexagon_or_hta_enabled", "if_hta_enabled", @@ -86,6 +87,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/mace/core/fp16.h b/mace/core/fp16.h new file mode 100644 index 00000000..6ab4a5d6 --- /dev/null +++ b/mace/core/fp16.h @@ -0,0 +1,97 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_FP16_H_ +#define MACE_CORE_FP16_H_ + +#ifdef MACE_ENABLE_FP16 + +#include +#include +#include +#include + + +namespace std { +inline float fabs(const float16_t &value) { + return fabs(static_cast(value)); +} + +inline float abs(const float16_t &value) { + return abs(static_cast(value)); +} + +inline float sqrt(const float16_t &value) { + return sqrt(static_cast(value)); +} + +inline float log(const float16_t &value) { + return log(static_cast(value)); +} + +inline float tanh(const float16_t &value) { + return tanh(static_cast(value)); +} + +inline float exp(const float16_t &value) { + return exp(static_cast(value)); +} + +inline int ceil(const float16_t &value) { + return ceil(static_cast(value)); +} + +inline int floor(const float16_t &value) { + return floor(static_cast(value)); +} + +inline float max(const float16_t &a, const float &b) { + return max(static_cast(a), b); +} + +inline float max(const float &a, const float16_t &b) { + return max(a, static_cast(b)); +} + +inline float min(const float16_t &a, const float &b) { + return min(static_cast(a), b); +} + +inline float min(const float &a, const float16_t &b) { + return min(a, static_cast(b)); +} + +inline float pow(const float16_t &a, const float16_t &b) { + return pow(static_cast(a), static_cast(b)); +} + +inline float pow(const float16_t &a, const float &b) { + return pow(static_cast(a), b); +} + +inline float pow(const float &a, const float16_t &b) { + return pow(a, static_cast(b)); +} + +inline ostream &operator<<(ostream &ss, // NOLINT + const float16_t &value) { + return ss << static_cast(value); +} + +} // namespace std + + +#endif // MACE_ENABLE_FP16 + +#endif // MACE_CORE_FP16_H_ diff --git a/mace/core/registry/op_delegator_registry.h b/mace/core/registry/op_delegator_registry.h index 0f72c78d..0fdc5de7 100644 --- a/mace/core/registry/op_delegator_registry.h +++ b/mace/core/registry/op_delegator_registry.h @@ -22,6 +22,7 @@ #include #include "mace/core/bfloat16.h" +#include "mace/core/fp16.h" #include "mace/core/ops/op_delegator.h" #include "mace/core/types.h" #include "mace/proto/mace.pb.h" @@ -101,6 +102,15 @@ class OpDelegatorRegistry { #endif // MACE_ENABLE_BFLOAT16 #endif // MACE_REGISTER_BF16_DELEGATOR +#ifndef MACE_REGISTER_FP16_DELEGATOR +#ifdef MACE_ENABLE_FP16 +#define MACE_REGISTER_FP16_DELEGATOR(registry, class_name, param_name, key) \ + MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) +#else +#define MACE_REGISTER_FP16_DELEGATOR(registry, class_name, param_name, key) +#endif // MACE_ENABLE_FP16 +#endif // MACE_REGISTER_FP16_DELEGATOR + #ifndef MACE_DEFINE_DELEGATOR_CREATOR #define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \ static std::unique_ptr Create( \ diff --git a/mace/core/registry/ops_registry.h b/mace/core/registry/ops_registry.h index fff4d6e1..5a534c34 100644 --- a/mace/core/registry/ops_registry.h +++ b/mace/core/registry/ops_registry.h @@ -23,6 +23,7 @@ #include #include "mace/core/bfloat16.h" +#include "mace/core/fp16.h" #include "mace/core/types.h" #include "mace/core/ops/operator.h" #include "mace/core/ops/op_condition_builder.h" @@ -102,6 +103,27 @@ class OpRegistry { #endif // MACE_ENABLE_BFLOAT16 #endif // MACE_REGISTER_BF16_OP_BY_CLASS +#ifndef MACE_REGISTER_FP16_OP +#ifdef MACE_ENABLE_FP16 +#define MACE_REGISTER_FP16_OP(op_registry, op_type, class_name, device) \ + MACE_REGISTER_OP(op_registry, op_type, class_name, device, float16_t) +#else +#define MACE_REGISTER_FP16_OP(op_registry, op_type, class_name, device) +#endif // MACE_ENABLE_FP16 +#endif // MACE_REGISTER_FP16_OP + +#ifndef MACE_REGISTER_FP16_OP_BY_CLASS +#ifdef MACE_ENABLE_FP16 +#define MACE_REGISTER_FP16_OP_BY_CLASS(op_registry, op_type, \ + class_name, device) \ + MACE_REGISTER_OP_BY_CLASS(op_registry, op_type, \ + class_name, device, float16_t) +#else +#define MACE_REGISTER_FP16_OP_BY_CLASS(op_registry, op_type, \ + class_name, device) +#endif // MACE_ENABLE_FP16 +#endif // MACE_REGISTER_FP16_OP_BY_CLASS + #ifdef MACE_ENABLE_OPENCL #define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \ op_registry->Register( \ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 1340f188..d5ba530b 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -46,7 +46,8 @@ namespace mace { break; \ } -#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) +#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) \ + || defined(MACE_ENABLE_FP16) #define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \ MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS)) #else @@ -60,6 +61,13 @@ namespace mace { #define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +#define MACE_TYPE_ENUM_SWITCH_CASE_FP16(STATEMENTS) \ + MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS)) +#else +#define MACE_TYPE_ENUM_SWITCH_CASE_FP16(STATEMENTS) +#endif // MACE_ENABLE_FP16 + #if MACE_ENABLE_OPENCL #define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \ MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS)) diff --git a/mace/core/types.cc b/mace/core/types.cc index 1decd905..1a33cabc 100644 --- a/mace/core/types.cc +++ b/mace/core/types.cc @@ -26,6 +26,7 @@ bool DataTypeCanUseMemcpy(DataType dt) { case DT_UINT8: case DT_INT32: case DT_BFLOAT16: + case DT_FLOAT16: return true; default: return false; @@ -38,7 +39,8 @@ std::string DataTypeToString(const DataType dt) { {DT_HALF, "DT_HALF"}, {DT_UINT8, "DT_UINT8"}, {DT_INT32, "DT_INT32"}, - {DT_BFLOAT16, "DT_BFLOAT16"}}; + {DT_BFLOAT16, "DT_BFLOAT16"}, + {DT_FLOAT16, "DT_FLOAT16"}}; MACE_CHECK(dt != DT_INVALID, "Not support Invalid data type"); return dtype_string_map[dt]; } @@ -49,7 +51,8 @@ size_t GetEnumTypeSize(const DataType dt) { return sizeof(float); case DT_HALF: return sizeof(half); -#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) +#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) || \ + defined(MACE_ENABLE_FP16) case DT_FLOAT16: return sizeof(float16_t); #endif diff --git a/mace/core/types.h b/mace/core/types.h index 4871c66b..d83e2571 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -55,7 +55,8 @@ struct EnumToDataType; }; MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF); -#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) +#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) \ + || defined(MACE_ENABLE_FP16) MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16); #endif #ifdef MACE_ENABLE_BFLOAT16 diff --git a/mace/libmace/BUILD.bazel b/mace/libmace/BUILD.bazel index f7935811..8ad3222b 100644 --- a/mace/libmace/BUILD.bazel +++ b/mace/libmace/BUILD.bazel @@ -13,6 +13,7 @@ load( "if_android_armv7", "if_apu_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_darwin", "if_hexagon_enabled", "if_hta_enabled", @@ -44,6 +45,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 3085b655..77e1daa7 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -846,9 +846,8 @@ MaceStatus MaceEngine::Impl::TransposeInput( } else { LOG(FATAL) << "Invalid net data type: " << net_data_type_; } -#ifdef MACE_ENABLE_BFLOAT16 // todo(lichao): add float16 macro - } else if (input_dt == DataType::DT_FLOAT16 || - input_dt == DataType::DT_BFLOAT16) { +#ifdef MACE_ENABLE_BFLOAT16 + } else if (input_dt == DataType::DT_BFLOAT16) { auto *input_data = input_tensor->mutable_data(); return ops::Transpose(thread_pool_.get(), input.second.data().get(), @@ -856,6 +855,16 @@ MaceStatus MaceEngine::Impl::TransposeInput( dst_dims, input_data); #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_FLOAT16) { + auto *input_data = input_tensor->mutable_data(); + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); +#endif // MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); return ops::Transpose(thread_pool_.get(), @@ -882,15 +891,23 @@ MaceStatus MaceEngine::Impl::TransposeInput( } else { LOG(FATAL) << "Invalid net data type: " << net_data_type_; } -#ifdef MACE_ENABLE_BFLOAT16 // todo(lichao): add float16 macro - } else if (input_dt == DataType::DT_FLOAT16 || - input_dt == DataType::DT_BFLOAT16) { +#ifdef MACE_ENABLE_BFLOAT16 + } else if (input_dt == DataType::DT_BFLOAT16) { auto input_data = input_tensor->mutable_data(); const float *data = input.second.data().get(); for (index_t i = 0; i < input_tensor->size(); ++i) { input_data[i] = data[i]; } #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_FLOAT16) { + auto input_data = input_tensor->mutable_data(); + const float *data = input.second.data().get(); + for (index_t i = 0; i < input_tensor->size(); ++i) { + input_data[i] = data[i]; + } +#endif // MACE_ENABLE_FP16 + } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); memcpy(input_data, input.second.data().get(), @@ -963,6 +980,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput( dst_dims, output->second.data().get()); #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (output_dt == DataType::DT_FLOAT16) { + auto output_data = output_tensor->data(); + return ops::Transpose(thread_pool_.get(), + output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); +#endif // MACE_ENABLE_FP16 } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; return MaceStatus::MACE_INVALID_ARGS; @@ -993,6 +1019,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput( data[i] = output_data[i]; } #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 + } else if (output_dt == DataType::DT_FLOAT16) { + const auto *output_data = output_tensor->data(); + float *data = output->second.data().get(); + for (index_t i = 0; i < output_tensor->size(); ++i) { + data[i] = output_data[i]; + } +#endif // MACE_ENABLE_FP16 } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; } diff --git a/mace/mace.bzl b/mace/mace.bzl index 007b8b65..16a01f8f 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -109,6 +109,12 @@ def if_bfloat16_enabled(a): "//conditions:default": [], }) +def if_fp16_enabled(a): + return select({ + "//mace:fp16_enabled": a, + "//conditions:default": [], + }) + def if_rpcmem_enabled(a, default_value = []): return select({ "//mace:rpcmem_enabled": a, diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 73a3cc20..fe2bede0 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -11,6 +11,7 @@ load( "if_android", "if_android_armv7", "if_bfloat16_enabled", + "if_fp16_enabled", "if_hexagon_enabled", "if_neon_enabled", "if_opencl_enabled", @@ -46,6 +47,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -85,6 +89,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -112,6 +119,10 @@ cc_library( [ "arm/bf16/*.cc", ], + )) + if_fp16_enabled(glob( + [ + "arm/fp16/*.cc", + ], )), hdrs = glob( [ @@ -126,6 +137,10 @@ cc_library( [ "arm/bf16/*.h", ], + )) + if_fp16_enabled(glob( + [ + "arm/fp16/*.h", + ], )), copts = [ "-Werror", @@ -142,6 +157,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -225,6 +243,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -265,6 +286,9 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), diff --git a/mace/ops/CMakeLists.txt b/mace/ops/CMakeLists.txt index c2f18a70..9665d948 100644 --- a/mace/ops/CMakeLists.txt +++ b/mace/ops/CMakeLists.txt @@ -14,6 +14,9 @@ file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS file(GLOB OPS_ARM_NEON_BF16_KERNELS_SRCS arm/bf16/*.cc ) +file(GLOB OPS_ARM_NEON_FP16_KERNELS_SRCS + arm/fp16/*.cc +) file(GLOB OPS_ARM_NEON_Q8_KERNELS_SRCS arm/q8/*.cc ) @@ -45,6 +48,9 @@ if(MACE_ENABLE_NEON) if(MACE_ENABLE_BFLOAT16) set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BF16_KERNELS_SRCS}) endif(MACE_ENABLE_BFLOAT16) + if(MACE_ENABLE_FP16) + set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP16_KERNELS_SRCS}) + endif(MACE_ENABLE_FP16) endif(MACE_ENABLE_NEON) if(MACE_ENABLE_OPENCL) diff --git a/mace/ops/arm/base/common_neon.h b/mace/ops/arm/base/common_neon.h index 5b6d7899..41c96ac3 100644 --- a/mace/ops/arm/base/common_neon.h +++ b/mace/ops/arm/base/common_neon.h @@ -152,6 +152,40 @@ inline void vst1o(float *ptr, float32x8_t v) { vst1q_f32(ptr + 4, v.val[1]); } +#if defined(MACE_ENABLE_AMR82) + +// load of 4D vector +inline float16x4_t vld1(const float16_t *ptr) { + return vld1_fp16(ptr); +} + +// store of 4D vector +inline void vst1(float16_t *ptr, float16x4_t v) { + vst1_fp16(ptr, v); +} + +// load of 8D vector +inline float16x8_t vld1q(const float16_t *ptr) { + return vld1q_fp16(ptr); +} + +// load of 2 8D vectors and perform de-interleaving +inline float16x8x2_t vld2q(const float16_t *ptr) { + return vld2q_fp16(ptr); +} + +// store of 8D vector +inline void vst1q(float16_t *ptr, const float16x8_t v) { + vst1q_fp16(ptr, v); +} + +// store of 2 8D vectors and perform interleaving +inline void vst2q(float16_t *ptr, const float16x8x2_t v) { + vst2q_fp16(ptr, v); +} + +#endif // MACE_ENABLE_FP16 + #if defined(MACE_ENABLE_BFLOAT16) // load of 2D vector diff --git a/mace/ops/arm/base/conv_2d_1x1.cc b/mace/ops/arm/base/conv_2d_1x1.cc index 40ca524e..d53bf1d8 100644 --- a/mace/ops/arm/base/conv_2d_1x1.cc +++ b/mace/ops/arm/base/conv_2d_1x1.cc @@ -101,6 +101,10 @@ void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) { registry, Conv2dK1x1, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, BFloat16, ImplType::NEON, K1x1)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2dK1x1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K1x1)); } } // namespace arm diff --git a/mace/ops/arm/base/conv_2d_3x3.cc b/mace/ops/arm/base/conv_2d_3x3.cc index cf232a77..8c21b8d6 100644 --- a/mace/ops/arm/base/conv_2d_3x3.cc +++ b/mace/ops/arm/base/conv_2d_3x3.cc @@ -36,6 +36,14 @@ void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { registry, Conv2dK3x3S2, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, BFloat16, ImplType::NEON, K3x3S2)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2dK3x3S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S1)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2dK3x3S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S2)); } } // namespace arm diff --git a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc index e17245e6..778386f1 100644 --- a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc @@ -20,6 +20,16 @@ namespace mace { namespace ops { namespace arm { +extern template +MaceStatus DepthwiseConv2dK3x3S1::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data); + +extern template +MaceStatus DepthwiseConv2dK3x3S2::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data); + namespace { template void DepthwiseConv2d3x3Pixel(const T *in_base, @@ -464,6 +474,16 @@ void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { delegator::DepthwiseConv2dParam, MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, BFloat16, ImplType::NEON, K3x3S2)); + MACE_REGISTER_FP16_DELEGATOR( + registry, DepthwiseConv2dK3x3S1, + delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S1)); + MACE_REGISTER_FP16_DELEGATOR( + registry, DepthwiseConv2dK3x3S2, + delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float16_t, ImplType::NEON, K3x3S2)); } } // namespace arm diff --git a/mace/ops/arm/base/depthwise_conv_2d_3x3.h b/mace/ops/arm/base/depthwise_conv_2d_3x3.h index c9edf26e..236ad590 100644 --- a/mace/ops/arm/base/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.h @@ -27,6 +27,7 @@ namespace mace { namespace ops { namespace arm { + template class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN { public: diff --git a/mace/ops/arm/base/gemm.cc b/mace/ops/arm/base/gemm.cc index 6274311a..cba09ecb 100644 --- a/mace/ops/arm/base/gemm.cc +++ b/mace/ops/arm/base/gemm.cc @@ -23,6 +23,24 @@ namespace mace { namespace ops { namespace arm { +extern template void Gemm::Pack8x4( + const MatrixMap &matrix, + MatrixMajor dst_major, float16_t *packed_matrix); +extern template void Gemm::Unpack8x8( + const float16_t *packed_output, MatrixMap *output); +extern template void Gemm::PackLhs( + const MatrixMap &lhs, float16_t *packed_lhs); +extern template void Gemm::PackRhs( + const MatrixMap &rhs, float16_t *packed_rhs); +extern template void Gemm::UnpackOutput( + const float16_t *packed_output, MatrixMap *output); +extern template MaceStatus Gemm::Compute( + const OpContext *context, const Tensor *lhs, const Tensor *rhs, + const index_t batch, const index_t rows, const index_t cols, + const index_t depth, const MatrixMajor lhs_major, + const MatrixMajor rhs_major, const MatrixMajor output_major, + const bool lhs_batched, const bool rhs_batched, Tensor *output); + template void Gemm::Pack4x4(const MatrixMap &matrix, MatrixMajor dst_major, T *packed_matrix) { @@ -681,9 +699,9 @@ MaceStatus Gemm::Compute( depth_padded, packed_output_data_block); MatrixMap output_block = output_matrix.block(start_row, - start_col, - row_block_len, - col_block_len); + start_col, + row_block_len, + col_block_len); UnpackOutput(packed_output_data_block, &output_block); } // col_block_idx } // row_block_idx @@ -701,6 +719,10 @@ void RegisterGemmDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Gemm, delegator::GemmParam, MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::NEON)); + + MACE_REGISTER_FP16_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::NEON)); } } // namespace arm diff --git a/mace/ops/arm/fp16/conv_2d_3x3.cc b/mace/ops/arm/fp16/conv_2d_3x3.cc new file mode 100644 index 00000000..f22efb66 --- /dev/null +++ b/mace/ops/arm/fp16/conv_2d_3x3.cc @@ -0,0 +1,367 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "mace/ops/arm/base/conv_2d_3x3.h" +#include "mace/ops/delegator/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { + +template<> +MaceStatus Conv2dK3x3S1::DoCompute( + const ConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 1 < p.out_channels) { + float16_t *out_ptr0_base = + output_data + b * p.out_batch_size + m * p.out_image_size; + float16_t *out_ptr1_base = + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { + const float16_t *in_ptr0 = + input_data + b * p.in_batch_size + c * p.in_image_size; + const float16_t + *filter_ptr0 = filter_data + m * p.in_channels * 9 + c * 9; + + float16_t *out_ptr1 = out_ptr1_base; + const float16_t *in_ptr1 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; + const float16_t *in_ptr2 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; + const float16_t *in_ptr3 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 3 * p.in_width; + const float16_t *filter_ptr1 = + filter_data + (m + 1) * p.in_channels * 9 + c * 9; + + float16_t *out_ptr0 = out_ptr0_base; + + // load filter (2 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + float16x8_t vf10, vf11; + vf00 = vld1q_f16(filter_ptr0); + vf01 = vld1q_f16(filter_ptr0 + 8); + + vf10 = vld1q_f16(filter_ptr1); + vf11 = vld1q_f16(filter_ptr1 + 8); + + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 8) { + // input (4 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02; // reg count: 14 + float16x8_t vi10, vi11, vi12; + float16x8_t vi20, vi21, vi22; + float16x8_t vi30, vi31, vi32; + float16x8_t vo20, vo30; // tmp use + + // output (4 outch x 2 height x 8 width): vo_outch_height + float16x8_t vo00, vo01; + float16x8_t vo10, vo11; + + // load input + vi00 = vld1q_f16(in_ptr0); + vo00 = vld1q_f16(in_ptr0 + 8); // reuse vo00: vi0n + vi10 = vld1q_f16(in_ptr1); + vo10 = vld1q_f16(in_ptr1 + 8); + vi20 = vld1q_f16(in_ptr2); + vo20 = vld1q_f16(in_ptr2 + 8); + vi30 = vld1q_f16(in_ptr3); + vo30 = vld1q_f16(in_ptr3 + 8); + + vi01 = vextq_f16(vi00, vo00, 1); + vi02 = vextq_f16(vi00, vo00, 2); + vi11 = vextq_f16(vi10, vo10, 1); + vi12 = vextq_f16(vi10, vo10, 2); + vi21 = vextq_f16(vi20, vo20, 1); + vi22 = vextq_f16(vi20, vo20, 2); + vi31 = vextq_f16(vi30, vo30, 1); + vi32 = vextq_f16(vi30, vo30, 2); + + // load ouptut + vo00 = vld1q_f16(out_ptr0); + vo01 = vld1q_f16(out_ptr0 + p.out_width); + vo10 = vld1q_f16(out_ptr1); + vo11 = vld1q_f16(out_ptr1 + p.out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f16(vo00, vi00, vf00, 0); // reg count: 18 + vo00 = vfmaq_laneq_f16(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f16(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f16(vo00, vi10, vf00, 3); + vo00 = vfmaq_laneq_f16(vo00, vi11, vf00, 4); + vo00 = vfmaq_laneq_f16(vo00, vi12, vf00, 5); + vo00 = vfmaq_laneq_f16(vo00, vi20, vf00, 6); + vo00 = vfmaq_laneq_f16(vo00, vi21, vf00, 7); + vo00 = vfmaq_laneq_f16(vo00, vi22, vf01, 0); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f16(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f16(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f16(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f16(vo01, vi20, vf00, 3); + vo01 = vfmaq_laneq_f16(vo01, vi21, vf00, 4); + vo01 = vfmaq_laneq_f16(vo01, vi22, vf00, 5); + vo01 = vfmaq_laneq_f16(vo01, vi30, vf00, 6); + vo01 = vfmaq_laneq_f16(vo01, vi31, vf00, 7); + vo01 = vfmaq_laneq_f16(vo01, vi32, vf01, 0); + + // outch 1, height 0 + vo10 = vfmaq_laneq_f16(vo10, vi00, vf10, 0); + vo10 = vfmaq_laneq_f16(vo10, vi01, vf10, 1); + vo10 = vfmaq_laneq_f16(vo10, vi02, vf10, 2); + vo10 = vfmaq_laneq_f16(vo10, vi10, vf10, 3); + vo10 = vfmaq_laneq_f16(vo10, vi11, vf10, 4); + vo10 = vfmaq_laneq_f16(vo10, vi12, vf10, 5); + vo10 = vfmaq_laneq_f16(vo10, vi20, vf10, 6); + vo10 = vfmaq_laneq_f16(vo10, vi21, vf10, 7); + vo10 = vfmaq_laneq_f16(vo10, vi22, vf11, 0); + + // outch 1, height 1 + vo11 = vfmaq_laneq_f16(vo11, vi10, vf10, 0); + vo11 = vfmaq_laneq_f16(vo11, vi11, vf10, 1); + vo11 = vfmaq_laneq_f16(vo11, vi12, vf10, 2); + vo11 = vfmaq_laneq_f16(vo11, vi20, vf10, 3); + vo11 = vfmaq_laneq_f16(vo11, vi21, vf10, 4); + vo11 = vfmaq_laneq_f16(vo11, vi22, vf10, 5); + vo11 = vfmaq_laneq_f16(vo11, vi30, vf10, 6); + vo11 = vfmaq_laneq_f16(vo11, vi31, vf10, 7); + vo11 = vfmaq_laneq_f16(vo11, vi32, vf11, 0); + + vst1q_f16(out_ptr0, vo00); + vst1q_f16(out_ptr0 + p.out_width, vo01); + vst1q_f16(out_ptr1, vo10); + vst1q_f16(out_ptr1 + p.out_width, vo11); + + in_ptr0 += 8; + in_ptr1 += 8; + in_ptr2 += 8; + in_ptr3 += 8; + + out_ptr0 += 8; + out_ptr1 += 8; + } // w + + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; + + out_ptr0 += p.out_width; + out_ptr1 += p.out_width; + } // h + } // c + } else { + for (index_t mm = m; mm < p.out_channels; ++mm) { + float16_t *out_ptr0_base = + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { + const float16_t *in_ptr0 = + input_data + b * p.in_batch_size + c * p.in_image_size; + const float16_t *in_ptr1 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; + const float16_t *in_ptr2 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; + const float16_t *in_ptr3 = + input_data + b * p.in_batch_size + c * p.in_image_size + + 3 * p.in_width; + const float16_t + *filter_ptr0 = filter_data + mm * p.in_channels * 9 + c * 9; + + float16_t *out_ptr0 = out_ptr0_base; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr0); + vf01 = vld1q_f16(filter_ptr0 + 8); + + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 8) { + // input (4 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02, vi0n; + float16x8_t vi10, vi11, vi12, vi1n; + float16x8_t vi20, vi21, vi22, vi2n; + float16x8_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 8 width): vo_outch_height + float16x8_t vo00, vo01; + + // load input + vi00 = vld1q_f16(in_ptr0); + vi0n = vld1q_f16(in_ptr0 + 8); + vi10 = vld1q_f16(in_ptr1); + vi1n = vld1q_f16(in_ptr1 + 8); + vi20 = vld1q_f16(in_ptr2); + vi2n = vld1q_f16(in_ptr2 + 8); + vi30 = vld1q_f16(in_ptr3); + vi3n = vld1q_f16(in_ptr3 + 8); + + vi01 = vextq_f16(vi00, vi0n, 1); + vi02 = vextq_f16(vi00, vi0n, 2); + vi11 = vextq_f16(vi10, vi1n, 1); + vi12 = vextq_f16(vi10, vi1n, 2); + vi21 = vextq_f16(vi20, vi2n, 1); + vi22 = vextq_f16(vi20, vi2n, 2); + vi31 = vextq_f16(vi30, vi3n, 1); + vi32 = vextq_f16(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f16(out_ptr0); + vo01 = vld1q_f16(out_ptr0 + p.out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f16(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f16(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f16(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f16(vo00, vi10, vf00, 3); + vo00 = vfmaq_laneq_f16(vo00, vi11, vf00, 4); + vo00 = vfmaq_laneq_f16(vo00, vi12, vf00, 5); + vo00 = vfmaq_laneq_f16(vo00, vi20, vf00, 6); + vo00 = vfmaq_laneq_f16(vo00, vi21, vf00, 7); + vo00 = vfmaq_laneq_f16(vo00, vi22, vf01, 0); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f16(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f16(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f16(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f16(vo01, vi20, vf00, 3); + vo01 = vfmaq_laneq_f16(vo01, vi21, vf00, 4); + vo01 = vfmaq_laneq_f16(vo01, vi22, vf00, 5); + vo01 = vfmaq_laneq_f16(vo01, vi30, vf00, 6); + vo01 = vfmaq_laneq_f16(vo01, vi31, vf00, 7); + vo01 = vfmaq_laneq_f16(vo01, vi32, vf01, 0); + + vst1q_f16(out_ptr0, vo00); + vst1q_f16(out_ptr0 + p.out_width, vo01); + + in_ptr0 += 8; + in_ptr1 += 8; + in_ptr2 += 8; + in_ptr3 += 8; + + out_ptr0 += 8; + } // w + + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; + + out_ptr0 += p.out_width; + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, p.batch, 1, 0, p.out_channels, 2); + + return MaceStatus::MACE_SUCCESS; +} + +template<> +MaceStatus Conv2dK3x3S2::DoCompute( + const ConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t c = 0; c < p.in_channels; ++c) { + const float16_t + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; + const float16_t *filter_ptr = + filter_data + m * p.in_channels * 9 + c * 9; + float16_t *out_base = + output_data + b * p.out_batch_size + m * p.out_image_size; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr); + vf01 = vld1q_f16(filter_ptr + 8); + + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 7 < p.out_width; w += 8) { + float16x8x2_t vi0, vi1, vi2; + float16x8_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02; + float16x8_t vi10, vi11, vi12; + float16x8_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 8 width): vo + float16x8_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * p.in_width + in_w; + vi0 = vld2q_f16(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f16(in_base + in_offset + p.in_width); + vi2 = vld2q_f16(in_base + in_offset + 2 * p.in_width); + + vi0n = vld1q_f16(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f16(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f16(in_base + in_offset + 2 * p.in_width + 8); + + // load ouptut + index_t out_offset = h * p.out_width + w; + vo = vld1q_f16(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f16(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f16(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f16(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f16(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f16(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f16(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f16(vo, vi10, vf00, 3); + vo = vfmaq_laneq_f16(vo, vi11, vf00, 4); + vo = vfmaq_laneq_f16(vo, vi12, vf00, 5); + vo = vfmaq_laneq_f16(vo, vi20, vf00, 6); + vo = vfmaq_laneq_f16(vo, vi21, vf00, 7); + vo = vfmaq_laneq_f16(vo, vi22, vf01, 0); + + vst1q_f16(out_base + out_offset, vo); + } // w + } // h + } // c + } // m + } // b + }, 0, p.batch, 1, 0, p.out_channels, 1); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc new file mode 100644 index 00000000..7455bce8 --- /dev/null +++ b/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc @@ -0,0 +1,410 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +template +void DepthwiseConv2d3x3Pixel(const float16_t *in_base, + const float16_t *filter, + const index_t out_h, + const index_t out_w, + const index_t in_h_start, + const index_t in_w_start, + const index_t out_width, + const index_t in_height, + const index_t in_width, + float16_t *out_base) { + const index_t filter_width = 3; + float sum = 0.0f; + + index_t in_h = in_h_start; + const float16_t *in = in_base + in_h * in_width; + const float16_t *filter_ptr = filter; + if (in_h >= 0 && in_h < in_height) { + index_t in_w = in_w_start; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[0]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[1]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[2]; + } + } + in_h++; + in += in_width; + filter_ptr += filter_width; + if (in_h >= 0 && in_h < in_height) { + index_t in_w = in_w_start; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[0]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[1]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[2]; + } + } + in_h++; + in += in_width; + filter_ptr += filter_width; + if (in_h >= 0 && in_h < in_height) { + index_t in_w = in_w_start; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[0]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[1]; + } + in_w++; + if (in_w >= 0 && in_w < in_width) { + sum += in[in_w] * filter_ptr[2]; + } + } + out_base[out_h * out_width + out_w] = static_cast(sum); +} + +template<> +MaceStatus DepthwiseConv2dK3x3S1::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + const index_t c = m / p.multiplier; + const index_t multi_index = m % p.multiplier; + auto filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9; + auto in_base = input_data + b * p.in_batch_size + c * p.in_image_size; + auto out_base = output_data + b * p.out_batch_size + + m * p.out_image_size; + index_t h, w; + + // top + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr); + vf01 = vld1q_f16(filter_ptr + 8); + + for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) { + // left + for (w = 0; w < p.valid_w_start; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + + for (w = p.valid_w_start; w + 7 < p.valid_w_stop; w += 8) { + // input (4 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02, vi0n; + float16x8_t vi10, vi11, vi12, vi1n; + float16x8_t vi20, vi21, vi22, vi2n; + float16x8_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 8 width): vo_outch_height + float16x8_t vo00, vo01; + + // load input + index_t in_h = h - p.pad_top; + index_t in_w = w - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; + vi00 = vld1q_f16(in_base + in_offset); + vi0n = vld1q_f16(in_base + in_offset + 8); + vi10 = vld1q_f16(in_base + in_offset + p.in_width); + vi1n = vld1q_f16(in_base + in_offset + p.in_width + 8); + vi20 = vld1q_f16(in_base + in_offset + 2 * p.in_width); + vi2n = vld1q_f16(in_base + in_offset + 2 * p.in_width + 8); + vi30 = vld1q_f16(in_base + in_offset + 3 * p.in_width); + vi3n = vld1q_f16(in_base + in_offset + 3 * p.in_width + 8); + + vi01 = vextq_f16(vi00, vi0n, 1); + vi02 = vextq_f16(vi00, vi0n, 2); + vi11 = vextq_f16(vi10, vi1n, 1); + vi12 = vextq_f16(vi10, vi1n, 2); + vi21 = vextq_f16(vi20, vi2n, 1); + vi22 = vextq_f16(vi20, vi2n, 2); + vi31 = vextq_f16(vi30, vi3n, 1); + vi32 = vextq_f16(vi30, vi3n, 2); + + // load ouptut + index_t out_offset = h * p.out_width + w; + vo00 = vld1q_f16(out_base + out_offset); + vo01 = vld1q_f16(out_base + out_offset + p.out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f16(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f16(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f16(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f16(vo00, vi10, vf00, 3); + vo00 = vfmaq_laneq_f16(vo00, vi11, vf00, 4); + vo00 = vfmaq_laneq_f16(vo00, vi12, vf00, 5); + vo00 = vfmaq_laneq_f16(vo00, vi20, vf00, 6); + vo00 = vfmaq_laneq_f16(vo00, vi21, vf00, 7); + vo00 = vfmaq_laneq_f16(vo00, vi22, vf01, 0); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f16(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f16(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f16(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f16(vo01, vi20, vf00, 3); + vo01 = vfmaq_laneq_f16(vo01, vi21, vf00, 4); + vo01 = vfmaq_laneq_f16(vo01, vi22, vf00, 5); + vo01 = vfmaq_laneq_f16(vo01, vi30, vf00, 6); + vo01 = vfmaq_laneq_f16(vo01, vi31, vf00, 7); + vo01 = vfmaq_laneq_f16(vo01, vi32, vf01, 0); + vst1q_f16(out_base + out_offset, vo00); + vst1q_f16(out_base + out_offset + p.out_width, vo01); + } // w + + // right + for (; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } // h + + // bottom + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + } // m + } // b + }, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool + + return MaceStatus::MACE_SUCCESS; +} + +template<> +MaceStatus DepthwiseConv2dK3x3S2::DoCompute( + const DepthwiseConvComputeParam &p, const float16_t *filter_data, + const float16_t *input_data, float16_t *output_data) { + p.thread_pool.Compute2D( + [=](index_t start0, index_t end0, index_t step0, index_t start1, + index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t c = m / p.multiplier; + index_t multi_index = m % p.multiplier; + auto filter_ptr = filter_data + multi_index * p.in_channels * 9 + + c * 9; + auto in_base = input_data + b * p.in_batch_size + + c * p.in_image_size; + auto out_base = output_data + b * p.out_batch_size + + m * p.out_image_size; + index_t h, w; + + // top + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float16x8_t vf00, vf01; + vf00 = vld1q_f16(filter_ptr); + vf01 = vld1q_f16(filter_ptr + 8); + + for (h = p.valid_h_start; h < p.valid_h_stop; ++h) { + // left + for (w = 0; w < p.valid_w_start; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + + for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 8) { + float16x8x2_t vi0, vi1, vi2; + float16x8_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float16x8_t vi00, vi01, vi02; + float16x8_t vi10, vi11, vi12; + float16x8_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 8 width): vo + float16x8_t vo; + + // load input + index_t in_h = h * 2 - p.pad_top; + index_t in_w = w * 2 - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; + vi0 = vld2q_f16(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f16(in_base + in_offset + p.in_width); + vi2 = vld2q_f16(in_base + in_offset + 2 * p.in_width); + + vi0n = vld1q_f16(in_base + in_offset + 16); // [8.9.10.11] + vi1n = vld1q_f16(in_base + in_offset + p.in_width + 16); + vi2n = vld1q_f16(in_base + in_offset + 2 * p.in_width + 16); + + // load ouptut + index_t out_offset = h * p.out_width + w; + vo = vld1q_f16(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f16(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f16(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f16(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f16(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f16(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f16(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f16(vo, vi10, vf00, 3); + vo = vfmaq_laneq_f16(vo, vi11, vf00, 4); + vo = vfmaq_laneq_f16(vo, vi12, vf00, 5); + vo = vfmaq_laneq_f16(vo, vi20, vf00, 6); + vo = vfmaq_laneq_f16(vo, vi21, vf00, 7); + vo = vfmaq_laneq_f16(vo, vi22, vf01, 0); + vst1q_f16(out_base + out_offset, vo); + } // w + + // right + for (; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } // h + + // bottom + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { + DepthwiseConv2d3x3Pixel(in_base, + filter_ptr, + h, + w, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, + out_base); + } + } + } // m + } // b + }, + 0, p.batch, 1, 0, p.out_channels, 1); + + return MaceStatus::MACE_SUCCESS; +} + + +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 26b4d02a..b56832de 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -507,7 +507,7 @@ class Conv2dOp : public ConvPool2dOpBase { void RegisterConv2D(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU); - + MACE_REGISTER_FP16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, uint8_t); diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 3f0d7d64..96c3f38d 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -406,6 +406,8 @@ void RegisterDepthwiseConv2d(OpRegistry *op_registry) { DepthwiseConv2dOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 96c4e85f..3b95410e 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -518,6 +518,8 @@ void RegisterPooling(OpRegistry *op_registry) { DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Pooling", PoolingOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, diff --git a/mace/ops/ref/activation.cc b/mace/ops/ref/activation.cc index c9402709..cb3ec86e 100644 --- a/mace/ops/ref/activation.cc +++ b/mace/ops/ref/activation.cc @@ -130,6 +130,10 @@ void RegisterActivationDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Activation, delegator::ActivationParam, MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc index eb65f80c..19552f17 100644 --- a/mace/ops/ref/bias_add.cc +++ b/mace/ops/ref/bias_add.cc @@ -152,6 +152,9 @@ void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, BiasAdd, DelegatorParam, MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index 0bfec1ed..992b2272 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -131,6 +131,9 @@ void RegisterConv2dDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Conv2d, delegator::Conv2dParam, MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Conv2d, delegator::Conv2dParam, + MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/depthwise_conv_2d.cc b/mace/ops/ref/depthwise_conv_2d.cc index 19aa73ef..a52aa02e 100644 --- a/mace/ops/ref/depthwise_conv_2d.cc +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -137,6 +137,10 @@ void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry) { registry, DepthwiseConv2d, delegator::DepthwiseConv2dParam, MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, DepthwiseConv2d, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, + float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/gemm.cc b/mace/ops/ref/gemm.cc index 21a939a0..321db87a 100644 --- a/mace/ops/ref/gemm.cc +++ b/mace/ops/ref/gemm.cc @@ -156,6 +156,9 @@ void RegisterGemmDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Gemm, delegator::GemmParam, MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/ref/gemv.cc b/mace/ops/ref/gemv.cc index 05b04f3c..dfdd02c5 100644 --- a/mace/ops/ref/gemv.cc +++ b/mace/ops/ref/gemv.cc @@ -92,6 +92,9 @@ void RegisterGemvDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_BF16_DELEGATOR( registry, Gemv, DelegatorParam, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, BFloat16, ImplType::REF)); + MACE_REGISTER_FP16_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float16_t, ImplType::REF)); } } // namespace ref diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index c233cba9..762db324 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -526,6 +526,8 @@ void RegisterSoftmax(OpRegistry *op_registry) { DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Softmax", SoftmaxOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 75afc926..b2d5d91c 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -81,6 +81,7 @@ class SqueezeOp : public SqueezeOpRaw { void RegisterSqueeze(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU); + MACE_REGISTER_FP16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE diff --git a/test/ccbenchmark/BUILD.bazel b/test/ccbenchmark/BUILD.bazel index ea5357ca..3bfe8f48 100644 --- a/test/ccbenchmark/BUILD.bazel +++ b/test/ccbenchmark/BUILD.bazel @@ -11,6 +11,7 @@ load( "if_hexagon_enabled", "if_neon_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_opencl_enabled", "if_quantize_enabled", ) @@ -61,6 +62,9 @@ cc_test( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), diff --git a/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc b/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc index 84ad04d3..7c31764f 100644 --- a/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc +++ b/test/ccbenchmark/mace/ops/conv_2d_benchmark.cc @@ -181,6 +181,12 @@ void Conv2d(int iters, #else #define MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +#define MACE_BM_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \ + MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float16_t, CPU) +#else +#define MACE_BM_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) +#endif // MACE_ENABLE_FP16 #ifdef MACE_ENABLE_OPENCL #define MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \ MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \ @@ -193,6 +199,7 @@ void Conv2d(int iters, MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ MACE_BM_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \ MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \ + MACE_BM_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \ MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC) // Filter sizes and data alignments diff --git a/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc b/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc index 3abf5067..f159a783 100644 --- a/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc +++ b/test/ccbenchmark/mace/ops/depthwise_conv2d_benchmark.cc @@ -140,6 +140,12 @@ void DepthwiseConv2d(int iters, #else #define MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M) #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +#define MACE_BM_DEPTHWISE_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, P, M) \ + MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float16_t, CPU) +#else +#define MACE_BM_DEPTHWISE_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, P, M) +#endif // MACE_ENABLE_FP16 #ifdef MACE_ENABLE_OPENCL #define MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M) \ MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \ @@ -152,6 +158,7 @@ void DepthwiseConv2d(int iters, MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ MACE_BM_DEPTHWISE_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, P, M); \ MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M); \ + MACE_BM_DEPTHWISE_CONV_2D_FP16_MACRO(N, C, H, W, KH, KW, S, P, M); \ MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M) MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); diff --git a/test/ccunit/BUILD.bazel b/test/ccunit/BUILD.bazel index 50d12d4e..22882fee 100644 --- a/test/ccunit/BUILD.bazel +++ b/test/ccunit/BUILD.bazel @@ -12,6 +12,7 @@ load( "if_hta_enabled", "if_neon_enabled", "if_bfloat16_enabled", + "if_fp16_enabled", "if_opencl_enabled", "if_quantize_enabled", ) @@ -42,6 +43,10 @@ cc_test( [ "mace/ops/arm/bf16/*.cc", ] + )) + if_fp16_enabled(glob( + [ + "mace/ops/arm/fp16/*.cc", + ] )) + if_opencl_enabled(glob( [ "mace/ops/opencl/*.cc", @@ -66,6 +71,9 @@ cc_test( "-DMACE_ENABLE_QUANTIZE", ]) + if_bfloat16_enabled([ "-DMACE_ENABLE_BFLOAT16", + ]) + if_fp16_enabled([ + "-DMACE_ENABLE_FP16", + "-march=armv8.2-a+fp16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/test/ccunit/mace/ops/arm/arm82/gemm_test.cc b/test/ccunit/mace/ops/arm/arm82/gemm_test.cc new file mode 100644 index 00000000..fd296b85 --- /dev/null +++ b/test/ccunit/mace/ops/arm/arm82/gemm_test.cc @@ -0,0 +1,106 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/delegator/gemm.h" + +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/ops_test_util.h" +#include "mace/ops/testing/test_utils.h" + +namespace mace { +namespace ops { +namespace test { + +void TestGemmFloat16(const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched) { + Tensor lhs(GetCPUAllocator(), DT_FLOAT16); + Tensor rhs(GetCPUAllocator(), DT_FLOAT16); + Tensor output(GetCPUAllocator(), DT_FLOAT16); + lhs.Resize({lhs_batched ? batch : 1, rows, depth}); + rhs.Resize({rhs_batched ? batch : 1, depth, cols}); + output.Resize({batch, rows, cols}); + { + Tensor::MappingGuard lhs_guard(&lhs); + Tensor::MappingGuard rhs_guard(&rhs); + auto lhs_data = lhs.mutable_data(); + auto rhs_data = rhs.mutable_data(); + auto output_data = output.mutable_data(); + GenerateRandomRealTypeData(lhs.shape(), lhs_data); + GenerateRandomRealTypeData(rhs.shape(), rhs_data); + GenerateRandomRealTypeData(output.shape(), output_data); + } + + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpsTestNet net; + OpContext context(net.ws(), &cpu_device); + std::unique_ptr gemm = delegator::Gemm::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::NEON), + delegator::GemmParam()); + gemm->Compute(&context, &lhs, &rhs, batch, rows, cols, depth, lhs_major, + rhs_major, output_major, lhs_batched, rhs_batched, &output); + + Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT16); + expected_output.Resize({batch, rows, cols}); + std::unique_ptr gemm_ref = delegator::Gemm::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::REF), + delegator::GemmParam()); + gemm_ref->Compute(&context, &lhs, &rhs, batch, rows, cols, depth, lhs_major, + rhs_major, output_major, lhs_batched, rhs_batched, + &expected_output); + + ExpectTensorSimilar(expected_output, output, 1e-4); +} + +TEST(ArmGemm, TestGemmFP16) { + TestGemmFloat16(1, 47, 69, 37, RowMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, RowMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, RowMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, RowMajor, ColMajor, ColMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(1, 47, 69, 37, ColMajor, ColMajor, ColMajor, true, true); + + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, RowMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, RowMajor, ColMajor, ColMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, RowMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, RowMajor, ColMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, ColMajor, RowMajor, true, true); + TestGemmFloat16(3, 47, 69, 37, ColMajor, ColMajor, ColMajor, true, true); + + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, RowMajor, true, false); + TestGemmFloat16(3, 47, 69, 37, RowMajor, RowMajor, RowMajor, false, true); + + TestGemmFloat16(16, 31, 61, 67, RowMajor, ColMajor, RowMajor, true, true); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/test/ccunit/mace/ops/conv_2d_test.cc b/test/ccunit/mace/ops/conv_2d_test.cc index 6d04d05e..aebca907 100644 --- a/test/ccunit/mace/ops/conv_2d_test.cc +++ b/test/ccunit/mace/ops/conv_2d_test.cc @@ -1435,6 +1435,74 @@ TEST_F(Conv2dOpTest, BFloat16) { TestBFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {3, 3}); } #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +namespace { +void TestFloat16(const index_t batch, + const index_t out_channels, + const index_t in_channels, + const index_t in_height, + const index_t in_width, + const index_t k_height, + const index_t k_width, + enum Padding padding_type, + const std::vector &strides) { + OpsTestNet net; + net.AddRandomInput( + "Input", {batch, in_channels, in_height, in_width}); + net.AddRandomInput( + "Filter", {out_channels, in_channels, k_height, k_width}, true); + net.AddRandomInput("Bias", {out_channels}, true); + net.Cast("Input", "FP16Input"); + net.Cast("Filter", "FP16Filter"); + net.Cast("Bias", "FP16Bias"); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + OpDefBuilder("Conv2D", "FP16Conv2dTest") + .Input("FP16Input") + .Input("FP16Filter") + .Input("FP16Bias") + .Output("FP16Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT16)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + net.Cast("FP16Output", "CastOutput"); + + ExpectTensorSimilar(*net.GetOutput("Output"), + *net.GetTensor("CastOutput"), 1e-4); +} +} // namespace + +TEST_F(Conv2dOpTest, float16_t) { + TestFloat16(1, 128, 64, 32, 32, 1, 1, VALID, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, VALID, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, FULL, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 3, 3, SAME, {2, 2}); + TestFloat16(1, 129, 63, 33, 31, 3, 3, SAME, {1, 1}); + TestFloat16(9, 128, 64, 32, 32, 3, 3, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 1, 5, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 5, 5, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 5, 1, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {1, 1}); + TestFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {2, 2}); + TestFloat16(1, 128, 64, 32, 32, 7, 7, SAME, {3, 3}); +} +#endif // MACE_ENABLE_FP16 } // namespace test } // namespace ops } // namespace mace diff --git a/test/ccunit/mace/ops/depthwise_conv2d_test.cc b/test/ccunit/mace/ops/depthwise_conv2d_test.cc index f3585fcd..30fc56f5 100644 --- a/test/ccunit/mace/ops/depthwise_conv2d_test.cc +++ b/test/ccunit/mace/ops/depthwise_conv2d_test.cc @@ -560,6 +560,72 @@ TEST_F(DepthwiseConv2dOpTest, BFloat16) { #endif // MACE_ENABLE_BFLOAT16 +#ifdef MACE_ENABLE_FP16 +namespace { +void TestFloat16(const index_t batch, + const index_t multiplier, + const index_t in_channels, + const index_t in_height, + const index_t in_width, + const index_t k_height, + const index_t k_width, + enum Padding padding_type, + const std::vector &strides) { + OpsTestNet net; + const index_t out_channels = multiplier * in_channels; + net.AddRandomInput( + "Input", {batch, in_channels, in_height, in_width}, false, false); + net.AddRandomInput( + "Filter", {multiplier, in_channels, k_height, k_width}, true, false); + net.AddRandomInput("Bias", {out_channels}, true); + net.Cast("Input", "FP16Input"); + net.Cast("Filter", "FP16Filter"); + net.Cast("Bias", "FP16Bias"); + + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + OpDefBuilder("DepthwiseConv2d", "FP16DepthwiseConv2DTest") + .Input("FP16Input") + .Input("FP16Filter") + .Input("FP16Bias") + .Output("FP16Output") + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding_type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DT_FLOAT16)) + .Finalize(net.NewOperatorDef()); + net.RunOp(CPU); + + net.Cast("FP16Output", "CastOutput"); + + ExpectTensorSimilar(*net.GetOutput("Output"), + *net.GetTensor("CastOutput"), 1e-4); +} +} // namespace + +TEST_F(DepthwiseConv2dOpTest, float16_t) { + TestFloat16(1, 1, 1024, 7, 7, 3, 3, VALID, {1, 1}); + TestFloat16(1, 1, 1024, 7, 7, 3, 3, SAME, {1, 1}); + TestFloat16(1, 1, 1024, 7, 7, 3, 3, FULL, {1, 1}); + TestFloat16(1, 2, 1024, 7, 7, 3, 3, SAME, {1, 1}); + TestFloat16(1, 2, 1024, 7, 7, 3, 3, SAME, {2, 2}); + TestFloat16(1, 1, 512, 14, 14, 3, 3, SAME, {1, 1}); + TestFloat16(1, 1, 512, 14, 13, 5, 5, SAME, {2, 2}); + TestFloat16(1, 1, 256, 28, 28, 3, 3, SAME, {1, 1}); + TestFloat16(1, 1, 128, 56, 56, 3, 3, SAME, {2, 2}); + TestFloat16(3, 1, 128, 56, 56, 3, 3, SAME, {2, 2}); +} +#endif // MACE_ENABLE_FP16 } // namespace test } // namespace ops } // namespace mace diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 7526314b..857ab2cf 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -100,6 +100,11 @@ def parse_args(): type=str2bool, default=True, help="Whether to use bfloat16") + parser.add_argument( + "--enable_fp16", + type=str2bool, + default=False, + help="Whether to use armv8.2") parser.add_argument( "--enable_rpcmem", type=str2bool, @@ -180,6 +185,7 @@ def main(unused_args): enable_neon=FLAGS.enable_neon, enable_quantize=FLAGS.enable_quantize, enable_bfloat16=FLAGS.enable_bfloat16, + enable_fp16=FLAGS.enable_fp16, enable_rpcmem=FLAGS.enable_rpcmem, enable_hta=FLAGS.enable_hta, address_sanitizer=FLAGS.address_sanitizer, diff --git a/tools/converter.py b/tools/converter.py index e7a5f05d..d15156ee 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -89,6 +89,7 @@ FPDataTypeStrs = [ "fp16_fp32", "fp32_fp32", "bf16_fp32", + "fp16_fp16", ] FPDataType = Enum('GPUDataType', [(ele, ele) for ele in FPDataTypeStrs], @@ -184,6 +185,15 @@ def bfloat16_enabled(configs): return False +def fp16_enabled(configs): + for model_name in configs[YAMLKeyword.models]: + model_config = configs[YAMLKeyword.models][model_name] + dtype = model_config.get(YAMLKeyword.data_type, FPDataType.fp16_fp32) + if dtype == FPDataType.fp16_fp16: + return True + return False + + def hexagon_enabled(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: @@ -765,6 +775,7 @@ def build_model_lib(configs, address_sanitizer, debug_mode): enable_opencl=opencl_enabled(configs), enable_quantize=quantize_enabled(configs), enable_bfloat16=bfloat16_enabled(configs), + enable_fp16=fp16_enabled(configs), address_sanitizer=address_sanitizer, symbol_hidden=get_symbol_hidden_mode(debug_mode), debug_mode=debug_mode @@ -927,6 +938,7 @@ def build_mace_run(configs, target_abi, toolchain, enable_opencl=opencl_enabled(configs), enable_quantize=quantize_enabled(configs), enable_bfloat16=bfloat16_enabled(configs), + enable_fp16=fp16_enabled(configs), address_sanitizer=address_sanitizer, symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type), debug_mode=debug_mode, diff --git a/tools/python/utils/config_parser.py b/tools/python/utils/config_parser.py index 5521d4e8..f48acb00 100644 --- a/tools/python/utils/config_parser.py +++ b/tools/python/utils/config_parser.py @@ -182,6 +182,8 @@ def parse_internal_data_type(str): return mace_pb2.DT_FLOAT elif str == 'bf16_fp32': return mace_pb2.DT_BFLOAT16 + elif str == 'fp16_fp16': + return mace_pb2.DT_FLOAT16 else: return mace_pb2.DT_HALF diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 9eec6289..1a7d2b97 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -271,6 +271,7 @@ def bazel_build(target, enable_opencl=True, enable_quantize=True, enable_bfloat16=False, + enable_fp16=False, enable_rpcmem=True, address_sanitizer=False, symbol_hidden=True, @@ -305,6 +306,8 @@ def bazel_build(target, "--define", "bfloat16=%s" % str(enable_bfloat16).lower(), "--define", + "fp16=%s" % str(enable_fp16).lower(), + "--define", "rpcmem=%s" % str(enable_rpcmem).lower(), "--define", "hexagon=%s" % str(enable_hexagon).lower(), -- GitLab