diff --git a/CMakeLists.txt b/CMakeLists.txt index 17fa7ec3780871e732b2381697509c30b1c5625d..fe8fa6efc12625f5b84053bea57e7d5fb4cdf938 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ option(MACE_ENABLE_CUDA "whether to enable CUDA support" OFF) option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF) option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF) option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF) +option(MACE_ENABLE_BFLOAT16 "whether to enable bfloat16 support" OFF) option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF) option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF) option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON) @@ -116,6 +117,10 @@ if(MACE_ENABLE_MTK_APU) add_definitions(-DMACE_ENABLE_MTK_APU) endif(MACE_ENABLE_MTK_APU) +if(MACE_ENABLE_BFLOAT16) + add_definitions(-DMACE_ENABLE_BFLOAT16) +endif(MACE_ENABLE_BFLOAT16) + if(MACE_ENABLE_OBFUSCATE) add_definitions(-DMACE_OBFUSCATE_LITERALS) endif(MACE_ENABLE_OBFUSCATE) diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index dc7344bed145dd0d1c1dea4673226a15d2a1e638..665daa71645f7de3b278b1f59d2c3d95aa7aaaaa 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -85,7 +85,7 @@ in one deployment file. * - runtime - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. + - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32. * - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. * - input_data_formats @@ -582,9 +582,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati Therefore, the default storage type for a regular model in MACE is half. However, if the model is very sensitive to accuracy, storage type can be changed to float. -In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``. +In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``, +for CPU it can also be changed to ``bf16_fp32``. -For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float. +For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. diff --git a/docs/user_guide/advanced_usage_cmake.rst b/docs/user_guide/advanced_usage_cmake.rst index 23631b93d3de058fab4ce04b3aa2a3fb8bae19cc..3956b9757ede1ab22dc9079e5720f3837dbdac6e 100644 --- a/docs/user_guide/advanced_usage_cmake.rst +++ b/docs/user_guide/advanced_usage_cmake.rst @@ -63,7 +63,7 @@ There are many advanced options supported. * - runtime - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. + - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32. * - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. * - input_data_formats @@ -438,9 +438,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati Therefore, the default storage type for a regular model in MACE is half. However, if the model is very sensitive to accuracy, storage type can be changed to float. -In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``. +In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``, +for CPU it can also be changed to ``bf16_fp32``. -For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float. +For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. diff --git a/docs/zh/installation/env_requirement.rst b/docs/zh/installation/env_requirement.rst index 600483d8e7acf624ddcc58b39e3bb8722d9d0b9b..1982cf17c859d726367c9cdd4fec2431f757261b 100644 --- a/docs/zh/installation/env_requirement.rst +++ b/docs/zh/installation/env_requirement.rst @@ -43,12 +43,13 @@ MACE 需要安装下列依赖: - 版本和说明 * - Android NDK - `NDK 安装指南 `__ - - Required by Android build, r15b, r15c, r16b, r17b + - 安卓编译需要, bazel用户可以使用r15b及以上的版本, cmake用户可以使用r17b及以上版本 * - CMake - apt-get install cmake - >= 3.11.3 * - ADB - - Linux:``apt-get install android-tools-adb`` Mac:``brew cask install android-platform-tools`` + - | Linux:``apt-get install android-tools-adb`` + | Mac:``brew cask install android-platform-tools`` - Android 运行需要, >= 1.0.32 * - TensorFlow - pip install tensorflow==1.8.0 diff --git a/mace/BUILD.bazel b/mace/BUILD.bazel index 748af938613ec84275140c6dc4425f5b56ee248c..1a9d56eadfaaaf2629b0c450b305b0a05bad61ef 100644 --- a/mace/BUILD.bazel +++ b/mace/BUILD.bazel @@ -132,6 +132,14 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "bfloat16_enabled", + define_values = { + "bfloat16": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "rpcmem_enabled", define_values = { diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel index 39fc3883d7acab01ecf26533ce4613b8b59f6208..67d94f103b489257896bebac79aec570fbe314bb 100644 --- a/mace/core/BUILD.bazel +++ b/mace/core/BUILD.bazel @@ -9,6 +9,7 @@ load( "if_android", "if_android_armv7", "if_apu_enabled", + "if_bfloat16_enabled", "if_hexagon_enabled", "if_hexagon_or_hta_enabled", "if_hta_enabled", @@ -87,6 +88,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ diff --git a/mace/core/bfloat16.h b/mace/core/bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..21f8ae0b9d65a5d46d954c40033573f700a1bf03 --- /dev/null +++ b/mace/core/bfloat16.h @@ -0,0 +1,261 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_BFLOAT16_H_ +#define MACE_CORE_BFLOAT16_H_ + +#ifdef MACE_ENABLE_BFLOAT16 + +#include +#include +#include + +namespace mace { + +union Sphinx { + uint32_t i; + float f; + + Sphinx(uint32_t value) : i(value) {} + + Sphinx(float value) : f(value) {} +}; + +class BFloat16 { + public: + BFloat16() : data_(0) {} + + // we need implicit transformation, so `explicit` keyword is not used + BFloat16(float value) : data_(Sphinx(value).i >> 16) {} // NOLINT + + operator float() const { + return Sphinx(static_cast(data_ << 16)).f; + } + + operator double() const { + return static_cast( + Sphinx(static_cast(data_ << 16)).f); + } + + operator int() const { + return static_cast(Sphinx(static_cast(data_ << 16)).f); + } + + template + void operator=(T value) { + data_ = Sphinx(static_cast(value)).i >> 16; + } + + BFloat16 operator-() const { + return BFloat16(-(Sphinx(static_cast(data_ << 16)).f)); + } + + template + BFloat16 operator+(T value) const { + return BFloat16(Sphinx( + static_cast(data_ << 16)).f + static_cast(value)); + } + + template + BFloat16 operator-(T value) const { + return BFloat16(Sphinx( + static_cast(data_ << 16)).f - static_cast(value)); + } + + template + BFloat16 operator*(T value) const { + return BFloat16(Sphinx( + static_cast(data_ << 16)).f * static_cast(value)); + } + + template + BFloat16 operator/(T value) const { + return BFloat16(Sphinx( + static_cast(data_ << 16)).f / static_cast(value)); + } + + template + bool operator>(T value) const { + return Sphinx( + static_cast(data_ << 16)).f > static_cast(value); + } + + template + bool operator>=(T value) const { + return Sphinx( + static_cast(data_ << 16)).f >= static_cast(value); + } + + template + bool operator<(T value) const { + return Sphinx( + static_cast(data_ << 16)).f < static_cast(value); + } + + template + bool operator<=(T value) const { + return Sphinx( + static_cast(data_ << 16)).f <= static_cast(value); + } + + template + bool operator==(T value) const { + return Sphinx( + static_cast(data_ << 16)).f == static_cast(value); + } + + template + void operator+=(T value) { + data_ = Sphinx(Sphinx(static_cast(data_ << 16)).f + + static_cast(value)).i >> 16; + } + + template + void operator/=(T value) { + data_ = Sphinx(Sphinx(static_cast(data_ << 16)).f / + static_cast(value)).i >> 16; + } + + template + void operator-=(T value) { + data_ = Sphinx(Sphinx(static_cast(data_ << 16)).f - + static_cast(value)).i >> 16; + } + + template + void operator*=(T value) { + data_ = Sphinx(Sphinx(static_cast(data_ << 16)).f * + static_cast(value)).i >> 16; + } + + private: + uint16_t data_; +}; + +template<> +inline bool BFloat16::operator==(const BFloat16 &value) const { + return data_ == value.data_; +} + +template<> +inline void BFloat16::operator=(const BFloat16 &value) { + data_ = value.data_; +} + +} // namespace mace + +namespace std { +inline float fabs(const mace::BFloat16 &value) { + return fabs(static_cast(value)); +} + +inline float abs(const mace::BFloat16 &value) { + return abs(static_cast(value)); +} + +inline float sqrt(const mace::BFloat16 &value) { + return sqrt(static_cast(value)); +} + +inline float log(const mace::BFloat16 &value) { + return log(static_cast(value)); +} + +inline float tanh(const mace::BFloat16 &value) { + return tanh(static_cast(value)); +} + +inline float exp(const mace::BFloat16 &value) { + return exp(static_cast(value)); +} + +inline int ceil(const mace::BFloat16 &value) { + return ceil(static_cast(value)); +} + +inline int floor(const mace::BFloat16 &value) { + return floor(static_cast(value)); +} + +inline float max(const mace::BFloat16 &a, const float &b) { + return max(static_cast(a), b); +} + +inline float max(const float &a, const mace::BFloat16 &b) { + return max(a, static_cast(b)); +} + +inline float min(const mace::BFloat16 &a, const float &b) { + return min(static_cast(a), b); +} + +inline float min(const float &a, const mace::BFloat16 &b) { + return min(a, static_cast(b)); +} + +inline float pow(const mace::BFloat16 &a, const mace::BFloat16 &b) { + return pow(static_cast(a), static_cast(b)); +} + +inline float pow(const mace::BFloat16 &a, const float &b) { + return pow(static_cast(a), b); +} + +inline float pow(const float &a, const mace::BFloat16 &b) { + return pow(a, static_cast(b)); +} + +inline ostream &operator<<(ostream &ss, // NOLINT + const mace::BFloat16 &value) { + return ss << static_cast(value); +} + +} // namespace std + + +inline float operator+(const float &a, const mace::BFloat16 &value) { + return a + static_cast(value); +} + +inline float operator-(const float &a, const mace::BFloat16 &value) { + return a - static_cast(value); +} + +inline float operator*(const float &a, const mace::BFloat16 &value) { + return a * static_cast(value); +} + +inline float operator/(const float &a, const mace::BFloat16 &value) { + return a / static_cast(value); +} + +inline void operator+=(float &a, const mace::BFloat16 &value) { // NOLINT + a += static_cast(value); +} + +inline void operator-=(float &a, const mace::BFloat16 &value) { // NOLINT + a -= static_cast(value); +} + +inline void operator*=(float &a, const mace::BFloat16 &value) { // NOLINT + a *= static_cast(value); +} + +inline void operator/=(float &a, const mace::BFloat16 &value) { // NOLINT + a /= static_cast(value); +} + +#endif // MACE_ENABLE_BFLOAT16 + +#endif // MACE_CORE_BFLOAT16_H_ diff --git a/mace/core/ops/op_delegator.h b/mace/core/ops/op_delegator.h index 029bd39f814e8b69507a0a2db162732885fb2acd..92298c52298c28900854946f45717642984696c4 100644 --- a/mace/core/ops/op_delegator.h +++ b/mace/core/ops/op_delegator.h @@ -28,9 +28,9 @@ enum ImplType { }; #ifdef MACE_ENABLE_NEON -#define MACE_CPU_IMPL_TYPE NEON +const ImplType kCpuImplType = ImplType::NEON; #else -#define MACE_CPU_IMPL_TYPE REF +const ImplType kCpuImplType = ImplType::REF; #endif struct DelegatorParam { diff --git a/mace/core/registry/op_delegator_registry.cc b/mace/core/registry/op_delegator_registry.cc index 006f5555f8710ddd667166c182088b86de6e2af5..7aed0cb08254c98cdde3b8faf2d7811e46a80d63 100644 --- a/mace/core/registry/op_delegator_registry.cc +++ b/mace/core/registry/op_delegator_registry.cc @@ -15,25 +15,86 @@ #include "mace/core/registry/op_delegator_registry.h" #include +#include #include "mace/utils/logging.h" namespace mace { -MaceStatus OpDelegatorRegistry::Register(const std::string &key, +namespace { +const char *kDefaultTag = "general"; +} + +DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type, + DeviceType in_device, ImplType in_impl_type, + const char *in_tag) + : delegator_name(in_name), data_type(in_data_type), + device(in_device), impl_type(in_impl_type), tag(in_tag) {} + +DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type, + DeviceType in_device, ImplType in_impl_type) + : DelegatorInfo(in_name, in_data_type, + in_device, in_impl_type, kDefaultTag) {} + +std::string DelegatorInfo::ToString() const { + std::stringstream ss; + ss << delegator_name << "_" << data_type << "_" + << device << "_" << impl_type << "_" << tag; + return ss.str(); +} + +bool DelegatorInfo::operator==(const DelegatorInfo &info) const { + return device == info.device && impl_type == info.impl_type && + data_type == info.data_type && + delegator_name == info.delegator_name && tag == info.tag; +} + +MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key, DelegatorCreator creator) { - MACE_CHECK(registry_.count(key) == 0, "Register an exist key."); + MACE_CHECK(registry_.count(key) == 0, + "Register an exist key: ", key.ToString()); registry_[key] = std::move(creator); return MaceStatus::MACE_SUCCESS; } -DelegatorCreator OpDelegatorRegistry::GetCreator(const std::string &key) const { - MACE_CHECK(registry_.count(key) > 0, key, " not exist."); - return registry_.at(key); -} +DelegatorCreator OpDelegatorRegistry::GetCreator( + const DelegatorInfo &key) const { + if (registry_.count(key) > 0) { + return registry_.at(key); + } + + DelegatorInfo info = key; + if (key.impl_type == ImplType::NEON) { + if (info.tag != kDefaultTag) { + info.tag = kDefaultTag; + if (registry_.count(info) > 0) { + VLOG(1) << key.ToString() + << " delegator fall back to " << info.ToString(); + return registry_.at(info); + } + info.tag = key.tag; + } -template<> const char *DType::name_ = "float"; -template<> const char *DType::name_ = "int"; -template<> const char *DType::name_ = "uint8_t"; + info.impl_type = ImplType::REF; + if (registry_.count(info) > 0) { + VLOG(1) << key.ToString() + << " delegator fall back to " << info.ToString(); + return registry_.at(info); + } + } + + // for REF + if (info.tag != kDefaultTag) { + info.tag = kDefaultTag; + if (registry_.count(info) > 0) { + VLOG(1) << key.ToString() + << " delegator fall back to " << info.ToString(); + return registry_.at(info); + } + } + + LOG(FATAL) << "Delegator not exist: " << key.ToString(); + return DelegatorCreator(); +} } // namespace mace diff --git a/mace/core/registry/op_delegator_registry.h b/mace/core/registry/op_delegator_registry.h index f70d5555792b19419d48c84fd06ad9f422096d95..0f72c78ddca2a52bd0c7b48ff4f58b776ab36e3f 100644 --- a/mace/core/registry/op_delegator_registry.h +++ b/mace/core/registry/op_delegator_registry.h @@ -21,7 +21,9 @@ #include #include +#include "mace/core/bfloat16.h" #include "mace/core/ops/op_delegator.h" +#include "mace/core/types.h" #include "mace/proto/mace.pb.h" #include "mace/public/mace.h" @@ -29,40 +31,50 @@ namespace mace { typedef std::function(const DelegatorParam &)> DelegatorCreator; +struct DelegatorInfo { + explicit DelegatorInfo(const char *delegator_name, + DataType data_type, + DeviceType device, + ImplType impl_type, + const char *tag); + explicit DelegatorInfo(const char *delegator_name, + DataType data_type, + DeviceType device, + ImplType impl_type); + + std::string ToString() const; + + bool operator==(const DelegatorInfo &info) const; + + std::string delegator_name; + DataType data_type; + DeviceType device; + ImplType impl_type; + std::string tag; +}; + class OpDelegatorRegistry { public: OpDelegatorRegistry() = default; ~OpDelegatorRegistry() = default; - MaceStatus Register(const std::string &key, DelegatorCreator creator); - DelegatorCreator GetCreator(const std::string &key) const; + MaceStatus Register(const DelegatorInfo &key, DelegatorCreator creator); + DelegatorCreator GetCreator(const DelegatorInfo &key) const; private: - std::unordered_map registry_; + struct HashName { + size_t operator()(const DelegatorInfo &delegator_info) const { + return std::hash()(delegator_info.ToString()); + } + }; + std::unordered_map registry_; }; -template -struct DType { static const char *name_; }; -template<> const char *DType::name_; -template<> const char *DType::name_; -template<> const char *DType::name_; - - } // namespace mace -#ifndef MACE_DELEGATOR_KEY_TMP -#define MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) \ - (std::string(#delegator_name"_"#device"_"#impl"_") + DType
::name_) -#endif // MACE_DELEGATOR_KEY_TMP - -#ifndef MACE_DELEGATOR_KEY -#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \ - MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) -#endif // MACE_DELEGATOR_KEY - #ifndef MACE_DELEGATOR_KEY_EX_TMP #define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \ - (std::string(#delegator_name"_"#device"_"#impl"_"#tag"_") + DType
::name_) + DelegatorInfo(#delegator_name, DataTypeToEnum
::value, device, impl, #tag) #endif // MACE_DELEGATOR_KEY_EX_TMP #ifndef MACE_DELEGATOR_KEY_EX @@ -70,21 +82,32 @@ template<> const char *DType::name_; MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) #endif // MACE_DELEGATOR_KEY_EX +#ifndef MACE_DELEGATOR_KEY +#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \ + DelegatorInfo(#delegator_name, DataTypeToEnum
::value, device, impl) +#endif // MACE_DELEGATOR_KEY + #ifndef MACE_REGISTER_DELEGATOR #define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \ - void Register##class_name##Delegator(OpDelegatorRegistry *registry) { \ - registry->Register( \ - key, OpDelegator::DefaultCreator); \ - } + registry->Register(key, OpDelegator::DefaultCreator) #endif // MACE_REGISTER_DELEGATOR +#ifndef MACE_REGISTER_BF16_DELEGATOR +#ifdef MACE_ENABLE_BFLOAT16 +#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key) \ + MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) +#else +#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key) +#endif // MACE_ENABLE_BFLOAT16 +#endif // MACE_REGISTER_BF16_DELEGATOR + #ifndef MACE_DEFINE_DELEGATOR_CREATOR #define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \ static std::unique_ptr Create( \ - Workspace *workspace, const std::string &tag, \ + Workspace *workspace, const DelegatorInfo &key, \ const DelegatorParam ¶m) { \ DelegatorCreator creator = \ - workspace->GetDelegatorRegistry()->GetCreator(tag); \ + workspace->GetDelegatorRegistry()->GetCreator(key); \ std::unique_ptr delegator = creator(param); \ return std::unique_ptr( \ static_cast(delegator.release())); \ diff --git a/mace/core/registry/ops_registry.h b/mace/core/registry/ops_registry.h index 46476a64d157e6446b5668279e7adedd2df4eec5..fff4d6e14b5f55c7b378cd1cb9ee47c88812bae1 100644 --- a/mace/core/registry/ops_registry.h +++ b/mace/core/registry/ops_registry.h @@ -22,6 +22,8 @@ #include #include +#include "mace/core/bfloat16.h" +#include "mace/core/types.h" #include "mace/core/ops/operator.h" #include "mace/core/ops/op_condition_builder.h" #include "mace/core/ops/op_condition_context.h" @@ -80,6 +82,26 @@ class OpRegistry { DataTypeToEnum
::value, \ OpRegistry::DefaultCreator) +#ifndef MACE_REGISTER_BF16_OP +#ifdef MACE_ENABLE_BFLOAT16 +#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device) \ + MACE_REGISTER_OP(op_registry, op_type, class_name, device, BFloat16) +#else +#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device) +#endif // MACE_ENABLE_BFLOAT16 +#endif // MACE_REGISTER_BF16_OP + +#ifndef MACE_REGISTER_BF16_OP_BY_CLASS +#ifdef MACE_ENABLE_BFLOAT16 +#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, \ + class_name, device) \ + MACE_REGISTER_OP_BY_CLASS(op_registry, op_type, \ + class_name, device, BFloat16) +#else +#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, class_name, device) +#endif // MACE_ENABLE_BFLOAT16 +#endif // MACE_REGISTER_BF16_OP_BY_CLASS + #ifdef MACE_ENABLE_OPENCL #define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \ op_registry->Register( \ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index dc7d24b49e3236a93a6dbb3ee9325d755c1aa5cc..33ce0c223a6fb258e1f448fc95c1694b1496fb73 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -53,6 +53,13 @@ namespace mace { #define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) #endif +#ifdef MACE_ENABLE_BFLOAT16 +#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \ + MACE_CASE(BFloat16, MACE_SINGLE_ARG(STATEMENTS)) +#else +#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) +#endif // MACE_ENABLE_BFLOAT16 + #if MACE_ENABLE_OPENCL #define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \ MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS)) @@ -67,6 +74,7 @@ namespace mace { MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \ MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \ MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \ + MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \ MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \ case DT_INVALID: \ INVALID_STATEMENTS; \ @@ -419,7 +427,8 @@ class Tensor { if (i != 0 && i % shape_.back() == 0) { os << "\n"; } - MACE_RUN_WITH_TYPE_ENUM(dtype_, (os << (this->data()[i]) << ", ")); + MACE_RUN_WITH_TYPE_ENUM( + dtype_, (os << this->data()[i] << ", ")); } LOG(INFO) << os.str(); } diff --git a/mace/core/types.cc b/mace/core/types.cc index 3e4225391d32c65c94e73673510de03a6b0750c7..1decd9054110c433d1eea475488eac5f49aa52f9 100644 --- a/mace/core/types.cc +++ b/mace/core/types.cc @@ -25,6 +25,7 @@ bool DataTypeCanUseMemcpy(DataType dt) { case DT_FLOAT: case DT_UINT8: case DT_INT32: + case DT_BFLOAT16: return true; default: return false; @@ -36,7 +37,8 @@ std::string DataTypeToString(const DataType dt) { {DT_FLOAT, "DT_FLOAT"}, {DT_HALF, "DT_HALF"}, {DT_UINT8, "DT_UINT8"}, - {DT_INT32, "DT_INT32"}}; + {DT_INT32, "DT_INT32"}, + {DT_BFLOAT16, "DT_BFLOAT16"}}; MACE_CHECK(dt != DT_INVALID, "Not support Invalid data type"); return dtype_string_map[dt]; } @@ -50,6 +52,10 @@ size_t GetEnumTypeSize(const DataType dt) { #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) case DT_FLOAT16: return sizeof(float16_t); +#endif +#ifdef MACE_ENABLE_BFLOAT16 + case DT_BFLOAT16: + return sizeof(BFloat16); #endif case DT_UINT8: return sizeof(uint8_t); diff --git a/mace/core/types.h b/mace/core/types.h index 5bdd4930c17aface45bf8859e3291e1d8464b228..876049af281bf04e5378bf2f821710f6e249ffb3 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -21,6 +21,7 @@ #include #endif +#include "mace/core/bfloat16.h" #include "mace/proto/mace.pb.h" #include "include/half.hpp" @@ -57,6 +58,9 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF); #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16); #endif +#ifdef MACE_ENABLE_BFLOAT16 +MACE_MAPPING_DATA_TYPE_AND_ENUM(BFloat16, DT_BFLOAT16); +#endif MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT); MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8); MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32); diff --git a/mace/libmace/BUILD.bazel b/mace/libmace/BUILD.bazel index 8b540b53d946df2751df3ba957d9bc0bdda2534a..2c02a4a790dde4d696a6638540daf621b39ebef1 100644 --- a/mace/libmace/BUILD.bazel +++ b/mace/libmace/BUILD.bazel @@ -12,6 +12,7 @@ load( "if_android", "if_android_armv7", "if_apu_enabled", + "if_bfloat16_enabled", "if_darwin", "if_hexagon_enabled", "if_hta_enabled", @@ -42,6 +43,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]) + if_hta_enabled([ @@ -52,8 +55,8 @@ cc_library( "-DMACE_ENABLE_RPCMEM", ]), deps = [ - "//mace/ops", "//include:public_headers", + "//mace/ops", ], alwayslink = 1, ) diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 6ab855f42a0654ec3b8040c27bc66831f7f937af..732b688f631f30d64456576d9a5b4be6b323312f 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -16,9 +16,11 @@ #include #include +#include "mace/core/bfloat16.h" #include "mace/core/device_context.h" #include "mace/core/memory_optimizer.h" #include "mace/core/net.h" +#include "mace/core/net_def_adapter.h" #include "mace/core/registry/ops_registry.h" #include "mace/core/registry/op_delegator_registry.h" #include "mace/ops/common/transpose.h" @@ -29,7 +31,6 @@ #include "mace/public/mace.h" #include "mace/port/env.h" #include "mace/port/file_system.h" -#include "mace/core/net_def_adapter.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/gpu_device.h" @@ -460,6 +461,7 @@ class MaceEngine::Impl { std::unique_ptr ws_; std::unique_ptr net_; bool is_quantized_model_; + DataType net_data_type_; std::map input_info_map_; std::map output_info_map_; std::unique_ptr thread_pool_; @@ -565,6 +567,7 @@ MaceStatus MaceEngine::Impl::Init( #endif // mark quantized model flag is_quantized_model_ = IsQuantizedModel(*net_def); + net_data_type_ = net_def->data_type(); // Get input and output information. for (auto &input_info : net_def->input_info()) { input_info_map_[input_info.name()] = input_info; @@ -589,8 +592,8 @@ MaceStatus MaceEngine::Impl::Init( } input_tensor->Resize(shape); // Set to the default data format - input_tensor->set_data_format(static_cast( - input_info_map_[input_name].data_format())); + input_tensor->set_data_format( + static_cast(input_info_map_[input_name].data_format())); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -691,7 +694,8 @@ MaceStatus MaceEngine::Impl::Init( MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile( model_data_file.c_str(), &model_data_)); - MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, + MACE_RETURN_IF_ERROR(Init( + net_def, input_nodes, output_nodes, reinterpret_cast(model_data_->data()))); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON || @@ -753,11 +757,24 @@ MaceStatus MaceEngine::Impl::TransposeInput( Tensor::MappingGuard input_guard(input_tensor); if (input_dt == DataType::DT_FLOAT) { auto input_data = input_tensor->mutable_data(); - return ops::Transpose(thread_pool_.get(), - input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); + if (net_data_type_ == DT_FLOAT || net_data_type_ == DataType::DT_HALF) { + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); +#ifdef MACE_ENABLE_BFLOAT16 + } else if (net_data_type_ == DT_BFLOAT16) { + auto *input_data = input_tensor->mutable_data(); + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); +#endif // MACE_ENABLE_BFLOAT16 + } else { + LOG(FATAL) << "Invalid net data type: " << net_data_type_; + } } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); return ops::Transpose(thread_pool_.get(), @@ -776,9 +793,22 @@ MaceStatus MaceEngine::Impl::TransposeInput( MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); Tensor::MappingGuard input_guard(input_tensor); if (input_dt == DataType::DT_FLOAT) { - auto input_data = input_tensor->mutable_data(); - memcpy(input_data, input.second.data().get(), - input_tensor->size() * sizeof(float)); + if (net_data_type_ == DataType::DT_FLOAT || + net_data_type_ == DataType::DT_HALF) { + auto input_data = input_tensor->mutable_data(); + memcpy(input_data, input.second.data().get(), + input_tensor->size() * sizeof(float)); +#ifdef MACE_ENABLE_BFLOAT16 + } else if (net_data_type_ == DataType::DT_BFLOAT16) { + auto input_data = input_tensor->mutable_data(); + const float *data = input.second.data().get(); + for (index_t i = 0; i < input_tensor->size(); ++i) { + input_data[i] = data[i]; + } +#endif // MACE_ENABLE_BFLOAT16 + } else { + LOG(FATAL) << "Invalid net data type: " << net_data_type_; + } } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); memcpy(input_data, input.second.data().get(), @@ -842,6 +872,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput( output_tensor->shape(), dst_dims, output->second.data().get()); +#ifdef MACE_ENABLE_BFLOAT16 + } else if (output_dt == DataType::DT_BFLOAT16) { + auto output_data = output_tensor->data(); + return ops::Transpose(thread_pool_.get(), + output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); +#endif // MACE_ENABLE_BFLOAT16 } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; return MaceStatus::MACE_INVALID_ARGS; @@ -864,6 +903,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput( std::memcpy(output->second.data().get(), output_tensor->data(), output_size * sizeof(int)); +#ifdef MACE_ENABLE_BFLOAT16 + } else if (output_dt == DataType::DT_BFLOAT16) { + const auto *output_data = output_tensor->data(); + float *data = output->second.data().get(); + for (index_t i = 0; i < output_tensor->size(); ++i) { + data[i] = output_data[i]; + } +#endif // MACE_ENABLE_BFLOAT16 } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; } diff --git a/mace/mace.bzl b/mace/mace.bzl index 6322e0357659405da23c7bffea80b252df4b341d..330a305e7008f0e8b168af6452b20e1ec0aecbb4 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -109,6 +109,12 @@ def if_quantize_enabled(a): "//conditions:default": [], }) +def if_bfloat16_enabled(a): + return select({ + "//mace:bfloat16_enabled": a, + "//conditions:default": [], + }) + def if_rpcmem_enabled(a): return select({ "//mace:rpcmem_enabled": a, diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 52ad46edfde322f45d12becbb249261beed12498..a3b8ec79e3ea8537f0b8ebc8f002bc0ff2249a23 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -10,6 +10,7 @@ load( "//mace:mace.bzl", "if_android", "if_android_armv7", + "if_bfloat16_enabled", "if_hexagon_enabled", "if_neon_enabled", "if_opencl_enabled", @@ -46,6 +47,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -85,6 +88,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -138,6 +143,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -223,6 +230,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), @@ -263,6 +272,8 @@ cc_library( "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", + ]) + if_bfloat16_enabled([ + "-DMACE_ENABLE_BFLOAT16", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", ]), diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 338de7ead4db24e35169bdc6cd681729e84b15b8..7d3b1e4d792a1354cc7e10e6d256e2593d0ee9cc 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -19,7 +19,6 @@ #include "mace/core/ops/operator.h" #include "mace/core/registry/ops_registry.h" - #include "mace/ops/delegator/activation.h" #ifdef MACE_ENABLE_OPENCL @@ -43,11 +42,12 @@ class ActivationOp : public Operation { Operation::GetOptionalArg("activation", "NOOP"))), activation_delegator_(delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, T, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType), delegator::ActivationParam( activation_type_, - Operation::GetOptionalArg("max_limit", 0), - Operation::GetOptionalArg("leakyrelu_coefficient", 0)))) {} + Operation::GetOptionalArg("max_limit", 0.f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.f)))) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -119,6 +119,8 @@ class ActivationOp : public Operation { void RegisterActivation(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Activation", + ActivationOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp); MACE_REGISTER_OP_CONDITION( op_registry, diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 8e5ce2e1928a1244ccd0ee27a3aa8c9bdc7a5ec7..4121d49ff27afdb81624b088512ad20d2b5ced5f 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -33,8 +33,8 @@ namespace ops { template class AddNOp; -template<> -class AddNOp : public Operation { +template +class AddNOp : public Operation { public: explicit AddNOp(OpConstructContext *context) : Operation(context) {} @@ -46,12 +46,12 @@ class AddNOp : public Operation { const index_t size = output->size(); Tensor::MappingGuard output_guard(output); - auto output_data = output->mutable_data(); - memset(output_data, 0, size * sizeof(float)); + auto output_data = output->mutable_data(); + memset(output_data, 0, size * sizeof(T)); for (auto &input : inputs_) { Tensor::MappingGuard input_guard(input); - auto input_data = input->data(); + auto input_data = input->template data(); for (index_t j = 0; j < size; ++j) { output_data[j] += input_data[j]; @@ -95,6 +95,7 @@ class AddNOp : public Operation { void RegisterAddN(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "AddN", AddNOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp); MACE_REGISTER_OP_CONDITION( op_registry, diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc index 5ec9dc92b818196b53ba60c0886467f5f2618bb4..9dccee7d99179db1b04983792c024e9c19fa797a 100644 --- a/mace/ops/argmax.cc +++ b/mace/ops/argmax.cc @@ -74,11 +74,12 @@ class ArgMaxOp : public Operation { if (argmin_) { for (index_t i = 0; i < outer_size; ++i) { int idx = 0; - T min_value = std::numeric_limits::max(); + float min_value = std::numeric_limits::max(); const T *input_ptr = input_data + i * inner_size; for (index_t j = 0; j < inner_size; ++j) { - if (input_ptr[j] < min_value) { - min_value = input_ptr[j]; + float input_value = input_ptr[j]; + if (input_value < min_value) { + min_value = input_value; idx = j; } } @@ -87,11 +88,12 @@ class ArgMaxOp : public Operation { } else { for (index_t i = 0; i < outer_size; ++i) { int idx = 0; - T max_value = std::numeric_limits::lowest(); + float max_value = std::numeric_limits::lowest(); const T *input_ptr = input_data + i * inner_size; for (index_t j = 0; j < inner_size; ++j) { - if (input_ptr[j] > max_value) { - max_value = input_ptr[j]; + float input_value = input_ptr[j]; + if (input_value > max_value) { + max_value = input_value; idx = j; } } @@ -111,8 +113,8 @@ class ArgMaxOp : public Operation { void RegisterArgMax(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/arm/fp32/activation.cc b/mace/ops/arm/fp32/activation.cc index 8c66bd563093a20941c64a50faa2a68aad891710..5d8d6984bd04fe7ae1ea9626e409388475505fbb 100644 --- a/mace/ops/arm/fp32/activation.cc +++ b/mace/ops/arm/fp32/activation.cc @@ -185,8 +185,11 @@ void Activation::DoActivation(const OpContext *context, } } -MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam, - MACE_DELEGATOR_KEY(Activation, CPU, float, NEON)) +void RegisterActivationDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc index fc5a55b3d4d0abf6cdad15bfd540bb20446803af..7edafec327692d736cc66ec22e82808031819e05 100644 --- a/mace/ops/arm/fp32/bias_add.cc +++ b/mace/ops/arm/fp32/bias_add.cc @@ -129,8 +129,11 @@ void BiasAdd::AddBias(const OpContext *context, } } -MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam, - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, NEON)) +void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_2d_1x1.cc b/mace/ops/arm/fp32/conv_2d_1x1.cc index 0aad6be90729aac36bd09d1f9a3bea57ddb82b8b..fb3c8a26a738eaedadf7afea7ce1cb60c5c362f5 100644 --- a/mace/ops/arm/fp32/conv_2d_1x1.cc +++ b/mace/ops/arm/fp32/conv_2d_1x1.cc @@ -113,8 +113,12 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, output); } -MACE_REGISTER_DELEGATOR(registry, Conv2dK1x1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1)) +void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK1x1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K1x1)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc index fc92091f55edf6f9d9eac7a6a285f718d62034e0..0b5d335a69753c705a49180c5e005f6bbff125b2 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.cc +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -861,18 +861,27 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Conv2dK1x7S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x7S1)) - -MACE_REGISTER_DELEGATOR(registry, Conv2dK7x1S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x1S1)) - -MACE_REGISTER_DELEGATOR(registry, Conv2dK1x15S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - NEON, K1x15S1)) -MACE_REGISTER_DELEGATOR(registry, Conv2dK15x1S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - NEON, K15x1S1)) +void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK1x7S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K1x7S1)); + + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x1S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x1S1)); + + MACE_REGISTER_DELEGATOR( + registry, Conv2dK1x15S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K1x15S1)); + + MACE_REGISTER_DELEGATOR( + registry, Conv2dK15x1S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K15x1S1)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_2d_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc index 37d8ef849f73e53d4afebc55ac19efe50fe7c02b..84635c7cac26d7c76bd82cd181716c2f5b987ecd 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -737,10 +737,16 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1)) -MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S2, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S2)) +void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index cbdb7d66443e5d47759dcb8fe44890f85f2c4d5a..1ec5205735e9564e5c7516768c77491a394c391d 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -801,9 +801,12 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context, }, 0, batch, 1, 0, out_channels, 1); } -MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3Winograd, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX( - Conv2d, CPU, float, NEON, K3x3Winograd)) +void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3Winograd, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3Winograd)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_2d_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc index cc117cf98637b2f886007ae15ffe75d47f884ff0..2bfb762520f49cf0a5b5cb82dea11bc2f55fc6a0 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -258,8 +258,12 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Conv2dK5x5S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K5x5S1)) +void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK5x5S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K5x5S1)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_2d_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc index cc6963e7b1b8cd7eda4a09cb74a57d5f5ac3b6b2..d1f69967a21dd7393dafb196fd02b0c9e0322e4b 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -722,12 +722,20 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S1)) -MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S2, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S2)) -MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S3, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S3)) +void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S1)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S2)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S3, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S3)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc index 2fdc57e2ef7d9e0f029919249a0bb776d5183879..d58a1725e507e27af12bcb0b0d64821c36769829 100644 --- a/mace/ops/arm/fp32/conv_general.cc +++ b/mace/ops/arm/fp32/conv_general.cc @@ -252,9 +252,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR( - registry, Conv2dGeneral, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, General)) +void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dGeneral, delegator::Conv2dParam, + MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.cc b/mace/ops/arm/fp32/deconv_2d_2x2.cc index 65cfc6e8d7020e1fd753cbed9a2e7416b1ff56b9..57784e638f0da27575020b50a63e3080674c5c6f 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.cc +++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc @@ -335,12 +335,16 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, K2x2S1)) -MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, K2x2S2)) +void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK2x2S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K2x2S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK2x2S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K2x2S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.cc b/mace/ops/arm/fp32/deconv_2d_3x3.cc index 55911e25f432a21290295018eefacedb00cfd25d..d0b49e0d296d89ca2dc12757dd8feda69ef25a67 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc @@ -464,12 +464,16 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, K3x3S1)) -MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, K3x3S2)) +void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK3x3S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK3x3S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.cc b/mace/ops/arm/fp32/deconv_2d_4x4.cc index b2e17afa75f2545d820722ad90b3297397941a56..4a84e0394bf07764103c7c2c6c23f8cc79a31d5b 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc @@ -574,12 +574,16 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, K4x4S1)) -MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, K4x4S2)) +void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK4x4S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK4x4S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/deconv_2d_general.cc b/mace/ops/arm/fp32/deconv_2d_general.cc index 5ffe7b0d7a25bf92824ee1120e65ede9b50fcc08..d090ba23104869712fa2af1e9fc9e6dc203f0276 100644 --- a/mace/ops/arm/fp32/deconv_2d_general.cc +++ b/mace/ops/arm/fp32/deconv_2d_general.cc @@ -124,9 +124,11 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Deconv2dGeneral, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - NEON, General)) +void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dGeneral, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc index 8d77672b7ab094771e067722f703e8bc0e27a6d1..cc0ab45a02425f5917eb9edc44d4d20122b57296 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc @@ -512,12 +512,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S1)) -MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S2)) +void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc index 291075ae2205d61035e211fd1c8daa04bec8c9d5..875e08fa5ed271d599b33d490b0211dcd1360254 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc @@ -776,19 +776,27 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S1)) -MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S2)) - -MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S1)) -MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S2)) +void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc index f9de2de3df27aeabb4eb9199140993fbd5abb31e..6f313c591212008b0c614cfebbf24d5dfebdc1a1 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc @@ -959,19 +959,27 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S1)) -MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S2)) - -MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S1)) -MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S2)) +void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc index 81d715e26dbb34186bcd873b9dc083b27cd1a352..33d9cb01a377757358757576564d8131eb3c3e48 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc @@ -207,13 +207,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, General)) +void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON)); +} -MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, General)) +void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/gemm.cc b/mace/ops/arm/fp32/gemm.cc index ca429e63d544e13774eb4073c02e9fd6122ad499..d506d8b1dbec75121dc4d025b7e89eaf22da1ecf 100644 --- a/mace/ops/arm/fp32/gemm.cc +++ b/mace/ops/arm/fp32/gemm.cc @@ -1224,8 +1224,11 @@ MaceStatus Gemm::Compute(const OpContext *context, output); } -MACE_REGISTER_DELEGATOR(registry, Gemm, delegator::GemmParam, - MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON)) +void RegisterGemmDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc index 317e422404327f50b6874993a2ed10f76a000e87..57f2f248ebbbf738793bd3df1cc509f88ffcf3e6 100644 --- a/mace/ops/arm/fp32/gemv.cc +++ b/mace/ops/arm/fp32/gemv.cc @@ -378,9 +378,11 @@ MaceStatus Gemv::Compute(const OpContext *context, #undef vaddvq_f32 #endif - -MACE_REGISTER_DELEGATOR(registry, Gemv, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, CPU, float, NEON)) +void RegisterGemvDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON)); +} } // namespace fp32 } // namespace arm diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc index 74d44104c422f555ee9e5b18ab5647aba9c7f2bd..97e50e1bc8faa1aa4ccfc2fd022f33879b04839b 100644 --- a/mace/ops/arm/q8/eltwise.cc +++ b/mace/ops/arm/q8/eltwise.cc @@ -162,8 +162,11 @@ MaceStatus Eltwise::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam, - MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, NEON)) +void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Eltwise, delegator::EltwiseParam, + MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::NEON)); +} } // namespace q8 } // namespace arm diff --git a/mace/ops/arm/q8/gemv.cc b/mace/ops/arm/q8/gemv.cc index 11290d357d0a33992ba52d3a5b8de31040a66738..4e45ae2ac753ad37414b6418837010bc11c22555 100644 --- a/mace/ops/arm/q8/gemv.cc +++ b/mace/ops/arm/q8/gemv.cc @@ -176,18 +176,14 @@ MaceStatus Gemv::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -template -class Gemv; -template -class Gemv; - -typedef Gemv GemvUint8; -MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON)) - -typedef Gemv GemvInt32; -MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON)) +void RegisterGemvDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON)); + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON)); +} } // namespace q8 } // namespace arm diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 88c9a179fe2982b1ec38821dd850784d97953608..a25936b1ce2534047f78d4b92faa3ac75c7e98d9 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -33,8 +33,8 @@ namespace ops { template class BatchNormOp; -template<> -class BatchNormOp : public Operation { +template +class BatchNormOp : public Operation { public: explicit BatchNormOp(OpConstructContext *context) : Operation(context), @@ -43,7 +43,8 @@ class BatchNormOp : public Operation { activation_delegator_( delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + T, kCpuImplType), delegator::ActivationParam( ops::StringToActivationType( Operation::GetOptionalArg("activation", @@ -91,13 +92,13 @@ class BatchNormOp : public Operation { Tensor::MappingGuard offset_mapper(offset); Tensor::MappingGuard output_mapper(output); - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *offset_ptr = offset->data(); - float *output_ptr = output->mutable_data(); + const T *input_ptr = input->data(); + const T *scale_ptr = scale->data(); + const T *offset_ptr = offset->data(); + T *output_ptr = output->mutable_data(); - std::vector new_scale; - std::vector new_offset; + std::vector new_scale; + std::vector new_offset; if (not_folded) { const Tensor *mean = this->Input(MEAN); const Tensor *var = this->Input(VAR); @@ -109,8 +110,8 @@ class BatchNormOp : public Operation { new_offset.resize(channels); Tensor::MappingGuard mean_mapper(mean); Tensor::MappingGuard var_mapper(var); - const float *mean_ptr = mean->data(); - const float *var_ptr = var->data(); + const T *mean_ptr = mean->data(); + const T *var_ptr = var->data(); thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start, index_t end, @@ -122,9 +123,8 @@ class BatchNormOp : public Operation { }, 0, channels, 1); } - const float *scale_data = not_folded ? new_scale.data() : scale_ptr; - const float - *offset_data = not_folded ? new_offset.data() : offset_ptr; + const T *scale_data = not_folded ? new_scale.data() : scale_ptr; + const T *offset_data = not_folded ? new_offset.data() : offset_ptr; index_t channel_size = height * width; index_t batch_size = channels * channel_size; @@ -232,6 +232,7 @@ class BatchNormOp : public Operation { void RegisterBatchNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp); } diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 90324cd76f0797ae0535b99c139f48ee58077a35..5b2286b03a6d809d8a93f804104de4b8e6670580 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -84,8 +84,8 @@ class BatchToSpaceOpBase : public Operation { template class BatchToSpaceNDOp; -template<> -class BatchToSpaceNDOp : public BatchToSpaceOpBase { +template +class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) : BatchToSpaceOpBase(context) {} @@ -108,8 +108,8 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { int block_shape_h = block_shape_[0]; int block_shape_w = block_shape_[1]; - const float *input_data = batch_tensor->data(); - float *output_data = space_tensor->mutable_data(); + const T *input_data = batch_tensor->data(); + T *output_data = space_tensor->mutable_data(); index_t in_batches = batch_tensor->dim(0); index_t in_height = batch_tensor->dim(2); @@ -120,10 +120,11 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { index_t out_height = space_tensor->dim(2); index_t out_width = space_tensor->dim(3); - // 32k/sizeof(float)/out_width/block_shape - index_t - block_h_size = - std::max(static_cast(1), 8 * 1024 / block_shape_w / out_width); + // 32k/sizeof(T)/out_width/block_shape + index_t block_h_size = std::max( + static_cast(1), + static_cast( + (32 / sizeof(T)) * 1024 / block_shape_w / out_width)); // make channel outter loop so we can make best use of cache for (index_t c = 0; c < channels; ++c) { @@ -153,9 +154,9 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { (out_width + pad_left - tile_w + block_shape_w - 1) / block_shape_w); - const float *input_base = + const T *input_base = input_data + (in_b * channels + c) * in_height * in_width; - float *output_base = + T *output_base = output_data + (b * channels + c) * out_height * out_width; index_t h = valid_h_start * block_shape_h + tile_h - pad_top; @@ -290,6 +291,9 @@ void RegisterBatchToSpaceND(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::CPU); + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, uint8_t); diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 54a0f2710ad7ca8430e26d9661baf6a86b58c315..4d476ea34df2b9c045039fc6f10bd9722af18565 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -33,15 +33,15 @@ namespace ops { template class BiasAddOp; -template<> -class BiasAddOp : public Operation { +template +class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), has_data_format_(Operation::GetOptionalArg("has_data_format", 0)), bias_add_delegator_(delegator::BiasAdd::Create( context->workspace(), - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -67,9 +67,9 @@ class BiasAddOp : public Operation { Tensor::MappingGuard bias_mapper(bias); Tensor::MappingGuard output_mapper(output); - const float *input_ptr = input->data(); - const float *bias_ptr = bias->data(); - float *output_ptr = output->mutable_data(); + const T *input_ptr = input->data(); + const T *bias_ptr = bias->data(); + T *output_ptr = output->mutable_data(); const std::vector &shape = input->shape(); const index_t channels = *shape.rbegin(); @@ -162,6 +162,7 @@ class BiasAddOp : public Operation { void RegisterBiasAdd(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp); MACE_REGISTER_OP_CONDITION( op_registry, diff --git a/mace/ops/cast.cc b/mace/ops/cast.cc index dfa42a7600de0f7ebc0a4e6cc8dac7c12c783db8..361beced19969c8c21efeb0a09cec5baa9751175 100644 --- a/mace/ops/cast.cc +++ b/mace/ops/cast.cc @@ -56,10 +56,8 @@ class CastOp : public Operation { }; void RegisterCast(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Cast", CastOp, - DeviceType::CPU, float); - MACE_REGISTER_OP(op_registry, "Cast", CastOp, - DeviceType::CPU, int32_t); + MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, int32_t); #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, float16_t); diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index cddda38db323d70151093bcf9a84446f6f3cc5e4..205ddc5ff48755a2a3837f019f4d1232bae7871e 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -64,7 +64,7 @@ class ChannelShuffleOp : public Operation { const T *in_ptr = input_ptr + b * batch_size + (g * channels_per_group + idx) * image_size; T *out_ptr = output_ptr + b * batch_size + c * image_size; - memcpy(out_ptr, in_ptr, image_size * sizeof(float)); + memcpy(out_ptr, in_ptr, image_size * sizeof(T)); } } @@ -102,6 +102,8 @@ class ChannelShuffleOp : public Operation { void RegisterChannelShuffle(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ChannelShuffle", ChannelShuffleOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp); diff --git a/mace/ops/common/lstm.cc b/mace/ops/common/lstm.cc deleted file mode 100644 index cde148e1560168b7ddd9138a7fb4847663bc9de2..0000000000000000000000000000000000000000 --- a/mace/ops/common/lstm.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Details are in -// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164 - -#include "mace/ops/common/lstm.h" -#include "mace/utils/math.h" - -namespace mace { -namespace ops { - -void LSTMNonlinearKernel(const OpContext *context, - const float *input_data, - const float *prev_data, - const float *scale_data, - const float *params_data, - bool embed_scales, - index_t params_stride, - index_t cell_dim, - float *output_cell, - float *output_data) { - float i_scale = (embed_scales && scale_data) ? scale_data[0] : 1.0f; - float f_scale = (embed_scales && scale_data) ? scale_data[1] : 1.0f; - float o_scale = (embed_scales && scale_data) ? scale_data[2] : 1.0f; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { - if (prev_data == nullptr) { - for (index_t c = start; c < end; c += step) { - float i_part = input_data[c]; - float c_part = input_data[c + 2 * cell_dim]; - float o_part = input_data[c + 3 * cell_dim]; - float w_oc = params_data[c + params_stride * 2]; - float i_t = ScalarSigmoid(i_part); - float c_t = i_t * i_scale * std::tanh(c_part); - float o_t = ScalarSigmoid(o_part + w_oc * c_t); - float m_t = o_t * o_scale * std::tanh(c_t); - output_cell[c] = c_t; - output_data[c] = m_t; - } - } else { - for (index_t c = start; c < end; c += step) { - float i_part = input_data[c]; - float f_part = input_data[c + cell_dim]; - float c_part = input_data[c + 2 * cell_dim]; - float o_part = input_data[c + 3 * cell_dim]; - float c_prev = prev_data[c]; - float w_ic = params_data[c]; - float w_fc = params_data[c + params_stride]; - float w_oc = params_data[c + params_stride * 2]; - float i_t = ScalarSigmoid(i_part + w_ic * c_prev); - float f_t = ScalarSigmoid(f_part + w_fc * c_prev); - float c_t = - f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part); - float o_t = ScalarSigmoid(o_part + w_oc * c_t); - float m_t = o_t * o_scale * std::tanh(c_t); - output_cell[c] = c_t; - output_data[c] = m_t; - } - } - }, 0, cell_dim, 1); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/common/lstm.h b/mace/ops/common/lstm.h index a22094e59abcc3b4e7331e7103ad12a49229786d..87377a84661fc9ded497ea7bea45331bfb306393 100644 --- a/mace/ops/common/lstm.h +++ b/mace/ops/common/lstm.h @@ -12,25 +12,77 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Details are in +// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164 + + #ifndef MACE_OPS_COMMON_LSTM_H_ #define MACE_OPS_COMMON_LSTM_H_ #include "mace/core/ops/op_context.h" #include "mace/core/types.h" +#include "mace/utils/math.h" namespace mace { namespace ops { -void LSTMNonlinearKernel(const OpContext *opContext, - const float *input_data, - const float *prev_data, - const float *scale_data, - const float *params_data, +template +void LSTMNonlinearKernel(const OpContext *context, + const T *input_data, + const T *prev_data, + const T *scale_data, + const T *params_data, bool embed_scales, index_t params_stride, index_t cell_dim, - float *output_cell, - float *output_data); + T *output_cell, + T *output_data) { + float i_scale = + (embed_scales && scale_data) ? static_cast(scale_data[0]) : 1.0f; + float f_scale = + (embed_scales && scale_data) ? static_cast(scale_data[1]) : 1.0f; + float o_scale = + (embed_scales && scale_data) ? static_cast(scale_data[2]) : 1.0f; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (prev_data == nullptr) { + for (index_t c = start; c < end; c += step) { + float i_part = input_data[c]; + float c_part = input_data[c + 2 * cell_dim]; + float o_part = input_data[c + 3 * cell_dim]; + float w_oc = params_data[c + params_stride * 2]; + float i_t = ScalarSigmoid(i_part); + float c_t = i_t * i_scale * std::tanh(c_part); + float o_t = ScalarSigmoid(o_part + w_oc * c_t); + float m_t = o_t * o_scale * std::tanh(c_t); + output_cell[c] = c_t; + output_data[c] = m_t; + } + } else { + for (index_t c = start; c < end; c += step) { + float i_part = input_data[c]; + float f_part = input_data[c + cell_dim]; + float c_part = input_data[c + 2 * cell_dim]; + float o_part = input_data[c + 3 * cell_dim]; + float c_prev = prev_data[c]; + float w_ic = params_data[c]; + float w_fc = params_data[c + params_stride]; + float w_oc = params_data[c + params_stride * 2]; + float i_t = ScalarSigmoid(i_part + w_ic * c_prev); + float f_t = ScalarSigmoid(f_part + w_fc * c_prev); + float c_t = + f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part); + float o_t = ScalarSigmoid(o_part + w_oc * c_t); + float m_t = o_t * o_scale * std::tanh(c_t); + output_cell[c] = c_t; + output_data[c] = m_t; + } + } + }, 0, cell_dim, 1); +} } // namespace ops } // namespace mace diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h index 8ff72df6cdd99d4969622f952ccd452f0fa89fa1..052fc0edcb4263cfa46c922ad4d998594f4651c7 100644 --- a/mace/ops/common/transpose.h +++ b/mace/ops/common/transpose.h @@ -26,10 +26,10 @@ namespace mace { namespace ops { -template +template void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, - const T *input, - T *output, + const SrcT *input, + DstT *output, const index_t height, const index_t width) { index_t image_size = height * width; @@ -50,11 +50,11 @@ void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, } template<> -inline void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, - const float *input, - float *output, - const index_t height, - const index_t width) { +inline void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, + const float *input, + float *output, + const index_t height, + const index_t width) { index_t image_size = height * width; thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { @@ -91,10 +91,10 @@ inline void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, }, 0, height, 1); } -template +template void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, - const T *input, - T *output, + const SrcT *input, + DstT *output, const index_t height, const index_t width) { index_t image_size = height * width; @@ -115,11 +115,11 @@ void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, } template<> -inline void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, - const float *input, - float *output, - const index_t height, - const index_t width) { +inline void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, + const float *input, + float *output, + const index_t height, + const index_t width) { index_t image_size = height * width; thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { @@ -155,15 +155,15 @@ inline void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, }, 0, height, 1); } -template +template MaceStatus Transpose(utils::ThreadPool *thread_pool, - const T *input, + const SrcT *input, const std::vector &input_shape, const std::vector &dst_dims, - T *output) { + DstT *output) { MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || - (input_shape.size() == 3 && dst_dims.size() == 3) || - (input_shape.size() == 4 && dst_dims.size() == 4), + (input_shape.size() == 3 && dst_dims.size() == 3) || + (input_shape.size() == 4 && dst_dims.size() == 4), "Only support 2D, 3D or 4D transpose"); std::vector output_shape; @@ -220,7 +220,6 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool, index_t height = input_shape[1]; index_t width = input_shape[2]; index_t channel = input_shape[3]; - size_t channel_raw_size = channel * sizeof(T); index_t stride_i = height; index_t stride_j = width; index_t tile_size = std::max(static_cast(1), @@ -232,9 +231,11 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool, index_t end_j = std::min(j + tile_size, width); for (index_t tile_i = i; tile_i < end_i; ++tile_i) { for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - memcpy(output + (tile_j * stride_i + tile_i) * channel, - input + (tile_i * stride_j + tile_j) * channel, - channel_raw_size); + auto output_ptr = output + (tile_j * stride_i + tile_i) * channel; + auto input_ptr = input + (tile_i * stride_j + tile_j) * channel; + for (index_t k = 0; k < channel; ++k) { + output_ptr[k] = input_ptr[k]; + } } } } @@ -296,14 +297,15 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool, } }, 0, batch, 1, 0, height, tile_size, 0, width, tile_size); } else if (dst_dims == std::vector{1, 0, 2}) { - size_t width_raw_size = width * sizeof(T); thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, index_t start1, index_t end1, index_t step1) { for (int i = start0; i < end0; i += step0) { for (int j = start1; j < end1; j += step1) { - memcpy(output + (j * batch + i) * width, - input + (i * height + j) * width, - width_raw_size); + auto output_ptr = output + (j * batch + i) * width; + auto input_ptr = input + (i * height + j) * width; + for (index_t k = 0; k < width; ++k) { + output_ptr[k] = input_ptr[k]; + } } } }, 0, batch, 1, 0, height, 1); diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 65f05fdc63418d6a3e31cecd9700f6dd2055a02e..f793b68ffcc703b8ae34bca91e0611210f45da6b 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -225,6 +225,8 @@ class ConcatOp : public ConcatOpBase { void RegisterConcat(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Concat", ConcatOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, DeviceType::CPU, int32_t); diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 83da3f85c7185f2004248e5cd2ce3697c1ce58b1..d40770ea40475810bfdd2c5bb8450d4101c0ad08 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -53,15 +53,16 @@ namespace ops { template class Conv2dOp; -template<> -class Conv2dOp : public ConvPool2dOpBase { +template +class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), activation_delegator_( delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + T, kCpuImplType), delegator::ActivationParam( ops::StringToActivationType( Operation::GetOptionalArg("activation", @@ -71,7 +72,7 @@ class Conv2dOp : public ConvPool2dOpBase { 0.0f)))), bias_add_delegator_(delegator::BiasAdd::Create( context->workspace(), - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -81,9 +82,8 @@ class Conv2dOp : public ConvPool2dOpBase { Tensor *output = this->Output(OUTPUT); if (conv2d_delegator_ == nullptr) { - std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, General); - if (MACE_CPU_IMPL_TYPE == NEON) { + auto tag = MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, T, kCpuImplType); + if (kCpuImplType == NEON) { // the following params are used to decide which conv delegator to use const index_t stride_h = strides_[0]; const index_t stride_w = strides_[1]; @@ -98,63 +98,63 @@ class Conv2dOp : public ConvPool2dOpBase { // We do not support changeable filter for now. if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K1x1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K1x1); } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { if (input_channels >= 8 && channels >= 8) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3Winograd); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K3x3Winograd); } else { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S1); } } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S2); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S2); } else if (filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K5x5S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K5x5S1); } else if (filter_h == 7 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K7x7S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K7x7S1); } else if (filter_h == 7 && filter_w == 7 && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K7x7S2); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K7x7S2); } else if (filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K7x7S3); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K7x7S3); } else if (filter_h == 1 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K1x7S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K1x7S1); } else if (filter_h == 7 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K7x1S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K7x1S1); } else if (filter_h == 1 && filter_w == 15 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K1x15S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K1x15S1); } else if (filter_h == 15 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K15x1S1); + tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T, + kCpuImplType, K15x1S1); } } delegator::Conv2dParam param(strides_, dilations_, @@ -497,8 +497,8 @@ class Conv2dOp : public ConvPool2dOpBase { #endif // MACE_ENABLE_OPENCL void RegisterConv2D(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 5be823453ebd852ae24edbcdd1a33fa2893af03e..bc418f2b67671deb4945f0856dbb69491273f54b 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -100,8 +100,7 @@ class CropOp : public Operation { input_data + (b + offsets[0]) * in_img_size + (c + offsets[1]) * in_hw + (h + offsets[2]) * input_shape[3] + offsets[3]; - memcpy(out_ptr, in_ptr_bch, - output_shape[3] * sizeof(T)); + memcpy(out_ptr, in_ptr_bch, output_shape[3] * sizeof(T)); } } } @@ -134,8 +133,8 @@ class CropOp : public Operation { #endif // MACE_ENABLE_OPENCL void RegisterCrop(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Crop", CropOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Crop", CropOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp); MACE_REGISTER_OP_CONDITION( op_registry, diff --git a/mace/ops/cumsum.cc b/mace/ops/cumsum.cc index b1cb58f0b268da6df2b98397a3a4d005d7706f01..41230ea25b3880694ce370096d9fd51c636984d7 100644 --- a/mace/ops/cumsum.cc +++ b/mace/ops/cumsum.cc @@ -143,8 +143,8 @@ class CumsumOp : public Operation { }; void RegisterCumsum(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 1e68449bdf1b36d9cbf7566a19f03a3194821069..a15395d2fc9c55009d650cb314d2e30abf3ced6c 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -46,20 +46,21 @@ const std::vector kDeconv2dStrides = {1, 1}; template class Deconv2dOp; -template<> -class Deconv2dOp : public Deconv2dOpBase { +template +class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context), activation_delegator_( delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + T, kCpuImplType), delegator::ActivationParam(activation_, relux_max_limit_, leakyrelu_coefficient_))), bias_add_delegator_(delegator::BiasAdd::Create( context->workspace(), - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -80,11 +81,9 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - if (deconv2d_delegator_ == nullptr) { - std::string tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, General); - if (MACE_CPU_IMPL_TYPE == NEON) { + auto tag = MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, T, kCpuImplType); + if (kCpuImplType == NEON) { const index_t kernel_h = filter->dim(2); const index_t kernel_w = filter->dim(3); @@ -104,23 +103,23 @@ class Deconv2dOp : public Deconv2dOpBase { strides_[0] == strides_[1] && strides_[0] == 2; if (use_neon_2x2_s1) { - tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K2x2S1); + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T, + kCpuImplType, K2x2S1); } else if (use_neon_2x2_s2) { - tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K2x2S2); + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T, + kCpuImplType, K2x2S2); } else if (use_neon_3x3_s1) { - tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S1); + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S1); } else if (use_neon_3x3_s2) { - tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S2); + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S2); } else if (use_neon_4x4_s1) { - tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K4x4S1); + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T, + kCpuImplType, K4x4S1); } else if (use_neon_4x4_s2) { - tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K4x4S2); + tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T, + kCpuImplType, K4x4S2); } } delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_, @@ -236,8 +235,8 @@ class Deconv2dOp : public Deconv2dOpBase { #endif // MACE_ENABLE_OPENCL void RegisterDeconv2D(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp); #ifdef MACE_ENABLE_OPENCL MACE_REGISTER_OP_CONDITION( diff --git a/mace/ops/delegator/conv_2d.h b/mace/ops/delegator/conv_2d.h index 9ff85f6dacd1123cfbd02a12f90990c6750d5c37..7b8b735fede16bb2720146694dafcefd41a0bd45 100644 --- a/mace/ops/delegator/conv_2d.h +++ b/mace/ops/delegator/conv_2d.h @@ -27,7 +27,6 @@ namespace mace { namespace ops { enum ConvType { - General, K1x1, K1x7S1, K7x1S1, diff --git a/mace/ops/delegator/deconv_2d.h b/mace/ops/delegator/deconv_2d.h index 856f3595bcd37b86dc3c65d2c48a70a4901f3b47..45401551540af24f4e2585334fb3c3346f1deefc 100644 --- a/mace/ops/delegator/deconv_2d.h +++ b/mace/ops/delegator/deconv_2d.h @@ -21,12 +21,12 @@ #include "mace/core/ops/op_context.h" #include "mace/core/ops/op_delegator.h" #include "mace/core/registry/op_delegator_registry.h" +#include "mace/ops/common/conv_pool_2d_util.h" namespace mace { namespace ops { enum DeconvType { - General, K2x2S1, K2x2S2, K3x3S1, diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 9484fdde2964952389e3402d2ffb7323076a153e..865dc4f48c1502a3cf3d2e04d2f68c6f88df71c0 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -28,8 +28,8 @@ namespace ops { template class DepthToSpaceOp; -template<> -class DepthToSpaceOp : public Operation { +template +class DepthToSpaceOp : public Operation { public: explicit DepthToSpaceOp(OpConstructContext *context) : Operation(context), @@ -59,8 +59,8 @@ class DepthToSpaceOp : public Operation { Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard output_guard(output); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < output_depth; ++d) { @@ -188,6 +188,8 @@ class DepthToSpaceOp : public Operation { void RegisterDepthToSpace(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DepthToSpace", DepthToSpaceOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "DepthToSpace", diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 23cf8e046fa82edbab28cbddfb57a99d721c61ac..a6372f514e4d72ea10ad36c7f996585ee3104d9a 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -63,20 +63,21 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase { template class DepthwiseConv2dOp; -template<> -class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { +template +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context), activation_delegator_( delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + T, kCpuImplType), delegator::ActivationParam(activation_, relux_max_limit_, leakyrelu_coefficient_))), bias_add_delegator_(delegator::BiasAdd::Create( context->workspace(), - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -93,9 +94,9 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { MACE_CHECK_NOTNULL(output); if (depthwise_conv2d_delegator_ == nullptr) { - std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, - REF, General); - if (MACE_CPU_IMPL_TYPE == NEON) { + auto tag = MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, + T, ImplType::REF); + if (kCpuImplType == NEON) { const index_t filter_h = filter->dim(2); const index_t filter_w = filter->dim(3); const index_t stride_h = strides_[0]; @@ -104,13 +105,13 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { const index_t dilation_w = dilations_[1]; if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S1); + tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S1); } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1) { - tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S2); + tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S2); } } delegator::Conv2dParam param(strides_, dilations_, @@ -347,7 +348,8 @@ class DepthwiseConv2dOp #ifdef MACE_ENABLE_OPENCL template<> -class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { +class DepthwiseConv2dOp : + public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { @@ -402,6 +404,8 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { void RegisterDepthwiseConv2d(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index f09261d6541b4b771baa1a2fe1ac85fad49e5b7d..5e1311076f27cfc8beff232dcae591aa818545c4 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -44,8 +44,8 @@ const std::vector kDepthwiseStrides = {1, 1}; template class DepthwiseDeconv2dOp; -template<> -class DepthwiseDeconv2dOp +template +class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) @@ -53,12 +53,13 @@ class DepthwiseDeconv2dOp activation_delegator_( delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, + T, kCpuImplType), delegator::ActivationParam(activation_, relux_max_limit_, leakyrelu_coefficient_))), bias_add_delegator_(delegator::BiasAdd::Create( context->workspace(), - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -75,7 +76,7 @@ class DepthwiseDeconv2dOp bool is_depthwise = group_ == in_channels; if (depthwise_deconv2d_delegator_ == nullptr) { - if (MACE_CPU_IMPL_TYPE == NEON) { + if (kCpuImplType == NEON) { const index_t kernel_h = filter->dim(2); const index_t kernel_w = filter->dim(3); bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && @@ -88,20 +89,20 @@ class DepthwiseDeconv2dOp strides_[0] == strides_[1] && strides_[0] == 2; if (is_depthwise) { - std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, General); + auto tag = MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, T, + kCpuImplType); if (use_neon_3x3_s1) { - tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S1); + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S1); } else if (use_neon_3x3_s2) { - tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S2); + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S2); } else if (use_neon_4x4_s1) { - tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K4x4S1); + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T, + kCpuImplType, K4x4S1); } else if (use_neon_4x4_s2) { - tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K4x4S2); + tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T, + kCpuImplType, K4x4S2); } delegator::DepthwiseDeconv2dParam param(strides_, kDepthwiseStrides, paddings_, padding_type_, @@ -109,20 +110,20 @@ class DepthwiseDeconv2dOp depthwise_deconv2d_delegator_ = delegator::DepthwiseDeconv2d::Create( context->workspace(), tag, param); } else { - std::string tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, General); + auto tag = MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, T, + kCpuImplType); if (use_neon_3x3_s1) { - tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S1); + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S1); } else if (use_neon_3x3_s2) { - tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K3x3S2); + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T, + kCpuImplType, K3x3S2); } else if (use_neon_4x4_s1) { - tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K4x4S1); + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T, + kCpuImplType, K4x4S1); } else if (use_neon_4x4_s2) { - tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, - MACE_CPU_IMPL_TYPE, K4x4S2); + tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T, + kCpuImplType, K4x4S2); } delegator::GroupDeconv2dParam param(strides_, kDepthwiseStrides, paddings_, padding_type_, @@ -218,6 +219,8 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "DepthwiseDeconv2d", + DepthwiseDeconv2dOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp); } diff --git a/mace/ops/dynamic_lstm.cc b/mace/ops/dynamic_lstm.cc index 014f23c00d41fe283bc21c23d17bb5b53825fdee..b599c3440c129f96b10a7ae2e389908e634a2f63 100644 --- a/mace/ops/dynamic_lstm.cc +++ b/mace/ops/dynamic_lstm.cc @@ -75,7 +75,7 @@ class DynamicLSTMOp : public Operation { Operation::GetRepeatedArgs("out_cache_indexes")), gemv_(delegator::Gemv::Create( context->workspace(), - MACE_DELEGATOR_KEY(Gemv, CPU, T, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} inline void Validate() { @@ -107,14 +107,14 @@ class DynamicLSTMOp : public Operation { ") should be greater than zero."); } - void UpdateCell(float *cell_data, + void UpdateCell(T *cell_data, const index_t cell_dim, const float scale) { if (std::abs(scale - 1.f) < 1e-6) return; const index_t rounds = cell_dim / 4; for (index_t i = 0; i < rounds * 4; i += 4) { -#ifdef MACE_ENABLE_NEON +#if defined(MACE_ENABLE_NEON) and not defined(MACE_ENABLE_BFLOAT16) float32x4_t in_vec = vld1q_f32(cell_data + i); float32x4_t scale_vec = vdupq_n_f32(scale); in_vec = vmulq_f32(in_vec, scale_vec); @@ -130,18 +130,18 @@ class DynamicLSTMOp : public Operation { } } - void CopyAndUpdateCell(float *src_data, + void CopyAndUpdateCell(T *src_data, const index_t cell_dim, const float scale, - float *cell_data) { + T *cell_data) { if (std::abs(scale - 1.f) < 1e-6) { - memcpy(cell_data, src_data, cell_dim * sizeof(float)); + memcpy(cell_data, src_data, cell_dim * sizeof(T)); return; } const index_t rounds = cell_dim / 4; for (index_t i = 0; i < rounds * 4; i += 4) { -#ifdef MACE_ENABLE_NEON +#if defined(MACE_ENABLE_NEON) and not defined(MACE_ENABLE_BFLOAT16) float32x4_t in_vec = vld1q_f32(src_data + i); float32x4_t scale_vec = vdupq_n_f32(scale); in_vec = vmulq_f32(in_vec, scale_vec); @@ -222,49 +222,54 @@ class DynamicLSTMOp : public Operation { << " output_dim: " << output_dim; const index_t affine_a_in_size = - PadAlignSize(affine_a_in_dim * sizeof(float)); + PadAlignSize(affine_a_in_dim * sizeof(T)); const index_t affine_a_out_size = - PadAlignSize(affine_a_out_dim * sizeof(float)); + PadAlignSize(affine_a_out_dim * sizeof(T)); const index_t affine_b_in_size = - PadAlignSize(affine_b_in_dim * sizeof(float)); + PadAlignSize(affine_b_in_dim * sizeof(T)); const index_t affine_b_out_size = - PadAlignSize(affine_b_out_dim * sizeof(float)); + PadAlignSize(affine_b_out_dim * sizeof(T)); const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_); const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_); const index_t out_buf_size = - PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float)); + PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(T)); const index_t cell_buf_size = - PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(float)); + PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(T)); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(affine_a_in_size + affine_a_out_size + affine_b_in_size + affine_b_out_size + out_buf_size + cell_buf_size); - Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT); + Tensor prev_out_buf(scratch->Scratch(out_buf_size), DataTypeToEnum::v()); prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_}); - float *prev_out_buf_data = prev_out_buf.mutable_data(); + T *prev_out_buf_data = prev_out_buf.mutable_data(); - Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT); + Tensor prev_cell_buf( + scratch->Scratch(cell_buf_size), DataTypeToEnum::v()); prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_}); - float *prev_cell_buf_data = prev_cell_buf.mutable_data(); + T *prev_cell_buf_data = prev_cell_buf.mutable_data(); - Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT); + Tensor affine_a_in( + scratch->Scratch(affine_a_in_size), DataTypeToEnum::v()); affine_a_in.Reshape({1, affine_a_in_dim}); - float *affine_a_in_data = affine_a_in.mutable_data(); + T *affine_a_in_data = affine_a_in.mutable_data(); - Tensor affine_a_out(scratch->Scratch(affine_a_out_size), DT_FLOAT); + Tensor affine_a_out( + scratch->Scratch(affine_a_out_size), DataTypeToEnum::v()); affine_a_out.Reshape({1, affine_a_out_dim}); - float *affine_a_out_data = affine_a_out.mutable_data(); + T *affine_a_out_data = affine_a_out.mutable_data(); - Tensor affine_b_in(scratch->Scratch(affine_b_in_size), DT_FLOAT); + Tensor affine_b_in( + scratch->Scratch(affine_b_in_size), DataTypeToEnum::v()); affine_b_in.Reshape({1, affine_b_in_dim}); - float *affine_b_in_data = affine_b_in.mutable_data(); + T *affine_b_in_data = affine_b_in.mutable_data(); - Tensor affine_b_out(scratch->Scratch(affine_b_out_size), DT_FLOAT); + Tensor affine_b_out( + scratch->Scratch(affine_b_out_size), DataTypeToEnum::v()); affine_b_out.Reshape({1, affine_b_out_dim}); - float *affine_b_out_data = affine_b_out.mutable_data(); + T *affine_b_out_data = affine_b_out.mutable_data(); Tensor *output = this->Output(OUTPUT); Tensor *out_cache = this->Output(OUT_CACHE); @@ -293,31 +298,31 @@ class DynamicLSTMOp : public Operation { Tensor::MappingGuard out_cache_guard(out_cache); Tensor::MappingGuard cell_cache_guard(cell_cache); - const float *input_data = input->data(); - const float *prev_out_data = prev_out->data(); - const float *prev_cell_data = prev_cell->data(); - const float *lstm_params_data = lstm_params->data(); - float *output_data = output->mutable_data(); - float *out_cache_data = out_cache->mutable_data(); - float *cell_cache_data = cell_cache->mutable_data(); + const T *input_data = input->data(); + const T *prev_out_data = prev_out->data(); + const T *prev_cell_data = prev_cell->data(); + const T *lstm_params_data = lstm_params->data(); + T *output_data = output->mutable_data(); + T *out_cache_data = out_cache->mutable_data(); + T *cell_cache_data = cell_cache->mutable_data(); for (int b = 0; b < batch; ++b) { memcpy(prev_out_buf_data, prev_out_data + b * out_buf_chunk * prev_out_dim_, - sizeof(float) * out_buf_chunk * prev_out_dim_); + sizeof(T) * out_buf_chunk * prev_out_dim_); memcpy(prev_cell_buf_data, prev_cell_data + b * cell_buf_chunk * prev_cell_dim_, - sizeof(float) * cell_buf_chunk * prev_cell_dim_); + sizeof(T) * cell_buf_chunk * prev_cell_dim_); for (index_t i = 0; i < out_chunk; ++i) { - const float *input_ptr = + const T *input_ptr = input_data + (b * chunk + forward_indexes_[i]) * input_dim; - float *output_ptr = output_data + (b * out_chunk + i) * output_dim; + T *output_ptr = output_data + (b * out_chunk + i) * output_dim; // Append - memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float)); + memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(T)); memcpy(affine_a_in_data + input_dim, prev_out_buf_data + i % out_buf_chunk * prev_out_dim_, - prev_out_dim_ * sizeof(float)); + prev_out_dim_ * sizeof(T)); // Affine gemv_->Compute(context, weights_a, @@ -330,11 +335,11 @@ class DynamicLSTMOp : public Operation { false, &affine_a_out); // Prepare LSTMNonlinear input and output pointer - float *lstm_cell_ptr = + T *lstm_cell_ptr = prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_; - float *curr_cell_ptr = lstm_cell_ptr; + T *curr_cell_ptr = lstm_cell_ptr; // LSTMNonlinear - LSTMNonlinearKernel(context, + LSTMNonlinearKernel(context, affine_a_out_data, lstm_cell_ptr, nullptr, @@ -359,9 +364,9 @@ class DynamicLSTMOp : public Operation { // Output memcpy(output_ptr, affine_b_out_data, - output_dim * sizeof(float)); + output_dim * sizeof(T)); // Update - float *curr_out_ptr = + T *curr_out_ptr = prev_out_buf_data + i % out_buf_chunk * prev_out_dim_; CopyAndUpdateCell(affine_b_out_data + prev_out_offset_, prev_out_dim_, @@ -371,22 +376,22 @@ class DynamicLSTMOp : public Operation { for (size_t k = 0; k < out_cache_indexes_.size(); ++k) { if (i == out_cache_indexes_[k]) { const index_t idx = b * out_buf_chunk + k; - float *out_cache_ptr = + T *out_cache_ptr = out_cache_data + idx * prev_out_dim_; memcpy(out_cache_ptr, curr_out_ptr, - sizeof(float) * prev_out_dim_); + sizeof(T) * prev_out_dim_); } } for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) { if (i == cell_cache_indexes_[k]) { const index_t idx = b * cell_buf_chunk + k; - float *cell_cache_ptr = + T *cell_cache_ptr = cell_cache_data + idx * prev_cell_dim_; memcpy(cell_cache_ptr, curr_cell_ptr, - sizeof(float) * prev_cell_dim_); + sizeof(T) * prev_cell_dim_); } } } @@ -416,6 +421,8 @@ class DynamicLSTMOp : public Operation { void RegisterDynamicLSTM(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "DynamicLSTM", DynamicLSTMOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index e4d5a74b9bf518e10de8d499924733e38edafff6..af447c941288edef181eab6b916022ff16d34af0 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1073,7 +1073,7 @@ class EltwiseOp : public Operation { "scalar_input_index", 1)), eltwise_delegator_(delegator::Eltwise::Create( context->workspace(), - MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, kCpuImplType), delegator::EltwiseParam( static_cast( Operation::GetOptionalArg( @@ -1175,8 +1175,8 @@ class EltwiseOp : public Operation { #endif // MACE_ENABLE_OPENCL void RegisterEltwise(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, int32_t); diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc index cc3426c3cab7e27a3cb4965d362c147acaf7a428..99005702927fb8dd4039b9d57301468bf6331c44 100644 --- a/mace/ops/expand_dims.cc +++ b/mace/ops/expand_dims.cc @@ -57,6 +57,8 @@ class ExpandDimsOp : public Operation { void RegisterExpandDims(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, DeviceType::CPU, int32_t); diff --git a/mace/ops/extract_pooling.cc b/mace/ops/extract_pooling.cc index 765fc58ebc6b4fb2c92286cc9651e2c239e04649..82699d4352ab4dc1e01155ebd35749e49813b8ab 100644 --- a/mace/ops/extract_pooling.cc +++ b/mace/ops/extract_pooling.cc @@ -89,15 +89,16 @@ class ExtractPoolingOp : public Operation { output_shape[dim_size - 2] = output_chunk; MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float)); + const index_t extract_out_size = PadAlignSize(output_dim * sizeof(T)); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(extract_out_size); - Tensor extract_out(scratch->Scratch(extract_out_size), DT_FLOAT); + Tensor extract_out( + scratch->Scratch(extract_out_size), DataTypeToEnum::v()); extract_out.Reshape({1, output_dim}); extract_out.Clear(); - float *extract_out_data = extract_out.mutable_data(); + T *extract_out_data = extract_out.mutable_data(); Tensor::MappingGuard guard_input(input); Tensor::MappingGuard guard_output(output); @@ -162,7 +163,7 @@ class ExtractPoolingOp : public Operation { }, 0, input_dim, 1); } memcpy(output_data + (b * output_chunk + i) * output_dim, - extract_out_data, output_dim * sizeof(float)); + extract_out_data, output_dim * sizeof(T)); } } @@ -180,6 +181,8 @@ class ExtractPoolingOp : public Operation { void RegisterExtractPooling(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ExtractPooling", ExtractPoolingOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ExtractPooling", ExtractPoolingOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/fill.cc b/mace/ops/fill.cc index 0917674b18c854609617e4e6690c74542b23dc7e..46f772dd39df2638869bc0cc4ea44d98391f75c4 100644 --- a/mace/ops/fill.cc +++ b/mace/ops/fill.cc @@ -22,8 +22,8 @@ namespace ops { template class FillOp; -template <> -class FillOp : public Operation { +template +class FillOp : public Operation { public: explicit FillOp(OpConstructContext *context) : Operation(context) {} @@ -46,11 +46,11 @@ class FillOp : public Operation { } Tensor::MappingGuard value_guard(value); - const float *value_data = value->data(); + const T *value_data = value->data(); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); Tensor::MappingGuard output_guard(output); - float *output_data = output->mutable_data(); + T *output_data = output->mutable_data(); std::fill(output_data, output_data + output->size(), *value_data); @@ -65,6 +65,7 @@ class FillOp : public Operation { void RegisterFill(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Fill", FillOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Fill", FillOp, DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index b037488837e679b8fbf47a8363f5e17c9d4bca42..f0c83a25b6577ef633346d75d7d38e2d2bac107a 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -56,20 +56,20 @@ class FullyConnectedOpBase : public Operation { template class FullyConnectedOp; -template<> -class FullyConnectedOp : public FullyConnectedOpBase { +template +class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context), activation_delegator_(delegator::Activation::Create( context->workspace(), - MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType), delegator::ActivationParam(activation_, relux_max_limit_, leakyrelu_coefficient_))), gemv_(delegator::Gemv::Create( context->workspace(), - MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -127,7 +127,7 @@ class FullyConnectedOp : FullyConnectedOpBase(context), gemv_(delegator::Gemv::Create( context->workspace(), - MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -226,6 +226,8 @@ class FullyConnectedOp : public FullyConnectedOpBase { void RegisterFullyConnected(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "FullyConnected", FullyConnectedOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "FullyConnected", diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc index a112d91f94a24b9e8be455e727e8cf87f8c46e6c..eb97000b54998b78fc7c2eb2217b0fd6b2a0b776 100644 --- a/mace/ops/gather.cc +++ b/mace/ops/gather.cc @@ -89,6 +89,7 @@ class GatherOp : public Operation { void RegisterGather(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Gather", GatherOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Gather", GatherOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Gather", GatherOp, diff --git a/mace/ops/identity.cc b/mace/ops/identity.cc index ac915cd848558300b8cd59770f663e0a2e856727..ff196e0a1e7ed4a6890a2446149d1f5d3398fc9b 100644 --- a/mace/ops/identity.cc +++ b/mace/ops/identity.cc @@ -36,6 +36,8 @@ class IdentityOp : public Operation { void RegisterIdentity(OpRegistry *op_registry) { MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::CPU); MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, DeviceType::CPU, int32_t); #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/ifdefined.cc b/mace/ops/ifdefined.cc index 84a2831609bec4a4c5ef455834f29812f30848ec..f3a9226a60a77d2857cb231ca9bc6578c7a98e59 100644 --- a/mace/ops/ifdefined.cc +++ b/mace/ops/ifdefined.cc @@ -166,6 +166,7 @@ class IfDefinedOp : public Operation { void RegisterIfDefined(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "IfDefined", IfDefinedOp, DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index f29056fec26989b363e532440da982c55866e1eb..e9ffad3e378c28ca3b0d4ec6818ff11858dceddb 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -105,6 +105,8 @@ class InferConv2dShapeOp : public Operation { void RegisterInferConv2dShape(OpRegistry *op_registry) { MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", InferConv2dShapeOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU); MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", InferConv2dShapeOp, DeviceType::CPU, int32_t); #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/kaldi_batch_norm.cc b/mace/ops/kaldi_batch_norm.cc index ed05064faabe77db2feeef4f1fcb24a35fb5970c..f9cf63530d481ff65a92aa5ae141efb9c883f26f 100644 --- a/mace/ops/kaldi_batch_norm.cc +++ b/mace/ops/kaldi_batch_norm.cc @@ -28,8 +28,8 @@ namespace ops { template class KaldiBatchNormOp; -template <> -class KaldiBatchNormOp : public Operation { +template +class KaldiBatchNormOp : public Operation { public: explicit KaldiBatchNormOp(OpConstructContext *context) : Operation(context), @@ -40,13 +40,13 @@ class KaldiBatchNormOp : public Operation { test_mode_(static_cast( Operation::GetOptionalArg("test_mode", 0))) {} - void CalculateMeanVar(const float *input_data, + void CalculateMeanVar(const T *input_data, index_t length, index_t stride, float mean_scale, float var_scale, - float *mean_data, - float *var_data) { + T *mean_data, + T *var_data) { float mean_value = 0.f; float var_value = 0.f; for (index_t i = 0; i < length; ++i) { @@ -84,8 +84,8 @@ class KaldiBatchNormOp : public Operation { Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); - const float *input_data = input->data(); - float *output_data = output->mutable_data(); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); @@ -102,8 +102,8 @@ class KaldiBatchNormOp : public Operation { && scale->size() == block_dim_); Tensor::MappingGuard scale_guard(scale); Tensor::MappingGuard offset_guard(offset); - const float *scale_data = scale->data(); - const float *offset_data = offset->data(); + const T *scale_data = scale->data(); + const T *offset_data = offset->data(); thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, index_t start1, index_t end1, index_t step1) { @@ -116,18 +116,18 @@ class KaldiBatchNormOp : public Operation { }, 0, num_rows, 1, 0, block_dim_, 1); } else { const index_t buf_size = - PadAlignSize(block_dim_ * sizeof(float)); + PadAlignSize(block_dim_ * sizeof(T)); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(2 * buf_size); - Tensor mean(scratch->Scratch(buf_size), DT_FLOAT); + Tensor mean(scratch->Scratch(buf_size), DataTypeToEnum::v()); mean.Reshape({block_dim_}); - float *mean_data = mean.mutable_data(); + T *mean_data = mean.mutable_data(); - Tensor var(scratch->Scratch(buf_size), DT_FLOAT); + Tensor var(scratch->Scratch(buf_size), DataTypeToEnum::v()); var.Reshape({block_dim_}); - float *var_data = var.mutable_data(); + T *var_data = var.mutable_data(); float var_scale = 1.0f / (target_rms_ * target_rms_); float mean_scale = 1.0f / num_rows; @@ -171,6 +171,8 @@ class KaldiBatchNormOp : public Operation { void RegisterKaldiBatchNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index 2ade126c8e7deba122dddfe4eff19d6b4bbc50bf..742360a8a9b60a8f66f3eb36bfcfe84a0c218f9d 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -24,8 +24,8 @@ namespace ops { template class LocalResponseNormOp; -template<> -class LocalResponseNormOp : public Operation { +template +class LocalResponseNormOp : public Operation { public: explicit LocalResponseNormOp(OpConstructContext *context) : Operation(context), @@ -49,8 +49,8 @@ class LocalResponseNormOp : public Operation { const index_t height = input->dim(2); const index_t width = input->dim(3); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); const index_t image_size = height * width; const index_t batch_size = channels * image_size; @@ -95,6 +95,8 @@ class LocalResponseNormOp : public Operation { void RegisterLocalResponseNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "LocalResponseNorm", LocalResponseNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "LocalResponseNorm", + LocalResponseNormOp, DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/lpnorm.cc b/mace/ops/lpnorm.cc index a5c68a3575931911478461ffd802b16f5e8b79fb..02fc8f0962c20bfd5453f132d0d55f9b8766c3f8 100644 --- a/mace/ops/lpnorm.cc +++ b/mace/ops/lpnorm.cc @@ -35,8 +35,8 @@ namespace ops { template class LpNormOp; -template<> -class LpNormOp : public Operation { +template +class LpNormOp : public Operation { public: explicit LpNormOp(OpConstructContext *context) : Operation(context), @@ -59,8 +59,8 @@ class LpNormOp : public Operation { Tensor::MappingGuard guard_input(input); Tensor::MappingGuard guard_output(output); - const auto *input_data = input->data(); - auto *output_data = output->mutable_data(); + const auto *input_data = input->data(); + auto *output_data = output->mutable_data(); utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); auto outer_loop = std::accumulate(input_shape.begin(), @@ -95,7 +95,8 @@ class LpNormOp : public Operation { for (index_t i = start; i < end; i += step) { auto output_data_base = output_data + inner_loop * i; norm_ptr[i] = std::accumulate(output_data_base, - output_data_base + inner_loop, 0.0f); + output_data_base + inner_loop, + static_cast(0.0f)); norm_ptr[i] = std::pow(norm_ptr[i], power); norm_ptr[i] += 1e-6; } @@ -151,6 +152,8 @@ class LpNormOp : public Operation { void RegisterLpNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "LpNorm", LpNormOp, + DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp); } diff --git a/mace/ops/lstm_nonlinear.cc b/mace/ops/lstm_nonlinear.cc index c975ae62da40b549105bf936653e8ebaa07694c3..ebec6afb3d635c35bc541fa36e7a28d112d2a2bc 100644 --- a/mace/ops/lstm_nonlinear.cc +++ b/mace/ops/lstm_nonlinear.cc @@ -70,27 +70,27 @@ class LSTMNonlinearOp : public Operation { Tensor::MappingGuard input_guard(input); Tensor::MappingGuard params_guard(params); Tensor::MappingGuard output_guard(output); - const float *input_data = input->data(); - const float *params_data = params->data(); - float *output_data = output->mutable_data(); + const T *input_data = input->data(); + const T *params_data = params->data(); + T *output_data = output->mutable_data(); for (int r = 0; r < num_rows; ++r) { - const float *input_row = input_data + r * input_cols; - const float *prev_row = input_row + 4 * cell_dim; - const float *scale_data = + const T *input_row = input_data + r * input_cols; + const T *prev_row = input_row + 4 * cell_dim; + const T *scale_data = embed_scales ? prev_row + cell_dim : nullptr; - float *output_cell = output_data + r * output_dim; - float *output_row = output_cell + cell_dim; - LSTMNonlinearKernel(context, - input_row, - prev_row, - scale_data, - params_data, - embed_scales, - params_stride, - cell_dim, - output_cell, - output_row); + T *output_cell = output_data + r * output_dim; + T *output_row = output_cell + cell_dim; + LSTMNonlinearKernel(context, + input_row, + prev_row, + scale_data, + params_data, + embed_scales, + params_stride, + cell_dim, + output_cell, + output_row); } return MaceStatus::MACE_SUCCESS; @@ -104,6 +104,8 @@ class LSTMNonlinearOp : public Operation { void RegisterLSTMNonlinear(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 75e278708514aa94c1783bde7bd9bd228d46a242..9f989b41bdc78db440d05be2a41b1b7f5191f770 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -92,18 +92,18 @@ class MatMulOpBase : public Operation { template class MatMulOp; -template<> -class MatMulOp : public MatMulOpBase { +template +class MatMulOp : public MatMulOpBase { public: explicit MatMulOp(OpConstructContext *context) : MatMulOpBase(context), gemm_(delegator::Gemm::Create( context->workspace(), - MACE_DELEGATOR_KEY(Gemm, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, T, kCpuImplType), delegator::GemmParam())), gemv_(delegator::Gemv::Create( context->workspace(), - MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType), DelegatorParam())) {} MaceStatus Run(OpContext *context) override { @@ -197,8 +197,8 @@ class MatMulOp : public MatMulOpBase { "bias' dim should be <= 2."); Tensor::MappingGuard bias_guard(bias); Tensor::MappingGuard c_guard(C); - const float *bias_data = bias->data(); - float *c_data = C->mutable_data(); + const T *bias_data = bias->data(); + T *c_data = C->mutable_data(); utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); @@ -599,6 +599,8 @@ class MatMulOp : public MatMulOpBase { void RegisterMatMul(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "MatMul", MatMulOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, diff --git a/mace/ops/mvnorm.cc b/mace/ops/mvnorm.cc index 09d3bb9a3cf0239c68ad857e698c16d5d89439e7..11bcff211d8cbc023f6e1e2ef11aafa146690d81 100644 --- a/mace/ops/mvnorm.cc +++ b/mace/ops/mvnorm.cc @@ -30,8 +30,8 @@ namespace ops { template class MVNormOp; -template<> -class MVNormOp : public Operation { +template +class MVNormOp : public Operation { public: explicit MVNormOp(OpConstructContext *context) : Operation(context), @@ -52,8 +52,8 @@ class MVNormOp : public Operation { Tensor::MappingGuard guard_input(input); Tensor::MappingGuard guard_output(output); - const auto *input_data = input->data(); - auto *output_data = output->mutable_data(); + const auto *input_data = input->data(); + auto *output_data = output->mutable_data(); const auto input_size = input->size(); const auto outer_loop = @@ -71,7 +71,8 @@ class MVNormOp : public Operation { for (index_t i = start; i < end; i += step) { const auto offset = inner_loop * i; mean_ptr[i] = std::accumulate(input_data + offset, - input_data + offset + inner_loop, 0.0f); + input_data + offset + inner_loop, + static_cast(0.0f)); mean_ptr[i] /= inner_loop; } }, 0, outer_loop, 1); @@ -105,7 +106,8 @@ class MVNormOp : public Operation { for (index_t i = start; i < end; i += step) { auto output_data_base = output_data + inner_loop * i; mean_v_ptr[i] = std::accumulate(output_data_base, - output_data_base + inner_loop, 0.0f); + output_data_base + inner_loop, + static_cast(0.0f)); mean_v_ptr[i] = std::pow(mean_v_ptr[i] / inner_loop, 0.5f) + eps_; } }, 0, outer_loop, 1); @@ -169,6 +171,8 @@ class MVNormOp : public Operation { void RegisterMVNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "MVNorm", MVNormOp, + DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp); } diff --git a/mace/ops/one_hot.cc b/mace/ops/one_hot.cc index 77d18bca3b7635b794c29d0b5a21ae7219876fad..8b64b96eaff702225b20701005b58be7459d39e9 100644 --- a/mace/ops/one_hot.cc +++ b/mace/ops/one_hot.cc @@ -39,10 +39,10 @@ class OneHotOpBase : public Operation { int axis_; }; -template +template class OneHotOp; -template +template class OneHotOp : public OneHotOpBase { public: explicit OneHotOp(OpConstructContext *context) : OneHotOpBase(context) {} @@ -81,15 +81,17 @@ class OneHotOp : public OneHotOpBase { if (axis == 1) { for (index_t i = 0; i < batch; ++i) { for (index_t j = 0; j < depth_; ++j) { - output_ptr[i * depth_ + j] = input_ptr[i] == j ? on_value_ : - off_value_; + float input_value = input_ptr[i]; + output_ptr[i * depth_ + j] = + input_value == j ? on_value_ : off_value_; } } } else { for (index_t i = 0; i < depth_; ++i) { for (index_t j = 0; j < batch; ++j) { - output_ptr[i * batch + j] = input_ptr[j] == i ? on_value_ : - off_value_; + float input_value = input_ptr[j]; + output_ptr[i * batch + j] = + input_value == i ? on_value_ : off_value_; } } } @@ -110,7 +112,8 @@ class OneHotOp : public OneHotOpBase { if (left == 0) { for (index_t i = 0; i < length; ++i) { - **output_ptr = **input_ptr == i ? on_value_ : off_value_; + float input_value = **input_ptr; + **output_ptr = input_value == i ? on_value_ : off_value_; ++(*output_ptr); } @@ -130,7 +133,8 @@ class OneHotOp : public OneHotOpBase { if (left == 0) { for (index_t i = 0; i < length; ++i) { - **output_ptr = **input_ptr == test ? on_value_ : off_value_; + float input_value = **input_ptr; + **output_ptr = input_value == test ? on_value_ : off_value_; ++(*output_ptr); ++(*input_ptr); } @@ -144,9 +148,9 @@ class OneHotOp : public OneHotOpBase { } }; - void RegisterOneHot(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index b210f40e87f3acc1712b92acf5ed4d6a7a161e5f..e995ba6c176b27396f34eb2d66c5d76f76956d45 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -200,8 +200,8 @@ class PadOp : public Operation { #endif // MACE_ENABLE_OPENCL void RegisterPad(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Pad", PadOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Pad", PadOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp); } diff --git a/mace/ops/pad_context.cc b/mace/ops/pad_context.cc index 02a8c4250922a59b4d72e273b6e3ad6a82913e76..68d5d7f061ef6b89569bb86e894283fb00c96e34 100644 --- a/mace/ops/pad_context.cc +++ b/mace/ops/pad_context.cc @@ -87,6 +87,8 @@ class PadContextOp : public Operation { void RegisterPadContext(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "PadContext", PadContextOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/pnorm.cc b/mace/ops/pnorm.cc index 588e59745404b9252bda70e6e2ac0ef192a839f5..4a4464e9a0f7ee160c0e0dfcf21baeb7afa1f87a 100644 --- a/mace/ops/pnorm.cc +++ b/mace/ops/pnorm.cc @@ -80,7 +80,7 @@ class PNormOp : public Operation { for (index_t j = start1; j < end1; j += step1) { const T *in_base = input_data + i * input_dim + j * group_size; T *out_base = output_data + i * output_dim_; - T temp_result = 0; + T temp_result = 0.f; for (index_t g = 0; g < group_size; ++g) { T value = (std::fabs(in_base[g]) @@ -99,9 +99,9 @@ class PNormOp : public Operation { for (index_t j = start1; j < end1; j += step1) { const T *in_base = input_data + i * input_dim + j * group_size; T *out_base = output_data + i * output_dim_; - T temp_result = 0; + T temp_result = 0.f; for (index_t g = 0; g < group_size; ++g) { - temp_result += std::abs(in_base[g]);; + temp_result += std::abs(in_base[g]); } out_base[j] = temp_result; } @@ -114,7 +114,7 @@ class PNormOp : public Operation { for (index_t j = start1; j < end1; j += step1) { const T *in_base = input_data + i * input_dim + j * group_size; T *out_base = output_data + i * output_dim_; - T temp_result = 0; + T temp_result = 0.f; for (index_t g = 0; g < group_size; ++g) { temp_result += in_base[g] * in_base[g]; } @@ -136,6 +136,8 @@ class PNormOp : public Operation { void RegisterPNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "PNorm", PNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "PNorm", PNormOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 2d51c1c4c64eb1a2274c2c6fd44d1965a66242c5..e74232a32d900f3a891f9764fe0e86829d94bfdd 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -60,8 +60,8 @@ class PoolingOpBase : public ConvPool2dOpBase { template class PoolingOp; -template<> -class PoolingOp : public PoolingOpBase { +template +class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) : PoolingOpBase(context) {} @@ -93,8 +93,8 @@ class PoolingOp : public PoolingOpBase { Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard output_guard(output_tensor); - const float *input = input_tensor->data(); - float *output = output_tensor->mutable_data(); + const T *input = input_tensor->data(); + T *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; @@ -127,14 +127,14 @@ class PoolingOp : public PoolingOpBase { private: void MaxPooling(const OpContext *context, - const float *input, + const T *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, const int *stride_hw, const int *dilation_hw, const int *pad_hw, - float *output) { + T *output) { const index_t batch = out_shape[0]; const index_t out_channels = out_shape[1]; const index_t out_height = out_shape[2]; @@ -184,14 +184,14 @@ class PoolingOp : public PoolingOpBase { } void AvgPooling(const OpContext *context, - const float *input, + const T *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, const int *stride_hw, const int *dilation_hw, const int *pad_hw, - float *output) { + T *output) { const index_t batch = out_shape[0]; const index_t out_channels = out_shape[1]; const index_t out_height = out_shape[2]; @@ -514,6 +514,8 @@ class PoolingOp : public PoolingOpBase { void RegisterPooling(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, diff --git a/mace/ops/prior_box.cc b/mace/ops/prior_box.cc index 3598c98a8b98d882d82f89c9b1fc8063b3258a56..5f4e4f953a4cb1c0a625e8073cea467d6f03bf11 100644 --- a/mace/ops/prior_box.cc +++ b/mace/ops/prior_box.cc @@ -148,6 +148,8 @@ class PriorBoxOp : public Operation { void RegisterPriorBox(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "PriorBox", PriorBoxOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "PriorBox", PriorBoxOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index a81a602d9be90b2eece8f2ca96f93609b1317b78..a9b58633498355f21d93292f32e27be54ecb62ec 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -145,7 +145,7 @@ class ReduceOp : public ReduceOpBase { MACE_UNUSED(context); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { - T tmp = 0; + T tmp = 0.f; for (int i = 0; i < data_reshape_[0]; ++i) { tmp = tmp + input[i]; } @@ -169,7 +169,7 @@ class ReduceOp : public ReduceOpBase { } output[0] = tmp; } else if (type == ReduceType::SUM) { - T tmp = 0; + T tmp = 0.f; for (int i = 0; i < data_reshape_[0]; ++i) { tmp = tmp + input[i]; } @@ -193,7 +193,7 @@ class ReduceOp : public ReduceOpBase { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { if (type == ReduceType::MEAN) { for (index_t i = start; i < end; i += step) { - T tmp = 0; + T tmp = 0.f; for (int j = 0; j < data_reshape_[0]; ++j) { tmp += input[j * data_reshape_[1] + i]; } @@ -225,7 +225,7 @@ class ReduceOp : public ReduceOpBase { } } else if (type == ReduceType::SUM) { for (index_t i = start; i < end; i += step) { - T tmp = 0; + T tmp = 0.f; for (int j = 0; j < data_reshape_[0]; ++j) { tmp += input[j * data_reshape_[1] + i]; } @@ -239,7 +239,7 @@ class ReduceOp : public ReduceOpBase { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { if (type == ReduceType::MEAN) { for (index_t i = start; i < end; i += step) { - T tmp = 0; + T tmp = 0.f; for (int j = 0; j < data_reshape_[1]; ++j) { tmp += input[i * data_reshape_[1] + j]; } @@ -271,7 +271,7 @@ class ReduceOp : public ReduceOpBase { } } else if (type == ReduceType::SUM) { for (index_t i = start; i < end; i += step) { - T tmp = 0; + T tmp = 0.f; for (int j = 0; j < data_reshape_[1]; ++j) { tmp += input[i * data_reshape_[1] + j]; } @@ -335,9 +335,7 @@ class ReduceOp : public ReduceOpBase { T tmp = 1; for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[0]; ++k) { - tmp *= - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]; + tmp *= input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; } } output[i] = tmp; @@ -1036,6 +1034,8 @@ class ReduceOp : public ReduceOpBase { void RegisterReduce(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Reduce", ReduceOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, DeviceType::CPU, int); #ifdef MACE_ENABLE_QUANTIZE diff --git a/mace/ops/ref/activation.cc b/mace/ops/ref/activation.cc index da2ff26fabd940d0a5e1822df2d37486344cfcd7..bb79853108c27ae233fce225ac2bd3172b97ae4d 100644 --- a/mace/ops/ref/activation.cc +++ b/mace/ops/ref/activation.cc @@ -20,6 +20,7 @@ namespace mace { namespace ops { namespace ref { +template class Activation : public delegator::Activation { public: explicit Activation(const delegator::ActivationParam ¶m) @@ -34,9 +35,10 @@ class Activation : public delegator::Activation { Tensor *output); }; -MaceStatus Activation::Compute(const OpContext *context, - const Tensor *input, - Tensor *output) { +template +MaceStatus Activation::Compute(const OpContext *context, + const Tensor *input, + Tensor *output) { Tensor::MappingGuard input_guard(input); if (input != output) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); @@ -49,12 +51,13 @@ MaceStatus Activation::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -void Activation::DoActivation(const OpContext *context, - const Tensor *input, - Tensor *output) { +template +void Activation::DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output) { MACE_UNUSED(context); - auto input_ptr = input->data(); - auto output_ptr = output->mutable_data(); + auto input_ptr = input->data(); + auto output_ptr = output->mutable_data(); const index_t size = input->size(); switch (type_) { @@ -77,7 +80,7 @@ void Activation::DoActivation(const OpContext *context, case LEAKYRELU: { for (index_t i = 0; i < size; ++i) { *output_ptr = - std::max(*input_ptr, 0.f) + std::max(*input_ptr, 0.f) + std::min(*input_ptr, 0.f) * leakyrelu_coefficient_; ++input_ptr; ++output_ptr; @@ -107,8 +110,14 @@ void Activation::DoActivation(const OpContext *context, } } -MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam, - MACE_DELEGATOR_KEY(Activation, CPU, float, REF)) +void RegisterActivationDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, Activation, delegator::ActivationParam, + MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc index 221c2d2e9cc9b00f6157bdedaa276db36fc4dba3..bab1199e10152340b88d0e20183cffe0bfab20cc 100644 --- a/mace/ops/ref/bias_add.cc +++ b/mace/ops/ref/bias_add.cc @@ -18,6 +18,7 @@ namespace mace { namespace ops { namespace ref { +template class BiasAdd : public delegator::BiasAdd { public: explicit BiasAdd(const DelegatorParam ¶m) : delegator::BiasAdd(param) {} @@ -31,10 +32,11 @@ class BiasAdd : public delegator::BiasAdd { const Tensor *bias, Tensor *output); }; -MaceStatus BiasAdd::Compute(const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output) { +template +MaceStatus BiasAdd::Compute(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { Tensor::MappingGuard input_guard(input); Tensor::MappingGuard bias_guard(bias); if (input != output) { @@ -54,14 +56,15 @@ MaceStatus BiasAdd::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -void BiasAdd::AddBias(const OpContext *context, - const Tensor *input, - const Tensor *bias, - mace::Tensor *output) { +template +void BiasAdd::AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + mace::Tensor *output) { MACE_UNUSED(context); - auto input_data = input->data(); - auto bias_data = bias->data(); - auto output_data = output->mutable_data(); + auto input_data = input->data(); + auto bias_data = bias->data(); + auto output_data = output->mutable_data(); const index_t batch = input->dim(0); const index_t channels = input->dim(1); @@ -84,8 +87,14 @@ void BiasAdd::AddBias(const OpContext *context, } } -MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam, - MACE_DELEGATOR_KEY(BiasAdd, CPU, float, REF)) +void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, BiasAdd, DelegatorParam, + MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index d90b7e2bcddb4f2bb8e5997637e4f189eb3c2ba7..a97ddb1fa014542282304ccf374e281671a3df9b 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -12,19 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. - -#include "mace/ops/ref/conv_2d.h" - #include +#include "mace/ops/delegator/conv_2d.h" + namespace mace { namespace ops { namespace ref { -MaceStatus Conv2d::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { +template +class Conv2d : public delegator::Conv2d { + public: + explicit Conv2d(const delegator::Conv2dParam ¶m) + : delegator::Conv2d(param) {} + ~Conv2d() {} + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) override; +}; + +template +MaceStatus Conv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { MACE_UNUSED(context); const std::vector in_shape = input->shape(); @@ -62,9 +75,9 @@ MaceStatus Conv2d::Compute(const OpContext *context, Tensor::MappingGuard input_guard(input); Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard output_guard(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto output_data = output->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto output_data = output->mutable_data(); for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < filter_shape[0]; ++m) { @@ -74,7 +87,7 @@ MaceStatus Conv2d::Compute(const OpContext *context, const index_t out_width = out_shape[3]; const index_t in_channels = filter_shape[1]; - float *out_ptr_base = + T *out_ptr_base = output_data + b * out_batch_size + m * out_image_size; for (index_t h = 0; h < out_height; ++h) { @@ -82,9 +95,9 @@ MaceStatus Conv2d::Compute(const OpContext *context, float sum = 0; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = + const T *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = + const T *filter_ptr = filter_data + m * in_channels * filter_size + c * filter_size; for (index_t kh = 0; kh < filter_shape[2]; ++kh) { @@ -94,7 +107,9 @@ MaceStatus Conv2d::Compute(const OpContext *context, const index_t iw = -pad_left + w * strides_[1] + kw * dilations_[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw]; + float input_value = in_ptr_base[ih * in_width + iw]; + float filter_value = filter_ptr[kw]; + sum += input_value * filter_value; } } // kw filter_ptr += filter_shape[3]; @@ -109,9 +124,14 @@ MaceStatus Conv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -typedef Conv2d Conv2dRef; -MACE_REGISTER_DELEGATOR(registry, Conv2dRef, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, REF, General)) +void RegisterConv2dDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2d, delegator::Conv2dParam, + MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, Conv2d, delegator::Conv2dParam, + MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h deleted file mode 100644 index b241a58a179af6c485dc9ed916bb4f1c3dfae401..0000000000000000000000000000000000000000 --- a/mace/ops/ref/conv_2d.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef MACE_OPS_REF_CONV_2D_H_ -#define MACE_OPS_REF_CONV_2D_H_ - -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/delegator/conv_2d.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace ref { - -template -class Conv2d : public delegator::Conv2d { - public: - explicit Conv2d(const delegator::Conv2dParam ¶m) - : delegator::Conv2d(param) {} - ~Conv2d() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -template<> -class Conv2d : public delegator::Conv2d { - public: - explicit Conv2d(const delegator::Conv2dParam ¶m) - : delegator::Conv2d(param) {} - ~Conv2d() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_CONV_2D_H_ - diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc index d19a96d273cb99096d3d0bf4877d558b4edff780..bf76824f7bcc159106e22fc2c0c6da9b005e70f4 100644 --- a/mace/ops/ref/deconv_2d.cc +++ b/mace/ops/ref/deconv_2d.cc @@ -16,18 +16,36 @@ #include #include #include -#include "mace/ops/ref/deconv_2d.h" + +#include "mace/ops/delegator/deconv_2d.h" #include "mace/utils/memory.h" namespace mace { namespace ops { namespace ref { -MaceStatus Deconv2d::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +class Deconv2d : public delegator::Deconv2d { + public: + explicit Deconv2d(const delegator::Deconv2dParam ¶m) + : delegator::Deconv2d(param) {} + + ~Deconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +template +MaceStatus Deconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { MACE_UNUSED(context); std::vector out_shape; @@ -65,15 +83,14 @@ MaceStatus Deconv2d::Compute(const OpContext *context, std::accumulate(padded_out_shape.begin(), padded_out_shape.end(), 1, - std::multiplies()) * sizeof(float); + std::multiplies()) * sizeof(T); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); index_t scratch_size = PadAlignSize(padded_out_size); scratch->GrowSize(scratch_size); - std::unique_ptr - padded_out - (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + std::unique_ptr padded_out(make_unique( + scratch->Scratch(scratch_size), DataTypeToEnum::v())); padded_out->Reshape(padded_out_shape); padded_output = std::move(padded_out); } @@ -88,10 +105,10 @@ MaceStatus Deconv2d::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto pad_out_data = out_tensor->mutable_data(); - auto out_data = output->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); auto &in_shape = input->shape(); @@ -122,7 +139,7 @@ MaceStatus Deconv2d::Compute(const OpContext *context, for (index_t b = 0; b < batch; ++b) { for (index_t oc = 0; oc < out_channels; ++oc) { - float *out_base = + T *out_base = pad_out_data + (b * out_channels + oc) * out_img_size; for (index_t i = 0; i < in_height; ++i) { for (index_t j = 0; j < in_width; ++j) { @@ -148,13 +165,13 @@ MaceStatus Deconv2d::Compute(const OpContext *context, for (index_t i = 0; i < batch; ++i) { for (index_t j = 0; j < out_channels; ++j) { for (index_t k = 0; k < out_height; ++k) { - const float *input_base = + const T *input_base = pad_out_data + ((i * out_channels + j) * pad_out_height + (k + pad_top)) * pad_out_width; - float *output_base = + T *output_base = out_data + ((i * out_channels + j) * out_height + k) * out_width; - memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + memcpy(output_base, input_base + pad_left, out_width * sizeof(T)); } } } @@ -162,10 +179,14 @@ MaceStatus Deconv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -typedef Deconv2d Deconv2dRef; -MACE_REGISTER_DELEGATOR( - registry, Deconv2dRef, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, REF, General)) +void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2d, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, Deconv2d, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/deconv_2d.h b/mace/ops/ref/deconv_2d.h deleted file mode 100644 index 564ce7e7afdac1412ef2ddce8a20e2286ab7b3b0..0000000000000000000000000000000000000000 --- a/mace/ops/ref/deconv_2d.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef MACE_OPS_REF_DECONV_2D_H_ -#define MACE_OPS_REF_DECONV_2D_H_ - -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/delegator/deconv_2d.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace ref { - -template -class Deconv2d : public delegator::Deconv2d { - public: - explicit Deconv2d(const delegator::Deconv2dParam ¶m) - : delegator::Deconv2d(param) {} - - ~Deconv2d() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -template<> -class Deconv2d : public delegator::Deconv2d { - public: - explicit Deconv2d(const delegator::Deconv2dParam ¶m) - : delegator::Deconv2d(param) {} - - ~Deconv2d() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_DECONV_2D_H_ - diff --git a/mace/ops/ref/depthwise_conv_2d.cc b/mace/ops/ref/depthwise_conv_2d.cc index 03be506ce1e7ea36cb6a763db83c4f50bb0f1e0b..19aa73efd60b62f6d58bfdb6fa3b5f8c5042322f 100644 --- a/mace/ops/ref/depthwise_conv_2d.cc +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -12,19 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. - -#include "mace/ops/ref/depthwise_conv_2d.h" - #include +#include "mace/ops/delegator/depthwise_conv_2d.h" + namespace mace { namespace ops { namespace ref { -MaceStatus DepthwiseConv2d::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { +template +class DepthwiseConv2d : public delegator::DepthwiseConv2d { + public: + explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam ¶m) + : delegator::DepthwiseConv2d(param) {} + ~DepthwiseConv2d() {} + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) override; +}; + +template +MaceStatus DepthwiseConv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { MACE_UNUSED(context); const std::vector in_shape = input->shape(); @@ -65,9 +78,9 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, Tensor::MappingGuard input_guard(input); Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard output_guard(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto output_data = output->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto output_data = output->mutable_data(); for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < out_shape[1]; ++m) { @@ -80,16 +93,16 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, const index_t out_width = out_shape[3]; const index_t in_channels = in_shape[1]; - float *out_ptr_base = + T *out_ptr_base = output_data + b * out_batch_size + m * out_image_size; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { float sum = 0; - const float *in_ptr_base = + const T *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = + const T *filter_ptr = filter_data + multi_index * in_channels * filter_size + c * filter_size; @@ -115,10 +128,16 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -typedef DepthwiseConv2d DepthwiseConv2dRef; -MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dRef, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, REF, General)) +void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2d, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, DepthwiseConv2d, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU, + BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/depthwise_conv_2d.h b/mace/ops/ref/depthwise_conv_2d.h deleted file mode 100644 index cc5a14ca433b62e9e50973e511551beab5dd5160..0000000000000000000000000000000000000000 --- a/mace/ops/ref/depthwise_conv_2d.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef MACE_OPS_REF_DEPTHWISE_CONV_2D_H_ -#define MACE_OPS_REF_DEPTHWISE_CONV_2D_H_ - -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/delegator/depthwise_conv_2d.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace ref { - -template -class DepthwiseConv2d : public delegator::DepthwiseConv2d { - public: - explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam ¶m) - : delegator::DepthwiseConv2d(param) {} - ~DepthwiseConv2d() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -template<> -class DepthwiseConv2d : public delegator::DepthwiseConv2d { - public: - explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam ¶m) - : delegator::DepthwiseConv2d(param) {} - ~DepthwiseConv2d() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_DEPTHWISE_CONV_2D_H_ - diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc index badded160c49037dc0496a7cccaefe037459a8f0..5439f85b124376267334682145763c1dc5a31600 100644 --- a/mace/ops/ref/depthwise_deconv_2d.cc +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -15,18 +15,52 @@ #include #include #include -#include "mace/ops/ref/depthwise_deconv_2d.h" + +#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/utils/memory.h" namespace mace { namespace ops { namespace ref { -MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +class GroupDeconv2d : public delegator::GroupDeconv2d { + public: + explicit GroupDeconv2d(const delegator::GroupDeconv2dParam ¶m) + : delegator::GroupDeconv2d(param) {} + + virtual ~GroupDeconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +template +class DepthwiseDeconv2d : public GroupDeconv2d { + public: + explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam ¶m) + : GroupDeconv2d(param) {} + + ~DepthwiseDeconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +template +MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { MACE_UNUSED(context); std::vector out_shape; @@ -41,15 +75,15 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, std::vector out_pad_size; CalDeconvOutputShapeAndPadSize(input->shape(), filter->shape(), - strides_, - padding_type_, - paddings_, + GroupDeconv2d::strides_, + GroupDeconv2d::padding_type_, + GroupDeconv2d::paddings_, input->dim(1), &out_shape, nullptr, &out_pad_size, &padded_out_shape, - framework_type_, + GroupDeconv2d::framework_type_, DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); @@ -64,15 +98,14 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, std::accumulate(padded_out_shape.begin(), padded_out_shape.end(), 1, - std::multiplies()) * sizeof(float); + std::multiplies()) * sizeof(T); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); index_t scratch_size = PadAlignSize(padded_out_size); scratch->GrowSize(scratch_size); - std::unique_ptr - padded_out - (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + std::unique_ptr padded_out(make_unique( + scratch->Scratch(scratch_size), DataTypeToEnum::v())); padded_out->Reshape(padded_out_shape); padded_output = std::move(padded_out); } @@ -87,10 +120,10 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto pad_out_data = out_tensor->mutable_data(); - auto out_data = output->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); auto &in_shape = input->shape(); @@ -119,15 +152,15 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { - float *out_base = + T *out_base = pad_out_data + (b * channels + c) * out_img_size; for (index_t i = 0; i < in_height; ++i) { for (index_t j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides_[0] * pad_out_width + j * strides_[1]; + const index_t out_offset = i * GroupDeconv2d::strides_[0] * + pad_out_width + j * GroupDeconv2d::strides_[1]; const index_t input_idx = (b * channels + c) * in_img_size + i * in_width + j; - const float val = input_data[input_idx]; + const T val = input_data[input_idx]; const index_t kernel_offset = c * kernel_size; for (int k = 0; k < kernel_size; ++k) { const index_t out_idx = out_offset + index_map[k]; @@ -143,13 +176,13 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, for (index_t i = 0; i < batch; ++i) { for (index_t j = 0; j < channels; ++j) { for (index_t k = 0; k < out_height; ++k) { - const float *input_base = + const T *input_base = pad_out_data + ((i * channels + j) * pad_out_height + (k + pad_top)) * pad_out_width; - float *output_base = + T *output_base = out_data + ((i * channels + j) * out_height + k) * out_width; - memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + memcpy(output_base, input_base + pad_left, out_width * sizeof(T)); } } } @@ -157,11 +190,12 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2d::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +MaceStatus GroupDeconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { MACE_UNUSED(context); std::vector out_shape; @@ -199,15 +233,14 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, std::accumulate(padded_out_shape.begin(), padded_out_shape.end(), 1, - std::multiplies()) * sizeof(float); + std::multiplies()) * sizeof(T); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); index_t scratch_size = PadAlignSize(padded_out_size); scratch->GrowSize(scratch_size); - std::unique_ptr - padded_out - (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + std::unique_ptr padded_out(make_unique( + scratch->Scratch(scratch_size), DataTypeToEnum::v())); padded_out->Reshape(padded_out_shape); padded_output = std::move(padded_out); } @@ -222,10 +255,10 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto pad_out_data = out_tensor->mutable_data(); - auto out_data = output->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); auto &in_shape = input->shape(); @@ -288,13 +321,13 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, for (int i = 0; i < batch; ++i) { for (int j = 0; j < out_channels; ++j) { for (int k = 0; k < out_height; ++k) { - const float *input_base = + const T *input_base = pad_out_data + ((i * out_channels + j) * pad_out_height + (k + pad_top)) * pad_out_width; - float *output_base = + T *output_base = out_data + ((i * out_channels + j) * out_height + k) * out_width; - memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + memcpy(output_base, input_base + pad_left, out_width * sizeof(T)); } } } @@ -302,10 +335,16 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -typedef DepthwiseDeconv2d DepthwiseDeconv2dRef; -MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dRef, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, REF, General)) +void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2d, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, DepthwiseDeconv2d, delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, + BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/depthwise_deconv_2d.h b/mace/ops/ref/depthwise_deconv_2d.h deleted file mode 100644 index 586f2627838c30bcb366a850f5b230af980cafca..0000000000000000000000000000000000000000 --- a/mace/ops/ref/depthwise_deconv_2d.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ -#define MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ - -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/delegator/depthwise_deconv_2d.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace ref { - -template -class GroupDeconv2d : public delegator::GroupDeconv2d { - public: - explicit GroupDeconv2d(const delegator::GroupDeconv2dParam ¶m) - : delegator::GroupDeconv2d(param) {} - - virtual ~GroupDeconv2d() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -template -class DepthwiseDeconv2d : public GroupDeconv2d { - public: - explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2d ¶m) - : GroupDeconv2d(param) {} - - ~DepthwiseDeconv2d() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -template<> -class GroupDeconv2d : public delegator::GroupDeconv2d { - public: - explicit GroupDeconv2d(const delegator::GroupDeconv2dParam ¶m) - : delegator::GroupDeconv2d(param) {} - - virtual ~GroupDeconv2d() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -template<> -class DepthwiseDeconv2d : public GroupDeconv2d { - public: - explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam ¶m) - : GroupDeconv2d(param) {} - - ~DepthwiseDeconv2d() = default; - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ - diff --git a/mace/ops/ref/gemm.cc b/mace/ops/ref/gemm.cc index 956a7affbf22904b2ab6a023c5ed2756660fe765..21a939a0e7cd53b56f9b58c50db9af0c0aa79615 100644 --- a/mace/ops/ref/gemm.cc +++ b/mace/ops/ref/gemm.cc @@ -12,56 +12,93 @@ // See the License for the specific language governing permissions and // limitations under the License. - -#include "mace/ops/ref/gemm.h" +#include "mace/ops/delegator/gemm.h" namespace mace { namespace ops { namespace ref { -MaceStatus Gemm::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t rows, - const index_t cols, - const index_t depth, - const MatrixMajor lhs_major, - const MatrixMajor rhs_major, - const MatrixMajor output_major, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { +template +class Gemm : public delegator::Gemm { + public: + explicit Gemm(const delegator::GemmParam ¶m) : delegator::Gemm(param) {} + ~Gemm() {} + MaceStatus Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) override; + // Original matrix before transpose has row-major + MaceStatus Compute( + const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t lhs_rows, + const index_t lhs_cols, + const index_t rhs_rows, + const index_t rhs_cols, + const bool transpose_lhs, + const bool transpose_rhs, + const bool transpose_out, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) override; +}; + +template +MaceStatus Gemm::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { MACE_UNUSED(context); Tensor::MappingGuard lhs_guard(lhs); Tensor::MappingGuard rhs_guard(rhs); Tensor::MappingGuard output_guard(output); - const float *lhs_data = lhs->data(); - const float *rhs_data = rhs->data(); - float *output_data = output->mutable_data(); + const T *lhs_data = lhs->data(); + const T *rhs_data = rhs->data(); + T *output_data = output->mutable_data(); for (index_t b = 0; b < batch; ++b) { - MatrixMap + MatrixMap lhs_matrix (lhs_data + static_cast(lhs_batched) * b * rows * depth, lhs_major, rows, depth); - MatrixMap + MatrixMap rhs_matrix (rhs_data + static_cast(rhs_batched) * b * depth * cols, rhs_major, depth, cols); - MatrixMap + MatrixMap output_matrix(output_data + b * rows * cols, output_major, rows, cols); for (index_t r = 0; r < rows; ++r) { for (index_t c = 0; c < cols; ++c) { float sum = 0; for (index_t d = 0; d < depth; ++d) { - sum += lhs_matrix(r, d) * rhs_matrix(d, c); + sum += static_cast(lhs_matrix(r, d)) * + static_cast(rhs_matrix(d, c)); } // d *output_matrix.data(r, c) = sum; @@ -72,20 +109,21 @@ MaceStatus Gemm::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MaceStatus Gemm::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t lhs_rows, - const index_t lhs_cols, - const index_t rhs_rows, - const index_t rhs_cols, - const bool transpose_lhs, - const bool transpose_rhs, - const bool transpose_out, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { +template +MaceStatus Gemm::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t lhs_rows, + const index_t lhs_cols, + const index_t rhs_rows, + const index_t rhs_cols, + const bool transpose_lhs, + const bool transpose_rhs, + const bool transpose_out, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { index_t rows = transpose_lhs ? lhs_cols : lhs_rows; index_t depth = transpose_lhs ? lhs_rows : lhs_cols; index_t cols = transpose_rhs ? rhs_rows : rhs_cols; @@ -96,24 +134,29 @@ MaceStatus Gemm::Compute(const OpContext *context, " vs. ", depth2); - return Compute(context, - lhs, - rhs, - batch, - rows, - cols, - depth, - transpose_lhs ? ColMajor : RowMajor, - transpose_rhs ? ColMajor : RowMajor, - transpose_out ? ColMajor : RowMajor, - lhs_batched, - rhs_batched, - output); + return Gemm::Compute(context, + lhs, + rhs, + batch, + rows, + cols, + depth, + transpose_lhs ? ColMajor : RowMajor, + transpose_rhs ? ColMajor : RowMajor, + transpose_out ? ColMajor : RowMajor, + lhs_batched, + rhs_batched, + output); } -typedef Gemm GemmRef; -MACE_REGISTER_DELEGATOR(registry, GemmRef, delegator::GemmParam, - MACE_DELEGATOR_KEY(Gemm, CPU, float, REF)) +void RegisterGemmDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::REF)); +} } // namespace ref } // namespace ops diff --git a/mace/ops/ref/gemm.h b/mace/ops/ref/gemm.h deleted file mode 100644 index b7b63fba856d862542f1afe4315990933c3271d2..0000000000000000000000000000000000000000 --- a/mace/ops/ref/gemm.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef MACE_OPS_REF_GEMM_H_ -#define MACE_OPS_REF_GEMM_H_ - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/common/matrix.h" -#include "mace/ops/delegator/gemm.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace ref { - -template -class Gemm : public delegator::Gemm { - public: - explicit Gemm(const delegator::GemmParam ¶m) : delegator::Gemm(param) {} - ~Gemm() {} - MaceStatus Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t rows, - const index_t cols, - const index_t depth, - const MatrixMajor lhs_major, - const MatrixMajor rhs_major, - const MatrixMajor output_major, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; -}; - -template<> -class Gemm : public delegator::Gemm { - public: - explicit Gemm(const delegator::GemmParam ¶m) : delegator::Gemm(param) {} - ~Gemm() {} - MaceStatus Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t rows, - const index_t cols, - const index_t depth, - const MatrixMajor lhs_major, - const MatrixMajor rhs_major, - const MatrixMajor output_major, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; - // Original matrix before transpose has row-major - MaceStatus Compute( - const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t lhs_rows, - const index_t lhs_cols, - const index_t rhs_rows, - const index_t rhs_cols, - const bool transpose_lhs, - const bool transpose_rhs, - const bool transpose_out, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; -}; - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_GEMM_H_ - diff --git a/mace/ops/ref/gemv.cc b/mace/ops/ref/gemv.cc index 350412c2f548b67d737bcffc924c36582866d05f..05b04f3ce986918bea2ed1cc2702b54924990422 100644 --- a/mace/ops/ref/gemv.cc +++ b/mace/ops/ref/gemv.cc @@ -13,7 +13,7 @@ // limitations under the License. -#include "mace/ops/ref/gemv.h" +#include "mace/ops/delegator/gemv.h" #if defined(MACE_ENABLE_QUANTIZE) #include "mace/core/quantize.h" @@ -23,7 +23,27 @@ namespace mace { namespace ops { namespace ref { -MaceStatus Gemv::Compute(const OpContext *context, +template +class Gemv : public delegator::Gemv { + public: + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} + ~Gemv() {} + // Always row-major after transpose + MaceStatus Compute( + const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) override; +}; + +template +MaceStatus Gemv::Compute(const OpContext *context, const Tensor *lhs, const Tensor *rhs, const Tensor *bias, @@ -39,18 +59,18 @@ MaceStatus Gemv::Compute(const OpContext *context, Tensor::MappingGuard rhs_guard(rhs); Tensor::MappingGuard bias_guard(bias); Tensor::MappingGuard output_guard(output); - const float *lhs_data = lhs->data(); - const float *rhs_data = rhs->data(); - const float *bias_data = nullptr; + const T *lhs_data = lhs->data(); + const T *rhs_data = rhs->data(); + const T *bias_data = nullptr; if (bias) { - bias_data = bias->data(); + bias_data = bias->data(); } - float *output_data = output->mutable_data(); + T *output_data = output->mutable_data(); for (index_t b = 0; b < batch; ++b) { for (index_t h = 0; h < lhs_height; ++h) { - float sum = bias ? bias_data[h] : 0; + float sum = bias ? static_cast(bias_data[h]) : 0.f; for (index_t w = 0; w < lhs_width; ++w) { sum += lhs_data[ static_cast(lhs_batched) * b * lhs_height * lhs_width @@ -65,110 +85,15 @@ MaceStatus Gemv::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -#if defined(MACE_ENABLE_QUANTIZE) -MaceStatus Gemv::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { - MACE_UNUSED(context); - - Tensor::MappingGuard lhs_guard(lhs); - Tensor::MappingGuard rhs_guard(rhs); - Tensor::MappingGuard bias_guard(bias); - Tensor::MappingGuard output_guard(output); - const uint8_t *lhs_data = lhs->data(); - const uint8_t *rhs_data = rhs->data(); - const int32_t *bias_data = nullptr; - if (bias) { - bias_data = bias->data(); - } - - uint8_t *output_data = output->mutable_data(); - - MACE_CHECK(output->scale() > 0, "output scale must not be zero"); - const float - output_multiplier_float = lhs->scale() * rhs->scale() / output->scale(); - int32_t lhs_zero = lhs->zero_point(); - int32_t rhs_zero = rhs->zero_point(); - - for (index_t b = 0; b < batch; ++b) { - for (index_t h = 0; h < lhs_height; ++h) { - int32_t sum = bias ? bias_data[h] : 0; - for (index_t w = 0; w < lhs_width; ++w) { - sum += (lhs_data[ - static_cast(lhs_batched) * b * lhs_height * lhs_width - + h * lhs_width + w] - lhs_zero) - * (rhs_data[static_cast(rhs_batched) * b * lhs_width + w] - - rhs_zero); - } // w - - output_data[b * lhs_height + h] = - Saturate(std::roundf(sum * output_multiplier_float)); - } // h - } // b - return MaceStatus::MACE_SUCCESS; +void RegisterGemvDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::REF)); + MACE_REGISTER_BF16_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, BFloat16, ImplType::REF)); } -MaceStatus Gemv::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { - MACE_UNUSED(context); - - Tensor::MappingGuard lhs_guard(lhs); - Tensor::MappingGuard rhs_guard(rhs); - Tensor::MappingGuard bias_guard(bias); - Tensor::MappingGuard output_guard(output); - const uint8_t *lhs_data = lhs->data(); - const uint8_t *rhs_data = rhs->data(); - const int32_t *bias_data = nullptr; - if (bias) { - bias_data = bias->data(); - } - - int32_t *output_data = output->mutable_data(); - - int32_t lhs_zero = lhs->zero_point(); - int32_t rhs_zero = rhs->zero_point(); - - for (index_t b = 0; b < batch; ++b) { - for (index_t h = 0; h < lhs_height; ++h) { - int32_t sum = bias ? bias_data[h] : 0; - for (index_t w = 0; w < lhs_width; ++w) { - sum += (lhs_data[ - static_cast(lhs_batched) * b * lhs_height * lhs_width - + h * lhs_width + w] - lhs_zero) - * (rhs_data[static_cast(rhs_batched) * b * lhs_width + w] - - rhs_zero); - } // w - - output_data[b * lhs_height + h] = sum; - } // h - } // b - return MaceStatus::MACE_SUCCESS; -} - -typedef Gemv GemvUint8Ref; -MACE_REGISTER_DELEGATOR(registry, GemvUint8Ref, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, Ref)) -#endif // MACE_ENABLE_QUANTIZE - -typedef Gemv GemvRef; -MACE_REGISTER_DELEGATOR(registry, GemvRef, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, CPU, float, REF)) - } // namespace ref } // namespace ops } // namespace mace diff --git a/mace/ops/ref/gemv.h b/mace/ops/ref/gemv.h deleted file mode 100644 index e14730bbd9556e0f14356c88e8276fcebd3ae5ec..0000000000000000000000000000000000000000 --- a/mace/ops/ref/gemv.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef MACE_OPS_REF_GEMV_H_ -#define MACE_OPS_REF_GEMV_H_ - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/delegator/gemv.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace ref { - -template -class Gemv : public delegator::Gemv { - public: - explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} - ~Gemv() {} - // Always row-major after transpose - MaceStatus Compute( - const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; -}; - -template<> -class Gemv : public delegator::Gemv { - public: - explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} - ~Gemv() {} - // Always row-major after transpose - MaceStatus Compute( - const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; -}; - -#if defined(MACE_ENABLE_QUANTIZE) -template<> -class Gemv : public delegator::Gemv { - public: - explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} - ~Gemv() {} - // Always row-major after transpose - MaceStatus Compute( - const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; -}; - -template<> -class Gemv : public delegator::Gemv { - public: - explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} - ~Gemv() {} - // Always row-major after transpose - MaceStatus Compute( - const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) override; -}; -#endif // MACE_ENABLE_QUANTIZE - -} // namespace ref -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REF_GEMV_H_ - diff --git a/mace/ops/ref/q8/eltwise.cc b/mace/ops/ref/q8/eltwise.cc index 220378e4e0e1fdf52d091abf4d974f92edb57eec..b34a62ea5b3f1763418a35a581ce80474ceb2f85 100644 --- a/mace/ops/ref/q8/eltwise.cc +++ b/mace/ops/ref/q8/eltwise.cc @@ -107,8 +107,11 @@ MaceStatus Eltwise::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam, - MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, REF)) +void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Eltwise, delegator::EltwiseParam, + MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::REF)); +} } // namespace q8 } // namespace ref diff --git a/mace/ops/ref/q8/gemv.cc b/mace/ops/ref/q8/gemv.cc new file mode 100644 index 0000000000000000000000000000000000000000..763f23fbf5ed490cd8cee4110f40046a9556a99b --- /dev/null +++ b/mace/ops/ref/q8/gemv.cc @@ -0,0 +1,186 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/quantize.h" +#include "mace/ops/delegator/gemv.h" + +namespace mace { +namespace ops { +namespace ref { +namespace q8 { + +template +class Gemv : public delegator::Gemv { + public: + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} + ~Gemv() {} + // Always row-major after transpose + MaceStatus Compute( + const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) override; +}; + +template<> +class Gemv : public delegator::Gemv { + public: + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} + ~Gemv() {} + // Always row-major after transpose + MaceStatus Compute( + const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) override; +}; + +template<> +class Gemv : public delegator::Gemv { + public: + explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} + ~Gemv() {} + // Always row-major after transpose + MaceStatus Compute( + const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) override; +}; + +MaceStatus Gemv::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { + MACE_UNUSED(context); + + Tensor::MappingGuard lhs_guard(lhs); + Tensor::MappingGuard rhs_guard(rhs); + Tensor::MappingGuard bias_guard(bias); + Tensor::MappingGuard output_guard(output); + const uint8_t *lhs_data = lhs->data(); + const uint8_t *rhs_data = rhs->data(); + const int32_t *bias_data = nullptr; + if (bias) { + bias_data = bias->data(); + } + + uint8_t *output_data = output->mutable_data(); + + MACE_CHECK(output->scale() > 0, "output scale must not be zero"); + const float + output_multiplier_float = lhs->scale() * rhs->scale() / output->scale(); + int32_t lhs_zero = lhs->zero_point(); + int32_t rhs_zero = rhs->zero_point(); + + for (index_t b = 0; b < batch; ++b) { + for (index_t h = 0; h < lhs_height; ++h) { + int32_t sum = bias ? bias_data[h] : 0; + for (index_t w = 0; w < lhs_width; ++w) { + sum += (lhs_data[ + static_cast(lhs_batched) * b * lhs_height * lhs_width + + h * lhs_width + w] - lhs_zero) + * (rhs_data[static_cast(rhs_batched) * b * lhs_width + w] + - rhs_zero); + } // w + + output_data[b * lhs_height + h] = + Saturate(std::roundf(sum * output_multiplier_float)); + } // h + } // b + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Gemv::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { + MACE_UNUSED(context); + + Tensor::MappingGuard lhs_guard(lhs); + Tensor::MappingGuard rhs_guard(rhs); + Tensor::MappingGuard bias_guard(bias); + Tensor::MappingGuard output_guard(output); + const uint8_t *lhs_data = lhs->data(); + const uint8_t *rhs_data = rhs->data(); + const int32_t *bias_data = nullptr; + if (bias) { + bias_data = bias->data(); + } + + int32_t *output_data = output->mutable_data(); + + int32_t lhs_zero = lhs->zero_point(); + int32_t rhs_zero = rhs->zero_point(); + + for (index_t b = 0; b < batch; ++b) { + for (index_t h = 0; h < lhs_height; ++h) { + int32_t sum = bias ? bias_data[h] : 0; + for (index_t w = 0; w < lhs_width; ++w) { + sum += (lhs_data[ + static_cast(lhs_batched) * b * lhs_height * lhs_width + + h * lhs_width + w] - lhs_zero) + * (rhs_data[static_cast(rhs_batched) * b * lhs_width + w] + - rhs_zero); + } // w + + output_data[b * lhs_height + h] = sum; + } // h + } // b + return MaceStatus::MACE_SUCCESS; +} + +void RegisterGemvDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::REF)); + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::REF)); +} + +} // namespace q8 +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/registry/op_delegators_registry.cc b/mace/ops/registry/op_delegators_registry.cc index a596878016b222f1606f39201d18b0a40653485f..4aac7282edae65c83211a50b16bfb641c18c7881 100644 --- a/mace/ops/registry/op_delegators_registry.cc +++ b/mace/ops/registry/op_delegators_registry.cc @@ -20,19 +20,18 @@ namespace ops { namespace ref { extern void RegisterActivationDelegator(OpDelegatorRegistry *registry); extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dRefDelegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dRefDelegator(OpDelegatorRegistry *registry); -extern void RegisterDepthwiseConv2dRefDelegator(OpDelegatorRegistry *registry); -extern void RegisterDepthwiseDeconv2dRefDelegator( - OpDelegatorRegistry *registry); -extern void RegisterGemmRefDelegator(OpDelegatorRegistry *registry); -extern void RegisterGemvRefDelegator(OpDelegatorRegistry *registry); +extern void RegisterConv2dDelegator(OpDelegatorRegistry *registry); +extern void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry); +extern void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry); +extern void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry); +extern void RegisterGemmDelegator(OpDelegatorRegistry *registry); +extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); #ifdef MACE_ENABLE_QUANTIZE namespace q8 { extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry); +extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); } // namespace q8 -extern void RegisterGemvUint8RefDelegator(OpDelegatorRegistry *registry); #endif // MACE_ENABLE_QUANTIZE } // namespace ref @@ -43,43 +42,26 @@ extern void RegisterActivationDelegator(OpDelegatorRegistry *registry); extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK1x7S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK7x1S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK1x15S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK15x1S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK3x3S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK3x3S2Delegator(OpDelegatorRegistry *registry); +extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry); +extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK5x5S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK7x7S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK7x7S2Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK7x7S3Delegator(OpDelegatorRegistry *registry); +extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry); +extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dK2x2S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dK2x2S2Delegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry); +extern void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry); +extern void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry); +extern void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry); extern void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry); -extern void RegisterDepthwiseConv2dK3x3S1Delegator( +extern void RegisterDepthwiseConv2dK3x3Delegator( OpDelegatorRegistry *registry); -extern void RegisterDepthwiseConv2dK3x3S2Delegator( +extern void RegisterDepthwiseDeconv2dK3x3Delegator( OpDelegatorRegistry *registry); -extern void RegisterDepthwiseDeconv2dK3x3S1Delegator( +extern void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry); +extern void RegisterDepthwiseDeconv2dK4x4Delegator( OpDelegatorRegistry *registry); -extern void RegisterDepthwiseDeconv2dK3x3S2Delegator( - OpDelegatorRegistry *registry); -extern void RegisterGroupDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterGroupDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry); -extern void RegisterDepthwiseDeconv2dK4x4S1Delegator( - OpDelegatorRegistry *registry); -extern void RegisterDepthwiseDeconv2dK4x4S2Delegator( - OpDelegatorRegistry *registry); -extern void RegisterGroupDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry); -extern void RegisterGroupDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry); +extern void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry); extern void RegisterDepthwiseDeconv2dGeneralDelegator( OpDelegatorRegistry *registry); extern void RegisterGroupDeconv2dGeneralDelegator( @@ -92,8 +74,7 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); #ifdef MACE_ENABLE_QUANTIZE namespace q8 { extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry); -extern void RegisterGemvUint8Delegator(OpDelegatorRegistry *registry); -extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry); +extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); } // namespace q8 #endif // MACE_ENABLE_QUANTIZE @@ -103,16 +84,16 @@ extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry); void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { ref::RegisterActivationDelegator(registry); ref::RegisterBiasAddDelegator(registry); - ref::RegisterConv2dRefDelegator(registry); - ref::RegisterDeconv2dRefDelegator(registry); - ref::RegisterDepthwiseConv2dRefDelegator(registry); - ref::RegisterDepthwiseDeconv2dRefDelegator(registry); - ref::RegisterGemmRefDelegator(registry); - ref::RegisterGemvRefDelegator(registry); + ref::RegisterConv2dDelegator(registry); + ref::RegisterDeconv2dDelegator(registry); + ref::RegisterDepthwiseConv2dDelegator(registry); + ref::RegisterDepthwiseDeconv2dDelegator(registry); + ref::RegisterGemmDelegator(registry); + ref::RegisterGemvDelegator(registry); #ifdef MACE_ENABLE_QUANTIZE ref::q8::RegisterEltwiseDelegator(registry); - ref::RegisterGemvUint8RefDelegator(registry); + ref::q8::RegisterGemvDelegator(registry); #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_NEON @@ -120,37 +101,23 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { arm::fp32::RegisterBiasAddDelegator(registry); arm::fp32::RegisterConv2dK1x1Delegator(registry); - arm::fp32::RegisterConv2dK1x7S1Delegator(registry); - arm::fp32::RegisterConv2dK7x1S1Delegator(registry); - arm::fp32::RegisterConv2dK1x15S1Delegator(registry); - arm::fp32::RegisterConv2dK15x1S1Delegator(registry); - arm::fp32::RegisterConv2dK3x3S1Delegator(registry); - arm::fp32::RegisterConv2dK3x3S2Delegator(registry); + arm::fp32::RegisterConv2dK1xNDelegator(registry); + arm::fp32::RegisterConv2dK3x3Delegator(registry); arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry); - arm::fp32::RegisterConv2dK5x5S1Delegator(registry); - arm::fp32::RegisterConv2dK7x7S1Delegator(registry); - arm::fp32::RegisterConv2dK7x7S2Delegator(registry); - arm::fp32::RegisterConv2dK7x7S3Delegator(registry); + arm::fp32::RegisterConv2dK5x5Delegator(registry); + arm::fp32::RegisterConv2dK7x7Delegator(registry); arm::fp32::RegisterConv2dGeneralDelegator(registry); - arm::fp32::RegisterDeconv2dK2x2S1Delegator(registry); - arm::fp32::RegisterDeconv2dK2x2S2Delegator(registry); - arm::fp32::RegisterDeconv2dK3x3S1Delegator(registry); - arm::fp32::RegisterDeconv2dK3x3S2Delegator(registry); - arm::fp32::RegisterDeconv2dK4x4S1Delegator(registry); - arm::fp32::RegisterDeconv2dK4x4S2Delegator(registry); + arm::fp32::RegisterDeconv2dK2x2Delegator(registry); + arm::fp32::RegisterDeconv2dK3x3Delegator(registry); + arm::fp32::RegisterDeconv2dK4x4Delegator(registry); arm::fp32::RegisterDeconv2dGeneralDelegator(registry); - arm::fp32::RegisterDepthwiseConv2dK3x3S1Delegator(registry); - arm::fp32::RegisterDepthwiseConv2dK3x3S2Delegator(registry); - arm::fp32::RegisterDepthwiseDeconv2dK3x3S1Delegator(registry); - arm::fp32::RegisterDepthwiseDeconv2dK3x3S2Delegator(registry); - arm::fp32::RegisterGroupDeconv2dK3x3S1Delegator(registry); - arm::fp32::RegisterGroupDeconv2dK3x3S2Delegator(registry); - arm::fp32::RegisterDepthwiseDeconv2dK4x4S1Delegator(registry); - arm::fp32::RegisterDepthwiseDeconv2dK4x4S2Delegator(registry); - arm::fp32::RegisterGroupDeconv2dK4x4S1Delegator(registry); - arm::fp32::RegisterGroupDeconv2dK4x4S2Delegator(registry); + arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry); + arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry); + arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry); + arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry); arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry); arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry); @@ -159,8 +126,7 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { #ifdef MACE_ENABLE_QUANTIZE arm::q8::RegisterEltwiseDelegator(registry); - arm::q8::RegisterGemvUint8Delegator(registry); - arm::q8::RegisterGemvInt32Delegator(registry); + arm::q8::RegisterGemvDelegator(registry); #endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_NEON diff --git a/mace/ops/replace_index.cc b/mace/ops/replace_index.cc index 8b2f76db8ad9b133530e010935343f7eadbc7bad..e35a0d44077871bd68c502aee83435461f1bd577 100644 --- a/mace/ops/replace_index.cc +++ b/mace/ops/replace_index.cc @@ -98,6 +98,8 @@ class ReplaceIndexOp : public Operation { void RegisterReplaceIndex(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ReplaceIndex", ReplaceIndexOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index 63c91c2e3ad0d4035844b4d18ea75f2e3285579d..6183179e04694b600ae079c313d5b5161c4f7108 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -152,6 +152,7 @@ class ReshapeOp : public Operation { void RegisterReshape(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, int32_t); MACE_REGISTER_GPU_OP(op_registry, "Reshape", ReshapeOp); MACE_REGISTER_OP_CONDITION( diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 2fa891d1bb39016a5da3aff565d27ab78296c357..1621da8e723eeaa297f0f78a6c6bf6036bca3cea 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -57,15 +57,7 @@ inline T ComputeLerp(const T top_left, const T bottom_left, const T bottom_right, const float x_lerp, - const float y_lerp); - -template<> -inline float ComputeLerp(const float top_left, - const float top_right, - const float bottom_left, - const float bottom_right, - const float x_lerp, - const float y_lerp) { + const float y_lerp) { const float top = top_left + (top_right - top_left) * x_lerp; const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; return top + (bottom - top) * y_lerp; @@ -370,6 +362,8 @@ class ResizeBilinearOp : public Operation { void RegisterResizeBilinear(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 201c4515878ec4872e45e8fb7cc6fb23b53cd43d..ef51b00ae01edc1fec8a7e93ede9c1a74642b271 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -176,6 +176,8 @@ class ResizeNearestNeighborOp : public Operation { void RegisterResizeNearestNeighbor(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ResizeNearestNeighbor", + ResizeNearestNeighborOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp); diff --git a/mace/ops/reverse.cc b/mace/ops/reverse.cc index af9670e34563ab506c15e4c2317091d9ad864e91..d49f14a146775b25d68414e2034fe28c8186f993 100644 --- a/mace/ops/reverse.cc +++ b/mace/ops/reverse.cc @@ -76,6 +76,8 @@ class ReverseOp : public Operation { void RegisterReverse(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Reverse", ReverseOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 1c2734205c0898e5216adeb0c7370ab73f773588..14725ece8f94cbfa51d73751dbe07e38fc3749fc 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -158,6 +158,8 @@ class ScalarMathOp : public Operation { void RegisterScalarMath(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, DeviceType::CPU, int32_t); } diff --git a/mace/ops/select.cc b/mace/ops/select.cc index 5001ba20140fa1634af972dc960776f979ea0753..b5816c4d9b2c7a7545ad2702da824707ef47b495 100644 --- a/mace/ops/select.cc +++ b/mace/ops/select.cc @@ -22,8 +22,8 @@ namespace ops { template class SelectOp; -template<> -class SelectOp : public Operation { +template +class SelectOp : public Operation { public: explicit SelectOp(OpConstructContext *context) : Operation(context) {} @@ -41,7 +41,7 @@ class SelectOp : public Operation { Tensor *output = this->Output(OUTPUT); const index_t condition_rank = condition->dim_size(); MACE_RETURN_IF_ERROR(output->Resize({condition->size(), condition_rank})); - float *output_data = output->mutable_data(); + T *output_data = output->mutable_data(); const bool *condition_data = condition->data(); index_t i = 0; @@ -161,10 +161,10 @@ class SelectOp : public Operation { Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->Resize(x->shape())); - float *output_data = output->mutable_data(); + T *output_data = output->mutable_data(); const bool *condition_data = condition->data(); - const float *x_data = x->data(); - const float *y_data = y->data(); + const T *x_data = x->data(); + const T *y_data = y->data(); const index_t condition_size = condition->size(); const index_t x_size = x->size(); @@ -182,7 +182,7 @@ class SelectOp : public Operation { MACE_ASSERT( block_size > 1 && x_size % condition_size == 0, "x_size should be a multiple of condition_size and greater than 1"); - const auto raw_block_size = block_size * sizeof(float); + const auto raw_block_size = block_size * sizeof(T); thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { for (index_t k = start; k < end; k += step) { auto offset = block_size * k; @@ -208,6 +208,8 @@ class SelectOp : public Operation { void RegisterSelect(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Select", SelectOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Select", SelectOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc index 0071ec258cb260145625505a5a835011e1e65461..f51ec015845420d9a48089b1d6649fc6b867a8fa 100644 --- a/mace/ops/shape.cc +++ b/mace/ops/shape.cc @@ -62,6 +62,8 @@ class ShapeOp : public Operation { void RegisterShape(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Shape", ShapeOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index 14e71cad6ceb951f0cc6c6d3ba95ef81dd0fcea2..ff793a274d16217e0eefb3e8814def0f7f67f60d 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -87,6 +87,8 @@ class SliceOp : public Operation { void RegisterSlice(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Slice", SliceOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Slice", SliceOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index bf7cf202c8cffe528bcae1a9064cca8e0d4d967b..c233cba936a9d8360d1a9c94f14e2b4e648e0009 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -39,8 +39,8 @@ namespace ops { template class SoftmaxOp; -template<> -class SoftmaxOp : public Operation { +template +class SoftmaxOp : public Operation { public: explicit SoftmaxOp(OpConstructContext *context) : Operation(context), @@ -71,9 +71,9 @@ class SoftmaxOp : public Operation { protected: MaceStatus RunForNCHW(OpContext *context) { const Tensor *input = this->Input(INPUT); - const float *input_data = input->data(); + const T *input_data = input->data(); Tensor *output = this->Output(OUTPUT); - float *output_data = output->mutable_data(); + T *output_data = output->mutable_data(); MACE_CHECK(input->dim_size() == 4, "The dim size of NCHW should be 4."); index_t hw_stride = input->dim(3); @@ -93,8 +93,8 @@ class SoftmaxOp : public Operation { for (index_t b_offset = 0; b_offset < batch_size; b_offset += batch_stride) { - const float *input_b_base = input_data + b_offset; - float *output_b_base = output_data + b_offset; + const T *input_b_base = input_data + b_offset; + T *output_b_base = output_data + b_offset; thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { const auto raw_step_size = step * sizeof(float); for (index_t k = start; k < end; k += step) { @@ -106,9 +106,9 @@ class SoftmaxOp : public Operation { for (index_t c_offset = 0; c_offset < class_size; c_offset += class_stride) { - const float *input_c_base = input_b_base + c_offset; + const T *input_c_base = input_b_base + c_offset; for (index_t k = start; k < end; k += step) { - const float *input_ptr = input_c_base + k; + const T *input_ptr = input_c_base + k; float *cache_k_ptr = cache_ptr + k; for (index_t i = 0; i < step; ++i) { cache_k_ptr[i] = std::max(cache_k_ptr[i], input_ptr[i]); @@ -118,14 +118,14 @@ class SoftmaxOp : public Operation { for (index_t c_offset = 0; c_offset < class_size; c_offset += class_stride) { - const float *input_c_base = input_b_base + c_offset; - float *output_c_base = output_b_base + c_offset; + const T *input_c_base = input_b_base + c_offset; + T *output_c_base = output_b_base + c_offset; for (index_t k = start; k < end; k += step) { - const float *input_ptr = input_c_base + k; - float *output_ptr = output_c_base + k; + const T *input_ptr = input_c_base + k; + T *output_ptr = output_c_base + k; float *cache_k_ptr = cache_ptr + k; for (index_t i = 0; i < step; ++i) { - output_ptr[i] = ::exp(input_ptr[i] - cache_k_ptr[i]); + output_ptr[i] = std::exp(input_ptr[i] - cache_k_ptr[i]); } } } @@ -136,24 +136,24 @@ class SoftmaxOp : public Operation { for (index_t c_offset = 0; c_offset < class_size; c_offset += class_stride) { - float *output_c_base = output_b_base + c_offset; + T *output_c_base = output_b_base + c_offset; for (index_t k = start; k < end; k += step) { - float *output_ptr = output_c_base + k; + T *output_ptr = output_c_base + k; float *cache_k_ptr = cache_ptr + k; for (index_t i = 0; i < step; ++i) { - cache_k_ptr[i] += output_ptr[i]; + cache_k_ptr[i] += static_cast(output_ptr[i]); } } } for (index_t c_offset = 0; c_offset < class_size; c_offset += class_stride) { - float *output_c_base = output_b_base + c_offset; + T *output_c_base = output_b_base + c_offset; for (index_t k = start; k < end; k += step) { - float *output_ptr = output_c_base + k; + T *output_ptr = output_c_base + k; float *cache_k_ptr = cache_ptr + k; for (index_t i = 0; i < step; ++i) { - output_ptr[i] = output_ptr[i] / cache_k_ptr[i]; + output_ptr[i] /= cache_k_ptr[i]; } } } @@ -161,9 +161,9 @@ class SoftmaxOp : public Operation { if (use_log_) { for (index_t c_offset = 0; c_offset < class_size; c_offset += class_stride) { - float *output_c_base = output_b_base + c_offset; + T *output_c_base = output_b_base + c_offset; for (index_t k = start; k < end; k += step) { - float *output_ptr = output_c_base + k; + T *output_ptr = output_c_base + k; for (index_t i = 0; i < step; ++i) { output_ptr[i] = std::log(output_ptr[i]); } @@ -179,7 +179,7 @@ class SoftmaxOp : public Operation { MaceStatus RunForNHWC(OpContext *context) { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); - float *output_data = output->mutable_data(); + T *output_data = output->mutable_data(); MACE_CHECK(input->dim_size() >= 2, "The input->dim_size() >= 2 failed."); index_t class_size = input->dim(input->dim_size() - 1); @@ -196,16 +196,16 @@ class SoftmaxOp : public Operation { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); - const float *input_data = input->data(); + const T *input_data = input->data(); float std_lowest = std::numeric_limits::lowest(); for (index_t b_offset = 0; b_offset < batch_size; b_offset += batch_stride) { - const float *input_b_ptr = input_data + b_offset; - float *output_b_ptr = output_data + b_offset; + const T *input_b_ptr = input_data + b_offset; + T *output_b_ptr = output_data + b_offset; thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { for (index_t k = start; k < end; k += step) { - const float *input_ptr = input_b_ptr + k; - float *output_ptr = output_b_ptr + k; + const T *input_ptr = input_b_ptr + k; + T *output_ptr = output_b_ptr + k; float max_val = std_lowest; for (index_t c = 0; c < class_size; ++c) { @@ -214,15 +214,15 @@ class SoftmaxOp : public Operation { float sum = 0; for (index_t c = 0; c < class_size; ++c) { - float exp_value = ::exp(input_ptr[c] - max_val); + float exp_value = std::exp(input_ptr[c] - max_val); sum += exp_value; output_ptr[c] = exp_value; } if (use_log_) { for (index_t c = 0; c < class_size; ++c) { - output_ptr[c] /= sum; - output_ptr[c] = std::log(output_ptr[c]); + float output = (static_cast(output_ptr[c])) / sum; + output_ptr[c] = std::log(output); } } else { for (index_t c = 0; c < class_size; ++c) { @@ -306,8 +306,8 @@ class SoftmaxOp : public Operation { float sum = 0; std::vector depth_cache(depth); for (index_t d = 0; d < depth; ++d) { - float exp_value = ::exp((static_cast(input_ptr[d]) - max_value) - * input_scale); + float exp_value = std::exp( + (static_cast(input_ptr[d]) - max_value) * input_scale); sum += exp_value; depth_cache[d] = exp_value; } @@ -524,6 +524,8 @@ class SoftmaxOp : public Operation { void RegisterSoftmax(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index e5d7ec5ca8ff5d33c215e913b4af4bd96b45cc71..641d746f46c95df1f6aee8ac8d4757b2b78732b9 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -90,8 +90,8 @@ class SpaceToBatchOpBase : public Operation { template class SpaceToBatchNDOp; -template<> -class SpaceToBatchNDOp : public SpaceToBatchOpBase { +template +class SpaceToBatchNDOp : public SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) {} @@ -115,8 +115,8 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { int block_shape_h = block_shape_[0]; int block_shape_w = block_shape_[1]; - const float *input_data = space_tensor->data(); - float *output_data = batch_tensor->mutable_data(); + const T *input_data = space_tensor->data(); + T *output_data = batch_tensor->mutable_data(); index_t in_batches = space_tensor->dim(0); index_t in_height = space_tensor->dim(2); @@ -158,20 +158,20 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { (in_width + pad_left - tile_w + block_shape_w - 1) / block_shape_w); - const float *input_base = + const T *input_base = input_data + (in_b * channels + c) * in_height * in_width; - float *output_base = + T *output_base = output_data + (b * channels + c) * out_height * out_width; memset(output_base + block_h * out_width, 0, - (valid_h_start - block_h) * out_width * sizeof(float)); + (valid_h_start - block_h) * out_width * sizeof(T)); index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top; for (index_t h = valid_h_start; h < valid_h_end; ++h) { memset(output_base + h * out_width, 0, - valid_w_start * sizeof(float)); + valid_w_start * sizeof(T)); index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left; for (index_t w = valid_w_start; w < valid_w_end; ++w) { @@ -183,13 +183,13 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { memset(output_base + h * out_width + valid_w_end, 0, - (out_width - valid_w_end) * sizeof(float)); + (out_width - valid_w_end) * sizeof(T)); } // h memset(output_base + valid_h_end * out_width, 0, (std::min(out_height, block_h + block_h_size) - valid_h_end) - * out_width * sizeof(float)); + * out_width * sizeof(T)); } // b } // block_h } // c @@ -332,6 +332,8 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { void RegisterSpaceToBatchND(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "SpaceToBatchND", diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 3653d09a9454057f2d2143774f4fa97ecc13167d..72ba64c713ef1adb0412bcf58af5f1e77c033189 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -28,8 +28,8 @@ namespace ops { template class SpaceToDepthOp; -template<> -class SpaceToDepthOp : public Operation { +template +class SpaceToDepthOp : public Operation { public: explicit SpaceToDepthOp(OpConstructContext *context) : Operation(context), @@ -59,8 +59,8 @@ class SpaceToDepthOp : public Operation { Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard output_guard(output); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); + const T *input_ptr = input->data(); + T *output_ptr = output->mutable_data(); for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < input_depth; ++d) { @@ -184,6 +184,8 @@ class SpaceToDepthOp : public Operation { void RegisterSpaceToDepth(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SpaceToDepth", SpaceToDepthOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "SpaceToDepth", diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc index af1536717ae66c3a1223c5bb7b4f346c7821cfd6..13c027c3caa3130bc4e75115cb593cbf46c5223e 100644 --- a/mace/ops/splice.cc +++ b/mace/ops/splice.cc @@ -157,6 +157,8 @@ class SpliceOp : public Operation { void RegisterSplice(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Splice", SpliceOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Splice", SpliceOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/split.cc b/mace/ops/split.cc index bb86aecbfc872e1d439b2aaa07bbbe93da81af7e..b0e4279156ca9665e7cd4e47c3bfff5f577e7f87 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -130,8 +130,8 @@ class SplitOp : public Operation { #endif // MACE_ENABLE_OPENCL void RegisterSplit(OpRegistry *op_registry) { - MACE_REGISTER_OP(op_registry, "Split", SplitOp, - DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Split", SplitOp, DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp); diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index 0e2b8d2bb891eceb5c46836af0e2e9b0bb81af15..901b52aba9360ff350e04f302c60ec52e1fcb5b0 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -67,9 +67,10 @@ class SqrDiffMeanOp : public Operation { const index_t img_size = input0->dim(2) * input0->dim(3); const index_t bc = input0->dim(0) * input0->dim(1); + // TODO(luxuhui): cache the output_ptr[i] for (int i = 0; i < bc; ++i) { for (int j = 0; j < img_size; ++j) { - T diff = input_ptr0[i * img_size + j] - input_ptr1[i]; + float diff = input_ptr0[i * img_size + j] - input_ptr1[i]; output_ptr[i] += diff * diff; } output_ptr[i] /= img_size; @@ -104,6 +105,8 @@ class SqrDiffMeanOp : public Operation { void RegisterSqrDiffMean(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::CPU); MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp); } diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 590479dd327f382286632bd27458135281e6aec7..75afc92620b61fc278fdb97259e949dfbcde5c8c 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -80,6 +80,7 @@ class SqueezeOp : public SqueezeOpRaw { void RegisterSqueeze(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU); #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc index 87cc51a0c0e89d9d8a6c48d715ce10d32a08061c..790db5864be585066be3d36aeb3ba8b25b0b370b 100644 --- a/mace/ops/stack.cc +++ b/mace/ops/stack.cc @@ -80,6 +80,7 @@ class StackOp : public Operation { void RegisterStack(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Stack", StackOp, DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t); } diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index bf44d5a162b19b1d813acc5c39ad9a1077622887..fa6edf340cc5e214d2c8b9768ac0ba3b2437f2d8 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -354,6 +354,8 @@ class StridedSliceOp : public Operation { void RegisterStridedSlice(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, DeviceType::CPU, int32_t); } diff --git a/mace/ops/subsample.cc b/mace/ops/subsample.cc index e3c2977e2e8b7f091c983d510faf1d51dea73a71..dd58e7eaab599b4dab8b3e129ec9f5512386fe14 100644 --- a/mace/ops/subsample.cc +++ b/mace/ops/subsample.cc @@ -104,6 +104,8 @@ class SubsampleOp : public Operation { void RegisterSubsample(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Subsample", SubsampleOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/sum_group.cc b/mace/ops/sum_group.cc index b8524a7480f3c5095e5bbf6d50ec92f3c26240ea..bd76ae5174f763a0889c6f7dc3aa4b3d9dd7f384 100644 --- a/mace/ops/sum_group.cc +++ b/mace/ops/sum_group.cc @@ -105,6 +105,8 @@ class SumGroupOp : public Operation { void RegisterSumGroup(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "SumGroup", SumGroupOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc index e2b2fa2eb72177ae153c1b70f27fb333ebaee1af..c05c00f561aa264b30b4bf569fcafd95e0f4c34c 100644 --- a/mace/ops/target_rms_norm.cc +++ b/mace/ops/target_rms_norm.cc @@ -152,6 +152,8 @@ class TargetRMSNormOp : public Operation { void RegisterTargetRMSNorm(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/tile.cc b/mace/ops/tile.cc index c09ca92104706649c525dc4a0bba258d5dbc1f0c..93413a7e8fcfadd9d70ae4f3ae1f117e47d7a779 100644 --- a/mace/ops/tile.cc +++ b/mace/ops/tile.cc @@ -113,6 +113,7 @@ class TileOp : public Operation { void RegisterTile(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Tile", TileOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Tile", TileOp, DeviceType::CPU); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Tile").SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index a366f3d421cec6dbd7172dc25b18bd660165cb12..3a5eb68b441ae9bf32664a38f8587e93c42a879b 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -27,11 +27,8 @@ namespace mace { namespace ops { -template -class TransposeOp; - -template -class TransposeOp : public Operation { +template +class TransposeOp : public Operation { public: explicit TransposeOp(OpConstructContext *context) : Operation(context), @@ -54,8 +51,8 @@ class TransposeOp : public Operation { Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); - const float *input_data = input->data(); - float *output_data = output->mutable_data(); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); return Transpose(&context->device()->cpu_runtime()->thread_pool(), input_data, input->shape(), dims_, output_data); @@ -68,6 +65,8 @@ class TransposeOp : public Operation { void RegisterTranspose(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Transpose", TransposeOp, + DeviceType::CPU); } } // namespace ops diff --git a/mace/ops/unsqueeze.cc b/mace/ops/unsqueeze.cc index cc28c14d8865f4bdcac79f6c5b8974f5530fba52..8fb7747e1e819477539beb1998eb51d9ec04e633 100644 --- a/mace/ops/unsqueeze.cc +++ b/mace/ops/unsqueeze.cc @@ -63,6 +63,8 @@ class UnsqueezeOp : public Operation { void RegisterUnsqueeze(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Unsqueeze", UnsqueezeOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp, DeviceType::CPU, int32_t); } diff --git a/mace/ops/unstack.cc b/mace/ops/unstack.cc index d0928614293dee689c77b607c57469c933c32b0a..6e46051faa2ab017988dce5e23fd1123c85dcb74 100644 --- a/mace/ops/unstack.cc +++ b/mace/ops/unstack.cc @@ -77,6 +77,8 @@ class UnstackOp : public Operation { void RegisterUnstack(OpRegistry *op_registry) { MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, DeviceType::CPU, float); + MACE_REGISTER_BF16_OP(op_registry, "Unstack", UnstackOp, + DeviceType::CPU); MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, DeviceType::CPU, int32_t); } diff --git a/test/ccunit/mace/ops/arm/fp32/gemm_test.cc b/test/ccunit/mace/ops/arm/fp32/gemm_test.cc index 65a516f966326661da8f214de5803fe32e2402b0..48a57b21e7cef9bbc166308c14039ccb9e682956 100644 --- a/test/ccunit/mace/ops/arm/fp32/gemm_test.cc +++ b/test/ccunit/mace/ops/arm/fp32/gemm_test.cc @@ -17,8 +17,8 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/ref/gemm.h" +#include "mace/ops/delegator/gemm.h" +#include "mace/ops/ops_test_util.h" #include "mace/ops/testing/test_utils.h" namespace mace { @@ -50,41 +50,48 @@ void TestGemmFloat32(const index_t batch, GenerateRandomRealTypeData(rhs.shape(), rhs_data); GenerateRandomRealTypeData(output.shape(), output_data); } - ::mace::ops::arm::fp32::Gemm gemm((delegator::GemmParam())); + utils::ThreadPool thread_pool(1, AFFINITY_NONE); thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); - OpContext context(nullptr, &cpu_device); - gemm.Compute(&context, - &lhs, - &rhs, - batch, - rows, - cols, - depth, - lhs_major, - rhs_major, - output_major, - lhs_batched, - rhs_batched, - &output); + OpsTestNet net; + OpContext context(net.ws(), &cpu_device); + std::unique_ptr gemm = delegator::Gemm::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON), + delegator::GemmParam()); + gemm->Compute(&context, + &lhs, + &rhs, + batch, + rows, + cols, + depth, + lhs_major, + rhs_major, + output_major, + lhs_batched, + rhs_batched, + &output); Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT); expected_output.Resize({batch, rows, cols}); - ::mace::ops::ref::Gemm gemm_ref((delegator::GemmParam())); - gemm_ref.Compute(nullptr, - &lhs, - &rhs, - batch, - rows, - cols, - depth, - lhs_major, - rhs_major, - output_major, - lhs_batched, - rhs_batched, - &expected_output); + std::unique_ptr gemm_ref = delegator::Gemm::Create( + context.workspace(), MACE_DELEGATOR_KEY( + Gemm, DeviceType::CPU, float, ImplType::REF), delegator::GemmParam()); + gemm_ref->Compute(&context, + &lhs, + &rhs, + batch, + rows, + cols, + depth, + lhs_major, + rhs_major, + output_major, + lhs_batched, + rhs_batched, + &expected_output); ExpectTensorNear(expected_output, output); } diff --git a/test/ccunit/mace/ops/arm/fp32/gemv_test.cc b/test/ccunit/mace/ops/arm/fp32/gemv_test.cc index 3a224ea261c3782ec37336f309fddd9ef539f110..54e0196c68864518e6a3021f48ba548ba8092ca0 100644 --- a/test/ccunit/mace/ops/arm/fp32/gemv_test.cc +++ b/test/ccunit/mace/ops/arm/fp32/gemv_test.cc @@ -17,8 +17,8 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/gemv.h" -#include "mace/ops/ref/gemv.h" +#include "mace/ops/delegator/gemv.h" +#include "mace/ops/ops_test_util.h" #include "mace/ops/testing/test_utils.h" namespace mace { @@ -52,34 +52,38 @@ void TestGemvFloat32(const index_t batch, utils::ThreadPool thread_pool(1, AFFINITY_NONE); thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); - OpContext context(nullptr, &cpu_device); - ::mace::ops::arm::fp32::Gemv gemv = - ::mace::ops::arm::fp32::Gemv(DelegatorParam()); - gemv.Compute(&context, - &lhs, - &rhs, - &bias, - batch, - height, - width, - lhs_batched, - rhs_batched, - &output); + OpsTestNet net; + OpContext context(net.ws(), &cpu_device); + std::unique_ptr gemv = delegator::Gemv::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON), + DelegatorParam()); + gemv->Compute(&context, + &lhs, + &rhs, + &bias, + batch, + height, + width, + lhs_batched, + rhs_batched, + &output); Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT); expected_output.Resize({batch, height}); - ::mace::ops::ref::Gemv gemv_ref = - ::mace::ops::ref::Gemv(DelegatorParam()); - gemv_ref.Compute(nullptr, - &lhs, - &rhs, - &bias, - batch, - height, - width, - lhs_batched, - rhs_batched, - &expected_output); + std::unique_ptr gemv_ref = delegator::Gemv::Create( + context.workspace(), MACE_DELEGATOR_KEY( + Gemv, DeviceType::CPU, float, ImplType::REF), DelegatorParam()); + gemv_ref->Compute(&context, + &lhs, + &rhs, + &bias, + batch, + height, + width, + lhs_batched, + rhs_batched, + &expected_output); ExpectTensorNear(expected_output, output); } diff --git a/test/ccunit/mace/ops/arm/q8/gemv_test.cc b/test/ccunit/mace/ops/arm/q8/gemv_test.cc index 619d343fdd4ccf9ea051b22d0004cb3edc1a5352..e970555169751e07e43b5a3828ac0e0eee7d9d56 100644 --- a/test/ccunit/mace/ops/arm/q8/gemv_test.cc +++ b/test/ccunit/mace/ops/arm/q8/gemv_test.cc @@ -17,8 +17,8 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/q8/gemv.h" -#include "mace/ops/ref/gemv.h" +#include "mace/ops/delegator/gemv.h" +#include "mace/ops/ops_test_util.h" #include "mace/ops/testing/test_utils.h" namespace mace { @@ -57,34 +57,38 @@ void TestGemvInt32(const index_t batch, utils::ThreadPool thread_pool(1, AFFINITY_NONE); thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); - OpContext context(nullptr, &cpu_device); - mace::ops::arm::q8::Gemv gemv = - mace::ops::arm::q8::Gemv(DelegatorParam()); - gemv.Compute(&context, - &lhs, - &rhs, - &bias, - batch, - height, - width, - lhs_batched, - rhs_batched, - &output); + OpsTestNet net; + OpContext context(net.ws(), &cpu_device); + std::unique_ptr gemv = delegator::Gemv::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON), + DelegatorParam()); + gemv->Compute(&context, + &lhs, + &rhs, + &bias, + batch, + height, + width, + lhs_batched, + rhs_batched, + &output); Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32); expected_output.Resize({batch, height}); - mace::ops::ref::Gemv gemv_ref = - mace::ops::ref::Gemv(DelegatorParam()); - gemv_ref.Compute(nullptr, - &lhs, - &rhs, - &bias, - batch, - height, - width, - lhs_batched, - rhs_batched, - &expected_output); + std::unique_ptr gemv_ref = delegator::Gemv::Create( + context.workspace(), MACE_DELEGATOR_KEY( + Gemv, DeviceType::CPU, int32_t, ImplType::REF), DelegatorParam()); + gemv_ref->Compute(&context, + &lhs, + &rhs, + &bias, + batch, + height, + width, + lhs_batched, + rhs_batched, + &expected_output); Tensor::MappingGuard output_guard(&output); Tensor::MappingGuard expected_guard(&expected_output); @@ -131,36 +135,40 @@ void TestGemvUint8(const index_t batch, utils::ThreadPool thread_pool(1, AFFINITY_NONE); thread_pool.Init(); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); - OpContext context(nullptr, &cpu_device); - mace::ops::arm::q8::Gemv gemv = - mace::ops::arm::q8::Gemv(DelegatorParam()); - gemv.Compute(&context, - &lhs, - &rhs, - &bias, - batch, - height, - width, - lhs_batched, - rhs_batched, - &output); + OpsTestNet net; + OpContext context(net.ws(), &cpu_device); + std::unique_ptr gemv = delegator::Gemv::Create( + context.workspace(), + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON), + DelegatorParam()); + gemv->Compute(&context, + &lhs, + &rhs, + &bias, + batch, + height, + width, + lhs_batched, + rhs_batched, + &output); Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32); expected_output.SetScale(0.6); expected_output.SetZeroPoint(57); expected_output.Resize({batch, height}); - mace::ops::ref::Gemv gemv_ref = - mace::ops::ref::Gemv(DelegatorParam()); - gemv_ref.Compute(nullptr, - &lhs, - &rhs, - &bias, - batch, - height, - width, - lhs_batched, - rhs_batched, - &expected_output); + std::unique_ptr gemv_ref = delegator::Gemv::Create( + context.workspace(), MACE_DELEGATOR_KEY( + Gemv, DeviceType::CPU, uint8_t, ImplType::REF), DelegatorParam()); + gemv_ref->Compute(&context, + &lhs, + &rhs, + &bias, + batch, + height, + width, + lhs_batched, + rhs_batched, + &expected_output); Tensor::MappingGuard output_guard(&output); Tensor::MappingGuard expected_guard(&expected_output); diff --git a/test/ccunit/mace/ops/matmul_test.cc b/test/ccunit/mace/ops/matmul_test.cc index 9d46f0e1d97391e6dbf539f0cbee21b29918a1fc..4ab2ec767f4ba9ee25f71c5ff51de095d59fb7c2 100644 --- a/test/ccunit/mace/ops/matmul_test.cc +++ b/test/ccunit/mace/ops/matmul_test.cc @@ -16,7 +16,6 @@ #include "mace/ops/delegator/gemm.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/ref/gemm.h" namespace mace { namespace ops { @@ -112,7 +111,9 @@ void Complex(const std::vector &batch, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - ref::Gemm gemm = ref::Gemm(delegator::GemmParam()); + std::unique_ptr gemm = delegator::Gemm::Create( + net.ws(), MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF), + delegator::GemmParam()); Tensor expected_output_tensor; std::vector expected_output_shape({rows, cols}); expected_output_shape.insert(expected_output_shape.begin(), @@ -121,20 +122,20 @@ void Complex(const std::vector &batch, expected_output_tensor.Resize(expected_output_shape); index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1, std::multiplies()); - gemm.Compute(nullptr, - net.GetTensor("A"), - net.GetTensor("B"), - batch_count, - lhs_rows, - lhs_cols, - rhs_rows, - rhs_cols, - transpose_lhs, - transpose_rhs, - false, - lhs_batched, - rhs_batched, - &expected_output_tensor); + gemm->Compute(nullptr, + net.GetTensor("A"), + net.GetTensor("B"), + batch_count, + lhs_rows, + lhs_cols, + rhs_rows, + rhs_cols, + transpose_lhs, + transpose_rhs, + false, + lhs_batched, + rhs_batched, + &expected_output_tensor); ExpectTensorNear(expected_output_tensor, *net.GetTensor("Output"), 1e-4, 1e-2); diff --git a/tools/cmake/cmake-build-aarch64-linux-gnu.sh b/tools/cmake/cmake-build-aarch64-linux-gnu.sh index 25f425ad3c4a65d5ebc5f2bfe0ce0522d1fb54b0..52a2c744b35eb125930e8c3162c5a604b1e71d54 100755 --- a/tools/cmake/cmake-build-aarch64-linux-gnu.sh +++ b/tools/cmake/cmake-build-aarch64-linux-gnu.sh @@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \ -DMACE_ENABLE_NEON=ON \ -DMACE_ENABLE_QUANTIZE=ON \ -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \ + -DMACE_ENABLE_BFLOAT16=ON \ -DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_TESTS=ON \ diff --git a/tools/cmake/cmake-build-arm-linux-gnueabihf.sh b/tools/cmake/cmake-build-arm-linux-gnueabihf.sh index 5c96ed5e5aa699b639a9cce2cdfa1a9d27f403d5..c0725ff174034bb834f08acf0141ad9872db5f0c 100755 --- a/tools/cmake/cmake-build-arm-linux-gnueabihf.sh +++ b/tools/cmake/cmake-build-arm-linux-gnueabihf.sh @@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \ -DMACE_ENABLE_NEON=ON \ -DMACE_ENABLE_QUANTIZE=ON \ -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \ + -DMACE_ENABLE_BFLOAT16=ON \ -DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_TESTS=ON \ diff --git a/tools/cmake/cmake-build-arm64-v8a.sh b/tools/cmake/cmake-build-arm64-v8a.sh index 999b0b74b1fc7ef36c827203f870ef44f45f25a1..2474b7b35a30aeeb45272cdc3d8eaa47e031abcb 100755 --- a/tools/cmake/cmake-build-arm64-v8a.sh +++ b/tools/cmake/cmake-build-arm64-v8a.sh @@ -31,13 +31,14 @@ cmake -DANDROID_ABI="arm64-v8a" \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \ -DANDROID_NATIVE_API_LEVEL=21 \ -DCMAKE_BUILD_TYPE=Release \ - -DANDROID_STL=c++_shared \ + -DANDROID_STL=c++_shared \ -DMACE_ENABLE_NEON=ON \ -DMACE_ENABLE_QUANTIZE=ON \ -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \ -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \ -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \ -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \ + -DMACE_ENABLE_BFLOAT16=ON \ -DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_TESTS=ON \ diff --git a/tools/cmake/cmake-build-armeabi-v7a.sh b/tools/cmake/cmake-build-armeabi-v7a.sh index c98d196ce8791a50e5d3106cbea2a58fc27cc2dd..b7ae723daf76963a16b24c0daab7b6ca348e02cb 100755 --- a/tools/cmake/cmake-build-armeabi-v7a.sh +++ b/tools/cmake/cmake-build-armeabi-v7a.sh @@ -40,6 +40,7 @@ cmake -DANDROID_ABI="armeabi-v7a" \ -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \ -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \ -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \ + -DMACE_ENABLE_BFLOAT16=ON \ -DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_TESTS=ON \ diff --git a/tools/cmake/cmake-build-host.sh b/tools/cmake/cmake-build-host.sh index b76f8cf388b9b5656750cabafb57ee7760fe3d68..3902c5e52233d3c664c3852e32d536574da36d9b 100755 --- a/tools/cmake/cmake-build-host.sh +++ b/tools/cmake/cmake-build-host.sh @@ -18,6 +18,7 @@ mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} cmake -DMACE_ENABLE_NEON=OFF \ -DMACE_ENABLE_QUANTIZE=OFF \ -DMACE_ENABLE_OPENCL=OFF \ + -DMACE_ENABLE_BFLOAT16=ON \ -DMACE_ENABLE_TESTS=ON \ -DMACE_ENABLE_BENCHMARKS=ON \ -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \ diff --git a/tools/converter.py b/tools/converter.py index dd9a6cbc9218eef8fa771210b9e2ae0eb2ebbbc5..6f33885b9945135268db7ed7450b2a0e60ae39d1 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -171,7 +171,16 @@ def parse_device_type(runtime): return device_type -def get_hexagon_mode(configs): +def bfloat16_enabled(configs): + for model_name in configs[YAMLKeyword.models]: + model_config = configs[YAMLKeyword.models][model_name] + dtype = model_config.get(YAMLKeyword.data_type, FPDataType.fp16_fp32) + if dtype == FPDataType.bf16_fp32: + return True + return False + + +def hexagon_enabled(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: model_runtime = \ @@ -184,7 +193,7 @@ def get_hexagon_mode(configs): return False -def get_hta_mode(configs): +def hta_enabled(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: model_runtime = \ @@ -197,7 +206,7 @@ def get_hta_mode(configs): return False -def get_apu_mode(configs): +def apu_enabled(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: model_runtime = \ @@ -210,7 +219,7 @@ def get_apu_mode(configs): return False -def get_opencl_mode(configs): +def opencl_enabled(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: model_runtime = \ @@ -224,7 +233,7 @@ def get_opencl_mode(configs): return False -def get_quantize_mode(configs): +def quantize_enabled(configs): for model_name in configs[YAMLKeyword.models]: quantize = \ configs[YAMLKeyword.models][model_name].get( @@ -739,11 +748,12 @@ def build_model_lib(configs, address_sanitizer, debug_mode): MODEL_LIB_TARGET, abi=target_abi, toolchain=toolchain, - enable_hexagon=get_hexagon_mode(configs), - enable_hta=get_hta_mode(configs), - enable_apu=get_apu_mode(configs), - enable_opencl=get_opencl_mode(configs), - enable_quantize=get_quantize_mode(configs), + enable_hexagon=hexagon_enabled(configs), + enable_hta=hta_enabled(configs), + enable_apu=apu_enabled(configs), + enable_opencl=opencl_enabled(configs), + enable_quantize=quantize_enabled(configs), + enable_bfloat16=bfloat16_enabled(configs), address_sanitizer=address_sanitizer, symbol_hidden=get_symbol_hidden_mode(debug_mode), debug_mode=debug_mode @@ -900,12 +910,13 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, mace_run_target, abi=target_abi, toolchain=toolchain, - enable_hexagon=get_hexagon_mode(configs), - enable_hta=get_hta_mode(configs), - enable_apu=get_apu_mode(configs), + enable_hexagon=hexagon_enabled(configs), + enable_hta=hta_enabled(configs), + enable_apu=apu_enabled(configs), enable_openmp=enable_openmp, - enable_opencl=get_opencl_mode(configs), - enable_quantize=get_quantize_mode(configs), + enable_opencl=opencl_enabled(configs), + enable_quantize=quantize_enabled(configs), + enable_bfloat16=bfloat16_enabled(configs), address_sanitizer=address_sanitizer, symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type), debug_mode=debug_mode, diff --git a/tools/python/micro/scratch_computer.py b/tools/python/micro/scratch_computer.py index b88469bdf7350cdbf38dd65d049845cbda53fb71..986527189df7b95b14f1225f2b47eb6d43582889 100644 --- a/tools/python/micro/scratch_computer.py +++ b/tools/python/micro/scratch_computer.py @@ -74,6 +74,7 @@ class ScratchComputer: data_type == mace_pb2.DT_INT32: return 4 elif data_type == mace_pb2.DT_HALF or \ + data_type == mace_pb2.DT_BFLOAT16 or \ data_type == mace_pb2.DT_FLOAT16: return 2 elif data_type == mace_pb2.DT_UINT8: diff --git a/tools/python/utils/convert_util.py b/tools/python/utils/convert_util.py index 7d37a32be1fff6e35b3972a74c26ea68f785abaa..ba6a5cce637e1d865dd664ab824e41cd44079012 100644 --- a/tools/python/utils/convert_util.py +++ b/tools/python/utils/convert_util.py @@ -1,4 +1,3 @@ - # Copyright 2020 The MACE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -79,7 +78,7 @@ def merge_params(net_def, data_type): for tensor in net_def.tensors: if tensor.data_type == mace_pb2.DT_FLOAT \ or tensor.data_type == mace_pb2.DT_HALF \ - or tensor.data_type == mace_pb2.DT_FLOAT16\ + or tensor.data_type == mace_pb2.DT_FLOAT16 \ or tensor.data_type == mace_pb2.DT_BFLOAT16: del tensor.float_data[:] elif tensor.data_type == mace_pb2.DT_INT32: diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 219b8502fac4dd44f18886c2727b1633abd907ab..7642a881b5f0b751a7a1a66dd53c3fa44e50b3ad 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -270,6 +270,7 @@ def bazel_build(target, enable_neon=True, enable_opencl=True, enable_quantize=True, + enable_bfloat16=False, enable_rpcmem=True, address_sanitizer=False, symbol_hidden=True, @@ -286,6 +287,8 @@ def bazel_build(target, "openmp=%s" % str(enable_openmp).lower(), "--define", "quantize=%s" % str(enable_quantize).lower(), + "--define", + "bfloat16=%s" % str(enable_bfloat16).lower(), target, ) else: @@ -304,6 +307,8 @@ def bazel_build(target, "--define", "quantize=%s" % str(enable_quantize).lower(), "--define", + "bfloat16=%s" % str(enable_bfloat16).lower(), + "--define", "rpcmem=%s" % str(enable_rpcmem).lower(), "--define", "hexagon=%s" % str(enable_hexagon).lower(),