提交 950028c1 编写于 作者: L luxuhui

feature: support bf16

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 27757d88
...@@ -10,6 +10,7 @@ option(MACE_ENABLE_CUDA "whether to enable CUDA support" OFF) ...@@ -10,6 +10,7 @@ option(MACE_ENABLE_CUDA "whether to enable CUDA support" OFF)
option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF) option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF)
option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF) option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF)
option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF) option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF)
option(MACE_ENABLE_BFLOAT16 "whether to enable bfloat16 support" OFF)
option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF) option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF)
option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF) option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF)
option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON) option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON)
...@@ -116,6 +117,10 @@ if(MACE_ENABLE_MTK_APU) ...@@ -116,6 +117,10 @@ if(MACE_ENABLE_MTK_APU)
add_definitions(-DMACE_ENABLE_MTK_APU) add_definitions(-DMACE_ENABLE_MTK_APU)
endif(MACE_ENABLE_MTK_APU) endif(MACE_ENABLE_MTK_APU)
if(MACE_ENABLE_BFLOAT16)
add_definitions(-DMACE_ENABLE_BFLOAT16)
endif(MACE_ENABLE_BFLOAT16)
if(MACE_ENABLE_OBFUSCATE) if(MACE_ENABLE_OBFUSCATE)
add_definitions(-DMACE_OBFUSCATE_LITERALS) add_definitions(-DMACE_OBFUSCATE_LITERALS)
endif(MACE_ENABLE_OBFUSCATE) endif(MACE_ENABLE_OBFUSCATE)
......
...@@ -85,7 +85,7 @@ in one deployment file. ...@@ -85,7 +85,7 @@ in one deployment file.
* - runtime * - runtime
- The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU.
* - data_type * - data_type
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32.
* - input_data_types * - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats * - input_data_formats
...@@ -582,9 +582,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati ...@@ -582,9 +582,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati
Therefore, the default storage type for a regular model in MACE is half. However, Therefore, the default storage type for a regular model in MACE is half. However,
if the model is very sensitive to accuracy, storage type can be changed to float. if the model is very sensitive to accuracy, storage type can be changed to float.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``. In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``,
for CPU it can also be changed to ``bf16_fp32``.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float. For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float.
For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float.
......
...@@ -63,7 +63,7 @@ There are many advanced options supported. ...@@ -63,7 +63,7 @@ There are many advanced options supported.
* - runtime * - runtime
- The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU.
* - data_type * - data_type
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32.
* - input_data_types * - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats * - input_data_formats
...@@ -438,9 +438,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati ...@@ -438,9 +438,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati
Therefore, the default storage type for a regular model in MACE is half. However, Therefore, the default storage type for a regular model in MACE is half. However,
if the model is very sensitive to accuracy, storage type can be changed to float. if the model is very sensitive to accuracy, storage type can be changed to float.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``. In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``,
for CPU it can also be changed to ``bf16_fp32``.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float. For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float.
For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float. For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float.
......
...@@ -43,12 +43,13 @@ MACE 需要安装下列依赖: ...@@ -43,12 +43,13 @@ MACE 需要安装下列依赖:
- 版本和说明 - 版本和说明
* - Android NDK * - Android NDK
- `NDK 安装指南 <https://developer.android.com/ndk/guides/setup#install>`__ - `NDK 安装指南 <https://developer.android.com/ndk/guides/setup#install>`__
- Required by Android build, r15b, r15c, r16b, r17b - 安卓编译需要, bazel用户可以使用r15b及以上的版本, cmake用户可以使用r17b及以上版本
* - CMake * - CMake
- apt-get install cmake - apt-get install cmake
- >= 3.11.3 - >= 3.11.3
* - ADB * - ADB
- Linux:``apt-get install android-tools-adb`` Mac:``brew cask install android-platform-tools`` - | Linux:``apt-get install android-tools-adb``
| Mac:``brew cask install android-platform-tools``
- Android 运行需要, >= 1.0.32 - Android 运行需要, >= 1.0.32
* - TensorFlow * - TensorFlow
- pip install tensorflow==1.8.0 - pip install tensorflow==1.8.0
......
...@@ -132,6 +132,14 @@ config_setting( ...@@ -132,6 +132,14 @@ config_setting(
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
) )
config_setting(
name = "bfloat16_enabled",
define_values = {
"bfloat16": "true",
},
visibility = ["//visibility:public"],
)
config_setting( config_setting(
name = "rpcmem_enabled", name = "rpcmem_enabled",
define_values = { define_values = {
......
...@@ -9,6 +9,7 @@ load( ...@@ -9,6 +9,7 @@ load(
"if_android", "if_android",
"if_android_armv7", "if_android_armv7",
"if_apu_enabled", "if_apu_enabled",
"if_bfloat16_enabled",
"if_hexagon_enabled", "if_hexagon_enabled",
"if_hexagon_or_hta_enabled", "if_hexagon_or_hta_enabled",
"if_hta_enabled", "if_hta_enabled",
...@@ -87,6 +88,8 @@ cc_library( ...@@ -87,6 +88,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([ ]) + if_hta_enabled([
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_BFLOAT16_H_
#define MACE_CORE_BFLOAT16_H_
#ifdef MACE_ENABLE_BFLOAT16
#include <algorithm>
#include <cmath>
#include <sstream>
namespace mace {
// Bit-level pun between a 32-bit float and its raw IEEE-754 bits.
// Used to convert bfloat16 <-> float by truncating / zero-extending
// the low 16 bits of the float representation.
union Sphinx {
  uint32_t i;
  float f;

  Sphinx(uint32_t value) : i(value) {}

  Sphinx(float value) : f(value) {}
};

// 16-bit "brain floating point" number: the upper 16 bits of an IEEE-754
// binary32 float (1 sign bit, 8 exponent bits, 7 mantissa bits).
// Conversion from float truncates the mantissa (round-toward-zero on the
// stored bits); conversion back zero-fills the low 16 bits.
class BFloat16 {
 public:
  BFloat16() : data_(0) {}
  // we need implicit transformation, so `explicit` keyword is not used
  BFloat16(float value) : data_(FromFloat(value)) {}  // NOLINT

  operator float() const {
    return ToFloat();
  }

  operator double() const {
    return static_cast<double>(ToFloat());
  }

  operator int() const {
    return static_cast<int>(ToFloat());
  }

  template<typename T>
  void operator=(T value) {
    data_ = FromFloat(static_cast<float>(value));
  }

  BFloat16 operator-() const {
    return BFloat16(-ToFloat());
  }

  template<typename T>
  BFloat16 operator+(T value) const {
    return BFloat16(ToFloat() + static_cast<float>(value));
  }

  template<typename T>
  BFloat16 operator-(T value) const {
    return BFloat16(ToFloat() - static_cast<float>(value));
  }

  template<typename T>
  BFloat16 operator*(T value) const {
    return BFloat16(ToFloat() * static_cast<float>(value));
  }

  template<typename T>
  BFloat16 operator/(T value) const {
    return BFloat16(ToFloat() / static_cast<float>(value));
  }

  template<typename T>
  bool operator>(T value) const {
    return ToFloat() > static_cast<float>(value);
  }

  template<typename T>
  bool operator>=(T value) const {
    return ToFloat() >= static_cast<float>(value);
  }

  template<typename T>
  bool operator<(T value) const {
    return ToFloat() < static_cast<float>(value);
  }

  template<typename T>
  bool operator<=(T value) const {
    return ToFloat() <= static_cast<float>(value);
  }

  template<typename T>
  bool operator==(T value) const {
    return ToFloat() == static_cast<float>(value);
  }

  template<typename T>
  void operator+=(T value) {
    data_ = FromFloat(ToFloat() + static_cast<float>(value));
  }

  template<typename T>
  void operator/=(T value) {
    data_ = FromFloat(ToFloat() / static_cast<float>(value));
  }

  template<typename T>
  void operator-=(T value) {
    data_ = FromFloat(ToFloat() - static_cast<float>(value));
  }

  template<typename T>
  void operator*=(T value) {
    data_ = FromFloat(ToFloat() * static_cast<float>(value));
  }

 private:
  // Zero-extend the stored high 16 bits back to a full float.
  // NOTE: cast to uint32_t BEFORE shifting. The original `data_ << 16`
  // shifted a uint16_t promoted to signed int, so any value with the
  // sign bit set (every negative bfloat16) shifted into the int sign
  // bit -- undefined behavior in C++11.
  float ToFloat() const {
    return Sphinx(static_cast<uint32_t>(data_) << 16).f;
  }

  // Truncate a float to its upper 16 bits (no rounding).
  static uint16_t FromFloat(float value) {
    return static_cast<uint16_t>(Sphinx(value).i >> 16);
  }

  uint16_t data_;
};

// Exact bit-wise comparison / copy when both operands are BFloat16,
// avoiding a round-trip through float.
template<>
inline bool BFloat16::operator==(const BFloat16 &value) const {
  return data_ == value.data_;
}

template<>
inline void BFloat16::operator=(const BFloat16 &value) {
  data_ = value.data_;
}
} // namespace mace
// Overloads of common <cmath>/<algorithm> functions for BFloat16, so that
// templated kernel code can call fabs/sqrt/max/... uniformly on bfloat16
// data. Each overload widens to float and forwards to the float version.
// NOTE(review): adding overloads for a program-defined type to namespace
// std is technically undefined behavior per the C++ standard; it works on
// the supported toolchains but ADL-based overloads in namespace mace would
// be the conforming alternative -- confirm before relying on it elsewhere.
namespace std {
inline float fabs(const mace::BFloat16 &value) {
  return fabs(static_cast<float>(value));
}

inline float abs(const mace::BFloat16 &value) {
  return abs(static_cast<float>(value));
}

inline float sqrt(const mace::BFloat16 &value) {
  return sqrt(static_cast<float>(value));
}

inline float log(const mace::BFloat16 &value) {
  return log(static_cast<float>(value));
}

inline float tanh(const mace::BFloat16 &value) {
  return tanh(static_cast<float>(value));
}

inline float exp(const mace::BFloat16 &value) {
  return exp(static_cast<float>(value));
}

// NOTE(review): unlike std::ceil/std::floor these return int, not float,
// truncating the float result -- confirm callers expect integral results.
inline int ceil(const mace::BFloat16 &value) {
  return ceil(static_cast<float>(value));
}

inline int floor(const mace::BFloat16 &value) {
  return floor(static_cast<float>(value));
}

// Mixed float/BFloat16 min/max; the bfloat16 operand is widened to float.
inline float max(const mace::BFloat16 &a, const float &b) {
  return max(static_cast<float>(a), b);
}

inline float max(const float &a, const mace::BFloat16 &b) {
  return max(a, static_cast<float>(b));
}

inline float min(const mace::BFloat16 &a, const float &b) {
  return min(static_cast<float>(a), b);
}

inline float min(const float &a, const mace::BFloat16 &b) {
  return min(a, static_cast<float>(b));
}

inline float pow(const mace::BFloat16 &a, const mace::BFloat16 &b) {
  return pow(static_cast<float>(a), static_cast<float>(b));
}

inline float pow(const mace::BFloat16 &a, const float &b) {
  return pow(static_cast<float>(a), b);
}

inline float pow(const float &a, const mace::BFloat16 &b) {
  return pow(a, static_cast<float>(b));
}

// Stream output: print the bfloat16 as its widened float value.
inline ostream &operator<<(ostream &ss,  // NOLINT
                           const mace::BFloat16 &value) {
  return ss << static_cast<float>(value);
}

}  // namespace std
// Mixed-type arithmetic with a plain float on the left-hand side and a
// BFloat16 on the right. Every operator widens the bfloat16 operand to
// float and performs the computation in float precision; the compound
// forms update the float operand in place.
inline float operator+(const float &a, const mace::BFloat16 &value) {
  const float rhs = static_cast<float>(value);
  return a + rhs;
}

inline float operator-(const float &a, const mace::BFloat16 &value) {
  const float rhs = static_cast<float>(value);
  return a - rhs;
}

inline float operator*(const float &a, const mace::BFloat16 &value) {
  const float rhs = static_cast<float>(value);
  return a * rhs;
}

inline float operator/(const float &a, const mace::BFloat16 &value) {
  const float rhs = static_cast<float>(value);
  return a / rhs;
}

inline void operator+=(float &a, const mace::BFloat16 &value) {  // NOLINT
  a = a + static_cast<float>(value);
}

inline void operator-=(float &a, const mace::BFloat16 &value) {  // NOLINT
  a = a - static_cast<float>(value);
}

inline void operator*=(float &a, const mace::BFloat16 &value) {  // NOLINT
  a = a * static_cast<float>(value);
}

inline void operator/=(float &a, const mace::BFloat16 &value) {  // NOLINT
  a = a / static_cast<float>(value);
}
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_CORE_BFLOAT16_H_
...@@ -28,9 +28,9 @@ enum ImplType { ...@@ -28,9 +28,9 @@ enum ImplType {
}; };
#ifdef MACE_ENABLE_NEON #ifdef MACE_ENABLE_NEON
#define MACE_CPU_IMPL_TYPE NEON const ImplType kCpuImplType = ImplType::NEON;
#else #else
#define MACE_CPU_IMPL_TYPE REF const ImplType kCpuImplType = ImplType::REF;
#endif #endif
struct DelegatorParam { struct DelegatorParam {
......
...@@ -15,25 +15,86 @@ ...@@ -15,25 +15,86 @@
#include "mace/core/registry/op_delegator_registry.h" #include "mace/core/registry/op_delegator_registry.h"
#include <utility> #include <utility>
#include <sstream>
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
MaceStatus OpDelegatorRegistry::Register(const std::string &key, namespace {
const char *kDefaultTag = "general";
}
DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type,
DeviceType in_device, ImplType in_impl_type,
const char *in_tag)
: delegator_name(in_name), data_type(in_data_type),
device(in_device), impl_type(in_impl_type), tag(in_tag) {}
DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type,
DeviceType in_device, ImplType in_impl_type)
: DelegatorInfo(in_name, in_data_type,
in_device, in_impl_type, kDefaultTag) {}
std::string DelegatorInfo::ToString() const {
std::stringstream ss;
ss << delegator_name << "_" << data_type << "_"
<< device << "_" << impl_type << "_" << tag;
return ss.str();
}
bool DelegatorInfo::operator==(const DelegatorInfo &info) const {
return device == info.device && impl_type == info.impl_type &&
data_type == info.data_type &&
delegator_name == info.delegator_name && tag == info.tag;
}
MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
DelegatorCreator creator) { DelegatorCreator creator) {
MACE_CHECK(registry_.count(key) == 0, "Register an exist key."); MACE_CHECK(registry_.count(key) == 0,
"Register an exist key: ", key.ToString());
registry_[key] = std::move(creator); registry_[key] = std::move(creator);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
DelegatorCreator OpDelegatorRegistry::GetCreator(const std::string &key) const { DelegatorCreator OpDelegatorRegistry::GetCreator(
MACE_CHECK(registry_.count(key) > 0, key, " not exist."); const DelegatorInfo &key) const {
return registry_.at(key); if (registry_.count(key) > 0) {
} return registry_.at(key);
}
DelegatorInfo info = key;
if (key.impl_type == ImplType::NEON) {
if (info.tag != kDefaultTag) {
info.tag = kDefaultTag;
if (registry_.count(info) > 0) {
VLOG(1) << key.ToString()
<< " delegator fall back to " << info.ToString();
return registry_.at(info);
}
info.tag = key.tag;
}
template<> const char *DType<float>::name_ = "float"; info.impl_type = ImplType::REF;
template<> const char *DType<int>::name_ = "int"; if (registry_.count(info) > 0) {
template<> const char *DType<uint8_t>::name_ = "uint8_t"; VLOG(1) << key.ToString()
<< " delegator fall back to " << info.ToString();
return registry_.at(info);
}
}
// for REF
if (info.tag != kDefaultTag) {
info.tag = kDefaultTag;
if (registry_.count(info) > 0) {
VLOG(1) << key.ToString()
<< " delegator fall back to " << info.ToString();
return registry_.at(info);
}
}
LOG(FATAL) << "Delegator not exist: " << key.ToString();
return DelegatorCreator();
}
} // namespace mace } // namespace mace
...@@ -21,7 +21,9 @@ ...@@ -21,7 +21,9 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "mace/core/bfloat16.h"
#include "mace/core/ops/op_delegator.h" #include "mace/core/ops/op_delegator.h"
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,40 +31,50 @@ namespace mace { ...@@ -29,40 +31,50 @@ namespace mace {
typedef std::function<std::unique_ptr<OpDelegator>(const DelegatorParam &)> typedef std::function<std::unique_ptr<OpDelegator>(const DelegatorParam &)>
DelegatorCreator; DelegatorCreator;
struct DelegatorInfo {
explicit DelegatorInfo(const char *delegator_name,
DataType data_type,
DeviceType device,
ImplType impl_type,
const char *tag);
explicit DelegatorInfo(const char *delegator_name,
DataType data_type,
DeviceType device,
ImplType impl_type);
std::string ToString() const;
bool operator==(const DelegatorInfo &info) const;
std::string delegator_name;
DataType data_type;
DeviceType device;
ImplType impl_type;
std::string tag;
};
class OpDelegatorRegistry { class OpDelegatorRegistry {
public: public:
OpDelegatorRegistry() = default; OpDelegatorRegistry() = default;
~OpDelegatorRegistry() = default; ~OpDelegatorRegistry() = default;
MaceStatus Register(const std::string &key, DelegatorCreator creator); MaceStatus Register(const DelegatorInfo &key, DelegatorCreator creator);
DelegatorCreator GetCreator(const std::string &key) const; DelegatorCreator GetCreator(const DelegatorInfo &key) const;
private: private:
std::unordered_map<std::string, DelegatorCreator> registry_; struct HashName {
size_t operator()(const DelegatorInfo &delegator_info) const {
return std::hash<std::string>()(delegator_info.ToString());
}
};
std::unordered_map<DelegatorInfo, DelegatorCreator, HashName> registry_;
}; };
template<typename T>
struct DType { static const char *name_; };
template<> const char *DType<float>::name_;
template<> const char *DType<int>::name_;
template<> const char *DType<uint8_t>::name_;
} // namespace mace } // namespace mace
#ifndef MACE_DELEGATOR_KEY_TMP
#define MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) \
(std::string(#delegator_name"_"#device"_"#impl"_") + DType<DT>::name_)
#endif // MACE_DELEGATOR_KEY_TMP
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_DELEGATOR_KEY_EX_TMP #ifndef MACE_DELEGATOR_KEY_EX_TMP
#define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \ #define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \
(std::string(#delegator_name"_"#device"_"#impl"_"#tag"_") + DType<DT>::name_) DelegatorInfo(#delegator_name, DataTypeToEnum<DT>::value, device, impl, #tag)
#endif // MACE_DELEGATOR_KEY_EX_TMP #endif // MACE_DELEGATOR_KEY_EX_TMP
#ifndef MACE_DELEGATOR_KEY_EX #ifndef MACE_DELEGATOR_KEY_EX
...@@ -70,21 +82,32 @@ template<> const char *DType<uint8_t>::name_; ...@@ -70,21 +82,32 @@ template<> const char *DType<uint8_t>::name_;
MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag)
#endif // MACE_DELEGATOR_KEY_EX #endif // MACE_DELEGATOR_KEY_EX
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
DelegatorInfo(#delegator_name, DataTypeToEnum<DT>::value, device, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_REGISTER_DELEGATOR #ifndef MACE_REGISTER_DELEGATOR
#define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \ #define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \
void Register##class_name##Delegator(OpDelegatorRegistry *registry) { \ registry->Register(key, OpDelegator::DefaultCreator<class_name, param_name>)
registry->Register( \
key, OpDelegator::DefaultCreator<class_name, param_name>); \
}
#endif // MACE_REGISTER_DELEGATOR #endif // MACE_REGISTER_DELEGATOR
#ifndef MACE_REGISTER_BF16_DELEGATOR
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key) \
MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key)
#else
#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_DELEGATOR
#ifndef MACE_DEFINE_DELEGATOR_CREATOR #ifndef MACE_DEFINE_DELEGATOR_CREATOR
#define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \ #define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \
static std::unique_ptr<class_name> Create( \ static std::unique_ptr<class_name> Create( \
Workspace *workspace, const std::string &tag, \ Workspace *workspace, const DelegatorInfo &key, \
const DelegatorParam &param) { \ const DelegatorParam &param) { \
DelegatorCreator creator = \ DelegatorCreator creator = \
workspace->GetDelegatorRegistry()->GetCreator(tag); \ workspace->GetDelegatorRegistry()->GetCreator(key); \
std::unique_ptr<OpDelegator> delegator = creator(param); \ std::unique_ptr<OpDelegator> delegator = creator(param); \
return std::unique_ptr<class_name>( \ return std::unique_ptr<class_name>( \
static_cast<class_name *>(delegator.release())); \ static_cast<class_name *>(delegator.release())); \
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "mace/core/bfloat16.h"
#include "mace/core/types.h"
#include "mace/core/ops/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/ops/op_condition_builder.h" #include "mace/core/ops/op_condition_builder.h"
#include "mace/core/ops/op_condition_context.h" #include "mace/core/ops/op_condition_context.h"
...@@ -80,6 +82,26 @@ class OpRegistry { ...@@ -80,6 +82,26 @@ class OpRegistry {
DataTypeToEnum<dt>::value, \ DataTypeToEnum<dt>::value, \
OpRegistry::DefaultCreator<class_name>) OpRegistry::DefaultCreator<class_name>)
#ifndef MACE_REGISTER_BF16_OP
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device) \
MACE_REGISTER_OP(op_registry, op_type, class_name, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_OP
#ifndef MACE_REGISTER_BF16_OP_BY_CLASS
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, \
class_name, device) \
MACE_REGISTER_OP_BY_CLASS(op_registry, op_type, \
class_name, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, class_name, device)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_OP_BY_CLASS
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \ #define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \ op_registry->Register( \
......
...@@ -53,6 +53,13 @@ namespace mace { ...@@ -53,6 +53,13 @@ namespace mace {
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) #define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
#endif #endif
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \
MACE_CASE(BFloat16, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS)
#endif // MACE_ENABLE_BFLOAT16
#if MACE_ENABLE_OPENCL #if MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \ #define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS)) MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
...@@ -67,6 +74,7 @@ namespace mace { ...@@ -67,6 +74,7 @@ namespace mace {
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \ MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \ MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \ MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \ MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
case DT_INVALID: \ case DT_INVALID: \
INVALID_STATEMENTS; \ INVALID_STATEMENTS; \
...@@ -419,7 +427,8 @@ class Tensor { ...@@ -419,7 +427,8 @@ class Tensor {
if (i != 0 && i % shape_.back() == 0) { if (i != 0 && i % shape_.back() == 0) {
os << "\n"; os << "\n";
} }
MACE_RUN_WITH_TYPE_ENUM(dtype_, (os << (this->data<T>()[i]) << ", ")); MACE_RUN_WITH_TYPE_ENUM(
dtype_, (os << this->data<T>()[i] << ", "));
} }
LOG(INFO) << os.str(); LOG(INFO) << os.str();
} }
......
...@@ -25,6 +25,7 @@ bool DataTypeCanUseMemcpy(DataType dt) { ...@@ -25,6 +25,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
case DT_FLOAT: case DT_FLOAT:
case DT_UINT8: case DT_UINT8:
case DT_INT32: case DT_INT32:
case DT_BFLOAT16:
return true; return true;
default: default:
return false; return false;
...@@ -36,7 +37,8 @@ std::string DataTypeToString(const DataType dt) { ...@@ -36,7 +37,8 @@ std::string DataTypeToString(const DataType dt) {
{DT_FLOAT, "DT_FLOAT"}, {DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"}, {DT_HALF, "DT_HALF"},
{DT_UINT8, "DT_UINT8"}, {DT_UINT8, "DT_UINT8"},
{DT_INT32, "DT_INT32"}}; {DT_INT32, "DT_INT32"},
{DT_BFLOAT16, "DT_BFLOAT16"}};
MACE_CHECK(dt != DT_INVALID, "Not support Invalid data type"); MACE_CHECK(dt != DT_INVALID, "Not support Invalid data type");
return dtype_string_map[dt]; return dtype_string_map[dt];
} }
...@@ -50,6 +52,10 @@ size_t GetEnumTypeSize(const DataType dt) { ...@@ -50,6 +52,10 @@ size_t GetEnumTypeSize(const DataType dt) {
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
case DT_FLOAT16: case DT_FLOAT16:
return sizeof(float16_t); return sizeof(float16_t);
#endif
#ifdef MACE_ENABLE_BFLOAT16
case DT_BFLOAT16:
return sizeof(BFloat16);
#endif #endif
case DT_UINT8: case DT_UINT8:
return sizeof(uint8_t); return sizeof(uint8_t);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include "mace/core/bfloat16.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "include/half.hpp" #include "include/half.hpp"
...@@ -57,6 +58,9 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF); ...@@ -57,6 +58,9 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16); MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16);
#endif #endif
#ifdef MACE_ENABLE_BFLOAT16
MACE_MAPPING_DATA_TYPE_AND_ENUM(BFloat16, DT_BFLOAT16);
#endif
MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT); MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8); MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32); MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
......
...@@ -12,6 +12,7 @@ load( ...@@ -12,6 +12,7 @@ load(
"if_android", "if_android",
"if_android_armv7", "if_android_armv7",
"if_apu_enabled", "if_apu_enabled",
"if_bfloat16_enabled",
"if_darwin", "if_darwin",
"if_hexagon_enabled", "if_hexagon_enabled",
"if_hta_enabled", "if_hta_enabled",
...@@ -42,6 +43,8 @@ cc_library( ...@@ -42,6 +43,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([ ]) + if_hta_enabled([
...@@ -52,8 +55,8 @@ cc_library( ...@@ -52,8 +55,8 @@ cc_library(
"-DMACE_ENABLE_RPCMEM", "-DMACE_ENABLE_RPCMEM",
]), ]),
deps = [ deps = [
"//mace/ops",
"//include:public_headers", "//include:public_headers",
"//mace/ops",
], ],
alwayslink = 1, alwayslink = 1,
) )
......
...@@ -16,9 +16,11 @@ ...@@ -16,9 +16,11 @@
#include <numeric> #include <numeric>
#include <memory> #include <memory>
#include "mace/core/bfloat16.h"
#include "mace/core/device_context.h" #include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h" #include "mace/core/memory_optimizer.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/net_def_adapter.h"
#include "mace/core/registry/ops_registry.h" #include "mace/core/registry/ops_registry.h"
#include "mace/core/registry/op_delegator_registry.h" #include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/transpose.h" #include "mace/ops/common/transpose.h"
...@@ -29,7 +31,6 @@ ...@@ -29,7 +31,6 @@
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/port/env.h" #include "mace/port/env.h"
#include "mace/port/file_system.h" #include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/gpu_device.h"
...@@ -460,6 +461,7 @@ class MaceEngine::Impl { ...@@ -460,6 +461,7 @@ class MaceEngine::Impl {
std::unique_ptr<Workspace> ws_; std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
bool is_quantized_model_; bool is_quantized_model_;
DataType net_data_type_;
std::map<std::string, mace::InputOutputInfo> input_info_map_; std::map<std::string, mace::InputOutputInfo> input_info_map_;
std::map<std::string, mace::InputOutputInfo> output_info_map_; std::map<std::string, mace::InputOutputInfo> output_info_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_; std::unique_ptr<utils::ThreadPool> thread_pool_;
...@@ -565,6 +567,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -565,6 +567,7 @@ MaceStatus MaceEngine::Impl::Init(
#endif #endif
// mark quantized model flag // mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def); is_quantized_model_ = IsQuantizedModel(*net_def);
net_data_type_ = net_def->data_type();
// Get input and output information. // Get input and output information.
for (auto &input_info : net_def->input_info()) { for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info; input_info_map_[input_info.name()] = input_info;
...@@ -589,8 +592,8 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -589,8 +592,8 @@ MaceStatus MaceEngine::Impl::Init(
} }
input_tensor->Resize(shape); input_tensor->Resize(shape);
// Set to the default data format // Set to the default data format
input_tensor->set_data_format(static_cast<DataFormat>( input_tensor->set_data_format(
input_info_map_[input_name].data_format())); static_cast<DataFormat>(input_info_map_[input_name].data_format()));
} }
for (auto output_name : output_nodes) { for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) { if (output_info_map_.find(output_name) == output_info_map_.end()) {
...@@ -691,7 +694,8 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -691,7 +694,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile( MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile(
model_data_file.c_str(), &model_data_)); model_data_file.c_str(), &model_data_));
MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, MACE_RETURN_IF_ERROR(Init(
net_def, input_nodes, output_nodes,
reinterpret_cast<const unsigned char *>(model_data_->data()))); reinterpret_cast<const unsigned char *>(model_data_->data())));
if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON || if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON ||
...@@ -753,11 +757,24 @@ MaceStatus MaceEngine::Impl::TransposeInput( ...@@ -753,11 +757,24 @@ MaceStatus MaceEngine::Impl::TransposeInput(
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
if (input_dt == DataType::DT_FLOAT) { if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>(); auto input_data = input_tensor->mutable_data<float>();
return ops::Transpose(thread_pool_.get(), if (net_data_type_ == DT_FLOAT || net_data_type_ == DataType::DT_HALF) {
input.second.data<float>().get(), return ops::Transpose(thread_pool_.get(),
input.second.shape(), input.second.data<float>().get(),
dst_dims, input.second.shape(),
input_data); dst_dims,
input_data);
#ifdef MACE_ENABLE_BFLOAT16
} else if (net_data_type_ == DT_BFLOAT16) {
auto *input_data = input_tensor->mutable_data<BFloat16>();
return ops::Transpose(thread_pool_.get(),
input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data);
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "Invalid net data type: " << net_data_type_;
}
} else if (input_dt == DataType::DT_INT32) { } else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>(); auto input_data = input_tensor->mutable_data<int>();
return ops::Transpose(thread_pool_.get(), return ops::Transpose(thread_pool_.get(),
...@@ -776,9 +793,22 @@ MaceStatus MaceEngine::Impl::TransposeInput( ...@@ -776,9 +793,22 @@ MaceStatus MaceEngine::Impl::TransposeInput(
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
if (input_dt == DataType::DT_FLOAT) { if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>(); if (net_data_type_ == DataType::DT_FLOAT ||
memcpy(input_data, input.second.data().get(), net_data_type_ == DataType::DT_HALF) {
input_tensor->size() * sizeof(float)); auto input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
#ifdef MACE_ENABLE_BFLOAT16
} else if (net_data_type_ == DataType::DT_BFLOAT16) {
auto input_data = input_tensor->mutable_data<BFloat16>();
const float *data = input.second.data().get();
for (index_t i = 0; i < input_tensor->size(); ++i) {
input_data[i] = data[i];
}
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "Invalid net data type: " << net_data_type_;
}
} else if (input_dt == DataType::DT_INT32) { } else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>(); auto input_data = input_tensor->mutable_data<int>();
memcpy(input_data, input.second.data().get(), memcpy(input_data, input.second.data().get(),
...@@ -842,6 +872,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput( ...@@ -842,6 +872,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
output_tensor->shape(), output_tensor->shape(),
dst_dims, dst_dims,
output->second.data<int>().get()); output->second.data<int>().get());
#ifdef MACE_ENABLE_BFLOAT16
} else if (output_dt == DataType::DT_BFLOAT16) {
auto output_data = output_tensor->data<BFloat16>();
return ops::Transpose(thread_pool_.get(),
output_data,
output_tensor->shape(),
dst_dims,
output->second.data<float>().get());
#endif // MACE_ENABLE_BFLOAT16
} else { } else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt; LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
return MaceStatus::MACE_INVALID_ARGS; return MaceStatus::MACE_INVALID_ARGS;
...@@ -864,6 +903,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput( ...@@ -864,6 +903,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
std::memcpy(output->second.data<int>().get(), std::memcpy(output->second.data<int>().get(),
output_tensor->data<int>(), output_tensor->data<int>(),
output_size * sizeof(int)); output_size * sizeof(int));
#ifdef MACE_ENABLE_BFLOAT16
} else if (output_dt == DataType::DT_BFLOAT16) {
const auto *output_data = output_tensor->data<BFloat16>();
float *data = output->second.data<float>().get();
for (index_t i = 0; i < output_tensor->size(); ++i) {
data[i] = output_data[i];
}
#endif // MACE_ENABLE_BFLOAT16
} else { } else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt; LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
} }
......
...@@ -109,6 +109,12 @@ def if_quantize_enabled(a): ...@@ -109,6 +109,12 @@ def if_quantize_enabled(a):
"//conditions:default": [], "//conditions:default": [],
}) })
def if_bfloat16_enabled(a):
return select({
"//mace:bfloat16_enabled": a,
"//conditions:default": [],
})
def if_rpcmem_enabled(a): def if_rpcmem_enabled(a):
return select({ return select({
"//mace:rpcmem_enabled": a, "//mace:rpcmem_enabled": a,
......
...@@ -10,6 +10,7 @@ load( ...@@ -10,6 +10,7 @@ load(
"//mace:mace.bzl", "//mace:mace.bzl",
"if_android", "if_android",
"if_android_armv7", "if_android_armv7",
"if_bfloat16_enabled",
"if_hexagon_enabled", "if_hexagon_enabled",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
...@@ -46,6 +47,8 @@ cc_library( ...@@ -46,6 +47,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]), ]),
...@@ -85,6 +88,8 @@ cc_library( ...@@ -85,6 +88,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]), ]),
...@@ -138,6 +143,8 @@ cc_library( ...@@ -138,6 +143,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]), ]),
...@@ -223,6 +230,8 @@ cc_library( ...@@ -223,6 +230,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]), ]),
...@@ -263,6 +272,8 @@ cc_library( ...@@ -263,6 +272,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([ ]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON", "-DMACE_ENABLE_HEXAGON",
]), ]),
......
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
#include "mace/core/ops/operator.h" #include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h" #include "mace/core/registry/ops_registry.h"
#include "mace/ops/delegator/activation.h" #include "mace/ops/delegator/activation.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -43,11 +42,12 @@ class ActivationOp<DeviceType::CPU, T> : public Operation { ...@@ -43,11 +42,12 @@ class ActivationOp<DeviceType::CPU, T> : public Operation {
Operation::GetOptionalArg<std::string>("activation", "NOOP"))), Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
activation_delegator_(delegator::Activation::Create( activation_delegator_(delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, T, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType),
delegator::ActivationParam( delegator::ActivationParam(
activation_type_, activation_type_,
Operation::GetOptionalArg<T>("max_limit", 0), Operation::GetOptionalArg<float>("max_limit", 0.f),
Operation::GetOptionalArg<T>("leakyrelu_coefficient", 0)))) {} Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.f)))) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -119,6 +119,8 @@ class ActivationOp<DeviceType::GPU, float> : public Operation { ...@@ -119,6 +119,8 @@ class ActivationOp<DeviceType::GPU, float> : public Operation {
void RegisterActivation(OpRegistry *op_registry) { void RegisterActivation(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Activation",
ActivationOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp); MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
......
...@@ -33,8 +33,8 @@ namespace ops { ...@@ -33,8 +33,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class AddNOp; class AddNOp;
template<> template<class T>
class AddNOp<DeviceType::CPU, float> : public Operation { class AddNOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
: Operation(context) {} : Operation(context) {}
...@@ -46,12 +46,12 @@ class AddNOp<DeviceType::CPU, float> : public Operation { ...@@ -46,12 +46,12 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
const index_t size = output->size(); const index_t size = output->size();
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
auto output_data = output->mutable_data<float>(); auto output_data = output->mutable_data<T>();
memset(output_data, 0, size * sizeof(float)); memset(output_data, 0, size * sizeof(T));
for (auto &input : inputs_) { for (auto &input : inputs_) {
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
auto input_data = input->data<float>(); auto input_data = input->template data<T>();
for (index_t j = 0; j < size; ++j) { for (index_t j = 0; j < size; ++j) {
output_data[j] += input_data[j]; output_data[j] += input_data[j];
...@@ -95,6 +95,7 @@ class AddNOp<DeviceType::GPU, float> : public Operation { ...@@ -95,6 +95,7 @@ class AddNOp<DeviceType::GPU, float> : public Operation {
void RegisterAddN(OpRegistry *op_registry) { void RegisterAddN(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "AddN", AddNOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp); MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
......
...@@ -74,11 +74,12 @@ class ArgMaxOp : public Operation { ...@@ -74,11 +74,12 @@ class ArgMaxOp : public Operation {
if (argmin_) { if (argmin_) {
for (index_t i = 0; i < outer_size; ++i) { for (index_t i = 0; i < outer_size; ++i) {
int idx = 0; int idx = 0;
T min_value = std::numeric_limits<T>::max(); float min_value = std::numeric_limits<float>::max();
const T *input_ptr = input_data + i * inner_size; const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) { for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] < min_value) { float input_value = input_ptr[j];
min_value = input_ptr[j]; if (input_value < min_value) {
min_value = input_value;
idx = j; idx = j;
} }
} }
...@@ -87,11 +88,12 @@ class ArgMaxOp : public Operation { ...@@ -87,11 +88,12 @@ class ArgMaxOp : public Operation {
} else { } else {
for (index_t i = 0; i < outer_size; ++i) { for (index_t i = 0; i < outer_size; ++i) {
int idx = 0; int idx = 0;
T max_value = std::numeric_limits<T>::lowest(); float max_value = std::numeric_limits<float>::lowest();
const T *input_ptr = input_data + i * inner_size; const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) { for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] > max_value) { float input_value = input_ptr[j];
max_value = input_ptr[j]; if (input_value > max_value) {
max_value = input_value;
idx = j; idx = j;
} }
} }
...@@ -111,8 +113,8 @@ class ArgMaxOp : public Operation { ...@@ -111,8 +113,8 @@ class ArgMaxOp : public Operation {
void RegisterArgMax(OpRegistry *op_registry) { void RegisterArgMax(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -185,8 +185,11 @@ void Activation::DoActivation(const OpContext *context, ...@@ -185,8 +185,11 @@ void Activation::DoActivation(const OpContext *context,
} }
} }
MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam, void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(Activation, CPU, float, NEON)) MACE_REGISTER_DELEGATOR(
registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -129,8 +129,11 @@ void BiasAdd::AddBias(const OpContext *context, ...@@ -129,8 +129,11 @@ void BiasAdd::AddBias(const OpContext *context,
} }
} }
MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam, void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, NEON)) MACE_REGISTER_DELEGATOR(
registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -113,8 +113,12 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, ...@@ -113,8 +113,12 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
output); output);
} }
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x1, delegator::Conv2dParam, void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1)) MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x1));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -861,18 +861,27 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, ...@@ -861,18 +861,27 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x7S1, delegator::Conv2dParam, void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x7S1)) MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x1S1, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x1S1)) float, ImplType::NEON, K1x7S1));
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x15S1, delegator::Conv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, registry, Conv2dK7x1S1, delegator::Conv2dParam,
NEON, K1x15S1)) MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
MACE_REGISTER_DELEGATOR(registry, Conv2dK15x1S1, delegator::Conv2dParam, float, ImplType::NEON, K7x1S1));
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K15x1S1)) MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x15S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K15x1S1));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -737,10 +737,16 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -737,10 +737,16 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S1, delegator::Conv2dParam, void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1)) MACE_REGISTER_DELEGATOR(
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S2, delegator::Conv2dParam, registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S2)) MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -801,9 +801,12 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context, ...@@ -801,9 +801,12 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
}, 0, batch, 1, 0, out_channels, 1); }, 0, batch, 1, 0, out_channels, 1);
} }
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3Winograd, delegator::Conv2dParam, void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX( MACE_REGISTER_DELEGATOR(
Conv2d, CPU, float, NEON, K3x3Winograd)) registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3Winograd));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -258,8 +258,12 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -258,8 +258,12 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Conv2dK5x5S1, delegator::Conv2dParam, void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K5x5S1)) MACE_REGISTER_DELEGATOR(
registry, Conv2dK5x5S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K5x5S1));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -722,12 +722,20 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -722,12 +722,20 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S1, delegator::Conv2dParam, void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S1)) MACE_REGISTER_DELEGATOR(
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S2, delegator::Conv2dParam, registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S2)) MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S3, delegator::Conv2dParam, float, ImplType::NEON, K7x7S1));
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S3)) MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S2));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S3));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -252,9 +252,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -252,9 +252,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR( void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
registry, Conv2dGeneral, delegator::Conv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, General)) registry, Conv2dGeneral, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -335,12 +335,16 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, ...@@ -335,12 +335,16 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S1, delegator::Deconv2dParam, void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, MACE_REGISTER_DELEGATOR(
NEON, K2x2S1)) registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S2, delegator::Deconv2dParam, MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, float, ImplType::NEON, K2x2S1));
NEON, K2x2S2)) MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -464,12 +464,16 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -464,12 +464,16 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S1, delegator::Deconv2dParam, void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, MACE_REGISTER_DELEGATOR(
NEON, K3x3S1)) registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S2, delegator::Deconv2dParam, MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, float, ImplType::NEON, K3x3S1));
NEON, K3x3S2)) MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -574,12 +574,16 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -574,12 +574,16 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S1, delegator::Deconv2dParam, void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, MACE_REGISTER_DELEGATOR(
NEON, K4x4S1)) registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S2, delegator::Deconv2dParam, MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, float, ImplType::NEON, K4x4S1));
NEON, K4x4S2)) MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -124,9 +124,11 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -124,9 +124,11 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Deconv2dGeneral, delegator::Deconv2dParam, void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, MACE_REGISTER_DELEGATOR(
NEON, General)) registry, Deconv2dGeneral, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -512,12 +512,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -512,12 +512,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR( void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S1)) registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_REGISTER_DELEGATOR( MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, float, ImplType::NEON, K3x3S1));
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S2)) MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -776,19 +776,27 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -776,19 +776,27 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR( void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S1)) registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_REGISTER_DELEGATOR( MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam, float, ImplType::NEON, K3x3S1));
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S2)) MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_REGISTER_DELEGATOR( MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, float, ImplType::NEON, K3x3S2));
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S1)) }
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S2)) MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -959,19 +959,27 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -959,19 +959,27 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR( void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S1)) registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_REGISTER_DELEGATOR( MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam, float, ImplType::NEON, K4x4S1));
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S2)) MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_REGISTER_DELEGATOR( MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, float, ImplType::NEON, K4x4S2));
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S1)) }
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S2)) MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -207,13 +207,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, ...@@ -207,13 +207,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR( void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, General)) registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON));
}
MACE_REGISTER_DELEGATOR( void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, General)) registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -1224,8 +1224,11 @@ MaceStatus Gemm::Compute(const OpContext *context, ...@@ -1224,8 +1224,11 @@ MaceStatus Gemm::Compute(const OpContext *context,
output); output);
} }
MACE_REGISTER_DELEGATOR(registry, Gemm, delegator::GemmParam, void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON)) MACE_REGISTER_DELEGATOR(
registry, Gemm, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -378,9 +378,11 @@ MaceStatus Gemv::Compute(const OpContext *context, ...@@ -378,9 +378,11 @@ MaceStatus Gemv::Compute(const OpContext *context,
#undef vaddvq_f32 #undef vaddvq_f32
#endif #endif
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(registry, Gemv, DelegatorParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY(Gemv, CPU, float, NEON)) registry, Gemv, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32 } // namespace fp32
} // namespace arm } // namespace arm
......
...@@ -162,8 +162,11 @@ MaceStatus Eltwise::Compute(const OpContext *context, ...@@ -162,8 +162,11 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam, void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, NEON)) MACE_REGISTER_DELEGATOR(
registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::NEON));
}
} // namespace q8 } // namespace q8
} // namespace arm } // namespace arm
......
...@@ -176,18 +176,14 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context, ...@@ -176,18 +176,14 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
template void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
class Gemv<uint8_t>; MACE_REGISTER_DELEGATOR(
template registry, Gemv<uint8_t>, DelegatorParam,
class Gemv<int32_t>; MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON));
MACE_REGISTER_DELEGATOR(
typedef Gemv<uint8_t> GemvUint8; registry, Gemv<int32_t>, DelegatorParam,
MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON));
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON)) }
typedef Gemv<int32_t> GemvInt32;
MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON))
} // namespace q8 } // namespace q8
} // namespace arm } // namespace arm
......
...@@ -33,8 +33,8 @@ namespace ops { ...@@ -33,8 +33,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class BatchNormOp; class BatchNormOp;
template<> template<class T>
class BatchNormOp<DeviceType::CPU, float> : public Operation { class BatchNormOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit BatchNormOp(OpConstructContext *context) explicit BatchNormOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -43,7 +43,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -43,7 +43,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
activation_delegator_( activation_delegator_(
delegator::Activation::Create( delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam( delegator::ActivationParam(
ops::StringToActivationType( ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
...@@ -91,13 +92,13 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -91,13 +92,13 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard offset_mapper(offset); Tensor::MappingGuard offset_mapper(offset);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
const float *input_ptr = input->data<float>(); const T *input_ptr = input->data<T>();
const float *scale_ptr = scale->data<float>(); const T *scale_ptr = scale->data<T>();
const float *offset_ptr = offset->data<float>(); const T *offset_ptr = offset->data<T>();
float *output_ptr = output->mutable_data<float>(); T *output_ptr = output->mutable_data<T>();
std::vector<float> new_scale; std::vector<T> new_scale;
std::vector<float> new_offset; std::vector<T> new_offset;
if (not_folded) { if (not_folded) {
const Tensor *mean = this->Input(MEAN); const Tensor *mean = this->Input(MEAN);
const Tensor *var = this->Input(VAR); const Tensor *var = this->Input(VAR);
...@@ -109,8 +110,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -109,8 +110,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
new_offset.resize(channels); new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean); Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var); Tensor::MappingGuard var_mapper(var);
const float *mean_ptr = mean->data<float>(); const T *mean_ptr = mean->data<T>();
const float *var_ptr = var->data<float>(); const T *var_ptr = var->data<T>();
thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start, thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start,
index_t end, index_t end,
...@@ -122,9 +123,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -122,9 +123,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, channels, 1); }, 0, channels, 1);
} }
const float *scale_data = not_folded ? new_scale.data() : scale_ptr; const T *scale_data = not_folded ? new_scale.data() : scale_ptr;
const float const T *offset_data = not_folded ? new_offset.data() : offset_ptr;
*offset_data = not_folded ? new_offset.data() : offset_ptr;
index_t channel_size = height * width; index_t channel_size = height * width;
index_t batch_size = channels * channel_size; index_t batch_size = channels * channel_size;
...@@ -232,6 +232,7 @@ class BatchNormOp<DeviceType::GPU, float> : public Operation { ...@@ -232,6 +232,7 @@ class BatchNormOp<DeviceType::GPU, float> : public Operation {
void RegisterBatchNorm(OpRegistry *op_registry) { void RegisterBatchNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp); MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
} }
......
...@@ -84,8 +84,8 @@ class BatchToSpaceOpBase : public Operation { ...@@ -84,8 +84,8 @@ class BatchToSpaceOpBase : public Operation {
template<DeviceType D, class T> template<DeviceType D, class T>
class BatchToSpaceNDOp; class BatchToSpaceNDOp;
template<> template<class T>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, T> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {} : BatchToSpaceOpBase(context) {}
...@@ -108,8 +108,8 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { ...@@ -108,8 +108,8 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
int block_shape_h = block_shape_[0]; int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1]; int block_shape_w = block_shape_[1];
const float *input_data = batch_tensor->data<float>(); const T *input_data = batch_tensor->data<T>();
float *output_data = space_tensor->mutable_data<float>(); T *output_data = space_tensor->mutable_data<T>();
index_t in_batches = batch_tensor->dim(0); index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(2); index_t in_height = batch_tensor->dim(2);
...@@ -120,10 +120,11 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { ...@@ -120,10 +120,11 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
index_t out_height = space_tensor->dim(2); index_t out_height = space_tensor->dim(2);
index_t out_width = space_tensor->dim(3); index_t out_width = space_tensor->dim(3);
// 32k/sizeof(float)/out_width/block_shape // 32k/sizeof(T)/out_width/block_shape
index_t index_t block_h_size = std::max(
block_h_size = static_cast<index_t>(1),
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width); static_cast<index_t>(
(32 / sizeof(T)) * 1024 / block_shape_w / out_width));
// make channel outter loop so we can make best use of cache // make channel outter loop so we can make best use of cache
for (index_t c = 0; c < channels; ++c) { for (index_t c = 0; c < channels; ++c) {
...@@ -153,9 +154,9 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { ...@@ -153,9 +154,9 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
(out_width + pad_left - tile_w (out_width + pad_left - tile_w
+ block_shape_w - 1) + block_shape_w - 1)
/ block_shape_w); / block_shape_w);
const float *input_base = const T *input_base =
input_data + (in_b * channels + c) * in_height * in_width; input_data + (in_b * channels + c) * in_height * in_width;
float *output_base = T *output_base =
output_data + (b * channels + c) * out_height * out_width; output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top; index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
...@@ -290,6 +291,9 @@ void RegisterBatchToSpaceND(OpRegistry *op_registry) { ...@@ -290,6 +291,9 @@ void RegisterBatchToSpaceND(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float); BatchToSpaceNDOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t); BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
......
...@@ -33,15 +33,15 @@ namespace ops { ...@@ -33,15 +33,15 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class BiasAddOp; class BiasAddOp;
template<> template<class T>
class BiasAddOp<DeviceType::CPU, float> : public Operation { class BiasAddOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)), has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)),
bias_add_delegator_(delegator::BiasAdd::Create( bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -67,9 +67,9 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -67,9 +67,9 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard bias_mapper(bias); Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
const float *input_ptr = input->data<float>(); const T *input_ptr = input->data<T>();
const float *bias_ptr = bias->data<float>(); const T *bias_ptr = bias->data<T>();
float *output_ptr = output->mutable_data<float>(); T *output_ptr = output->mutable_data<T>();
const std::vector<index_t> &shape = input->shape(); const std::vector<index_t> &shape = input->shape();
const index_t channels = *shape.rbegin(); const index_t channels = *shape.rbegin();
...@@ -162,6 +162,7 @@ class BiasAddOp<DeviceType::GPU, float> : public Operation { ...@@ -162,6 +162,7 @@ class BiasAddOp<DeviceType::GPU, float> : public Operation {
void RegisterBiasAdd(OpRegistry *op_registry) { void RegisterBiasAdd(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp); MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
......
...@@ -56,10 +56,8 @@ class CastOp : public Operation { ...@@ -56,10 +56,8 @@ class CastOp : public Operation {
}; };
void RegisterCast(OpRegistry *op_registry) { void RegisterCast(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cast", CastOp, MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, int32_t);
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, int32_t);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "Cast", CastOp, MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float16_t); DeviceType::CPU, float16_t);
......
...@@ -64,7 +64,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { ...@@ -64,7 +64,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const T *in_ptr = input_ptr + b * batch_size const T *in_ptr = input_ptr + b * batch_size
+ (g * channels_per_group + idx) * image_size; + (g * channels_per_group + idx) * image_size;
T *out_ptr = output_ptr + b * batch_size + c * image_size; T *out_ptr = output_ptr + b * batch_size + c * image_size;
memcpy(out_ptr, in_ptr, image_size * sizeof(float)); memcpy(out_ptr, in_ptr, image_size * sizeof(T));
} }
} }
...@@ -102,6 +102,8 @@ class ChannelShuffleOp<DeviceType::GPU, float> : public Operation { ...@@ -102,6 +102,8 @@ class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
void RegisterChannelShuffle(OpRegistry *op_registry) { void RegisterChannelShuffle(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle", MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float); ChannelShuffleOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp); MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#include "mace/ops/common/lstm.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
// Apply the LSTM cell nonlinearity (Kaldi LstmNonlinearityComponent style)
// over one frame of pre-activations.
//
// input_data holds the four gate pre-activations laid out as contiguous
// slabs of cell_dim floats: [i | f | c | o].  prev_data is the previous
// cell state (may be nullptr on the first frame).  params_data holds the
// three peephole weight rows (i, f, o) separated by params_stride.
// Results: output_cell receives the new cell state c_t, output_data the
// cell output m_t.  Work is parallelized across cell_dim by the device
// thread pool.
void LSTMNonlinearKernel(const OpContext *context,
                         const float *input_data,
                         const float *prev_data,
                         const float *scale_data,
                         const float *params_data,
                         bool embed_scales,
                         index_t params_stride,
                         index_t cell_dim,
                         float *output_cell,
                         float *output_data) {
  // Per-gate scales are honored only when both requested and provided.
  const bool use_scales = embed_scales && scale_data != nullptr;
  const float scale_i = use_scales ? scale_data[0] : 1.0f;
  const float scale_f = use_scales ? scale_data[1] : 1.0f;
  const float scale_o = use_scales ? scale_data[2] : 1.0f;

  utils::ThreadPool &thread_pool =
      context->device()->cpu_runtime()->thread_pool();

  thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
    if (prev_data != nullptr) {
      // Recurrent step: the previous cell state feeds the peephole terms.
      for (index_t c = start; c < end; c += step) {
        const float in_i = input_data[c];
        const float in_f = input_data[c + cell_dim];
        const float in_c = input_data[c + 2 * cell_dim];
        const float in_o = input_data[c + 3 * cell_dim];
        const float prev_c = prev_data[c];
        const float peep_i = params_data[c];
        const float peep_f = params_data[c + params_stride];
        const float peep_o = params_data[c + params_stride * 2];
        const float gate_i = ScalarSigmoid(in_i + peep_i * prev_c);
        const float gate_f = ScalarSigmoid(in_f + peep_f * prev_c);
        const float cell =
            gate_f * scale_f * prev_c + gate_i * scale_i * std::tanh(in_c);
        const float gate_o = ScalarSigmoid(in_o + peep_o * cell);
        const float out = gate_o * scale_o * std::tanh(cell);
        output_cell[c] = cell;
        output_data[c] = out;
      }
    } else {
      // First step: no previous cell state, so the forget gate and the
      // i/f peephole rows are unused.
      for (index_t c = start; c < end; c += step) {
        const float in_i = input_data[c];
        const float in_c = input_data[c + 2 * cell_dim];
        const float in_o = input_data[c + 3 * cell_dim];
        const float peep_o = params_data[c + params_stride * 2];
        const float gate_i = ScalarSigmoid(in_i);
        const float cell = gate_i * scale_i * std::tanh(in_c);
        const float gate_o = ScalarSigmoid(in_o + peep_o * cell);
        const float out = gate_o * scale_o * std::tanh(cell);
        output_cell[c] = cell;
        output_data[c] = out;
      }
    }
  }, 0, cell_dim, 1);
}
} // namespace ops
} // namespace mace
...@@ -12,25 +12,77 @@ ...@@ -12,25 +12,77 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#ifndef MACE_OPS_COMMON_LSTM_H_ #ifndef MACE_OPS_COMMON_LSTM_H_
#define MACE_OPS_COMMON_LSTM_H_ #define MACE_OPS_COMMON_LSTM_H_
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/utils/math.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
void LSTMNonlinearKernel(const OpContext *opContext, template <typename T>
const float *input_data, void LSTMNonlinearKernel(const OpContext *context,
const float *prev_data, const T *input_data,
const float *scale_data, const T *prev_data,
const float *params_data, const T *scale_data,
const T *params_data,
bool embed_scales, bool embed_scales,
index_t params_stride, index_t params_stride,
index_t cell_dim, index_t cell_dim,
float *output_cell, T *output_cell,
float *output_data); T *output_data) {
float i_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[0]) : 1.0f;
float f_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[1]) : 1.0f;
float o_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[2]) : 1.0f;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (prev_data == nullptr) {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part);
float c_t = i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
} else {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float f_part = input_data[c + cell_dim];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float c_prev = prev_data[c];
float w_ic = params_data[c];
float w_fc = params_data[c + params_stride];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part + w_ic * c_prev);
float f_t = ScalarSigmoid(f_part + w_fc * c_prev);
float c_t =
f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
}
}, 0, cell_dim, 1);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
...@@ -26,10 +26,10 @@ ...@@ -26,10 +26,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template<typename T> template<typename SrcT, typename DstT>
void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool,
const T *input, const SrcT *input,
T *output, DstT *output,
const index_t height, const index_t height,
const index_t width) { const index_t width) {
index_t image_size = height * width; index_t image_size = height * width;
...@@ -50,11 +50,11 @@ void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, ...@@ -50,11 +50,11 @@ void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool,
} }
template<> template<>
inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool, inline void TransposeNHWCToNCHWC3<float, float>(utils::ThreadPool *thread_pool,
const float *input, const float *input,
float *output, float *output,
const index_t height, const index_t height,
const index_t width) { const index_t width) {
index_t image_size = height * width; index_t image_size = height * width;
thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { thread_pool->Compute1D([=](index_t start, index_t end, index_t step) {
...@@ -91,10 +91,10 @@ inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool, ...@@ -91,10 +91,10 @@ inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool,
}, 0, height, 1); }, 0, height, 1);
} }
template<typename T> template<typename SrcT, typename DstT>
void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool,
const T *input, const SrcT *input,
T *output, DstT *output,
const index_t height, const index_t height,
const index_t width) { const index_t width) {
index_t image_size = height * width; index_t image_size = height * width;
...@@ -115,11 +115,11 @@ void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, ...@@ -115,11 +115,11 @@ void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool,
} }
template<> template<>
inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool, inline void TransposeNCHWToNHWCC2<float, float>(utils::ThreadPool *thread_pool,
const float *input, const float *input,
float *output, float *output,
const index_t height, const index_t height,
const index_t width) { const index_t width) {
index_t image_size = height * width; index_t image_size = height * width;
thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { thread_pool->Compute1D([=](index_t start, index_t end, index_t step) {
...@@ -155,15 +155,15 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool, ...@@ -155,15 +155,15 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
}, 0, height, 1); }, 0, height, 1);
} }
template<typename T> template<typename SrcT, typename DstT>
MaceStatus Transpose(utils::ThreadPool *thread_pool, MaceStatus Transpose(utils::ThreadPool *thread_pool,
const T *input, const SrcT *input,
const std::vector<int64_t> &input_shape, const std::vector<int64_t> &input_shape,
const std::vector<int> &dst_dims, const std::vector<int> &dst_dims,
T *output) { DstT *output) {
MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
(input_shape.size() == 3 && dst_dims.size() == 3) || (input_shape.size() == 3 && dst_dims.size() == 3) ||
(input_shape.size() == 4 && dst_dims.size() == 4), (input_shape.size() == 4 && dst_dims.size() == 4),
"Only support 2D, 3D or 4D transpose"); "Only support 2D, 3D or 4D transpose");
std::vector<index_t> output_shape; std::vector<index_t> output_shape;
...@@ -220,7 +220,6 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool, ...@@ -220,7 +220,6 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
index_t height = input_shape[1]; index_t height = input_shape[1];
index_t width = input_shape[2]; index_t width = input_shape[2];
index_t channel = input_shape[3]; index_t channel = input_shape[3];
size_t channel_raw_size = channel * sizeof(T);
index_t stride_i = height; index_t stride_i = height;
index_t stride_j = width; index_t stride_j = width;
index_t tile_size = std::max(static_cast<index_t>(1), index_t tile_size = std::max(static_cast<index_t>(1),
...@@ -232,9 +231,11 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool, ...@@ -232,9 +231,11 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
index_t end_j = std::min(j + tile_size, width); index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) { for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) { for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
memcpy(output + (tile_j * stride_i + tile_i) * channel, auto output_ptr = output + (tile_j * stride_i + tile_i) * channel;
input + (tile_i * stride_j + tile_j) * channel, auto input_ptr = input + (tile_i * stride_j + tile_j) * channel;
channel_raw_size); for (index_t k = 0; k < channel; ++k) {
output_ptr[k] = input_ptr[k];
}
} }
} }
} }
...@@ -296,14 +297,15 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool, ...@@ -296,14 +297,15 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
} }
}, 0, batch, 1, 0, height, tile_size, 0, width, tile_size); }, 0, batch, 1, 0, height, tile_size, 0, width, tile_size);
} else if (dst_dims == std::vector<int>{1, 0, 2}) { } else if (dst_dims == std::vector<int>{1, 0, 2}) {
size_t width_raw_size = width * sizeof(T);
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (int i = start0; i < end0; i += step0) { for (int i = start0; i < end0; i += step0) {
for (int j = start1; j < end1; j += step1) { for (int j = start1; j < end1; j += step1) {
memcpy(output + (j * batch + i) * width, auto output_ptr = output + (j * batch + i) * width;
input + (i * height + j) * width, auto input_ptr = input + (i * height + j) * width;
width_raw_size); for (index_t k = 0; k < width; ++k) {
output_ptr[k] = input_ptr[k];
}
} }
} }
}, 0, batch, 1, 0, height, 1); }, 0, batch, 1, 0, height, 1);
......
...@@ -225,6 +225,8 @@ class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase { ...@@ -225,6 +225,8 @@ class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
void RegisterConcat(OpRegistry *op_registry) { void RegisterConcat(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
......
...@@ -53,15 +53,16 @@ namespace ops { ...@@ -53,15 +53,16 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class Conv2dOp; class Conv2dOp;
template<> template<class T>
class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase { class Conv2dOp<DeviceType::CPU, T> : public ConvPool2dOpBase {
public: public:
explicit Conv2dOp(OpConstructContext *context) explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context), : ConvPool2dOpBase(context),
activation_delegator_( activation_delegator_(
delegator::Activation::Create( delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam( delegator::ActivationParam(
ops::StringToActivationType( ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
...@@ -71,7 +72,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase { ...@@ -71,7 +72,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
0.0f)))), 0.0f)))),
bias_add_delegator_(delegator::BiasAdd::Create( bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -81,9 +82,8 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase { ...@@ -81,9 +82,8 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
if (conv2d_delegator_ == nullptr) { if (conv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, auto tag = MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, T, kCpuImplType);
MACE_CPU_IMPL_TYPE, General); if (kCpuImplType == NEON) {
if (MACE_CPU_IMPL_TYPE == NEON) {
// the following params are used to decide which conv delegator to use // the following params are used to decide which conv delegator to use
const index_t stride_h = strides_[0]; const index_t stride_h = strides_[0];
const index_t stride_w = strides_[1]; const index_t stride_w = strides_[1];
...@@ -98,63 +98,63 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase { ...@@ -98,63 +98,63 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// We do not support changeable filter for now. // We do not support changeable filter for now.
if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) { && dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K1x1); kCpuImplType, K1x1);
} else if (filter_h == 3 && filter_w == 3 } else if (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
if (input_channels >= 8 && channels >= 8) { if (input_channels >= 8 && channels >= 8) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3Winograd); kCpuImplType, K3x3Winograd);
} else { } else {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S1); kCpuImplType, K3x3S1);
} }
} else if (filter_h == 3 && filter_w == 3 } else if (filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S2); kCpuImplType, K3x3S2);
} else if (filter_h == 5 && filter_w == 5 } else if (filter_h == 5 && filter_w == 5
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K5x5S1); kCpuImplType, K5x5S1);
} else if (filter_h == 7 && filter_w == 7 } else if (filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K7x7S1); kCpuImplType, K7x7S1);
} else if (filter_h == 7 && filter_w == 7 } else if (filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K7x7S2); kCpuImplType, K7x7S2);
} else if (filter_h == 7 && filter_w == 7 } else if (filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1 && stride_h == 3 && stride_w == 3 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K7x7S3); kCpuImplType, K7x7S3);
} else if (filter_h == 1 && filter_w == 7 } else if (filter_h == 1 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K1x7S1); kCpuImplType, K1x7S1);
} else if (filter_h == 7 && filter_w == 1 } else if (filter_h == 7 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K7x1S1); kCpuImplType, K7x1S1);
} else if (filter_h == 1 && filter_w == 15 } else if (filter_h == 1 && filter_w == 15
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K1x15S1); kCpuImplType, K1x15S1);
} else if (filter_h == 15 && filter_w == 1 } else if (filter_h == 15 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) { && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K15x1S1); kCpuImplType, K15x1S1);
} }
} }
delegator::Conv2dParam param(strides_, dilations_, delegator::Conv2dParam param(strides_, dilations_,
...@@ -497,8 +497,8 @@ class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase { ...@@ -497,8 +497,8 @@ class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterConv2D(OpRegistry *op_registry) { void RegisterConv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
......
...@@ -100,8 +100,7 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -100,8 +100,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
input_data + (b + offsets[0]) * in_img_size + input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw + (c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3]; (h + offsets[2]) * input_shape[3] + offsets[3];
memcpy(out_ptr, in_ptr_bch, memcpy(out_ptr, in_ptr_bch, output_shape[3] * sizeof(T));
output_shape[3] * sizeof(T));
} }
} }
} }
...@@ -134,8 +133,8 @@ class CropOp<DeviceType::GPU, float> : public Operation { ...@@ -134,8 +133,8 @@ class CropOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistry *op_registry) { void RegisterCrop(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp, MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Crop", CropOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp); MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
......
...@@ -143,8 +143,8 @@ class CumsumOp<DeviceType::CPU, T> : public Operation { ...@@ -143,8 +143,8 @@ class CumsumOp<DeviceType::CPU, T> : public Operation {
}; };
void RegisterCumsum(OpRegistry *op_registry) { void RegisterCumsum(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -46,20 +46,21 @@ const std::vector<int> kDeconv2dStrides = {1, 1}; ...@@ -46,20 +46,21 @@ const std::vector<int> kDeconv2dStrides = {1, 1};
template<DeviceType D, class T> template<DeviceType D, class T>
class Deconv2dOp; class Deconv2dOp;
template<> template<class T>
class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { class Deconv2dOp<DeviceType::CPU, T> : public Deconv2dOpBase {
public: public:
explicit Deconv2dOp(OpConstructContext *context) explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context), : Deconv2dOpBase(context),
activation_delegator_( activation_delegator_(
delegator::Activation::Create( delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_, delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))), leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create( bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -80,11 +81,9 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -80,11 +81,9 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
if (deconv2d_delegator_ == nullptr) { if (deconv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, auto tag = MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, T, kCpuImplType);
MACE_CPU_IMPL_TYPE, General); if (kCpuImplType == NEON) {
if (MACE_CPU_IMPL_TYPE == NEON) {
const index_t kernel_h = filter->dim(2); const index_t kernel_h = filter->dim(2);
const index_t kernel_w = filter->dim(3); const index_t kernel_w = filter->dim(3);
...@@ -104,23 +103,23 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -104,23 +103,23 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
strides_[0] == strides_[1] && strides_[0] == 2; strides_[0] == strides_[1] && strides_[0] == 2;
if (use_neon_2x2_s1) { if (use_neon_2x2_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K2x2S1); kCpuImplType, K2x2S1);
} else if (use_neon_2x2_s2) { } else if (use_neon_2x2_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K2x2S2); kCpuImplType, K2x2S2);
} else if (use_neon_3x3_s1) { } else if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S1); kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) { } else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S2); kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) { } else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K4x4S1); kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) { } else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K4x4S2); kCpuImplType, K4x4S2);
} }
} }
delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_, delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_,
...@@ -236,8 +235,8 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase { ...@@ -236,8 +235,8 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterDeconv2D(OpRegistry *op_registry) { void RegisterDeconv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp); MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
......
...@@ -27,7 +27,6 @@ namespace mace { ...@@ -27,7 +27,6 @@ namespace mace {
namespace ops { namespace ops {
enum ConvType { enum ConvType {
General,
K1x1, K1x1,
K1x7S1, K1x7S1,
K7x1S1, K7x1S1,
......
...@@ -21,12 +21,12 @@ ...@@ -21,12 +21,12 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h" #include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h" #include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
enum DeconvType { enum DeconvType {
General,
K2x2S1, K2x2S1,
K2x2S2, K2x2S2,
K3x3S1, K3x3S1,
......
...@@ -28,8 +28,8 @@ namespace ops { ...@@ -28,8 +28,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class DepthToSpaceOp; class DepthToSpaceOp;
template<> template<class T>
class DepthToSpaceOp<CPU, float> : public Operation { class DepthToSpaceOp<CPU, T> : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -59,8 +59,8 @@ class DepthToSpaceOp<CPU, float> : public Operation { ...@@ -59,8 +59,8 @@ class DepthToSpaceOp<CPU, float> : public Operation {
Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *input_ptr = input->data<float>(); const T *input_ptr = input->data<T>();
float *output_ptr = output->mutable_data<float>(); T *output_ptr = output->mutable_data<T>();
for (index_t b = 0; b < batch_size; ++b) { for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) { for (index_t d = 0; d < output_depth; ++d) {
...@@ -188,6 +188,8 @@ class DepthToSpaceOp<DeviceType::GPU, float> : public Operation { ...@@ -188,6 +188,8 @@ class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
void RegisterDepthToSpace(OpRegistry *op_registry) { void RegisterDepthToSpace(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace", MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float); DepthToSpaceOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthToSpace", MACE_REGISTER_OP(op_registry, "DepthToSpace",
......
...@@ -63,20 +63,21 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase { ...@@ -63,20 +63,21 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase {
template<DeviceType D, class T> template<DeviceType D, class T>
class DepthwiseConv2dOp; class DepthwiseConv2dOp;
template<> template<class T>
class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase { class DepthwiseConv2dOp<DeviceType::CPU, T> : public DepthwiseConv2dOpBase {
public: public:
explicit DepthwiseConv2dOp(OpConstructContext *context) explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context), : DepthwiseConv2dOpBase(context),
activation_delegator_( activation_delegator_(
delegator::Activation::Create( delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_, delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))), leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create( bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -93,9 +94,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase { ...@@ -93,9 +94,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
if (depthwise_conv2d_delegator_ == nullptr) { if (depthwise_conv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, auto tag = MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
REF, General); T, ImplType::REF);
if (MACE_CPU_IMPL_TYPE == NEON) { if (kCpuImplType == NEON) {
const index_t filter_h = filter->dim(2); const index_t filter_h = filter->dim(2);
const index_t filter_w = filter->dim(3); const index_t filter_w = filter->dim(3);
const index_t stride_h = strides_[0]; const index_t stride_h = strides_[0];
...@@ -104,13 +105,13 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase { ...@@ -104,13 +105,13 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
const index_t dilation_w = dilations_[1]; const index_t dilation_w = dilations_[1];
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) { && dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S1); kCpuImplType, K3x3S1);
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2 } else if (filter_h == 3 && filter_w == 3 && stride_h == 2
&& stride_w == 2 && stride_w == 2
&& dilation_h == 1 && dilation_w == 1) { && dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S2); kCpuImplType, K3x3S2);
} }
} }
delegator::Conv2dParam param(strides_, dilations_, delegator::Conv2dParam param(strides_, dilations_,
...@@ -347,7 +348,8 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t> ...@@ -347,7 +348,8 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<> template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase { class DepthwiseConv2dOp<DeviceType::GPU, float> :
public DepthwiseConv2dOpBase {
public: public:
explicit DepthwiseConv2dOp(OpConstructContext *context) explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) { : DepthwiseConv2dOpBase(context) {
...@@ -402,6 +404,8 @@ class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase { ...@@ -402,6 +404,8 @@ class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
void RegisterDepthwiseConv2d(OpRegistry *op_registry) { void RegisterDepthwiseConv2d(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::CPU, float); DepthwiseConv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
......
...@@ -44,8 +44,8 @@ const std::vector<int> kDepthwiseStrides = {1, 1}; ...@@ -44,8 +44,8 @@ const std::vector<int> kDepthwiseStrides = {1, 1};
template<DeviceType D, class T> template<DeviceType D, class T>
class DepthwiseDeconv2dOp; class DepthwiseDeconv2dOp;
template<> template<class T>
class DepthwiseDeconv2dOp<DeviceType::CPU, float> class DepthwiseDeconv2dOp<DeviceType::CPU, T>
: public Deconv2dOpBase { : public Deconv2dOpBase {
public: public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context) explicit DepthwiseDeconv2dOp(OpConstructContext *context)
...@@ -53,12 +53,13 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -53,12 +53,13 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
activation_delegator_( activation_delegator_(
delegator::Activation::Create( delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_, delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))), leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create( bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -75,7 +76,7 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -75,7 +76,7 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
bool is_depthwise = group_ == in_channels; bool is_depthwise = group_ == in_channels;
if (depthwise_deconv2d_delegator_ == nullptr) { if (depthwise_deconv2d_delegator_ == nullptr) {
if (MACE_CPU_IMPL_TYPE == NEON) { if (kCpuImplType == NEON) {
const index_t kernel_h = filter->dim(2); const index_t kernel_h = filter->dim(2);
const index_t kernel_w = filter->dim(3); const index_t kernel_w = filter->dim(3);
bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
...@@ -88,20 +89,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -88,20 +89,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
strides_[0] == strides_[1] && strides_[0] == 2; strides_[0] == strides_[1] && strides_[0] == 2;
if (is_depthwise) { if (is_depthwise) {
std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, auto tag = MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, General); kCpuImplType);
if (use_neon_3x3_s1) { if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S1); kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) { } else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S2); kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) { } else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K4x4S1); kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) { } else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K4x4S2); kCpuImplType, K4x4S2);
} }
delegator::DepthwiseDeconv2dParam param(strides_, kDepthwiseStrides, delegator::DepthwiseDeconv2dParam param(strides_, kDepthwiseStrides,
paddings_, padding_type_, paddings_, padding_type_,
...@@ -109,20 +110,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -109,20 +110,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
depthwise_deconv2d_delegator_ = delegator::DepthwiseDeconv2d::Create( depthwise_deconv2d_delegator_ = delegator::DepthwiseDeconv2d::Create(
context->workspace(), tag, param); context->workspace(), tag, param);
} else { } else {
std::string tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, auto tag = MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, General); kCpuImplType);
if (use_neon_3x3_s1) { if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S1); kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) { } else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K3x3S2); kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) { } else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K4x4S1); kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) { } else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
MACE_CPU_IMPL_TYPE, K4x4S2); kCpuImplType, K4x4S2);
} }
delegator::GroupDeconv2dParam param(strides_, kDepthwiseStrides, delegator::GroupDeconv2dParam param(strides_, kDepthwiseStrides,
paddings_, padding_type_, paddings_, padding_type_,
...@@ -218,6 +219,8 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase { ...@@ -218,6 +219,8 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) { void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float); DepthwiseDeconv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp); MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
} }
......
...@@ -75,7 +75,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -75,7 +75,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
Operation::GetRepeatedArgs<index_t>("out_cache_indexes")), Operation::GetRepeatedArgs<index_t>("out_cache_indexes")),
gemv_(delegator::Gemv::Create( gemv_(delegator::Gemv::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, T, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
inline void Validate() { inline void Validate() {
...@@ -107,14 +107,14 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -107,14 +107,14 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
") should be greater than zero."); ") should be greater than zero.");
} }
void UpdateCell(float *cell_data, void UpdateCell(T *cell_data,
const index_t cell_dim, const index_t cell_dim,
const float scale) { const float scale) {
if (std::abs(scale - 1.f) < 1e-6) if (std::abs(scale - 1.f) < 1e-6)
return; return;
const index_t rounds = cell_dim / 4; const index_t rounds = cell_dim / 4;
for (index_t i = 0; i < rounds * 4; i += 4) { for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON #if defined(MACE_ENABLE_NEON) and not defined(MACE_ENABLE_BFLOAT16)
float32x4_t in_vec = vld1q_f32(cell_data + i); float32x4_t in_vec = vld1q_f32(cell_data + i);
float32x4_t scale_vec = vdupq_n_f32(scale); float32x4_t scale_vec = vdupq_n_f32(scale);
in_vec = vmulq_f32(in_vec, scale_vec); in_vec = vmulq_f32(in_vec, scale_vec);
...@@ -130,18 +130,18 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -130,18 +130,18 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
} }
} }
void CopyAndUpdateCell(float *src_data, void CopyAndUpdateCell(T *src_data,
const index_t cell_dim, const index_t cell_dim,
const float scale, const float scale,
float *cell_data) { T *cell_data) {
if (std::abs(scale - 1.f) < 1e-6) { if (std::abs(scale - 1.f) < 1e-6) {
memcpy(cell_data, src_data, cell_dim * sizeof(float)); memcpy(cell_data, src_data, cell_dim * sizeof(T));
return; return;
} }
const index_t rounds = cell_dim / 4; const index_t rounds = cell_dim / 4;
for (index_t i = 0; i < rounds * 4; i += 4) { for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON #if defined(MACE_ENABLE_NEON) and not defined(MACE_ENABLE_BFLOAT16)
float32x4_t in_vec = vld1q_f32(src_data + i); float32x4_t in_vec = vld1q_f32(src_data + i);
float32x4_t scale_vec = vdupq_n_f32(scale); float32x4_t scale_vec = vdupq_n_f32(scale);
in_vec = vmulq_f32(in_vec, scale_vec); in_vec = vmulq_f32(in_vec, scale_vec);
...@@ -222,49 +222,54 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -222,49 +222,54 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
<< " output_dim: " << output_dim; << " output_dim: " << output_dim;
const index_t affine_a_in_size = const index_t affine_a_in_size =
PadAlignSize(affine_a_in_dim * sizeof(float)); PadAlignSize(affine_a_in_dim * sizeof(T));
const index_t affine_a_out_size = const index_t affine_a_out_size =
PadAlignSize(affine_a_out_dim * sizeof(float)); PadAlignSize(affine_a_out_dim * sizeof(T));
const index_t affine_b_in_size = const index_t affine_b_in_size =
PadAlignSize(affine_b_in_dim * sizeof(float)); PadAlignSize(affine_b_in_dim * sizeof(T));
const index_t affine_b_out_size = const index_t affine_b_out_size =
PadAlignSize(affine_b_out_dim * sizeof(float)); PadAlignSize(affine_b_out_dim * sizeof(T));
const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_); const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_);
const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_); const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_);
const index_t out_buf_size = const index_t out_buf_size =
PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float)); PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(T));
const index_t cell_buf_size = const index_t cell_buf_size =
PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(float)); PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer(); ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind(); scratch->Rewind();
scratch->GrowSize(affine_a_in_size + affine_a_out_size scratch->GrowSize(affine_a_in_size + affine_a_out_size
+ affine_b_in_size + affine_b_out_size + affine_b_in_size + affine_b_out_size
+ out_buf_size + cell_buf_size); + out_buf_size + cell_buf_size);
Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT); Tensor prev_out_buf(scratch->Scratch(out_buf_size), DataTypeToEnum<T>::v());
prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_}); prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_});
float *prev_out_buf_data = prev_out_buf.mutable_data<float>(); T *prev_out_buf_data = prev_out_buf.mutable_data<T>();
Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT); Tensor prev_cell_buf(
scratch->Scratch(cell_buf_size), DataTypeToEnum<T>::v());
prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_}); prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_});
float *prev_cell_buf_data = prev_cell_buf.mutable_data<float>(); T *prev_cell_buf_data = prev_cell_buf.mutable_data<T>();
Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT); Tensor affine_a_in(
scratch->Scratch(affine_a_in_size), DataTypeToEnum<T>::v());
affine_a_in.Reshape({1, affine_a_in_dim}); affine_a_in.Reshape({1, affine_a_in_dim});
float *affine_a_in_data = affine_a_in.mutable_data<float>(); T *affine_a_in_data = affine_a_in.mutable_data<T>();
Tensor affine_a_out(scratch->Scratch(affine_a_out_size), DT_FLOAT); Tensor affine_a_out(
scratch->Scratch(affine_a_out_size), DataTypeToEnum<T>::v());
affine_a_out.Reshape({1, affine_a_out_dim}); affine_a_out.Reshape({1, affine_a_out_dim});
float *affine_a_out_data = affine_a_out.mutable_data<float>(); T *affine_a_out_data = affine_a_out.mutable_data<T>();
Tensor affine_b_in(scratch->Scratch(affine_b_in_size), DT_FLOAT); Tensor affine_b_in(
scratch->Scratch(affine_b_in_size), DataTypeToEnum<T>::v());
affine_b_in.Reshape({1, affine_b_in_dim}); affine_b_in.Reshape({1, affine_b_in_dim});
float *affine_b_in_data = affine_b_in.mutable_data<float>(); T *affine_b_in_data = affine_b_in.mutable_data<T>();
Tensor affine_b_out(scratch->Scratch(affine_b_out_size), DT_FLOAT); Tensor affine_b_out(
scratch->Scratch(affine_b_out_size), DataTypeToEnum<T>::v());
affine_b_out.Reshape({1, affine_b_out_dim}); affine_b_out.Reshape({1, affine_b_out_dim});
float *affine_b_out_data = affine_b_out.mutable_data<float>(); T *affine_b_out_data = affine_b_out.mutable_data<T>();
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
Tensor *out_cache = this->Output(OUT_CACHE); Tensor *out_cache = this->Output(OUT_CACHE);
...@@ -293,31 +298,31 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -293,31 +298,31 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
Tensor::MappingGuard out_cache_guard(out_cache); Tensor::MappingGuard out_cache_guard(out_cache);
Tensor::MappingGuard cell_cache_guard(cell_cache); Tensor::MappingGuard cell_cache_guard(cell_cache);
const float *input_data = input->data<float>(); const T *input_data = input->data<T>();
const float *prev_out_data = prev_out->data<float>(); const T *prev_out_data = prev_out->data<T>();
const float *prev_cell_data = prev_cell->data<float>(); const T *prev_cell_data = prev_cell->data<T>();
const float *lstm_params_data = lstm_params->data<float>(); const T *lstm_params_data = lstm_params->data<T>();
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
float *out_cache_data = out_cache->mutable_data<float>(); T *out_cache_data = out_cache->mutable_data<T>();
float *cell_cache_data = cell_cache->mutable_data<float>(); T *cell_cache_data = cell_cache->mutable_data<T>();
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
memcpy(prev_out_buf_data, memcpy(prev_out_buf_data,
prev_out_data + b * out_buf_chunk * prev_out_dim_, prev_out_data + b * out_buf_chunk * prev_out_dim_,
sizeof(float) * out_buf_chunk * prev_out_dim_); sizeof(T) * out_buf_chunk * prev_out_dim_);
memcpy(prev_cell_buf_data, memcpy(prev_cell_buf_data,
prev_cell_data + b * cell_buf_chunk * prev_cell_dim_, prev_cell_data + b * cell_buf_chunk * prev_cell_dim_,
sizeof(float) * cell_buf_chunk * prev_cell_dim_); sizeof(T) * cell_buf_chunk * prev_cell_dim_);
for (index_t i = 0; i < out_chunk; ++i) { for (index_t i = 0; i < out_chunk; ++i) {
const float *input_ptr = const T *input_ptr =
input_data + (b * chunk + forward_indexes_[i]) * input_dim; input_data + (b * chunk + forward_indexes_[i]) * input_dim;
float *output_ptr = output_data + (b * out_chunk + i) * output_dim; T *output_ptr = output_data + (b * out_chunk + i) * output_dim;
// Append // Append
memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float)); memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(T));
memcpy(affine_a_in_data + input_dim, memcpy(affine_a_in_data + input_dim,
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_, prev_out_buf_data + i % out_buf_chunk * prev_out_dim_,
prev_out_dim_ * sizeof(float)); prev_out_dim_ * sizeof(T));
// Affine // Affine
gemv_->Compute(context, gemv_->Compute(context,
weights_a, weights_a,
...@@ -330,11 +335,11 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -330,11 +335,11 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
false, false,
&affine_a_out); &affine_a_out);
// Prepare LSTMNonlinear input and output pointer // Prepare LSTMNonlinear input and output pointer
float *lstm_cell_ptr = T *lstm_cell_ptr =
prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_; prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_;
float *curr_cell_ptr = lstm_cell_ptr; T *curr_cell_ptr = lstm_cell_ptr;
// LSTMNonlinear // LSTMNonlinear
LSTMNonlinearKernel(context, LSTMNonlinearKernel<T>(context,
affine_a_out_data, affine_a_out_data,
lstm_cell_ptr, lstm_cell_ptr,
nullptr, nullptr,
...@@ -359,9 +364,9 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -359,9 +364,9 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
// Output // Output
memcpy(output_ptr, memcpy(output_ptr,
affine_b_out_data, affine_b_out_data,
output_dim * sizeof(float)); output_dim * sizeof(T));
// Update // Update
float *curr_out_ptr = T *curr_out_ptr =
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_; prev_out_buf_data + i % out_buf_chunk * prev_out_dim_;
CopyAndUpdateCell(affine_b_out_data + prev_out_offset_, CopyAndUpdateCell(affine_b_out_data + prev_out_offset_,
prev_out_dim_, prev_out_dim_,
...@@ -371,22 +376,22 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -371,22 +376,22 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
for (size_t k = 0; k < out_cache_indexes_.size(); ++k) { for (size_t k = 0; k < out_cache_indexes_.size(); ++k) {
if (i == out_cache_indexes_[k]) { if (i == out_cache_indexes_[k]) {
const index_t idx = b * out_buf_chunk + k; const index_t idx = b * out_buf_chunk + k;
float *out_cache_ptr = T *out_cache_ptr =
out_cache_data + idx * prev_out_dim_; out_cache_data + idx * prev_out_dim_;
memcpy(out_cache_ptr, memcpy(out_cache_ptr,
curr_out_ptr, curr_out_ptr,
sizeof(float) * prev_out_dim_); sizeof(T) * prev_out_dim_);
} }
} }
for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) { for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) {
if (i == cell_cache_indexes_[k]) { if (i == cell_cache_indexes_[k]) {
const index_t idx = b * cell_buf_chunk + k; const index_t idx = b * cell_buf_chunk + k;
float *cell_cache_ptr = T *cell_cache_ptr =
cell_cache_data + idx * prev_cell_dim_; cell_cache_data + idx * prev_cell_dim_;
memcpy(cell_cache_ptr, memcpy(cell_cache_ptr,
curr_cell_ptr, curr_cell_ptr,
sizeof(float) * prev_cell_dim_); sizeof(T) * prev_cell_dim_);
} }
} }
} }
...@@ -416,6 +421,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation { ...@@ -416,6 +421,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
void RegisterDynamicLSTM(OpRegistry *op_registry) { void RegisterDynamicLSTM(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp, MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -1073,7 +1073,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -1073,7 +1073,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
"scalar_input_index", 1)), "scalar_input_index", 1)),
eltwise_delegator_(delegator::Eltwise::Create( eltwise_delegator_(delegator::Eltwise::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, kCpuImplType),
delegator::EltwiseParam( delegator::EltwiseParam(
static_cast<ops::EltwiseType>( static_cast<ops::EltwiseType>(
Operation::GetOptionalArg<int>( Operation::GetOptionalArg<int>(
...@@ -1175,8 +1175,8 @@ class EltwiseOp<DeviceType::GPU, float> : public Operation { ...@@ -1175,8 +1175,8 @@ class EltwiseOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterEltwise(OpRegistry *op_registry) { void RegisterEltwise(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
......
...@@ -57,6 +57,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation { ...@@ -57,6 +57,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
void RegisterExpandDims(OpRegistry *op_registry) { void RegisterExpandDims(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
......
...@@ -89,15 +89,16 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation { ...@@ -89,15 +89,16 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
output_shape[dim_size - 2] = output_chunk; output_shape[dim_size - 2] = output_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float)); const index_t extract_out_size = PadAlignSize(output_dim * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer(); ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind(); scratch->Rewind();
scratch->GrowSize(extract_out_size); scratch->GrowSize(extract_out_size);
Tensor extract_out(scratch->Scratch(extract_out_size), DT_FLOAT); Tensor extract_out(
scratch->Scratch(extract_out_size), DataTypeToEnum<T>::v());
extract_out.Reshape({1, output_dim}); extract_out.Reshape({1, output_dim});
extract_out.Clear(); extract_out.Clear();
float *extract_out_data = extract_out.mutable_data<float>(); T *extract_out_data = extract_out.mutable_data<T>();
Tensor::MappingGuard guard_input(input); Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output); Tensor::MappingGuard guard_output(output);
...@@ -162,7 +163,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation { ...@@ -162,7 +163,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
}, 0, input_dim, 1); }, 0, input_dim, 1);
} }
memcpy(output_data + (b * output_chunk + i) * output_dim, memcpy(output_data + (b * output_chunk + i) * output_dim,
extract_out_data, output_dim * sizeof(float)); extract_out_data, output_dim * sizeof(T));
} }
} }
...@@ -180,6 +181,8 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation { ...@@ -180,6 +181,8 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
void RegisterExtractPooling(OpRegistry *op_registry) { void RegisterExtractPooling(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ExtractPooling", ExtractPoolingOp, MACE_REGISTER_OP(op_registry, "ExtractPooling", ExtractPoolingOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ExtractPooling", ExtractPoolingOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -22,8 +22,8 @@ namespace ops { ...@@ -22,8 +22,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FillOp; class FillOp;
template <> template <class T>
class FillOp<DeviceType::CPU, float> : public Operation { class FillOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit FillOp(OpConstructContext *context) explicit FillOp(OpConstructContext *context)
: Operation(context) {} : Operation(context) {}
...@@ -46,11 +46,11 @@ class FillOp<DeviceType::CPU, float> : public Operation { ...@@ -46,11 +46,11 @@ class FillOp<DeviceType::CPU, float> : public Operation {
} }
Tensor::MappingGuard value_guard(value); Tensor::MappingGuard value_guard(value);
const float *value_data = value->data<float>(); const T *value_data = value->data<T>();
MACE_RETURN_IF_ERROR(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
std::fill(output_data, output_data + output->size(), *value_data); std::fill(output_data, output_data + output->size(), *value_data);
...@@ -65,6 +65,7 @@ class FillOp<DeviceType::CPU, float> : public Operation { ...@@ -65,6 +65,7 @@ class FillOp<DeviceType::CPU, float> : public Operation {
void RegisterFill(OpRegistry *op_registry) { void RegisterFill(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Fill", FillOp, MACE_REGISTER_OP(op_registry, "Fill", FillOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Fill", FillOp, DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -56,20 +56,20 @@ class FullyConnectedOpBase : public Operation { ...@@ -56,20 +56,20 @@ class FullyConnectedOpBase : public Operation {
template<DeviceType D, class T> template<DeviceType D, class T>
class FullyConnectedOp; class FullyConnectedOp;
template<> template<class T>
class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase { class FullyConnectedOp<DeviceType::CPU, T> : public FullyConnectedOpBase {
public: public:
explicit FullyConnectedOp(OpConstructContext *context) explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context), : FullyConnectedOpBase(context),
activation_delegator_(delegator::Activation::Create( activation_delegator_(delegator::Activation::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType),
delegator::ActivationParam(activation_, delegator::ActivationParam(activation_,
relux_max_limit_, relux_max_limit_,
leakyrelu_coefficient_))), leakyrelu_coefficient_))),
gemv_(delegator::Gemv::Create( gemv_(delegator::Gemv::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -127,7 +127,7 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t> ...@@ -127,7 +127,7 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
: FullyConnectedOpBase(context), : FullyConnectedOpBase(context),
gemv_(delegator::Gemv::Create( gemv_(delegator::Gemv::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -226,6 +226,8 @@ class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase { ...@@ -226,6 +226,8 @@ class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
void RegisterFullyConnected(OpRegistry *op_registry) { void RegisterFullyConnected(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "FullyConnected", MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::CPU, float); FullyConnectedOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "FullyConnected", MACE_REGISTER_OP(op_registry, "FullyConnected",
......
...@@ -89,6 +89,7 @@ class GatherOp : public Operation { ...@@ -89,6 +89,7 @@ class GatherOp : public Operation {
void RegisterGather(OpRegistry *op_registry) { void RegisterGather(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Gather", GatherOp, MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Gather", GatherOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Gather", GatherOp, MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
......
...@@ -36,6 +36,8 @@ class IdentityOp : public Operation { ...@@ -36,6 +36,8 @@ class IdentityOp : public Operation {
void RegisterIdentity(OpRegistry *op_registry) { void RegisterIdentity(OpRegistry *op_registry) {
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
......
...@@ -166,6 +166,7 @@ class IfDefinedOp<DeviceType::CPU, T> : public Operation { ...@@ -166,6 +166,7 @@ class IfDefinedOp<DeviceType::CPU, T> : public Operation {
void RegisterIfDefined(OpRegistry *op_registry) { void RegisterIfDefined(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp, MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "IfDefined", IfDefinedOp, DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -105,6 +105,8 @@ class InferConv2dShapeOp : public Operation { ...@@ -105,6 +105,8 @@ class InferConv2dShapeOp : public Operation {
void RegisterInferConv2dShape(OpRegistry *op_registry) { void RegisterInferConv2dShape(OpRegistry *op_registry) {
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float); InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t); InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
......
...@@ -28,8 +28,8 @@ namespace ops { ...@@ -28,8 +28,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class KaldiBatchNormOp; class KaldiBatchNormOp;
template <> template <class T>
class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation { class KaldiBatchNormOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit KaldiBatchNormOp(OpConstructContext *context) explicit KaldiBatchNormOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -40,13 +40,13 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -40,13 +40,13 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
test_mode_(static_cast<bool>( test_mode_(static_cast<bool>(
Operation::GetOptionalArg<int>("test_mode", 0))) {} Operation::GetOptionalArg<int>("test_mode", 0))) {}
void CalculateMeanVar(const float *input_data, void CalculateMeanVar(const T *input_data,
index_t length, index_t length,
index_t stride, index_t stride,
float mean_scale, float mean_scale,
float var_scale, float var_scale,
float *mean_data, T *mean_data,
float *var_data) { T *var_data) {
float mean_value = 0.f; float mean_value = 0.f;
float var_value = 0.f; float var_value = 0.f;
for (index_t i = 0; i < length; ++i) { for (index_t i = 0; i < length; ++i) {
...@@ -84,8 +84,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -84,8 +84,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>(); const T *input_data = input->data<T>();
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
utils::ThreadPool utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool(); &thread_pool = context->device()->cpu_runtime()->thread_pool();
...@@ -102,8 +102,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -102,8 +102,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
&& scale->size() == block_dim_); && scale->size() == block_dim_);
Tensor::MappingGuard scale_guard(scale); Tensor::MappingGuard scale_guard(scale);
Tensor::MappingGuard offset_guard(offset); Tensor::MappingGuard offset_guard(offset);
const float *scale_data = scale->data<float>(); const T *scale_data = scale->data<T>();
const float *offset_data = offset->data<float>(); const T *offset_data = offset->data<T>();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
...@@ -116,18 +116,18 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -116,18 +116,18 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, num_rows, 1, 0, block_dim_, 1); }, 0, num_rows, 1, 0, block_dim_, 1);
} else { } else {
const index_t buf_size = const index_t buf_size =
PadAlignSize(block_dim_ * sizeof(float)); PadAlignSize(block_dim_ * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer(); ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind(); scratch->Rewind();
scratch->GrowSize(2 * buf_size); scratch->GrowSize(2 * buf_size);
Tensor mean(scratch->Scratch(buf_size), DT_FLOAT); Tensor mean(scratch->Scratch(buf_size), DataTypeToEnum<T>::v());
mean.Reshape({block_dim_}); mean.Reshape({block_dim_});
float *mean_data = mean.mutable_data<float>(); T *mean_data = mean.mutable_data<T>();
Tensor var(scratch->Scratch(buf_size), DT_FLOAT); Tensor var(scratch->Scratch(buf_size), DataTypeToEnum<T>::v());
var.Reshape({block_dim_}); var.Reshape({block_dim_});
float *var_data = var.mutable_data<float>(); T *var_data = var.mutable_data<T>();
float var_scale = 1.0f / (target_rms_ * target_rms_); float var_scale = 1.0f / (target_rms_ * target_rms_);
float mean_scale = 1.0f / num_rows; float mean_scale = 1.0f / num_rows;
...@@ -171,6 +171,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -171,6 +171,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
void RegisterKaldiBatchNorm(OpRegistry *op_registry) { void RegisterKaldiBatchNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp, MACE_REGISTER_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -24,8 +24,8 @@ namespace ops { ...@@ -24,8 +24,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class LocalResponseNormOp; class LocalResponseNormOp;
template<> template<class T>
class LocalResponseNormOp<DeviceType::CPU, float> : public Operation { class LocalResponseNormOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit LocalResponseNormOp(OpConstructContext *context) explicit LocalResponseNormOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -49,8 +49,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation { ...@@ -49,8 +49,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
const index_t height = input->dim(2); const index_t height = input->dim(2);
const index_t width = input->dim(3); const index_t width = input->dim(3);
const float *input_ptr = input->data<float>(); const T *input_ptr = input->data<T>();
float *output_ptr = output->mutable_data<float>(); T *output_ptr = output->mutable_data<T>();
const index_t image_size = height * width; const index_t image_size = height * width;
const index_t batch_size = channels * image_size; const index_t batch_size = channels * image_size;
...@@ -95,6 +95,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation { ...@@ -95,6 +95,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
void RegisterLocalResponseNorm(OpRegistry *op_registry) { void RegisterLocalResponseNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LocalResponseNorm", MACE_REGISTER_OP(op_registry, "LocalResponseNorm",
LocalResponseNormOp, DeviceType::CPU, float); LocalResponseNormOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LocalResponseNorm",
LocalResponseNormOp, DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -35,8 +35,8 @@ namespace ops { ...@@ -35,8 +35,8 @@ namespace ops {
template<DeviceType D, typename T> template<DeviceType D, typename T>
class LpNormOp; class LpNormOp;
template<> template<class T>
class LpNormOp<DeviceType::CPU, float> : public Operation { class LpNormOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit LpNormOp(OpConstructContext *context) explicit LpNormOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -59,8 +59,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation { ...@@ -59,8 +59,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard guard_input(input); Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output); Tensor::MappingGuard guard_output(output);
const auto *input_data = input->data<float>(); const auto *input_data = input->data<T>();
auto *output_data = output->mutable_data<float>(); auto *output_data = output->mutable_data<T>();
utils::ThreadPool utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool(); &thread_pool = context->device()->cpu_runtime()->thread_pool();
auto outer_loop = std::accumulate(input_shape.begin(), auto outer_loop = std::accumulate(input_shape.begin(),
...@@ -95,7 +95,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation { ...@@ -95,7 +95,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
auto output_data_base = output_data + inner_loop * i; auto output_data_base = output_data + inner_loop * i;
norm_ptr[i] = std::accumulate(output_data_base, norm_ptr[i] = std::accumulate(output_data_base,
output_data_base + inner_loop, 0.0f); output_data_base + inner_loop,
static_cast<T>(0.0f));
norm_ptr[i] = std::pow(norm_ptr[i], power); norm_ptr[i] = std::pow(norm_ptr[i], power);
norm_ptr[i] += 1e-6; norm_ptr[i] += 1e-6;
} }
...@@ -151,6 +152,8 @@ class LpNormOp<DeviceType::GPU, float> : public Operation { ...@@ -151,6 +152,8 @@ class LpNormOp<DeviceType::GPU, float> : public Operation {
void RegisterLpNorm(OpRegistry *op_registry) { void RegisterLpNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp, MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LpNorm", LpNormOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp); MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp);
} }
......
...@@ -70,27 +70,27 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation { ...@@ -70,27 +70,27 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard params_guard(params); Tensor::MappingGuard params_guard(params);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<T>(); const T *input_data = input->data<T>();
const float *params_data = params->data<T>(); const T *params_data = params->data<T>();
float *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
for (int r = 0; r < num_rows; ++r) { for (int r = 0; r < num_rows; ++r) {
const float *input_row = input_data + r * input_cols; const T *input_row = input_data + r * input_cols;
const float *prev_row = input_row + 4 * cell_dim; const T *prev_row = input_row + 4 * cell_dim;
const float *scale_data = const T *scale_data =
embed_scales ? prev_row + cell_dim : nullptr; embed_scales ? prev_row + cell_dim : nullptr;
float *output_cell = output_data + r * output_dim; T *output_cell = output_data + r * output_dim;
float *output_row = output_cell + cell_dim; T *output_row = output_cell + cell_dim;
LSTMNonlinearKernel(context, LSTMNonlinearKernel<T>(context,
input_row, input_row,
prev_row, prev_row,
scale_data, scale_data,
params_data, params_data,
embed_scales, embed_scales,
params_stride, params_stride,
cell_dim, cell_dim,
output_cell, output_cell,
output_row); output_row);
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
...@@ -104,6 +104,8 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation { ...@@ -104,6 +104,8 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
void RegisterLSTMNonlinear(OpRegistry *op_registry) { void RegisterLSTMNonlinear(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp, MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -92,18 +92,18 @@ class MatMulOpBase : public Operation { ...@@ -92,18 +92,18 @@ class MatMulOpBase : public Operation {
template<DeviceType D, class T> template<DeviceType D, class T>
class MatMulOp; class MatMulOp;
template<> template<class T>
class MatMulOp<CPU, float> : public MatMulOpBase { class MatMulOp<CPU, T> : public MatMulOpBase {
public: public:
explicit MatMulOp(OpConstructContext *context) explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context), : MatMulOpBase(context),
gemm_(delegator::Gemm::Create( gemm_(delegator::Gemm::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Gemm, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, T, kCpuImplType),
delegator::GemmParam())), delegator::GemmParam())),
gemv_(delegator::Gemv::Create( gemv_(delegator::Gemv::Create(
context->workspace(), context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE), MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {} DelegatorParam())) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -197,8 +197,8 @@ class MatMulOp<CPU, float> : public MatMulOpBase { ...@@ -197,8 +197,8 @@ class MatMulOp<CPU, float> : public MatMulOpBase {
"bias' dim should be <= 2."); "bias' dim should be <= 2.");
Tensor::MappingGuard bias_guard(bias); Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard c_guard(C); Tensor::MappingGuard c_guard(C);
const float *bias_data = bias->data<float>(); const T *bias_data = bias->data<T>();
float *c_data = C->mutable_data<float>(); T *c_data = C->mutable_data<T>();
utils::ThreadPool utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool(); &thread_pool = context->device()->cpu_runtime()->thread_pool();
...@@ -599,6 +599,8 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase { ...@@ -599,6 +599,8 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
void RegisterMatMul(OpRegistry *op_registry) { void RegisterMatMul(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
...@@ -30,8 +30,8 @@ namespace ops { ...@@ -30,8 +30,8 @@ namespace ops {
template<DeviceType D, typename T> template<DeviceType D, typename T>
class MVNormOp; class MVNormOp;
template<> template<class T>
class MVNormOp<DeviceType::CPU, float> : public Operation { class MVNormOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit MVNormOp(OpConstructContext *context) explicit MVNormOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -52,8 +52,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation { ...@@ -52,8 +52,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard guard_input(input); Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output); Tensor::MappingGuard guard_output(output);
const auto *input_data = input->data<float>(); const auto *input_data = input->data<T>();
auto *output_data = output->mutable_data<float>(); auto *output_data = output->mutable_data<T>();
const auto input_size = input->size(); const auto input_size = input->size();
const auto outer_loop = const auto outer_loop =
...@@ -71,7 +71,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation { ...@@ -71,7 +71,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
const auto offset = inner_loop * i; const auto offset = inner_loop * i;
mean_ptr[i] = std::accumulate(input_data + offset, mean_ptr[i] = std::accumulate(input_data + offset,
input_data + offset + inner_loop, 0.0f); input_data + offset + inner_loop,
static_cast<T>(0.0f));
mean_ptr[i] /= inner_loop; mean_ptr[i] /= inner_loop;
} }
}, 0, outer_loop, 1); }, 0, outer_loop, 1);
...@@ -105,7 +106,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation { ...@@ -105,7 +106,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
auto output_data_base = output_data + inner_loop * i; auto output_data_base = output_data + inner_loop * i;
mean_v_ptr[i] = std::accumulate(output_data_base, mean_v_ptr[i] = std::accumulate(output_data_base,
output_data_base + inner_loop, 0.0f); output_data_base + inner_loop,
static_cast<T>(0.0f));
mean_v_ptr[i] = std::pow(mean_v_ptr[i] / inner_loop, 0.5f) + eps_; mean_v_ptr[i] = std::pow(mean_v_ptr[i] / inner_loop, 0.5f) + eps_;
} }
}, 0, outer_loop, 1); }, 0, outer_loop, 1);
...@@ -169,6 +171,8 @@ class MVNormOp<DeviceType::GPU, float> : public Operation { ...@@ -169,6 +171,8 @@ class MVNormOp<DeviceType::GPU, float> : public Operation {
void RegisterMVNorm(OpRegistry *op_registry) { void RegisterMVNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp, MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "MVNorm", MVNormOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp); MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp);
} }
......
...@@ -39,10 +39,10 @@ class OneHotOpBase : public Operation { ...@@ -39,10 +39,10 @@ class OneHotOpBase : public Operation {
int axis_; int axis_;
}; };
template <DeviceType D, typename T> template<DeviceType D, typename T>
class OneHotOp; class OneHotOp;
template <typename T> template<typename T>
class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase { class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
public: public:
explicit OneHotOp(OpConstructContext *context) : OneHotOpBase(context) {} explicit OneHotOp(OpConstructContext *context) : OneHotOpBase(context) {}
...@@ -81,15 +81,17 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase { ...@@ -81,15 +81,17 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (axis == 1) { if (axis == 1) {
for (index_t i = 0; i < batch; ++i) { for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < depth_; ++j) { for (index_t j = 0; j < depth_; ++j) {
output_ptr[i * depth_ + j] = input_ptr[i] == j ? on_value_ : float input_value = input_ptr[i];
off_value_; output_ptr[i * depth_ + j] =
input_value == j ? on_value_ : off_value_;
} }
} }
} else { } else {
for (index_t i = 0; i < depth_; ++i) { for (index_t i = 0; i < depth_; ++i) {
for (index_t j = 0; j < batch; ++j) { for (index_t j = 0; j < batch; ++j) {
output_ptr[i * batch + j] = input_ptr[j] == i ? on_value_ : float input_value = input_ptr[j];
off_value_; output_ptr[i * batch + j] =
input_value == i ? on_value_ : off_value_;
} }
} }
} }
...@@ -110,7 +112,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase { ...@@ -110,7 +112,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (left == 0) { if (left == 0) {
for (index_t i = 0; i < length; ++i) { for (index_t i = 0; i < length; ++i) {
**output_ptr = **input_ptr == i ? on_value_ : off_value_; float input_value = **input_ptr;
**output_ptr = input_value == i ? on_value_ : off_value_;
++(*output_ptr); ++(*output_ptr);
} }
...@@ -130,7 +133,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase { ...@@ -130,7 +133,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (left == 0) { if (left == 0) {
for (index_t i = 0; i < length; ++i) { for (index_t i = 0; i < length; ++i) {
**output_ptr = **input_ptr == test ? on_value_ : off_value_; float input_value = **input_ptr;
**output_ptr = input_value == test ? on_value_ : off_value_;
++(*output_ptr); ++(*output_ptr);
++(*input_ptr); ++(*input_ptr);
} }
...@@ -144,9 +148,9 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase { ...@@ -144,9 +148,9 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
} }
}; };
void RegisterOneHot(OpRegistry *op_registry) { void RegisterOneHot(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -200,8 +200,8 @@ class PadOp<DeviceType::GPU, float> : public Operation { ...@@ -200,8 +200,8 @@ class PadOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterPad(OpRegistry *op_registry) { void RegisterPad(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Pad", PadOp, MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Pad", PadOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp); MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp);
} }
......
...@@ -87,6 +87,8 @@ class PadContextOp<DeviceType::CPU, T> : public Operation { ...@@ -87,6 +87,8 @@ class PadContextOp<DeviceType::CPU, T> : public Operation {
void RegisterPadContext(OpRegistry *op_registry) { void RegisterPadContext(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp, MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -80,7 +80,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation { ...@@ -80,7 +80,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) { for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size; const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_; T *out_base = output_data + i * output_dim_;
T temp_result = 0; T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) { for (index_t g = 0; g < group_size; ++g) {
T value = T value =
(std::fabs(in_base[g]) (std::fabs(in_base[g])
...@@ -99,9 +99,9 @@ class PNormOp<DeviceType::CPU, T> : public Operation { ...@@ -99,9 +99,9 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) { for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size; const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_; T *out_base = output_data + i * output_dim_;
T temp_result = 0; T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) { for (index_t g = 0; g < group_size; ++g) {
temp_result += std::abs(in_base[g]);; temp_result += std::abs(in_base[g]);
} }
out_base[j] = temp_result; out_base[j] = temp_result;
} }
...@@ -114,7 +114,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation { ...@@ -114,7 +114,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) { for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size; const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_; T *out_base = output_data + i * output_dim_;
T temp_result = 0; T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) { for (index_t g = 0; g < group_size; ++g) {
temp_result += in_base[g] * in_base[g]; temp_result += in_base[g] * in_base[g];
} }
...@@ -136,6 +136,8 @@ class PNormOp<DeviceType::CPU, T> : public Operation { ...@@ -136,6 +136,8 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
void RegisterPNorm(OpRegistry *op_registry) { void RegisterPNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PNorm", PNormOp, MACE_REGISTER_OP(op_registry, "PNorm", PNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PNorm", PNormOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -60,8 +60,8 @@ class PoolingOpBase : public ConvPool2dOpBase { ...@@ -60,8 +60,8 @@ class PoolingOpBase : public ConvPool2dOpBase {
template<DeviceType D, class T> template<DeviceType D, class T>
class PoolingOp; class PoolingOp;
template<> template<class T>
class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase { class PoolingOp<DeviceType::CPU, T> : public PoolingOpBase {
public: public:
explicit PoolingOp(OpConstructContext *context) explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) {} : PoolingOpBase(context) {}
...@@ -93,8 +93,8 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase { ...@@ -93,8 +93,8 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor); Tensor::MappingGuard output_guard(output_tensor);
const float *input = input_tensor->data<float>(); const T *input = input_tensor->data<T>();
float *output = output_tensor->mutable_data<float>(); T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data(); const index_t *input_shape = input_tensor->shape().data();
int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
...@@ -127,14 +127,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase { ...@@ -127,14 +127,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
private: private:
void MaxPooling(const OpContext *context, void MaxPooling(const OpContext *context,
const float *input, const T *input,
const index_t *in_shape, const index_t *in_shape,
const index_t *out_shape, const index_t *out_shape,
const int *filter_hw, const int *filter_hw,
const int *stride_hw, const int *stride_hw,
const int *dilation_hw, const int *dilation_hw,
const int *pad_hw, const int *pad_hw,
float *output) { T *output) {
const index_t batch = out_shape[0]; const index_t batch = out_shape[0];
const index_t out_channels = out_shape[1]; const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2]; const index_t out_height = out_shape[2];
...@@ -184,14 +184,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase { ...@@ -184,14 +184,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
} }
void AvgPooling(const OpContext *context, void AvgPooling(const OpContext *context,
const float *input, const T *input,
const index_t *in_shape, const index_t *in_shape,
const index_t *out_shape, const index_t *out_shape,
const int *filter_hw, const int *filter_hw,
const int *stride_hw, const int *stride_hw,
const int *dilation_hw, const int *dilation_hw,
const int *pad_hw, const int *pad_hw,
float *output) { T *output) {
const index_t batch = out_shape[0]; const index_t batch = out_shape[0];
const index_t out_channels = out_shape[1]; const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2]; const index_t out_height = out_shape[2];
...@@ -514,6 +514,8 @@ class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase { ...@@ -514,6 +514,8 @@ class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase {
void RegisterPooling(OpRegistry *op_registry) { void RegisterPooling(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Pooling", PoolingOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
......
...@@ -148,6 +148,8 @@ class PriorBoxOp : public Operation { ...@@ -148,6 +148,8 @@ class PriorBoxOp : public Operation {
void RegisterPriorBox(OpRegistry *op_registry) { void RegisterPriorBox(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PriorBox", PriorBoxOp, MACE_REGISTER_OP(op_registry, "PriorBox", PriorBoxOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PriorBox", PriorBoxOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -145,7 +145,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -145,7 +145,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
MACE_UNUSED(context); MACE_UNUSED(context);
if (reduce_first_axis_) { if (reduce_first_axis_) {
if (type == ReduceType::MEAN) { if (type == ReduceType::MEAN) {
T tmp = 0; T tmp = 0.f;
for (int i = 0; i < data_reshape_[0]; ++i) { for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i]; tmp = tmp + input[i];
} }
...@@ -169,7 +169,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -169,7 +169,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
} }
output[0] = tmp; output[0] = tmp;
} else if (type == ReduceType::SUM) { } else if (type == ReduceType::SUM) {
T tmp = 0; T tmp = 0.f;
for (int i = 0; i < data_reshape_[0]; ++i) { for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i]; tmp = tmp + input[i];
} }
...@@ -193,7 +193,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -193,7 +193,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (type == ReduceType::MEAN) { if (type == ReduceType::MEAN) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
T tmp = 0; T tmp = 0.f;
for (int j = 0; j < data_reshape_[0]; ++j) { for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i]; tmp += input[j * data_reshape_[1] + i];
} }
...@@ -225,7 +225,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -225,7 +225,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
} }
} else if (type == ReduceType::SUM) { } else if (type == ReduceType::SUM) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
T tmp = 0; T tmp = 0.f;
for (int j = 0; j < data_reshape_[0]; ++j) { for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i]; tmp += input[j * data_reshape_[1] + i];
} }
...@@ -239,7 +239,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -239,7 +239,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (type == ReduceType::MEAN) { if (type == ReduceType::MEAN) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
T tmp = 0; T tmp = 0.f;
for (int j = 0; j < data_reshape_[1]; ++j) { for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j]; tmp += input[i * data_reshape_[1] + j];
} }
...@@ -271,7 +271,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -271,7 +271,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
} }
} else if (type == ReduceType::SUM) { } else if (type == ReduceType::SUM) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
T tmp = 0; T tmp = 0.f;
for (int j = 0; j < data_reshape_[1]; ++j) { for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j]; tmp += input[i * data_reshape_[1] + j];
} }
...@@ -335,9 +335,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase { ...@@ -335,9 +335,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
T tmp = 1; T tmp = 1;
for (int j = 0; j < data_reshape_[2]; ++j) { for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) { for (int k = 0; k < data_reshape_[0]; ++k) {
tmp *= tmp *= input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
input[(k * data_reshape_[1] + i) * data_reshape_[2]
+ j];
} }
} }
output[i] = tmp; output[i] = tmp;
...@@ -1036,6 +1034,8 @@ class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase { ...@@ -1036,6 +1034,8 @@ class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase {
void RegisterReduce(OpRegistry *op_registry) { void RegisterReduce(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, int); DeviceType::CPU, int);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
......
...@@ -20,6 +20,7 @@ namespace mace { ...@@ -20,6 +20,7 @@ namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
template<typename T>
class Activation : public delegator::Activation { class Activation : public delegator::Activation {
public: public:
explicit Activation(const delegator::ActivationParam &param) explicit Activation(const delegator::ActivationParam &param)
...@@ -34,9 +35,10 @@ class Activation : public delegator::Activation { ...@@ -34,9 +35,10 @@ class Activation : public delegator::Activation {
Tensor *output); Tensor *output);
}; };
MaceStatus Activation::Compute(const OpContext *context, template<typename T>
const Tensor *input, MaceStatus Activation<T>::Compute(const OpContext *context,
Tensor *output) { const Tensor *input,
Tensor *output) {
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
if (input != output) { if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
...@@ -49,12 +51,13 @@ MaceStatus Activation::Compute(const OpContext *context, ...@@ -49,12 +51,13 @@ MaceStatus Activation::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void Activation::DoActivation(const OpContext *context, template<typename T>
const Tensor *input, void Activation<T>::DoActivation(const OpContext *context,
Tensor *output) { const Tensor *input,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
auto input_ptr = input->data<float>(); auto input_ptr = input->data<T>();
auto output_ptr = output->mutable_data<float>(); auto output_ptr = output->mutable_data<T>();
const index_t size = input->size(); const index_t size = input->size();
switch (type_) { switch (type_) {
...@@ -77,7 +80,7 @@ void Activation::DoActivation(const OpContext *context, ...@@ -77,7 +80,7 @@ void Activation::DoActivation(const OpContext *context,
case LEAKYRELU: { case LEAKYRELU: {
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
*output_ptr = *output_ptr =
std::max(*input_ptr, 0.f) std::max<float>(*input_ptr, 0.f)
+ std::min(*input_ptr, 0.f) * leakyrelu_coefficient_; + std::min(*input_ptr, 0.f) * leakyrelu_coefficient_;
++input_ptr; ++input_ptr;
++output_ptr; ++output_ptr;
...@@ -107,8 +110,14 @@ void Activation::DoActivation(const OpContext *context, ...@@ -107,8 +110,14 @@ void Activation::DoActivation(const OpContext *context,
} }
} }
MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam, void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(Activation, CPU, float, REF)) MACE_REGISTER_DELEGATOR(
registry, Activation<float>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Activation<BFloat16>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
...@@ -18,6 +18,7 @@ namespace mace { ...@@ -18,6 +18,7 @@ namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
template<typename T>
class BiasAdd : public delegator::BiasAdd { class BiasAdd : public delegator::BiasAdd {
public: public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {} explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
...@@ -31,10 +32,11 @@ class BiasAdd : public delegator::BiasAdd { ...@@ -31,10 +32,11 @@ class BiasAdd : public delegator::BiasAdd {
const Tensor *bias, Tensor *output); const Tensor *bias, Tensor *output);
}; };
MaceStatus BiasAdd::Compute(const OpContext *context, template<typename T>
const Tensor *input, MaceStatus BiasAdd<T>::Compute(const OpContext *context,
const Tensor *bias, const Tensor *input,
Tensor *output) { const Tensor *bias,
Tensor *output) {
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias); Tensor::MappingGuard bias_guard(bias);
if (input != output) { if (input != output) {
...@@ -54,14 +56,15 @@ MaceStatus BiasAdd::Compute(const OpContext *context, ...@@ -54,14 +56,15 @@ MaceStatus BiasAdd::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void BiasAdd::AddBias(const OpContext *context, template<typename T>
const Tensor *input, void BiasAdd<T>::AddBias(const OpContext *context,
const Tensor *bias, const Tensor *input,
mace::Tensor *output) { const Tensor *bias,
mace::Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto bias_data = bias->data<float>(); auto bias_data = bias->data<T>();
auto output_data = output->mutable_data<float>(); auto output_data = output->mutable_data<T>();
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t channels = input->dim(1); const index_t channels = input->dim(1);
...@@ -84,8 +87,14 @@ void BiasAdd::AddBias(const OpContext *context, ...@@ -84,8 +87,14 @@ void BiasAdd::AddBias(const OpContext *context,
} }
} }
MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam, void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, REF)) MACE_REGISTER_DELEGATOR(
registry, BiasAdd<float>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, BiasAdd<BFloat16>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
...@@ -12,19 +12,32 @@ ...@@ -12,19 +12,32 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/ref/conv_2d.h"
#include <vector> #include <vector>
#include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
MaceStatus Conv2d<float>::Compute(const OpContext *context, template<typename T>
const Tensor *input, class Conv2d : public delegator::Conv2d {
const Tensor *filter, public:
Tensor *output) { explicit Conv2d(const delegator::Conv2dParam &param)
: delegator::Conv2d(param) {}
~Conv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
template<typename T>
MaceStatus Conv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
const std::vector<index_t> in_shape = input->shape(); const std::vector<index_t> in_shape = input->shape();
...@@ -62,9 +75,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context, ...@@ -62,9 +75,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto output_data = output->mutable_data<float>(); auto output_data = output->mutable_data<T>();
for (index_t b = 0; b < in_shape[0]; b++) { for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; ++m) { for (index_t m = 0; m < filter_shape[0]; ++m) {
...@@ -74,7 +87,7 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context, ...@@ -74,7 +87,7 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const index_t out_width = out_shape[3]; const index_t out_width = out_shape[3];
const index_t in_channels = filter_shape[1]; const index_t in_channels = filter_shape[1];
float *out_ptr_base = T *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * out_batch_size + m * out_image_size;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < out_height; ++h) {
...@@ -82,9 +95,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context, ...@@ -82,9 +95,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
float sum = 0; float sum = 0;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base = const T *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr = const T *filter_ptr =
filter_data + m * in_channels * filter_size + c * filter_size; filter_data + m * in_channels * filter_size + c * filter_size;
for (index_t kh = 0; kh < filter_shape[2]; ++kh) { for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
...@@ -94,7 +107,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context, ...@@ -94,7 +107,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const index_t const index_t
iw = -pad_left + w * strides_[1] + kw * dilations_[1]; iw = -pad_left + w * strides_[1] + kw * dilations_[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw]; float input_value = in_ptr_base[ih * in_width + iw];
float filter_value = filter_ptr[kw];
sum += input_value * filter_value;
} }
} // kw } // kw
filter_ptr += filter_shape[3]; filter_ptr += filter_shape[3];
...@@ -109,9 +124,14 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context, ...@@ -109,9 +124,14 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
typedef Conv2d<float> Conv2dRef; void RegisterConv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(registry, Conv2dRef, delegator::Conv2dParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, REF, General)) registry, Conv2d<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2d<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_CONV_2D_H_
#define MACE_OPS_REF_CONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
// Reference (naive, scalar) 2-D convolution delegator.
// OUTPUT_TYPE is the tensor element type used by Compute; the method body
// is defined out of line in the corresponding .cc file, so the numeric
// contract (layout, padding semantics) cannot be confirmed from this header.
template<typename OUTPUT_TYPE>
class Conv2d : public delegator::Conv2d {
public:
// Forwards the convolution parameters carried by `param`
// (strides/dilations/paddings) to the delegator base; no extra state here.
explicit Conv2d(const delegator::Conv2dParam &param)
: delegator::Conv2d(param) {}
~Conv2d() {}
// Convolves `input` with `filter` and writes the result into `output`.
// NOTE(review): presumably NCHW layout as elsewhere in MACE CPU ops —
// confirm against the .cc implementation.
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
// Explicit full specialization for float. The interface is identical to the
// primary template; keeping a dedicated specialization lets the float
// implementation be compiled/registered separately in the .cc file.
template<>
class Conv2d<float> : public delegator::Conv2d {
public:
// Same constructor contract as the primary template: parameters are
// forwarded to the delegator base unchanged.
explicit Conv2d(const delegator::Conv2dParam &param)
: delegator::Conv2d(param) {}
~Conv2d() {}
// float-typed convolution entry point; body defined out of line.
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_CONV_2D_H_
...@@ -16,18 +16,36 @@ ...@@ -16,18 +16,36 @@
#include <memory> #include <memory>
#include <functional> #include <functional>
#include <vector> #include <vector>
#include "mace/ops/ref/deconv_2d.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
MaceStatus Deconv2d<float>::Compute(const OpContext *context, template<typename T>
const Tensor *input, class Deconv2d : public delegator::Deconv2d {
const Tensor *filter, public:
const Tensor *output_shape, explicit Deconv2d(const delegator::Deconv2dParam &param)
Tensor *output) { : delegator::Deconv2d(param) {}
~Deconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename T>
MaceStatus Deconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
std::vector<index_t> out_shape; std::vector<index_t> out_shape;
...@@ -65,15 +83,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context, ...@@ -65,15 +83,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(), std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(), padded_out_shape.end(),
1, 1,
std::multiplies<index_t>()) * sizeof(float); std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer(); ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind(); scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size); index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size); scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor> std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
padded_out scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
padded_out->Reshape(padded_out_shape); padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out); padded_output = std::move(padded_out);
} }
...@@ -88,10 +105,10 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context, ...@@ -88,10 +105,10 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<float>(); auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<float>(); auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
...@@ -122,7 +139,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context, ...@@ -122,7 +139,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
for (index_t b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
for (index_t oc = 0; oc < out_channels; ++oc) { for (index_t oc = 0; oc < out_channels; ++oc) {
float *out_base = T *out_base =
pad_out_data + (b * out_channels + oc) * out_img_size; pad_out_data + (b * out_channels + oc) * out_img_size;
for (index_t i = 0; i < in_height; ++i) { for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) { for (index_t j = 0; j < in_width; ++j) {
...@@ -148,13 +165,13 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context, ...@@ -148,13 +165,13 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
for (index_t i = 0; i < batch; ++i) { for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < out_channels; ++j) { for (index_t j = 0; j < out_channels; ++j) {
for (index_t k = 0; k < out_height; ++k) { for (index_t k = 0; k < out_height; ++k) {
const float *input_base = const T *input_base =
pad_out_data pad_out_data
+ ((i * out_channels + j) * pad_out_height + (k + pad_top)) + ((i * out_channels + j) * pad_out_height + (k + pad_top))
* pad_out_width; * pad_out_width;
float *output_base = T *output_base =
out_data + ((i * out_channels + j) * out_height + k) * out_width; out_data + ((i * out_channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
} }
} }
} }
...@@ -162,10 +179,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context, ...@@ -162,10 +179,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
typedef Deconv2d<float> Deconv2dRef; void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, Deconv2dRef, delegator::Deconv2dParam, registry, Deconv2d<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, REF, General)) MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Deconv2d<BFloat16>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_DECONV_2D_H_
#define MACE_OPS_REF_DECONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
// Reference (naive, scalar) 2-D deconvolution (transposed convolution)
// delegator. OUTPUT_TYPE is the tensor element type; Compute is defined
// out of line in the corresponding .cc file.
template<typename OUTPUT_TYPE>
class Deconv2d : public delegator::Deconv2d {
public:
// Forwards deconvolution parameters carried by `param` to the delegator
// base; this class holds no additional state.
explicit Deconv2d(const delegator::Deconv2dParam &param)
: delegator::Deconv2d(param) {}
~Deconv2d() = default;
// Computes the transposed convolution of `input` with `filter` into
// `output`. `output_shape` optionally carries the requested output
// dimensions — NOTE(review): whether it may be null is decided by the
// .cc implementation; confirm there.
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
// Explicit full specialization for float, mirroring the primary template's
// interface exactly; the float implementation is compiled/registered
// separately in the .cc file.
template<>
class Deconv2d<float> : public delegator::Deconv2d {
public:
// Same constructor contract as the primary template: parameters are
// forwarded to the delegator base unchanged.
explicit Deconv2d(const delegator::Deconv2dParam &param)
: delegator::Deconv2d(param) {}
~Deconv2d() = default;
// float-typed transposed-convolution entry point; body defined out of line.
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_DECONV_2D_H_
...@@ -12,19 +12,32 @@ ...@@ -12,19 +12,32 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/ref/depthwise_conv_2d.h"
#include <vector> #include <vector>
#include "mace/ops/delegator/depthwise_conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context, template<typename T>
const Tensor *input, class DepthwiseConv2d : public delegator::DepthwiseConv2d {
const Tensor *filter, public:
Tensor *output) { explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
: delegator::DepthwiseConv2d(param) {}
~DepthwiseConv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
template<typename T>
MaceStatus DepthwiseConv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
const std::vector<index_t> in_shape = input->shape(); const std::vector<index_t> in_shape = input->shape();
...@@ -65,9 +78,9 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context, ...@@ -65,9 +78,9 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto output_data = output->mutable_data<float>(); auto output_data = output->mutable_data<T>();
for (index_t b = 0; b < in_shape[0]; b++) { for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < out_shape[1]; ++m) { for (index_t m = 0; m < out_shape[1]; ++m) {
...@@ -80,16 +93,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context, ...@@ -80,16 +93,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
const index_t out_width = out_shape[3]; const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1]; const index_t in_channels = in_shape[1];
float *out_ptr_base = T *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * out_batch_size + m * out_image_size;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) { for (index_t w = 0; w < out_width; ++w) {
float sum = 0; float sum = 0;
const float *in_ptr_base = const T *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr = const T *filter_ptr =
filter_data + multi_index * in_channels * filter_size filter_data + multi_index * in_channels * filter_size
+ c * filter_size; + c * filter_size;
...@@ -115,10 +128,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context, ...@@ -115,10 +128,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
typedef DepthwiseConv2d<float> DepthwiseConv2dRef; void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dRef, delegator::DepthwiseConv2dParam, registry, DepthwiseConv2d<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, REF, General)) MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, DepthwiseConv2d<BFloat16>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
#define MACE_OPS_REF_DEPTHWISE_CONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference (scalar) CPU delegator for depthwise 2-D convolution.
// OUTPUT_TYPE selects the tensor element type; configuration comes from the
// DepthwiseConv2dParam handed to the constructor.
template<typename OUTPUT_TYPE>
class DepthwiseConv2d : public delegator::DepthwiseConv2d {
 public:
  explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
      : delegator::DepthwiseConv2d(param) {}
  ~DepthwiseConv2d() {}

  // Runs the depthwise convolution: input (x) filter -> output.
  // Returns MACE_SUCCESS on success.
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

// Full specialization for float; its Compute is defined out-of-line in the
// corresponding .cc file.
template<>
class DepthwiseConv2d<float> : public delegator::DepthwiseConv2d {
 public:
  explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
      : delegator::DepthwiseConv2d(param) {}
  ~DepthwiseConv2d() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
...@@ -15,18 +15,52 @@ ...@@ -15,18 +15,52 @@
#include <utility> #include <utility>
#include <memory> #include <memory>
#include <functional> #include <functional>
#include "mace/ops/ref/depthwise_deconv_2d.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, template<typename T>
const Tensor *input, class GroupDeconv2d : public delegator::GroupDeconv2d {
const Tensor *filter, public:
const Tensor *output_shape, explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
Tensor *output) { : delegator::GroupDeconv2d(param) {}
virtual ~GroupDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename T>
class DepthwiseDeconv2d : public GroupDeconv2d<T> {
public:
explicit DepthwiseDeconv2d<T>(const delegator::DepthwiseDeconv2dParam &param)
: GroupDeconv2d<T>(param) {}
~DepthwiseDeconv2d<T>() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename T>
MaceStatus DepthwiseDeconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
std::vector<index_t> out_shape; std::vector<index_t> out_shape;
...@@ -41,15 +75,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, ...@@ -41,15 +75,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
std::vector<int> out_pad_size; std::vector<int> out_pad_size;
CalDeconvOutputShapeAndPadSize(input->shape(), CalDeconvOutputShapeAndPadSize(input->shape(),
filter->shape(), filter->shape(),
strides_, GroupDeconv2d<T>::strides_,
padding_type_, GroupDeconv2d<T>::padding_type_,
paddings_, GroupDeconv2d<T>::paddings_,
input->dim(1), input->dim(1),
&out_shape, &out_shape,
nullptr, nullptr,
&out_pad_size, &out_pad_size,
&padded_out_shape, &padded_out_shape,
framework_type_, GroupDeconv2d<T>::framework_type_,
DataFormat::NCHW); DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape)); MACE_RETURN_IF_ERROR(output->Resize(out_shape));
...@@ -64,15 +98,14 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, ...@@ -64,15 +98,14 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(), std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(), padded_out_shape.end(),
1, 1,
std::multiplies<index_t>()) * sizeof(float); std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer(); ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind(); scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size); index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size); scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor> std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
padded_out scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
padded_out->Reshape(padded_out_shape); padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out); padded_output = std::move(padded_out);
} }
...@@ -87,10 +120,10 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, ...@@ -87,10 +120,10 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<float>(); auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<float>(); auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
...@@ -119,15 +152,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, ...@@ -119,15 +152,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
for (index_t b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) { for (index_t c = 0; c < channels; ++c) {
float *out_base = T *out_base =
pad_out_data + (b * channels + c) * out_img_size; pad_out_data + (b * channels + c) * out_img_size;
for (index_t i = 0; i < in_height; ++i) { for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) { for (index_t j = 0; j < in_width; ++j) {
const index_t out_offset = const index_t out_offset = i * GroupDeconv2d<T>::strides_[0] *
i * strides_[0] * pad_out_width + j * strides_[1]; pad_out_width + j * GroupDeconv2d<T>::strides_[1];
const index_t input_idx = const index_t input_idx =
(b * channels + c) * in_img_size + i * in_width + j; (b * channels + c) * in_img_size + i * in_width + j;
const float val = input_data[input_idx]; const T val = input_data[input_idx];
const index_t kernel_offset = c * kernel_size; const index_t kernel_offset = c * kernel_size;
for (int k = 0; k < kernel_size; ++k) { for (int k = 0; k < kernel_size; ++k) {
const index_t out_idx = out_offset + index_map[k]; const index_t out_idx = out_offset + index_map[k];
...@@ -143,13 +176,13 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, ...@@ -143,13 +176,13 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
for (index_t i = 0; i < batch; ++i) { for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < channels; ++j) { for (index_t j = 0; j < channels; ++j) {
for (index_t k = 0; k < out_height; ++k) { for (index_t k = 0; k < out_height; ++k) {
const float *input_base = const T *input_base =
pad_out_data pad_out_data
+ ((i * channels + j) * pad_out_height + (k + pad_top)) + ((i * channels + j) * pad_out_height + (k + pad_top))
* pad_out_width; * pad_out_width;
float *output_base = T *output_base =
out_data + ((i * channels + j) * out_height + k) * out_width; out_data + ((i * channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
} }
} }
} }
...@@ -157,11 +190,12 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context, ...@@ -157,11 +190,12 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context, template<typename T>
const Tensor *input, MaceStatus GroupDeconv2d<T>::Compute(const OpContext *context,
const Tensor *filter, const Tensor *input,
const Tensor *output_shape, const Tensor *filter,
Tensor *output) { const Tensor *output_shape,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
std::vector<index_t> out_shape; std::vector<index_t> out_shape;
...@@ -199,15 +233,14 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context, ...@@ -199,15 +233,14 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(), std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(), padded_out_shape.end(),
1, 1,
std::multiplies<index_t>()) * sizeof(float); std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer(); ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind(); scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size); index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size); scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor> std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
padded_out scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
padded_out->Reshape(padded_out_shape); padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out); padded_output = std::move(padded_out);
} }
...@@ -222,10 +255,10 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context, ...@@ -222,10 +255,10 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<float>(); auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<float>(); auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
...@@ -288,13 +321,13 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context, ...@@ -288,13 +321,13 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
for (int i = 0; i < batch; ++i) { for (int i = 0; i < batch; ++i) {
for (int j = 0; j < out_channels; ++j) { for (int j = 0; j < out_channels; ++j) {
for (int k = 0; k < out_height; ++k) { for (int k = 0; k < out_height; ++k) {
const float *input_base = const T *input_base =
pad_out_data pad_out_data
+ ((i * out_channels + j) * pad_out_height + (k + pad_top)) + ((i * out_channels + j) * pad_out_height + (k + pad_top))
* pad_out_width; * pad_out_width;
float *output_base = T *output_base =
out_data + ((i * out_channels + j) * out_height + k) * out_width; out_data + ((i * out_channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
} }
} }
} }
...@@ -302,10 +335,16 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context, ...@@ -302,10 +335,16 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
typedef DepthwiseDeconv2d<float> DepthwiseDeconv2dRef; void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dRef, delegator::DepthwiseDeconv2dParam, registry, DepthwiseDeconv2d<float>, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, REF, General)) MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, DepthwiseDeconv2d<BFloat16>, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
#define MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference (scalar) CPU delegator for grouped 2-D deconvolution
// (transposed convolution). OUTPUT_TYPE selects the tensor element type.
template<typename OUTPUT_TYPE>
class GroupDeconv2d : public delegator::GroupDeconv2d {
 public:
  explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
      : delegator::GroupDeconv2d(param) {}
  virtual ~GroupDeconv2d() = default;

  // Runs the grouped deconvolution: input (x) filter -> output.
  // output_shape presumably carries the requested output dimensions — confirm
  // against callers. Returns MACE_SUCCESS on success.
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

// Depthwise deconvolution built on top of GroupDeconv2d.
template<typename OUTPUT_TYPE>
class DepthwiseDeconv2d : public GroupDeconv2d<OUTPUT_TYPE> {
 public:
  // Fix: the parameter was previously declared as
  // `const delegator::DepthwiseDeconv2d &` (the delegator class itself),
  // which cannot initialize the GroupDeconv2dParam base expected by
  // GroupDeconv2d. It must be the param struct, matching the float
  // specialization below.
  explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
      : GroupDeconv2d<OUTPUT_TYPE>(param) {}
  ~DepthwiseDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

// Float specializations; their Compute methods are defined out-of-line in
// the corresponding .cc file.
template<>
class GroupDeconv2d<float> : public delegator::GroupDeconv2d {
 public:
  explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
      : delegator::GroupDeconv2d(param) {}
  virtual ~GroupDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

template<>
class DepthwiseDeconv2d<float> : public GroupDeconv2d<float> {
 public:
  explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
      : GroupDeconv2d(param) {}
  ~DepthwiseDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
...@@ -12,56 +12,93 @@ ...@@ -12,56 +12,93 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/delegator/gemm.h"
#include "mace/ops/ref/gemm.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
MaceStatus Gemm<float>::Compute(const OpContext *context, template<typename T>
const Tensor *lhs, class Gemm : public delegator::Gemm {
const Tensor *rhs, public:
const index_t batch, explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
const index_t rows, ~Gemm() {}
const index_t cols, MaceStatus Compute(const OpContext *context,
const index_t depth, const Tensor *lhs,
const MatrixMajor lhs_major, const Tensor *rhs,
const MatrixMajor rhs_major, const index_t batch,
const MatrixMajor output_major, const index_t rows,
const bool lhs_batched, const index_t cols,
const bool rhs_batched, const index_t depth,
Tensor *output) { const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
// Original matrix before transpose has row-major
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t lhs_rows,
const index_t lhs_cols,
const index_t rhs_rows,
const index_t rhs_cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool transpose_out,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
template<typename T>
MaceStatus Gemm<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t rows,
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context); MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs); Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs); Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>(); const T *lhs_data = lhs->data<T>();
const float *rhs_data = rhs->data<float>(); const T *rhs_data = rhs->data<T>();
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
MatrixMap<const float> MatrixMap<const T>
lhs_matrix lhs_matrix
(lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth, (lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
lhs_major, lhs_major,
rows, rows,
depth); depth);
MatrixMap<const float> MatrixMap<const T>
rhs_matrix rhs_matrix
(rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols, (rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
rhs_major, rhs_major,
depth, depth,
cols); cols);
MatrixMap<float> MatrixMap<T>
output_matrix(output_data + b * rows * cols, output_major, rows, cols); output_matrix(output_data + b * rows * cols, output_major, rows, cols);
for (index_t r = 0; r < rows; ++r) { for (index_t r = 0; r < rows; ++r) {
for (index_t c = 0; c < cols; ++c) { for (index_t c = 0; c < cols; ++c) {
float sum = 0; float sum = 0;
for (index_t d = 0; d < depth; ++d) { for (index_t d = 0; d < depth; ++d) {
sum += lhs_matrix(r, d) * rhs_matrix(d, c); sum += static_cast<float>(lhs_matrix(r, d)) *
static_cast<float>(rhs_matrix(d, c));
} // d } // d
*output_matrix.data(r, c) = sum; *output_matrix.data(r, c) = sum;
...@@ -72,20 +109,21 @@ MaceStatus Gemm<float>::Compute(const OpContext *context, ...@@ -72,20 +109,21 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Gemm<float>::Compute(const OpContext *context, template<typename T>
const Tensor *lhs, MaceStatus Gemm<T>::Compute(const OpContext *context,
const Tensor *rhs, const Tensor *lhs,
const index_t batch, const Tensor *rhs,
const index_t lhs_rows, const index_t batch,
const index_t lhs_cols, const index_t lhs_rows,
const index_t rhs_rows, const index_t lhs_cols,
const index_t rhs_cols, const index_t rhs_rows,
const bool transpose_lhs, const index_t rhs_cols,
const bool transpose_rhs, const bool transpose_lhs,
const bool transpose_out, const bool transpose_rhs,
const bool lhs_batched, const bool transpose_out,
const bool rhs_batched, const bool lhs_batched,
Tensor *output) { const bool rhs_batched,
Tensor *output) {
index_t rows = transpose_lhs ? lhs_cols : lhs_rows; index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
index_t depth = transpose_lhs ? lhs_rows : lhs_cols; index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
index_t cols = transpose_rhs ? rhs_rows : rhs_cols; index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
...@@ -96,24 +134,29 @@ MaceStatus Gemm<float>::Compute(const OpContext *context, ...@@ -96,24 +134,29 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
" vs. ", " vs. ",
depth2); depth2);
return Compute(context, return Gemm<T>::Compute(context,
lhs, lhs,
rhs, rhs,
batch, batch,
rows, rows,
cols, cols,
depth, depth,
transpose_lhs ? ColMajor : RowMajor, transpose_lhs ? ColMajor : RowMajor,
transpose_rhs ? ColMajor : RowMajor, transpose_rhs ? ColMajor : RowMajor,
transpose_out ? ColMajor : RowMajor, transpose_out ? ColMajor : RowMajor,
lhs_batched, lhs_batched,
rhs_batched, rhs_batched,
output); output);
} }
typedef Gemm<float> GemmRef; void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(registry, GemmRef, delegator::GemmParam, MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY(Gemm, CPU, float, REF)) registry, Gemm<float>, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Gemm<BFloat16>, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_GEMM_H_
#define MACE_OPS_REF_GEMM_H_

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/matrix.h"
#include "mace/ops/delegator/gemm.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference (scalar, triple-loop) CPU delegator for batched GEMM:
// output[b] = lhs[b] * rhs[b]. OUTPUT_TYPE selects the tensor element type.
template<typename OUTPUT_TYPE>
class Gemm : public delegator::Gemm {
 public:
  explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
  ~Gemm() {}

  // Multiplies lhs (rows x depth) by rhs (depth x cols) per batch.
  // The *_major arguments give the storage order of each matrix; the
  // lhs_batched/rhs_batched flags tell whether that operand carries a batch
  // dimension or is shared across batches.
  MaceStatus Compute(const OpContext *context,
                     const Tensor *lhs,
                     const Tensor *rhs,
                     const index_t batch,
                     const index_t rows,
                     const index_t cols,
                     const index_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     Tensor *output) override;
};

// Full specialization for float; its Compute overloads are defined
// out-of-line in the corresponding .cc file.
template<>
class Gemm<float> : public delegator::Gemm {
 public:
  explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
  ~Gemm() {}

  MaceStatus Compute(const OpContext *context,
                     const Tensor *lhs,
                     const Tensor *rhs,
                     const index_t batch,
                     const index_t rows,
                     const index_t cols,
                     const index_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     Tensor *output) override;

  // Convenience overload taking raw (pre-transpose, row-major) dimensions
  // plus transpose flags; it maps them onto the major-order overload above.
  // Original matrix before transpose has row-major
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *lhs,
      const Tensor *rhs,
      const index_t batch,
      const index_t lhs_rows,
      const index_t lhs_cols,
      const index_t rhs_rows,
      const index_t rhs_cols,
      const bool transpose_lhs,
      const bool transpose_rhs,
      const bool transpose_out,
      const bool lhs_batched,
      const bool rhs_batched,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_GEMM_H_
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
#include "mace/ops/ref/gemv.h" #include "mace/ops/delegator/gemv.h"
#if defined(MACE_ENABLE_QUANTIZE) #if defined(MACE_ENABLE_QUANTIZE)
#include "mace/core/quantize.h" #include "mace/core/quantize.h"
...@@ -23,7 +23,27 @@ namespace mace { ...@@ -23,7 +23,27 @@ namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
MaceStatus Gemv<float>::Compute(const OpContext *context, template<typename T>
class Gemv : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
template<typename T>
MaceStatus Gemv<T>::Compute(const OpContext *context,
const Tensor *lhs, const Tensor *lhs,
const Tensor *rhs, const Tensor *rhs,
const Tensor *bias, const Tensor *bias,
...@@ -39,18 +59,18 @@ MaceStatus Gemv<float>::Compute(const OpContext *context, ...@@ -39,18 +59,18 @@ MaceStatus Gemv<float>::Compute(const OpContext *context,
Tensor::MappingGuard rhs_guard(rhs); Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias); Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>(); const T *lhs_data = lhs->data<T>();
const float *rhs_data = rhs->data<float>(); const T *rhs_data = rhs->data<T>();
const float *bias_data = nullptr; const T *bias_data = nullptr;
if (bias) { if (bias) {
bias_data = bias->data<float>(); bias_data = bias->data<T>();
} }
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) { for (index_t h = 0; h < lhs_height; ++h) {
float sum = bias ? bias_data[h] : 0; float sum = bias ? static_cast<float>(bias_data[h]) : 0.f;
for (index_t w = 0; w < lhs_width; ++w) { for (index_t w = 0; w < lhs_width; ++w) {
sum += lhs_data[ sum += lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
...@@ -65,110 +85,15 @@ MaceStatus Gemv<float>::Compute(const OpContext *context, ...@@ -65,110 +85,15 @@ MaceStatus Gemv<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
#if defined(MACE_ENABLE_QUANTIZE) void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MaceStatus Gemv<uint8_t>::Compute(const OpContext *context, MACE_REGISTER_DELEGATOR(
const Tensor *lhs, registry, Gemv<float>, DelegatorParam,
const Tensor *rhs, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::REF));
const Tensor *bias, MACE_REGISTER_BF16_DELEGATOR(
const index_t batch, registry, Gemv<BFloat16>, DelegatorParam,
const index_t lhs_height, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, BFloat16, ImplType::REF));
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
uint8_t *output_data = output->mutable_data<uint8_t>();
MACE_CHECK(output->scale() > 0, "output scale must not be zero");
const float
output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] =
Saturate<uint8_t>(std::roundf(sum * output_multiplier_float));
} // h
} // b
return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
int32_t *output_data = output->mutable_data<int32_t>();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] = sum;
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
typedef Gemv<uint8_t> GemvUint8Ref;
MACE_REGISTER_DELEGATOR(registry, GemvUint8Ref, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, Ref))
#endif // MACE_ENABLE_QUANTIZE
typedef Gemv<float> GemvRef;
MACE_REGISTER_DELEGATOR(registry, GemvRef, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, float, REF))
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -107,8 +107,11 @@ MaceStatus Eltwise::Compute(const OpContext *context, ...@@ -107,8 +107,11 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam, void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) {
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, REF)) MACE_REGISTER_DELEGATOR(
registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::REF));
}
} // namespace q8 } // namespace q8
} // namespace ref } // namespace ref
......
...@@ -12,58 +12,33 @@ ...@@ -12,58 +12,33 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/core/quantize.h"
#ifndef MACE_OPS_REF_GEMV_H_
#define MACE_OPS_REF_GEMV_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/delegator/gemv.h" #include "mace/ops/delegator/gemv.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace ref { namespace ref {
namespace q8 {
template<typename OUTPUT_TYPE> template<typename T>
class Gemv : public delegator::Gemv { class Gemv : public delegator::Gemv {
public: public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {} explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {} ~Gemv() {}
// Always row-major after transpose // Always row-major after transpose
MaceStatus Compute( MaceStatus Compute(
const OpContext *context, const OpContext *context,
const Tensor *lhs, const Tensor *lhs,
const Tensor *rhs, const Tensor *rhs,
const Tensor *bias, const Tensor *bias,
const index_t batch, const index_t batch,
const index_t lhs_height, const index_t lhs_height,
const index_t lhs_width, const index_t lhs_width,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output) override; Tensor *output) override;
};
template<>
class Gemv<float> : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
}; };
#if defined(MACE_ENABLE_QUANTIZE)
template<> template<>
class Gemv<uint8_t> : public delegator::Gemv { class Gemv<uint8_t> : public delegator::Gemv {
public: public:
...@@ -101,11 +76,111 @@ class Gemv<int32_t> : public delegator::Gemv { ...@@ -101,11 +76,111 @@ class Gemv<int32_t> : public delegator::Gemv {
const bool rhs_batched, const bool rhs_batched,
Tensor *output) override; Tensor *output) override;
}; };
#endif // MACE_ENABLE_QUANTIZE
MaceStatus Gemv<uint8_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
uint8_t *output_data = output->mutable_data<uint8_t>();
MACE_CHECK(output->scale() > 0, "output scale must not be zero");
const float
output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] =
Saturate<uint8_t>(std::roundf(sum * output_multiplier_float));
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
int32_t *output_data = output->mutable_data<int32_t>();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] = sum;
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv<uint8_t>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::REF));
MACE_REGISTER_DELEGATOR(
registry, Gemv<int32_t>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::REF));
}
} // namespace q8
} // namespace ref } // namespace ref
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_REF_GEMV_H_
...@@ -20,19 +20,18 @@ namespace ops { ...@@ -20,19 +20,18 @@ namespace ops {
namespace ref { namespace ref {
extern void RegisterActivationDelegator(OpDelegatorRegistry *registry); extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry); extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dRefDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dRefDelegator(OpDelegatorRegistry *registry); extern void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dRefDelegator(OpDelegatorRegistry *registry); extern void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dRefDelegator( extern void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry);
OpDelegatorRegistry *registry); extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemmRefDelegator(OpDelegatorRegistry *registry); extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvRefDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
namespace q8 { namespace q8 {
extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry); extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
} // namespace q8 } // namespace q8
extern void RegisterGemvUint8RefDelegator(OpDelegatorRegistry *registry);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
} // namespace ref } // namespace ref
...@@ -43,43 +42,26 @@ extern void RegisterActivationDelegator(OpDelegatorRegistry *registry); ...@@ -43,43 +42,26 @@ extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry); extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x7S1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x1S1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x15S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK15x1S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5S1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2S1Delegator(OpDelegatorRegistry *registry); extern void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2S2Delegator(OpDelegatorRegistry *registry); extern void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry); extern void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry); extern void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dK3x3S1Delegator( extern void RegisterDepthwiseConv2dK3x3Delegator(
OpDelegatorRegistry *registry); OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dK3x3S2Delegator( extern void RegisterDepthwiseDeconv2dK3x3Delegator(
OpDelegatorRegistry *registry); OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK3x3S1Delegator( extern void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4Delegator(
OpDelegatorRegistry *registry); OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK3x3S2Delegator( extern void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4S1Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4S2Delegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dGeneralDelegator( extern void RegisterDepthwiseDeconv2dGeneralDelegator(
OpDelegatorRegistry *registry); OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dGeneralDelegator( extern void RegisterGroupDeconv2dGeneralDelegator(
...@@ -92,8 +74,7 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); ...@@ -92,8 +74,7 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
namespace q8 { namespace q8 {
extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry); extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvUint8Delegator(OpDelegatorRegistry *registry); extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry);
} // namespace q8 } // namespace q8
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
...@@ -103,16 +84,16 @@ extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry); ...@@ -103,16 +84,16 @@ extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry);
void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
ref::RegisterActivationDelegator(registry); ref::RegisterActivationDelegator(registry);
ref::RegisterBiasAddDelegator(registry); ref::RegisterBiasAddDelegator(registry);
ref::RegisterConv2dRefDelegator(registry); ref::RegisterConv2dDelegator(registry);
ref::RegisterDeconv2dRefDelegator(registry); ref::RegisterDeconv2dDelegator(registry);
ref::RegisterDepthwiseConv2dRefDelegator(registry); ref::RegisterDepthwiseConv2dDelegator(registry);
ref::RegisterDepthwiseDeconv2dRefDelegator(registry); ref::RegisterDepthwiseDeconv2dDelegator(registry);
ref::RegisterGemmRefDelegator(registry); ref::RegisterGemmDelegator(registry);
ref::RegisterGemvRefDelegator(registry); ref::RegisterGemvDelegator(registry);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
ref::q8::RegisterEltwiseDelegator(registry); ref::q8::RegisterEltwiseDelegator(registry);
ref::RegisterGemvUint8RefDelegator(registry); ref::q8::RegisterGemvDelegator(registry);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_NEON #ifdef MACE_ENABLE_NEON
...@@ -120,37 +101,23 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { ...@@ -120,37 +101,23 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
arm::fp32::RegisterBiasAddDelegator(registry); arm::fp32::RegisterBiasAddDelegator(registry);
arm::fp32::RegisterConv2dK1x1Delegator(registry); arm::fp32::RegisterConv2dK1x1Delegator(registry);
arm::fp32::RegisterConv2dK1x7S1Delegator(registry); arm::fp32::RegisterConv2dK1xNDelegator(registry);
arm::fp32::RegisterConv2dK7x1S1Delegator(registry); arm::fp32::RegisterConv2dK3x3Delegator(registry);
arm::fp32::RegisterConv2dK1x15S1Delegator(registry);
arm::fp32::RegisterConv2dK15x1S1Delegator(registry);
arm::fp32::RegisterConv2dK3x3S1Delegator(registry);
arm::fp32::RegisterConv2dK3x3S2Delegator(registry);
arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry); arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
arm::fp32::RegisterConv2dK5x5S1Delegator(registry); arm::fp32::RegisterConv2dK5x5Delegator(registry);
arm::fp32::RegisterConv2dK7x7S1Delegator(registry); arm::fp32::RegisterConv2dK7x7Delegator(registry);
arm::fp32::RegisterConv2dK7x7S2Delegator(registry);
arm::fp32::RegisterConv2dK7x7S3Delegator(registry);
arm::fp32::RegisterConv2dGeneralDelegator(registry); arm::fp32::RegisterConv2dGeneralDelegator(registry);
arm::fp32::RegisterDeconv2dK2x2S1Delegator(registry); arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
arm::fp32::RegisterDeconv2dK2x2S2Delegator(registry); arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3S1Delegator(registry); arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterDeconv2dGeneralDelegator(registry); arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3S1Delegator(registry); arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3S2Delegator(registry); arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3S1Delegator(registry); arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3S2Delegator(registry); arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3S1Delegator(registry); arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry); arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry); arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
...@@ -159,8 +126,7 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { ...@@ -159,8 +126,7 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
arm::q8::RegisterEltwiseDelegator(registry); arm::q8::RegisterEltwiseDelegator(registry);
arm::q8::RegisterGemvUint8Delegator(registry); arm::q8::RegisterGemvDelegator(registry);
arm::q8::RegisterGemvInt32Delegator(registry);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#endif // MACE_ENABLE_NEON #endif // MACE_ENABLE_NEON
......
...@@ -98,6 +98,8 @@ class ReplaceIndexOp<DeviceType::CPU, T> : public Operation { ...@@ -98,6 +98,8 @@ class ReplaceIndexOp<DeviceType::CPU, T> : public Operation {
void RegisterReplaceIndex(OpRegistry *op_registry) { void RegisterReplaceIndex(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp, MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -152,6 +152,7 @@ class ReshapeOp<GPU, float> : public Operation { ...@@ -152,6 +152,7 @@ class ReshapeOp<GPU, float> : public Operation {
void RegisterReshape(OpRegistry *op_registry) { void RegisterReshape(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, int32_t); MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, int32_t);
MACE_REGISTER_GPU_OP(op_registry, "Reshape", ReshapeOp); MACE_REGISTER_GPU_OP(op_registry, "Reshape", ReshapeOp);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
......
...@@ -57,15 +57,7 @@ inline T ComputeLerp(const T top_left, ...@@ -57,15 +57,7 @@ inline T ComputeLerp(const T top_left,
const T bottom_left, const T bottom_left,
const T bottom_right, const T bottom_right,
const float x_lerp, const float x_lerp,
const float y_lerp); const float y_lerp) {
template<>
inline float ComputeLerp<float>(const float top_left,
const float top_right,
const float bottom_left,
const float bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp; const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return top + (bottom - top) * y_lerp; return top + (bottom - top) * y_lerp;
...@@ -370,6 +362,8 @@ class ResizeBilinearOp<DeviceType::GPU, float> : public Operation { ...@@ -370,6 +362,8 @@ class ResizeBilinearOp<DeviceType::GPU, float> : public Operation {
void RegisterResizeBilinear(OpRegistry *op_registry) { void RegisterResizeBilinear(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
......
...@@ -176,6 +176,8 @@ class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation { ...@@ -176,6 +176,8 @@ class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
void RegisterResizeNearestNeighbor(OpRegistry *op_registry) { void RegisterResizeNearestNeighbor(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU, float); ResizeNearestNeighborOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor", MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp); ResizeNearestNeighborOp);
......
...@@ -76,6 +76,8 @@ class ReverseOp<DeviceType::CPU, T> : public Operation { ...@@ -76,6 +76,8 @@ class ReverseOp<DeviceType::CPU, T> : public Operation {
void RegisterReverse(OpRegistry *op_registry) { void RegisterReverse(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp, MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reverse", ReverseOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -158,6 +158,8 @@ class ScalarMathOp : public Operation { ...@@ -158,6 +158,8 @@ class ScalarMathOp : public Operation {
void RegisterScalarMath(OpRegistry *op_registry) { void RegisterScalarMath(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
} }
......
...@@ -22,8 +22,8 @@ namespace ops { ...@@ -22,8 +22,8 @@ namespace ops {
template<DeviceType D, typename T> template<DeviceType D, typename T>
class SelectOp; class SelectOp;
template<> template<class T>
class SelectOp<DeviceType::CPU, float> : public Operation { class SelectOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit SelectOp(OpConstructContext *context) explicit SelectOp(OpConstructContext *context)
: Operation(context) {} : Operation(context) {}
...@@ -41,7 +41,7 @@ class SelectOp<DeviceType::CPU, float> : public Operation { ...@@ -41,7 +41,7 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
const index_t condition_rank = condition->dim_size(); const index_t condition_rank = condition->dim_size();
MACE_RETURN_IF_ERROR(output->Resize({condition->size(), condition_rank})); MACE_RETURN_IF_ERROR(output->Resize({condition->size(), condition_rank}));
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
const bool *condition_data = condition->data<bool>(); const bool *condition_data = condition->data<bool>();
index_t i = 0; index_t i = 0;
...@@ -161,10 +161,10 @@ class SelectOp<DeviceType::CPU, float> : public Operation { ...@@ -161,10 +161,10 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->Resize(x->shape())); MACE_RETURN_IF_ERROR(output->Resize(x->shape()));
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
const bool *condition_data = condition->data<bool>(); const bool *condition_data = condition->data<bool>();
const float *x_data = x->data<float>(); const T *x_data = x->data<T>();
const float *y_data = y->data<float>(); const T *y_data = y->data<T>();
const index_t condition_size = condition->size(); const index_t condition_size = condition->size();
const index_t x_size = x->size(); const index_t x_size = x->size();
...@@ -182,7 +182,7 @@ class SelectOp<DeviceType::CPU, float> : public Operation { ...@@ -182,7 +182,7 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
MACE_ASSERT( MACE_ASSERT(
block_size > 1 && x_size % condition_size == 0, block_size > 1 && x_size % condition_size == 0,
"x_size should be a multiple of condition_size and greater than 1"); "x_size should be a multiple of condition_size and greater than 1");
const auto raw_block_size = block_size * sizeof(float); const auto raw_block_size = block_size * sizeof(T);
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
auto offset = block_size * k; auto offset = block_size * k;
...@@ -208,6 +208,8 @@ class SelectOp<DeviceType::CPU, float> : public Operation { ...@@ -208,6 +208,8 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
void RegisterSelect(OpRegistry *op_registry) { void RegisterSelect(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Select", SelectOp, MACE_REGISTER_OP(op_registry, "Select", SelectOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Select", SelectOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -62,6 +62,8 @@ class ShapeOp : public Operation { ...@@ -62,6 +62,8 @@ class ShapeOp : public Operation {
void RegisterShape(OpRegistry *op_registry) { void RegisterShape(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Shape", ShapeOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -87,6 +87,8 @@ class SliceOp<DeviceType::CPU, T> : public Operation { ...@@ -87,6 +87,8 @@ class SliceOp<DeviceType::CPU, T> : public Operation {
void RegisterSlice(OpRegistry *op_registry) { void RegisterSlice(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Slice", SliceOp, MACE_REGISTER_OP(op_registry, "Slice", SliceOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Slice", SliceOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -39,8 +39,8 @@ namespace ops { ...@@ -39,8 +39,8 @@ namespace ops {
template<DeviceType D, typename T> template<DeviceType D, typename T>
class SoftmaxOp; class SoftmaxOp;
template<> template<class T>
class SoftmaxOp<DeviceType::CPU, float> : public Operation { class SoftmaxOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -71,9 +71,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -71,9 +71,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
protected: protected:
MaceStatus RunForNCHW(OpContext *context) { MaceStatus RunForNCHW(OpContext *context) {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
const float *input_data = input->data<float>(); const T *input_data = input->data<T>();
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
MACE_CHECK(input->dim_size() == 4, "The dim size of NCHW should be 4."); MACE_CHECK(input->dim_size() == 4, "The dim size of NCHW should be 4.");
index_t hw_stride = input->dim(3); index_t hw_stride = input->dim(3);
...@@ -93,8 +93,8 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -93,8 +93,8 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t b_offset = 0; for (index_t b_offset = 0;
b_offset < batch_size; b_offset += batch_stride) { b_offset < batch_size; b_offset += batch_stride) {
const float *input_b_base = input_data + b_offset; const T *input_b_base = input_data + b_offset;
float *output_b_base = output_data + b_offset; T *output_b_base = output_data + b_offset;
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
const auto raw_step_size = step * sizeof(float); const auto raw_step_size = step * sizeof(float);
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
...@@ -106,9 +106,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -106,9 +106,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t c_offset = 0; c_offset < class_size; for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) { c_offset += class_stride) {
const float *input_c_base = input_b_base + c_offset; const T *input_c_base = input_b_base + c_offset;
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
const float *input_ptr = input_c_base + k; const T *input_ptr = input_c_base + k;
float *cache_k_ptr = cache_ptr + k; float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) { for (index_t i = 0; i < step; ++i) {
cache_k_ptr[i] = std::max(cache_k_ptr[i], input_ptr[i]); cache_k_ptr[i] = std::max(cache_k_ptr[i], input_ptr[i]);
...@@ -118,14 +118,14 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -118,14 +118,14 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t c_offset = 0; c_offset < class_size; for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) { c_offset += class_stride) {
const float *input_c_base = input_b_base + c_offset; const T *input_c_base = input_b_base + c_offset;
float *output_c_base = output_b_base + c_offset; T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
const float *input_ptr = input_c_base + k; const T *input_ptr = input_c_base + k;
float *output_ptr = output_c_base + k; T *output_ptr = output_c_base + k;
float *cache_k_ptr = cache_ptr + k; float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) { for (index_t i = 0; i < step; ++i) {
output_ptr[i] = ::exp(input_ptr[i] - cache_k_ptr[i]); output_ptr[i] = std::exp(input_ptr[i] - cache_k_ptr[i]);
} }
} }
} }
...@@ -136,24 +136,24 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -136,24 +136,24 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t c_offset = 0; c_offset < class_size; for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) { c_offset += class_stride) {
float *output_c_base = output_b_base + c_offset; T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
float *output_ptr = output_c_base + k; T *output_ptr = output_c_base + k;
float *cache_k_ptr = cache_ptr + k; float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) { for (index_t i = 0; i < step; ++i) {
cache_k_ptr[i] += output_ptr[i]; cache_k_ptr[i] += static_cast<float>(output_ptr[i]);
} }
} }
} }
for (index_t c_offset = 0; c_offset < class_size; for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) { c_offset += class_stride) {
float *output_c_base = output_b_base + c_offset; T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
float *output_ptr = output_c_base + k; T *output_ptr = output_c_base + k;
float *cache_k_ptr = cache_ptr + k; float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) { for (index_t i = 0; i < step; ++i) {
output_ptr[i] = output_ptr[i] / cache_k_ptr[i]; output_ptr[i] /= cache_k_ptr[i];
} }
} }
} }
...@@ -161,9 +161,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -161,9 +161,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
if (use_log_) { if (use_log_) {
for (index_t c_offset = 0; c_offset < class_size; for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) { c_offset += class_stride) {
float *output_c_base = output_b_base + c_offset; T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
float *output_ptr = output_c_base + k; T *output_ptr = output_c_base + k;
for (index_t i = 0; i < step; ++i) { for (index_t i = 0; i < step; ++i) {
output_ptr[i] = std::log(output_ptr[i]); output_ptr[i] = std::log(output_ptr[i]);
} }
...@@ -179,7 +179,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -179,7 +179,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
MaceStatus RunForNHWC(OpContext *context) { MaceStatus RunForNHWC(OpContext *context) {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
MACE_CHECK(input->dim_size() >= 2, "The input->dim_size() >= 2 failed."); MACE_CHECK(input->dim_size() >= 2, "The input->dim_size() >= 2 failed.");
index_t class_size = input->dim(input->dim_size() - 1); index_t class_size = input->dim(input->dim_size() - 1);
...@@ -196,16 +196,16 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -196,16 +196,16 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
utils::ThreadPool utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool(); &thread_pool = context->device()->cpu_runtime()->thread_pool();
const float *input_data = input->data<float>(); const T *input_data = input->data<T>();
float std_lowest = std::numeric_limits<float>::lowest(); float std_lowest = std::numeric_limits<float>::lowest();
for (index_t b_offset = 0; b_offset < batch_size; for (index_t b_offset = 0; b_offset < batch_size;
b_offset += batch_stride) { b_offset += batch_stride) {
const float *input_b_ptr = input_data + b_offset; const T *input_b_ptr = input_data + b_offset;
float *output_b_ptr = output_data + b_offset; T *output_b_ptr = output_data + b_offset;
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
for (index_t k = start; k < end; k += step) { for (index_t k = start; k < end; k += step) {
const float *input_ptr = input_b_ptr + k; const T *input_ptr = input_b_ptr + k;
float *output_ptr = output_b_ptr + k; T *output_ptr = output_b_ptr + k;
float max_val = std_lowest; float max_val = std_lowest;
for (index_t c = 0; c < class_size; ++c) { for (index_t c = 0; c < class_size; ++c) {
...@@ -214,15 +214,15 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation { ...@@ -214,15 +214,15 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
float sum = 0; float sum = 0;
for (index_t c = 0; c < class_size; ++c) { for (index_t c = 0; c < class_size; ++c) {
float exp_value = ::exp(input_ptr[c] - max_val); float exp_value = std::exp(input_ptr[c] - max_val);
sum += exp_value; sum += exp_value;
output_ptr[c] = exp_value; output_ptr[c] = exp_value;
} }
if (use_log_) { if (use_log_) {
for (index_t c = 0; c < class_size; ++c) { for (index_t c = 0; c < class_size; ++c) {
output_ptr[c] /= sum; float output = (static_cast<float>(output_ptr[c])) / sum;
output_ptr[c] = std::log(output_ptr[c]); output_ptr[c] = std::log(output);
} }
} else { } else {
for (index_t c = 0; c < class_size; ++c) { for (index_t c = 0; c < class_size; ++c) {
...@@ -306,8 +306,8 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -306,8 +306,8 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
float sum = 0; float sum = 0;
std::vector<float> depth_cache(depth); std::vector<float> depth_cache(depth);
for (index_t d = 0; d < depth; ++d) { for (index_t d = 0; d < depth; ++d) {
float exp_value = ::exp((static_cast<int>(input_ptr[d]) - max_value) float exp_value = std::exp(
* input_scale); (static_cast<int>(input_ptr[d]) - max_value) * input_scale);
sum += exp_value; sum += exp_value;
depth_cache[d] = exp_value; depth_cache[d] = exp_value;
} }
...@@ -524,6 +524,8 @@ class SoftmaxOp<DeviceType::GPU, float> : public Operation { ...@@ -524,6 +524,8 @@ class SoftmaxOp<DeviceType::GPU, float> : public Operation {
void RegisterSoftmax(OpRegistry *op_registry) { void RegisterSoftmax(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
......
...@@ -90,8 +90,8 @@ class SpaceToBatchOpBase : public Operation { ...@@ -90,8 +90,8 @@ class SpaceToBatchOpBase : public Operation {
template<DeviceType D, class T> template<DeviceType D, class T>
class SpaceToBatchNDOp; class SpaceToBatchNDOp;
template<> template<class T>
class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase { class SpaceToBatchNDOp<DeviceType::CPU, T> : public SpaceToBatchOpBase {
public: public:
explicit SpaceToBatchNDOp(OpConstructContext *context) explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) {} : SpaceToBatchOpBase(context) {}
...@@ -115,8 +115,8 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase { ...@@ -115,8 +115,8 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
int block_shape_h = block_shape_[0]; int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1]; int block_shape_w = block_shape_[1];
const float *input_data = space_tensor->data<float>(); const T *input_data = space_tensor->data<T>();
float *output_data = batch_tensor->mutable_data<float>(); T *output_data = batch_tensor->mutable_data<T>();
index_t in_batches = space_tensor->dim(0); index_t in_batches = space_tensor->dim(0);
index_t in_height = space_tensor->dim(2); index_t in_height = space_tensor->dim(2);
...@@ -158,20 +158,20 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase { ...@@ -158,20 +158,20 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
(in_width + pad_left - tile_w (in_width + pad_left - tile_w
+ block_shape_w - 1) + block_shape_w - 1)
/ block_shape_w); / block_shape_w);
const float *input_base = const T *input_base =
input_data + (in_b * channels + c) * in_height * in_width; input_data + (in_b * channels + c) * in_height * in_width;
float *output_base = T *output_base =
output_data + (b * channels + c) * out_height * out_width; output_data + (b * channels + c) * out_height * out_width;
memset(output_base + block_h * out_width, memset(output_base + block_h * out_width,
0, 0,
(valid_h_start - block_h) * out_width * sizeof(float)); (valid_h_start - block_h) * out_width * sizeof(T));
index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top; index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t h = valid_h_start; h < valid_h_end; ++h) { for (index_t h = valid_h_start; h < valid_h_end; ++h) {
memset(output_base + h * out_width, memset(output_base + h * out_width,
0, 0,
valid_w_start * sizeof(float)); valid_w_start * sizeof(T));
index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left; index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) { for (index_t w = valid_w_start; w < valid_w_end; ++w) {
...@@ -183,13 +183,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase { ...@@ -183,13 +183,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
memset(output_base + h * out_width + valid_w_end, memset(output_base + h * out_width + valid_w_end,
0, 0,
(out_width - valid_w_end) * sizeof(float)); (out_width - valid_w_end) * sizeof(T));
} // h } // h
memset(output_base + valid_h_end * out_width, memset(output_base + valid_h_end * out_width,
0, 0,
(std::min(out_height, block_h + block_h_size) - valid_h_end) (std::min(out_height, block_h + block_h_size) - valid_h_end)
* out_width * sizeof(float)); * out_width * sizeof(T));
} // b } // b
} // block_h } // block_h
} // c } // c
...@@ -332,6 +332,8 @@ class SpaceToBatchNDOp<DeviceType::GPU, float> : public SpaceToBatchOpBase { ...@@ -332,6 +332,8 @@ class SpaceToBatchNDOp<DeviceType::GPU, float> : public SpaceToBatchOpBase {
void RegisterSpaceToBatchND(OpRegistry *op_registry) { void RegisterSpaceToBatchND(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToBatchND", MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::CPU, float); SpaceToBatchNDOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "SpaceToBatchND", MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
......
...@@ -28,8 +28,8 @@ namespace ops { ...@@ -28,8 +28,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class SpaceToDepthOp; class SpaceToDepthOp;
template<> template<class T>
class SpaceToDepthOp<CPU, float> : public Operation { class SpaceToDepthOp<CPU, T> : public Operation {
public: public:
explicit SpaceToDepthOp(OpConstructContext *context) explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -59,8 +59,8 @@ class SpaceToDepthOp<CPU, float> : public Operation { ...@@ -59,8 +59,8 @@ class SpaceToDepthOp<CPU, float> : public Operation {
Tensor::MappingGuard logits_guard(input); Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *input_ptr = input->data<float>(); const T *input_ptr = input->data<T>();
float *output_ptr = output->mutable_data<float>(); T *output_ptr = output->mutable_data<T>();
for (index_t b = 0; b < batch_size; ++b) { for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) { for (index_t d = 0; d < input_depth; ++d) {
...@@ -184,6 +184,8 @@ class SpaceToDepthOp<DeviceType::GPU, float> : public Operation { ...@@ -184,6 +184,8 @@ class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
void RegisterSpaceToDepth(OpRegistry *op_registry) { void RegisterSpaceToDepth(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToDepth", MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, float); SpaceToDepthOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "SpaceToDepth", MACE_REGISTER_OP(op_registry, "SpaceToDepth",
......
...@@ -157,6 +157,8 @@ class SpliceOp<DeviceType::CPU, T> : public Operation { ...@@ -157,6 +157,8 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
void RegisterSplice(OpRegistry *op_registry) { void RegisterSplice(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Splice", SpliceOp, MACE_REGISTER_OP(op_registry, "Splice", SpliceOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Splice", SpliceOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -130,8 +130,8 @@ class SplitOp<DeviceType::GPU, float> : public Operation { ...@@ -130,8 +130,8 @@ class SplitOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterSplit(OpRegistry *op_registry) { void RegisterSplit(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Split", SplitOp, MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float);
DeviceType::CPU, float); MACE_REGISTER_BF16_OP(op_registry, "Split", SplitOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp); MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp);
......
...@@ -67,9 +67,10 @@ class SqrDiffMeanOp : public Operation { ...@@ -67,9 +67,10 @@ class SqrDiffMeanOp : public Operation {
const index_t img_size = input0->dim(2) * input0->dim(3); const index_t img_size = input0->dim(2) * input0->dim(3);
const index_t bc = input0->dim(0) * input0->dim(1); const index_t bc = input0->dim(0) * input0->dim(1);
// TODO(luxuhui): cache the output_ptr[i]
for (int i = 0; i < bc; ++i) { for (int i = 0; i < bc; ++i) {
for (int j = 0; j < img_size; ++j) { for (int j = 0; j < img_size; ++j) {
T diff = input_ptr0[i * img_size + j] - input_ptr1[i]; float diff = input_ptr0[i * img_size + j] - input_ptr1[i];
output_ptr[i] += diff * diff; output_ptr[i] += diff * diff;
} }
output_ptr[i] /= img_size; output_ptr[i] /= img_size;
...@@ -104,6 +105,8 @@ class SqrDiffMeanOp<DeviceType::GPU, float> : public Operation { ...@@ -104,6 +105,8 @@ class SqrDiffMeanOp<DeviceType::GPU, float> : public Operation {
void RegisterSqrDiffMean(OpRegistry *op_registry) { void RegisterSqrDiffMean(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp); MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp);
} }
......
...@@ -80,6 +80,7 @@ class SqueezeOp : public SqueezeOpRaw { ...@@ -80,6 +80,7 @@ class SqueezeOp : public SqueezeOpRaw {
void RegisterSqueeze(OpRegistry *op_registry) { void RegisterSqueeze(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
......
...@@ -80,6 +80,7 @@ class StackOp : public Operation { ...@@ -80,6 +80,7 @@ class StackOp : public Operation {
void RegisterStack(OpRegistry *op_registry) { void RegisterStack(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Stack", StackOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t); MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t);
} }
......
...@@ -354,6 +354,8 @@ class StridedSliceOp : public Operation { ...@@ -354,6 +354,8 @@ class StridedSliceOp : public Operation {
void RegisterStridedSlice(OpRegistry *op_registry) { void RegisterStridedSlice(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
} }
......
...@@ -104,6 +104,8 @@ class SubsampleOp<DeviceType::CPU, T> : public Operation { ...@@ -104,6 +104,8 @@ class SubsampleOp<DeviceType::CPU, T> : public Operation {
void RegisterSubsample(OpRegistry *op_registry) { void RegisterSubsample(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp, MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Subsample", SubsampleOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -105,6 +105,8 @@ class SumGroupOp<DeviceType::CPU, T> : public Operation { ...@@ -105,6 +105,8 @@ class SumGroupOp<DeviceType::CPU, T> : public Operation {
void RegisterSumGroup(OpRegistry *op_registry) { void RegisterSumGroup(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp, MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SumGroup", SumGroupOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -152,6 +152,8 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation { ...@@ -152,6 +152,8 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
void RegisterTargetRMSNorm(OpRegistry *op_registry) { void RegisterTargetRMSNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp, MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -113,6 +113,7 @@ class TileOp : public Operation { ...@@ -113,6 +113,7 @@ class TileOp : public Operation {
void RegisterTile(OpRegistry *op_registry) { void RegisterTile(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Tile", TileOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Tile", TileOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Tile", TileOp, DeviceType::CPU);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, OpConditionBuilder("Tile").SetDevicePlacerFunc( op_registry, OpConditionBuilder("Tile").SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
......
...@@ -27,11 +27,8 @@ ...@@ -27,11 +27,8 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template<DeviceType D, typename T> template<DeviceType D, class T>
class TransposeOp; class TransposeOp : public Operation {
template<DeviceType D>
class TransposeOp<D, float> : public Operation {
public: public:
explicit TransposeOp(OpConstructContext *context) explicit TransposeOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -54,8 +51,8 @@ class TransposeOp<D, float> : public Operation { ...@@ -54,8 +51,8 @@ class TransposeOp<D, float> : public Operation {
Tensor::MappingGuard input_guard(input); Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output); Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>(); const T *input_data = input->data<T>();
float *output_data = output->mutable_data<float>(); T *output_data = output->mutable_data<T>();
return Transpose(&context->device()->cpu_runtime()->thread_pool(), return Transpose(&context->device()->cpu_runtime()->thread_pool(),
input_data, input->shape(), dims_, output_data); input_data, input->shape(), dims_, output_data);
...@@ -68,6 +65,8 @@ class TransposeOp<D, float> : public Operation { ...@@ -68,6 +65,8 @@ class TransposeOp<D, float> : public Operation {
void RegisterTranspose(OpRegistry *op_registry) { void RegisterTranspose(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp, MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Transpose", TransposeOp,
DeviceType::CPU);
} }
} // namespace ops } // namespace ops
......
...@@ -63,6 +63,8 @@ class UnsqueezeOp : public Operation { ...@@ -63,6 +63,8 @@ class UnsqueezeOp : public Operation {
void RegisterUnsqueeze(OpRegistry *op_registry) { void RegisterUnsqueeze(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp, MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Unsqueeze", UnsqueezeOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp, MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
} }
......
...@@ -77,6 +77,8 @@ class UnstackOp : public Operation { ...@@ -77,6 +77,8 @@ class UnstackOp : public Operation {
void RegisterUnstack(OpRegistry *op_registry) { void RegisterUnstack(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Unstack", UnstackOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
} }
......
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/delegator/gemm.h"
#include "mace/ops/ref/gemm.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/testing/test_utils.h" #include "mace/ops/testing/test_utils.h"
namespace mace { namespace mace {
...@@ -50,41 +50,48 @@ void TestGemmFloat32(const index_t batch, ...@@ -50,41 +50,48 @@ void TestGemmFloat32(const index_t batch,
GenerateRandomRealTypeData<float>(rhs.shape(), rhs_data); GenerateRandomRealTypeData<float>(rhs.shape(), rhs_data);
GenerateRandomRealTypeData<float>(output.shape(), output_data); GenerateRandomRealTypeData<float>(output.shape(), output_data);
} }
::mace::ops::arm::fp32::Gemm gemm((delegator::GemmParam()));
utils::ThreadPool thread_pool(1, AFFINITY_NONE); utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init(); thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device); OpsTestNet net;
gemm.Compute(&context, OpContext context(net.ws(), &cpu_device);
&lhs, std::unique_ptr<delegator::Gemm> gemm = delegator::Gemm::Create(
&rhs, context.workspace(),
batch, MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON),
rows, delegator::GemmParam());
cols, gemm->Compute(&context,
depth, &lhs,
lhs_major, &rhs,
rhs_major, batch,
output_major, rows,
lhs_batched, cols,
rhs_batched, depth,
&output); lhs_major,
rhs_major,
output_major,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT); Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT);
expected_output.Resize({batch, rows, cols}); expected_output.Resize({batch, rows, cols});
::mace::ops::ref::Gemm<float> gemm_ref((delegator::GemmParam())); std::unique_ptr<delegator::Gemm> gemm_ref = delegator::Gemm::Create(
gemm_ref.Compute(nullptr, context.workspace(), MACE_DELEGATOR_KEY(
&lhs, Gemm, DeviceType::CPU, float, ImplType::REF), delegator::GemmParam());
&rhs, gemm_ref->Compute(&context,
batch, &lhs,
rows, &rhs,
cols, batch,
depth, rows,
lhs_major, cols,
rhs_major, depth,
output_major, lhs_major,
lhs_batched, rhs_major,
rhs_batched, output_major,
&expected_output); lhs_batched,
rhs_batched,
&expected_output);
ExpectTensorNear<float>(expected_output, output); ExpectTensorNear<float>(expected_output, output);
} }
......
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/gemv.h" #include "mace/ops/delegator/gemv.h"
#include "mace/ops/ref/gemv.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/testing/test_utils.h" #include "mace/ops/testing/test_utils.h"
namespace mace { namespace mace {
...@@ -52,34 +52,38 @@ void TestGemvFloat32(const index_t batch, ...@@ -52,34 +52,38 @@ void TestGemvFloat32(const index_t batch,
utils::ThreadPool thread_pool(1, AFFINITY_NONE); utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init(); thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device); OpsTestNet net;
::mace::ops::arm::fp32::Gemv gemv = OpContext context(net.ws(), &cpu_device);
::mace::ops::arm::fp32::Gemv(DelegatorParam()); std::unique_ptr<delegator::Gemv> gemv = delegator::Gemv::Create(
gemv.Compute(&context, context.workspace(),
&lhs, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON),
&rhs, DelegatorParam());
&bias, gemv->Compute(&context,
batch, &lhs,
height, &rhs,
width, &bias,
lhs_batched, batch,
rhs_batched, height,
&output); width,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT); Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT);
expected_output.Resize({batch, height}); expected_output.Resize({batch, height});
::mace::ops::ref::Gemv<float> gemv_ref = std::unique_ptr<delegator::Gemv> gemv_ref = delegator::Gemv::Create(
::mace::ops::ref::Gemv<float>(DelegatorParam()); context.workspace(), MACE_DELEGATOR_KEY(
gemv_ref.Compute(nullptr, Gemv, DeviceType::CPU, float, ImplType::REF), DelegatorParam());
&lhs, gemv_ref->Compute(&context,
&rhs, &lhs,
&bias, &rhs,
batch, &bias,
height, batch,
width, height,
lhs_batched, width,
rhs_batched, lhs_batched,
&expected_output); rhs_batched,
&expected_output);
ExpectTensorNear<float>(expected_output, output); ExpectTensorNear<float>(expected_output, output);
} }
......
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/q8/gemv.h" #include "mace/ops/delegator/gemv.h"
#include "mace/ops/ref/gemv.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/testing/test_utils.h" #include "mace/ops/testing/test_utils.h"
namespace mace { namespace mace {
...@@ -57,34 +57,38 @@ void TestGemvInt32(const index_t batch, ...@@ -57,34 +57,38 @@ void TestGemvInt32(const index_t batch,
utils::ThreadPool thread_pool(1, AFFINITY_NONE); utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init(); thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device); OpsTestNet net;
mace::ops::arm::q8::Gemv<int32_t> gemv = OpContext context(net.ws(), &cpu_device);
mace::ops::arm::q8::Gemv<int32_t>(DelegatorParam()); std::unique_ptr<delegator::Gemv> gemv = delegator::Gemv::Create(
gemv.Compute(&context, context.workspace(),
&lhs, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON),
&rhs, DelegatorParam());
&bias, gemv->Compute(&context,
batch, &lhs,
height, &rhs,
width, &bias,
lhs_batched, batch,
rhs_batched, height,
&output); width,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32); Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32);
expected_output.Resize({batch, height}); expected_output.Resize({batch, height});
mace::ops::ref::Gemv<int32_t> gemv_ref = std::unique_ptr<delegator::Gemv> gemv_ref = delegator::Gemv::Create(
mace::ops::ref::Gemv<int32_t>(DelegatorParam()); context.workspace(), MACE_DELEGATOR_KEY(
gemv_ref.Compute(nullptr, Gemv, DeviceType::CPU, int32_t, ImplType::REF), DelegatorParam());
&lhs, gemv_ref->Compute(&context,
&rhs, &lhs,
&bias, &rhs,
batch, &bias,
height, batch,
width, height,
lhs_batched, width,
rhs_batched, lhs_batched,
&expected_output); rhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output); Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output); Tensor::MappingGuard expected_guard(&expected_output);
...@@ -131,36 +135,40 @@ void TestGemvUint8(const index_t batch, ...@@ -131,36 +135,40 @@ void TestGemvUint8(const index_t batch,
utils::ThreadPool thread_pool(1, AFFINITY_NONE); utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init(); thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device); OpsTestNet net;
mace::ops::arm::q8::Gemv<uint8_t> gemv = OpContext context(net.ws(), &cpu_device);
mace::ops::arm::q8::Gemv<uint8_t>(DelegatorParam()); std::unique_ptr<delegator::Gemv> gemv = delegator::Gemv::Create(
gemv.Compute(&context, context.workspace(),
&lhs, MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON),
&rhs, DelegatorParam());
&bias, gemv->Compute(&context,
batch, &lhs,
height, &rhs,
width, &bias,
lhs_batched, batch,
rhs_batched, height,
&output); width,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32); Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32);
expected_output.SetScale(0.6); expected_output.SetScale(0.6);
expected_output.SetZeroPoint(57); expected_output.SetZeroPoint(57);
expected_output.Resize({batch, height}); expected_output.Resize({batch, height});
mace::ops::ref::Gemv<uint8_t> gemv_ref = std::unique_ptr<delegator::Gemv> gemv_ref = delegator::Gemv::Create(
mace::ops::ref::Gemv<uint8_t>(DelegatorParam()); context.workspace(), MACE_DELEGATOR_KEY(
gemv_ref.Compute(nullptr, Gemv, DeviceType::CPU, uint8_t, ImplType::REF), DelegatorParam());
&lhs, gemv_ref->Compute(&context,
&rhs, &lhs,
&bias, &rhs,
batch, &bias,
height, batch,
width, height,
lhs_batched, width,
rhs_batched, lhs_batched,
&expected_output); rhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output); Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output); Tensor::MappingGuard expected_guard(&expected_output);
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include "mace/ops/delegator/gemm.h" #include "mace/ops/delegator/gemm.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/ref/gemm.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -112,7 +111,9 @@ void Complex(const std::vector<index_t> &batch, ...@@ -112,7 +111,9 @@ void Complex(const std::vector<index_t> &batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(CPU); net.RunOp(CPU);
ref::Gemm<float> gemm = ref::Gemm<float>(delegator::GemmParam()); std::unique_ptr<delegator::Gemm> gemm = delegator::Gemm::Create(
net.ws(), MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF),
delegator::GemmParam());
Tensor expected_output_tensor; Tensor expected_output_tensor;
std::vector<index_t> expected_output_shape({rows, cols}); std::vector<index_t> expected_output_shape({rows, cols});
expected_output_shape.insert(expected_output_shape.begin(), expected_output_shape.insert(expected_output_shape.begin(),
...@@ -121,20 +122,20 @@ void Complex(const std::vector<index_t> &batch, ...@@ -121,20 +122,20 @@ void Complex(const std::vector<index_t> &batch,
expected_output_tensor.Resize(expected_output_shape); expected_output_tensor.Resize(expected_output_shape);
index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1, index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
gemm.Compute(nullptr, gemm->Compute(nullptr,
net.GetTensor("A"), net.GetTensor("A"),
net.GetTensor("B"), net.GetTensor("B"),
batch_count, batch_count,
lhs_rows, lhs_rows,
lhs_cols, lhs_cols,
rhs_rows, rhs_rows,
rhs_cols, rhs_cols,
transpose_lhs, transpose_lhs,
transpose_rhs, transpose_rhs,
false, false,
lhs_batched, lhs_batched,
rhs_batched, rhs_batched,
&expected_output_tensor); &expected_output_tensor);
ExpectTensorNear<float>(expected_output_tensor, *net.GetTensor("Output"), ExpectTensorNear<float>(expected_output_tensor, *net.GetTensor("Output"),
1e-4, 1e-2); 1e-4, 1e-2);
......
...@@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \ ...@@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \
-DMACE_ENABLE_NEON=ON \ -DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \ -DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \ -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \ -DMACE_ENABLE_TESTS=ON \
......
...@@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \ ...@@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \
-DMACE_ENABLE_NEON=ON \ -DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \ -DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \ -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \ -DMACE_ENABLE_TESTS=ON \
......
...@@ -31,13 +31,14 @@ cmake -DANDROID_ABI="arm64-v8a" \ ...@@ -31,13 +31,14 @@ cmake -DANDROID_ABI="arm64-v8a" \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
-DANDROID_NATIVE_API_LEVEL=21 \ -DANDROID_NATIVE_API_LEVEL=21 \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DANDROID_STL=c++_shared \ -DANDROID_STL=c++_shared \
-DMACE_ENABLE_NEON=ON \ -DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \ -DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \ -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \ -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
-DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \ -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
-DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \ -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \ -DMACE_ENABLE_TESTS=ON \
......
...@@ -40,6 +40,7 @@ cmake -DANDROID_ABI="armeabi-v7a" \ ...@@ -40,6 +40,7 @@ cmake -DANDROID_ABI="armeabi-v7a" \
-DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \ -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
-DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \ -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
-DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \ -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \ -DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \ -DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \ -DMACE_ENABLE_TESTS=ON \
......
...@@ -18,6 +18,7 @@ mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} ...@@ -18,6 +18,7 @@ mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DMACE_ENABLE_NEON=OFF \ cmake -DMACE_ENABLE_NEON=OFF \
-DMACE_ENABLE_QUANTIZE=OFF \ -DMACE_ENABLE_QUANTIZE=OFF \
-DMACE_ENABLE_OPENCL=OFF \ -DMACE_ENABLE_OPENCL=OFF \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_TESTS=ON \ -DMACE_ENABLE_TESTS=ON \
-DMACE_ENABLE_BENCHMARKS=ON \ -DMACE_ENABLE_BENCHMARKS=ON \
-DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \ -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
......
...@@ -171,7 +171,16 @@ def parse_device_type(runtime): ...@@ -171,7 +171,16 @@ def parse_device_type(runtime):
return device_type return device_type
def get_hexagon_mode(configs): def bfloat16_enabled(configs):
for model_name in configs[YAMLKeyword.models]:
model_config = configs[YAMLKeyword.models][model_name]
dtype = model_config.get(YAMLKeyword.data_type, FPDataType.fp16_fp32)
if dtype == FPDataType.bf16_fp32:
return True
return False
def hexagon_enabled(configs):
runtime_list = [] runtime_list = []
for model_name in configs[YAMLKeyword.models]: for model_name in configs[YAMLKeyword.models]:
model_runtime = \ model_runtime = \
...@@ -184,7 +193,7 @@ def get_hexagon_mode(configs): ...@@ -184,7 +193,7 @@ def get_hexagon_mode(configs):
return False return False
def get_hta_mode(configs): def hta_enabled(configs):
runtime_list = [] runtime_list = []
for model_name in configs[YAMLKeyword.models]: for model_name in configs[YAMLKeyword.models]:
model_runtime = \ model_runtime = \
...@@ -197,7 +206,7 @@ def get_hta_mode(configs): ...@@ -197,7 +206,7 @@ def get_hta_mode(configs):
return False return False
def get_apu_mode(configs): def apu_enabled(configs):
runtime_list = [] runtime_list = []
for model_name in configs[YAMLKeyword.models]: for model_name in configs[YAMLKeyword.models]:
model_runtime = \ model_runtime = \
...@@ -210,7 +219,7 @@ def get_apu_mode(configs): ...@@ -210,7 +219,7 @@ def get_apu_mode(configs):
return False return False
def get_opencl_mode(configs): def opencl_enabled(configs):
runtime_list = [] runtime_list = []
for model_name in configs[YAMLKeyword.models]: for model_name in configs[YAMLKeyword.models]:
model_runtime = \ model_runtime = \
...@@ -224,7 +233,7 @@ def get_opencl_mode(configs): ...@@ -224,7 +233,7 @@ def get_opencl_mode(configs):
return False return False
def get_quantize_mode(configs): def quantize_enabled(configs):
for model_name in configs[YAMLKeyword.models]: for model_name in configs[YAMLKeyword.models]:
quantize = \ quantize = \
configs[YAMLKeyword.models][model_name].get( configs[YAMLKeyword.models][model_name].get(
...@@ -739,11 +748,12 @@ def build_model_lib(configs, address_sanitizer, debug_mode): ...@@ -739,11 +748,12 @@ def build_model_lib(configs, address_sanitizer, debug_mode):
MODEL_LIB_TARGET, MODEL_LIB_TARGET,
abi=target_abi, abi=target_abi,
toolchain=toolchain, toolchain=toolchain,
enable_hexagon=get_hexagon_mode(configs), enable_hexagon=hexagon_enabled(configs),
enable_hta=get_hta_mode(configs), enable_hta=hta_enabled(configs),
enable_apu=get_apu_mode(configs), enable_apu=apu_enabled(configs),
enable_opencl=get_opencl_mode(configs), enable_opencl=opencl_enabled(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=quantize_enabled(configs),
enable_bfloat16=bfloat16_enabled(configs),
address_sanitizer=address_sanitizer, address_sanitizer=address_sanitizer,
symbol_hidden=get_symbol_hidden_mode(debug_mode), symbol_hidden=get_symbol_hidden_mode(debug_mode),
debug_mode=debug_mode debug_mode=debug_mode
...@@ -900,12 +910,13 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, ...@@ -900,12 +910,13 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
mace_run_target, mace_run_target,
abi=target_abi, abi=target_abi,
toolchain=toolchain, toolchain=toolchain,
enable_hexagon=get_hexagon_mode(configs), enable_hexagon=hexagon_enabled(configs),
enable_hta=get_hta_mode(configs), enable_hta=hta_enabled(configs),
enable_apu=get_apu_mode(configs), enable_apu=apu_enabled(configs),
enable_openmp=enable_openmp, enable_openmp=enable_openmp,
enable_opencl=get_opencl_mode(configs), enable_opencl=opencl_enabled(configs),
enable_quantize=get_quantize_mode(configs), enable_quantize=quantize_enabled(configs),
enable_bfloat16=bfloat16_enabled(configs),
address_sanitizer=address_sanitizer, address_sanitizer=address_sanitizer,
symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type), symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type),
debug_mode=debug_mode, debug_mode=debug_mode,
......
...@@ -74,6 +74,7 @@ class ScratchComputer: ...@@ -74,6 +74,7 @@ class ScratchComputer:
data_type == mace_pb2.DT_INT32: data_type == mace_pb2.DT_INT32:
return 4 return 4
elif data_type == mace_pb2.DT_HALF or \ elif data_type == mace_pb2.DT_HALF or \
data_type == mace_pb2.DT_BFLOAT16 or \
data_type == mace_pb2.DT_FLOAT16: data_type == mace_pb2.DT_FLOAT16:
return 2 return 2
elif data_type == mace_pb2.DT_UINT8: elif data_type == mace_pb2.DT_UINT8:
......
# Copyright 2020 The MACE Authors. All Rights Reserved. # Copyright 2020 The MACE Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -79,7 +78,7 @@ def merge_params(net_def, data_type): ...@@ -79,7 +78,7 @@ def merge_params(net_def, data_type):
for tensor in net_def.tensors: for tensor in net_def.tensors:
if tensor.data_type == mace_pb2.DT_FLOAT \ if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF \ or tensor.data_type == mace_pb2.DT_HALF \
or tensor.data_type == mace_pb2.DT_FLOAT16\ or tensor.data_type == mace_pb2.DT_FLOAT16 \
or tensor.data_type == mace_pb2.DT_BFLOAT16: or tensor.data_type == mace_pb2.DT_BFLOAT16:
del tensor.float_data[:] del tensor.float_data[:]
elif tensor.data_type == mace_pb2.DT_INT32: elif tensor.data_type == mace_pb2.DT_INT32:
......
...@@ -270,6 +270,7 @@ def bazel_build(target, ...@@ -270,6 +270,7 @@ def bazel_build(target,
enable_neon=True, enable_neon=True,
enable_opencl=True, enable_opencl=True,
enable_quantize=True, enable_quantize=True,
enable_bfloat16=False,
enable_rpcmem=True, enable_rpcmem=True,
address_sanitizer=False, address_sanitizer=False,
symbol_hidden=True, symbol_hidden=True,
...@@ -286,6 +287,8 @@ def bazel_build(target, ...@@ -286,6 +287,8 @@ def bazel_build(target,
"openmp=%s" % str(enable_openmp).lower(), "openmp=%s" % str(enable_openmp).lower(),
"--define", "--define",
"quantize=%s" % str(enable_quantize).lower(), "quantize=%s" % str(enable_quantize).lower(),
"--define",
"bfloat16=%s" % str(enable_bfloat16).lower(),
target, target,
) )
else: else:
...@@ -304,6 +307,8 @@ def bazel_build(target, ...@@ -304,6 +307,8 @@ def bazel_build(target,
"--define", "--define",
"quantize=%s" % str(enable_quantize).lower(), "quantize=%s" % str(enable_quantize).lower(),
"--define", "--define",
"bfloat16=%s" % str(enable_bfloat16).lower(),
"--define",
"rpcmem=%s" % str(enable_rpcmem).lower(), "rpcmem=%s" % str(enable_rpcmem).lower(),
"--define", "--define",
"hexagon=%s" % str(enable_hexagon).lower(), "hexagon=%s" % str(enable_hexagon).lower(),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册