提交 950028c1 编写于 作者: L luxuhui

feature: support bf16

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
上级 27757d88
......@@ -10,6 +10,7 @@ option(MACE_ENABLE_CUDA "whether to enable CUDA support" OFF)
option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF)
option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF)
option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF)
option(MACE_ENABLE_BFLOAT16 "whether to enable bfloat16 support" OFF)
option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF)
option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF)
option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON)
......@@ -116,6 +117,10 @@ if(MACE_ENABLE_MTK_APU)
add_definitions(-DMACE_ENABLE_MTK_APU)
endif(MACE_ENABLE_MTK_APU)
if(MACE_ENABLE_BFLOAT16)
add_definitions(-DMACE_ENABLE_BFLOAT16)
endif(MACE_ENABLE_BFLOAT16)
if(MACE_ENABLE_OBFUSCATE)
add_definitions(-DMACE_OBFUSCATE_LITERALS)
endif(MACE_ENABLE_OBFUSCATE)
......
......@@ -85,7 +85,7 @@ in one deployment file.
* - runtime
- The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU.
* - data_type
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32; and [uint8] for DSP.
* - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats
......@@ -582,9 +582,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati
Therefore, the default storage type for a regular model in MACE is half. However,
if the model is very sensitive to accuracy, storage type can be changed to float.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``;
for CPU it can also be changed to ``bf16_fp32``.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float.
For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float.
......
......@@ -63,7 +63,7 @@ There are many advanced options supported.
* - runtime
- The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU.
* - data_type
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU, default is fp16_fp32; and [uint8] for DSP.
* - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats
......@@ -438,9 +438,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradati
Therefore, the default storage type for a regular model in MACE is half. However,
if the model is very sensitive to accuracy, storage type can be changed to float.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``;
for CPU it can also be changed to ``bf16_fp32``.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float; while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float.
For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float.
......
......@@ -43,12 +43,13 @@ MACE 需要安装下列依赖:
- 版本和说明
* - Android NDK
- `NDK 安装指南 <https://developer.android.com/ndk/guides/setup#install>`__
- Required by Android build, r15b, r15c, r16b, r17b
- 安卓编译需要, bazel用户可以使用r15b及以上的版本, cmake用户可以使用r17b及以上版本
* - CMake
- apt-get install cmake
- >= 3.11.3
* - ADB
- Linux:``apt-get install android-tools-adb`` Mac:``brew cask install android-platform-tools``
- | Linux:``apt-get install android-tools-adb``
| Mac:``brew cask install android-platform-tools``
- Android 运行需要, >= 1.0.32
* - TensorFlow
- pip install tensorflow==1.8.0
......
......@@ -132,6 +132,14 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "bfloat16_enabled",
define_values = {
"bfloat16": "true",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "rpcmem_enabled",
define_values = {
......
......@@ -9,6 +9,7 @@ load(
"if_android",
"if_android_armv7",
"if_apu_enabled",
"if_bfloat16_enabled",
"if_hexagon_enabled",
"if_hexagon_or_hta_enabled",
"if_hta_enabled",
......@@ -87,6 +88,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_BFLOAT16_H_
#define MACE_CORE_BFLOAT16_H_
#ifdef MACE_ENABLE_BFLOAT16
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <sstream>
namespace mace {
// Bit-level bridge between a raw uint32_t pattern and an IEEE-754 float.
// NOTE(review): reading the member other than the one last written is
// type punning; it works on the compilers MACE targets but is not
// sanctioned by the C++ standard (memcpy / std::bit_cast is the
// portable alternative) — confirm against the supported toolchains.
union Sphinx {
  uint32_t i;  // raw bit pattern
  float f;     // float view of the same bits

  Sphinx(uint32_t value) : i(value) {}

  Sphinx(float value) : f(value) {}
};
// Software bfloat16: stores the upper 16 bits (sign, 8-bit exponent,
// 7-bit mantissa) of an IEEE-754 single-precision float. Conversion to
// bfloat16 truncates the low mantissa bits (rounds toward zero);
// conversion back zero-extends, so all arithmetic is done in float.
class BFloat16 {
 public:
  BFloat16() : data_(0) {}
  // we need implicit transformation, so `explicit` keyword is not used
  BFloat16(float value) : data_(FromFloat(value)) {}  // NOLINT

  operator float() const {
    return ToFloat(data_);
  }

  operator double() const {
    return static_cast<double>(ToFloat(data_));
  }

  operator int() const {
    return static_cast<int>(ToFloat(data_));
  }

  template<typename T>
  void operator=(T value) {
    data_ = FromFloat(static_cast<float>(value));
  }

  BFloat16 operator-() const {
    return BFloat16(-ToFloat(data_));
  }

  template<typename T>
  BFloat16 operator+(T value) const {
    return BFloat16(ToFloat(data_) + static_cast<float>(value));
  }

  template<typename T>
  BFloat16 operator-(T value) const {
    return BFloat16(ToFloat(data_) - static_cast<float>(value));
  }

  template<typename T>
  BFloat16 operator*(T value) const {
    return BFloat16(ToFloat(data_) * static_cast<float>(value));
  }

  template<typename T>
  BFloat16 operator/(T value) const {
    return BFloat16(ToFloat(data_) / static_cast<float>(value));
  }

  template<typename T>
  bool operator>(T value) const {
    return ToFloat(data_) > static_cast<float>(value);
  }

  template<typename T>
  bool operator>=(T value) const {
    return ToFloat(data_) >= static_cast<float>(value);
  }

  template<typename T>
  bool operator<(T value) const {
    return ToFloat(data_) < static_cast<float>(value);
  }

  template<typename T>
  bool operator<=(T value) const {
    return ToFloat(data_) <= static_cast<float>(value);
  }

  template<typename T>
  bool operator==(T value) const {
    return ToFloat(data_) == static_cast<float>(value);
  }

  template<typename T>
  void operator+=(T value) {
    data_ = FromFloat(ToFloat(data_) + static_cast<float>(value));
  }

  template<typename T>
  void operator-=(T value) {
    data_ = FromFloat(ToFloat(data_) - static_cast<float>(value));
  }

  template<typename T>
  void operator*=(T value) {
    data_ = FromFloat(ToFloat(data_) * static_cast<float>(value));
  }

  template<typename T>
  void operator/=(T value) {
    data_ = FromFloat(ToFloat(data_) / static_cast<float>(value));
  }

 private:
  // Bit-cast helpers use memcpy, which is well-defined, instead of
  // union type punning; compilers lower it to a plain register move.
  static uint16_t FromFloat(float value) {
    uint32_t bits;
    std::memcpy(&bits, &value, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
  }

  static float ToFloat(uint16_t data) {
    const uint32_t bits = static_cast<uint32_t>(data) << 16;
    float value;
    std::memcpy(&value, &bits, sizeof(value));
    return value;
  }

  uint16_t data_;  // sign(1) | exponent(8) | mantissa(7)
};
// Bit-exact fast paths for BFloat16-to-BFloat16 comparison/assignment.
// NOTE(review): for plain `a == b` / `a = b`, template argument
// deduction on the primary templates yields T = BFloat16 (by value),
// not const BFloat16&, so these specializations are not selected in
// normal use; `a = b` goes through the implicit copy assignment and
// `a == b` compares via float — confirm whether the bit-exact paths
// were meant to be ordinary overloads inside the class instead.
template<>
inline bool BFloat16::operator==(const BFloat16 &value) const {
  return data_ == value.data_;
}

template<>
inline void BFloat16::operator=(const BFloat16 &value) {
  data_ = value.data_;
}
} // namespace mace
namespace std {
inline float fabs(const mace::BFloat16 &value) {
return fabs(static_cast<float>(value));
}
inline float abs(const mace::BFloat16 &value) {
return abs(static_cast<float>(value));
}
inline float sqrt(const mace::BFloat16 &value) {
return sqrt(static_cast<float>(value));
}
inline float log(const mace::BFloat16 &value) {
return log(static_cast<float>(value));
}
inline float tanh(const mace::BFloat16 &value) {
return tanh(static_cast<float>(value));
}
inline float exp(const mace::BFloat16 &value) {
return exp(static_cast<float>(value));
}
inline int ceil(const mace::BFloat16 &value) {
return ceil(static_cast<float>(value));
}
inline int floor(const mace::BFloat16 &value) {
return floor(static_cast<float>(value));
}
inline float max(const mace::BFloat16 &a, const float &b) {
return max(static_cast<float>(a), b);
}
inline float max(const float &a, const mace::BFloat16 &b) {
return max(a, static_cast<float>(b));
}
inline float min(const mace::BFloat16 &a, const float &b) {
return min(static_cast<float>(a), b);
}
inline float min(const float &a, const mace::BFloat16 &b) {
return min(a, static_cast<float>(b));
}
inline float pow(const mace::BFloat16 &a, const mace::BFloat16 &b) {
return pow(static_cast<float>(a), static_cast<float>(b));
}
inline float pow(const mace::BFloat16 &a, const float &b) {
return pow(static_cast<float>(a), b);
}
inline float pow(const float &a, const mace::BFloat16 &b) {
return pow(a, static_cast<float>(b));
}
inline ostream &operator<<(ostream &ss, // NOLINT
const mace::BFloat16 &value) {
return ss << static_cast<float>(value);
}
} // namespace std
inline float operator+(const float &a, const mace::BFloat16 &value) {
return a + static_cast<float>(value);
}
inline float operator-(const float &a, const mace::BFloat16 &value) {
return a - static_cast<float>(value);
}
inline float operator*(const float &a, const mace::BFloat16 &value) {
return a * static_cast<float>(value);
}
inline float operator/(const float &a, const mace::BFloat16 &value) {
return a / static_cast<float>(value);
}
inline void operator+=(float &a, const mace::BFloat16 &value) { // NOLINT
a += static_cast<float>(value);
}
inline void operator-=(float &a, const mace::BFloat16 &value) { // NOLINT
a -= static_cast<float>(value);
}
inline void operator*=(float &a, const mace::BFloat16 &value) { // NOLINT
a *= static_cast<float>(value);
}
inline void operator/=(float &a, const mace::BFloat16 &value) { // NOLINT
a /= static_cast<float>(value);
}
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_CORE_BFLOAT16_H_
......@@ -28,9 +28,9 @@ enum ImplType {
};
#ifdef MACE_ENABLE_NEON
#define MACE_CPU_IMPL_TYPE NEON
const ImplType kCpuImplType = ImplType::NEON;
#else
#define MACE_CPU_IMPL_TYPE REF
const ImplType kCpuImplType = ImplType::REF;
#endif
struct DelegatorParam {
......
......@@ -15,25 +15,86 @@
#include "mace/core/registry/op_delegator_registry.h"
#include <utility>
#include <sstream>
#include "mace/utils/logging.h"
namespace mace {
MaceStatus OpDelegatorRegistry::Register(const std::string &key,
namespace {
const char *kDefaultTag = "general";
}
// Full constructor: identifies one delegator implementation by name,
// data type, device, implementation type (NEON/REF) and tag.
DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type,
                             DeviceType in_device, ImplType in_impl_type,
                             const char *in_tag)
    : delegator_name(in_name), data_type(in_data_type),
      device(in_device), impl_type(in_impl_type), tag(in_tag) {}

// Convenience constructor: delegates to the full constructor with the
// default ("general") tag.
DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type,
                             DeviceType in_device, ImplType in_impl_type)
    : DelegatorInfo(in_name, in_data_type,
                    in_device, in_impl_type, kDefaultTag) {}
// Canonical string form "name_dtype_device_impl_tag"; used for logging
// and as the basis of the registry hash.
std::string DelegatorInfo::ToString() const {
  std::ostringstream stream;
  stream << delegator_name << "_" << data_type << "_" << device << "_"
         << impl_type << "_" << tag;
  return stream.str();
}
// Field-wise equality over all five identity components.
bool DelegatorInfo::operator==(const DelegatorInfo &info) const {
  return delegator_name == info.delegator_name &&
         data_type == info.data_type &&
         device == info.device &&
         impl_type == info.impl_type &&
         tag == info.tag;
}
// Registers `creator` under `key`. Registering the same key twice is a
// programming error and aborts via MACE_CHECK.
// (Removed a leftover duplicate of the pre-refactor MACE_CHECK line and
// fixed the grammar of the error message.)
MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
                                         DelegatorCreator creator) {
  MACE_CHECK(registry_.count(key) == 0,
             "Register an existing key: ", key.ToString());
  registry_[key] = std::move(creator);
  return MaceStatus::MACE_SUCCESS;
}
// Looks up the creator for `key`, falling back from the most specific
// registration to more general ones:
//   exact -> (general tag, NEON) -> (key tag, REF) -> (general tag, REF).
// Aborts via LOG(FATAL) when no candidate is registered.
// (Removed diff-residue lines: the old std::string-keyed signature, a
// stray closing brace, and the removed DType<> definitions that were
// interleaved inside this function body.)
DelegatorCreator OpDelegatorRegistry::GetCreator(
    const DelegatorInfo &key) const {
  if (registry_.count(key) > 0) {
    return registry_.at(key);
  }

  DelegatorInfo info = key;
  if (key.impl_type == ImplType::NEON) {
    // First try the general-tagged NEON delegator.
    if (info.tag != kDefaultTag) {
      info.tag = kDefaultTag;
      if (registry_.count(info) > 0) {
        VLOG(1) << key.ToString()
                << " delegator fall back to " << info.ToString();
        return registry_.at(info);
      }
      info.tag = key.tag;
    }
    // Then fall back to the reference implementation with the same tag.
    info.impl_type = ImplType::REF;
    if (registry_.count(info) > 0) {
      VLOG(1) << key.ToString()
              << " delegator fall back to " << info.ToString();
      return registry_.at(info);
    }
  }
  // for REF: last resort is the general-tagged reference delegator.
  if (info.tag != kDefaultTag) {
    info.tag = kDefaultTag;
    if (registry_.count(info) > 0) {
      VLOG(1) << key.ToString()
              << " delegator fall back to " << info.ToString();
      return registry_.at(info);
    }
  }

  LOG(FATAL) << "Delegator not exist: " << key.ToString();
  return DelegatorCreator();
}
} // namespace mace
......@@ -21,7 +21,9 @@
#include <unordered_map>
#include <vector>
#include "mace/core/bfloat16.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
......@@ -29,40 +31,50 @@ namespace mace {
typedef std::function<std::unique_ptr<OpDelegator>(const DelegatorParam &)>
DelegatorCreator;
// Identity of a delegator implementation; used as the registry key.
struct DelegatorInfo {
  explicit DelegatorInfo(const char *delegator_name,
                         DataType data_type,
                         DeviceType device,
                         ImplType impl_type,
                         const char *tag);
  // Same as above with the default ("general") tag.
  explicit DelegatorInfo(const char *delegator_name,
                         DataType data_type,
                         DeviceType device,
                         ImplType impl_type);

  // Canonical string form "name_dtype_device_impl_tag".
  std::string ToString() const;
  // Field-wise equality over all five members.
  bool operator==(const DelegatorInfo &info) const;

  std::string delegator_name;
  DataType data_type;
  DeviceType device;
  ImplType impl_type;
  std::string tag;
};
// Registry mapping DelegatorInfo keys to factory functions for CPU op
// delegators.
class OpDelegatorRegistry {
 public:
  OpDelegatorRegistry() = default;
  ~OpDelegatorRegistry() = default;

  // Registers `creator` under `key`; registering a duplicate key is fatal.
  MaceStatus Register(const DelegatorInfo &key, DelegatorCreator creator);
  // Returns the creator for `key`; the implementation may fall back to a
  // more general registration (general tag / REF impl) when no exact
  // match is registered.
  DelegatorCreator GetCreator(const DelegatorInfo &key) const;

 private:
  // Hashes a DelegatorInfo through its canonical string form.
  struct HashName {
    size_t operator()(const DelegatorInfo &delegator_info) const {
      return std::hash<std::string>()(delegator_info.ToString());
    }
  };

  std::unordered_map<DelegatorInfo, DelegatorCreator, HashName> registry_;
};
template<typename T>
struct DType { static const char *name_; };
template<> const char *DType<float>::name_;
template<> const char *DType<int>::name_;
template<> const char *DType<uint8_t>::name_;
} // namespace mace
#ifndef MACE_DELEGATOR_KEY_TMP
#define MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) \
(std::string(#delegator_name"_"#device"_"#impl"_") + DType<DT>::name_)
#endif // MACE_DELEGATOR_KEY_TMP
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_DELEGATOR_KEY_EX_TMP
#define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \
(std::string(#delegator_name"_"#device"_"#impl"_"#tag"_") + DType<DT>::name_)
DelegatorInfo(#delegator_name, DataTypeToEnum<DT>::value, device, impl, #tag)
#endif // MACE_DELEGATOR_KEY_EX_TMP
#ifndef MACE_DELEGATOR_KEY_EX
......@@ -70,21 +82,32 @@ template<> const char *DType<uint8_t>::name_;
MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag)
#endif // MACE_DELEGATOR_KEY_EX
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
DelegatorInfo(#delegator_name, DataTypeToEnum<DT>::value, device, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_REGISTER_DELEGATOR
#define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \
void Register##class_name##Delegator(OpDelegatorRegistry *registry) { \
registry->Register( \
key, OpDelegator::DefaultCreator<class_name, param_name>); \
}
registry->Register(key, OpDelegator::DefaultCreator<class_name, param_name>)
#endif // MACE_REGISTER_DELEGATOR
#ifndef MACE_REGISTER_BF16_DELEGATOR
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key) \
MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key)
#else
#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_DELEGATOR
#ifndef MACE_DEFINE_DELEGATOR_CREATOR
#define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \
static std::unique_ptr<class_name> Create( \
Workspace *workspace, const std::string &tag, \
Workspace *workspace, const DelegatorInfo &key, \
const DelegatorParam &param) { \
DelegatorCreator creator = \
workspace->GetDelegatorRegistry()->GetCreator(tag); \
workspace->GetDelegatorRegistry()->GetCreator(key); \
std::unique_ptr<OpDelegator> delegator = creator(param); \
return std::unique_ptr<class_name>( \
static_cast<class_name *>(delegator.release())); \
......
......@@ -22,6 +22,8 @@
#include <unordered_map>
#include <vector>
#include "mace/core/bfloat16.h"
#include "mace/core/types.h"
#include "mace/core/ops/operator.h"
#include "mace/core/ops/op_condition_builder.h"
#include "mace/core/ops/op_condition_context.h"
......@@ -80,6 +82,26 @@ class OpRegistry {
DataTypeToEnum<dt>::value, \
OpRegistry::DefaultCreator<class_name>)
#ifndef MACE_REGISTER_BF16_OP
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device) \
MACE_REGISTER_OP(op_registry, op_type, class_name, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_OP
#ifndef MACE_REGISTER_BF16_OP_BY_CLASS
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, \
class_name, device) \
MACE_REGISTER_OP_BY_CLASS(op_registry, op_type, \
class_name, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, class_name, device)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_OP_BY_CLASS
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
......
......@@ -53,6 +53,13 @@ namespace mace {
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
#endif
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \
MACE_CASE(BFloat16, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS)
#endif // MACE_ENABLE_BFLOAT16
#if MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
......@@ -67,6 +74,7 @@ namespace mace {
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
case DT_INVALID: \
INVALID_STATEMENTS; \
......@@ -419,7 +427,8 @@ class Tensor {
if (i != 0 && i % shape_.back() == 0) {
os << "\n";
}
MACE_RUN_WITH_TYPE_ENUM(dtype_, (os << (this->data<T>()[i]) << ", "));
MACE_RUN_WITH_TYPE_ENUM(
dtype_, (os << this->data<T>()[i] << ", "));
}
LOG(INFO) << os.str();
}
......
......@@ -25,6 +25,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
case DT_FLOAT:
case DT_UINT8:
case DT_INT32:
case DT_BFLOAT16:
return true;
default:
return false;
......@@ -36,7 +37,8 @@ std::string DataTypeToString(const DataType dt) {
{DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"},
{DT_UINT8, "DT_UINT8"},
{DT_INT32, "DT_INT32"}};
{DT_INT32, "DT_INT32"},
{DT_BFLOAT16, "DT_BFLOAT16"}};
MACE_CHECK(dt != DT_INVALID, "Not support Invalid data type");
return dtype_string_map[dt];
}
......@@ -50,6 +52,10 @@ size_t GetEnumTypeSize(const DataType dt) {
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
case DT_FLOAT16:
return sizeof(float16_t);
#endif
#ifdef MACE_ENABLE_BFLOAT16
case DT_BFLOAT16:
return sizeof(BFloat16);
#endif
case DT_UINT8:
return sizeof(uint8_t);
......
......@@ -21,6 +21,7 @@
#include <arm_neon.h>
#endif
#include "mace/core/bfloat16.h"
#include "mace/proto/mace.pb.h"
#include "include/half.hpp"
......@@ -57,6 +58,9 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16);
#endif
#ifdef MACE_ENABLE_BFLOAT16
MACE_MAPPING_DATA_TYPE_AND_ENUM(BFloat16, DT_BFLOAT16);
#endif
MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
......
......@@ -12,6 +12,7 @@ load(
"if_android",
"if_android_armv7",
"if_apu_enabled",
"if_bfloat16_enabled",
"if_darwin",
"if_hexagon_enabled",
"if_hta_enabled",
......@@ -42,6 +43,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
......@@ -52,8 +55,8 @@ cc_library(
"-DMACE_ENABLE_RPCMEM",
]),
deps = [
"//mace/ops",
"//include:public_headers",
"//mace/ops",
],
alwayslink = 1,
)
......
......@@ -16,9 +16,11 @@
#include <numeric>
#include <memory>
#include "mace/core/bfloat16.h"
#include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/core/net_def_adapter.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/transpose.h"
......@@ -29,7 +31,6 @@
#include "mace/public/mace.h"
#include "mace/port/env.h"
#include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
......@@ -460,6 +461,7 @@ class MaceEngine::Impl {
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_;
bool is_quantized_model_;
DataType net_data_type_;
std::map<std::string, mace::InputOutputInfo> input_info_map_;
std::map<std::string, mace::InputOutputInfo> output_info_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_;
......@@ -565,6 +567,7 @@ MaceStatus MaceEngine::Impl::Init(
#endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
net_data_type_ = net_def->data_type();
// Get input and output information.
for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info;
......@@ -589,8 +592,8 @@ MaceStatus MaceEngine::Impl::Init(
}
input_tensor->Resize(shape);
// Set to the default data format
input_tensor->set_data_format(static_cast<DataFormat>(
input_info_map_[input_name].data_format()));
input_tensor->set_data_format(
static_cast<DataFormat>(input_info_map_[input_name].data_format()));
}
for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) {
......@@ -691,7 +694,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile(
model_data_file.c_str(), &model_data_));
MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes,
MACE_RETURN_IF_ERROR(Init(
net_def, input_nodes, output_nodes,
reinterpret_cast<const unsigned char *>(model_data_->data())));
if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON ||
......@@ -753,11 +757,24 @@ MaceStatus MaceEngine::Impl::TransposeInput(
Tensor::MappingGuard input_guard(input_tensor);
if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>();
if (net_data_type_ == DT_FLOAT || net_data_type_ == DataType::DT_HALF) {
return ops::Transpose(thread_pool_.get(),
input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data);
#ifdef MACE_ENABLE_BFLOAT16
} else if (net_data_type_ == DT_BFLOAT16) {
auto *input_data = input_tensor->mutable_data<BFloat16>();
return ops::Transpose(thread_pool_.get(),
input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data);
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "Invalid net data type: " << net_data_type_;
}
} else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>();
return ops::Transpose(thread_pool_.get(),
......@@ -776,9 +793,22 @@ MaceStatus MaceEngine::Impl::TransposeInput(
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
if (input_dt == DataType::DT_FLOAT) {
if (net_data_type_ == DataType::DT_FLOAT ||
net_data_type_ == DataType::DT_HALF) {
auto input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
#ifdef MACE_ENABLE_BFLOAT16
} else if (net_data_type_ == DataType::DT_BFLOAT16) {
auto input_data = input_tensor->mutable_data<BFloat16>();
const float *data = input.second.data().get();
for (index_t i = 0; i < input_tensor->size(); ++i) {
input_data[i] = data[i];
}
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "Invalid net data type: " << net_data_type_;
}
} else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>();
memcpy(input_data, input.second.data().get(),
......@@ -842,6 +872,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
output_tensor->shape(),
dst_dims,
output->second.data<int>().get());
#ifdef MACE_ENABLE_BFLOAT16
} else if (output_dt == DataType::DT_BFLOAT16) {
auto output_data = output_tensor->data<BFloat16>();
return ops::Transpose(thread_pool_.get(),
output_data,
output_tensor->shape(),
dst_dims,
output->second.data<float>().get());
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
return MaceStatus::MACE_INVALID_ARGS;
......@@ -864,6 +903,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
std::memcpy(output->second.data<int>().get(),
output_tensor->data<int>(),
output_size * sizeof(int));
#ifdef MACE_ENABLE_BFLOAT16
} else if (output_dt == DataType::DT_BFLOAT16) {
const auto *output_data = output_tensor->data<BFloat16>();
float *data = output->second.data<float>().get();
for (index_t i = 0; i < output_tensor->size(); ++i) {
data[i] = output_data[i];
}
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
}
......
......@@ -109,6 +109,12 @@ def if_quantize_enabled(a):
"//conditions:default": [],
})
# Returns `a` for targets built with the bfloat16 config setting
# (--define bfloat16=true, see //mace:bfloat16_enabled), and an empty
# list otherwise.
def if_bfloat16_enabled(a):
    return select({
        "//mace:bfloat16_enabled": a,
        "//conditions:default": [],
    })
def if_rpcmem_enabled(a):
return select({
"//mace:rpcmem_enabled": a,
......
......@@ -10,6 +10,7 @@ load(
"//mace:mace.bzl",
"if_android",
"if_android_armv7",
"if_bfloat16_enabled",
"if_hexagon_enabled",
"if_neon_enabled",
"if_opencl_enabled",
......@@ -46,6 +47,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -85,6 +88,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -138,6 +143,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -223,6 +230,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -263,6 +272,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......
......@@ -19,7 +19,6 @@
#include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/ops/delegator/activation.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -43,11 +42,12 @@ class ActivationOp<DeviceType::CPU, T> : public Operation {
Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
activation_delegator_(delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, T, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType),
delegator::ActivationParam(
activation_type_,
Operation::GetOptionalArg<T>("max_limit", 0),
Operation::GetOptionalArg<T>("leakyrelu_coefficient", 0)))) {}
Operation::GetOptionalArg<float>("max_limit", 0.f),
Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.f)))) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -119,6 +119,8 @@ class ActivationOp<DeviceType::GPU, float> : public Operation {
void RegisterActivation(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Activation",
ActivationOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -33,8 +33,8 @@ namespace ops {
template<DeviceType D, class T>
class AddNOp;
template<>
class AddNOp<DeviceType::CPU, float> : public Operation {
template<class T>
class AddNOp<DeviceType::CPU, T> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {}
......@@ -46,12 +46,12 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
const index_t size = output->size();
Tensor::MappingGuard output_guard(output);
auto output_data = output->mutable_data<float>();
memset(output_data, 0, size * sizeof(float));
auto output_data = output->mutable_data<T>();
memset(output_data, 0, size * sizeof(T));
for (auto &input : inputs_) {
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<float>();
auto input_data = input->template data<T>();
for (index_t j = 0; j < size; ++j) {
output_data[j] += input_data[j];
......@@ -95,6 +95,7 @@ class AddNOp<DeviceType::GPU, float> : public Operation {
void RegisterAddN(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "AddN", AddNOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -74,11 +74,12 @@ class ArgMaxOp : public Operation {
if (argmin_) {
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T min_value = std::numeric_limits<T>::max();
float min_value = std::numeric_limits<float>::max();
const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] < min_value) {
min_value = input_ptr[j];
float input_value = input_ptr[j];
if (input_value < min_value) {
min_value = input_value;
idx = j;
}
}
......@@ -87,11 +88,12 @@ class ArgMaxOp : public Operation {
} else {
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T max_value = std::numeric_limits<T>::lowest();
float max_value = std::numeric_limits<float>::lowest();
const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] > max_value) {
max_value = input_ptr[j];
float input_value = input_ptr[j];
if (input_value > max_value) {
max_value = input_value;
idx = j;
}
}
......@@ -111,8 +113,8 @@ class ArgMaxOp : public Operation {
// Registers the ArgMax op for CPU. The float kernel was registered twice
// (a leftover duplicate line); a single registration per data type is
// sufficient — keep one float registration plus the bfloat16 one.
void RegisterArgMax(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU, float);
  MACE_REGISTER_BF16_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -185,8 +185,11 @@ void Activation::DoActivation(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, CPU, float, NEON))
// Registers the NEON float Activation delegator under the
// (Activation, CPU, float, NEON) key so ops can look it up at runtime.
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Activation, delegator::ActivationParam,
      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -129,8 +129,11 @@ void BiasAdd::AddBias(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, NEON))
// Registers the NEON float BiasAdd delegator under the
// (BiasAdd, CPU, float, NEON) key.
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, BiasAdd, DelegatorParam,
      MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -113,8 +113,12 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
output);
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1))
// Registers the NEON float Conv2d delegator specialized for 1x1 kernels.
void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK1x1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K1x1));
}
} // namespace fp32
} // namespace arm
......
......@@ -861,18 +861,27 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x7S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x1S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K1x15S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K15x1S1))
// Registers the NEON float Conv2d delegators specialized for thin 1xN / Nx1
// kernels (1x7, 7x1, 1x15, 15x1 — all stride 1).
void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK1x7S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K1x7S1));
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK7x1S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K7x1S1));
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK1x15S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K1x15S1));
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK15x1S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K15x1S1));
}
} // namespace fp32
} // namespace arm
......
......@@ -737,10 +737,16 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S2))
// Registers the NEON float Conv2d delegators for 3x3 kernels with
// stride 1 and stride 2.
void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK3x3S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K3x3S1));
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK3x3S2, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -801,9 +801,12 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
}, 0, batch, 1, 0, out_channels, 1);
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(
Conv2d, CPU, float, NEON, K3x3Winograd))
// Registers the NEON float Winograd-based 3x3 Conv2d delegator.
void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K3x3Winograd));
}
} // namespace fp32
} // namespace arm
......
......@@ -258,8 +258,12 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK5x5S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K5x5S1))
// Registers the NEON float Conv2d delegator for 5x5 kernels, stride 1.
void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK5x5S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K5x5S1));
}
} // namespace fp32
} // namespace arm
......
......@@ -722,12 +722,20 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S2))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S3))
// Registers the NEON float Conv2d delegators for 7x7 kernels with
// strides 1, 2 and 3.
void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK7x7S1, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K7x7S1));
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK7x7S2, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K7x7S2));
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dK7x7S3, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K7x7S3));
}
} // namespace fp32
} // namespace arm
......
......@@ -252,9 +252,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
// Registers the general (any kernel size / stride) NEON float Conv2d
// delegator; note it uses the base key, without an EX tag.
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Conv2dGeneral, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -335,12 +335,16 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K2x2S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K2x2S2))
// Registers the NEON float Deconv2d delegators for 2x2 kernels,
// strides 1 and 2.
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K2x2S1));
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K2x2S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -464,12 +464,16 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K3x3S2))
// Registers the NEON float Deconv2d delegators for 3x3 kernels,
// strides 1 and 2.
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K3x3S1));
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -574,12 +574,16 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K4x4S2))
// Registers the NEON float Deconv2d delegators for 4x4 kernels,
// strides 1 and 2.
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K4x4S1));
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -124,9 +124,11 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dGeneral, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, General))
// Registers the general-case NEON float Deconv2d delegator (base key,
// no kernel-size EX tag).
void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dGeneral, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -512,12 +512,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S2))
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -776,19 +776,27 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S2))
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
MACE_REGISTER_DELEGATOR(
void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S2))
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -959,19 +959,27 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S2))
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
MACE_REGISTER_DELEGATOR(
void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S2))
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -207,13 +207,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, General))
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON));
}
MACE_REGISTER_DELEGATOR(
void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, General))
MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -1224,8 +1224,11 @@ MaceStatus Gemm::Compute(const OpContext *context,
output);
}
MACE_REGISTER_DELEGATOR(registry, Gemm, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON))
// Registers the NEON float Gemm delegator under the
// (Gemm, CPU, float, NEON) key.
void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemm, delegator::GemmParam,
      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -378,9 +378,11 @@ MaceStatus Gemv::Compute(const OpContext *context,
#undef vaddvq_f32
#endif
MACE_REGISTER_DELEGATOR(registry, Gemv, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, float, NEON))
// Registers the NEON float Gemv delegator under the
// (Gemv, CPU, float, NEON) key.
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemv, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -162,8 +162,11 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, NEON))
// Registers the quantized (uint8) NEON Eltwise delegator.
void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Eltwise, delegator::EltwiseParam,
      MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::NEON));
}
} // namespace q8
} // namespace arm
......
......@@ -176,18 +176,14 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
template
class Gemv<uint8_t>;
template
class Gemv<int32_t>;
typedef Gemv<uint8_t> GemvUint8;
MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON))
typedef Gemv<int32_t> GemvInt32;
MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON))
// Registers the quantized NEON Gemv delegators: uint8 output and
// int32 (pre-requantization) output variants.
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemv<uint8_t>, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON));
  MACE_REGISTER_DELEGATOR(
      registry, Gemv<int32_t>, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON));
}
} // namespace q8
} // namespace arm
......
......@@ -33,8 +33,8 @@ namespace ops {
template<DeviceType D, class T>
class BatchNormOp;
template<>
class BatchNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class BatchNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context),
......@@ -43,7 +43,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(
ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
......@@ -91,13 +92,13 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard offset_mapper(offset);
Tensor::MappingGuard output_mapper(output);
const float *input_ptr = input->data<float>();
const float *scale_ptr = scale->data<float>();
const float *offset_ptr = offset->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
const T *scale_ptr = scale->data<T>();
const T *offset_ptr = offset->data<T>();
T *output_ptr = output->mutable_data<T>();
std::vector<float> new_scale;
std::vector<float> new_offset;
std::vector<T> new_scale;
std::vector<T> new_offset;
if (not_folded) {
const Tensor *mean = this->Input(MEAN);
const Tensor *var = this->Input(VAR);
......@@ -109,8 +110,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var);
const float *mean_ptr = mean->data<float>();
const float *var_ptr = var->data<float>();
const T *mean_ptr = mean->data<T>();
const T *var_ptr = var->data<T>();
thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start,
index_t end,
......@@ -122,9 +123,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, channels, 1);
}
const float *scale_data = not_folded ? new_scale.data() : scale_ptr;
const float
*offset_data = not_folded ? new_offset.data() : offset_ptr;
const T *scale_data = not_folded ? new_scale.data() : scale_ptr;
const T *offset_data = not_folded ? new_offset.data() : offset_ptr;
index_t channel_size = height * width;
index_t batch_size = channels * channel_size;
......@@ -232,6 +232,7 @@ class BatchNormOp<DeviceType::GPU, float> : public Operation {
// Registers the BatchNorm op: CPU float, CPU bfloat16 (when
// MACE_ENABLE_BFLOAT16 is on), and the GPU variant.
void RegisterBatchNorm(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
                   DeviceType::CPU, float);
  MACE_REGISTER_BF16_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU);
  MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
}
......
......@@ -84,8 +84,8 @@ class BatchToSpaceOpBase : public Operation {
template<DeviceType D, class T>
class BatchToSpaceNDOp;
template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
template<class T>
class BatchToSpaceNDOp<DeviceType::CPU, T> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {}
......@@ -108,8 +108,8 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
const float *input_data = batch_tensor->data<float>();
float *output_data = space_tensor->mutable_data<float>();
const T *input_data = batch_tensor->data<T>();
T *output_data = space_tensor->mutable_data<T>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(2);
......@@ -120,10 +120,11 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
index_t out_height = space_tensor->dim(2);
index_t out_width = space_tensor->dim(3);
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// 32k/sizeof(T)/out_width/block_shape
index_t block_h_size = std::max(
static_cast<index_t>(1),
static_cast<index_t>(
(32 / sizeof(T)) * 1024 / block_shape_w / out_width));
// make channel outter loop so we can make best use of cache
for (index_t c = 0; c < channels; ++c) {
......@@ -153,9 +154,9 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
const T *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
T *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
......@@ -290,6 +291,9 @@ void RegisterBatchToSpaceND(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
......
......@@ -33,15 +33,15 @@ namespace ops {
template<DeviceType D, class T>
class BiasAddOp;
template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
template<class T>
class BiasAddOp<DeviceType::CPU, T> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -67,9 +67,9 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
const float *input_ptr = input->data<float>();
const float *bias_ptr = bias->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>();
const std::vector<index_t> &shape = input->shape();
const index_t channels = *shape.rbegin();
......@@ -162,6 +162,7 @@ class BiasAddOp<DeviceType::GPU, float> : public Operation {
void RegisterBiasAdd(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -56,10 +56,8 @@ class CastOp : public Operation {
};
void RegisterCast(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, int32_t);
MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, int32_t);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float16_t);
......
......@@ -64,7 +64,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const T *in_ptr = input_ptr + b * batch_size
+ (g * channels_per_group + idx) * image_size;
T *out_ptr = output_ptr + b * batch_size + c * image_size;
memcpy(out_ptr, in_ptr, image_size * sizeof(float));
memcpy(out_ptr, in_ptr, image_size * sizeof(T));
}
}
......@@ -102,6 +102,8 @@ class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
void RegisterChannelShuffle(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#include "mace/ops/common/lstm.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
// Applies the LSTM cell nonlinearity (Kaldi LstmNonlinearityComponent —
// see the link in the file header) to a block of cell_dim cells.
// input_data holds four gate pre-activations [i, f, c, o], each of size
// cell_dim; prev_data is the previous cell state (null on the first
// frame); params_data holds the diagonal peephole rows w_ic/w_fc/w_oc
// separated by params_stride. Writes the new cell state to output_cell
// and the cell output m_t to output_data.
void LSTMNonlinearKernel(const OpContext *context,
                         const float *input_data,
                         const float *prev_data,
                         const float *scale_data,
                         const float *params_data,
                         bool embed_scales,
                         index_t params_stride,
                         index_t cell_dim,
                         float *output_cell,
                         float *output_data) {
  // Per-gate scales default to identity unless embedded scales are given.
  const bool has_scales = embed_scales && scale_data;
  const float i_scale = has_scales ? scale_data[0] : 1.0f;
  const float f_scale = has_scales ? scale_data[1] : 1.0f;
  const float o_scale = has_scales ? scale_data[2] : 1.0f;

  utils::ThreadPool &thread_pool =
      context->device()->cpu_runtime()->thread_pool();
  thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
    for (index_t c = start; c < end; c += step) {
      const float i_part = input_data[c];
      const float c_part = input_data[c + 2 * cell_dim];
      const float o_part = input_data[c + 3 * cell_dim];
      const float w_oc = params_data[c + params_stride * 2];
      float c_t;
      if (prev_data == nullptr) {
        // First frame: no previous cell state, forget gate is unused.
        const float i_t = ScalarSigmoid(i_part);
        c_t = i_t * i_scale * std::tanh(c_part);
      } else {
        const float f_part = input_data[c + cell_dim];
        const float c_prev = prev_data[c];
        const float w_ic = params_data[c];
        const float w_fc = params_data[c + params_stride];
        const float i_t = ScalarSigmoid(i_part + w_ic * c_prev);
        const float f_t = ScalarSigmoid(f_part + w_fc * c_prev);
        c_t = f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
      }
      // Output gate peeks at the freshly computed cell state.
      const float o_t = ScalarSigmoid(o_part + w_oc * c_t);
      output_cell[c] = c_t;
      output_data[c] = o_t * o_scale * std::tanh(c_t);
    }
  }, 0, cell_dim, 1);
}
} // namespace ops
} // namespace mace
......@@ -12,25 +12,77 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#ifndef MACE_OPS_COMMON_LSTM_H_
#define MACE_OPS_COMMON_LSTM_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/types.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
void LSTMNonlinearKernel(const OpContext *opContext,
const float *input_data,
const float *prev_data,
const float *scale_data,
const float *params_data,
template <typename T>
void LSTMNonlinearKernel(const OpContext *context,
const T *input_data,
const T *prev_data,
const T *scale_data,
const T *params_data,
bool embed_scales,
index_t params_stride,
index_t cell_dim,
float *output_cell,
float *output_data);
T *output_cell,
T *output_data) {
float i_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[0]) : 1.0f;
float f_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[1]) : 1.0f;
float o_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[2]) : 1.0f;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (prev_data == nullptr) {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part);
float c_t = i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
} else {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float f_part = input_data[c + cell_dim];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float c_prev = prev_data[c];
float w_ic = params_data[c];
float w_fc = params_data[c + params_stride];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part + w_ic * c_prev);
float f_t = ScalarSigmoid(f_part + w_fc * c_prev);
float c_t =
f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
}
}, 0, cell_dim, 1);
}
} // namespace ops
} // namespace mace
......
......@@ -26,10 +26,10 @@
namespace mace {
namespace ops {
template<typename T>
template<typename SrcT, typename DstT>
void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool,
const T *input,
T *output,
const SrcT *input,
DstT *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
......@@ -50,7 +50,7 @@ void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool,
}
template<>
inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool,
inline void TransposeNHWCToNCHWC3<float, float>(utils::ThreadPool *thread_pool,
const float *input,
float *output,
const index_t height,
......@@ -91,10 +91,10 @@ inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool,
}, 0, height, 1);
}
template<typename T>
template<typename SrcT, typename DstT>
void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool,
const T *input,
T *output,
const SrcT *input,
DstT *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
......@@ -115,7 +115,7 @@ void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool,
}
template<>
inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
inline void TransposeNCHWToNHWCC2<float, float>(utils::ThreadPool *thread_pool,
const float *input,
float *output,
const index_t height,
......@@ -155,12 +155,12 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
}, 0, height, 1);
}
template<typename T>
template<typename SrcT, typename DstT>
MaceStatus Transpose(utils::ThreadPool *thread_pool,
const T *input,
const SrcT *input,
const std::vector<int64_t> &input_shape,
const std::vector<int> &dst_dims,
T *output) {
DstT *output) {
MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
(input_shape.size() == 3 && dst_dims.size() == 3) ||
(input_shape.size() == 4 && dst_dims.size() == 4),
......@@ -220,7 +220,6 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channel = input_shape[3];
size_t channel_raw_size = channel * sizeof(T);
index_t stride_i = height;
index_t stride_j = width;
index_t tile_size = std::max(static_cast<index_t>(1),
......@@ -232,9 +231,11 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
memcpy(output + (tile_j * stride_i + tile_i) * channel,
input + (tile_i * stride_j + tile_j) * channel,
channel_raw_size);
auto output_ptr = output + (tile_j * stride_i + tile_i) * channel;
auto input_ptr = input + (tile_i * stride_j + tile_j) * channel;
for (index_t k = 0; k < channel; ++k) {
output_ptr[k] = input_ptr[k];
}
}
}
}
......@@ -296,14 +297,15 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
}
}, 0, batch, 1, 0, height, tile_size, 0, width, tile_size);
} else if (dst_dims == std::vector<int>{1, 0, 2}) {
size_t width_raw_size = width * sizeof(T);
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (int i = start0; i < end0; i += step0) {
for (int j = start1; j < end1; j += step1) {
memcpy(output + (j * batch + i) * width,
input + (i * height + j) * width,
width_raw_size);
auto output_ptr = output + (j * batch + i) * width;
auto input_ptr = input + (i * height + j) * width;
for (index_t k = 0; k < width; ++k) {
output_ptr[k] = input_ptr[k];
}
}
}
}, 0, batch, 1, 0, height, 1);
......
......@@ -225,6 +225,8 @@ class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
void RegisterConcat(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, int32_t);
......
......@@ -53,15 +53,16 @@ namespace ops {
template<DeviceType D, class T>
class Conv2dOp;
template<>
class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
template<class T>
class Conv2dOp<DeviceType::CPU, T> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(
ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
......@@ -71,7 +72,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
0.0f)))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -81,9 +82,8 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
Tensor *output = this->Output(OUTPUT);
if (conv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
if (MACE_CPU_IMPL_TYPE == NEON) {
auto tag = MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, T, kCpuImplType);
if (kCpuImplType == NEON) {
// the following params are used to decide which conv delegator to use
const index_t stride_h = strides_[0];
const index_t stride_w = strides_[1];
......@@ -98,63 +98,63 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// We do not support changeable filter for now.
if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K1x1);
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
if (input_channels >= 8 && channels >= 8) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3Winograd);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K3x3Winograd);
} else {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
}
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (filter_h == 5 && filter_w == 5
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K5x5S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K5x5S1);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x7S1);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S2);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x7S2);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S3);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x7S3);
} else if (filter_h == 1 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x7S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K1x7S1);
} else if (filter_h == 7 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x1S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x1S1);
} else if (filter_h == 1 && filter_w == 15
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x15S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K1x15S1);
} else if (filter_h == 15 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K15x1S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K15x1S1);
}
}
delegator::Conv2dParam param(strides_, dilations_,
......@@ -497,8 +497,8 @@ class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_OPENCL
void RegisterConv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
......
......@@ -100,8 +100,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
memcpy(out_ptr, in_ptr_bch,
output_shape[3] * sizeof(T));
memcpy(out_ptr, in_ptr_bch, output_shape[3] * sizeof(T));
}
}
}
......@@ -134,8 +133,8 @@ class CropOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Crop", CropOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -143,8 +143,8 @@ class CumsumOp<DeviceType::CPU, T> : public Operation {
};
void RegisterCumsum(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -46,20 +46,21 @@ const std::vector<int> kDeconv2dStrides = {1, 1};
template<DeviceType D, class T>
class Deconv2dOp;
template<>
class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
template<class T>
class Deconv2dOp<DeviceType::CPU, T> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context),
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -80,11 +81,9 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
if (deconv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
if (MACE_CPU_IMPL_TYPE == NEON) {
auto tag = MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, T, kCpuImplType);
if (kCpuImplType == NEON) {
const index_t kernel_h = filter->dim(2);
const index_t kernel_w = filter->dim(3);
......@@ -104,23 +103,23 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
strides_[0] == strides_[1] && strides_[0] == 2;
if (use_neon_2x2_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K2x2S1);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K2x2S1);
} else if (use_neon_2x2_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K2x2S2);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K2x2S2);
} else if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S1);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S2);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S2);
}
}
delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_,
......@@ -236,8 +235,8 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
#endif // MACE_ENABLE_OPENCL
void RegisterDeconv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
......
......@@ -27,7 +27,6 @@ namespace mace {
namespace ops {
enum ConvType {
General,
K1x1,
K1x7S1,
K7x1S1,
......
......@@ -21,12 +21,12 @@
#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
enum DeconvType {
General,
K2x2S1,
K2x2S2,
K3x3S1,
......
......@@ -28,8 +28,8 @@ namespace ops {
template<DeviceType D, class T>
class DepthToSpaceOp;
template<>
class DepthToSpaceOp<CPU, float> : public Operation {
template<class T>
class DepthToSpaceOp<CPU, T> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context),
......@@ -59,8 +59,8 @@ class DepthToSpaceOp<CPU, float> : public Operation {
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
......@@ -188,6 +188,8 @@ class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
void RegisterDepthToSpace(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthToSpace",
......
......@@ -63,20 +63,21 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase {
template<DeviceType D, class T>
class DepthwiseConv2dOp;
template<>
class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
template<class T>
class DepthwiseConv2dOp<DeviceType::CPU, T> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context),
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -93,9 +94,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
MACE_CHECK_NOTNULL(output);
if (depthwise_conv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float,
REF, General);
if (MACE_CPU_IMPL_TYPE == NEON) {
auto tag = MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
T, ImplType::REF);
if (kCpuImplType == NEON) {
const index_t filter_h = filter->dim(2);
const index_t filter_w = filter->dim(3);
const index_t stride_h = strides_[0];
......@@ -104,13 +105,13 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
const index_t dilation_w = dilations_[1];
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2
&& stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
}
}
delegator::Conv2dParam param(strides_, dilations_,
......@@ -347,7 +348,8 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#ifdef MACE_ENABLE_OPENCL
template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
class DepthwiseConv2dOp<DeviceType::GPU, float> :
public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
......@@ -402,6 +404,8 @@ class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
void RegisterDepthwiseConv2d(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
......
......@@ -44,8 +44,8 @@ const std::vector<int> kDepthwiseStrides = {1, 1};
template<DeviceType D, class T>
class DepthwiseDeconv2dOp;
template<>
class DepthwiseDeconv2dOp<DeviceType::CPU, float>
template<class T>
class DepthwiseDeconv2dOp<DeviceType::CPU, T>
: public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
......@@ -53,12 +53,13 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -75,7 +76,7 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
bool is_depthwise = group_ == in_channels;
if (depthwise_deconv2d_delegator_ == nullptr) {
if (MACE_CPU_IMPL_TYPE == NEON) {
if (kCpuImplType == NEON) {
const index_t kernel_h = filter->dim(2);
const index_t kernel_w = filter->dim(3);
bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
......@@ -88,20 +89,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
strides_[0] == strides_[1] && strides_[0] == 2;
if (is_depthwise) {
std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
auto tag = MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType);
if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S1);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S2);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S2);
}
delegator::DepthwiseDeconv2dParam param(strides_, kDepthwiseStrides,
paddings_, padding_type_,
......@@ -109,20 +110,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
depthwise_deconv2d_delegator_ = delegator::DepthwiseDeconv2d::Create(
context->workspace(), tag, param);
} else {
std::string tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
auto tag = MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType);
if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S1);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S2);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S2);
}
delegator::GroupDeconv2dParam param(strides_, kDepthwiseStrides,
paddings_, padding_type_,
......@@ -218,6 +219,8 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
}
......
......@@ -75,7 +75,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
Operation::GetRepeatedArgs<index_t>("out_cache_indexes")),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, T, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
inline void Validate() {
......@@ -107,14 +107,14 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
") should be greater than zero.");
}
void UpdateCell(float *cell_data,
void UpdateCell(T *cell_data,
const index_t cell_dim,
const float scale) {
if (std::abs(scale - 1.f) < 1e-6)
return;
const index_t rounds = cell_dim / 4;
for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON
#if defined(MACE_ENABLE_NEON) and not defined(MACE_ENABLE_BFLOAT16)
float32x4_t in_vec = vld1q_f32(cell_data + i);
float32x4_t scale_vec = vdupq_n_f32(scale);
in_vec = vmulq_f32(in_vec, scale_vec);
......@@ -130,18 +130,18 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
}
}
void CopyAndUpdateCell(float *src_data,
void CopyAndUpdateCell(T *src_data,
const index_t cell_dim,
const float scale,
float *cell_data) {
T *cell_data) {
if (std::abs(scale - 1.f) < 1e-6) {
memcpy(cell_data, src_data, cell_dim * sizeof(float));
memcpy(cell_data, src_data, cell_dim * sizeof(T));
return;
}
const index_t rounds = cell_dim / 4;
for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON
#if defined(MACE_ENABLE_NEON) and not defined(MACE_ENABLE_BFLOAT16)
float32x4_t in_vec = vld1q_f32(src_data + i);
float32x4_t scale_vec = vdupq_n_f32(scale);
in_vec = vmulq_f32(in_vec, scale_vec);
......@@ -222,49 +222,54 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
<< " output_dim: " << output_dim;
const index_t affine_a_in_size =
PadAlignSize(affine_a_in_dim * sizeof(float));
PadAlignSize(affine_a_in_dim * sizeof(T));
const index_t affine_a_out_size =
PadAlignSize(affine_a_out_dim * sizeof(float));
PadAlignSize(affine_a_out_dim * sizeof(T));
const index_t affine_b_in_size =
PadAlignSize(affine_b_in_dim * sizeof(float));
PadAlignSize(affine_b_in_dim * sizeof(T));
const index_t affine_b_out_size =
PadAlignSize(affine_b_out_dim * sizeof(float));
PadAlignSize(affine_b_out_dim * sizeof(T));
const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_);
const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_);
const index_t out_buf_size =
PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float));
PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(T));
const index_t cell_buf_size =
PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(float));
PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(affine_a_in_size + affine_a_out_size
+ affine_b_in_size + affine_b_out_size
+ out_buf_size + cell_buf_size);
Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT);
Tensor prev_out_buf(scratch->Scratch(out_buf_size), DataTypeToEnum<T>::v());
prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_});
float *prev_out_buf_data = prev_out_buf.mutable_data<float>();
T *prev_out_buf_data = prev_out_buf.mutable_data<T>();
Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT);
Tensor prev_cell_buf(
scratch->Scratch(cell_buf_size), DataTypeToEnum<T>::v());
prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_});
float *prev_cell_buf_data = prev_cell_buf.mutable_data<float>();
T *prev_cell_buf_data = prev_cell_buf.mutable_data<T>();
Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT);
Tensor affine_a_in(
scratch->Scratch(affine_a_in_size), DataTypeToEnum<T>::v());
affine_a_in.Reshape({1, affine_a_in_dim});
float *affine_a_in_data = affine_a_in.mutable_data<float>();
T *affine_a_in_data = affine_a_in.mutable_data<T>();
Tensor affine_a_out(scratch->Scratch(affine_a_out_size), DT_FLOAT);
Tensor affine_a_out(
scratch->Scratch(affine_a_out_size), DataTypeToEnum<T>::v());
affine_a_out.Reshape({1, affine_a_out_dim});
float *affine_a_out_data = affine_a_out.mutable_data<float>();
T *affine_a_out_data = affine_a_out.mutable_data<T>();
Tensor affine_b_in(scratch->Scratch(affine_b_in_size), DT_FLOAT);
Tensor affine_b_in(
scratch->Scratch(affine_b_in_size), DataTypeToEnum<T>::v());
affine_b_in.Reshape({1, affine_b_in_dim});
float *affine_b_in_data = affine_b_in.mutable_data<float>();
T *affine_b_in_data = affine_b_in.mutable_data<T>();
Tensor affine_b_out(scratch->Scratch(affine_b_out_size), DT_FLOAT);
Tensor affine_b_out(
scratch->Scratch(affine_b_out_size), DataTypeToEnum<T>::v());
affine_b_out.Reshape({1, affine_b_out_dim});
float *affine_b_out_data = affine_b_out.mutable_data<float>();
T *affine_b_out_data = affine_b_out.mutable_data<T>();
Tensor *output = this->Output(OUTPUT);
Tensor *out_cache = this->Output(OUT_CACHE);
......@@ -293,31 +298,31 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
Tensor::MappingGuard out_cache_guard(out_cache);
Tensor::MappingGuard cell_cache_guard(cell_cache);
const float *input_data = input->data<float>();
const float *prev_out_data = prev_out->data<float>();
const float *prev_cell_data = prev_cell->data<float>();
const float *lstm_params_data = lstm_params->data<float>();
float *output_data = output->mutable_data<float>();
float *out_cache_data = out_cache->mutable_data<float>();
float *cell_cache_data = cell_cache->mutable_data<float>();
const T *input_data = input->data<T>();
const T *prev_out_data = prev_out->data<T>();
const T *prev_cell_data = prev_cell->data<T>();
const T *lstm_params_data = lstm_params->data<T>();
T *output_data = output->mutable_data<T>();
T *out_cache_data = out_cache->mutable_data<T>();
T *cell_cache_data = cell_cache->mutable_data<T>();
for (int b = 0; b < batch; ++b) {
memcpy(prev_out_buf_data,
prev_out_data + b * out_buf_chunk * prev_out_dim_,
sizeof(float) * out_buf_chunk * prev_out_dim_);
sizeof(T) * out_buf_chunk * prev_out_dim_);
memcpy(prev_cell_buf_data,
prev_cell_data + b * cell_buf_chunk * prev_cell_dim_,
sizeof(float) * cell_buf_chunk * prev_cell_dim_);
sizeof(T) * cell_buf_chunk * prev_cell_dim_);
for (index_t i = 0; i < out_chunk; ++i) {
const float *input_ptr =
const T *input_ptr =
input_data + (b * chunk + forward_indexes_[i]) * input_dim;
float *output_ptr = output_data + (b * out_chunk + i) * output_dim;
T *output_ptr = output_data + (b * out_chunk + i) * output_dim;
// Append
memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float));
memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(T));
memcpy(affine_a_in_data + input_dim,
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_,
prev_out_dim_ * sizeof(float));
prev_out_dim_ * sizeof(T));
// Affine
gemv_->Compute(context,
weights_a,
......@@ -330,11 +335,11 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
false,
&affine_a_out);
// Prepare LSTMNonlinear input and output pointer
float *lstm_cell_ptr =
T *lstm_cell_ptr =
prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_;
float *curr_cell_ptr = lstm_cell_ptr;
T *curr_cell_ptr = lstm_cell_ptr;
// LSTMNonlinear
LSTMNonlinearKernel(context,
LSTMNonlinearKernel<T>(context,
affine_a_out_data,
lstm_cell_ptr,
nullptr,
......@@ -359,9 +364,9 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
// Output
memcpy(output_ptr,
affine_b_out_data,
output_dim * sizeof(float));
output_dim * sizeof(T));
// Update
float *curr_out_ptr =
T *curr_out_ptr =
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_;
CopyAndUpdateCell(affine_b_out_data + prev_out_offset_,
prev_out_dim_,
......@@ -371,22 +376,22 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
for (size_t k = 0; k < out_cache_indexes_.size(); ++k) {
if (i == out_cache_indexes_[k]) {
const index_t idx = b * out_buf_chunk + k;
float *out_cache_ptr =
T *out_cache_ptr =
out_cache_data + idx * prev_out_dim_;
memcpy(out_cache_ptr,
curr_out_ptr,
sizeof(float) * prev_out_dim_);
sizeof(T) * prev_out_dim_);
}
}
for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) {
if (i == cell_cache_indexes_[k]) {
const index_t idx = b * cell_buf_chunk + k;
float *cell_cache_ptr =
T *cell_cache_ptr =
cell_cache_data + idx * prev_cell_dim_;
memcpy(cell_cache_ptr,
curr_cell_ptr,
sizeof(float) * prev_cell_dim_);
sizeof(T) * prev_cell_dim_);
}
}
}
......@@ -416,6 +421,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
void RegisterDynamicLSTM(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -1073,7 +1073,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
"scalar_input_index", 1)),
eltwise_delegator_(delegator::Eltwise::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, kCpuImplType),
delegator::EltwiseParam(
static_cast<ops::EltwiseType>(
Operation::GetOptionalArg<int>(
......@@ -1175,8 +1175,8 @@ class EltwiseOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterEltwise(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::CPU, int32_t);
......
......@@ -57,6 +57,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
void RegisterExpandDims(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, int32_t);
......
......@@ -89,15 +89,16 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
output_shape[dim_size - 2] = output_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float));
const index_t extract_out_size = PadAlignSize(output_dim * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(extract_out_size);
Tensor extract_out(scratch->Scratch(extract_out_size), DT_FLOAT);
Tensor extract_out(
scratch->Scratch(extract_out_size), DataTypeToEnum<T>::v());
extract_out.Reshape({1, output_dim});
extract_out.Clear();
float *extract_out_data = extract_out.mutable_data<float>();
T *extract_out_data = extract_out.mutable_data<T>();
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output);
......@@ -162,7 +163,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
}, 0, input_dim, 1);
}
memcpy(output_data + (b * output_chunk + i) * output_dim,
extract_out_data, output_dim * sizeof(float));
extract_out_data, output_dim * sizeof(T));
}
}
......@@ -180,6 +181,8 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
// Registers the "ExtractPooling" op's CPU kernels: float unconditionally and
// a bfloat16 variant via MACE_REGISTER_BF16_OP (presumably compiled out when
// MACE_ENABLE_BFLOAT16 is not defined — see the CMake option; confirm in the
// macro definition).
void RegisterExtractPooling(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ExtractPooling", ExtractPoolingOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ExtractPooling", ExtractPoolingOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -22,8 +22,8 @@ namespace ops {
template <DeviceType D, class T>
class FillOp;
template <>
class FillOp<DeviceType::CPU, float> : public Operation {
template <class T>
class FillOp<DeviceType::CPU, T> : public Operation {
public:
explicit FillOp(OpConstructContext *context)
: Operation(context) {}
......@@ -46,11 +46,11 @@ class FillOp<DeviceType::CPU, float> : public Operation {
}
Tensor::MappingGuard value_guard(value);
const float *value_data = value->data<float>();
const T *value_data = value->data<T>();
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard output_guard(output);
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
std::fill(output_data, output_data + output->size(), *value_data);
......@@ -65,6 +65,7 @@ class FillOp<DeviceType::CPU, float> : public Operation {
// Registers the "Fill" op for CPU in float, plus a bfloat16 registration
// (expected to be a no-op unless bf16 support is compiled in — verify against
// MACE_REGISTER_BF16_OP's definition).
void RegisterFill(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Fill", FillOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Fill", FillOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -56,20 +56,20 @@ class FullyConnectedOpBase : public Operation {
template<DeviceType D, class T>
class FullyConnectedOp;
template<>
class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase {
template<class T>
class FullyConnectedOp<DeviceType::CPU, T> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context),
activation_delegator_(delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType),
delegator::ActivationParam(activation_,
relux_max_limit_,
leakyrelu_coefficient_))),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -127,7 +127,7 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
: FullyConnectedOpBase(context),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -226,6 +226,8 @@ class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
void RegisterFullyConnected(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "FullyConnected",
......
......@@ -89,6 +89,7 @@ class GatherOp : public Operation {
void RegisterGather(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Gather", GatherOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
......
......@@ -36,6 +36,8 @@ class IdentityOp : public Operation {
void RegisterIdentity(OpRegistry *op_registry) {
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
......
......@@ -166,6 +166,7 @@ class IfDefinedOp<DeviceType::CPU, T> : public Operation {
// Registers the Kaldi-style "IfDefined" op on CPU: float always, bfloat16
// through the BF16 registration macro (gating on MACE_ENABLE_BFLOAT16 is
// assumed from the build option — confirm).
void RegisterIfDefined(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "IfDefined", IfDefinedOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -105,6 +105,8 @@ class InferConv2dShapeOp : public Operation {
void RegisterInferConv2dShape(OpRegistry *op_registry) {
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
......
......@@ -28,8 +28,8 @@ namespace ops {
template <DeviceType D, class T>
class KaldiBatchNormOp;
template <>
class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
template <class T>
class KaldiBatchNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit KaldiBatchNormOp(OpConstructContext *context)
: Operation(context),
......@@ -40,13 +40,13 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
test_mode_(static_cast<bool>(
Operation::GetOptionalArg<int>("test_mode", 0))) {}
void CalculateMeanVar(const float *input_data,
void CalculateMeanVar(const T *input_data,
index_t length,
index_t stride,
float mean_scale,
float var_scale,
float *mean_data,
float *var_data) {
T *mean_data,
T *var_data) {
float mean_value = 0.f;
float var_value = 0.f;
for (index_t i = 0; i < length; ++i) {
......@@ -84,8 +84,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
......@@ -102,8 +102,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
&& scale->size() == block_dim_);
Tensor::MappingGuard scale_guard(scale);
Tensor::MappingGuard offset_guard(offset);
const float *scale_data = scale->data<float>();
const float *offset_data = offset->data<float>();
const T *scale_data = scale->data<T>();
const T *offset_data = offset->data<T>();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
......@@ -116,18 +116,18 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, num_rows, 1, 0, block_dim_, 1);
} else {
const index_t buf_size =
PadAlignSize(block_dim_ * sizeof(float));
PadAlignSize(block_dim_ * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(2 * buf_size);
Tensor mean(scratch->Scratch(buf_size), DT_FLOAT);
Tensor mean(scratch->Scratch(buf_size), DataTypeToEnum<T>::v());
mean.Reshape({block_dim_});
float *mean_data = mean.mutable_data<float>();
T *mean_data = mean.mutable_data<T>();
Tensor var(scratch->Scratch(buf_size), DT_FLOAT);
Tensor var(scratch->Scratch(buf_size), DataTypeToEnum<T>::v());
var.Reshape({block_dim_});
float *var_data = var.mutable_data<float>();
T *var_data = var.mutable_data<T>();
float var_scale = 1.0f / (target_rms_ * target_rms_);
float mean_scale = 1.0f / num_rows;
......@@ -171,6 +171,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
// Registers the "KaldiBatchNorm" CPU kernels — float plus the (conditionally
// compiled) bfloat16 variant.
void RegisterKaldiBatchNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -24,8 +24,8 @@ namespace ops {
template<DeviceType D, class T>
class LocalResponseNormOp;
template<>
class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class LocalResponseNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit LocalResponseNormOp(OpConstructContext *context)
: Operation(context),
......@@ -49,8 +49,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
const index_t image_size = height * width;
const index_t batch_size = channels * image_size;
......@@ -95,6 +95,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
// Registers the "LocalResponseNorm" CPU kernels — float plus the
// (conditionally compiled) bfloat16 variant.
void RegisterLocalResponseNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LocalResponseNorm",
LocalResponseNormOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LocalResponseNorm",
LocalResponseNormOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -35,8 +35,8 @@ namespace ops {
template<DeviceType D, typename T>
class LpNormOp;
template<>
class LpNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class LpNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit LpNormOp(OpConstructContext *context)
: Operation(context),
......@@ -59,8 +59,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output);
const auto *input_data = input->data<float>();
auto *output_data = output->mutable_data<float>();
const auto *input_data = input->data<T>();
auto *output_data = output->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
auto outer_loop = std::accumulate(input_shape.begin(),
......@@ -95,7 +95,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) {
auto output_data_base = output_data + inner_loop * i;
norm_ptr[i] = std::accumulate(output_data_base,
output_data_base + inner_loop, 0.0f);
output_data_base + inner_loop,
static_cast<T>(0.0f));
norm_ptr[i] = std::pow(norm_ptr[i], power);
norm_ptr[i] += 1e-6;
}
......@@ -151,6 +152,8 @@ class LpNormOp<DeviceType::GPU, float> : public Operation {
// Registers "LpNorm": CPU float, CPU bfloat16 (conditionally compiled), and
// the GPU kernel via MACE_REGISTER_GPU_OP.
void RegisterLpNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LpNorm", LpNormOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp);
}
......
......@@ -70,18 +70,18 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard params_guard(params);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<T>();
const float *params_data = params->data<T>();
float *output_data = output->mutable_data<T>();
const T *input_data = input->data<T>();
const T *params_data = params->data<T>();
T *output_data = output->mutable_data<T>();
for (int r = 0; r < num_rows; ++r) {
const float *input_row = input_data + r * input_cols;
const float *prev_row = input_row + 4 * cell_dim;
const float *scale_data =
const T *input_row = input_data + r * input_cols;
const T *prev_row = input_row + 4 * cell_dim;
const T *scale_data =
embed_scales ? prev_row + cell_dim : nullptr;
float *output_cell = output_data + r * output_dim;
float *output_row = output_cell + cell_dim;
LSTMNonlinearKernel(context,
T *output_cell = output_data + r * output_dim;
T *output_row = output_cell + cell_dim;
LSTMNonlinearKernel<T>(context,
input_row,
prev_row,
scale_data,
......@@ -104,6 +104,8 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
// Registers the "LSTMNonlinear" CPU kernels — float plus the (conditionally
// compiled) bfloat16 variant.
void RegisterLSTMNonlinear(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -92,18 +92,18 @@ class MatMulOpBase : public Operation {
template<DeviceType D, class T>
class MatMulOp;
template<>
class MatMulOp<CPU, float> : public MatMulOpBase {
template<class T>
class MatMulOp<CPU, T> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context),
gemm_(delegator::Gemm::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemm, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, T, kCpuImplType),
delegator::GemmParam())),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -197,8 +197,8 @@ class MatMulOp<CPU, float> : public MatMulOpBase {
"bias' dim should be <= 2.");
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard c_guard(C);
const float *bias_data = bias->data<float>();
float *c_data = C->mutable_data<float>();
const T *bias_data = bias->data<T>();
T *c_data = C->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
......@@ -599,6 +599,8 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
void RegisterMatMul(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
......@@ -30,8 +30,8 @@ namespace ops {
template<DeviceType D, typename T>
class MVNormOp;
template<>
class MVNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class MVNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit MVNormOp(OpConstructContext *context)
: Operation(context),
......@@ -52,8 +52,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output);
const auto *input_data = input->data<float>();
auto *output_data = output->mutable_data<float>();
const auto *input_data = input->data<T>();
auto *output_data = output->mutable_data<T>();
const auto input_size = input->size();
const auto outer_loop =
......@@ -71,7 +71,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) {
const auto offset = inner_loop * i;
mean_ptr[i] = std::accumulate(input_data + offset,
input_data + offset + inner_loop, 0.0f);
input_data + offset + inner_loop,
static_cast<T>(0.0f));
mean_ptr[i] /= inner_loop;
}
}, 0, outer_loop, 1);
......@@ -105,7 +106,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) {
auto output_data_base = output_data + inner_loop * i;
mean_v_ptr[i] = std::accumulate(output_data_base,
output_data_base + inner_loop, 0.0f);
output_data_base + inner_loop,
static_cast<T>(0.0f));
mean_v_ptr[i] = std::pow(mean_v_ptr[i] / inner_loop, 0.5f) + eps_;
}
}, 0, outer_loop, 1);
......@@ -169,6 +171,8 @@ class MVNormOp<DeviceType::GPU, float> : public Operation {
// Registers "MVNorm" (mean-variance normalization): CPU float, CPU bfloat16
// (conditionally compiled), and the GPU kernel.
void RegisterMVNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "MVNorm", MVNormOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp);
}
......
......@@ -39,10 +39,10 @@ class OneHotOpBase : public Operation {
int axis_;
};
template <DeviceType D, typename T>
template<DeviceType D, typename T>
class OneHotOp;
template <typename T>
template<typename T>
class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
public:
explicit OneHotOp(OpConstructContext *context) : OneHotOpBase(context) {}
......@@ -81,15 +81,17 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (axis == 1) {
for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < depth_; ++j) {
output_ptr[i * depth_ + j] = input_ptr[i] == j ? on_value_ :
off_value_;
float input_value = input_ptr[i];
output_ptr[i * depth_ + j] =
input_value == j ? on_value_ : off_value_;
}
}
} else {
for (index_t i = 0; i < depth_; ++i) {
for (index_t j = 0; j < batch; ++j) {
output_ptr[i * batch + j] = input_ptr[j] == i ? on_value_ :
off_value_;
float input_value = input_ptr[j];
output_ptr[i * batch + j] =
input_value == i ? on_value_ : off_value_;
}
}
}
......@@ -110,7 +112,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (left == 0) {
for (index_t i = 0; i < length; ++i) {
**output_ptr = **input_ptr == i ? on_value_ : off_value_;
float input_value = **input_ptr;
**output_ptr = input_value == i ? on_value_ : off_value_;
++(*output_ptr);
}
......@@ -130,7 +133,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (left == 0) {
for (index_t i = 0; i < length; ++i) {
**output_ptr = **input_ptr == test ? on_value_ : off_value_;
float input_value = **input_ptr;
**output_ptr = input_value == test ? on_value_ : off_value_;
++(*output_ptr);
++(*input_ptr);
}
......@@ -144,9 +148,9 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
}
};
// Registers the "OneHot" CPU kernels — float plus the (conditionally
// compiled) bfloat16 variant.
void RegisterOneHot(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -200,8 +200,8 @@ class PadOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterPad(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Pad", PadOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp);
}
......
......@@ -87,6 +87,8 @@ class PadContextOp<DeviceType::CPU, T> : public Operation {
// Registers the "PadContext" CPU kernels — float plus the (conditionally
// compiled) bfloat16 variant.
void RegisterPadContext(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -80,7 +80,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_;
T temp_result = 0;
T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) {
T value =
(std::fabs(in_base[g])
......@@ -99,9 +99,9 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_;
T temp_result = 0;
T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) {
temp_result += std::abs(in_base[g]);;
temp_result += std::abs(in_base[g]);
}
out_base[j] = temp_result;
}
......@@ -114,7 +114,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_;
T temp_result = 0;
T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) {
temp_result += in_base[g] * in_base[g];
}
......@@ -136,6 +136,8 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
// Registers the "PNorm" CPU kernels — float plus the (conditionally
// compiled) bfloat16 variant.
void RegisterPNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PNorm", PNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PNorm", PNormOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -60,8 +60,8 @@ class PoolingOpBase : public ConvPool2dOpBase {
template<DeviceType D, class T>
class PoolingOp;
template<>
class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
template<class T>
class PoolingOp<DeviceType::CPU, T> : public PoolingOpBase {
public:
explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) {}
......@@ -93,8 +93,8 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data();
int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
......@@ -127,14 +127,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
private:
void MaxPooling(const OpContext *context,
const float *input,
const T *input,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
T *output) {
const index_t batch = out_shape[0];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
......@@ -184,14 +184,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
}
void AvgPooling(const OpContext *context,
const float *input,
const T *input,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
T *output) {
const index_t batch = out_shape[0];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
......@@ -514,6 +514,8 @@ class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase {
void RegisterPooling(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Pooling", PoolingOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
......
......@@ -148,6 +148,8 @@ class PriorBoxOp : public Operation {
// Registers the "PriorBox" CPU kernels — float plus the (conditionally
// compiled) bfloat16 variant.
void RegisterPriorBox(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PriorBox", PriorBoxOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PriorBox", PriorBoxOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -145,7 +145,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
MACE_UNUSED(context);
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
T tmp = 0;
T tmp = 0.f;
for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i];
}
......@@ -169,7 +169,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
output[0] = tmp;
} else if (type == ReduceType::SUM) {
T tmp = 0;
T tmp = 0.f;
for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i];
}
......@@ -193,7 +193,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (type == ReduceType::MEAN) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i];
}
......@@ -225,7 +225,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
} else if (type == ReduceType::SUM) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i];
}
......@@ -239,7 +239,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (type == ReduceType::MEAN) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j];
}
......@@ -271,7 +271,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
} else if (type == ReduceType::SUM) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j];
}
......@@ -335,9 +335,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
T tmp = 1;
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
tmp *=
input[(k * data_reshape_[1] + i) * data_reshape_[2]
+ j];
tmp *= input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
}
}
output[i] = tmp;
......@@ -1036,6 +1034,8 @@ class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase {
void RegisterReduce(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, int);
#ifdef MACE_ENABLE_QUANTIZE
......
......@@ -20,6 +20,7 @@ namespace mace {
namespace ops {
namespace ref {
template<typename T>
class Activation : public delegator::Activation {
public:
explicit Activation(const delegator::ActivationParam &param)
......@@ -34,7 +35,8 @@ class Activation : public delegator::Activation {
Tensor *output);
};
MaceStatus Activation::Compute(const OpContext *context,
template<typename T>
MaceStatus Activation<T>::Compute(const OpContext *context,
const Tensor *input,
Tensor *output) {
Tensor::MappingGuard input_guard(input);
......@@ -49,12 +51,13 @@ MaceStatus Activation::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
void Activation::DoActivation(const OpContext *context,
template<typename T>
void Activation<T>::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_UNUSED(context);
auto input_ptr = input->data<float>();
auto output_ptr = output->mutable_data<float>();
auto input_ptr = input->data<T>();
auto output_ptr = output->mutable_data<T>();
const index_t size = input->size();
switch (type_) {
......@@ -77,7 +80,7 @@ void Activation::DoActivation(const OpContext *context,
case LEAKYRELU: {
for (index_t i = 0; i < size; ++i) {
*output_ptr =
std::max(*input_ptr, 0.f)
std::max<float>(*input_ptr, 0.f)
+ std::min(*input_ptr, 0.f) * leakyrelu_coefficient_;
++input_ptr;
++output_ptr;
......@@ -107,8 +110,14 @@ void Activation::DoActivation(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, CPU, float, REF))
// Registers the reference (ImplType::REF) CPU Activation delegator under its
// float key, and the BFloat16 instantiation via the BF16 macro (assumed to be
// gated on MACE_ENABLE_BFLOAT16 — confirm in the macro definition).
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation<float>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Activation<BFloat16>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
......@@ -18,6 +18,7 @@ namespace mace {
namespace ops {
namespace ref {
template<typename T>
class BiasAdd : public delegator::BiasAdd {
public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
......@@ -31,7 +32,8 @@ class BiasAdd : public delegator::BiasAdd {
const Tensor *bias, Tensor *output);
};
MaceStatus BiasAdd::Compute(const OpContext *context,
template<typename T>
MaceStatus BiasAdd<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
......@@ -54,14 +56,15 @@ MaceStatus BiasAdd::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
void BiasAdd::AddBias(const OpContext *context,
template<typename T>
void BiasAdd<T>::AddBias(const OpContext *context,
const Tensor *input,
const Tensor *bias,
mace::Tensor *output) {
MACE_UNUSED(context);
auto input_data = input->data<float>();
auto bias_data = bias->data<float>();
auto output_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto bias_data = bias->data<T>();
auto output_data = output->mutable_data<T>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
......@@ -84,8 +87,14 @@ void BiasAdd::AddBias(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, REF))
// Registers the reference (ImplType::REF) CPU BiasAdd delegator for float,
// plus the BFloat16 instantiation through the conditionally compiled BF16
// macro.
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd<float>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, BiasAdd<BFloat16>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
......@@ -12,16 +12,29 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ref/conv_2d.h"
#include <vector>
#include "mace/ops/delegator/conv_2d.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus Conv2d<float>::Compute(const OpContext *context,
// Reference (non-vectorized) CPU Conv2d delegator, templated on the tensor
// element type so the same code serves float and BFloat16. The Compute
// definition follows this declaration in the same translation unit.
template<typename T>
class Conv2d : public delegator::Conv2d {
 public:
  // Forwards strides/dilations/padding configuration to the delegator base.
  explicit Conv2d(const delegator::Conv2dParam &param)
      : delegator::Conv2d(param) {}
  // Defaulted instead of a user-provided empty body
  // (clang-tidy modernize-use-equals-default); destruction happens through
  // the delegator base — presumably it has a virtual dtor, confirm there.
  ~Conv2d() = default;

  // Computes output = conv2d(input, filter) per the configured
  // strides/dilations/padding.
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};
template<typename T>
MaceStatus Conv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
......@@ -62,9 +75,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto output_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto output_data = output->mutable_data<T>();
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; ++m) {
......@@ -74,7 +87,7 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const index_t out_width = out_shape[3];
const index_t in_channels = filter_shape[1];
float *out_ptr_base =
T *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size;
for (index_t h = 0; h < out_height; ++h) {
......@@ -82,9 +95,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
float sum = 0;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
const T *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr =
const T *filter_ptr =
filter_data + m * in_channels * filter_size + c * filter_size;
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
......@@ -94,7 +107,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const index_t
iw = -pad_left + w * strides_[1] + kw * dilations_[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw];
float input_value = in_ptr_base[ih * in_width + iw];
float filter_value = filter_ptr[kw];
sum += input_value * filter_value;
}
} // kw
filter_ptr += filter_shape[3];
......@@ -109,9 +124,14 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef Conv2d<float> Conv2dRef;
MACE_REGISTER_DELEGATOR(registry, Conv2dRef, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, REF, General))
// Registers the reference (ImplType::REF) CPU Conv2d delegator for float,
// plus the BFloat16 instantiation through the conditionally compiled BF16
// macro.
void RegisterConv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2d<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2d<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_CONV_2D_H_
#define MACE_OPS_REF_CONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference (ImplType::REF) CPU 2-D convolution delegator, parameterized by
// the tensor element type (e.g. float, BFloat16).  Strides, dilations and
// padding come from the delegator::Conv2dParam passed at construction.
template<typename OUTPUT_TYPE>
class Conv2d : public delegator::Conv2d {
 public:
  explicit Conv2d(const delegator::Conv2dParam &param)
      : delegator::Conv2d(param) {}
  ~Conv2d() {}

  // Computes output = conv2d(input, filter).
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

// Full specialization for float; same interface, with the implementation
// provided out-of-line (in the corresponding .cc file).
template<>
class Conv2d<float> : public delegator::Conv2d {
 public:
  explicit Conv2d(const delegator::Conv2dParam &param)
      : delegator::Conv2d(param) {}
  ~Conv2d() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_CONV_2D_H_
......@@ -16,14 +16,32 @@
#include <memory>
#include <functional>
#include <vector>
#include "mace/ops/ref/deconv_2d.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus Deconv2d<float>::Compute(const OpContext *context,
// Reference CPU deconvolution (transposed convolution) delegator, templated
// on the tensor element type T (float or BFloat16).  Declared locally in the
// .cc file: only the instantiations registered below are used.
template<typename T>
class Deconv2d : public delegator::Deconv2d {
 public:
  explicit Deconv2d(const delegator::Deconv2dParam &param)
      : delegator::Deconv2d(param) {}
  ~Deconv2d() = default;

  // output_shape is an extra tensor input; its exact role (requested output
  // dims vs. unused) depends on the framework type — see the implementation.
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};
template<typename T>
MaceStatus Deconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
......@@ -65,15 +83,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor>
padded_out
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out);
}
......@@ -88,10 +105,10 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto pad_out_data = out_tensor->mutable_data<float>();
auto out_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape();
......@@ -122,7 +139,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
for (index_t b = 0; b < batch; ++b) {
for (index_t oc = 0; oc < out_channels; ++oc) {
float *out_base =
T *out_base =
pad_out_data + (b * out_channels + oc) * out_img_size;
for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) {
......@@ -148,13 +165,13 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < out_channels; ++j) {
for (index_t k = 0; k < out_height; ++k) {
const float *input_base =
const T *input_base =
pad_out_data
+ ((i * out_channels + j) * pad_out_height + (k + pad_top))
* pad_out_width;
float *output_base =
T *output_base =
out_data + ((i * out_channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float));
memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
}
}
}
......@@ -162,10 +179,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef Deconv2d<float> Deconv2dRef;
MACE_REGISTER_DELEGATOR(
registry, Deconv2dRef, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, REF, General))
// Registers the reference CPU Deconv2d delegators: float unconditionally,
// BFloat16 via MACE_REGISTER_BF16_DELEGATOR (presumably conditional on the
// bfloat16 build flag — confirm in the macro definition).
void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2d<float>, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::REF));
  MACE_REGISTER_BF16_DELEGATOR(
      registry, Deconv2d<BFloat16>, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_DECONV_2D_H_
#define MACE_OPS_REF_DECONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference CPU deconvolution (transposed convolution) delegator,
// parameterized by the tensor element type.
template<typename OUTPUT_TYPE>
class Deconv2d : public delegator::Deconv2d {
 public:
  explicit Deconv2d(const delegator::Deconv2dParam &param)
      : delegator::Deconv2d(param) {}
  ~Deconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

// Full specialization for float; implementation lives in the .cc file.
template<>
class Deconv2d<float> : public delegator::Deconv2d {
 public:
  explicit Deconv2d(const delegator::Deconv2dParam &param)
      : delegator::Deconv2d(param) {}
  ~Deconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_DECONV_2D_H_
......@@ -12,16 +12,29 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ref/depthwise_conv_2d.h"
#include <vector>
#include "mace/ops/delegator/depthwise_conv_2d.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
// Reference CPU depthwise 2-D convolution delegator, templated on the tensor
// element type T (float or BFloat16).  Declared locally in the .cc file;
// only the registered instantiations are used.
template<typename T>
class DepthwiseConv2d : public delegator::DepthwiseConv2d {
 public:
  explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
      : delegator::DepthwiseConv2d(param) {}
  ~DepthwiseConv2d() {}

  // Computes output = depthwise_conv2d(input, filter).
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};
template<typename T>
MaceStatus DepthwiseConv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
......@@ -65,9 +78,9 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto output_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto output_data = output->mutable_data<T>();
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < out_shape[1]; ++m) {
......@@ -80,16 +93,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
float *out_ptr_base =
T *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
float sum = 0;
const float *in_ptr_base =
const T *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr =
const T *filter_ptr =
filter_data + multi_index * in_channels * filter_size
+ c * filter_size;
......@@ -115,10 +128,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef DepthwiseConv2d<float> DepthwiseConv2dRef;
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dRef, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, REF, General))
// Registers the reference CPU DepthwiseConv2d delegators: float always,
// BFloat16 via MACE_REGISTER_BF16_DELEGATOR (presumably gated on the
// bfloat16 build flag — confirm in the macro definition).
void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, DepthwiseConv2d<float>, delegator::DepthwiseConv2dParam,
      MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
                         float, ImplType::REF));
  MACE_REGISTER_BF16_DELEGATOR(
      registry, DepthwiseConv2d<BFloat16>, delegator::DepthwiseConv2dParam,
      MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
                         BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
#define MACE_OPS_REF_DEPTHWISE_CONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference CPU depthwise 2-D convolution delegator, parameterized by the
// tensor element type.
template<typename OUTPUT_TYPE>
class DepthwiseConv2d : public delegator::DepthwiseConv2d {
 public:
  explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
      : delegator::DepthwiseConv2d(param) {}
  ~DepthwiseConv2d() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

// Full specialization for float; implementation lives in the .cc file.
template<>
class DepthwiseConv2d<float> : public delegator::DepthwiseConv2d {
 public:
  explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
      : delegator::DepthwiseConv2d(param) {}
  ~DepthwiseConv2d() {}

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
......@@ -15,14 +15,48 @@
#include <utility>
#include <memory>
#include <functional>
#include "mace/ops/ref/depthwise_deconv_2d.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
// Reference CPU grouped deconvolution (transposed convolution) delegator,
// templated on the tensor element type T.  Serves as the base class for the
// depthwise variant below; declared locally in the .cc file.
template<typename T>
class GroupDeconv2d : public delegator::GroupDeconv2d {
 public:
  explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
      : delegator::GroupDeconv2d(param) {}
  // Virtual: DepthwiseDeconv2d<T> derives from this class.
  virtual ~GroupDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};
// Reference CPU depthwise deconvolution delegator; a special case of
// GroupDeconv2d<T> with one filter per channel.
//
// Fix: the constructor and destructor were declared with a template-id
// (DepthwiseDeconv2d<T>(...)/~DepthwiseDeconv2d<T>()).  Constructors and
// destructors must be named with the injected-class-name; the template-id
// form is ill-formed in C++20 (P1787) and rejected or warned on by modern
// compilers.  Behavior is otherwise unchanged.
template<typename T>
class DepthwiseDeconv2d : public GroupDeconv2d<T> {
 public:
  explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
      : GroupDeconv2d<T>(param) {}
  ~DepthwiseDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};
template<typename T>
MaceStatus DepthwiseDeconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
......@@ -41,15 +75,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
std::vector<int> out_pad_size;
CalDeconvOutputShapeAndPadSize(input->shape(),
filter->shape(),
strides_,
padding_type_,
paddings_,
GroupDeconv2d<T>::strides_,
GroupDeconv2d<T>::padding_type_,
GroupDeconv2d<T>::paddings_,
input->dim(1),
&out_shape,
nullptr,
&out_pad_size,
&padded_out_shape,
framework_type_,
GroupDeconv2d<T>::framework_type_,
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......@@ -64,15 +98,14 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor>
padded_out
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out);
}
......@@ -87,10 +120,10 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto pad_out_data = out_tensor->mutable_data<float>();
auto out_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape();
......@@ -119,15 +152,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
float *out_base =
T *out_base =
pad_out_data + (b * channels + c) * out_img_size;
for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) {
const index_t out_offset =
i * strides_[0] * pad_out_width + j * strides_[1];
const index_t out_offset = i * GroupDeconv2d<T>::strides_[0] *
pad_out_width + j * GroupDeconv2d<T>::strides_[1];
const index_t input_idx =
(b * channels + c) * in_img_size + i * in_width + j;
const float val = input_data[input_idx];
const T val = input_data[input_idx];
const index_t kernel_offset = c * kernel_size;
for (int k = 0; k < kernel_size; ++k) {
const index_t out_idx = out_offset + index_map[k];
......@@ -143,13 +176,13 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < channels; ++j) {
for (index_t k = 0; k < out_height; ++k) {
const float *input_base =
const T *input_base =
pad_out_data
+ ((i * channels + j) * pad_out_height + (k + pad_top))
* pad_out_width;
float *output_base =
T *output_base =
out_data + ((i * channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float));
memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
}
}
}
......@@ -157,7 +190,8 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
template<typename T>
MaceStatus GroupDeconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
......@@ -199,15 +233,14 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor>
padded_out
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out);
}
......@@ -222,10 +255,10 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto pad_out_data = out_tensor->mutable_data<float>();
auto out_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape();
......@@ -288,13 +321,13 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < out_channels; ++j) {
for (int k = 0; k < out_height; ++k) {
const float *input_base =
const T *input_base =
pad_out_data
+ ((i * out_channels + j) * pad_out_height + (k + pad_top))
* pad_out_width;
float *output_base =
T *output_base =
out_data + ((i * out_channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float));
memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
}
}
}
......@@ -302,10 +335,16 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef DepthwiseDeconv2d<float> DepthwiseDeconv2dRef;
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dRef, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, REF, General))
// Registers the reference CPU DepthwiseDeconv2d delegators: float always,
// BFloat16 via MACE_REGISTER_BF16_DELEGATOR (presumably gated on the
// bfloat16 build flag — confirm in the macro definition).
void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, DepthwiseDeconv2d<float>, delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
                         float, ImplType::REF));
  MACE_REGISTER_BF16_DELEGATOR(
      registry, DepthwiseDeconv2d<BFloat16>, delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
                         BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
#define MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference CPU grouped deconvolution (transposed convolution) delegator,
// parameterized by the tensor element type.
template<typename OUTPUT_TYPE>
class GroupDeconv2d : public delegator::GroupDeconv2d {
 public:
  explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
      : delegator::GroupDeconv2d(param) {}
  // Virtual: DepthwiseDeconv2d<OUTPUT_TYPE> derives from this class.
  virtual ~GroupDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

// Depthwise variant of GroupDeconv2d (one filter per channel).
//
// Fix: the constructor previously took `const delegator::DepthwiseDeconv2d &`
// — the delegator interface class itself — instead of the parameter struct
// `delegator::DepthwiseDeconv2dParam`.  That is inconsistent with the float
// specialization below and cannot satisfy the GroupDeconv2d<OUTPUT_TYPE>
// base-class constructor the initializer forwards to, so any non-float
// instantiation would fail to compile.
template<typename OUTPUT_TYPE>
class DepthwiseDeconv2d : public GroupDeconv2d<OUTPUT_TYPE> {
 public:
  explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
      : GroupDeconv2d<OUTPUT_TYPE>(param) {}
  ~DepthwiseDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

// Full specialization for float; implementation lives in the .cc file.
template<>
class GroupDeconv2d<float> : public delegator::GroupDeconv2d {
 public:
  explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
      : delegator::GroupDeconv2d(param) {}
  virtual ~GroupDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

// Depthwise float specialization; implementation lives in the .cc file.
template<>
class DepthwiseDeconv2d<float> : public GroupDeconv2d<float> {
 public:
  explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
      : GroupDeconv2d(param) {}
  ~DepthwiseDeconv2d() = default;

  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      const Tensor *output_shape,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
......@@ -12,14 +12,50 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ref/gemm.h"
#include "mace/ops/delegator/gemm.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus Gemm<float>::Compute(const OpContext *context,
// Reference CPU GEMM delegator templated on the tensor element type T
// (float or BFloat16).  Declared locally in the .cc file; only the
// registered instantiations are used.
template<typename T>
class Gemm : public delegator::Gemm {
 public:
  explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
  ~Gemm() {}

  // Computes output = lhs * rhs for `batch` independent matrix products.
  // lhs is rows x depth, rhs is depth x cols; each operand's storage order
  // is given by its MatrixMajor.  lhs_batched / rhs_batched indicate whether
  // the corresponding operand has a batch dimension or is shared (broadcast)
  // across the batch.
  MaceStatus Compute(const OpContext *context,
                     const Tensor *lhs,
                     const Tensor *rhs,
                     const index_t batch,
                     const index_t rows,
                     const index_t cols,
                     const index_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     Tensor *output) override;

  // Original matrix before transpose has row-major.
  // Convenience overload taking raw row/col extents plus transpose flags;
  // it derives majors/dims and forwards to the overload above.
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *lhs,
      const Tensor *rhs,
      const index_t batch,
      const index_t lhs_rows,
      const index_t lhs_cols,
      const index_t rhs_rows,
      const index_t rhs_cols,
      const bool transpose_lhs,
      const bool transpose_rhs,
      const bool transpose_out,
      const bool lhs_batched,
      const bool rhs_batched,
      Tensor *output) override;
};
template<typename T>
MaceStatus Gemm<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
......@@ -37,31 +73,32 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
float *output_data = output->mutable_data<float>();
const T *lhs_data = lhs->data<T>();
const T *rhs_data = rhs->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
MatrixMap<const float>
MatrixMap<const T>
lhs_matrix
(lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
lhs_major,
rows,
depth);
MatrixMap<const float>
MatrixMap<const T>
rhs_matrix
(rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
rhs_major,
depth,
cols);
MatrixMap<float>
MatrixMap<T>
output_matrix(output_data + b * rows * cols, output_major, rows, cols);
for (index_t r = 0; r < rows; ++r) {
for (index_t c = 0; c < cols; ++c) {
float sum = 0;
for (index_t d = 0; d < depth; ++d) {
sum += lhs_matrix(r, d) * rhs_matrix(d, c);
sum += static_cast<float>(lhs_matrix(r, d)) *
static_cast<float>(rhs_matrix(d, c));
} // d
*output_matrix.data(r, c) = sum;
......@@ -72,7 +109,8 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Gemm<float>::Compute(const OpContext *context,
template<typename T>
MaceStatus Gemm<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
......@@ -96,7 +134,7 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
" vs. ",
depth2);
return Compute(context,
return Gemm<T>::Compute(context,
lhs,
rhs,
batch,
......@@ -111,9 +149,14 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
output);
}
typedef Gemm<float> GemmRef;
MACE_REGISTER_DELEGATOR(registry, GemmRef, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, CPU, float, REF))
// Registers the reference CPU Gemm delegators: float always, BFloat16 via
// MACE_REGISTER_BF16_DELEGATOR (presumably gated on the bfloat16 build
// flag — confirm in the macro definition).
void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemm<float>, delegator::GemmParam,
      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF));
  MACE_REGISTER_BF16_DELEGATOR(
      registry, Gemm<BFloat16>, delegator::GemmParam,
      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_REF_GEMM_H_
#define MACE_OPS_REF_GEMM_H_

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/matrix.h"
#include "mace/ops/delegator/gemm.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace ref {

// Reference CPU GEMM delegator parameterized by the tensor element type.
template<typename OUTPUT_TYPE>
class Gemm : public delegator::Gemm {
 public:
  explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
  ~Gemm() {}

  // Batched matrix multiply: lhs (rows x depth) * rhs (depth x cols) with
  // explicit storage orders; *_batched indicates whether each operand has a
  // batch dimension or is broadcast across the batch.
  MaceStatus Compute(const OpContext *context,
                     const Tensor *lhs,
                     const Tensor *rhs,
                     const index_t batch,
                     const index_t rows,
                     const index_t cols,
                     const index_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     Tensor *output) override;
};

// Full specialization for float; implementation lives in the .cc file.
template<>
class Gemm<float> : public delegator::Gemm {
 public:
  explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
  ~Gemm() {}

  MaceStatus Compute(const OpContext *context,
                     const Tensor *lhs,
                     const Tensor *rhs,
                     const index_t batch,
                     const index_t rows,
                     const index_t cols,
                     const index_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     Tensor *output) override;

  // Original matrix before transpose has row-major
  // Convenience overload taking raw extents plus transpose flags.
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *lhs,
      const Tensor *rhs,
      const index_t batch,
      const index_t lhs_rows,
      const index_t lhs_cols,
      const index_t rhs_rows,
      const index_t rhs_cols,
      const bool transpose_lhs,
      const bool transpose_rhs,
      const bool transpose_out,
      const bool lhs_batched,
      const bool rhs_batched,
      Tensor *output) override;
};

}  // namespace ref
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_REF_GEMM_H_
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "mace/ops/ref/gemv.h"
#include "mace/ops/delegator/gemv.h"
#if defined(MACE_ENABLE_QUANTIZE)
#include "mace/core/quantize.h"
......@@ -23,7 +23,14 @@ namespace mace {
namespace ops {
namespace ref {
MaceStatus Gemv<float>::Compute(const OpContext *context,
template<typename T>
class Gemv : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
......@@ -32,90 +39,11 @@ MaceStatus Gemv<float>::Compute(const OpContext *context,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
const float *bias_data = nullptr;
if (bias) {
bias_data = bias->data<float>();
}
float *output_data = output->mutable_data<float>();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
float sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w]
* rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w];
} // w
output_data[b * lhs_height + h] = sum;
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
#if defined(MACE_ENABLE_QUANTIZE)
MaceStatus Gemv<uint8_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor *output) override;
};
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
uint8_t *output_data = output->mutable_data<uint8_t>();
MACE_CHECK(output->scale() > 0, "output scale must not be zero");
const float
output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] =
Saturate<uint8_t>(std::roundf(sum * output_multiplier_float));
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
template<typename T>
MaceStatus Gemv<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
......@@ -131,43 +59,40 @@ MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
const T *lhs_data = lhs->data<T>();
const T *rhs_data = rhs->data<T>();
const T *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
bias_data = bias->data<T>();
}
int32_t *output_data = output->mutable_data<int32_t>();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
float sum = bias ? static_cast<float>(bias_data[h]) : 0.f;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
sum += lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
+ h * lhs_width + w]
* rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w];
} // w
output_data[b * lhs_height + h] = sum;
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
typedef Gemv<uint8_t> GemvUint8Ref;
MACE_REGISTER_DELEGATOR(registry, GemvUint8Ref, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, Ref))
#endif // MACE_ENABLE_QUANTIZE
typedef Gemv<float> GemvRef;
MACE_REGISTER_DELEGATOR(registry, GemvRef, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, float, REF))
// Registers the reference (portable, non-NEON) CPU Gemv delegators.
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
  // Plain float reference implementation.
  MACE_REGISTER_DELEGATOR(
      registry, Gemv<float>, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::REF));
  // BFloat16 variant of the same templated kernel; NOTE(review): presumably
  // this macro expands to nothing unless MACE_ENABLE_BFLOAT16 is defined
  // (the build option added in CMakeLists) -- confirm the macro definition.
  MACE_REGISTER_BF16_DELEGATOR(
      registry, Gemv<BFloat16>, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
......@@ -107,8 +107,11 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, REF))
// Registers the reference quantized (uint8) Eltwise delegator.
void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Eltwise, delegator::EltwiseParam,
      MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::REF));
}
} // namespace q8
} // namespace ref
......
......@@ -12,20 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_GEMV_H_
#define MACE_OPS_REF_GEMV_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/quantize.h"
#include "mace/ops/delegator/gemv.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
namespace q8 {
template<typename OUTPUT_TYPE>
template<typename T>
class Gemv : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
......@@ -45,7 +40,7 @@ class Gemv : public delegator::Gemv {
};
template<>
class Gemv<float> : public delegator::Gemv {
class Gemv<uint8_t> : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
......@@ -63,9 +58,8 @@ class Gemv<float> : public delegator::Gemv {
Tensor *output) override;
};
#if defined(MACE_ENABLE_QUANTIZE)
template<>
class Gemv<uint8_t> : public delegator::Gemv {
class Gemv<int32_t> : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
......@@ -83,14 +77,7 @@ class Gemv<uint8_t> : public delegator::Gemv {
Tensor *output) override;
};
template<>
class Gemv<int32_t> : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
// Reference quantized GEMV with requantized uint8 output:
//   output[b,h] = saturate(round((sum_w (lhs - lhs_zero) * (rhs - rhs_zero)
//                                 + bias[h]) * s_lhs * s_rhs / s_out))
// lhs is (batch?, lhs_height, lhs_width) row-major, rhs is (batch?, lhs_width).
// When lhs_batched/rhs_batched is false, batch 0 of that operand is reused
// for every output batch. bias (int32, length lhs_height) is optional.
//
// Fix: removed stray diff-residue lines ("Tensor *output) override;", "};",
// "#endif // MACE_ENABLE_QUANTIZE") that had been spliced between the
// signature and the body, breaking the function definition.
MaceStatus Gemv<uint8_t>::Compute(const OpContext *context,
                                  const Tensor *lhs,
                                  const Tensor *rhs,
                                  const Tensor *bias,
                                  const index_t batch,
                                  const index_t lhs_height,
                                  const index_t lhs_width,
                                  const bool lhs_batched,
                                  const bool rhs_batched,
                                  Tensor *output) {
  MACE_UNUSED(context);
  // Map tensor buffers for CPU access for the duration of this call.
  Tensor::MappingGuard lhs_guard(lhs);
  Tensor::MappingGuard rhs_guard(rhs);
  Tensor::MappingGuard bias_guard(bias);
  Tensor::MappingGuard output_guard(output);
  const uint8_t *lhs_data = lhs->data<uint8_t>();
  const uint8_t *rhs_data = rhs->data<uint8_t>();
  const int32_t *bias_data = nullptr;
  if (bias) {
    bias_data = bias->data<int32_t>();
  }
  uint8_t *output_data = output->mutable_data<uint8_t>();
  MACE_CHECK(output->scale() > 0, "output scale must not be zero");
  // Combined requantization factor mapping the int32 accumulator back to
  // the output's quantized domain.
  const float
      output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
  int32_t lhs_zero = lhs->zero_point();
  int32_t rhs_zero = rhs->zero_point();
  for (index_t b = 0; b < batch; ++b) {
    for (index_t h = 0; h < lhs_height; ++h) {
      int32_t sum = bias ? bias_data[h] : 0;
      for (index_t w = 0; w < lhs_width; ++w) {
        // Subtract zero points before multiplying (gemmlowp-style offset
        // arithmetic) so the accumulator is in real-value units.
        sum += (lhs_data[
            static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
                + h * lhs_width + w] - lhs_zero)
            * (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
                - rhs_zero);
      }  // w
      // Round-to-nearest then clamp into [0, 255].
      output_data[b * lhs_height + h] =
          Saturate<uint8_t>(std::roundf(sum * output_multiplier_float));
    }  // h
  }  // b
  return MaceStatus::MACE_SUCCESS;
}
// Reference quantized GEMV producing raw int32 accumulators (no
// requantization step):
//   output[b,h] = bias[h] + sum_w (lhs[b,h,w] - lhs_zero) * (rhs[b,w] - rhs_zero)
// lhs is (batch?, lhs_height, lhs_width) row-major, rhs is (batch?, lhs_width).
// When lhs_batched/rhs_batched is false, batch 0 of that operand is reused
// for every output batch. bias (int32, length lhs_height) is optional.
MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
                                  const Tensor *lhs,
                                  const Tensor *rhs,
                                  const Tensor *bias,
                                  const index_t batch,
                                  const index_t lhs_height,
                                  const index_t lhs_width,
                                  const bool lhs_batched,
                                  const bool rhs_batched,
                                  Tensor *output) {
  MACE_UNUSED(context);
  // Keep all tensor buffers mapped while we compute.
  Tensor::MappingGuard lhs_guard(lhs);
  Tensor::MappingGuard rhs_guard(rhs);
  Tensor::MappingGuard bias_guard(bias);
  Tensor::MappingGuard output_guard(output);

  const uint8_t *lhs_base = lhs->data<uint8_t>();
  const uint8_t *rhs_base = rhs->data<uint8_t>();
  const int32_t *bias_data = bias ? bias->data<int32_t>() : nullptr;
  int32_t *out = output->mutable_data<int32_t>();

  const int32_t lhs_zero = lhs->zero_point();
  const int32_t rhs_zero = rhs->zero_point();

  for (index_t b = 0; b < batch; ++b) {
    // A non-batched operand contributes a zero batch stride, i.e. batch 0
    // is reused for every b.
    const uint8_t *lhs_batch =
        lhs_base
            + static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width;
    const uint8_t *rhs_batch =
        rhs_base + static_cast<index_t>(rhs_batched) * b * lhs_width;
    for (index_t h = 0; h < lhs_height; ++h) {
      const uint8_t *row = lhs_batch + h * lhs_width;
      int32_t acc = bias_data ? bias_data[h] : 0;
      for (index_t w = 0; w < lhs_width; ++w) {
        // Zero-point-offset multiply-accumulate in int32.
        acc += (row[w] - lhs_zero) * (rhs_batch[w] - rhs_zero);
      }
      out[b * lhs_height + h] = acc;
    }
  }
  return MaceStatus::MACE_SUCCESS;
}
// Registers the reference quantized Gemv delegators: one producing
// requantized uint8 output and one producing raw int32 accumulators.
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemv<uint8_t>, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::REF));
  MACE_REGISTER_DELEGATOR(
      registry, Gemv<int32_t>, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::REF));
}
} // namespace q8
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_GEMV_H_
......@@ -20,19 +20,18 @@ namespace ops {
namespace ref {
extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dRefDelegator(
OpDelegatorRegistry *registry);
extern void RegisterGemmRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_QUANTIZE
namespace q8 {
extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
} // namespace q8
extern void RegisterGemvUint8RefDelegator(OpDelegatorRegistry *registry);
#endif // MACE_ENABLE_QUANTIZE
} // namespace ref
......@@ -43,43 +42,26 @@ extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x7S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x1S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x15S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK15x1S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dK3x3S1Delegator(
extern void RegisterDepthwiseConv2dK3x3Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dK3x3S2Delegator(
extern void RegisterDepthwiseDeconv2dK3x3Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK3x3S1Delegator(
extern void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK3x3S2Delegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4S1Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4S2Delegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dGeneralDelegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dGeneralDelegator(
......@@ -92,8 +74,7 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_QUANTIZE
namespace q8 {
extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvUint8Delegator(OpDelegatorRegistry *registry);
extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
} // namespace q8
#endif // MACE_ENABLE_QUANTIZE
......@@ -103,16 +84,16 @@ extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry);
void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
ref::RegisterActivationDelegator(registry);
ref::RegisterBiasAddDelegator(registry);
ref::RegisterConv2dRefDelegator(registry);
ref::RegisterDeconv2dRefDelegator(registry);
ref::RegisterDepthwiseConv2dRefDelegator(registry);
ref::RegisterDepthwiseDeconv2dRefDelegator(registry);
ref::RegisterGemmRefDelegator(registry);
ref::RegisterGemvRefDelegator(registry);
ref::RegisterConv2dDelegator(registry);
ref::RegisterDeconv2dDelegator(registry);
ref::RegisterDepthwiseConv2dDelegator(registry);
ref::RegisterDepthwiseDeconv2dDelegator(registry);
ref::RegisterGemmDelegator(registry);
ref::RegisterGemvDelegator(registry);
#ifdef MACE_ENABLE_QUANTIZE
ref::q8::RegisterEltwiseDelegator(registry);
ref::RegisterGemvUint8RefDelegator(registry);
ref::q8::RegisterGemvDelegator(registry);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_NEON
......@@ -120,37 +101,23 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
arm::fp32::RegisterBiasAddDelegator(registry);
arm::fp32::RegisterConv2dK1x1Delegator(registry);
arm::fp32::RegisterConv2dK1x7S1Delegator(registry);
arm::fp32::RegisterConv2dK7x1S1Delegator(registry);
arm::fp32::RegisterConv2dK1x15S1Delegator(registry);
arm::fp32::RegisterConv2dK15x1S1Delegator(registry);
arm::fp32::RegisterConv2dK3x3S1Delegator(registry);
arm::fp32::RegisterConv2dK3x3S2Delegator(registry);
arm::fp32::RegisterConv2dK1xNDelegator(registry);
arm::fp32::RegisterConv2dK3x3Delegator(registry);
arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
arm::fp32::RegisterConv2dK5x5S1Delegator(registry);
arm::fp32::RegisterConv2dK7x7S1Delegator(registry);
arm::fp32::RegisterConv2dK7x7S2Delegator(registry);
arm::fp32::RegisterConv2dK7x7S3Delegator(registry);
arm::fp32::RegisterConv2dK5x5Delegator(registry);
arm::fp32::RegisterConv2dK7x7Delegator(registry);
arm::fp32::RegisterConv2dGeneralDelegator(registry);
arm::fp32::RegisterDeconv2dK2x2S1Delegator(registry);
arm::fp32::RegisterDeconv2dK2x2S2Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3S1Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3S1Delegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3S1Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3S1Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
......@@ -159,8 +126,7 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
#ifdef MACE_ENABLE_QUANTIZE
arm::q8::RegisterEltwiseDelegator(registry);
arm::q8::RegisterGemvUint8Delegator(registry);
arm::q8::RegisterGemvInt32Delegator(registry);
arm::q8::RegisterGemvDelegator(registry);
#endif // MACE_ENABLE_QUANTIZE
#endif // MACE_ENABLE_NEON
......
......@@ -98,6 +98,8 @@ class ReplaceIndexOp<DeviceType::CPU, T> : public Operation {
// Registers the "ReplaceIndex" op for CPU: the float variant always, and the
// BFloat16 variant via the BF16 registration macro. NOTE(review): presumably
// MACE_REGISTER_BF16_OP is a no-op unless MACE_ENABLE_BFLOAT16 is defined
// (the build option added in CMakeLists) -- confirm the macro definition.
void RegisterReplaceIndex(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
                   DeviceType::CPU, float);
  MACE_REGISTER_BF16_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
                        DeviceType::CPU);
}
} // namespace ops
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册