Commit 950028c1 authored by luxuhui

feature: support bf16

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
Parent 27757d88
......@@ -10,6 +10,7 @@ option(MACE_ENABLE_CUDA "whether to enable CUDA support" OFF)
option(MACE_ENABLE_HEXAGON_DSP "whether to enable Hexagon DSP support" OFF)
option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support" OFF)
option(MACE_ENABLE_MTK_APU "whether to enable MTK APU support" OFF)
option(MACE_ENABLE_BFLOAT16 "whether to enable bfloat16 support" OFF)
option(MACE_ENABLE_TESTS "whether to build c++ unit tests" OFF)
option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF)
option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON)
......@@ -116,6 +117,10 @@ if(MACE_ENABLE_MTK_APU)
add_definitions(-DMACE_ENABLE_MTK_APU)
endif(MACE_ENABLE_MTK_APU)
if(MACE_ENABLE_BFLOAT16)
add_definitions(-DMACE_ENABLE_BFLOAT16)
endif(MACE_ENABLE_BFLOAT16)
if(MACE_ENABLE_OBFUSCATE)
add_definitions(-DMACE_OBFUSCATE_LITERALS)
endif(MACE_ENABLE_OBFUSCATE)
......
......@@ -85,7 +85,7 @@ in one deployment file.
* - runtime
- The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains both CPU and GPU model definitions so you can run the model on both CPU and GPU.
* - data_type
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
- [optional] The data type used for the specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU. The default is fp16_fp32.
* - input_data_types
- [optional] The input data type for specific ops (e.g. gather), which can be [int32, float32]; defaults to float32.
* - input_data_formats
......@@ -582,9 +582,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradation
Therefore, the default storage type for a regular model in MACE is half. However,
if the model is very sensitive to accuracy, the storage type can be changed to float.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``;
for CPU it can also be changed to ``bf16_fp32``.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float, while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float.
For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float.
......
......@@ -63,7 +63,7 @@ There are many advanced options supported.
* - runtime
- The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains both CPU and GPU model definitions so you can run the model on both CPU and GPU.
* - data_type
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
- [optional] The data type used for the specified runtime. [fp16_fp32, fp32_fp32] for GPU; [fp16_fp32, bf16_fp32, fp32_fp32] for CPU. The default is fp16_fp32.
* - input_data_types
- [optional] The input data type for specific ops (e.g. gather), which can be [int32, float32]; defaults to float32.
* - input_data_formats
......@@ -438,9 +438,10 @@ half (16bit) can be used to reduce it by half with negligible accuracy degradation
Therefore, the default storage type for a regular model in MACE is half. However,
if the model is very sensitive to accuracy, the storage type can be changed to float.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``.
In the deployment file, ``data_type`` is ``fp16_fp32`` by default and can be changed to ``fp32_fp32``;
for CPU it can also be changed to ``bf16_fp32``.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float.
For CPU, ``fp16_fp32`` means that the weights are saved in half and actual inference is in float, while ``bf16_fp32`` means that the weights are saved in bfloat16 and actual inference is in float.
For GPU, ``fp16_fp32`` means that the ops in GPU take half as inputs and outputs while kernel execution in float.
......
......@@ -43,12 +43,13 @@ MACE requires the following dependencies:
- Version and notes
* - Android NDK
- `NDK installation guide <https://developer.android.com/ndk/guides/setup#install>`__
- Required by Android build, r15b, r15c, r16b, r17b
- Required for Android builds; bazel users can use NDK r15b or later, cmake users r17b or later
* - CMake
- apt-get install cmake
- >= 3.11.3
* - ADB
- Linux:``apt-get install android-tools-adb`` Mac:``brew cask install android-platform-tools``
- | Linux:``apt-get install android-tools-adb``
| Mac:``brew cask install android-platform-tools``
- Required for running on Android, >= 1.0.32
* - TensorFlow
- pip install tensorflow==1.8.0
......
......@@ -132,6 +132,14 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "bfloat16_enabled",
define_values = {
"bfloat16": "true",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "rpcmem_enabled",
define_values = {
......
......@@ -9,6 +9,7 @@ load(
"if_android",
"if_android_armv7",
"if_apu_enabled",
"if_bfloat16_enabled",
"if_hexagon_enabled",
"if_hexagon_or_hta_enabled",
"if_hta_enabled",
......@@ -87,6 +88,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_BFLOAT16_H_
#define MACE_CORE_BFLOAT16_H_
#ifdef MACE_ENABLE_BFLOAT16
#include <algorithm>
#include <cmath>
#include <sstream>
namespace mace {
// Type-punning helper: reinterprets a float's bits as uint32_t and back.
union Sphinx {
uint32_t i;
float f;
Sphinx(uint32_t value) : i(value) {}
Sphinx(float value) : f(value) {}
};
class BFloat16 {
public:
BFloat16() : data_(0) {}
// we need implicit conversions, so the `explicit` keyword is not used
BFloat16(float value) : data_(Sphinx(value).i >> 16) {} // NOLINT
operator float() const {
return Sphinx(static_cast<uint32_t>(data_ << 16)).f;
}
operator double() const {
return static_cast<double>(
Sphinx(static_cast<uint32_t>(data_ << 16)).f);
}
operator int() const {
return static_cast<int>(Sphinx(static_cast<uint32_t>(data_ << 16)).f);
}
template<typename T>
void operator=(T value) {
data_ = Sphinx(static_cast<float>(value)).i >> 16;
}
BFloat16 operator-() const {
return BFloat16(-(Sphinx(static_cast<uint32_t>(data_ << 16)).f));
}
template<typename T>
BFloat16 operator+(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f + static_cast<float>(value));
}
template<typename T>
BFloat16 operator-(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f - static_cast<float>(value));
}
template<typename T>
BFloat16 operator*(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f * static_cast<float>(value));
}
template<typename T>
BFloat16 operator/(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f / static_cast<float>(value));
}
template<typename T>
bool operator>(T value) const {
return Sphinx(
static_cast<uint32_t>(data_ << 16)).f > static_cast<float>(value);
}
template<typename T>
bool operator>=(T value) const {
return Sphinx(
static_cast<uint32_t>(data_ << 16)).f >= static_cast<float>(value);
}
template<typename T>
bool operator<(T value) const {
return Sphinx(
static_cast<uint32_t>(data_ << 16)).f < static_cast<float>(value);
}
template<typename T>
bool operator<=(T value) const {
return Sphinx(
static_cast<uint32_t>(data_ << 16)).f <= static_cast<float>(value);
}
template<typename T>
bool operator==(T value) const {
return Sphinx(
static_cast<uint32_t>(data_ << 16)).f == static_cast<float>(value);
}
template<typename T>
void operator+=(T value) {
data_ = Sphinx(Sphinx(static_cast<uint32_t>(data_ << 16)).f +
static_cast<float>(value)).i >> 16;
}
template<typename T>
void operator/=(T value) {
data_ = Sphinx(Sphinx(static_cast<uint32_t>(data_ << 16)).f /
static_cast<float>(value)).i >> 16;
}
template<typename T>
void operator-=(T value) {
data_ = Sphinx(Sphinx(static_cast<uint32_t>(data_ << 16)).f -
static_cast<float>(value)).i >> 16;
}
template<typename T>
void operator*=(T value) {
data_ = Sphinx(Sphinx(static_cast<uint32_t>(data_ << 16)).f *
static_cast<float>(value)).i >> 16;
}
private:
uint16_t data_;
};
template<>
inline bool BFloat16::operator==(const BFloat16 &value) const {
return data_ == value.data_;
}
template<>
inline void BFloat16::operator=(const BFloat16 &value) {
data_ = value.data_;
}
} // namespace mace
namespace std {
inline float fabs(const mace::BFloat16 &value) {
return fabs(static_cast<float>(value));
}
inline float abs(const mace::BFloat16 &value) {
return abs(static_cast<float>(value));
}
inline float sqrt(const mace::BFloat16 &value) {
return sqrt(static_cast<float>(value));
}
inline float log(const mace::BFloat16 &value) {
return log(static_cast<float>(value));
}
inline float tanh(const mace::BFloat16 &value) {
return tanh(static_cast<float>(value));
}
inline float exp(const mace::BFloat16 &value) {
return exp(static_cast<float>(value));
}
inline int ceil(const mace::BFloat16 &value) {
return ceil(static_cast<float>(value));
}
inline int floor(const mace::BFloat16 &value) {
return floor(static_cast<float>(value));
}
inline float max(const mace::BFloat16 &a, const float &b) {
return max(static_cast<float>(a), b);
}
inline float max(const float &a, const mace::BFloat16 &b) {
return max(a, static_cast<float>(b));
}
inline float min(const mace::BFloat16 &a, const float &b) {
return min(static_cast<float>(a), b);
}
inline float min(const float &a, const mace::BFloat16 &b) {
return min(a, static_cast<float>(b));
}
inline float pow(const mace::BFloat16 &a, const mace::BFloat16 &b) {
return pow(static_cast<float>(a), static_cast<float>(b));
}
inline float pow(const mace::BFloat16 &a, const float &b) {
return pow(static_cast<float>(a), b);
}
inline float pow(const float &a, const mace::BFloat16 &b) {
return pow(a, static_cast<float>(b));
}
inline ostream &operator<<(ostream &ss, // NOLINT
const mace::BFloat16 &value) {
return ss << static_cast<float>(value);
}
} // namespace std
inline float operator+(const float &a, const mace::BFloat16 &value) {
return a + static_cast<float>(value);
}
inline float operator-(const float &a, const mace::BFloat16 &value) {
return a - static_cast<float>(value);
}
inline float operator*(const float &a, const mace::BFloat16 &value) {
return a * static_cast<float>(value);
}
inline float operator/(const float &a, const mace::BFloat16 &value) {
return a / static_cast<float>(value);
}
inline void operator+=(float &a, const mace::BFloat16 &value) { // NOLINT
a += static_cast<float>(value);
}
inline void operator-=(float &a, const mace::BFloat16 &value) { // NOLINT
a -= static_cast<float>(value);
}
inline void operator*=(float &a, const mace::BFloat16 &value) { // NOLINT
a *= static_cast<float>(value);
}
inline void operator/=(float &a, const mace::BFloat16 &value) { // NOLINT
a /= static_cast<float>(value);
}
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_CORE_BFLOAT16_H_
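A minimal usage sketch of the class above (not part of the commit), assuming the translation unit defines MACE_ENABLE_BFLOAT16 before including the header. A BFloat16 keeps only the top 16 bits of the source float (1 sign, 8 exponent and 7 mantissa bits), so the conversion truncates rather than rounds:

#define MACE_ENABLE_BFLOAT16
#include <cstdio>
#include "mace/core/bfloat16.h"

int main() {
  mace::BFloat16 x = 1.7f;      // keeps the high 16 bits of 1.7f
  float back = x;               // implicit widening; low 16 bits become zero
  mace::BFloat16 y = x * 2.0f;  // arithmetic round-trips through float
  std::printf("%f %f\n", back, static_cast<float>(y));
  // prints approximately 1.695312 3.390625
  return 0;
}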
......@@ -28,9 +28,9 @@ enum ImplType {
};
#ifdef MACE_ENABLE_NEON
#define MACE_CPU_IMPL_TYPE NEON
const ImplType kCpuImplType = ImplType::NEON;
#else
#define MACE_CPU_IMPL_TYPE REF
const ImplType kCpuImplType = ImplType::REF;
#endif
struct DelegatorParam {
......
......@@ -15,25 +15,86 @@
#include "mace/core/registry/op_delegator_registry.h"
#include <utility>
#include <sstream>
#include "mace/utils/logging.h"
namespace mace {
MaceStatus OpDelegatorRegistry::Register(const std::string &key,
namespace {
const char *kDefaultTag = "general";
}
DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type,
DeviceType in_device, ImplType in_impl_type,
const char *in_tag)
: delegator_name(in_name), data_type(in_data_type),
device(in_device), impl_type(in_impl_type), tag(in_tag) {}
DelegatorInfo::DelegatorInfo(const char *in_name, DataType in_data_type,
DeviceType in_device, ImplType in_impl_type)
: DelegatorInfo(in_name, in_data_type,
in_device, in_impl_type, kDefaultTag) {}
std::string DelegatorInfo::ToString() const {
std::stringstream ss;
ss << delegator_name << "_" << data_type << "_"
<< device << "_" << impl_type << "_" << tag;
return ss.str();
}
bool DelegatorInfo::operator==(const DelegatorInfo &info) const {
return device == info.device && impl_type == info.impl_type &&
data_type == info.data_type &&
delegator_name == info.delegator_name && tag == info.tag;
}
MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
DelegatorCreator creator) {
MACE_CHECK(registry_.count(key) == 0, "Register an exist key.");
MACE_CHECK(registry_.count(key) == 0,
"Registering an existing key: ", key.ToString());
registry_[key] = std::move(creator);
return MaceStatus::MACE_SUCCESS;
}
DelegatorCreator OpDelegatorRegistry::GetCreator(const std::string &key) const {
MACE_CHECK(registry_.count(key) > 0, key, " not exist.");
return registry_.at(key);
}
DelegatorCreator OpDelegatorRegistry::GetCreator(
const DelegatorInfo &key) const {
if (registry_.count(key) > 0) {
return registry_.at(key);
}
DelegatorInfo info = key;
if (key.impl_type == ImplType::NEON) {
if (info.tag != kDefaultTag) {
info.tag = kDefaultTag;
if (registry_.count(info) > 0) {
VLOG(1) << key.ToString()
<< " delegator fall back to " << info.ToString();
return registry_.at(info);
}
info.tag = key.tag;
}
template<> const char *DType<float>::name_ = "float";
template<> const char *DType<int>::name_ = "int";
template<> const char *DType<uint8_t>::name_ = "uint8_t";
info.impl_type = ImplType::REF;
if (registry_.count(info) > 0) {
VLOG(1) << key.ToString()
<< " delegator fall back to " << info.ToString();
return registry_.at(info);
}
}
// for REF
if (info.tag != kDefaultTag) {
info.tag = kDefaultTag;
if (registry_.count(info) > 0) {
VLOG(1) << key.ToString()
<< " delegator fall back to " << info.ToString();
return registry_.at(info);
}
}
LOG(FATAL) << "Delegator not exist: " << key.ToString();
return DelegatorCreator();
}
} // namespace mace
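A hypothetical lookup against the fallback chain implemented above (the registry pointer and key values are illustrative, not from this commit). GetCreator tries the exact key, then the same impl type with the default "general" tag, then ImplType::REF with the original tag, then REF with "general", and only hits LOG(FATAL) if none of them was registered:

// Sketch: request a NEON bf16 delegator that was only registered for REF.
mace::DelegatorInfo wanted("Gemm", mace::DT_BFLOAT16, mace::DeviceType::CPU,
                           mace::ImplType::NEON, "K3x3S1");
// Lookup order: (NEON, K3x3S1) -> (NEON, general)
//            -> (REF, K3x3S1)  -> (REF, general)
// `registry` is an OpDelegatorRegistry* obtained elsewhere, e.g. from the
// workspace.
mace::DelegatorCreator creator = registry->GetCreator(wanted);
std::unique_ptr<mace::OpDelegator> delegator =
    creator(mace::DelegatorParam());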
......@@ -21,7 +21,9 @@
#include <unordered_map>
#include <vector>
#include "mace/core/bfloat16.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
......@@ -29,40 +31,50 @@ namespace mace {
typedef std::function<std::unique_ptr<OpDelegator>(const DelegatorParam &)>
DelegatorCreator;
struct DelegatorInfo {
explicit DelegatorInfo(const char *delegator_name,
DataType data_type,
DeviceType device,
ImplType impl_type,
const char *tag);
explicit DelegatorInfo(const char *delegator_name,
DataType data_type,
DeviceType device,
ImplType impl_type);
std::string ToString() const;
bool operator==(const DelegatorInfo &info) const;
std::string delegator_name;
DataType data_type;
DeviceType device;
ImplType impl_type;
std::string tag;
};
class OpDelegatorRegistry {
public:
OpDelegatorRegistry() = default;
~OpDelegatorRegistry() = default;
MaceStatus Register(const std::string &key, DelegatorCreator creator);
DelegatorCreator GetCreator(const std::string &key) const;
MaceStatus Register(const DelegatorInfo &key, DelegatorCreator creator);
DelegatorCreator GetCreator(const DelegatorInfo &key) const;
private:
std::unordered_map<std::string, DelegatorCreator> registry_;
struct HashName {
size_t operator()(const DelegatorInfo &delegator_info) const {
return std::hash<std::string>()(delegator_info.ToString());
}
};
std::unordered_map<DelegatorInfo, DelegatorCreator, HashName> registry_;
};
template<typename T>
struct DType { static const char *name_; };
template<> const char *DType<float>::name_;
template<> const char *DType<int>::name_;
template<> const char *DType<uint8_t>::name_;
} // namespace mace
#ifndef MACE_DELEGATOR_KEY_TMP
#define MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl) \
(std::string(#delegator_name"_"#device"_"#impl"_") + DType<DT>::name_)
#endif // MACE_DELEGATOR_KEY_TMP
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
MACE_DELEGATOR_KEY_TMP(delegator_name, device, DT, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_DELEGATOR_KEY_EX_TMP
#define MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag) \
(std::string(#delegator_name"_"#device"_"#impl"_"#tag"_") + DType<DT>::name_)
DelegatorInfo(#delegator_name, DataTypeToEnum<DT>::value, device, impl, #tag)
#endif // MACE_DELEGATOR_KEY_EX_TMP
#ifndef MACE_DELEGATOR_KEY_EX
......@@ -70,21 +82,32 @@ template<> const char *DType<uint8_t>::name_;
MACE_DELEGATOR_KEY_EX_TMP(delegator_name, device, DT, impl, tag)
#endif // MACE_DELEGATOR_KEY_EX
#ifndef MACE_DELEGATOR_KEY
#define MACE_DELEGATOR_KEY(delegator_name, device, DT, impl) \
DelegatorInfo(#delegator_name, DataTypeToEnum<DT>::value, device, impl)
#endif // MACE_DELEGATOR_KEY
#ifndef MACE_REGISTER_DELEGATOR
#define MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key) \
void Register##class_name##Delegator(OpDelegatorRegistry *registry) { \
registry->Register( \
key, OpDelegator::DefaultCreator<class_name, param_name>); \
}
registry->Register(key, OpDelegator::DefaultCreator<class_name, param_name>)
#endif // MACE_REGISTER_DELEGATOR
#ifndef MACE_REGISTER_BF16_DELEGATOR
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key) \
MACE_REGISTER_DELEGATOR(registry, class_name, param_name, key)
#else
#define MACE_REGISTER_BF16_DELEGATOR(registry, class_name, param_name, key)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_DELEGATOR
#ifndef MACE_DEFINE_DELEGATOR_CREATOR
#define MACE_DEFINE_DELEGATOR_CREATOR(class_name) \
static std::unique_ptr<class_name> Create( \
Workspace *workspace, const std::string &tag, \
Workspace *workspace, const DelegatorInfo &key, \
const DelegatorParam &param) { \
DelegatorCreator creator = \
workspace->GetDelegatorRegistry()->GetCreator(tag); \
workspace->GetDelegatorRegistry()->GetCreator(key); \
std::unique_ptr<OpDelegator> delegator = creator(param); \
return std::unique_ptr<class_name>( \
static_cast<class_name *>(delegator.release())); \
......
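A sketch (not from the commit) of how the registration macros above combine: when MACE_ENABLE_BFLOAT16 is defined, MACE_REGISTER_BF16_DELEGATOR expands to the same registry->Register(...) call as MACE_REGISTER_DELEGATOR; otherwise it expands to nothing, so registration functions can list the bf16 line unconditionally. The class and key names below are illustrative:

namespace mace {
namespace ops {
void RegisterActivationDelegators(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Activation, delegator::ActivationParam,
      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
  // Compiled in only when bf16 support is enabled at build time.
  MACE_REGISTER_BF16_DELEGATOR(
      registry, Activation, delegator::ActivationParam,
      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16,
                         ImplType::REF));
}
}  // namespace ops
}  // namespace mace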
......@@ -22,6 +22,8 @@
#include <unordered_map>
#include <vector>
#include "mace/core/bfloat16.h"
#include "mace/core/types.h"
#include "mace/core/ops/operator.h"
#include "mace/core/ops/op_condition_builder.h"
#include "mace/core/ops/op_condition_context.h"
......@@ -80,6 +82,26 @@ class OpRegistry {
DataTypeToEnum<dt>::value, \
OpRegistry::DefaultCreator<class_name>)
#ifndef MACE_REGISTER_BF16_OP
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device) \
MACE_REGISTER_OP(op_registry, op_type, class_name, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP(op_registry, op_type, class_name, device)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_OP
#ifndef MACE_REGISTER_BF16_OP_BY_CLASS
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, \
class_name, device) \
MACE_REGISTER_OP_BY_CLASS(op_registry, op_type, \
class_name, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, op_type, class_name, device)
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_REGISTER_BF16_OP_BY_CLASS
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
......
......@@ -53,6 +53,13 @@ namespace mace {
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
#endif
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \
MACE_CASE(BFloat16, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS)
#endif // MACE_ENABLE_BFLOAT16
#if MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
......@@ -67,6 +74,7 @@ namespace mace {
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_BFLOAT16(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
case DT_INVALID: \
INVALID_STATEMENTS; \
......@@ -419,7 +427,8 @@ class Tensor {
if (i != 0 && i % shape_.back() == 0) {
os << "\n";
}
MACE_RUN_WITH_TYPE_ENUM(dtype_, (os << (this->data<T>()[i]) << ", "));
MACE_RUN_WITH_TYPE_ENUM(
dtype_, (os << this->data<T>()[i] << ", "));
}
LOG(INFO) << os.str();
}
......
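A hypothetical helper (not from the commit) showing what the extended type switch buys: inside the macro, each MACE_CASE binds T to the concrete element type, now including BFloat16, so one body is instantiated per runtime dtype:

#include "mace/core/tensor.h"

namespace mace {
// Sum a CPU tensor of any enabled dtype by dispatching on its runtime type.
inline float SumTensor(const Tensor &tensor) {
  float acc = 0.f;
  MACE_RUN_WITH_TYPE_ENUM(tensor.dtype(), {
    const T *ptr = tensor.data<T>();
    for (index_t i = 0; i < tensor.size(); ++i) {
      acc += static_cast<float>(ptr[i]);
    }
  });
  return acc;
}
}  // namespace mace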
......@@ -25,6 +25,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
case DT_FLOAT:
case DT_UINT8:
case DT_INT32:
case DT_BFLOAT16:
return true;
default:
return false;
......@@ -36,7 +37,8 @@ std::string DataTypeToString(const DataType dt) {
{DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"},
{DT_UINT8, "DT_UINT8"},
{DT_INT32, "DT_INT32"}};
{DT_INT32, "DT_INT32"},
{DT_BFLOAT16, "DT_BFLOAT16"}};
MACE_CHECK(dt != DT_INVALID, "Invalid data type is not supported");
return dtype_string_map[dt];
}
......@@ -50,6 +52,10 @@ size_t GetEnumTypeSize(const DataType dt) {
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
case DT_FLOAT16:
return sizeof(float16_t);
#endif
#ifdef MACE_ENABLE_BFLOAT16
case DT_BFLOAT16:
return sizeof(BFloat16);
#endif
case DT_UINT8:
return sizeof(uint8_t);
......
......@@ -21,6 +21,7 @@
#include <arm_neon.h>
#endif
#include "mace/core/bfloat16.h"
#include "mace/proto/mace.pb.h"
#include "include/half.hpp"
......@@ -57,6 +58,9 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16);
#endif
#ifdef MACE_ENABLE_BFLOAT16
MACE_MAPPING_DATA_TYPE_AND_ENUM(BFloat16, DT_BFLOAT16);
#endif
MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
......
......@@ -12,6 +12,7 @@ load(
"if_android",
"if_android_armv7",
"if_apu_enabled",
"if_bfloat16_enabled",
"if_darwin",
"if_hexagon_enabled",
"if_hta_enabled",
......@@ -42,6 +43,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
......@@ -52,8 +55,8 @@ cc_library(
"-DMACE_ENABLE_RPCMEM",
]),
deps = [
"//mace/ops",
"//include:public_headers",
"//mace/ops",
],
alwayslink = 1,
)
......
......@@ -16,9 +16,11 @@
#include <numeric>
#include <memory>
#include "mace/core/bfloat16.h"
#include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/core/net_def_adapter.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/transpose.h"
......@@ -29,7 +31,6 @@
#include "mace/public/mace.h"
#include "mace/port/env.h"
#include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
......@@ -460,6 +461,7 @@ class MaceEngine::Impl {
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_;
bool is_quantized_model_;
DataType net_data_type_;
std::map<std::string, mace::InputOutputInfo> input_info_map_;
std::map<std::string, mace::InputOutputInfo> output_info_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_;
......@@ -565,6 +567,7 @@ MaceStatus MaceEngine::Impl::Init(
#endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
net_data_type_ = net_def->data_type();
// Get input and output information.
for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info;
......@@ -589,8 +592,8 @@ MaceStatus MaceEngine::Impl::Init(
}
input_tensor->Resize(shape);
// Set to the default data format
input_tensor->set_data_format(static_cast<DataFormat>(
input_info_map_[input_name].data_format()));
input_tensor->set_data_format(
static_cast<DataFormat>(input_info_map_[input_name].data_format()));
}
for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) {
......@@ -691,7 +694,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile(
model_data_file.c_str(), &model_data_));
MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes,
MACE_RETURN_IF_ERROR(Init(
net_def, input_nodes, output_nodes,
reinterpret_cast<const unsigned char *>(model_data_->data())));
if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON ||
......@@ -753,11 +757,24 @@ MaceStatus MaceEngine::Impl::TransposeInput(
Tensor::MappingGuard input_guard(input_tensor);
if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>();
return ops::Transpose(thread_pool_.get(),
input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data);
if (net_data_type_ == DT_FLOAT || net_data_type_ == DataType::DT_HALF) {
return ops::Transpose(thread_pool_.get(),
input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data);
#ifdef MACE_ENABLE_BFLOAT16
} else if (net_data_type_ == DT_BFLOAT16) {
auto *input_data = input_tensor->mutable_data<BFloat16>();
return ops::Transpose(thread_pool_.get(),
input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data);
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "Invalid net data type: " << net_data_type_;
}
} else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>();
return ops::Transpose(thread_pool_.get(),
......@@ -776,9 +793,22 @@ MaceStatus MaceEngine::Impl::TransposeInput(
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
if (net_data_type_ == DataType::DT_FLOAT ||
net_data_type_ == DataType::DT_HALF) {
auto input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
#ifdef MACE_ENABLE_BFLOAT16
} else if (net_data_type_ == DataType::DT_BFLOAT16) {
auto input_data = input_tensor->mutable_data<BFloat16>();
const float *data = input.second.data().get();
for (index_t i = 0; i < input_tensor->size(); ++i) {
input_data[i] = data[i];
}
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "Invalid net data type: " << net_data_type_;
}
} else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>();
memcpy(input_data, input.second.data().get(),
......@@ -842,6 +872,15 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
output_tensor->shape(),
dst_dims,
output->second.data<int>().get());
#ifdef MACE_ENABLE_BFLOAT16
} else if (output_dt == DataType::DT_BFLOAT16) {
auto output_data = output_tensor->data<BFloat16>();
return ops::Transpose(thread_pool_.get(),
output_data,
output_tensor->shape(),
dst_dims,
output->second.data<float>().get());
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
return MaceStatus::MACE_INVALID_ARGS;
......@@ -864,6 +903,14 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
std::memcpy(output->second.data<int>().get(),
output_tensor->data<int>(),
output_size * sizeof(int));
#ifdef MACE_ENABLE_BFLOAT16
} else if (output_dt == DataType::DT_BFLOAT16) {
const auto *output_data = output_tensor->data<BFloat16>();
float *data = output->second.data<float>().get();
for (index_t i = 0; i < output_tensor->size(); ++i) {
data[i] = output_data[i];
}
#endif // MACE_ENABLE_BFLOAT16
} else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
}
......
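The element-wise loops above are deliberate: BFloat16 is a 2-byte type while the user-facing buffers hold 4-byte floats, so a memcpy would scramble the data and every element must be converted. A standalone sketch of the same narrowing/widening copies, with hypothetical names and assuming a build with MACE_ENABLE_BFLOAT16:

// Each assignment goes through the BFloat16 conversion operators
// declared in mace/core/bfloat16.h.
inline void NarrowToBF16(const float *src, mace::BFloat16 *dst,
                         mace::index_t n) {
  for (mace::index_t i = 0; i < n; ++i) dst[i] = src[i];  // float -> bf16
}
inline void WidenToFloat(const mace::BFloat16 *src, float *dst,
                         mace::index_t n) {
  for (mace::index_t i = 0; i < n; ++i) dst[i] = src[i];  // bf16 -> float
}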
......@@ -109,6 +109,12 @@ def if_quantize_enabled(a):
"//conditions:default": [],
})
def if_bfloat16_enabled(a):
return select({
"//mace:bfloat16_enabled": a,
"//conditions:default": [],
})
def if_rpcmem_enabled(a):
return select({
"//mace:rpcmem_enabled": a,
......
......@@ -10,6 +10,7 @@ load(
"//mace:mace.bzl",
"if_android",
"if_android_armv7",
"if_bfloat16_enabled",
"if_hexagon_enabled",
"if_neon_enabled",
"if_opencl_enabled",
......@@ -46,6 +47,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -85,6 +88,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -138,6 +143,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -223,6 +230,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......@@ -263,6 +272,8 @@ cc_library(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......
......@@ -19,7 +19,6 @@
#include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/ops/delegator/activation.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -43,11 +42,12 @@ class ActivationOp<DeviceType::CPU, T> : public Operation {
Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
activation_delegator_(delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, T, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType),
delegator::ActivationParam(
activation_type_,
Operation::GetOptionalArg<T>("max_limit", 0),
Operation::GetOptionalArg<T>("leakyrelu_coefficient", 0)))) {}
Operation::GetOptionalArg<float>("max_limit", 0.f),
Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.f)))) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -119,6 +119,8 @@ class ActivationOp<DeviceType::GPU, float> : public Operation {
void RegisterActivation(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Activation",
ActivationOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -33,8 +33,8 @@ namespace ops {
template<DeviceType D, class T>
class AddNOp;
template<>
class AddNOp<DeviceType::CPU, float> : public Operation {
template<class T>
class AddNOp<DeviceType::CPU, T> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {}
......@@ -46,12 +46,12 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
const index_t size = output->size();
Tensor::MappingGuard output_guard(output);
auto output_data = output->mutable_data<float>();
memset(output_data, 0, size * sizeof(float));
auto output_data = output->mutable_data<T>();
memset(output_data, 0, size * sizeof(T));
for (auto &input : inputs_) {
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<float>();
auto input_data = input->template data<T>();
for (index_t j = 0; j < size; ++j) {
output_data[j] += input_data[j];
......@@ -95,6 +95,7 @@ class AddNOp<DeviceType::GPU, float> : public Operation {
void RegisterAddN(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "AddN", AddNOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -74,11 +74,12 @@ class ArgMaxOp : public Operation {
if (argmin_) {
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T min_value = std::numeric_limits<T>::max();
// track the extremum in float: std::numeric_limits is not specialized
// for BFloat16, so numeric_limits<T> would give a wrong initial value
float min_value = std::numeric_limits<float>::max();
const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] < min_value) {
min_value = input_ptr[j];
float input_value = input_ptr[j];
if (input_value < min_value) {
min_value = input_value;
idx = j;
}
}
......@@ -87,11 +88,12 @@ class ArgMaxOp : public Operation {
} else {
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T max_value = std::numeric_limits<T>::lowest();
float max_value = std::numeric_limits<float>::lowest();
const T *input_ptr = input_data + i * inner_size;
for (index_t j = 0; j < inner_size; ++j) {
if (input_ptr[j] > max_value) {
max_value = input_ptr[j];
float input_value = input_ptr[j];
if (input_value > max_value) {
max_value = input_value;
idx = j;
}
}
......@@ -111,8 +113,8 @@ class ArgMaxOp : public Operation {
void RegisterArgMax(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ArgMax", ArgMaxOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -185,8 +185,11 @@ void Activation::DoActivation(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, CPU, float, NEON))
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -129,8 +129,11 @@ void BiasAdd::AddBias(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, NEON))
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -113,8 +113,12 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
output);
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x1))
void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x1));
}
} // namespace fp32
} // namespace arm
......
......@@ -861,18 +861,27 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K1x7S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x1S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K1x15S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
NEON, K15x1S1))
void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x1S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x15S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K15x1S1));
}
} // namespace fp32
} // namespace arm
......
......@@ -737,10 +737,16 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K3x3S2))
void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -801,9 +801,12 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
}, 0, batch, 1, 0, out_channels, 1);
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(
Conv2d, CPU, float, NEON, K3x3Winograd))
void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3Winograd));
}
} // namespace fp32
} // namespace arm
......
......@@ -258,8 +258,12 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK5x5S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K5x5S1))
void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK5x5S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K5x5S1));
}
} // namespace fp32
} // namespace arm
......
......@@ -722,12 +722,20 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S1))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S2))
MACE_REGISTER_DELEGATOR(registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, K7x7S3))
void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S2));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S3));
}
} // namespace fp32
} // namespace arm
......
......@@ -252,9 +252,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, NEON, General))
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -335,12 +335,16 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K2x2S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K2x2S2))
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -464,12 +464,16 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K3x3S2))
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -574,12 +574,16 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, K4x4S2))
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -124,9 +124,11 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Deconv2dGeneral, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
NEON, General))
void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dGeneral, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -512,12 +512,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, NEON, K3x3S2))
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -776,19 +776,27 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K3x3S2))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S1))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K3x3S2))
void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -959,19 +959,27 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, K4x4S2))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S1))
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, K4x4S2))
void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm
......
......@@ -207,13 +207,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, NEON, General))
void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON));
}
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float, NEON, General))
void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -1224,8 +1224,11 @@ MaceStatus Gemm::Compute(const OpContext *context,
output);
}
MACE_REGISTER_DELEGATOR(registry, Gemm, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, CPU, float, NEON))
void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemm, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -378,9 +378,11 @@ MaceStatus Gemv::Compute(const OpContext *context,
#undef vaddvq_f32
#endif
MACE_REGISTER_DELEGATOR(registry, Gemv, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, float, NEON))
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm
......
......@@ -162,8 +162,11 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, NEON))
void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::NEON));
}
} // namespace q8
} // namespace arm
......
......@@ -176,18 +176,14 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
template
class Gemv<uint8_t>;
template
class Gemv<int32_t>;
typedef Gemv<uint8_t> GemvUint8;
MACE_REGISTER_DELEGATOR(registry, GemvUint8, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, NEON))
typedef Gemv<int32_t> GemvInt32;
MACE_REGISTER_DELEGATOR(registry, GemvInt32, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, int32_t, NEON))
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv<uint8_t>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON));
MACE_REGISTER_DELEGATOR(
registry, Gemv<int32_t>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON));
}
} // namespace q8
} // namespace arm
......
......@@ -33,8 +33,8 @@ namespace ops {
template<DeviceType D, class T>
class BatchNormOp;
template<>
class BatchNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class BatchNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context),
......@@ -43,7 +43,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(
ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
......@@ -91,13 +92,13 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard offset_mapper(offset);
Tensor::MappingGuard output_mapper(output);
const float *input_ptr = input->data<float>();
const float *scale_ptr = scale->data<float>();
const float *offset_ptr = offset->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
const T *scale_ptr = scale->data<T>();
const T *offset_ptr = offset->data<T>();
T *output_ptr = output->mutable_data<T>();
std::vector<float> new_scale;
std::vector<float> new_offset;
std::vector<T> new_scale;
std::vector<T> new_offset;
if (not_folded) {
const Tensor *mean = this->Input(MEAN);
const Tensor *var = this->Input(VAR);
......@@ -109,8 +110,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var);
const float *mean_ptr = mean->data<float>();
const float *var_ptr = var->data<float>();
const T *mean_ptr = mean->data<T>();
const T *var_ptr = var->data<T>();
thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start,
index_t end,
......@@ -122,9 +123,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, channels, 1);
}
const float *scale_data = not_folded ? new_scale.data() : scale_ptr;
const float
*offset_data = not_folded ? new_offset.data() : offset_ptr;
const T *scale_data = not_folded ? new_scale.data() : scale_ptr;
const T *offset_data = not_folded ? new_offset.data() : offset_ptr;
index_t channel_size = height * width;
index_t batch_size = channels * channel_size;
......@@ -232,6 +232,7 @@ class BatchNormOp<DeviceType::GPU, float> : public Operation {
void RegisterBatchNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
}
......
......@@ -84,8 +84,8 @@ class BatchToSpaceOpBase : public Operation {
template<DeviceType D, class T>
class BatchToSpaceNDOp;
template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
template<class T>
class BatchToSpaceNDOp<DeviceType::CPU, T> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {}
......@@ -108,8 +108,8 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
const float *input_data = batch_tensor->data<float>();
float *output_data = space_tensor->mutable_data<float>();
const T *input_data = batch_tensor->data<T>();
T *output_data = space_tensor->mutable_data<T>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(2);
......@@ -120,10 +120,11 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
index_t out_height = space_tensor->dim(2);
index_t out_width = space_tensor->dim(3);
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// 32k/sizeof(T)/out_width/block_shape
index_t block_h_size = std::max(
static_cast<index_t>(1),
static_cast<index_t>(
(32 / sizeof(T)) * 1024 / block_shape_w / out_width));
// make channel the outer loop so we can make the best use of the cache
for (index_t c = 0; c < channels; ++c) {
......@@ -153,9 +154,9 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
const T *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
T *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
......@@ -290,6 +291,9 @@ void RegisterBatchToSpaceND(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
......
......@@ -33,15 +33,15 @@ namespace ops {
template<DeviceType D, class T>
class BiasAddOp;
template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
template<class T>
class BiasAddOp<DeviceType::CPU, T> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -67,9 +67,9 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
const float *input_ptr = input->data<float>();
const float *bias_ptr = bias->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>();
const std::vector<index_t> &shape = input->shape();
const index_t channels = *shape.rbegin();
......@@ -162,6 +162,7 @@ class BiasAddOp<DeviceType::GPU, float> : public Operation {
void RegisterBiasAdd(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
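MACE_REGISTER_BF16_OP appears throughout this commit but is defined elsewhere in the tree. Judging from its call sites, a plausible sketch (an assumption, not the actual definition) is a thin wrapper that forwards to MACE_REGISTER_OP with a BFloat16 element type only when bfloat16 support is compiled in:

// Hypothetical definition; the real macro lives in the op registry headers.
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_REGISTER_BF16_OP(op_registry, op_name, op_class, device) \
  MACE_REGISTER_OP(op_registry, op_name, op_class, device, BFloat16)
#else
#define MACE_REGISTER_BF16_OP(op_registry, op_name, op_class, device)
#endif  // MACE_ENABLE_BFLOAT16

Under this reading, builds that leave MACE_ENABLE_BFLOAT16 off register no extra kernels and pay no binary-size cost.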
......@@ -56,10 +56,8 @@ class CastOp : public Operation {
};
void RegisterCast(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, int32_t);
MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp, DeviceType::CPU, int32_t);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float16_t);
......
......@@ -64,7 +64,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const T *in_ptr = input_ptr + b * batch_size
+ (g * channels_per_group + idx) * image_size;
T *out_ptr = output_ptr + b * batch_size + c * image_size;
memcpy(out_ptr, in_ptr, image_size * sizeof(float));
memcpy(out_ptr, in_ptr, image_size * sizeof(T));
}
}
......@@ -102,6 +102,8 @@ class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
void RegisterChannelShuffle(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
......
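The sizeof(float) to sizeof(T) fix above becomes load-bearing once T can be a 16-bit type: with a 2-byte bfloat16 element, the old byte count would copy twice as many bytes as the channel tile actually holds. A minimal sketch of the corrected copy; CopyChannel is an illustrative name:

#include <cstring>
// Copies image_size elements of T; the byte count must track sizeof(T),
// otherwise a 2-byte element type reads and writes past the buffer.
template <typename T>
void CopyChannel(T *out_ptr, const T *in_ptr, index_t image_size) {
  std::memcpy(out_ptr, in_ptr, image_size * sizeof(T));
}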
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#include "mace/ops/common/lstm.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
void LSTMNonlinearKernel(const OpContext *context,
const float *input_data,
const float *prev_data,
const float *scale_data,
const float *params_data,
bool embed_scales,
index_t params_stride,
index_t cell_dim,
float *output_cell,
float *output_data) {
float i_scale = (embed_scales && scale_data) ? scale_data[0] : 1.0f;
float f_scale = (embed_scales && scale_data) ? scale_data[1] : 1.0f;
float o_scale = (embed_scales && scale_data) ? scale_data[2] : 1.0f;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (prev_data == nullptr) {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part);
float c_t = i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
} else {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float f_part = input_data[c + cell_dim];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float c_prev = prev_data[c];
float w_ic = params_data[c];
float w_fc = params_data[c + params_stride];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part + w_ic * c_prev);
float f_t = ScalarSigmoid(f_part + w_fc * c_prev);
float c_t =
f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
}
}, 0, cell_dim, 1);
}
} // namespace ops
} // namespace mace
......@@ -12,25 +12,77 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Details are in
// http://kaldi-asr.org/doc/nnet-simple-component_8h_source.html#l02164
#ifndef MACE_OPS_COMMON_LSTM_H_
#define MACE_OPS_COMMON_LSTM_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/types.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
void LSTMNonlinearKernel(const OpContext *opContext,
const float *input_data,
const float *prev_data,
const float *scale_data,
const float *params_data,
template <typename T>
void LSTMNonlinearKernel(const OpContext *context,
const T *input_data,
const T *prev_data,
const T *scale_data,
const T *params_data,
bool embed_scales,
index_t params_stride,
index_t cell_dim,
float *output_cell,
float *output_data);
T *output_cell,
T *output_data) {
float i_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[0]) : 1.0f;
float f_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[1]) : 1.0f;
float o_scale =
(embed_scales && scale_data) ? static_cast<float>(scale_data[2]) : 1.0f;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (prev_data == nullptr) {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part);
float c_t = i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
} else {
for (index_t c = start; c < end; c += step) {
float i_part = input_data[c];
float f_part = input_data[c + cell_dim];
float c_part = input_data[c + 2 * cell_dim];
float o_part = input_data[c + 3 * cell_dim];
float c_prev = prev_data[c];
float w_ic = params_data[c];
float w_fc = params_data[c + params_stride];
float w_oc = params_data[c + params_stride * 2];
float i_t = ScalarSigmoid(i_part + w_ic * c_prev);
float f_t = ScalarSigmoid(f_part + w_fc * c_prev);
float c_t =
f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
float o_t = ScalarSigmoid(o_part + w_oc * c_t);
float m_t = o_t * o_scale * std::tanh(c_t);
output_cell[c] = c_t;
output_data[c] = m_t;
}
}
}, 0, cell_dim, 1);
}
} // namespace ops
} // namespace mace
......
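For reference, the kernel above implements the LSTM cell recurrence of the Kaldi component linked in the header; the no-previous-state branch is the same computation with c_prev fixed to zero. Restating the loop body as equations, where sigmoid denotes the logistic function that ScalarSigmoid is assumed to compute:

i_t = sigmoid(i_part + w_ic * c_prev)                        // input gate
f_t = sigmoid(f_part + w_fc * c_prev)                        // forget gate
c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh(c_part)  // new cell state
o_t = sigmoid(o_part + w_oc * c_t)                           // output gate
m_t = o_t * o_scale * tanh(c_t)                              // emitted value

output_cell[c] receives c_t and output_data[c] receives m_t; w_ic, w_fc and w_oc are the per-cell diagonal (peephole-style) parameters read from params_data.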
......@@ -26,10 +26,10 @@
namespace mace {
namespace ops {
template<typename T>
template<typename SrcT, typename DstT>
void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool,
const T *input,
T *output,
const SrcT *input,
DstT *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
......@@ -50,11 +50,11 @@ void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool,
}
template<>
inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool,
const float *input,
float *output,
const index_t height,
const index_t width) {
inline void TransposeNHWCToNCHWC3<float, float>(utils::ThreadPool *thread_pool,
const float *input,
float *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
thread_pool->Compute1D([=](index_t start, index_t end, index_t step) {
......@@ -91,10 +91,10 @@ inline void TransposeNHWCToNCHWC3<float>(utils::ThreadPool *thread_pool,
}, 0, height, 1);
}
template<typename T>
template<typename SrcT, typename DstT>
void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool,
const T *input,
T *output,
const SrcT *input,
DstT *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
......@@ -115,11 +115,11 @@ void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool,
}
template<>
inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
const float *input,
float *output,
const index_t height,
const index_t width) {
inline void TransposeNCHWToNHWCC2<float, float>(utils::ThreadPool *thread_pool,
const float *input,
float *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
thread_pool->Compute1D([=](index_t start, index_t end, index_t step) {
......@@ -155,15 +155,15 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
}, 0, height, 1);
}
template<typename T>
template<typename SrcT, typename DstT>
MaceStatus Transpose(utils::ThreadPool *thread_pool,
const T *input,
const SrcT *input,
const std::vector<int64_t> &input_shape,
const std::vector<int> &dst_dims,
T *output) {
DstT *output) {
MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
(input_shape.size() == 3 && dst_dims.size() == 3) ||
(input_shape.size() == 4 && dst_dims.size() == 4),
(input_shape.size() == 3 && dst_dims.size() == 3) ||
(input_shape.size() == 4 && dst_dims.size() == 4),
"Only support 2D, 3D or 4D transpose");
std::vector<index_t> output_shape;
......@@ -220,7 +220,6 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channel = input_shape[3];
size_t channel_raw_size = channel * sizeof(T);
index_t stride_i = height;
index_t stride_j = width;
index_t tile_size = std::max(static_cast<index_t>(1),
......@@ -232,9 +231,11 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
memcpy(output + (tile_j * stride_i + tile_i) * channel,
input + (tile_i * stride_j + tile_j) * channel,
channel_raw_size);
auto output_ptr = output + (tile_j * stride_i + tile_i) * channel;
auto input_ptr = input + (tile_i * stride_j + tile_j) * channel;
for (index_t k = 0; k < channel; ++k) {
output_ptr[k] = input_ptr[k];
}
}
}
}
......@@ -296,14 +297,15 @@ MaceStatus Transpose(utils::ThreadPool *thread_pool,
}
}, 0, batch, 1, 0, height, tile_size, 0, width, tile_size);
} else if (dst_dims == std::vector<int>{1, 0, 2}) {
size_t width_raw_size = width * sizeof(T);
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (int i = start0; i < end0; i += step0) {
for (int j = start1; j < end1; j += step1) {
memcpy(output + (j * batch + i) * width,
input + (i * height + j) * width,
width_raw_size);
auto output_ptr = output + (j * batch + i) * width;
auto input_ptr = input + (i * height + j) * width;
for (index_t k = 0; k < width; ++k) {
output_ptr[k] = input_ptr[k];
}
}
}
}, 0, batch, 1, 0, height, 1);
......
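The memcpy fast paths above were rewritten as explicit element loops because Transpose now carries separate SrcT and DstT parameters: a raw byte copy cannot convert between element types and is only meaningful when the two types coincide. A minimal sketch of the conversion-aware inner copy, assuming DstT is assignable from SrcT (as float and MACE's bfloat16 type are in both directions):

// Per-element assignment invokes the SrcT -> DstT conversion (for example
// float -> bfloat16); memcpy would merely reinterpret bytes.
template <typename SrcT, typename DstT>
void ConvertCopy(const SrcT *src, DstT *dst, index_t n) {
  for (index_t k = 0; k < n; ++k) {
    dst[k] = src[k];
  }
}

This also explains why channel_raw_size and width_raw_size were dropped: a single byte count no longer describes both buffers.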
......@@ -225,6 +225,8 @@ class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
void RegisterConcat(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, int32_t);
......
......@@ -53,15 +53,16 @@ namespace ops {
template<DeviceType D, class T>
class Conv2dOp;
template<>
class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
template<class T>
class Conv2dOp<DeviceType::CPU, T> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(
ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
......@@ -71,7 +72,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
0.0f)))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -81,9 +82,8 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
Tensor *output = this->Output(OUTPUT);
if (conv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
if (MACE_CPU_IMPL_TYPE == NEON) {
auto tag = MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, T, kCpuImplType);
if (kCpuImplType == NEON) {
// the following params are used to decide which conv delegator to use
const index_t stride_h = strides_[0];
const index_t stride_w = strides_[1];
......@@ -98,63 +98,63 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// We do not support changeable filters for now.
if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K1x1);
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
if (input_channels >= 8 && channels >= 8) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3Winograd);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K3x3Winograd);
} else {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
}
} else if (filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (filter_h == 5 && filter_w == 5
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K5x5S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K5x5S1);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x7S1);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S2);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x7S2);
} else if (filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x7S3);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x7S3);
} else if (filter_h == 1 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x7S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K1x7S1);
} else if (filter_h == 7 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K7x1S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K7x1S1);
} else if (filter_h == 1 && filter_w == 15
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K1x15S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K1x15S1);
} else if (filter_h == 15 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1
&& dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K15x1S1);
tag = MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, T,
kCpuImplType, K15x1S1);
}
}
delegator::Conv2dParam param(strides_, dilations_,
......@@ -497,8 +497,8 @@ class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_OPENCL
void RegisterConv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Conv2D", Conv2dOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
......
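This file also swaps the MACE_CPU_IMPL_TYPE macro for a kCpuImplType constant in the tag selection above. A plausible sketch of that constant (an assumption; its actual definition is not shown in this commit and is presumably driven by the build system):

// Hypothetical: a compile-time mirror of the old macro, so that
// `if (kCpuImplType == NEON)` folds away and works inside templated ops.
enum ImplType { REF, NEON };
#ifdef MACE_ENABLE_NEON
constexpr ImplType kCpuImplType = NEON;
#else
constexpr ImplType kCpuImplType = REF;
#endif  // MACE_ENABLE_NEON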
......@@ -100,8 +100,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
memcpy(out_ptr, in_ptr_bch,
output_shape[3] * sizeof(T));
memcpy(out_ptr, in_ptr_bch, output_shape[3] * sizeof(T));
}
}
}
......@@ -134,8 +133,8 @@ class CropOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Crop", CropOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......
......@@ -143,8 +143,8 @@ class CumsumOp<DeviceType::CPU, T> : public Operation {
};
void RegisterCumsum(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Cumsum", CumsumOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -46,20 +46,21 @@ const std::vector<int> kDeconv2dStrides = {1, 1};
template<DeviceType D, class T>
class Deconv2dOp;
template<>
class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
template<class T>
class Deconv2dOp<DeviceType::CPU, T> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context),
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -80,11 +81,9 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
if (deconv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
if (MACE_CPU_IMPL_TYPE == NEON) {
auto tag = MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, T, kCpuImplType);
if (kCpuImplType == NEON) {
const index_t kernel_h = filter->dim(2);
const index_t kernel_w = filter->dim(3);
......@@ -104,23 +103,23 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
strides_[0] == strides_[1] && strides_[0] == 2;
if (use_neon_2x2_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K2x2S1);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K2x2S1);
} else if (use_neon_2x2_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K2x2S2);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K2x2S2);
} else if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S1);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S2);
tag = MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S2);
}
}
delegator::Deconv2dParam param(strides_, kDeconv2dStrides, paddings_,
......@@ -236,8 +235,8 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
#endif // MACE_ENABLE_OPENCL
void RegisterDeconv2D(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
......
......@@ -27,7 +27,6 @@ namespace mace {
namespace ops {
enum ConvType {
General,
K1x1,
K1x7S1,
K7x1S1,
......
......@@ -21,12 +21,12 @@
#include "mace/core/ops/op_context.h"
#include "mace/core/ops/op_delegator.h"
#include "mace/core/registry/op_delegator_registry.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
enum DeconvType {
General,
K2x2S1,
K2x2S2,
K3x3S1,
......
......@@ -28,8 +28,8 @@ namespace ops {
template<DeviceType D, class T>
class DepthToSpaceOp;
template<>
class DepthToSpaceOp<CPU, float> : public Operation {
template<class T>
class DepthToSpaceOp<CPU, T> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context),
......@@ -59,8 +59,8 @@ class DepthToSpaceOp<CPU, float> : public Operation {
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
......@@ -188,6 +188,8 @@ class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
void RegisterDepthToSpace(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthToSpace",
......
......@@ -63,20 +63,21 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase {
template<DeviceType D, class T>
class DepthwiseConv2dOp;
template<>
class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
template<class T>
class DepthwiseConv2dOp<DeviceType::CPU, T> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context),
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -93,9 +94,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
MACE_CHECK_NOTNULL(output);
if (depthwise_conv2d_delegator_ == nullptr) {
std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float,
REF, General);
if (MACE_CPU_IMPL_TYPE == NEON) {
auto tag = MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
T, ImplType::REF);
if (kCpuImplType == NEON) {
const index_t filter_h = filter->dim(2);
const index_t filter_w = filter->dim(3);
const index_t stride_h = strides_[0];
......@@ -104,13 +105,13 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
const index_t dilation_w = dilations_[1];
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2
&& stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
}
}
delegator::Conv2dParam param(strides_, dilations_,
......@@ -347,7 +348,8 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#ifdef MACE_ENABLE_OPENCL
template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
class DepthwiseConv2dOp<DeviceType::GPU, float> :
public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
......@@ -402,6 +404,8 @@ class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
void RegisterDepthwiseConv2d(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
......
......@@ -44,8 +44,8 @@ const std::vector<int> kDepthwiseStrides = {1, 1};
template<DeviceType D, class T>
class DepthwiseDeconv2dOp;
template<>
class DepthwiseDeconv2dOp<DeviceType::CPU, float>
template<class T>
class DepthwiseDeconv2dOp<DeviceType::CPU, T>
: public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
......@@ -53,12 +53,13 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
activation_delegator_(
delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU,
T, kCpuImplType),
delegator::ActivationParam(activation_, relux_max_limit_,
leakyrelu_coefficient_))),
bias_add_delegator_(delegator::BiasAdd::Create(
context->workspace(),
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -75,7 +76,7 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
bool is_depthwise = group_ == in_channels;
if (depthwise_deconv2d_delegator_ == nullptr) {
if (MACE_CPU_IMPL_TYPE == NEON) {
if (kCpuImplType == NEON) {
const index_t kernel_h = filter->dim(2);
const index_t kernel_w = filter->dim(3);
bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
......@@ -88,20 +89,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
strides_[0] == strides_[1] && strides_[0] == 2;
if (is_depthwise) {
std::string tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
auto tag = MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType);
if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S1);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S2);
tag = MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S2);
}
delegator::DepthwiseDeconv2dParam param(strides_, kDepthwiseStrides,
paddings_, padding_type_,
......@@ -109,20 +110,20 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
depthwise_deconv2d_delegator_ = delegator::DepthwiseDeconv2d::Create(
context->workspace(), tag, param);
} else {
std::string tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, General);
auto tag = MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType);
if (use_neon_3x3_s1) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S1);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S1);
} else if (use_neon_3x3_s2) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K3x3S2);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K3x3S2);
} else if (use_neon_4x4_s1) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S1);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S1);
} else if (use_neon_4x4_s2) {
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, CPU, float,
MACE_CPU_IMPL_TYPE, K4x4S2);
tag = MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, T,
kCpuImplType, K4x4S2);
}
delegator::GroupDeconv2dParam param(strides_, kDepthwiseStrides,
paddings_, padding_type_,
......@@ -218,6 +219,8 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
void RegisterDepthwiseDeconv2d(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
}
......
......@@ -75,7 +75,7 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
Operation::GetRepeatedArgs<index_t>("out_cache_indexes")),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, T, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
inline void Validate() {
......@@ -107,14 +107,14 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
") should be greater than zero.");
}
void UpdateCell(float *cell_data,
void UpdateCell(T *cell_data,
const index_t cell_dim,
const float scale) {
if (std::abs(scale - 1.f) < 1e-6)
return;
const index_t rounds = cell_dim / 4;
for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON
#if defined(MACE_ENABLE_NEON) && !defined(MACE_ENABLE_BFLOAT16)
float32x4_t in_vec = vld1q_f32(cell_data + i);
float32x4_t scale_vec = vdupq_n_f32(scale);
in_vec = vmulq_f32(in_vec, scale_vec);
......@@ -130,18 +130,18 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
}
}
void CopyAndUpdateCell(float *src_data,
void CopyAndUpdateCell(T *src_data,
const index_t cell_dim,
const float scale,
float *cell_data) {
T *cell_data) {
if (std::abs(scale - 1.f) < 1e-6) {
memcpy(cell_data, src_data, cell_dim * sizeof(float));
memcpy(cell_data, src_data, cell_dim * sizeof(T));
return;
}
const index_t rounds = cell_dim / 4;
for (index_t i = 0; i < rounds * 4; i += 4) {
#ifdef MACE_ENABLE_NEON
#if defined(MACE_ENABLE_NEON) && !defined(MACE_ENABLE_BFLOAT16)
float32x4_t in_vec = vld1q_f32(src_data + i);
float32x4_t scale_vec = vdupq_n_f32(scale);
in_vec = vmulq_f32(in_vec, scale_vec);
......@@ -222,49 +222,54 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
<< " output_dim: " << output_dim;
const index_t affine_a_in_size =
PadAlignSize(affine_a_in_dim * sizeof(float));
PadAlignSize(affine_a_in_dim * sizeof(T));
const index_t affine_a_out_size =
PadAlignSize(affine_a_out_dim * sizeof(float));
PadAlignSize(affine_a_out_dim * sizeof(T));
const index_t affine_b_in_size =
PadAlignSize(affine_b_in_dim * sizeof(float));
PadAlignSize(affine_b_in_dim * sizeof(T));
const index_t affine_b_out_size =
PadAlignSize(affine_b_out_dim * sizeof(float));
PadAlignSize(affine_b_out_dim * sizeof(T));
const int out_buf_chunk = abs(prev_out_delay_ / subsample_factor_);
const int cell_buf_chunk = abs(prev_cell_delay_ / subsample_factor_);
const index_t out_buf_size =
PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(float));
PadAlignSize(out_buf_chunk * prev_out_dim_ * sizeof(T));
const index_t cell_buf_size =
PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(float));
PadAlignSize(cell_buf_chunk * prev_cell_dim_ * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(affine_a_in_size + affine_a_out_size
+ affine_b_in_size + affine_b_out_size
+ out_buf_size + cell_buf_size);
Tensor prev_out_buf(scratch->Scratch(out_buf_size), DT_FLOAT);
Tensor prev_out_buf(scratch->Scratch(out_buf_size), DataTypeToEnum<T>::v());
prev_out_buf.Reshape({out_buf_chunk, prev_out_dim_});
float *prev_out_buf_data = prev_out_buf.mutable_data<float>();
T *prev_out_buf_data = prev_out_buf.mutable_data<T>();
Tensor prev_cell_buf(scratch->Scratch(cell_buf_size), DT_FLOAT);
Tensor prev_cell_buf(
scratch->Scratch(cell_buf_size), DataTypeToEnum<T>::v());
prev_cell_buf.Reshape({cell_buf_chunk, prev_cell_dim_});
float *prev_cell_buf_data = prev_cell_buf.mutable_data<float>();
T *prev_cell_buf_data = prev_cell_buf.mutable_data<T>();
Tensor affine_a_in(scratch->Scratch(affine_a_in_size), DT_FLOAT);
Tensor affine_a_in(
scratch->Scratch(affine_a_in_size), DataTypeToEnum<T>::v());
affine_a_in.Reshape({1, affine_a_in_dim});
float *affine_a_in_data = affine_a_in.mutable_data<float>();
T *affine_a_in_data = affine_a_in.mutable_data<T>();
Tensor affine_a_out(scratch->Scratch(affine_a_out_size), DT_FLOAT);
Tensor affine_a_out(
scratch->Scratch(affine_a_out_size), DataTypeToEnum<T>::v());
affine_a_out.Reshape({1, affine_a_out_dim});
float *affine_a_out_data = affine_a_out.mutable_data<float>();
T *affine_a_out_data = affine_a_out.mutable_data<T>();
Tensor affine_b_in(scratch->Scratch(affine_b_in_size), DT_FLOAT);
Tensor affine_b_in(
scratch->Scratch(affine_b_in_size), DataTypeToEnum<T>::v());
affine_b_in.Reshape({1, affine_b_in_dim});
float *affine_b_in_data = affine_b_in.mutable_data<float>();
T *affine_b_in_data = affine_b_in.mutable_data<T>();
Tensor affine_b_out(scratch->Scratch(affine_b_out_size), DT_FLOAT);
Tensor affine_b_out(
scratch->Scratch(affine_b_out_size), DataTypeToEnum<T>::v());
affine_b_out.Reshape({1, affine_b_out_dim});
float *affine_b_out_data = affine_b_out.mutable_data<float>();
T *affine_b_out_data = affine_b_out.mutable_data<T>();
Tensor *output = this->Output(OUTPUT);
Tensor *out_cache = this->Output(OUT_CACHE);
......@@ -293,31 +298,31 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
Tensor::MappingGuard out_cache_guard(out_cache);
Tensor::MappingGuard cell_cache_guard(cell_cache);
const float *input_data = input->data<float>();
const float *prev_out_data = prev_out->data<float>();
const float *prev_cell_data = prev_cell->data<float>();
const float *lstm_params_data = lstm_params->data<float>();
float *output_data = output->mutable_data<float>();
float *out_cache_data = out_cache->mutable_data<float>();
float *cell_cache_data = cell_cache->mutable_data<float>();
const T *input_data = input->data<T>();
const T *prev_out_data = prev_out->data<T>();
const T *prev_cell_data = prev_cell->data<T>();
const T *lstm_params_data = lstm_params->data<T>();
T *output_data = output->mutable_data<T>();
T *out_cache_data = out_cache->mutable_data<T>();
T *cell_cache_data = cell_cache->mutable_data<T>();
for (int b = 0; b < batch; ++b) {
memcpy(prev_out_buf_data,
prev_out_data + b * out_buf_chunk * prev_out_dim_,
sizeof(float) * out_buf_chunk * prev_out_dim_);
sizeof(T) * out_buf_chunk * prev_out_dim_);
memcpy(prev_cell_buf_data,
prev_cell_data + b * cell_buf_chunk * prev_cell_dim_,
sizeof(float) * cell_buf_chunk * prev_cell_dim_);
sizeof(T) * cell_buf_chunk * prev_cell_dim_);
for (index_t i = 0; i < out_chunk; ++i) {
const float *input_ptr =
const T *input_ptr =
input_data + (b * chunk + forward_indexes_[i]) * input_dim;
float *output_ptr = output_data + (b * out_chunk + i) * output_dim;
T *output_ptr = output_data + (b * out_chunk + i) * output_dim;
// Append
memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(float));
memcpy(affine_a_in_data, input_ptr, input_dim * sizeof(T));
memcpy(affine_a_in_data + input_dim,
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_,
prev_out_dim_ * sizeof(float));
prev_out_dim_ * sizeof(T));
// Affine
gemv_->Compute(context,
weights_a,
......@@ -330,11 +335,11 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
false,
&affine_a_out);
// Prepare LSTMNonlinear input and output pointers
float *lstm_cell_ptr =
T *lstm_cell_ptr =
prev_cell_buf_data + i % cell_buf_chunk * prev_cell_dim_;
float *curr_cell_ptr = lstm_cell_ptr;
T *curr_cell_ptr = lstm_cell_ptr;
// LSTMNonlinear
LSTMNonlinearKernel(context,
LSTMNonlinearKernel<T>(context,
affine_a_out_data,
lstm_cell_ptr,
nullptr,
......@@ -359,9 +364,9 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
// Output
memcpy(output_ptr,
affine_b_out_data,
output_dim * sizeof(float));
output_dim * sizeof(T));
// Update
float *curr_out_ptr =
T *curr_out_ptr =
prev_out_buf_data + i % out_buf_chunk * prev_out_dim_;
CopyAndUpdateCell(affine_b_out_data + prev_out_offset_,
prev_out_dim_,
......@@ -371,22 +376,22 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
for (size_t k = 0; k < out_cache_indexes_.size(); ++k) {
if (i == out_cache_indexes_[k]) {
const index_t idx = b * out_buf_chunk + k;
float *out_cache_ptr =
T *out_cache_ptr =
out_cache_data + idx * prev_out_dim_;
memcpy(out_cache_ptr,
curr_out_ptr,
sizeof(float) * prev_out_dim_);
sizeof(T) * prev_out_dim_);
}
}
for (size_t k = 0; k < cell_cache_indexes_.size(); ++k) {
if (i == cell_cache_indexes_[k]) {
const index_t idx = b * cell_buf_chunk + k;
float *cell_cache_ptr =
T *cell_cache_ptr =
cell_cache_data + idx * prev_cell_dim_;
memcpy(cell_cache_ptr,
curr_cell_ptr,
sizeof(float) * prev_cell_dim_);
sizeof(T) * prev_cell_dim_);
}
}
}
......@@ -416,6 +421,8 @@ class DynamicLSTMOp<DeviceType::CPU, T> : public Operation {
void RegisterDynamicLSTM(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "DynamicLSTM", DynamicLSTMOp,
DeviceType::CPU);
}
} // namespace ops
......
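The preprocessor guard in UpdateCell and CopyAndUpdateCell exists because the NEON intrinsics used there (vld1q_f32, vdupq_n_f32, vmulq_f32) operate on 4-byte floats only; once T may be bfloat16, the build must fall back to the scalar loop. A condensed view of the two paths, restating the code above; the scalar body is cut off in this hunk, so the fallback shown is an assumed equivalent for any T with float conversions:

#if defined(MACE_ENABLE_NEON) && !defined(MACE_ENABLE_BFLOAT16)
  // Vector path: valid only when cell_data points at genuine floats.
  float32x4_t in_vec = vld1q_f32(cell_data + i);
  in_vec = vmulq_f32(in_vec, vdupq_n_f32(scale));
  vst1q_f32(cell_data + i, in_vec);
#else
  // Scalar path: works for any T convertible to/from float, bfloat16 included.
  for (int j = 0; j < 4; ++j) {
    cell_data[i + j] = cell_data[i + j] * scale;
  }
#endif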
......@@ -1073,7 +1073,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
"scalar_input_index", 1)),
eltwise_delegator_(delegator::Eltwise::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, kCpuImplType),
delegator::EltwiseParam(
static_cast<ops::EltwiseType>(
Operation::GetOptionalArg<int>(
......@@ -1175,8 +1175,8 @@ class EltwiseOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterEltwise(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::CPU, int32_t);
......
......@@ -57,6 +57,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
void RegisterExpandDims(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, int32_t);
......
......@@ -89,15 +89,16 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
output_shape[dim_size - 2] = output_chunk;
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t extract_out_size = PadAlignSize(output_dim * sizeof(float));
const index_t extract_out_size = PadAlignSize(output_dim * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(extract_out_size);
Tensor extract_out(scratch->Scratch(extract_out_size), DT_FLOAT);
Tensor extract_out(
scratch->Scratch(extract_out_size), DataTypeToEnum<T>::v());
extract_out.Reshape({1, output_dim});
extract_out.Clear();
float *extract_out_data = extract_out.mutable_data<float>();
T *extract_out_data = extract_out.mutable_data<T>();
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output);
......@@ -162,7 +163,7 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
}, 0, input_dim, 1);
}
memcpy(output_data + (b * output_chunk + i) * output_dim,
extract_out_data, output_dim * sizeof(float));
extract_out_data, output_dim * sizeof(T));
}
}
......@@ -180,6 +181,8 @@ class ExtractPoolingOp<DeviceType::CPU, T> : public Operation {
void RegisterExtractPooling(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ExtractPooling", ExtractPoolingOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ExtractPooling", ExtractPoolingOp,
DeviceType::CPU);
}
} // namespace ops
......
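Scratch tensors in this commit are typed with DataTypeToEnum<T>::v() instead of the hard-coded DT_FLOAT, so a buffer's element type follows the op's template parameter. A sketch of what such a trait looks like; the real one lives in MACE's core types, and the DT_BFLOAT16 enumerator name is an assumption:

// Hypothetical trait mapping C++ element types to DataType enum values.
template <typename T> struct DataTypeToEnum;
template <> struct DataTypeToEnum<float> {
  static constexpr DataType v() { return DT_FLOAT; }
};
template <> struct DataTypeToEnum<BFloat16> {
  static constexpr DataType v() { return DT_BFLOAT16; }  // assumed name
};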
......@@ -22,8 +22,8 @@ namespace ops {
template <DeviceType D, class T>
class FillOp;
template <>
class FillOp<DeviceType::CPU, float> : public Operation {
template <class T>
class FillOp<DeviceType::CPU, T> : public Operation {
public:
explicit FillOp(OpConstructContext *context)
: Operation(context) {}
......@@ -46,11 +46,11 @@ class FillOp<DeviceType::CPU, float> : public Operation {
}
Tensor::MappingGuard value_guard(value);
const float *value_data = value->data<float>();
const T *value_data = value->data<T>();
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard output_guard(output);
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
std::fill(output_data, output_data + output->size(), *value_data);
......@@ -65,6 +65,7 @@ class FillOp<DeviceType::CPU, float> : public Operation {
void RegisterFill(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Fill", FillOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Fill", FillOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -56,20 +56,20 @@ class FullyConnectedOpBase : public Operation {
template<DeviceType D, class T>
class FullyConnectedOp;
template<>
class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase {
template<class T>
class FullyConnectedOp<DeviceType::CPU, T> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context),
activation_delegator_(delegator::Activation::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Activation, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, T, kCpuImplType),
delegator::ActivationParam(activation_,
relux_max_limit_,
leakyrelu_coefficient_))),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -127,7 +127,7 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
: FullyConnectedOpBase(context),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -226,6 +226,8 @@ class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
void RegisterFullyConnected(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "FullyConnected",
......
......@@ -89,6 +89,7 @@ class GatherOp : public Operation {
void RegisterGather(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Gather", GatherOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
......
......@@ -36,6 +36,8 @@ class IdentityOp : public Operation {
void RegisterIdentity(OpRegistry *op_registry) {
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
......
......@@ -166,6 +166,7 @@ class IfDefinedOp<DeviceType::CPU, T> : public Operation {
void RegisterIfDefined(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "IfDefined", IfDefinedOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "IfDefined", IfDefinedOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -105,6 +105,8 @@ class InferConv2dShapeOp : public Operation {
void RegisterInferConv2dShape(OpRegistry *op_registry) {
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
......
......@@ -28,8 +28,8 @@ namespace ops {
template <DeviceType D, class T>
class KaldiBatchNormOp;
template <>
class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
template <class T>
class KaldiBatchNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit KaldiBatchNormOp(OpConstructContext *context)
: Operation(context),
......@@ -40,13 +40,13 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
test_mode_(static_cast<bool>(
Operation::GetOptionalArg<int>("test_mode", 0))) {}
void CalculateMeanVar(const float *input_data,
void CalculateMeanVar(const T *input_data,
index_t length,
index_t stride,
float mean_scale,
float var_scale,
float *mean_data,
float *var_data) {
T *mean_data,
T *var_data) {
float mean_value = 0.f;
float var_value = 0.f;
for (index_t i = 0; i < length; ++i) {
......@@ -84,8 +84,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
......@@ -102,8 +102,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
&& scale->size() == block_dim_);
Tensor::MappingGuard scale_guard(scale);
Tensor::MappingGuard offset_guard(offset);
const float *scale_data = scale->data<float>();
const float *offset_data = offset->data<float>();
const T *scale_data = scale->data<T>();
const T *offset_data = offset->data<T>();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
......@@ -116,18 +116,18 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
}, 0, num_rows, 1, 0, block_dim_, 1);
} else {
const index_t buf_size =
PadAlignSize(block_dim_ * sizeof(float));
PadAlignSize(block_dim_ * sizeof(T));
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(2 * buf_size);
Tensor mean(scratch->Scratch(buf_size), DT_FLOAT);
Tensor mean(scratch->Scratch(buf_size), DataTypeToEnum<T>::v());
mean.Reshape({block_dim_});
float *mean_data = mean.mutable_data<float>();
T *mean_data = mean.mutable_data<T>();
Tensor var(scratch->Scratch(buf_size), DT_FLOAT);
Tensor var(scratch->Scratch(buf_size), DataTypeToEnum<T>::v());
var.Reshape({block_dim_});
float *var_data = var.mutable_data<float>();
T *var_data = var.mutable_data<T>();
float var_scale = 1.0f / (target_rms_ * target_rms_);
float mean_scale = 1.0f / num_rows;
......@@ -171,6 +171,8 @@ class KaldiBatchNormOp<DeviceType::CPU, float> : public Operation {
void RegisterKaldiBatchNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "KaldiBatchNorm", KaldiBatchNormOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -24,8 +24,8 @@ namespace ops {
template<DeviceType D, class T>
class LocalResponseNormOp;
template<>
class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class LocalResponseNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit LocalResponseNormOp(OpConstructContext *context)
: Operation(context),
......@@ -49,8 +49,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
const index_t image_size = height * width;
const index_t batch_size = channels * image_size;
......@@ -95,6 +95,8 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
void RegisterLocalResponseNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LocalResponseNorm",
LocalResponseNormOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LocalResponseNorm",
LocalResponseNormOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -35,8 +35,8 @@ namespace ops {
template<DeviceType D, typename T>
class LpNormOp;
template<>
class LpNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class LpNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit LpNormOp(OpConstructContext *context)
: Operation(context),
......@@ -59,8 +59,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output);
const auto *input_data = input->data<float>();
auto *output_data = output->mutable_data<float>();
const auto *input_data = input->data<T>();
auto *output_data = output->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
auto outer_loop = std::accumulate(input_shape.begin(),
......@@ -95,7 +95,8 @@ class LpNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) {
auto output_data_base = output_data + inner_loop * i;
norm_ptr[i] = std::accumulate(output_data_base,
output_data_base + inner_loop, 0.0f);
output_data_base + inner_loop,
static_cast<T>(0.0f));
norm_ptr[i] = std::pow(norm_ptr[i], power);
norm_ptr[i] += 1e-6;
}
......@@ -151,6 +152,8 @@ class LpNormOp<DeviceType::GPU, float> : public Operation {
void RegisterLpNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LpNorm", LpNormOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp);
}
......
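The static_cast<T>(0.0f) added to the std::accumulate calls above matters because the init argument both seeds the sum and fixes the accumulator type. Restating the call with that in mind:

// std::accumulate deduces its accumulator type from the init argument:
//   std::accumulate(first, last, 0.0f)                 // sums in float
//   std::accumulate(first, last, static_cast<T>(0.0f)) // sums in T
// With T = bfloat16, the first form would add a bfloat16 element to a float
// accumulator on every step, an operator+ the templated code cannot assume
// exists; the cast keeps every addition homogeneous in T.
norm_ptr[i] = std::accumulate(output_data_base,
                              output_data_base + inner_loop,
                              static_cast<T>(0.0f));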
......@@ -70,27 +70,27 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard params_guard(params);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<T>();
const float *params_data = params->data<T>();
float *output_data = output->mutable_data<T>();
const T *input_data = input->data<T>();
const T *params_data = params->data<T>();
T *output_data = output->mutable_data<T>();
for (int r = 0; r < num_rows; ++r) {
const float *input_row = input_data + r * input_cols;
const float *prev_row = input_row + 4 * cell_dim;
const float *scale_data =
const T *input_row = input_data + r * input_cols;
const T *prev_row = input_row + 4 * cell_dim;
const T *scale_data =
embed_scales ? prev_row + cell_dim : nullptr;
float *output_cell = output_data + r * output_dim;
float *output_row = output_cell + cell_dim;
LSTMNonlinearKernel(context,
input_row,
prev_row,
scale_data,
params_data,
embed_scales,
params_stride,
cell_dim,
output_cell,
output_row);
T *output_cell = output_data + r * output_dim;
T *output_row = output_cell + cell_dim;
LSTMNonlinearKernel<T>(context,
input_row,
prev_row,
scale_data,
params_data,
embed_scales,
params_stride,
cell_dim,
output_cell,
output_row);
}
return MaceStatus::MACE_SUCCESS;
......@@ -104,6 +104,8 @@ class LSTMNonlinearOp<DeviceType::CPU, T> : public Operation {
void RegisterLSTMNonlinear(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "LSTMNonlinear", LSTMNonlinearOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -92,18 +92,18 @@ class MatMulOpBase : public Operation {
template<DeviceType D, class T>
class MatMulOp;
template<>
class MatMulOp<CPU, float> : public MatMulOpBase {
template<class T>
class MatMulOp<CPU, T> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context),
gemm_(delegator::Gemm::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemm, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, T, kCpuImplType),
delegator::GemmParam())),
gemv_(delegator::Gemv::Create(
context->workspace(),
MACE_DELEGATOR_KEY(Gemv, CPU, float, MACE_CPU_IMPL_TYPE),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, T, kCpuImplType),
DelegatorParam())) {}
MaceStatus Run(OpContext *context) override {
......@@ -197,8 +197,8 @@ class MatMulOp<CPU, float> : public MatMulOpBase {
"bias' dim should be <= 2.");
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard c_guard(C);
const float *bias_data = bias->data<float>();
float *c_data = C->mutable_data<float>();
const T *bias_data = bias->data<T>();
T *c_data = C->mutable_data<T>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
......@@ -599,6 +599,8 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
void RegisterMatMul(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
......@@ -30,8 +30,8 @@ namespace ops {
template<DeviceType D, typename T>
class MVNormOp;
template<>
class MVNormOp<DeviceType::CPU, float> : public Operation {
template<class T>
class MVNormOp<DeviceType::CPU, T> : public Operation {
public:
explicit MVNormOp(OpConstructContext *context)
: Operation(context),
......@@ -52,8 +52,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_output(output);
const auto *input_data = input->data<float>();
auto *output_data = output->mutable_data<float>();
const auto *input_data = input->data<T>();
auto *output_data = output->mutable_data<T>();
const auto input_size = input->size();
const auto outer_loop =
......@@ -71,7 +71,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) {
const auto offset = inner_loop * i;
mean_ptr[i] = std::accumulate(input_data + offset,
input_data + offset + inner_loop, 0.0f);
input_data + offset + inner_loop,
static_cast<T>(0.0f));
mean_ptr[i] /= inner_loop;
}
}, 0, outer_loop, 1);
......@@ -105,7 +106,8 @@ class MVNormOp<DeviceType::CPU, float> : public Operation {
for (index_t i = start; i < end; i += step) {
auto output_data_base = output_data + inner_loop * i;
mean_v_ptr[i] = std::accumulate(output_data_base,
output_data_base + inner_loop, 0.0f);
output_data_base + inner_loop,
static_cast<T>(0.0f));
mean_v_ptr[i] = std::pow(mean_v_ptr[i] / inner_loop, 0.5f) + eps_;
}
}, 0, outer_loop, 1);
......@@ -169,6 +171,8 @@ class MVNormOp<DeviceType::GPU, float> : public Operation {
void RegisterMVNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "MVNorm", MVNormOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp);
}
......
......@@ -39,10 +39,10 @@ class OneHotOpBase : public Operation {
int axis_;
};
template <DeviceType D, typename T>
template<DeviceType D, typename T>
class OneHotOp;
template <typename T>
template<typename T>
class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
public:
explicit OneHotOp(OpConstructContext *context) : OneHotOpBase(context) {}
......@@ -81,15 +81,17 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (axis == 1) {
for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < depth_; ++j) {
output_ptr[i * depth_ + j] = input_ptr[i] == j ? on_value_ :
off_value_;
float input_value = input_ptr[i];
output_ptr[i * depth_ + j] =
input_value == j ? on_value_ : off_value_;
}
}
} else {
for (index_t i = 0; i < depth_; ++i) {
for (index_t j = 0; j < batch; ++j) {
output_ptr[i * batch + j] = input_ptr[j] == i ? on_value_ :
off_value_;
float input_value = input_ptr[j];
output_ptr[i * batch + j] =
input_value == i ? on_value_ : off_value_;
}
}
}
......@@ -110,7 +112,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (left == 0) {
for (index_t i = 0; i < length; ++i) {
**output_ptr = **input_ptr == i ? on_value_ : off_value_;
float input_value = **input_ptr;
**output_ptr = input_value == i ? on_value_ : off_value_;
++(*output_ptr);
}
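// Note: hoisting the input element into a float before the comparison means
// only a single T -> float conversion is needed; comparing a BFloat16
// directly against an integer index would depend on mixed-type operator==
// overloads being defined.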
......@@ -130,7 +133,8 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
if (left == 0) {
for (index_t i = 0; i < length; ++i) {
**output_ptr = **input_ptr == test ? on_value_ : off_value_;
float input_value = **input_ptr;
**output_ptr = input_value == test ? on_value_ : off_value_;
++(*output_ptr);
++(*input_ptr);
}
......@@ -144,9 +148,9 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
}
};
void RegisterOneHot(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU);
}
} // namespace ops
......
......@@ -200,8 +200,8 @@ class PadOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterPad(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Pad", PadOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp);
}
......
......@@ -87,6 +87,8 @@ class PadContextOp<DeviceType::CPU, T> : public Operation {
void RegisterPadContext(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PadContext", PadContextOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -80,7 +80,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_;
T temp_result = 0;
T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) {
T value =
(std::fabs(in_base[g])
......@@ -99,9 +99,9 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_;
T temp_result = 0;
T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) {
temp_result += std::abs(in_base[g]);;
temp_result += std::abs(in_base[g]);
}
out_base[j] = temp_result;
}
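// Note: temp_result is now initialized from 0.f rather than the int literal
// 0, since T may be BFloat16, which converts from float; an int initializer
// could require a conversion the type does not provide.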
......@@ -114,7 +114,7 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
for (index_t j = start1; j < end1; j += step1) {
const T *in_base = input_data + i * input_dim + j * group_size;
T *out_base = output_data + i * output_dim_;
T temp_result = 0;
T temp_result = 0.f;
for (index_t g = 0; g < group_size; ++g) {
temp_result += in_base[g] * in_base[g];
}
......@@ -136,6 +136,8 @@ class PNormOp<DeviceType::CPU, T> : public Operation {
void RegisterPNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PNorm", PNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PNorm", PNormOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -60,8 +60,8 @@ class PoolingOpBase : public ConvPool2dOpBase {
template<DeviceType D, class T>
class PoolingOp;
template<>
class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
template<class T>
class PoolingOp<DeviceType::CPU, T> : public PoolingOpBase {
public:
explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) {}
......@@ -93,8 +93,8 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
const index_t *input_shape = input_tensor->shape().data();
int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
......@@ -127,14 +127,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
private:
void MaxPooling(const OpContext *context,
const float *input,
const T *input,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
T *output) {
const index_t batch = out_shape[0];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
......@@ -184,14 +184,14 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
}
void AvgPooling(const OpContext *context,
const float *input,
const T *input,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
T *output) {
const index_t batch = out_shape[0];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
......@@ -514,6 +514,8 @@ class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase {
void RegisterPooling(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Pooling", PoolingOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
......
......@@ -148,6 +148,8 @@ class PriorBoxOp : public Operation {
void RegisterPriorBox(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "PriorBox", PriorBoxOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "PriorBox", PriorBoxOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -145,7 +145,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
MACE_UNUSED(context);
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
T tmp = 0;
T tmp = 0.f;
for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i];
}
......@@ -169,7 +169,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
output[0] = tmp;
} else if (type == ReduceType::SUM) {
T tmp = 0;
T tmp = 0.f;
for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i];
}
......@@ -193,7 +193,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (type == ReduceType::MEAN) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i];
}
......@@ -225,7 +225,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
} else if (type == ReduceType::SUM) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i];
}
......@@ -239,7 +239,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
if (type == ReduceType::MEAN) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j];
}
......@@ -271,7 +271,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
} else if (type == ReduceType::SUM) {
for (index_t i = start; i < end; i += step) {
T tmp = 0;
T tmp = 0.f;
for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j];
}
......@@ -335,9 +335,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
T tmp = 1;
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
tmp *=
input[(k * data_reshape_[1] + i) * data_reshape_[2]
+ j];
tmp *= input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
}
}
output[i] = tmp;
......@@ -1036,6 +1034,8 @@ class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase {
void RegisterReduce(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, int);
#ifdef MACE_ENABLE_QUANTIZE
......
......@@ -20,6 +20,7 @@ namespace mace {
namespace ops {
namespace ref {
template<typename T>
class Activation : public delegator::Activation {
public:
explicit Activation(const delegator::ActivationParam &param)
......@@ -34,9 +35,10 @@ class Activation : public delegator::Activation {
Tensor *output);
};
MaceStatus Activation::Compute(const OpContext *context,
const Tensor *input,
Tensor *output) {
template<typename T>
MaceStatus Activation<T>::Compute(const OpContext *context,
const Tensor *input,
Tensor *output) {
Tensor::MappingGuard input_guard(input);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
......@@ -49,12 +51,13 @@ MaceStatus Activation::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
void Activation::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
template<typename T>
void Activation<T>::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_UNUSED(context);
auto input_ptr = input->data<float>();
auto output_ptr = output->mutable_data<float>();
auto input_ptr = input->data<T>();
auto output_ptr = output->mutable_data<T>();
const index_t size = input->size();
switch (type_) {
......@@ -77,7 +80,7 @@ void Activation::DoActivation(const OpContext *context,
case LEAKYRELU: {
for (index_t i = 0; i < size; ++i) {
*output_ptr =
std::max(*input_ptr, 0.f)
std::max<float>(*input_ptr, 0.f)
+ std::min(*input_ptr, 0.f) * leakyrelu_coefficient_;
++input_ptr;
++output_ptr;
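// Note: the explicit std::max<float> forces the comparison to run in float,
// presumably so the call still compiles when *input_ptr is a BFloat16 that
// only converts to float; plain std::max would otherwise deduce conflicting
// argument types.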
......@@ -107,8 +110,14 @@ void Activation::DoActivation(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, CPU, float, REF))
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation<float>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Activation<BFloat16>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16, ImplType::REF));
}
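// Note: delegator registration is now funneled through explicit
// Register*Delegator functions, and MACE_REGISTER_BF16_DELEGATOR is
// presumably compiled out unless MACE_ENABLE_BFLOAT16 is defined, mirroring
// MACE_REGISTER_BF16_OP on the op side.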
} // namespace ref
} // namespace ops
......
......@@ -18,6 +18,7 @@ namespace mace {
namespace ops {
namespace ref {
template<typename T>
class BiasAdd : public delegator::BiasAdd {
public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
......@@ -31,10 +32,11 @@ class BiasAdd : public delegator::BiasAdd {
const Tensor *bias, Tensor *output);
};
MaceStatus BiasAdd::Compute(const OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
template<typename T>
MaceStatus BiasAdd<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
if (input != output) {
......@@ -54,14 +56,15 @@ MaceStatus BiasAdd::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
void BiasAdd::AddBias(const OpContext *context,
const Tensor *input,
const Tensor *bias,
mace::Tensor *output) {
template<typename T>
void BiasAdd<T>::AddBias(const OpContext *context,
const Tensor *input,
const Tensor *bias,
mace::Tensor *output) {
MACE_UNUSED(context);
auto input_data = input->data<float>();
auto bias_data = bias->data<float>();
auto output_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto bias_data = bias->data<T>();
auto output_data = output->mutable_data<T>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
......@@ -84,8 +87,14 @@ void BiasAdd::AddBias(const OpContext *context,
}
}
MACE_REGISTER_DELEGATOR(registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, CPU, float, REF))
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd<float>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, BiasAdd<BFloat16>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
......@@ -12,19 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ref/conv_2d.h"
#include <vector>
#include "mace/ops/delegator/conv_2d.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus Conv2d<float>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
template<typename T>
class Conv2d : public delegator::Conv2d {
public:
explicit Conv2d(const delegator::Conv2dParam &param)
: delegator::Conv2d(param) {}
~Conv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
template<typename T>
MaceStatus Conv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
MACE_UNUSED(context);
const std::vector<index_t> in_shape = input->shape();
......@@ -62,9 +75,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto output_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto output_data = output->mutable_data<T>();
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; ++m) {
......@@ -74,7 +87,7 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const index_t out_width = out_shape[3];
const index_t in_channels = filter_shape[1];
float *out_ptr_base =
T *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size;
for (index_t h = 0; h < out_height; ++h) {
......@@ -82,9 +95,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
float sum = 0;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
const T *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr =
const T *filter_ptr =
filter_data + m * in_channels * filter_size + c * filter_size;
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
......@@ -94,7 +107,9 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const index_t
iw = -pad_left + w * strides_[1] + kw * dilations_[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw];
float input_value = in_ptr_base[ih * in_width + iw];
float filter_value = filter_ptr[kw];
sum += input_value * filter_value;
}
} // kw
filter_ptr += filter_shape[3];
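// Note: inputs and weights are loaded as T but accumulated into a float sum,
// so for T = BFloat16 the inner product keeps full float precision and only
// rounds once when the result is written back to the output tensor.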
......@@ -109,9 +124,14 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef Conv2d<float> Conv2dRef;
MACE_REGISTER_DELEGATOR(registry, Conv2dRef, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, CPU, float, REF, General))
void RegisterConv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2d<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2d<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_CONV_2D_H_
#define MACE_OPS_REF_CONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
template<typename OUTPUT_TYPE>
class Conv2d : public delegator::Conv2d {
public:
explicit Conv2d(const delegator::Conv2dParam &param)
: delegator::Conv2d(param) {}
~Conv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
template<>
class Conv2d<float> : public delegator::Conv2d {
public:
explicit Conv2d(const delegator::Conv2dParam &param)
: delegator::Conv2d(param) {}
~Conv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_CONV_2D_H_
......@@ -16,18 +16,36 @@
#include <memory>
#include <functional>
#include <vector>
#include "mace/ops/ref/deconv_2d.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus Deconv2d<float>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
template<typename T>
class Deconv2d : public delegator::Deconv2d {
public:
explicit Deconv2d(const delegator::Deconv2dParam &param)
: delegator::Deconv2d(param) {}
~Deconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename T>
MaceStatus Deconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
MACE_UNUSED(context);
std::vector<index_t> out_shape;
......@@ -65,15 +83,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor>
padded_out
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out);
}
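// Note: DataTypeToEnum<T>::v() maps the template type to its runtime
// DataType enum value, so the scratch-backed padded tensor carries the
// actual element type instead of the hard-coded DT_FLOAT, and sizeof(T)
// sizes the scratch buffer accordingly (half the bytes for BFloat16).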
......@@ -88,10 +105,10 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto pad_out_data = out_tensor->mutable_data<float>();
auto out_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape();
......@@ -122,7 +139,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
for (index_t b = 0; b < batch; ++b) {
for (index_t oc = 0; oc < out_channels; ++oc) {
float *out_base =
T *out_base =
pad_out_data + (b * out_channels + oc) * out_img_size;
for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) {
......@@ -148,13 +165,13 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < out_channels; ++j) {
for (index_t k = 0; k < out_height; ++k) {
const float *input_base =
const T *input_base =
pad_out_data
+ ((i * out_channels + j) * pad_out_height + (k + pad_top))
* pad_out_width;
float *output_base =
T *output_base =
out_data + ((i * out_channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float));
memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
}
}
}
......@@ -162,10 +179,14 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef Deconv2d<float> Deconv2dRef;
MACE_REGISTER_DELEGATOR(
registry, Deconv2dRef, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, CPU, float, REF, General))
void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2d<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Deconv2d<BFloat16>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_DECONV_2D_H_
#define MACE_OPS_REF_DECONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
template<typename OUTPUT_TYPE>
class Deconv2d : public delegator::Deconv2d {
public:
explicit Deconv2d(const delegator::Deconv2dParam &param)
: delegator::Deconv2d(param) {}
~Deconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<>
class Deconv2d<float> : public delegator::Deconv2d {
public:
explicit Deconv2d(const delegator::Deconv2dParam &param)
: delegator::Deconv2d(param) {}
~Deconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_DECONV_2D_H_
......@@ -12,19 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ref/depthwise_conv_2d.h"
#include <vector>
#include "mace/ops/delegator/depthwise_conv_2d.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
template<typename T>
class DepthwiseConv2d : public delegator::DepthwiseConv2d {
public:
explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
: delegator::DepthwiseConv2d(param) {}
~DepthwiseConv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
template<typename T>
MaceStatus DepthwiseConv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
MACE_UNUSED(context);
const std::vector<index_t> in_shape = input->shape();
......@@ -65,9 +78,9 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto output_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto output_data = output->mutable_data<T>();
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < out_shape[1]; ++m) {
......@@ -80,16 +93,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
float *out_ptr_base =
T *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
float sum = 0;
const float *in_ptr_base =
const T *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr =
const T *filter_ptr =
filter_data + multi_index * in_channels * filter_size
+ c * filter_size;
......@@ -115,10 +128,16 @@ MaceStatus DepthwiseConv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef DepthwiseConv2d<float> DepthwiseConv2dRef;
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dRef, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, CPU, float, REF, General))
void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2d<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, DepthwiseConv2d<BFloat16>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY(DepthwiseConv2d, DeviceType::CPU,
BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
#define MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
template<typename OUTPUT_TYPE>
class DepthwiseConv2d : public delegator::DepthwiseConv2d {
public:
explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
: delegator::DepthwiseConv2d(param) {}
~DepthwiseConv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
template<>
class DepthwiseConv2d<float> : public delegator::DepthwiseConv2d {
public:
explicit DepthwiseConv2d(const delegator::DepthwiseConv2dParam &param)
: delegator::DepthwiseConv2d(param) {}
~DepthwiseConv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_DEPTHWISE_CONV_2D_H_
......@@ -15,18 +15,52 @@
#include <utility>
#include <memory>
#include <functional>
#include "mace/ops/ref/depthwise_deconv_2d.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
template<typename T>
class GroupDeconv2d : public delegator::GroupDeconv2d {
public:
explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
: delegator::GroupDeconv2d(param) {}
virtual ~GroupDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename T>
class DepthwiseDeconv2d : public GroupDeconv2d<T> {
public:
explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
: GroupDeconv2d<T>(param) {}
~DepthwiseDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename T>
MaceStatus DepthwiseDeconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
MACE_UNUSED(context);
std::vector<index_t> out_shape;
......@@ -41,15 +75,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
std::vector<int> out_pad_size;
CalDeconvOutputShapeAndPadSize(input->shape(),
filter->shape(),
strides_,
padding_type_,
paddings_,
GroupDeconv2d<T>::strides_,
GroupDeconv2d<T>::padding_type_,
GroupDeconv2d<T>::paddings_,
input->dim(1),
&out_shape,
nullptr,
&out_pad_size,
&padded_out_shape,
framework_type_,
GroupDeconv2d<T>::framework_type_,
DataFormat::NCHW);
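// Note: members inherited from the dependent base GroupDeconv2d<T>
// (strides_, paddings_, framework_type_, ...) must be qualified, or accessed
// via this->, inside this template: unqualified names are not looked up in
// dependent bases under two-phase name lookup.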
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......@@ -64,15 +98,14 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor>
padded_out
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out);
}
......@@ -87,10 +120,10 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto pad_out_data = out_tensor->mutable_data<float>();
auto out_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape();
......@@ -119,15 +152,15 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
float *out_base =
T *out_base =
pad_out_data + (b * channels + c) * out_img_size;
for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) {
const index_t out_offset =
i * strides_[0] * pad_out_width + j * strides_[1];
const index_t out_offset = i * GroupDeconv2d<T>::strides_[0] *
pad_out_width + j * GroupDeconv2d<T>::strides_[1];
const index_t input_idx =
(b * channels + c) * in_img_size + i * in_width + j;
const float val = input_data[input_idx];
const T val = input_data[input_idx];
const index_t kernel_offset = c * kernel_size;
for (int k = 0; k < kernel_size; ++k) {
const index_t out_idx = out_offset + index_map[k];
......@@ -143,13 +176,13 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
for (index_t i = 0; i < batch; ++i) {
for (index_t j = 0; j < channels; ++j) {
for (index_t k = 0; k < out_height; ++k) {
const float *input_base =
const T *input_base =
pad_out_data
+ ((i * channels + j) * pad_out_height + (k + pad_top))
* pad_out_width;
float *output_base =
T *output_base =
out_data + ((i * channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float));
memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
}
}
}
......@@ -157,11 +190,12 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
template<typename T>
MaceStatus GroupDeconv2d<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
MACE_UNUSED(context);
std::vector<index_t> out_shape;
......@@ -199,15 +233,14 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
std::multiplies<index_t>()) * sizeof(T);
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
index_t scratch_size = PadAlignSize(padded_out_size);
scratch->GrowSize(scratch_size);
std::unique_ptr<Tensor>
padded_out
(make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
std::unique_ptr<Tensor> padded_out(make_unique<Tensor>(
scratch->Scratch(scratch_size), DataTypeToEnum<T>::v()));
padded_out->Reshape(padded_out_shape);
padded_output = std::move(padded_out);
}
......@@ -222,10 +255,10 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto pad_out_data = out_tensor->mutable_data<float>();
auto out_data = output->mutable_data<float>();
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto pad_out_data = out_tensor->mutable_data<T>();
auto out_data = output->mutable_data<T>();
auto &in_shape = input->shape();
......@@ -288,13 +321,13 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < out_channels; ++j) {
for (int k = 0; k < out_height; ++k) {
const float *input_base =
const T *input_base =
pad_out_data
+ ((i * out_channels + j) * pad_out_height + (k + pad_top))
* pad_out_width;
float *output_base =
T *output_base =
out_data + ((i * out_channels + j) * out_height + k) * out_width;
memcpy(output_base, input_base + pad_left, out_width * sizeof(float));
memcpy(output_base, input_base + pad_left, out_width * sizeof(T));
}
}
}
......@@ -302,10 +335,16 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
typedef DepthwiseDeconv2d<float> DepthwiseDeconv2dRef;
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dRef, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, CPU, float, REF, General))
void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2d<float>, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, DepthwiseDeconv2d<BFloat16>, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
#define MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
template<typename OUTPUT_TYPE>
class GroupDeconv2d : public delegator::GroupDeconv2d {
public:
explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
: delegator::GroupDeconv2d(param) {}
virtual ~GroupDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<typename OUTPUT_TYPE>
class DepthwiseDeconv2d : public GroupDeconv2d<OUTPUT_TYPE> {
public:
explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
: GroupDeconv2d<OUTPUT_TYPE>(param) {}
~DepthwiseDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<>
class GroupDeconv2d<float> : public delegator::GroupDeconv2d {
public:
explicit GroupDeconv2d(const delegator::GroupDeconv2dParam &param)
: delegator::GroupDeconv2d(param) {}
virtual ~GroupDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
template<>
class DepthwiseDeconv2d<float> : public GroupDeconv2d<float> {
public:
explicit DepthwiseDeconv2d(const delegator::DepthwiseDeconv2dParam &param)
: GroupDeconv2d(param) {}
~DepthwiseDeconv2d() = default;
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_
......@@ -12,56 +12,93 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ref/gemm.h"
#include "mace/ops/delegator/gemm.h"
namespace mace {
namespace ops {
namespace ref {
MaceStatus Gemm<float>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t rows,
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
template<typename T>
class Gemm : public delegator::Gemm {
public:
explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
~Gemm() {}
MaceStatus Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t rows,
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
// The original matrix before transpose is row-major
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t lhs_rows,
const index_t lhs_cols,
const index_t rhs_rows,
const index_t rhs_cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool transpose_out,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
template<typename T>
MaceStatus Gemm<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t rows,
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
float *output_data = output->mutable_data<float>();
const T *lhs_data = lhs->data<T>();
const T *rhs_data = rhs->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
MatrixMap<const float>
MatrixMap<const T>
lhs_matrix
(lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
lhs_major,
rows,
depth);
MatrixMap<const float>
MatrixMap<const T>
rhs_matrix
(rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
rhs_major,
depth,
cols);
MatrixMap<float>
MatrixMap<T>
output_matrix(output_data + b * rows * cols, output_major, rows, cols);
for (index_t r = 0; r < rows; ++r) {
for (index_t c = 0; c < cols; ++c) {
float sum = 0;
for (index_t d = 0; d < depth; ++d) {
sum += lhs_matrix(r, d) * rhs_matrix(d, c);
sum += static_cast<float>(lhs_matrix(r, d)) *
static_cast<float>(rhs_matrix(d, c));
} // d
*output_matrix.data(r, c) = sum;
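// Note: both operands are widened to float before the multiply-accumulate,
// so a BFloat16 GEMM accumulates in float and rounds only on the final store
// into the output matrix.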
......@@ -72,20 +109,21 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Gemm<float>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t lhs_rows,
const index_t lhs_cols,
const index_t rhs_rows,
const index_t rhs_cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool transpose_out,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
template<typename T>
MaceStatus Gemm<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t lhs_rows,
const index_t lhs_cols,
const index_t rhs_rows,
const index_t rhs_cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool transpose_out,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
......@@ -96,24 +134,29 @@ MaceStatus Gemm<float>::Compute(const OpContext *context,
" vs. ",
depth2);
return Compute(context,
lhs,
rhs,
batch,
rows,
cols,
depth,
transpose_lhs ? ColMajor : RowMajor,
transpose_rhs ? ColMajor : RowMajor,
transpose_out ? ColMajor : RowMajor,
lhs_batched,
rhs_batched,
output);
return Gemm<T>::Compute(context,
lhs,
rhs,
batch,
rows,
cols,
depth,
transpose_lhs ? ColMajor : RowMajor,
transpose_rhs ? ColMajor : RowMajor,
transpose_out ? ColMajor : RowMajor,
lhs_batched,
rhs_batched,
output);
}
typedef Gemm<float> GemmRef;
MACE_REGISTER_DELEGATOR(registry, GemmRef, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, CPU, float, REF))
void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemm<float>, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Gemm<BFloat16>, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::REF));
}
} // namespace ref
} // namespace ops
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_GEMM_H_
#define MACE_OPS_REF_GEMM_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/matrix.h"
#include "mace/ops/delegator/gemm.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
template<typename OUTPUT_TYPE>
class Gemm : public delegator::Gemm {
public:
explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
~Gemm() {}
MaceStatus Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t rows,
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
template<>
class Gemm<float> : public delegator::Gemm {
public:
explicit Gemm(const delegator::GemmParam &param) : delegator::Gemm(param) {}
~Gemm() {}
MaceStatus Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t rows,
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
// The original matrix before transpose is row-major
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const index_t batch,
const index_t lhs_rows,
const index_t lhs_cols,
const index_t rhs_rows,
const index_t rhs_cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool transpose_out,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_GEMM_H_
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "mace/ops/ref/gemv.h"
#include "mace/ops/delegator/gemv.h"
#if defined(MACE_ENABLE_QUANTIZE)
#include "mace/core/quantize.h"
......@@ -23,7 +23,27 @@ namespace mace {
namespace ops {
namespace ref {
MaceStatus Gemv<float>::Compute(const OpContext *context,
template<typename T>
class Gemv : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
template<typename T>
MaceStatus Gemv<T>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
......@@ -39,18 +59,18 @@ MaceStatus Gemv<float>::Compute(const OpContext *context,
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
const float *bias_data = nullptr;
const T *lhs_data = lhs->data<T>();
const T *rhs_data = rhs->data<T>();
const T *bias_data = nullptr;
if (bias) {
bias_data = bias->data<float>();
bias_data = bias->data<T>();
}
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
float sum = bias ? bias_data[h] : 0;
float sum = bias ? static_cast<float>(bias_data[h]) : 0.f;
for (index_t w = 0; w < lhs_width; ++w) {
sum += lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
......@@ -65,110 +85,15 @@ MaceStatus Gemv<float>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
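// Note: the bias arm of the ternary is cast to float so that both branches
// have the same type; with T = BFloat16 an uncast bias_data[h] against the
// float literal 0.f could otherwise be ill-formed or pick a surprising
// common type.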
#if defined(MACE_ENABLE_QUANTIZE)
MaceStatus Gemv<uint8_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
uint8_t *output_data = output->mutable_data<uint8_t>();
MACE_CHECK(output->scale() > 0, "output scale must not be zero");
const float
output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] =
Saturate<uint8_t>(std::roundf(sum * output_multiplier_float));
} // h
} // b
return MaceStatus::MACE_SUCCESS;
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv<float>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::REF));
MACE_REGISTER_BF16_DELEGATOR(
registry, Gemv<BFloat16>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, BFloat16, ImplType::REF));
}
MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
int32_t *output_data = output->mutable_data<int32_t>();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] = sum;
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
typedef Gemv<uint8_t> GemvUint8Ref;
MACE_REGISTER_DELEGATOR(registry, GemvUint8Ref, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, uint8_t, REF))
#endif // MACE_ENABLE_QUANTIZE
typedef Gemv<float> GemvRef;
MACE_REGISTER_DELEGATOR(registry, GemvRef, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, CPU, float, REF))
} // namespace ref
} // namespace ops
} // namespace mace
......@@ -107,8 +107,11 @@ MaceStatus Eltwise::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
MACE_REGISTER_DELEGATOR(registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, CPU, uint8_t, REF))
void RegisterEltwiseDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Eltwise, delegator::EltwiseParam,
MACE_DELEGATOR_KEY(Eltwise, DeviceType::CPU, uint8_t, ImplType::REF));
}
} // namespace q8
} // namespace ref
......
......@@ -12,58 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REF_GEMV_H_
#define MACE_OPS_REF_GEMV_H_
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/quantize.h"
#include "mace/ops/delegator/gemv.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace ref {
namespace q8 {
template<typename OUTPUT_TYPE>
template<typename T>
class Gemv : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
template<>
class Gemv<float> : public delegator::Gemv {
public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
~Gemv() {}
// Always row-major after transpose
MaceStatus Compute(
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) override;
};
#if defined(MACE_ENABLE_QUANTIZE)
template<>
class Gemv<uint8_t> : public delegator::Gemv {
public:
......@@ -101,11 +76,111 @@ class Gemv<int32_t> : public delegator::Gemv {
const bool rhs_batched,
Tensor *output) override;
};
#endif // MACE_ENABLE_QUANTIZE
MaceStatus Gemv<uint8_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
uint8_t *output_data = output->mutable_data<uint8_t>();
MACE_CHECK(output->scale() > 0, "output scale must not be zero");
const float
output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] =
Saturate<uint8_t>(std::roundf(sum * output_multiplier_float));
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
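// Note (quantized path): the int32 accumulator of zero-point-corrected
// products is rescaled by lhs_scale * rhs_scale / output_scale and saturated
// into the uint8 range, which is the standard requantization step for an
// 8-bit GEMV.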
MaceStatus Gemv<int32_t>::Compute(const OpContext *context,
const Tensor *lhs,
const Tensor *rhs,
const Tensor *bias,
const index_t batch,
const index_t lhs_height,
const index_t lhs_width,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_UNUSED(context);
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
const uint8_t *lhs_data = lhs->data<uint8_t>();
const uint8_t *rhs_data = rhs->data<uint8_t>();
const int32_t *bias_data = nullptr;
if (bias) {
bias_data = bias->data<int32_t>();
}
int32_t *output_data = output->mutable_data<int32_t>();
int32_t lhs_zero = lhs->zero_point();
int32_t rhs_zero = rhs->zero_point();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < lhs_height; ++h) {
int32_t sum = bias ? bias_data[h] : 0;
for (index_t w = 0; w < lhs_width; ++w) {
sum += (lhs_data[
static_cast<index_t>(lhs_batched) * b * lhs_height * lhs_width
+ h * lhs_width + w] - lhs_zero)
* (rhs_data[static_cast<index_t>(rhs_batched) * b * lhs_width + w]
- rhs_zero);
} // w
output_data[b * lhs_height + h] = sum;
} // h
} // b
return MaceStatus::MACE_SUCCESS;
}
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv<uint8_t>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::REF));
MACE_REGISTER_DELEGATOR(
registry, Gemv<int32_t>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::REF));
}
} // namespace q8
} // namespace ref
} // namespace ops
} // namespace mace
#endif // MACE_OPS_REF_GEMV_H_
......@@ -20,19 +20,18 @@ namespace ops {
namespace ref {
extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dRefDelegator(
OpDelegatorRegistry *registry);
extern void RegisterGemmRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvRefDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_QUANTIZE
namespace q8 {
extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
} // namespace q8
extern void RegisterGemvUint8RefDelegator(OpDelegatorRegistry *registry);
#endif // MACE_ENABLE_QUANTIZE
} // namespace ref
......@@ -43,43 +42,26 @@ extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x7S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x1S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x15S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK15x1S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7S3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
extern void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dK3x3S1Delegator(
extern void RegisterDepthwiseConv2dK3x3Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseConv2dK3x3S2Delegator(
extern void RegisterDepthwiseDeconv2dK3x3Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK3x3S1Delegator(
extern void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK3x3S2Delegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK3x3S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK3x3S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4S1Delegator(
OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dK4x4S2Delegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4S1Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4S2Delegator(OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
extern void RegisterDepthwiseDeconv2dGeneralDelegator(
OpDelegatorRegistry *registry);
extern void RegisterGroupDeconv2dGeneralDelegator(
......@@ -92,8 +74,7 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_QUANTIZE
namespace q8 {
extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvUint8Delegator(OpDelegatorRegistry *registry);
extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
} // namespace q8
#endif // MACE_ENABLE_QUANTIZE
......@@ -103,16 +84,16 @@ extern void RegisterGemvInt32Delegator(OpDelegatorRegistry *registry);
void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
ref::RegisterActivationDelegator(registry);
ref::RegisterBiasAddDelegator(registry);
ref::RegisterConv2dRefDelegator(registry);
ref::RegisterDeconv2dRefDelegator(registry);
ref::RegisterDepthwiseConv2dRefDelegator(registry);
ref::RegisterDepthwiseDeconv2dRefDelegator(registry);
ref::RegisterGemmRefDelegator(registry);
ref::RegisterGemvRefDelegator(registry);
ref::RegisterConv2dDelegator(registry);
ref::RegisterDeconv2dDelegator(registry);
ref::RegisterDepthwiseConv2dDelegator(registry);
ref::RegisterDepthwiseDeconv2dDelegator(registry);
ref::RegisterGemmDelegator(registry);
ref::RegisterGemvDelegator(registry);
#ifdef MACE_ENABLE_QUANTIZE
ref::q8::RegisterEltwiseDelegator(registry);
ref::RegisterGemvUint8RefDelegator(registry);
ref::q8::RegisterGemvDelegator(registry);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_NEON
......@@ -120,37 +101,23 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
arm::fp32::RegisterBiasAddDelegator(registry);
arm::fp32::RegisterConv2dK1x1Delegator(registry);
arm::fp32::RegisterConv2dK1x7S1Delegator(registry);
arm::fp32::RegisterConv2dK7x1S1Delegator(registry);
arm::fp32::RegisterConv2dK1x15S1Delegator(registry);
arm::fp32::RegisterConv2dK15x1S1Delegator(registry);
arm::fp32::RegisterConv2dK3x3S1Delegator(registry);
arm::fp32::RegisterConv2dK3x3S2Delegator(registry);
arm::fp32::RegisterConv2dK1xNDelegator(registry);
arm::fp32::RegisterConv2dK3x3Delegator(registry);
arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
arm::fp32::RegisterConv2dK5x5S1Delegator(registry);
arm::fp32::RegisterConv2dK7x7S1Delegator(registry);
arm::fp32::RegisterConv2dK7x7S2Delegator(registry);
arm::fp32::RegisterConv2dK7x7S3Delegator(registry);
arm::fp32::RegisterConv2dK5x5Delegator(registry);
arm::fp32::RegisterConv2dK7x7Delegator(registry);
arm::fp32::RegisterConv2dGeneralDelegator(registry);
arm::fp32::RegisterDeconv2dK2x2S1Delegator(registry);
arm::fp32::RegisterDeconv2dK2x2S2Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3S1Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3S1Delegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3S1Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3S1Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3S2Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4S1Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4S2Delegator(registry);
arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
......@@ -159,8 +126,7 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
#ifdef MACE_ENABLE_QUANTIZE
arm::q8::RegisterEltwiseDelegator(registry);
arm::q8::RegisterGemvUint8Delegator(registry);
arm::q8::RegisterGemvInt32Delegator(registry);
arm::q8::RegisterGemvDelegator(registry);
#endif // MACE_ENABLE_QUANTIZE
#endif // MACE_ENABLE_NEON
......
......@@ -98,6 +98,8 @@ class ReplaceIndexOp<DeviceType::CPU, T> : public Operation {
void RegisterReplaceIndex(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
DeviceType::CPU, float);
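// MACE_REGISTER_BF16_OP presumably compiles to a BFloat16 registration only
// when MACE_ENABLE_BFLOAT16 is defined, so default builds are unaffected.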
MACE_REGISTER_BF16_OP(op_registry, "ReplaceIndex", ReplaceIndexOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -152,6 +152,7 @@ class ReshapeOp<GPU, float> : public Operation {
void RegisterReshape(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, DeviceType::CPU, int32_t);
MACE_REGISTER_GPU_OP(op_registry, "Reshape", ReshapeOp);
MACE_REGISTER_OP_CONDITION(
......
......@@ -57,15 +57,7 @@ inline T ComputeLerp(const T top_left,
const T bottom_left,
const T bottom_right,
const float x_lerp,
const float y_lerp);
template<>
inline float ComputeLerp<float>(const float top_left,
const float top_right,
const float bottom_left,
const float bottom_right,
const float x_lerp,
const float y_lerp) {
const float y_lerp) {
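// Standard bilinear interpolation: blend along x first, then along y;
// intermediates stay in float even when T is a 16-bit type.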
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return top + (bottom - top) * y_lerp;
......@@ -370,6 +362,8 @@ class ResizeBilinearOp<DeviceType::GPU, float> : public Operation {
void RegisterResizeBilinear(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
......
......@@ -176,6 +176,8 @@ class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
void RegisterResizeNearestNeighbor(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp);
......
......@@ -76,6 +76,8 @@ class ReverseOp<DeviceType::CPU, T> : public Operation {
void RegisterReverse(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Reverse", ReverseOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -158,6 +158,8 @@ class ScalarMathOp : public Operation {
void RegisterScalarMath(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU, int32_t);
}
......
......@@ -22,8 +22,8 @@ namespace ops {
template<DeviceType D, typename T>
class SelectOp;
template<>
class SelectOp<DeviceType::CPU, float> : public Operation {
template<class T>
class SelectOp<DeviceType::CPU, T> : public Operation {
public:
explicit SelectOp(OpConstructContext *context)
: Operation(context) {}
......@@ -41,7 +41,7 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
Tensor *output = this->Output(OUTPUT);
const index_t condition_rank = condition->dim_size();
MACE_RETURN_IF_ERROR(output->Resize({condition->size(), condition_rank}));
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
const bool *condition_data = condition->data<bool>();
index_t i = 0;
......@@ -161,10 +161,10 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->Resize(x->shape()));
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
const bool *condition_data = condition->data<bool>();
const float *x_data = x->data<float>();
const float *y_data = y->data<float>();
const T *x_data = x->data<T>();
const T *y_data = y->data<T>();
const index_t condition_size = condition->size();
const index_t x_size = x->size();
......@@ -182,7 +182,7 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
MACE_ASSERT(
block_size > 1 && x_size % condition_size == 0,
"x_size should be a multiple of condition_size and greater than 1");
const auto raw_block_size = block_size * sizeof(float);
const auto raw_block_size = block_size * sizeof(T);
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
for (index_t k = start; k < end; k += step) {
auto offset = block_size * k;
......@@ -208,6 +208,8 @@ class SelectOp<DeviceType::CPU, float> : public Operation {
void RegisterSelect(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Select", SelectOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Select", SelectOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -62,6 +62,8 @@ class ShapeOp : public Operation {
void RegisterShape(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Shape", ShapeOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -87,6 +87,8 @@ class SliceOp<DeviceType::CPU, T> : public Operation {
void RegisterSlice(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Slice", SliceOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Slice", SliceOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -39,8 +39,8 @@ namespace ops {
template<DeviceType D, typename T>
class SoftmaxOp;
template<>
class SoftmaxOp<DeviceType::CPU, float> : public Operation {
template<class T>
class SoftmaxOp<DeviceType::CPU, T> : public Operation {
public:
explicit SoftmaxOp(OpConstructContext *context)
: Operation(context),
......@@ -71,9 +71,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
protected:
MaceStatus RunForNCHW(OpContext *context) {
const Tensor *input = this->Input(INPUT);
const float *input_data = input->data<float>();
const T *input_data = input->data<T>();
Tensor *output = this->Output(OUTPUT);
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
MACE_CHECK(input->dim_size() == 4, "The dim size of NCHW should be 4.");
index_t hw_stride = input->dim(3);
......@@ -93,8 +93,8 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t b_offset = 0;
b_offset < batch_size; b_offset += batch_stride) {
const float *input_b_base = input_data + b_offset;
float *output_b_base = output_data + b_offset;
const T *input_b_base = input_data + b_offset;
T *output_b_base = output_data + b_offset;
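// The per-position max and running sum live in the float cache buffer
// (cache_ptr), so reductions keep full precision even when T is bfloat16.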
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
const auto raw_step_size = step * sizeof(float);
for (index_t k = start; k < end; k += step) {
......@@ -106,9 +106,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) {
const float *input_c_base = input_b_base + c_offset;
const T *input_c_base = input_b_base + c_offset;
for (index_t k = start; k < end; k += step) {
const float *input_ptr = input_c_base + k;
const T *input_ptr = input_c_base + k;
float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) {
cache_k_ptr[i] = std::max(cache_k_ptr[i], input_ptr[i]);
......@@ -118,14 +118,14 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) {
const float *input_c_base = input_b_base + c_offset;
float *output_c_base = output_b_base + c_offset;
const T *input_c_base = input_b_base + c_offset;
T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) {
const float *input_ptr = input_c_base + k;
float *output_ptr = output_c_base + k;
const T *input_ptr = input_c_base + k;
T *output_ptr = output_c_base + k;
float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) {
output_ptr[i] = ::exp(input_ptr[i] - cache_k_ptr[i]);
output_ptr[i] = std::exp(input_ptr[i] - cache_k_ptr[i]);
}
}
}
......@@ -136,24 +136,24 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) {
float *output_c_base = output_b_base + c_offset;
T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) {
float *output_ptr = output_c_base + k;
T *output_ptr = output_c_base + k;
float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) {
cache_k_ptr[i] += output_ptr[i];
cache_k_ptr[i] += static_cast<float>(output_ptr[i]);
}
}
}
for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) {
float *output_c_base = output_b_base + c_offset;
T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) {
float *output_ptr = output_c_base + k;
T *output_ptr = output_c_base + k;
float *cache_k_ptr = cache_ptr + k;
for (index_t i = 0; i < step; ++i) {
output_ptr[i] = output_ptr[i] / cache_k_ptr[i];
output_ptr[i] /= cache_k_ptr[i];
}
}
}
......@@ -161,9 +161,9 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
if (use_log_) {
for (index_t c_offset = 0; c_offset < class_size;
c_offset += class_stride) {
float *output_c_base = output_b_base + c_offset;
T *output_c_base = output_b_base + c_offset;
for (index_t k = start; k < end; k += step) {
float *output_ptr = output_c_base + k;
T *output_ptr = output_c_base + k;
for (index_t i = 0; i < step; ++i) {
output_ptr[i] = std::log(output_ptr[i]);
}
......@@ -179,7 +179,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
MaceStatus RunForNHWC(OpContext *context) {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
float *output_data = output->mutable_data<float>();
T *output_data = output->mutable_data<T>();
MACE_CHECK(input->dim_size() >= 2, "The input dim size should be at least 2.");
index_t class_size = input->dim(input->dim_size() - 1);
......@@ -196,16 +196,16 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
const float *input_data = input->data<float>();
const T *input_data = input->data<T>();
float std_lowest = std::numeric_limits<float>::lowest();
for (index_t b_offset = 0; b_offset < batch_size;
b_offset += batch_stride) {
const float *input_b_ptr = input_data + b_offset;
float *output_b_ptr = output_data + b_offset;
const T *input_b_ptr = input_data + b_offset;
T *output_b_ptr = output_data + b_offset;
thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
for (index_t k = start; k < end; k += step) {
const float *input_ptr = input_b_ptr + k;
float *output_ptr = output_b_ptr + k;
const T *input_ptr = input_b_ptr + k;
T *output_ptr = output_b_ptr + k;
float max_val = std_lowest;
for (index_t c = 0; c < class_size; ++c) {
......@@ -214,15 +214,15 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
float sum = 0;
for (index_t c = 0; c < class_size; ++c) {
float exp_value = ::exp(input_ptr[c] - max_val);
float exp_value = std::exp(input_ptr[c] - max_val);
sum += exp_value;
output_ptr[c] = exp_value;
}
if (use_log_) {
for (index_t c = 0; c < class_size; ++c) {
output_ptr[c] /= sum;
output_ptr[c] = std::log(output_ptr[c]);
float output = static_cast<float>(output_ptr[c]) / sum;
output_ptr[c] = std::log(output);
}
} else {
for (index_t c = 0; c < class_size; ++c) {
......@@ -306,8 +306,8 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
float sum = 0;
std::vector<float> depth_cache(depth);
for (index_t d = 0; d < depth; ++d) {
float exp_value = ::exp((static_cast<int>(input_ptr[d]) - max_value)
* input_scale);
float exp_value = std::exp(
(static_cast<int>(input_ptr[d]) - max_value) * input_scale);
sum += exp_value;
depth_cache[d] = exp_value;
}
......@@ -524,6 +524,8 @@ class SoftmaxOp<DeviceType::GPU, float> : public Operation {
void RegisterSoftmax(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
......
......@@ -90,8 +90,8 @@ class SpaceToBatchOpBase : public Operation {
template<DeviceType D, class T>
class SpaceToBatchNDOp;
template<>
class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
template<class T>
class SpaceToBatchNDOp<DeviceType::CPU, T> : public SpaceToBatchOpBase {
public:
explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) {}
......@@ -115,8 +115,8 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
const float *input_data = space_tensor->data<float>();
float *output_data = batch_tensor->mutable_data<float>();
const T *input_data = space_tensor->data<T>();
T *output_data = batch_tensor->mutable_data<T>();
index_t in_batches = space_tensor->dim(0);
index_t in_height = space_tensor->dim(2);
......@@ -158,20 +158,20 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
const T *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
T *output_base =
output_data + (b * channels + c) * out_height * out_width;
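// memset sizes below are in bytes, so the padding regions scale with sizeof(T).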
memset(output_base + block_h * out_width,
0,
(valid_h_start - block_h) * out_width * sizeof(float));
(valid_h_start - block_h) * out_width * sizeof(T));
index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t h = valid_h_start; h < valid_h_end; ++h) {
memset(output_base + h * out_width,
0,
valid_w_start * sizeof(float));
valid_w_start * sizeof(T));
index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
......@@ -183,13 +183,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
memset(output_base + h * out_width + valid_w_end,
0,
(out_width - valid_w_end) * sizeof(float));
(out_width - valid_w_end) * sizeof(T));
} // h
memset(output_base + valid_h_end * out_width,
0,
(std::min(out_height, block_h + block_h_size) - valid_h_end)
* out_width * sizeof(float));
* out_width * sizeof(T));
} // b
} // block_h
} // c
......@@ -332,6 +332,8 @@ class SpaceToBatchNDOp<DeviceType::GPU, float> : public SpaceToBatchOpBase {
void RegisterSpaceToBatchND(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
......
......@@ -28,8 +28,8 @@ namespace ops {
template<DeviceType D, class T>
class SpaceToDepthOp;
template<>
class SpaceToDepthOp<CPU, float> : public Operation {
template<class T>
class SpaceToDepthOp<CPU, T> : public Operation {
public:
explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context),
......@@ -59,8 +59,8 @@ class SpaceToDepthOp<CPU, float> : public Operation {
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) {
......@@ -184,6 +184,8 @@ class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
void RegisterSpaceToDepth(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
......
......@@ -157,6 +157,8 @@ class SpliceOp<DeviceType::CPU, T> : public Operation {
void RegisterSplice(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Splice", SpliceOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Splice", SpliceOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -130,8 +130,8 @@ class SplitOp<DeviceType::GPU, float> : public Operation {
#endif // MACE_ENABLE_OPENCL
void RegisterSplit(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Split", SplitOp, DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp);
......
......@@ -67,9 +67,10 @@ class SqrDiffMeanOp : public Operation {
const index_t img_size = input0->dim(2) * input0->dim(3);
const index_t bc = input0->dim(0) * input0->dim(1);
// TODO(luxuhui): cache the output_ptr[i]
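// diff is computed in float so the squared term keeps full precision
// when T is a 16-bit type.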
for (int i = 0; i < bc; ++i) {
for (int j = 0; j < img_size; ++j) {
T diff = input_ptr0[i * img_size + j] - input_ptr1[i];
float diff = input_ptr0[i * img_size + j] - input_ptr1[i];
output_ptr[i] += diff * diff;
}
output_ptr[i] /= img_size;
......@@ -104,6 +105,8 @@ class SqrDiffMeanOp<DeviceType::GPU, float> : public Operation {
void RegisterSqrDiffMean(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU);
MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp);
}
......
......@@ -80,6 +80,7 @@ class SqueezeOp : public SqueezeOpRaw {
void RegisterSqueeze(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
......
......@@ -80,6 +80,7 @@ class StackOp : public Operation {
void RegisterStack(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Stack", StackOp, DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t);
}
......
......@@ -354,6 +354,8 @@ class StridedSliceOp : public Operation {
void RegisterStridedSlice(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU, int32_t);
}
......
......@@ -104,6 +104,8 @@ class SubsampleOp<DeviceType::CPU, T> : public Operation {
void RegisterSubsample(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Subsample", SubsampleOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Subsample", SubsampleOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -105,6 +105,8 @@ class SumGroupOp<DeviceType::CPU, T> : public Operation {
void RegisterSumGroup(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "SumGroup", SumGroupOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -152,6 +152,8 @@ class TargetRMSNormOp<DeviceType::CPU, T> : public Operation {
void RegisterTargetRMSNorm(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -113,6 +113,7 @@ class TileOp : public Operation {
void RegisterTile(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Tile", TileOp, DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Tile", TileOp, DeviceType::CPU);
MACE_REGISTER_OP_CONDITION(
op_registry, OpConditionBuilder("Tile").SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
......
......@@ -27,11 +27,8 @@
namespace mace {
namespace ops {
template<DeviceType D, typename T>
class TransposeOp;
template<DeviceType D>
class TransposeOp<D, float> : public Operation {
template<DeviceType D, class T>
class TransposeOp : public Operation {
public:
explicit TransposeOp(OpConstructContext *context)
: Operation(context),
......@@ -54,8 +51,8 @@ class TransposeOp<D, float> : public Operation {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
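// The shared Transpose utility is templated on the element type, so one
// operator definition now serves both float and bfloat16.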
return Transpose(&context->device()->cpu_runtime()->thread_pool(),
input_data, input->shape(), dims_, output_data);
......@@ -68,6 +65,8 @@ class TransposeOp<D, float> : public Operation {
void RegisterTranspose(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Transpose", TransposeOp,
DeviceType::CPU);
}
} // namespace ops
......
......@@ -63,6 +63,8 @@ class UnsqueezeOp : public Operation {
void RegisterUnsqueeze(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Unsqueeze", UnsqueezeOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Unsqueeze", UnsqueezeOp,
DeviceType::CPU, int32_t);
}
......
......@@ -77,6 +77,8 @@ class UnstackOp : public Operation {
void RegisterUnstack(OpRegistry *op_registry) {
MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp,
DeviceType::CPU, float);
MACE_REGISTER_BF16_OP(op_registry, "Unstack", UnstackOp,
DeviceType::CPU);
MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp,
DeviceType::CPU, int32_t);
}
......
......@@ -17,8 +17,8 @@
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/ref/gemm.h"
#include "mace/ops/delegator/gemm.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/testing/test_utils.h"
namespace mace {
......@@ -50,41 +50,48 @@ void TestGemmFloat32(const index_t batch,
GenerateRandomRealTypeData<float>(rhs.shape(), rhs_data);
GenerateRandomRealTypeData<float>(output.shape(), output_data);
}
::mace::ops::arm::fp32::Gemm gemm((delegator::GemmParam()));
utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device);
gemm.Compute(&context,
&lhs,
&rhs,
batch,
rows,
cols,
depth,
lhs_major,
rhs_major,
output_major,
lhs_batched,
rhs_batched,
&output);
OpsTestNet net;
OpContext context(net.ws(), &cpu_device);
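// Resolve the NEON implementation through the delegator registry instead of
// instantiating arm::fp32::Gemm directly.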
std::unique_ptr<delegator::Gemm> gemm = delegator::Gemm::Create(
context.workspace(),
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON),
delegator::GemmParam());
gemm->Compute(&context,
&lhs,
&rhs,
batch,
rows,
cols,
depth,
lhs_major,
rhs_major,
output_major,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT);
expected_output.Resize({batch, rows, cols});
::mace::ops::ref::Gemm<float> gemm_ref((delegator::GemmParam()));
gemm_ref.Compute(nullptr,
&lhs,
&rhs,
batch,
rows,
cols,
depth,
lhs_major,
rhs_major,
output_major,
lhs_batched,
rhs_batched,
&expected_output);
std::unique_ptr<delegator::Gemm> gemm_ref = delegator::Gemm::Create(
context.workspace(), MACE_DELEGATOR_KEY(
Gemm, DeviceType::CPU, float, ImplType::REF), delegator::GemmParam());
gemm_ref->Compute(&context,
&lhs,
&rhs,
batch,
rows,
cols,
depth,
lhs_major,
rhs_major,
output_major,
lhs_batched,
rhs_batched,
&expected_output);
ExpectTensorNear<float>(expected_output, output);
}
......
......@@ -17,8 +17,8 @@
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/gemv.h"
#include "mace/ops/ref/gemv.h"
#include "mace/ops/delegator/gemv.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/testing/test_utils.h"
namespace mace {
......@@ -52,34 +52,38 @@ void TestGemvFloat32(const index_t batch,
utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device);
::mace::ops::arm::fp32::Gemv gemv =
::mace::ops::arm::fp32::Gemv(DelegatorParam());
gemv.Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&output);
OpsTestNet net;
OpContext context(net.ws(), &cpu_device);
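// As in the Gemm test, both the NEON and REF Gemv delegators are created
// through the registry and their outputs compared below.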
std::unique_ptr<delegator::Gemv> gemv = delegator::Gemv::Create(
context.workspace(),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON),
DelegatorParam());
gemv->Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_FLOAT);
expected_output.Resize({batch, height});
::mace::ops::ref::Gemv<float> gemv_ref =
::mace::ops::ref::Gemv<float>(DelegatorParam());
gemv_ref.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&expected_output);
std::unique_ptr<delegator::Gemv> gemv_ref = delegator::Gemv::Create(
context.workspace(), MACE_DELEGATOR_KEY(
Gemv, DeviceType::CPU, float, ImplType::REF), DelegatorParam());
gemv_ref->Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&expected_output);
ExpectTensorNear<float>(expected_output, output);
}
......
......@@ -17,8 +17,8 @@
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/q8/gemv.h"
#include "mace/ops/ref/gemv.h"
#include "mace/ops/delegator/gemv.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/testing/test_utils.h"
namespace mace {
......@@ -57,34 +57,38 @@ void TestGemvInt32(const index_t batch,
utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device);
mace::ops::arm::q8::Gemv<int32_t> gemv =
mace::ops::arm::q8::Gemv<int32_t>(DelegatorParam());
gemv.Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&output);
OpsTestNet net;
OpContext context(net.ws(), &cpu_device);
std::unique_ptr<delegator::Gemv> gemv = delegator::Gemv::Create(
context.workspace(),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, int32_t, ImplType::NEON),
DelegatorParam());
gemv->Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32);
expected_output.Resize({batch, height});
mace::ops::ref::Gemv<int32_t> gemv_ref =
mace::ops::ref::Gemv<int32_t>(DelegatorParam());
gemv_ref.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&expected_output);
std::unique_ptr<delegator::Gemv> gemv_ref = delegator::Gemv::Create(
context.workspace(), MACE_DELEGATOR_KEY(
Gemv, DeviceType::CPU, int32_t, ImplType::REF), DelegatorParam());
gemv_ref->Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output);
......@@ -131,36 +135,40 @@ void TestGemvUint8(const index_t batch,
utils::ThreadPool thread_pool(1, AFFINITY_NONE);
thread_pool.Init();
CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool);
OpContext context(nullptr, &cpu_device);
mace::ops::arm::q8::Gemv<uint8_t> gemv =
mace::ops::arm::q8::Gemv<uint8_t>(DelegatorParam());
gemv.Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&output);
OpsTestNet net;
OpContext context(net.ws(), &cpu_device);
std::unique_ptr<delegator::Gemv> gemv = delegator::Gemv::Create(
context.workspace(),
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, uint8_t, ImplType::NEON),
DelegatorParam());
gemv->Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&output);
Tensor expected_output(GetCPUAllocator(), DataType::DT_INT32);
expected_output.SetScale(0.6);
expected_output.SetZeroPoint(57);
expected_output.Resize({batch, height});
mace::ops::ref::Gemv<uint8_t> gemv_ref =
mace::ops::ref::Gemv<uint8_t>(DelegatorParam());
gemv_ref.Compute(nullptr,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&expected_output);
std::unique_ptr<delegator::Gemv> gemv_ref = delegator::Gemv::Create(
context.workspace(), MACE_DELEGATOR_KEY(
Gemv, DeviceType::CPU, uint8_t, ImplType::REF), DelegatorParam());
gemv_ref->Compute(&context,
&lhs,
&rhs,
&bias,
batch,
height,
width,
lhs_batched,
rhs_batched,
&expected_output);
Tensor::MappingGuard output_guard(&output);
Tensor::MappingGuard expected_guard(&expected_output);
......
......@@ -16,7 +16,6 @@
#include "mace/ops/delegator/gemm.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/ref/gemm.h"
namespace mace {
namespace ops {
......@@ -112,7 +111,9 @@ void Complex(const std::vector<index_t> &batch,
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
ref::Gemm<float> gemm = ref::Gemm<float>(delegator::GemmParam());
std::unique_ptr<delegator::Gemm> gemm = delegator::Gemm::Create(
net.ws(), MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::REF),
delegator::GemmParam());
Tensor expected_output_tensor;
std::vector<index_t> expected_output_shape({rows, cols});
expected_output_shape.insert(expected_output_shape.begin(),
......@@ -121,20 +122,20 @@ void Complex(const std::vector<index_t> &batch,
expected_output_tensor.Resize(expected_output_shape);
index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1,
std::multiplies<index_t>());
gemm.Compute(nullptr,
net.GetTensor("A"),
net.GetTensor("B"),
batch_count,
lhs_rows,
lhs_cols,
rhs_rows,
rhs_cols,
transpose_lhs,
transpose_rhs,
false,
lhs_batched,
rhs_batched,
&expected_output_tensor);
gemm->Compute(nullptr,
net.GetTensor("A"),
net.GetTensor("B"),
batch_count,
lhs_rows,
lhs_cols,
rhs_rows,
rhs_cols,
transpose_lhs,
transpose_rhs,
false,
lhs_batched,
rhs_batched,
&expected_output_tensor);
ExpectTensorNear<float>(expected_output_tensor, *net.GetTensor("Output"),
1e-4, 1e-2);
......
......@@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -24,6 +24,7 @@ cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -31,13 +31,14 @@ cmake -DANDROID_ABI="arm64-v8a" \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
-DANDROID_NATIVE_API_LEVEL=21 \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_STL=c++_shared \
-DANDROID_STL=c++_shared \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
-DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
-DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -40,6 +40,7 @@ cmake -DANDROID_ABI="armeabi-v7a" \
-DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
-DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
-DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -18,6 +18,7 @@ mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DMACE_ENABLE_NEON=OFF \
-DMACE_ENABLE_QUANTIZE=OFF \
-DMACE_ENABLE_OPENCL=OFF \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_TESTS=ON \
-DMACE_ENABLE_BENCHMARKS=ON \
-DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
......
......@@ -171,7 +171,16 @@ def parse_device_type(runtime):
return device_type
def get_hexagon_mode(configs):
def bfloat16_enabled(configs):
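# True when any model in the deployment file requests bf16_fp32, so the
# build helpers can pass the bfloat16 define/flag to the toolchain.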
for model_name in configs[YAMLKeyword.models]:
model_config = configs[YAMLKeyword.models][model_name]
dtype = model_config.get(YAMLKeyword.data_type, FPDataType.fp16_fp32)
if dtype == FPDataType.bf16_fp32:
return True
return False
def hexagon_enabled(configs):
runtime_list = []
for model_name in configs[YAMLKeyword.models]:
model_runtime = \
......@@ -184,7 +193,7 @@ def get_hexagon_mode(configs):
return False
def get_hta_mode(configs):
def hta_enabled(configs):
runtime_list = []
for model_name in configs[YAMLKeyword.models]:
model_runtime = \
......@@ -197,7 +206,7 @@ def get_hta_mode(configs):
return False
def get_apu_mode(configs):
def apu_enabled(configs):
runtime_list = []
for model_name in configs[YAMLKeyword.models]:
model_runtime = \
......@@ -210,7 +219,7 @@ def get_apu_mode(configs):
return False
def get_opencl_mode(configs):
def opencl_enabled(configs):
runtime_list = []
for model_name in configs[YAMLKeyword.models]:
model_runtime = \
......@@ -224,7 +233,7 @@ def get_opencl_mode(configs):
return False
def get_quantize_mode(configs):
def quantize_enabled(configs):
for model_name in configs[YAMLKeyword.models]:
quantize = \
configs[YAMLKeyword.models][model_name].get(
......@@ -739,11 +748,12 @@ def build_model_lib(configs, address_sanitizer, debug_mode):
MODEL_LIB_TARGET,
abi=target_abi,
toolchain=toolchain,
enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
enable_apu=get_apu_mode(configs),
enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs),
enable_hexagon=hexagon_enabled(configs),
enable_hta=hta_enabled(configs),
enable_apu=apu_enabled(configs),
enable_opencl=opencl_enabled(configs),
enable_quantize=quantize_enabled(configs),
enable_bfloat16=bfloat16_enabled(configs),
address_sanitizer=address_sanitizer,
symbol_hidden=get_symbol_hidden_mode(debug_mode),
debug_mode=debug_mode
......@@ -900,12 +910,13 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
mace_run_target,
abi=target_abi,
toolchain=toolchain,
enable_hexagon=get_hexagon_mode(configs),
enable_hta=get_hta_mode(configs),
enable_apu=get_apu_mode(configs),
enable_hexagon=hexagon_enabled(configs),
enable_hta=hta_enabled(configs),
enable_apu=apu_enabled(configs),
enable_openmp=enable_openmp,
enable_opencl=get_opencl_mode(configs),
enable_quantize=get_quantize_mode(configs),
enable_opencl=opencl_enabled(configs),
enable_quantize=quantize_enabled(configs),
enable_bfloat16=bfloat16_enabled(configs),
address_sanitizer=address_sanitizer,
symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type),
debug_mode=debug_mode,
......
......@@ -74,6 +74,7 @@ class ScratchComputer:
data_type == mace_pb2.DT_INT32:
return 4
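# 16-bit float types (half, bfloat16, float16) take two bytes of scratch
# per element.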
elif data_type == mace_pb2.DT_HALF or \
data_type == mace_pb2.DT_BFLOAT16 or \
data_type == mace_pb2.DT_FLOAT16:
return 2
elif data_type == mace_pb2.DT_UINT8:
......
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -79,7 +78,7 @@ def merge_params(net_def, data_type):
for tensor in net_def.tensors:
if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF \
or tensor.data_type == mace_pb2.DT_FLOAT16\
or tensor.data_type == mace_pb2.DT_FLOAT16 \
or tensor.data_type == mace_pb2.DT_BFLOAT16:
del tensor.float_data[:]
elif tensor.data_type == mace_pb2.DT_INT32:
......
......@@ -270,6 +270,7 @@ def bazel_build(target,
enable_neon=True,
enable_opencl=True,
enable_quantize=True,
enable_bfloat16=False,
enable_rpcmem=True,
address_sanitizer=False,
symbol_hidden=True,
......@@ -286,6 +287,8 @@ def bazel_build(target,
"openmp=%s" % str(enable_openmp).lower(),
"--define",
"quantize=%s" % str(enable_quantize).lower(),
"--define",
"bfloat16=%s" % str(enable_bfloat16).lower(),
target,
)
else:
......@@ -304,6 +307,8 @@ def bazel_build(target,
"--define",
"quantize=%s" % str(enable_quantize).lower(),
"--define",
"bfloat16=%s" % str(enable_bfloat16).lower(),
"--define",
"rpcmem=%s" % str(enable_rpcmem).lower(),
"--define",
"hexagon=%s" % str(enable_hexagon).lower(),
......