提交 6b9aafd4 编写于 作者: 叶剑武

Merge branch 'cpplint' into 'master'

Reformatting code and enable cpplint

See merge request !273
stages:
- ops_test
- ops_benchmark
- cpplint
cpplint:
stage: cpplint
only:
- master
script:
- curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
- python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc | grep -vE "half.h")
ops_test:
stage: ops_test
......
......@@ -9,8 +9,8 @@
#include <malloc.h>
#include "mace/core/registry.h"
#include "mace/public/mace.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
free(data);
};
void *Map(void *buffer, size_t offset, size_t nbytes) const override {
return (char*)buffer + offset;
return (char *)buffer + offset;
}
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
......
......@@ -83,12 +83,12 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname, \
enforce_lossless_conversion) \
template <> \
std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
const string &name, const std::vector<T> &default_value) const { \
if (arg_map_.count(name) == 0) { \
return default_value; \
} \
std::vector<T> values; \
std::vector<T> values; \
for (const auto &v : arg_map_.at(name).fieldname()) { \
if (enforce_lossless_conversion) { \
auto supportsConversion = \
......
......@@ -5,9 +5,9 @@
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector>
#include "mace/core/allocator.h"
#include "mace/core/types.h"
namespace mace {
......@@ -39,23 +39,19 @@ class BufferBase {
virtual bool OnHost() const = 0;
virtual index_t offset() const {
return 0;
};
virtual index_t offset() const { return 0; };
template<typename T>
template <typename T>
const T *data() const {
return reinterpret_cast<const T *>(raw_data());
}
template<typename T>
template <typename T>
T *mutable_data() {
return reinterpret_cast<T *>(raw_mutable_data());
}
index_t size() const {
return size_;
}
index_t size() const { return size_; }
protected:
index_t size_;
......@@ -64,26 +60,26 @@ class BufferBase {
class Buffer : public BufferBase {
public:
Buffer(Allocator *allocator)
: BufferBase(0),
allocator_(allocator),
buf_(nullptr),
mapped_buf_(nullptr),
is_data_owner_(true) {}
: BufferBase(0),
allocator_(allocator),
buf_(nullptr),
mapped_buf_(nullptr),
is_data_owner_(true) {}
Buffer(Allocator *allocator, index_t size)
: BufferBase(size),
allocator_(allocator),
mapped_buf_(nullptr),
is_data_owner_(true) {
: BufferBase(size),
allocator_(allocator),
mapped_buf_(nullptr),
is_data_owner_(true) {
buf_ = allocator->New(size);
}
Buffer(Allocator *allocator, void *data, index_t size)
: BufferBase(size),
allocator_(allocator),
buf_(data),
mapped_buf_(nullptr),
is_data_owner_(false) {}
: BufferBase(size),
allocator_(allocator),
buf_(data),
mapped_buf_(nullptr),
is_data_owner_(false) {}
virtual ~Buffer() {
if (mapped_buf_ != nullptr) {
......@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
void Copy(void *src, index_t offset, index_t length) {
MACE_CHECK_NOTNULL(mapped_buf_);
MACE_CHECK(length <= size_, "out of buffer");
memcpy(mapped_buf_, (char *) src + offset, length);
memcpy(mapped_buf_, (char *)src + offset, length);
}
bool OnHost() const {
return allocator_->OnHost();
}
bool OnHost() const { return allocator_->OnHost(); }
private:
Allocator *allocator_;
......@@ -168,23 +162,24 @@ class Buffer : public BufferBase {
void *mapped_buf_;
bool is_data_owner_;
DISABLE_COPY_AND_ASSIGN(Buffer);
DISABLE_COPY_AND_ASSIGN(Buffer);
};
class Image : public BufferBase {
public:
Image()
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(std::accumulate(shape.begin(), shape.end(),
1, std::multiplies<index_t>())
* GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
: BufferBase(
std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
buf_ = allocator_->NewImage(shape, data_type);
......@@ -214,9 +209,7 @@ class Image : public BufferBase {
return mapped_buf_;
}
std::vector<size_t> image_shape() const {
return shape_;
}
std::vector<size_t> image_shape() const { return shape_; }
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
......@@ -241,17 +234,11 @@ class Image : public BufferBase {
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
MACE_NOT_IMPLEMENTED;
}
void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
bool OnHost() const {
return allocator_->OnHost();
}
bool OnHost() const { return allocator_->OnHost(); }
private:
Allocator *allocator_;
......@@ -260,34 +247,25 @@ class Image : public BufferBase {
void *buf_;
void *mapped_buf_;
DISABLE_COPY_AND_ASSIGN(Image);
DISABLE_COPY_AND_ASSIGN(Image);
};
class BufferSlice : public BufferBase {
public:
BufferSlice()
: buffer_(nullptr),
mapped_buf_(nullptr),
offset_(0),
length_(0) {}
: buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
BufferSlice(BufferBase *buffer, index_t offset, index_t length)
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
MACE_CHECK(offset + length <= size_,
"buffer slice offset + length (",
offset,
" + ",
length,
") should <= ",
size_);
MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
offset, " + ", length, ") should <= ", size_);
}
BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
other.offset_,
other.length_) {}
BufferSlice(const BufferSlice &other)
: BufferSlice(other.buffer_, other.offset_, other.length_) {}
~BufferSlice() {
if (buffer_ != nullptr && mapped_buf_ != nullptr) {
......@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
const void *raw_data() const {
if (OnHost()) {
MACE_CHECK_NOTNULL(buffer_);
return (char *) buffer_->raw_data() + offset_;
return (char *)buffer_->raw_data() + offset_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
......@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_NOT_IMPLEMENTED;
}
void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_);
......@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
MACE_NOT_IMPLEMENTED;
}
void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
index_t offset() const {
return offset_;
}
index_t offset() const { return offset_; }
bool OnHost() const {
return buffer_->OnHost();
}
bool OnHost() const { return buffer_->OnHost(); }
private:
BufferBase *buffer_;
......@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
index_t offset_;
index_t length_;
};
}
#endif // MACE_CORE_BUFFER_H_
#endif // MACE_CORE_BUFFER_H_
......@@ -3,9 +3,9 @@
//
#include "mace/public/mace.h"
#include "mace/core/types.h"
#include "mace/core/net.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/types.h"
namespace mace {
......@@ -13,46 +13,34 @@ ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const DataType data_type,
uint32_t node_id) :
name_(name),
data_(data),
data_size_(std::accumulate(dims.begin(), dims.end(), 1,
std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(data_type),
node_id_(node_id) {}
uint32_t node_id)
: name_(name),
data_(data),
data_size_(std::accumulate(
dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(data_type),
node_id_(node_id) {}
ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const int data_type,
uint32_t node_id) :
name_(name),
data_(data),
data_size_(std::accumulate(dims.begin(), dims.end(), 1,
std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(static_cast<DataType>(data_type)),
node_id_(node_id) {}
const std::string &ConstTensor::name() const {
return name_;
}
const unsigned char *ConstTensor::data() const {
return data_;
}
int64_t ConstTensor::data_size() const {
return data_size_;
}
const std::vector<int64_t> &ConstTensor::dims() const {
return dims_;
}
DataType ConstTensor::data_type() const {
return data_type_;
}
uint32_t ConstTensor::node_id() const {
return node_id_;
}
uint32_t node_id)
: name_(name),
data_(data),
data_size_(std::accumulate(
dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(static_cast<DataType>(data_type)),
node_id_(node_id) {}
const std::string &ConstTensor::name() const { return name_; }
const unsigned char *ConstTensor::data() const { return data_; }
int64_t ConstTensor::data_size() const { return data_size_; }
const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
DataType ConstTensor::data_type() const { return data_type_; }
uint32_t ConstTensor::node_id() const { return node_id_; }
Argument::Argument() : has_bits_(0) {}
......@@ -73,74 +61,42 @@ void Argument::CopyFrom(const Argument &from) {
this->has_bits_ = from.has_bits_;
}
const std::string &Argument::name() const {
return name_;
}
void Argument::set_name(const std::string &value) {
name_ = value;
}
bool Argument::has_f() const {
return (has_bits_ & 0x00000001u) != 0;
}
void Argument::set_has_f() {
has_bits_ |= 0x00000001u;
}
float Argument::f() const {
return f_;
}
const std::string &Argument::name() const { return name_; }
void Argument::set_name(const std::string &value) { name_ = value; }
bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
float Argument::f() const { return f_; }
void Argument::set_f(float value) {
set_has_f();
f_ = value;
}
bool Argument::has_i() const {
return (has_bits_ & 0x00000002u) != 0;
}
void Argument::set_has_i() {
has_bits_ |= 0x00000002u;
}
int64_t Argument::i() const {
return i_;
}
bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
int64_t Argument::i() const { return i_; }
void Argument::set_i(int64_t value) {
set_has_i();
i_ = value;
}
bool Argument::has_s() const {
return (has_bits_ & 0x00000004u) != 0;
}
void Argument::set_has_s() {
has_bits_ |= 0x00000004u;
}
std::string Argument::s() const {
return s_;
}
bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
std::string Argument::s() const { return s_; }
void Argument::set_s(const std::string &value) {
set_has_s();
s_ = value;
}
const std::vector<float> &Argument::floats() const {
return floats_;
}
void Argument::add_floats(float value) {
floats_.push_back(value);
}
const std::vector<float> &Argument::floats() const { return floats_; }
void Argument::add_floats(float value) { floats_.push_back(value); }
void Argument::set_floats(const std::vector<float> &value) {
floats_.resize(value.size());
std::copy(value.begin(), value.end(), floats_.begin());
}
const std::vector<int64_t> &Argument::ints() const {
return ints_;
}
void Argument::add_ints(int64_t value) {
ints_.push_back(value);
}
const std::vector<int64_t> &Argument::ints() const { return ints_; }
void Argument::add_ints(int64_t value) { ints_.push_back(value); }
void Argument::set_ints(const std::vector<int64_t> &value) {
ints_.resize(value.size());
std::copy(value.begin(), value.end(), ints_.begin());
}
const std::vector<std::string> &Argument::strings() const {
return strings_;
}
const std::vector<std::string> &Argument::strings() const { return strings_; }
void Argument::add_strings(const ::std::string &value) {
strings_.push_back(value);
}
......@@ -156,31 +112,21 @@ void NodeInput::CopyFrom(const NodeInput &from) {
node_id_ = from.node_id();
output_port_ = from.output_port();
}
int NodeInput::node_id() const {
return node_id_;
}
void NodeInput::set_node_id(int node_id) {
node_id_ = node_id;
}
int NodeInput::output_port() const {
return output_port_;
}
void NodeInput::set_output_port(int output_port) {
output_port_ = output_port;
}
int NodeInput::node_id() const { return node_id_; }
void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
int NodeInput::output_port() const { return output_port_; }
void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
// OutputShape
OutputShape::OutputShape() {}
OutputShape::OutputShape(const std::vector<int64_t> &dims) :
dims_(dims.begin(), dims.end()) {}
OutputShape::OutputShape(const std::vector<int64_t> &dims)
: dims_(dims.begin(), dims.end()) {}
void OutputShape::CopyFrom(const OutputShape &from) {
auto from_dims = from.dims();
dims_.resize(from_dims.size());
std::copy(from_dims.begin(), from_dims.end(), dims_.begin());
}
const std::vector<int64_t> &OutputShape::dims() const {
return dims_;
}
const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
// Operator Def
void OperatorDef::CopyFrom(const OperatorDef &from) {
......@@ -220,68 +166,38 @@ void OperatorDef::CopyFrom(const OperatorDef &from) {
}
auto from_out_max_byte_size = from.out_max_byte_size();
out_max_byte_size_.resize(from_out_max_byte_size.size());
std::copy(from_out_max_byte_size.begin(),
from_out_max_byte_size.end(),
std::copy(from_out_max_byte_size.begin(), from_out_max_byte_size.end(),
out_max_byte_size_.begin());
has_bits_ = from.has_bits_;
}
const std::string &OperatorDef::name() const {
return name_;
}
const std::string &OperatorDef::name() const { return name_; }
void OperatorDef::set_name(const std::string &name_) {
set_has_name();
OperatorDef::name_ = name_;
}
bool OperatorDef::has_name() const {
return (has_bits_ & 0x00000001u) != 0;
}
void OperatorDef::set_has_name() {
has_bits_ |= 0x00000001u;
}
const std::string &OperatorDef::type() const {
return type_;
}
bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
const std::string &OperatorDef::type() const { return type_; }
void OperatorDef::set_type(const std::string &type_) {
set_has_type();
OperatorDef::type_ = type_;
}
bool OperatorDef::has_type() const {
return (has_bits_ & 0x00000002u) != 0;
}
void OperatorDef::set_has_type() {
has_bits_ |= 0x00000002u;
}
int OperatorDef::mem_id() const {
return mem_id_;
}
bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
int OperatorDef::mem_id() const { return mem_id_; }
void OperatorDef::set_mem_id(const int mem_id) {
set_has_mem_id();
mem_id_ = mem_id;
}
bool OperatorDef::has_mem_id() const {
return (has_bits_ & 0x00000004u) != 0;
}
void OperatorDef::set_has_mem_id() {
has_bits_ |= 0x00000004u;
}
uint32_t OperatorDef::node_id() const {
return node_id_;
}
void OperatorDef::set_node_id(uint32_t node_id) {
node_id_ = node_id;
}
uint32_t OperatorDef::op_id() const {
return op_id_;
}
uint32_t OperatorDef::padding() const {
return padding_;
}
void OperatorDef::set_padding(uint32_t padding) {
padding_ = padding;
}
bool OperatorDef::has_mem_id() const { return (has_bits_ & 0x00000004u) != 0; }
void OperatorDef::set_has_mem_id() { has_bits_ |= 0x00000004u; }
uint32_t OperatorDef::node_id() const { return node_id_; }
void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
uint32_t OperatorDef::op_id() const { return op_id_; }
uint32_t OperatorDef::padding() const { return padding_; }
void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
const std::vector<NodeInput> &OperatorDef::node_input() const {
return node_input_;
}
......@@ -294,9 +210,7 @@ const std::vector<int> &OperatorDef::out_max_byte_size() const {
void OperatorDef::add_out_max_byte_size(int value) {
out_max_byte_size_.push_back(value);
}
const std::vector<std::string> &OperatorDef::input() const {
return input_;
}
const std::vector<std::string> &OperatorDef::input() const { return input_; }
const std::string &OperatorDef::input(int index) const {
MACE_CHECK(0 <= index && index <= input_.size());
return input_[index];
......@@ -308,16 +222,12 @@ std::string *OperatorDef::add_input() {
void OperatorDef::add_input(const ::std::string &value) {
input_.push_back(value);
}
void OperatorDef::add_input(::std::string &&value) {
input_.push_back(value);
}
void OperatorDef::add_input(::std::string &&value) { input_.push_back(value); }
void OperatorDef::set_input(const std::vector<std::string> &value) {
input_.resize(value.size());
std::copy(value.begin(), value.end(), input_.begin());
}
const std::vector<std::string> &OperatorDef::output() const {
return output_;
}
const std::vector<std::string> &OperatorDef::output() const { return output_; }
const std::string &OperatorDef::output(int index) const {
MACE_CHECK(0 <= index && index <= output_.size());
return output_[index];
......@@ -336,9 +246,7 @@ void OperatorDef::set_output(const std::vector<std::string> &value) {
output_.resize(value.size());
std::copy(value.begin(), value.end(), output_.begin());
}
const std::vector<Argument> &OperatorDef::arg() const {
return arg_;
}
const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
Argument *OperatorDef::add_arg() {
arg_.emplace_back(Argument());
return &arg_.back();
......@@ -358,18 +266,12 @@ void OperatorDef::set_output_type(const std::vector<DataType> &value) {
}
// MemoryBlock
MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y) :
mem_id_(mem_id), x_(x), y_(y) {}
MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
: mem_id_(mem_id), x_(x), y_(y) {}
int MemoryBlock::mem_id() const {
return mem_id_;
}
uint32_t MemoryBlock::x() const {
return x_;
}
uint32_t MemoryBlock::y() const {
return y_;
}
int MemoryBlock::mem_id() const { return mem_id_; }
uint32_t MemoryBlock::x() const { return x_; }
uint32_t MemoryBlock::y() const { return y_; }
// MemoryArena
const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
......@@ -378,131 +280,69 @@ const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
return mem_block_;
}
int MemoryArena::mem_block_size() const {
return mem_block_.size();
}
int MemoryArena::mem_block_size() const { return mem_block_.size(); }
// InputInfo
const std::string &InputInfo::name() const {
return name_;
}
int32_t InputInfo::node_id() const {
return node_id_;
}
int32_t InputInfo::max_byte_size() const {
return max_byte_size_;
}
DataType InputInfo::data_type() const {
return data_type_;
}
const std::vector<int32_t> &InputInfo::dims() const {
return dims_;
}
const std::string &InputInfo::name() const { return name_; }
int32_t InputInfo::node_id() const { return node_id_; }
int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
DataType InputInfo::data_type() const { return data_type_; }
const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
// OutputInfo
const std::string &OutputInfo::name() const {
return name_;
}
int32_t OutputInfo::node_id() const {
return node_id_;
}
int32_t OutputInfo::max_byte_size() const {
return max_byte_size_;
}
DataType OutputInfo::data_type() const {
return data_type_;
}
void OutputInfo::set_data_type(DataType data_type) {
data_type_ = data_type;
}
const std::vector<int32_t> &OutputInfo::dims() const {
return dims_;
}
void OutputInfo::set_dims(const std::vector<int32_t> &dims) {
dims_ = dims;
}
const std::string &OutputInfo::name() const { return name_; }
int32_t OutputInfo::node_id() const { return node_id_; }
int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
DataType OutputInfo::data_type() const { return data_type_; }
void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
// NetDef
NetDef::NetDef() : has_bits_(0) {}
const std::string &NetDef::name() const {
return name_;
}
const std::string &NetDef::name() const { return name_; }
void NetDef::set_name(const std::string &value) {
set_has_name();
name_ = value;
}
bool NetDef::has_name() const {
return (has_bits_ & 0x00000001u) != 0;
}
void NetDef::set_has_name() {
has_bits_ |= 0x00000001u;
}
const std::string &NetDef::version() const {
return version_;
}
bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
const std::string &NetDef::version() const { return version_; }
void NetDef::set_version(const std::string &value) {
set_has_version();
version_ = value;
}
bool NetDef::has_version() const {
return (has_bits_ & 0x00000002u) != 0;
}
void NetDef::set_has_version() {
has_bits_ |= 0x00000002u;
}
const std::vector<OperatorDef> &NetDef::op() const {
return op_;
}
bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
const std::vector<OperatorDef> &NetDef::op() const { return op_; }
OperatorDef *NetDef::add_op() {
op_.emplace_back(OperatorDef());
return &op_.back();
}
std::vector<OperatorDef> &NetDef::mutable_op() {
return op_;
}
const std::vector<Argument> &NetDef::arg() const {
return arg_;
}
std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
const std::vector<Argument> &NetDef::arg() const { return arg_; }
Argument *NetDef::add_arg() {
arg_.emplace_back(Argument());
return &arg_.back();
}
std::vector<Argument> &NetDef::mutable_arg() {
return arg_;
}
const std::vector<ConstTensor> &NetDef::tensors() const {
return tensors_;
}
std::vector<ConstTensor> &NetDef::mutable_tensors() {
return tensors_;
}
const MemoryArena &NetDef::mem_arena() const {
return mem_arena_;
}
std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
MemoryArena &NetDef::mutable_mem_arena() {
set_has_mem_arena();
return mem_arena_;
}
bool NetDef::has_mem_arena() const {
return (has_bits_ & 0x00000004u) != 0;
}
void NetDef::set_has_mem_arena() {
has_bits_ |= 0x00000004u;
}
const std::vector<InputInfo> &NetDef::input_info() const {
return input_info_;
}
bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
const std::vector<OutputInfo> &NetDef::output_info() const {
return output_info_;
}
std::vector<OutputInfo> &NetDef::mutable_output_info() {
return output_info_;
}
std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
int NetDef::op_size() const {
return op_.size();
}
int NetDef::op_size() const { return op_.size(); }
const OperatorDef &NetDef::op(const int idx) const {
MACE_CHECK(0 <= idx && idx < op_size());
......@@ -510,26 +350,27 @@ const OperatorDef &NetDef::op(const int idx) const {
}
// Mace Engine
MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
op_registry_(new OperatorRegistry()), device_type_(device_type),
ws_(new Workspace()), net_(nullptr), hexagon_controller_(nullptr) {
ws_->CreateTensor("mace_input_node:0",
GetDeviceAllocator(device_type_),
MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type)
: op_registry_(new OperatorRegistry()),
device_type_(device_type),
ws_(new Workspace()),
net_(nullptr),
hexagon_controller_(nullptr) {
ws_->CreateTensor("mace_input_node:0", GetDeviceAllocator(device_type_),
DT_FLOAT);
ws_->CreateTensor("mace_output_node:0",
GetDeviceAllocator(device_type_),
ws_->CreateTensor("mace_output_node:0", GetDeviceAllocator(device_type_),
DT_FLOAT);
if (device_type == HEXAGON) {
hexagon_controller_.reset(new HexagonControlWrapper());
MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
hexagon_controller_->SetDebugLevel(
static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>(
*net_def, "dsp_mode", 0);
static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
int dsp_mode =
ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
hexagon_controller_->SetGraphMode(dsp_mode);
MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
"hexagon setup graph error");
"hexagon setup graph error");
if (VLOG_IS_ON(2)) {
hexagon_controller_->PrintGraph();
}
......@@ -537,8 +378,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
ws_->LoadModelTensor(*net_def, device_type);
// Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(),
device_type, NetMode::INIT);
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
NetMode::INIT);
if (!net->Run()) {
LOG(FATAL) << "Net init run failed";
}
......@@ -548,18 +389,19 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
MaceEngine::MaceEngine(const NetDef *net_def,
DeviceType device_type,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes) :
op_registry_(new OperatorRegistry()), device_type_(device_type),
ws_(new Workspace()), net_(nullptr), hexagon_controller_(nullptr) {
const std::vector<std::string> &output_nodes)
: op_registry_(new OperatorRegistry()),
device_type_(device_type),
ws_(new Workspace()),
net_(nullptr),
hexagon_controller_(nullptr) {
for (auto input_name : input_nodes) {
ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
GetDeviceAllocator(device_type_),
DT_FLOAT);
GetDeviceAllocator(device_type_), DT_FLOAT);
}
for (auto output_name : output_nodes) {
ws_->CreateTensor(MakeString("mace_output_node_", output_name, ":0"),
GetDeviceAllocator(device_type_),
DT_FLOAT);
GetDeviceAllocator(device_type_), DT_FLOAT);
}
if (device_type == HEXAGON) {
hexagon_controller_.reset(new HexagonControlWrapper());
......@@ -567,8 +409,8 @@ MaceEngine::MaceEngine(const NetDef *net_def,
MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
hexagon_controller_->SetDebugLevel(
static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>(
*net_def, "dsp_mode", 0);
int dsp_mode =
ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
hexagon_controller_->SetGraphMode(dsp_mode);
MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
"hexagon setup graph error");
......@@ -579,14 +421,13 @@ MaceEngine::MaceEngine(const NetDef *net_def,
ws_->LoadModelTensor(*net_def, device_type);
// Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(),
device_type, NetMode::INIT);
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
NetMode::INIT);
if (!net->Run()) {
LOG(FATAL) << "Net init run failed";
}
net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
}
}
MaceEngine::~MaceEngine() {
if (device_type_ == HEXAGON) {
......@@ -643,10 +484,11 @@ bool MaceEngine::Run(const float *input,
bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
std::map<std::string, float *> &outputs,
RunMetadata *run_metadata) {
MACE_CHECK(device_type_ != HEXAGON, "HEXAGON not supports multiple outputs now");
MACE_CHECK(device_type_ != HEXAGON,
"HEXAGON not supports multiple outputs now");
for (auto input : inputs) {
Tensor *input_tensor = ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0"));
Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0"));
input_tensor->Resize(input.shape);
{
Tensor::MappingGuard input_guard(input_tensor);
......@@ -658,7 +500,8 @@ bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
LOG(FATAL) << "Net run failed";
}
for (auto output : outputs) {
Tensor *output_tensor = ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
// save output
if (output_tensor != nullptr && output.second != nullptr) {
Tensor::MappingGuard output_guard(output_tensor);
......
......@@ -3,9 +3,9 @@
//
#include "mace/core/net.h"
#include "mace/utils/utils.h"
#include "mace/utils/timer.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
namespace mace {
......@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
Workspace *ws,
DeviceType type,
const NetMode mode)
: NetBase(op_registry, net_def, ws, type),
device_type_(type) {
: NetBase(op_registry, net_def, ws, type), device_type_(type) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
......@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_LATENCY_LOGGER(1, "Running net");
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
"(", op->debug_def().type(), ")");
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), ")");
bool future_wait = (device_type_ == DeviceType::OPENCL &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
......@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
Workspace *ws,
DeviceType type,
const NetMode mode) {
std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
std::unique_ptr<NetBase> net(
new SerialNet(op_registry, net_def, ws, type, mode));
return net;
}
......
......@@ -7,10 +7,10 @@
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -147,7 +147,7 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
class OperatorRegistry {
public:
typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
RegistryType;
RegistryType;
OperatorRegistry();
~OperatorRegistry() = default;
RegistryType *registry() { return &registry_; };
......
......@@ -36,6 +36,6 @@ class PreallocatedPooledAllocator {
std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
};
} // namespace mace
} // namespace mace
#endif // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#endif // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
......@@ -2,19 +2,19 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <vector>
#include <thread>
#include <sys/time.h>
#include <thread>
#include <vector>
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
namespace {
inline int64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
inline int64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
}
namespace mace {
......@@ -63,9 +63,9 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// const node
std::thread const_thread([&]() {
std::vector<hexagon_nn_const_node> const_node_list;
for (const ConstTensor &const_tensor: net_def.tensors()) {
for (const ConstTensor &const_tensor : net_def.tensors()) {
std::vector<int> tensor_shape(const_tensor.dims().begin(),
const_tensor.dims().end());
const_tensor.dims().end());
while (tensor_shape.size() < 4) {
tensor_shape.insert(tensor_shape.begin(), 1);
}
......@@ -77,32 +77,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
const_node.tensor.width = tensor_shape[2];
const_node.tensor.depth = tensor_shape[3];
if (const_tensor.data_type() == DataType::DT_INT32
&& const_tensor.data_size() == 0) {
if (const_tensor.data_type() == DataType::DT_INT32 &&
const_tensor.data_size() == 0) {
const_node.tensor.data = NULL;
const_node.tensor.dataLen = 0;
} else {
const_node.tensor.data =
const_cast<unsigned char *>(const_tensor.data());
const_node.tensor.dataLen =
const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type());
const_cast<unsigned char *>(const_tensor.data());
const_node.tensor.dataLen = const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type());
}
const_node_list.push_back(const_node);
// 255 is magic number: why fastrpc limits sequence length to that?
if (const_node_list.size() >= 250) {
MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_,
const_node_list.data(),
const_node_list.size())
== 0, "append const node error");
MACE_CHECK(
hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
const_node_list.size()) == 0,
"append const node error");
const_node_list.clear();
}
}
if (!const_node_list.empty()) {
MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_,
const_node_list.data(),
const_node_list.size()) == 0,
"append const node error");
MACE_CHECK(
hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
const_node_list.size()) == 0,
"append const node error");
}
const_node_list.clear();
});
......@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
std::vector<hexagon_nn_input> inputs;
std::vector<hexagon_nn_output> outputs;
for (const OperatorDef &op: net_def.op()) {
for (const OperatorDef &op : net_def.op()) {
int op_id = op_map.GetOpId(op.type());
inputs.resize(op.node_input().size());
for (size_t i = 0; i < op.node_input().size(); ++i) {
......@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
cached_inputs.push_back(inputs);
cached_outputs.push_back(outputs);
hexagon_nn_padding_type
padding_type = static_cast<hexagon_nn_padding_type>(
op.padding());
hexagon_nn_padding_type padding_type =
static_cast<hexagon_nn_padding_type>(op.padding());
hexagon_nn_op_node op_node;
op_node.node_id = node_id(op.node_id());
......@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
op_node_list.push_back(op_node);
if (op_node_list.size() >= 125) {
MACE_CHECK(hexagon_nn_append_node_list(nn_id_,
op_node_list.data(),
MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
op_node_list.size()) == 0,
"append node error");
op_node_list.clear();
......@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
}
if (!op_node_list.empty()) {
MACE_CHECK(hexagon_nn_append_node_list(nn_id_,
op_node_list.data(),
MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
op_node_list.size()) == 0,
"append node error");
}
......@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// input info
num_inputs_ = 0;
for (const InputInfo &input_info: net_def.input_info()) {
for (const InputInfo &input_info : net_def.input_info()) {
std::vector<index_t> input_shape;
input_shape.insert(input_shape.begin(),
input_info.dims().begin(), input_info.dims().end());
input_shape.insert(input_shape.begin(), input_info.dims().begin(),
input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
......@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// output info
num_outputs_ = 0;
for (const OutputInfo &output_info: net_def.output_info()) {
for (const OutputInfo &output_info : net_def.output_info()) {
std::vector<index_t> output_shape;
output_shape.insert(output_shape.begin(),
output_info.dims().begin(), output_info.dims().end());
output_shape.insert(output_shape.begin(), output_info.dims().begin(),
output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
......@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
return hexagon_nn_teardown(nn_id_) == 0;
}
#define PRINT_BUFSIZE (2*1024*1024)
#define PRINT_BUFSIZE (2 * 1024 * 1024)
void HexagonControlWrapper::PrintLog() {
char *buf;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
MACE_CHECK(hexagon_nn_getlog(nn_id_,
reinterpret_cast<unsigned char *>(buf),
PRINT_BUFSIZE) == 0, "print log error");
MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
PRINT_BUFSIZE) == 0,
"print log error");
LOG(INFO) << std::string(buf);
delete[]buf;
delete[] buf;
}
void HexagonControlWrapper::PrintGraph() {
LOG(INFO) << "Print Graph";
char *buf;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
MACE_CHECK(hexagon_nn_snpprint(nn_id_,
reinterpret_cast<unsigned char *>(buf),
PRINT_BUFSIZE) == 0, "print graph error");
MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
PRINT_BUFSIZE) == 0,
"print graph error");
LOG(INFO) << std::string(buf);
delete[]buf;
delete[] buf;
}
void HexagonControlWrapper::SetDebugLevel(int level) {
......@@ -256,9 +253,9 @@ void HexagonControlWrapper::GetPerfInfo() {
LOG(INFO) << "Get perf info";
std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
unsigned int n_items = 0;
MACE_CHECK(
hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE, &n_items) == 0,
"get perf info error");
MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
&n_items) == 0,
"get perf info error");
std::unordered_map<uint32_t, float> node_id_counters;
std::unordered_map<std::string, std::pair<int, float>> node_type_counters;
......@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
unsigned int node_id = perf_info[i].node_id;
unsigned int node_type_id = perf_info[i].node_type;
node_id_counters[node_id] =
((static_cast<uint64_t>(perf_info[i].counter_hi) << 32)
+ perf_info[i].counter_lo) * 1.0f / perf_info[i].executions;
((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) +
perf_info[i].counter_lo) *
1.0f / perf_info[i].executions;
char node_type_buf[MAX_NODE];
hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
......@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
total_duration += node_id_counters[node_id];
}
for (auto &node_type_counter: node_type_counters) {
for (auto &node_type_counter : node_type_counters) {
LOG(INFO) << "node type: " << node_type_counter.first
<< ", time: " << node_type_counter.second.first
<< ", duration: " << node_type_counter.second.second;
......@@ -312,33 +310,25 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
output_tensor->Resize(output_shapes_[0]);
std::vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
int res = hexagon_nn_execute(
nn_id_, input_tensor.shape()[0], input_tensor.shape()[1],
input_tensor.shape()[2], input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
input_tensor.raw_size(), &output_shape[0], &output_shape[1],
&output_shape[2], &output_shape[3],
reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
output_tensor->raw_size(), &output_bytes);
MACE_CHECK(res == 0, "execute error");
MACE_ASSERT(output_shape == output_shapes_[0],
"wrong output shape inferred");
MACE_ASSERT(output_shape == output_shapes_[0], "wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors) {
bool HexagonControlWrapper::ExecuteGraphNew(
const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors) {
LOG(INFO) << "Execute graph new: " << nn_id_;
int num_inputs = input_tensors.size();
int num_outputs = output_tensors->size();
......@@ -355,7 +345,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
......@@ -365,16 +355,16 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
(*output_tensors)[i].SetDtype(output_data_types_[i]);
(*output_tensors)[i].Resize(output_shapes_[i]);
outputs[i].data = reinterpret_cast<unsigned char *>(
(*output_tensors)[i].raw_mutable_data());
(*output_tensors)[i].raw_mutable_data());
outputs[i].dataLen = (*output_tensors)[i].raw_size();
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
int res =
hexagon_nn_execute_new(nn_id_, inputs, num_inputs, outputs, num_outputs);
for (int i = 0; i < num_outputs; ++i) {
std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
outputs[i].width, outputs[i].depth};
outputs[i].width, outputs[i].depth};
MACE_ASSERT(output_shape == output_shapes_[i],
"wrong output shape inferred");
MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
......@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor,
&input_tensors[0],
min_in_data,
quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data,
max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false;
......@@ -409,11 +397,9 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0],
*min_out_data,
*max_out_data,
quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data,
output_tensor);
return true;
}
} // namespace mace
} // namespace mace
......@@ -16,16 +16,17 @@ namespace mace {
class HexagonControlWrapper {
public:
HexagonControlWrapper() {};
HexagonControlWrapper(){};
int GetVersion();
bool Config();
bool Init();
bool Finalize();
bool SetupGraph(const NetDef& net_def);
bool SetupGraph(const NetDef &net_def);
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
bool ExecuteGraphNew(const std::vector<Tensor>& input_tensors,
bool ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors);
bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
bool ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor);
bool TeardownGraph();
void PrintLog();
......@@ -38,9 +39,7 @@ class HexagonControlWrapper {
private:
static constexpr int NODE_ID_OFFSET = 10000;
inline uint32_t node_id(uint32_t nodeid) {
return NODE_ID_OFFSET + nodeid;
}
inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
int nn_id_;
Quantizer quantizer_;
......@@ -52,9 +51,8 @@ class HexagonControlWrapper {
uint32_t num_inputs_;
uint32_t num_outputs_;
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
};
}
#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
......@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
return 0;
}
int hexagon_controller_DeInitHexagon() {
int hexagon_controller_DeInitHexagon() { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
......@@ -2,27 +2,27 @@
#define _HEXAGON_NN_H
#ifndef __QAIC_HEADER
#define __QAIC_HEADER(ff) ff
#endif //__QAIC_HEADER
#endif //__QAIC_HEADER
#ifndef __QAIC_HEADER_EXPORT
#define __QAIC_HEADER_EXPORT
#endif // __QAIC_HEADER_EXPORT
#endif // __QAIC_HEADER_EXPORT
#ifndef __QAIC_HEADER_ATTRIBUTE
#define __QAIC_HEADER_ATTRIBUTE
#endif // __QAIC_HEADER_ATTRIBUTE
#endif // __QAIC_HEADER_ATTRIBUTE
#ifndef __QAIC_IMPL
#define __QAIC_IMPL(ff) ff
#endif //__QAIC_IMPL
#endif //__QAIC_IMPL
#ifndef __QAIC_IMPL_EXPORT
#define __QAIC_IMPL_EXPORT
#endif // __QAIC_IMPL_EXPORT
#endif // __QAIC_IMPL_EXPORT
#ifndef __QAIC_IMPL_ATTRIBUTE
#define __QAIC_IMPL_ATTRIBUTE
#endif // __QAIC_IMPL_ATTRIBUTE
#endif // __QAIC_IMPL_ATTRIBUTE
#ifdef __cplusplus
extern "C" {
#endif
......@@ -30,92 +30,160 @@ extern "C" {
#define __QAIC_STRING1_OBJECT_DEFINED__
#define __STRING1_OBJECT__
typedef struct _cstring1_s {
char* data;
int dataLen;
char *data;
int dataLen;
} _cstring1_t;
#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */
typedef struct hexagon_nn_input hexagon_nn_input;
struct hexagon_nn_input {
unsigned int src_id;
unsigned int output_idx;
unsigned int src_id;
unsigned int output_idx;
};
typedef struct hexagon_nn_output hexagon_nn_output;
struct hexagon_nn_output {
unsigned int max_size;
unsigned int unused;
unsigned int max_size;
unsigned int unused;
};
typedef struct hexagon_nn_perfinfo hexagon_nn_perfinfo;
struct hexagon_nn_perfinfo {
unsigned int node_id;
unsigned int node_type;
unsigned int executions;
unsigned int unused;
unsigned int counter_lo;
unsigned int counter_hi;
unsigned int node_id;
unsigned int node_type;
unsigned int executions;
unsigned int unused;
unsigned int counter_lo;
unsigned int counter_hi;
};
typedef int hexagon_nn_nn_id;
enum hexagon_nn_padding_type {
NN_PAD_NA,
NN_PAD_SAME,
NN_PAD_VALID,
NN_PAD_MIRROR_REFLECT,
NN_PAD_MIRROR_SYMMETRIC,
NN_PAD_SAME_CAFFE,
_32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
NN_PAD_NA,
NN_PAD_SAME,
NN_PAD_VALID,
NN_PAD_MIRROR_REFLECT,
NN_PAD_MIRROR_SYMMETRIC,
NN_PAD_SAME_CAFFE,
_32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
};
typedef enum hexagon_nn_padding_type hexagon_nn_padding_type;
typedef struct hexagon_nn_tensordef hexagon_nn_tensordef;
struct hexagon_nn_tensordef {
unsigned int batches;
unsigned int height;
unsigned int width;
unsigned int depth;
unsigned char* data;
int dataLen;
unsigned int data_valid_len;
unsigned int unused;
unsigned int batches;
unsigned int height;
unsigned int width;
unsigned int depth;
unsigned char *data;
int dataLen;
unsigned int data_valid_len;
unsigned int unused;
};
typedef struct hexagon_nn_op_node hexagon_nn_op_node;
struct hexagon_nn_op_node {
unsigned int node_id;
unsigned int operation;
hexagon_nn_padding_type padding;
hexagon_nn_input* inputs;
int inputsLen;
hexagon_nn_output* outputs;
int outputsLen;
unsigned int node_id;
unsigned int operation;
hexagon_nn_padding_type padding;
hexagon_nn_input *inputs;
int inputsLen;
hexagon_nn_output *outputs;
int outputsLen;
};
typedef struct hexagon_nn_const_node hexagon_nn_const_node;
struct hexagon_nn_const_node {
unsigned int node_id;
hexagon_nn_tensordef tensor;
unsigned int node_id;
hexagon_nn_tensordef tensor;
};
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
#ifdef __cplusplus
}
#endif
#endif //_HEXAGON_NN_H
#endif //_HEXAGON_NN_H
......@@ -5,8 +5,8 @@
#ifndef LIBMACE_HEXAGON_NN_OPS_H
#define LIBMACE_HEXAGON_NN_OPS_H
#include "mace/utils/logging.h"
#include <unordered_map>
#include "mace/utils/logging.h"
namespace mace {
......@@ -24,8 +24,7 @@ typedef enum op_type_enum {
class OpMap {
public:
void Init() {
#define DEF_OP(NAME) \
op_map_[#NAME] = OP_##NAME;
#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
#include "mace/core/runtime/hexagon/ops.h"
......@@ -40,9 +39,10 @@ class OpMap {
return OP_INVALID;
}
}
private:
std::unordered_map<std::string, int> op_map_;
};
} // namespace mace
} // namespace mace
#endif // LIBMACE_HEXAGON_NN_OPS_H
#endif // LIBMACE_HEXAGON_NN_OPS_H
......@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
#undef __SELF_DEF_OP_WREF
#undef DEF_OP_WREF
#endif
......@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
float *max_out) {
float stepsize;
float recip_stepsize;
QuantizeAdjustRange(min_in, max_in,
min_out, max_out,
&stepsize, &recip_stepsize);
QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
&recip_stepsize);
const float *in = in_tensor.data<float>();
uint8_t *out = out_tensor->mutable_data<uint8_t>();
for (int i = 0; i < in_tensor.size(); i++) {
const float inval = in[i];
float ival = static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
float ival =
static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
if (ival < 0) ival = 0;
if (ival > 255) ival = 255;
out[i] = static_cast<uint8_t>(ival);
......@@ -93,4 +93,4 @@ void Quantizer::DeQuantize(const Tensor &in_tensor,
}
}
} // namespace mace
\ No newline at end of file
} // namespace mace
\ No newline at end of file
......@@ -16,13 +16,17 @@ class Quantizer {
void Quantize(const Tensor &in_tensor,
Tensor *out_tensor,
float *min_out, float *max_out);
float *min_out,
float *max_out);
void Quantize(const Tensor &in_tensor,
const float min_in, const float max_in,
const float min_in,
const float max_in,
Tensor *out_tensor,
float *min_out, float *max_out);
float *min_out,
float *max_out);
void DeQuantize(const Tensor &in_tensor,
const float min_in, const float max_in,
const float min_in,
const float max_in,
Tensor *out_tensor);
private:
......@@ -33,9 +37,9 @@ class Quantizer {
float *stepsize,
float *recip_stepsize);
DISABLE_COPY_AND_ASSIGN(Quantizer);
DISABLE_COPY_AND_ASSIGN(Quantizer);
};
} // mace
} // mace
#endif // MACE_DSP_UTIL_QUANTIZE_H_
#endif // MACE_DSP_UTIL_QUANTIZE_H_
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
......@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
return 0;
}
}
}
OpenCLAllocator::OpenCLAllocator() {}
......@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
<< image_shape[1];
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
0, nullptr, &error);
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
image_shape[0], image_shape[1], 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1]
<< "]";
......@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
nbytes, nullptr, nullptr, &error);
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
offset, nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr;
}
......@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch->resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
mapped_image_pitch->data(),
mapped_image_pitch->data() + 1,
nullptr, nullptr, &error);
void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr;
......
......@@ -5,8 +5,8 @@
#include <vector>
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
......@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device,
cl::Program *program,
bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kEncryptedProgramMap;
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
*is_binary = false;
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source == kEncryptedProgramMap.end()) {
......
......@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device,
cl::Program *program,
bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kCompiledProgramMap;
extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
*is_binary = true;
auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
if (it_binary == kCompiledProgramMap.end()) {
......
......@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0;
}
double OpenCLProfilingTimer::AccumulatedMicros() {
return accumulated_micros_;
}
double OpenCLProfilingTimer::AccumulatedMicros() { return accumulated_micros_; }
void OpenCLProfilingTimer::AccumulateTiming(){
void OpenCLProfilingTimer::AccumulateTiming() {
StopTiming();
accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
}
......@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
cl::CommandQueue command_queue(context, gpu_device, properties);
const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->kernel_path_ =
std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context);
......@@ -163,18 +162,14 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
MACE_CHECK_NOTNULL(program);
std::string binary_file_name_prefix =
GenerateCLBinaryFilenamePrefix(built_program_key);
GenerateCLBinaryFilenamePrefix(built_program_key);
std::vector<unsigned char> program_vec;
bool is_opencl_binary;
const bool found = GetSourceOrBinaryProgram(program_name,
binary_file_name_prefix,
context(),
device(),
program,
&is_opencl_binary);
const bool found =
GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
device(), program, &is_opencl_binary);
MACE_CHECK(found, "Program not found for ",
is_opencl_binary ? "binary: " : "source: ",
built_program_key);
is_opencl_binary ? "binary: " : "source: ", built_program_key);
// Build program
std::string build_options_str =
......@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
}
LOG(FATAL) << "Build program from "
<< (is_opencl_binary ? "binary: " : "source: ")
<< built_program_key
<< " failed: " << ret;
<< built_program_key << " failed: " << ret;
}
if (!is_opencl_binary) {
// Write binary if necessary
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
std::string binary_filename =
kernel_path_ + binary_file_name_prefix + ".bin";
size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]);
......@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
if (built_program_it != built_program_map_.end()) {
program = built_program_it->second;
} else {
this->BuildProgram(program_name, built_program_key,
build_options_str, &program);
this->BuildProgram(program_name, built_program_key, build_options_str,
&program);
built_program_map_.emplace(built_program_key, program);
}
return cl::Kernel(program, kernel_name.c_str());
......@@ -250,9 +245,9 @@ cl::Kernel OpenCLRuntime::BuildKernel(
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) {
stats->start_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
stats->end_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
}
}
......
......@@ -19,7 +19,8 @@ namespace mace {
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {};
explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0){};
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
......@@ -48,6 +49,7 @@ class OpenCLRuntime {
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
private:
OpenCLRuntime();
~OpenCLRuntime();
......
......@@ -7,10 +7,10 @@
namespace mace {
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
......@@ -65,23 +65,20 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
class Tensor {
public:
Tensor(Allocator *alloc, DataType type)
: allocator_(alloc),
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true),
name_("") {};
: allocator_(alloc),
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true),
name_(""){};
Tensor(BufferBase *buffer, DataType dtype)
: dtype_(dtype),
buffer_(buffer),
is_buffer_owner_(false),
name_("") {}
: dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), name_("") {}
Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false),
name_("") {
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false),
name_("") {
buffer_ = &buffer_slice_;
}
......@@ -102,8 +99,8 @@ class Tensor {
inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ",
index, " >= ", shape_.size());
MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
shape_.size());
return shape_[index];
}
......@@ -112,40 +109,35 @@ class Tensor {
std::multiplies<int64_t>());
}
inline index_t raw_size() const {
return size() * SizeOfType();
}
inline index_t raw_size() const { return size() * SizeOfType(); }
inline bool has_opencl_image() const {
return buffer_ != nullptr && !buffer_->OnHost()
&& typeid(*buffer_) == typeid(Image);
return buffer_ != nullptr && !buffer_->OnHost() &&
typeid(*buffer_) == typeid(Image);
}
inline bool has_opencl_buffer() const {
return buffer_ != nullptr && !buffer_->OnHost()
&& !has_opencl_image();
return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
}
inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), "do not have image");
return static_cast<cl::Image*>(buffer_->buffer());
return static_cast<cl::Image *>(buffer_->buffer());
}
inline cl::Buffer *opencl_buffer() const {
MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
return static_cast<cl::Buffer*>(buffer_->buffer());
return static_cast<cl::Buffer *>(buffer_->buffer());
}
inline index_t buffer_offset() const {
return buffer_->offset();
}
inline index_t buffer_offset() const { return buffer_->offset(); }
inline const void *raw_data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->raw_data();
}
template<typename T>
template <typename T>
inline const T *data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>();
......@@ -156,7 +148,7 @@ class Tensor {
return buffer_->raw_mutable_data();
}
template<typename T>
template <typename T>
inline T *mutable_data() {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data());
......@@ -188,25 +180,17 @@ class Tensor {
is_buffer_owner_ = true;
} else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
Image *image = dynamic_cast<Image*>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0]
&& image_shape[1] <= image->image_shape()[1],
"tensor (source op ",
name_,
"): current physical image shape: ",
image->image_shape()[0],
", ",
image->image_shape()[1],
" < logical image shape: ",
image_shape[0],
", ",
image_shape[1]);
Image *image = dynamic_cast<Image *>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
image_shape[1] <= image->image_shape()[1],
"tensor (source op ", name_,
"): current physical image shape: ", image->image_shape()[0],
", ", image->image_shape()[1], " < logical image shape: ",
image_shape[0], ", ", image_shape[1]);
}
}
inline void ResizeLike(const Tensor &other) {
ResizeLike(&other);
}
inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
inline void ResizeLike(const Tensor *other) {
if (other->has_opencl_image()) {
......@@ -229,7 +213,7 @@ class Tensor {
memcpy(buffer_->raw_mutable_data(), src, size);
}
template<typename T>
template <typename T>
inline void Copy(const T *src, index_t length) {
MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
......@@ -248,13 +232,9 @@ class Tensor {
return type_size;
}
inline BufferBase *UnderlyingBuffer() const {
return buffer_;
}
inline BufferBase *UnderlyingBuffer() const { return buffer_; }
inline void SetSourceOpName(const std::string name) {
name_ = name;
}
inline void SetSourceOpName(const std::string name) { name_ = name; }
inline void DebugPrint() const {
using namespace numerical_chars;
......@@ -272,8 +252,9 @@ class Tensor {
}
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
}
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", "
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str();
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
<< ", " << dim(3) << "], content:\n"
<< os.str();
}
class MappingGuard {
......@@ -301,20 +282,20 @@ class Tensor {
const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_;
DISABLE_COPY_AND_ASSIGN(MappingGuard);
DISABLE_COPY_AND_ASSIGN(MappingGuard);
};
private:
Allocator *allocator_;
DataType dtype_;
std::vector<index_t> shape_;
std::vector<size_t > image_shape_;
std::vector<size_t> image_shape_;
BufferBase *buffer_;
BufferSlice buffer_slice_;
bool is_buffer_owner_;
std::string name_;
DISABLE_COPY_AND_ASSIGN(Tensor);
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace tensor
......
......@@ -99,9 +99,7 @@ void RestartTiming() {
accum_time = 0;
start_time = NowMicros();
}
void StartTiming() {
start_time = NowMicros();
}
void StartTiming() { start_time = NowMicros(); }
void StopTiming() {
if (start_time != 0) {
accum_time += (NowMicros() - start_time);
......
......@@ -6,9 +6,9 @@
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <string>
#include <utility>
#include <vector>
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
......
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <map>
#include <cstdint>
#include <map>
#include "mace/core/types.h"
#include "mace/utils/logging.h"
......@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
std::string DataTypeToString(const DataType dt) {
static std::map<DataType, std::string> dtype_string_map = {
{DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"},
{DT_DOUBLE, "DT_DOUBLE"},
{DT_UINT8, "DT_UINT8"},
{DT_INT8, "DT_INT8"},
{DT_INT32, "DT_INT32"},
{DT_UINT32, "DT_UINT32"},
{DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"},
{DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}
};
{DT_FLOAT, "DT_FLOAT"}, {DT_HALF, "DT_HALF"},
{DT_DOUBLE, "DT_DOUBLE"}, {DT_UINT8, "DT_UINT8"},
{DT_INT8, "DT_INT8"}, {DT_INT32, "DT_INT32"},
{DT_UINT32, "DT_UINT32"}, {DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"}, {DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}};
MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
return dtype_string_map[dt];
}
......
......@@ -5,8 +5,8 @@
#include <string>
#include <vector>
#include "mace/core/workspace.h"
#include "mace/core/arg_helper.h"
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
namespace mace {
......@@ -19,7 +19,7 @@ Tensor *Workspace::CreateTensor(const std::string &name,
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
}
return GetTensor(name);
}
......@@ -35,7 +35,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
Tensor *Workspace::GetTensor(const std::string &name) {
return const_cast<Tensor *>(
static_cast<const Workspace *>(this)->GetTensor(name));
static_cast<const Workspace *>(this)->GetTensor(name));
}
std::vector<std::string> Workspace::Tensors() const {
......@@ -51,28 +51,28 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr;
for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr
|| reinterpret_cast<long long>(const_tensor.data())
< reinterpret_cast<long long>(model_data_ptr)) {
if (model_data_ptr == nullptr ||
reinterpret_cast<long long>(const_tensor.data()) <
reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
}
}
for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size,
static_cast<index_t>(
(reinterpret_cast<long long>(const_tensor.data())
- reinterpret_cast<long long>(model_data_ptr))
+ const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())));
model_data_size = std::max(
model_data_size,
static_cast<index_t>((reinterpret_cast<long long>(const_tensor.data()) -
reinterpret_cast<long long>(model_data_ptr)) +
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())));
}
VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU) {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
} else {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size)));
new Buffer(GetDeviceAllocator(type), model_data_size)));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap();
......@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type()
<< ", shape: "
<< ", data type: " << const_tensor.data_type() << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end()));
std::vector<index_t> dims;
......@@ -90,14 +89,12 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
dims.push_back(d);
}
index_t
offset = (long long) const_tensor.data() - (long long) model_data_ptr;
index_t offset = (long long)const_tensor.data() - (long long)model_data_ptr;
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(),
offset,
const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
new Tensor(BufferSlice(tensor_buffer_.get(), offset,
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
......@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
// as GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op: net_def.op()) {
for (auto &op : net_def.op()) {
if (op.has_mem_id()) {
const DataType op_dtype = static_cast<DataType>(
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op,
"T",
static_cast<int>(DT_FLOAT)));
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
......@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) {
std::unique_ptr<BufferBase>
image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
for (auto &mem_block : net_def.mem_arena().mem_block()) {
std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
}
VLOG(3) << "Preallocate image to tensors";
for (auto &op: net_def.op()) {
for (auto &op : net_def.op()) {
if (op.has_mem_id()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
std::unique_ptr<Tensor> tensor(
new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
tensor->SetSourceOpName(op.name());
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << "; Mem: "
<< op.mem_id() << "; Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[1];
VLOG(3)
<< "Tensor: " << op.name() << "(" << op.type() << ")"
<< "; Mem: " << op.mem_id() << "; Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor_map_[op.output(0)] = std::move(tensor);
}
}
......
......@@ -5,9 +5,9 @@
#ifndef MACE_CORE_WORKSPACE_H_
#define MACE_CORE_WORKSPACE_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/preallocated_pooled_allocator.h"
namespace mace {
......@@ -43,7 +43,7 @@ class Workspace {
PreallocatedPooledAllocator preallocated_allocator_;
DISABLE_COPY_AND_ASSIGN(Workspace);
DISABLE_COPY_AND_ASSIGN(Workspace);
};
} // namespace mace
......
......@@ -6,9 +6,9 @@
#define MACE_KERNELS_ACTIVATION_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
......@@ -99,17 +99,15 @@ void PReLUActivation(const T *input_ptr,
output_ptr[i] = in;
}
}
}
template <DeviceType D, typename T>
class ActivationFunctor {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type),
relux_max_limit_(relux_max_limit){}
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
void operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
......@@ -118,9 +116,11 @@ class ActivationFunctor {
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const T *alpha_ptr = alpha->data<T>();
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, output_ptr);
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_);
DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
}
......@@ -131,14 +131,16 @@ class ActivationFunctor {
template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future);
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type),
relux_max_limit_(relux_max_limit){}
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *alpha,
......
......@@ -18,7 +18,7 @@ namespace mace {
namespace kernels {
namespace {
constexpr int kCostPerGroup = 1024;
constexpr int kCostPerGroup = 1024;
} // namespace
template <DeviceType D, typename T>
......
......@@ -10,10 +10,10 @@
#endif
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
......@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
const float relux_max_limit)
: folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit){}
relux_max_limit_(relux_max_limit) {}
const bool folded_constant_;
const ActivationType activation_;
......@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(
folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
......@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(
folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
......
......@@ -6,9 +6,9 @@
#define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
......@@ -32,7 +32,6 @@ struct BiasAddFunctor {
const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
......@@ -44,7 +43,6 @@ struct BiasAddFunctor {
}
}
}
}
};
......
......@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
bool i2b_;
};
template<DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase{
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output,
......@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
}
};
template<typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
public:
ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape,
T *output, StatsFuture *future) {
void operator()(const T *input,
const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......
......@@ -6,23 +6,23 @@
#define MACE_KERNELS_CONCAT_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
struct ConcatFunctorBase {
ConcatFunctorBase(const int32_t axis): axis_(axis){}
ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
int32_t axis_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
}
};
template<typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, StatsFuture *future);
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
};
} // namepsace kernels
......
......@@ -116,9 +116,8 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
sum[sum_idx] += vaddvq_f32(tmp);
#else
for (int inci = 0; inci < inc_tile_size; ++inci) {
sum[sum_idx] +=
in[in_idx * inc_tile_size + inci] *
weights[weights_idx * inc_tile_size + inci];
sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
weights[weights_idx * inc_tile_size + inci];
}
#endif
}
......@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit){}
relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_type_;
......@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), filter->shape().data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
......
......@@ -145,7 +145,7 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1),
(dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
......@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
*/
output_shape[0] = input_shape[0];
if (round_type == FLOOR) {
output_shape[1] = static_cast<index_t>(std::floor(1.0 * (input_shape[1] + padding_size[0]
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1);
output_shape[2] = static_cast<index_t>(std::floor(1.0 * (input_shape[2] + padding_size[1]
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1);
output_shape[1] = static_cast<index_t>(
std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
(filter_shape[0] - 1) * (dilations[0] - 1)) /
strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} else {
output_shape[1] = static_cast<index_t>(std::ceil(1.0 * (input_shape[1] + padding_size[0]
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1);
output_shape[2] = static_cast<index_t>(std::ceil(1.0 * (input_shape[2] + padding_size[1]
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1);
output_shape[1] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
(filter_shape[0] - 1) * (dilations[0] - 1)) /
strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
}
output_shape[3] = filter_shape[2];
}
void CalPaddingSize(const index_t *input_shape, // NCHW
......
......@@ -15,7 +15,7 @@ enum Padding {
FULL = 2, // Pads with one less than the filter size on both sides
};
enum RoundType{
enum RoundType {
FLOOR = 0,
CEIL = 1,
};
......
......@@ -10,9 +10,9 @@
#endif
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
......@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit){}
relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_type_;
......@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
auto input_shape = fake_filter_shape;
output->Resize(output_shape);
......
......@@ -5,13 +5,13 @@
#define MACE_KERNELS_ELTWISE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
enum EltwiseType{
enum EltwiseType {
PROD = 0,
SUM = 1,
MAX = 2,
......@@ -19,8 +19,7 @@ enum EltwiseType{
};
struct EltwiseFunctorBase {
EltwiseFunctorBase(const EltwiseType type,
const std::vector<float> &coeff)
EltwiseFunctorBase(const EltwiseType type, const std::vector<float> &coeff)
: type_(type), coeff_(coeff) {}
EltwiseType type_;
......@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff)
EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0,
......@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
switch (type_) {
case PROD:
#pragma omp parallel for
for(index_t i = 0; i < size; ++i) {
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = input0_ptr[i] * input1_ptr[i];
}
break;
......@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} else {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
output_ptr[i] =
coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
}
}
break;
case MAX:
#pragma omp parallel for
for(index_t i = 0; i < size; ++i) {
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]);
}
break;
case MIN:
#pragma omp parallel for
for(index_t i = 0; i < size; ++i) {
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]);
}
break;
......@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
}
};
template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T>: EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff)
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0,
......
......@@ -6,8 +6,8 @@
#define MACE_KERNELS_FULLY_CONNECTED_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
namespace mace {
......@@ -16,25 +16,23 @@ namespace kernels {
struct FullyConnectedBase {
FullyConnectedBase(const ActivationType activation,
const float relux_max_limit)
: activation_(activation),
relux_max_limit_(relux_max_limit){}
: activation_(activation), relux_max_limit_(relux_max_limit) {}
const ActivationType activation_;
const float relux_max_limit_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) :
FullyConnectedBase(activation, relux_max_limit) {}
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
output->Resize(output_shape);
const index_t N = output->dim(0);
......@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
}
};
template<typename T>
template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) :
FullyConnectedBase(activation, relux_max_limit) {}
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *weight,
......
......@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape,
float *output, StatsFuture *future);
const float *input,
const index_t *input_shape,
float *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -6,20 +6,18 @@
#define MACE_KERNELS_MATMUL_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct MatMulFunctor {
void operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
C->Resize(c_shape);
const index_t N = C->dim(0);
......@@ -52,7 +50,6 @@ struct MatMulFunctor {
}
};
template <typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *A,
......
......@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < n; ++i) {
for (index_t j = 0; j < sample_size; ++j) {
const float *input_sample_ptr = input_ptr + (i * sample_size + j) * channel;
const float *input_sample_ptr =
input_ptr + (i * sample_size + j) * channel;
float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
const float *new_scale_ptr = new_scale.data();
const float *new_offset_ptr = new_offset.data();
......
......@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape_vec.data(), paddings.data());
input->shape().data(), filter->shape().data(), dilations_, strides_,
paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)(
......@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
auto output_shape = output->shape().data();
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
bias_data, output_data, output_shape);
conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
output_data, output_shape);
}
} // namespace kernels
......
......@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
int input_channels = input_shape[1];
int input_height = input_shape[2];
int input_width = input_shape[3];
int multiplier =
filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) {
for (int oc = 0; oc < output_channels; ++oc) {
......@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
int input_channels = input_shape[1];
int input_height = input_shape[2];
int input_width = input_shape[3];
int multiplier =
filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) {
......
......@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
dilations_)(
input, filter, bias, output, future);
DepthwiseConv2dFunctor<DeviceType::CPU, float>(
strides_, paddings_, dilations_)(input, filter, bias, output, future);
return;
}
......@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr, output_ptr,
output_shape);
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
output_ptr, output_shape);
}
} // namespace kernels
......
......@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ =
runtime->BuildKernel("activation", kernel_name, built_options);
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
......@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
}
......
......@@ -5,8 +5,8 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0;
for (auto input : input_tensors) {
kernel_.setArg(idx++,
*(input->opencl_image()));
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss;
ss << "addn_opencl_kernel_"
<< output_shape[0] << "_"
<< output_shape[1] << "_"
<< output_shape[2] << "_"
<< output_shape[3];
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
template
struct AddNFunctor<DeviceType::OPENCL, float>;
template struct AddNFunctor<DeviceType::OPENCL, float>;
template
struct AddNFunctor<DeviceType::OPENCL, half>;
template struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ =
runtime->BuildKernel("batch_norm", kernel_name, built_options);
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++,
*(offset->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++,
*(mean->opencl_image()));
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
......
......@@ -12,11 +12,10 @@ namespace mace {
namespace kernels {
template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
......@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template
struct BiasAddFunctor<DeviceType::OPENCL, float>;
template
struct BiasAddFunctor<DeviceType::OPENCL, half>;
template struct BiasAddFunctor<DeviceType::OPENCL, float>;
template struct BiasAddFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -9,36 +9,33 @@
namespace mace {
namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image,
StatsFuture *future) {
template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
if(type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape);
} else {
image->ResizeImage(buffer->shape(), image_shape);
}
} else {
Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
Image *image_buf = dynamic_cast<Image *>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape();
buffer->Resize(image->shape());
}
size_t gws[2] = {image_shape[0],
image_shape[1]};
size_t gws[2] = {image_shape[0], image_shape[1]};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
kernel_name =
i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
......@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break;
case IN_OUT_HEIGHT:
case WEIGHT_HEIGHT:
kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image";
kernel_name = i2b_ ? "in_out_height_image_to_buffer"
: "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
......@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image";
kernel_name = i2b_ ? "winograd_filter_image_to_buffer"
: "winograd_filter_buffer_to_image";
break;
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
......@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Global();
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options);
obfuscated_kernel_name, built_options);
uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
"buffer offset not aligned");
b2f_kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
}
if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) {
} else if (type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, 1);
......@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const std::vector<uint32_t> lws = {16, 64};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]),
nullptr, &event);
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
......
......@@ -18,8 +18,8 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#ifdef USE_PRELU
......
......@@ -5,8 +5,8 @@
#include "mace/kernels/concat.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->opencl_image())));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "concat_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
}
......@@ -97,27 +96,25 @@ static void ConcatN(cl::Kernel *kernel,
index_t input_channel_blk = input->dim(3) / 4;
chan_blk_offset += input_channel_blk;
const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "concat_n_opencl_kernel_"
<< input_channel_blk << "_"
<< width << "_"
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
}
}
template<typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count >= 2 && axis_ == 3)
<< "Concat opencl kernel only support >=2 elements with axis == 3";
<< "Concat opencl kernel only support >=2 elements with axis == 3";
const Tensor *input0 = input_list[0];
bool divisible_four = input0->dim(axis_) % 4 == 0;
......@@ -137,8 +134,9 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
}
output_shape[axis_] += input->dim(axis_);
}
MACE_CHECK(inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
MACE_CHECK(
inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output->ResizeImage(output_shape, image_shape);
......@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
default:
if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
}
else {
} else {
MACE_NOT_IMPLEMENTED;
}
}
};
template
struct ConcatFunctor<DeviceType::OPENCL, float>;
template
struct ConcatFunctor<DeviceType::OPENCL, half>;
template struct ConcatFunctor<DeviceType::OPENCL, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Tensor *output,
StatsFuture *future);
template<typename T>
template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(
cl::Kernel *kernel,
const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride,
const int *padding, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt,
Tensor *output, StatsFuture *future);
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] =
{Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
......@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), filter->shape().data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
......@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_,
relux_max_limit_, DataTypeToEnum<T>::value, output, future);
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, output, future);
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
}
}
template
struct Conv2dFunctor<DeviceType::OPENCL, float>;
template
struct Conv2dFunctor<DeviceType::OPENCL, half>;
template struct Conv2dFunctor<DeviceType::OPENCL, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
}
auto runtime = OpenCLRuntime::Global();
*kernel =
runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++,
*(input->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(idx++,
*(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++,
*(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
// FIXME handle flexable data type: half not supported
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input_height));
......@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
}
} // namespace kernels
} // namespace mace
......@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
}
auto runtime = OpenCLRuntime::Global();
*kernel =
runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++,
*(input->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(idx++,
*(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++,
*(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2)));
......
......@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
}
auto runtime = OpenCLRuntime::Global();
*kernel =
runtime->BuildKernel("conv_2d", kernel_name, built_options);
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++,
*(input->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(idx++,
*(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++,
*(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
......
......@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
if(kernel->get() == nullptr) {
if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
......@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
*kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(
idx++, *(filter->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(
idx++, *(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(
idx++, *(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<short>(input_height));
kernel->setArg(idx++, static_cast<short>(input_width));
......@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_,
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
}
......
......@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
const index_t batch = input0->dim(0);
const index_t height = input0->dim(1);
const index_t width = input0->dim(2);
......@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++,
*(input0->opencl_image()));
kernel_.setArg(idx++,
*(input1->opencl_image()));
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]);
......@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image()));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss;
ss << "eltwise_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -10,14 +10,13 @@
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
......@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("fully_connected", kernel_name, built_options);
kernel_ =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++,
*(input->opencl_image()));
kernel_.setArg(idx++,
*(weight->opencl_image()));
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++,
*(bias->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++,
*(output->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
......@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
}
const uint32_t gws[2] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(output_blocks),
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
};
const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss;
ss << "fc_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
template
struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template
struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -3,8 +3,8 @@
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -28,8 +28,9 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[3];
......@@ -47,8 +48,9 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[1]);
......@@ -115,19 +117,16 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
}
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) {
if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1], 1};
}else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] *
((shape[1] - 1) / 2) *
((shape[2] - 1) / 2);
} else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
return {16, shape[3], out_width, 1};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
return std::vector<index_t>();
}
}
......@@ -188,10 +187,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(gws[2],
kwg_size / (local_ws[0] * local_ws[1]));
local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return {
// TODO tuning these magic numbers
// TODO tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
......@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params,
Timer *timer,
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D";
MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params[3];
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, 0, i * block_size),
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
......@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
(*tuning_result)[3] = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, 0, i * block_size),
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
......@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
{kwg_size / 256, 256, 1},
{kwg_size / 512, 512, 1},
{kwg_size, 1, 1},
{1, kwg_size, 1}
};
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params,
Timer *timer,
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d";
MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params[2];
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
......@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
(*tuning_result)[2] = num_blocks;
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
......@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
return error;
};
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
lws,
params_generator,
func,
&timer);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
......@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
}
};
}
}
} // namespace kernels
......
......@@ -5,16 +5,16 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
#include "mace/utils/utils.h"
#include "mace/core/future.h"
namespace mace {
namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds
const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType {
CONV2D_FILTER = 0,
......@@ -31,7 +31,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type);
std::string DtToCLCMDDt(const DataType dt);
......@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::vector<uint32_t> &lws,
StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
......@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
}
namespace {
template<typename T>
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v;
}
template<typename T, typename... Args>
template <typename T, typename... Args>
void AppendToStream(std::stringstream *ss,
const std::string &delimiter,
T first,
......@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
}
} // namespace
template<typename... Args>
template <typename... Args>
std::string Concat(Args... args) {
std::stringstream ss;
AppendToStream(&ss, "_", args...);
......
......@@ -11,12 +11,10 @@ namespace mace {
namespace kernels {
template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
......@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++,
*(B->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
......@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
};
const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss;
ss << "matmul_opencl_kernel_"
<< C->dim(0) << "_"
<< C->dim(1) << "_"
<< C->dim(2) << "_"
<< C->dim(3);
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
template
struct MatMulFunctor<DeviceType::OPENCL, float>;
template struct MatMulFunctor<DeviceType::OPENCL, float>;
template
struct MatMulFunctor<DeviceType::OPENCL, half>;
template struct MatMulFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -11,17 +11,15 @@
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input->dim(3), input->dim(3)
};
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
......@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "pooling_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
template
struct PoolingFunctor<DeviceType::OPENCL, float>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
template struct PoolingFunctor<DeviceType::OPENCL, float>;
template struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -2,12 +2,12 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/resize_bilinear.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape,
BufferType::IN_OUT_CHANNEL,
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
......@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
......@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(out_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -6,13 +6,13 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
Tensor *output,
StatsFuture *future) {
......@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "softmax_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
template
struct SoftmaxFunctor<DeviceType::OPENCL, float>;
template
struct SoftmaxFunctor<DeviceType::OPENCL, half>;
template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -5,20 +5,21 @@
#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *space_tensor,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
const char *kernel_name = nullptr;
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
......@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options);
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
uint32_t idx = 0;
if (b2s_) {
......@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
}
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
const uint32_t gws[3] = {chan_blk,
static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << kernel_name << "_"
<< batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_"
<< batch_tensor->dim(2) << "_"
ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -11,21 +11,21 @@
namespace mace {
namespace kernels {
template<typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(), strides_.data(),
padding_type_, output_shape.data(), paddings.data());
input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
strides_.data(), padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(),
dilations_.data(), strides_.data(), RoundType::FLOOR, output_shape.data());
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
const index_t round_h = (output_shape[1] + 1) / 2;
......@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
uint32_t idx = 0;
......@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
}
const uint32_t gws[2] = {static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss;
ss << "winograd_transform_kernel_"
<< input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}
template<typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)};
template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor,
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) {
case NOOP:
......@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
}
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0;
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
if (bias != nullptr) {
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->opencl_image())));
kernel_.setArg(idx++,
*(static_cast<const cl::Image2D *>(bias->opencl_image())));
}
kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(
idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
......@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
kernel_.setArg(idx++, relux_max_limit_);
}
const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
const uint32_t gws[2] = {
static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss;
ss << "winograd_inverse_transform_kernel_"
<< input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}
template
struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template
struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -7,9 +7,9 @@
#include <limits>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
......@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
const int *dilations_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
......@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding_type,
paddings, dilations) {}
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels_[0], kernels_[1], input_tensor->dim(3), input_tensor->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
input_tensor->shape().data(), filter_shape.data(), dilations_,
strides_, padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
}
output_tensor->Resize(output_shape);
......@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
res = std::max(res, input[input_offset]);
}
}
......@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0;
int block_size = 0;
......@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset];
block_size += 1;
}
......@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
};
template<>
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future);
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);
template<typename T>
template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
......@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding_type,
paddings, dilations) {}
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future);
......
......@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESHAPE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
......@@ -25,7 +25,6 @@ struct ReshapeFunctor {
}
};
} // namespace kernels
} // namespace mace
......
......@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
......@@ -74,9 +74,9 @@ void ResizeImage(const T *images,
const T *batch_input_ptr = images + in_batch_num_values * b;
T *batch_output_ptr = output + out_batch_num_values * b;
const T *y_lower_input_ptr =
batch_input_ptr + ys[y].lower * in_width * channels;
batch_input_ptr + ys[y].lower * in_width * channels;
const T *y_upper_input_ptr =
batch_input_ptr + ys[y].upper * in_width * channels;
batch_input_ptr + ys[y].upper * in_width * channels;
T *y_output_ptr = batch_output_ptr + y * out_width * channels;
const float ys_lerp = ys[y].lerp;
......@@ -95,7 +95,7 @@ void ResizeImage(const T *images,
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
bottom_right, xs_lerp, ys_lerp);
}
}
}
......@@ -107,10 +107,10 @@ struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners) {
MACE_CHECK(size.size() == 2);
out_height_ = size[0];
out_width_ = size[1];
}
MACE_CHECK(size.size() == 2);
out_height_ = size[0];
out_width_ = size[1];
}
protected:
bool align_corners_;
......@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
}
};
template<typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
template <typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
: ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
......
......@@ -6,9 +6,9 @@
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
......@@ -16,11 +16,10 @@ namespace kernels {
struct SpaceToBatchFunctorBase {
SpaceToBatchFunctorBase(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s):
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s)
{}
bool b2s)
: paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s) {}
std::vector<int> paddings_;
std::vector<int> block_shape_;
......@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
};
template <DeviceType D, typename T>
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){}
bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape,
......@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
};
template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){}
bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape,
......@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
StatsFuture *future);
cl::Kernel kernel_;
};
} // namespace kernels
......
......@@ -6,10 +6,10 @@
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
......@@ -17,38 +17,36 @@ namespace kernels {
struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings)
: strides_({1, 1}), dilations_({1, 1}),
padding_type_(padding_type), paddings_(paddings) {}
: strides_({1, 1}),
dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings) {}
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_;
std::vector<int> paddings_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
};
template<typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFunctorBase {
template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
};
......@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
const float relux_max_limit_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch,
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *bias,
......@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
};
template<typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase {
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
: WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch,
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *bias,
......
......@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr;
const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor);
......
......@@ -214,9 +214,7 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>(
"Alpha", {2},
{2.0, 3.0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage",
......@@ -250,7 +248,8 @@ void TestSimplePrelu() {
}
auto expected = CreateTensor<float>(
{2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
......
......@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
for (int i = 1; i < n; ++i) {
inputs[i] = this->Input(i);
MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
MACE_CHECK(inputs[0]->size() == inputs[i]->size()) << "Input 0: "
<< MakeString(inputs[0]->shape())
<< ", size: " << inputs[0]->size()
<< ". Input " << i << ": "
<< MakeString(inputs[i]->shape())
<< ", size: " << inputs[i]->size();
MACE_CHECK(inputs[0]->size() == inputs[i]->size())
<< "Input 0: " << MakeString(inputs[0]->shape())
<< ", size: " << inputs[0]->size() << ". Input " << i << ": "
<< MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
}
functor_(inputs, output_tensor, future);
......
......@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
OpsTestNet net;
// Add input data
for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(),
{n, h, w, c});
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
}
if (D == DeviceType::OPENCL) {
......
......@@ -76,7 +76,7 @@ static void BatchNorm(
static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -12,15 +12,14 @@
namespace mace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> {
public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(
OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
true) {}
functor_(OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
true) {}
bool Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT);
......@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor), future);
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
future);
return true;
}
......@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
Tensor *output,
index_t *output_shape) {
auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
auto block_shape =
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D");
......@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
const index_t block_dims = block_shape.size();
index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1");
MACE_CHECK(block_shape[block_dim] > 1,
"block_shape's value should be great to 1");
const index_t block_shape_value = block_shape[block_dim];
const index_t cropped_input_size = input_tensor->dim(block_dim + 1) * block_shape_value
- crops[block_dim * 2]
- crops[block_dim * 2 + 1];
MACE_CHECK(cropped_input_size >= 0,
"cropped size must be non-negative");
const index_t cropped_input_size =
input_tensor->dim(block_dim + 1) * block_shape_value -
crops[block_dim * 2] - crops[block_dim * 2 + 1];
MACE_CHECK(cropped_input_size >= 0, "cropped size must be non-negative");
block_shape_product *= block_shape_value;
output_shape[block_dim + 1] = cropped_input_size;
}
......
......@@ -41,7 +41,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
......
......@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -11,16 +11,17 @@
namespace mace {
template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> {
class BufferToImageOp : public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws) {}
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
kernels::BufferType type =
static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output, future);
......
......@@ -7,8 +7,9 @@
using namespace mace;
template<DeviceType D, typename T>
void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
......@@ -34,7 +35,8 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
net.RunOp(D);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-5);
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
1e-5);
}
TEST(BufferToImageTest, ArgSmall) {
......@@ -54,51 +56,63 @@ TEST(BufferToImageTest, ArgLarge) {
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 1});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 3});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 2, 3, 3});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 64, 64, 256});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 3, 5});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{1, 1, 3, 5});
}
TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 13, 17});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{1, 1, 13, 17});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 128, 512});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{1, 1, 128, 512});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 3, 5});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{3, 3, 3, 5});
}
TEST(BufferToImageTest, Filter3x3Meida) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 13, 17});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{3, 3, 13, 17});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 128, 256});
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{3, 3, 128, 256});
}
template<DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
......@@ -123,14 +137,16 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t>
net.RunOp(D);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
1e-2);
}
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
{11});
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type,
const std::vector<index_t> &input_shape,
const unsigned char *input_data) {
......@@ -142,9 +158,10 @@ void TestStringHalfBidirectionTransform(const int type,
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
const half *h_data = reinterpret_cast<const half*>(input_data);
const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape, std::vector<half>(h_data, h_data+2));
net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2));
// Run
net.RunOp(D);
......@@ -160,12 +177,14 @@ void TestStringHalfBidirectionTransform(const int type,
net.RunOp(D);
// Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
1e-2);
}
TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {0xCD, 0x3C, 0x33, 0x40,};
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
{2},
input_data);
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
kernels::ARGUMENT, {2}, input_data);
}
......@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
input->shape()[1]);
output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(),
output->mutable_data<T>(), future);
functor_(input->data<T>(), input->shape().data(), output->mutable_data<T>(),
future);
return true;
}
......
......@@ -41,7 +41,7 @@ static void ChannelShuffle(
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \
} \
......
......@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
public:
ConcatOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
MACE_CHECK(this->InputSize() >= 2)
<< "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
const int32_t input_dims = input_list[0]->dim_size();
......
......@@ -37,11 +37,10 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
}
}
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1( \
int iters) { \
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \
} \
BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
BM_CONCAT_CPU_MACRO(0, 1000);
......@@ -90,13 +89,11 @@ static void OpenclConcatHelper(int iters,
}
}
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE( \
int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
......
......@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
concat_axis_size += input_shapes[i][axis];
GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::CPU, float>(
MakeString("Input", i), input_shapes[i], inputs[i]);
net.AddInputFromArray<DeviceType::CPU, float>(MakeString("Input", i),
input_shapes[i], inputs[i]);
}
// Run
......@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
}
TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 32},
{3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
OpenclRandomTest<float>(
{{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
}
\ No newline at end of file
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/ops/conv_2d.h"
#include <fstream>
#include "mace/ops/ops_test_util.h"
using namespace mace;
......@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
template <DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int stride) {
static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
const int stride) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
}
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
2);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7},
1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7},
2);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17},
3);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17},
4);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
}
template<DeviceType D>
template <DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
......@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 1, 32, 64},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{3, 3, 32, 64},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{15, 1, 256, 2},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{1, 15, 256, 2},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 64},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{1, 1, 5, 7},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
{3, 3, 5, 7},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7},
{1, 1});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
{5, 5, 16, 16},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
{7, 7, 16, 16},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67},
{7, 7, 16, 16},
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16},
{4, 4});
}
template<DeviceType D, typename T>
static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) {
template <DeviceType D, typename T>
static void TestDilationConvNxN(const std::vector<index_t> &shape,
const int dilation_rate) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
......@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
}
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
2);
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
}
TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
4);
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
}
TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
4);
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
}
template<DeviceType D, typename T>
static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std::vector<int> &paddings) {
template <DeviceType D, typename T>
static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
srand(time(NULL));
......@@ -698,9 +686,12 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -714,7 +705,8 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
......@@ -726,8 +718,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
}
TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
{1, 1});
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
}
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
......@@ -736,6 +727,5 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
}
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
{4, 4});
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
}
......@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
functor_(static_cast<kernels::EltwiseType>(
OperatorBase::GetSingleArgument<int>(
"type", static_cast<int>(kernels::EltwiseType::SUM))),
OperatorBase::GetRepeatedArgument<float>("coeff")){}
OperatorBase::GetRepeatedArgument<float>("coeff")) {}
bool Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0);
const Tensor *input1 = this->Input(1);
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape";
for(int i = 0; i < input0->dim_size(); ++i) {
MACE_CHECK(input0->dim(i) == input1->dim(i)) << "Inputs of Eltwise op must be same shape";
MACE_CHECK(input0->dim_size() == input1->dim_size())
<< "Inputs of Eltwise op must be same shape";
for (int i = 0; i < input0->dim_size(); ++i) {
MACE_CHECK(input0->dim(i) == input1->dim(i))
<< "Inputs of Eltwise op must be same shape";
}
output->ResizeLike(input0);
......
......@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
......@@ -2,15 +2,15 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/eltwise.h"
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/eltwise.h"
namespace mace {
class EltwiseOpTest : public OpsTestBase {};
template<DeviceType D>
template <DeviceType D>
void Simple(const kernels::EltwiseType type,
const std::vector<index_t> &shape,
const std::vector<float> &input0,
......@@ -36,8 +36,10 @@ void Simple(const kernels::EltwiseType type,
// Run
net.RunOp(D);
} else {
BufferToImage<D, half>(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Input2", "InputImg2",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg1")
.Input("InputImg2")
......@@ -49,7 +51,8 @@ void Simple(const kernels::EltwiseType type,
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, float>(net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
auto expected = CreateTensor<float>(shape, output);
......@@ -58,64 +61,42 @@ void Simple(const kernels::EltwiseType type,
}
TEST_F(EltwiseOpTest, CPUSimple) {
Simple<DeviceType::CPU>(kernels::EltwiseType::PROD,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36});
Simple<DeviceType::CPU>(kernels::EltwiseType::SUM,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12});
Simple<DeviceType::CPU>(kernels::EltwiseType::SUM,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{3, 6, 9, 12, 15, 18},
{2, 1});
Simple<DeviceType::CPU>(kernels::EltwiseType::MAX,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{3, 6, 9, 12, 15, 18}, {2, 1});
Simple<DeviceType::CPU>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{1, 2, 3, 4, 6, 6});
Simple<DeviceType::CPU>(kernels::EltwiseType::MIN,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
Simple<DeviceType::CPU>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{1, 1, 3, 3, 5, 6});
}
TEST_F(EltwiseOpTest, GPUSimple) {
Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{3, 6, 9, 12, 15, 18},
{2, 1});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{3, 6, 9, 12, 15, 18}, {2, 1});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{1, 2, 3, 4, 6, 6});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{1, 1, 3, 3, 5, 6});
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void RandomTest(const kernels::EltwiseType type,
const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
......@@ -139,8 +120,10 @@ void RandomTest(const kernels::EltwiseType type,
// Run
net.RunOp();
BufferToImage<D, T>(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Input2", "InputImg2",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg1")
.Input("InputImg2")
......@@ -153,12 +136,15 @@ void RandomTest(const kernels::EltwiseType type,
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-3);
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3);
} else {
ExpectTensorNear<float>(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-1);
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-1);
}
}
......
......@@ -7,25 +7,22 @@
namespace mace {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry,
OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry,
OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry,
OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>);
}
......
......@@ -17,7 +17,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
std::vector<float> &scale,
std::vector<float> &offset) {
size_t size = gamma.size();
for (int i = 0 ; i < size; ++i) {
for (int i = 0; i < size; ++i) {
scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
offset[i] = offset[i] - mean[i] * scale[i];
}
......
......@@ -15,11 +15,10 @@ class FullyConnectedOp : public Operator<D, T> {
public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_(
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
functor_(kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -17,16 +17,17 @@ static void FCBenchmark(
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channel});
net.AddRandomInput<D, float>("Weight", {out_channel, height * width * channel});
net.AddRandomInput<D, float>("Weight",
{out_channel, height * width * channel});
net.AddRandomInput<D, float>("Bias", {out_channel});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_HEIGHT);
kernels::BufferType::WEIGHT_HEIGHT);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
.Input("InputImage")
......@@ -57,14 +58,17 @@ static void FCBenchmark(
net.Sync();
}
#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
#define BM_FC(N, H, W, C, OC) \
......
......@@ -10,7 +10,7 @@ namespace mace {
class FullyConnectedOpTest : public OpsTestBase {};
template<DeviceType D>
template <DeviceType D>
void Simple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_value,
const std::vector<index_t> &weight_shape,
......@@ -58,83 +58,52 @@ void Simple(const std::vector<index_t> &input_shape,
}
// Check
auto expected =
CreateTensor<float>(output_shape, output_value);
auto expected = CreateTensor<float>(output_shape, output_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(FullyConnectedOpTest, SimpleCPU) {
Simple<DeviceType::CPU>({1, 2, 2, 2},
{1, 2, 3, 4, 5, 6, 7, 8},
{1, 8},
{1, 2, 3, 4, 5, 6, 7, 8},
{1}, {2},
{1, 1, 1, 1}, {206});
Simple<DeviceType::CPU>({1, 1, 2, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3},
{1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::CPU>({1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{5, 6},
{1, 2, 3, 4, 5, 6,
10, 20, 30, 40, 50, 60,
1, 2, 3, 4, 5, 6,
10, 20, 30, 40, 50, 60,
1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5},
{1, 1, 1, 5}, {92, 912, 94, 914, 96});
Simple<DeviceType::CPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206});
Simple<DeviceType::CPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::CPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
}
TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
Simple<DeviceType::CPU>({2, 1, 2, 2},
{1, 2, 3, 4, 5, 6, 7, 8},
{1, 4},
{1, 2, 3, 4},
{1}, {2},
{2, 1, 1, 1}, {32, 72});
Simple<DeviceType::CPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
{1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}
TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>({1, 2, 2, 2},
{1, 2, 3, 4, 5, 6, 7, 8},
{1, 8},
{1, 2, 3, 4, 5, 6, 7, 8},
{1}, {2},
{1, 1, 1, 1}, {206});
Simple<DeviceType::OPENCL>({1, 1, 2, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3},
{1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::OPENCL>({1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{5, 6},
{1, 2, 3, 4, 5, 6,
10, 20, 30, 40, 50, 60,
1, 2, 3, 4, 5, 6,
10, 20, 30, 40, 50, 60,
1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5},
{1, 1, 1, 5}, {92, 912, 94, 914, 96});
Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206});
Simple<DeviceType::OPENCL>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::OPENCL>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
{5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
}
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::OPENCL>({2, 1, 2, 2},
{1, 2, 3, 4, 5, 6, 7, 8},
{1, 4},
{1, 2, 3, 4},
{1}, {2},
{2, 1, 1, 1}, {32, 72});
Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
{1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}
template<typename T>
template <typename T>
void Complex(const index_t batch,
const index_t height,
const index_t width,
......@@ -156,8 +125,7 @@ void Complex(const index_t batch,
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>(
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>(
"Bias", {out_channel});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
// run cpu
net.RunOp();
......@@ -215,6 +183,4 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(16, 13, 12, 31, 113);
Complex<half>(31, 21, 11, 23, 103);
}
}
......@@ -511,8 +511,9 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64});
}
template<DeviceType D, typename T>
static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) {
template <DeviceType D, typename T>
static void TestAtrousConvNxN(const std::vector<index_t> &shape,
const int dilation) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -550,9 +551,12 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
......@@ -567,7 +571,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
......@@ -591,7 +596,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
}
template<DeviceType D>
template <DeviceType D>
static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
......@@ -620,7 +625,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
......@@ -632,9 +638,12 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
......@@ -649,7 +658,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
};
......@@ -658,13 +668,11 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
}
TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32},
{7, 7, 3, 16},
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
{2, 2});
}
TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71},
{15, 15, 16, 16},
TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
{2, 2});
}
//DMACE_ENABLE_NEON
// DMACE_ENABLE_NEON
// Copyright (c) 2017 XiaoMi All rights reserved.
//
......
......@@ -40,13 +40,13 @@ static void GlobalAvgPooling(
static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
GlobalAvgPooling<DEVICE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE)
#define BM_GLOBAL_AVG_POOLING(N, C, H, W) \
#define BM_GLOBAL_AVG_POOLING(N, C, H, W) \
BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, CPU);
// BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, NEON);
......
......@@ -11,17 +11,18 @@
namespace mace {
template <DeviceType D, typename T>
class ImageToBufferOp: public Operator<D, T> {
class ImageToBufferOp : public Operator<D, T> {
public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
kernels::BufferType type =
static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor), future);
return true;
}
......
......@@ -24,8 +24,8 @@ class MatMulOp : public Operator<D, T> {
<< "The dimension of A and B should be 4";
MACE_CHECK(A->dim(0) == B->dim(0)) << "A and B must have same batch size";
MACE_CHECK(A->dim(2) == B->dim(1))
<< "the number of A's column " << A->dim(2)
<< " must be equal to B's row " << B->dim(1);
<< "the number of A's column " << A->dim(2)
<< " must be equal to B's row " << B->dim(1);
functor_(A, B, C, future);
return true;
......
......@@ -10,7 +10,7 @@ namespace mace {
class MatMulOpTest : public OpsTestBase {};
template<DeviceType D>
template <DeviceType D>
void Simple(const std::vector<index_t> &A_shape,
const std::vector<float> &A_value,
const std::vector<index_t> &B_shape,
......@@ -51,29 +51,24 @@ void Simple(const std::vector<index_t> &A_shape,
}
// Check
auto expected =
CreateTensor<float>(C_shape, C_value);
auto expected = CreateTensor<float>(C_shape, C_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(MatMulOpTest, SimpleCPU) {
Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6},
{1, 3, 2, 1}, {1, 2, 3, 4, 5, 6},
{1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::CPU>({1, 5, 5, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1},
{215, 230, 245, 260, 275, 490, 530, 570, 610, 650,
765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400,
1315, 1430, 1545, 1660, 1775});
Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::CPU>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
......@@ -81,19 +76,17 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
}
TEST_F(MatMulOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6},
{1, 3, 2, 1}, {1, 2, 3, 4, 5, 6},
{1, 2, 2, 1}, {22, 28, 49, 64});
Simple<DeviceType::OPENCL>({1, 5, 5, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1},
{215, 230, 245, 260, 275, 490, 530, 570, 610, 650,
765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400,
1315, 1430, 1545, 1660, 1775});
Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
{1, 2, 3, 4, 5, 6}, {1, 2, 2, 1},
{22, 28, 49, 64});
Simple<DeviceType::OPENCL>(
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
{1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
......@@ -118,8 +111,8 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"A", {batch, height, channels, 1});
net.AddRandomInput<DeviceType::OPENCL, float>("A",
{batch, height, channels, 1});
net.AddRandomInput<DeviceType::OPENCL, float>(
"B", {batch, channels, out_width, 1});
......@@ -132,9 +125,9 @@ void Complex(const index_t batch,
// Run on opencl
BufferToImage<DeviceType::OPENCL, T>(net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<DeviceType::OPENCL, T>(net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulTest")
.Input("AImage")
......@@ -177,5 +170,4 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(16, 32, 64, 64);
Complex<half>(31, 31, 61, 67);
}
}
......@@ -95,7 +95,7 @@ class OpDefBuilder {
class OpsTestNet {
public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {};
OpsTestNet() : op_registry_(new OperatorRegistry()){};
template <DeviceType D, typename T>
void AddInputFromArray(const std::string &name,
......@@ -239,7 +239,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
template <typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) {
const std::vector<T> &data) {
std::unique_ptr<Tensor> res(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape);
......@@ -334,9 +334,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) {
EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
<< n << ", " << h << ", "
<< w << ", " << c << "]";
EXPECT_NEAR(*a, *b, abs_err) << "with index = [" << n << ", " << h
<< ", " << w << ", " << c << "]";
a++;
b++;
}
......
......@@ -20,8 +20,12 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
pooling_type_(
static_cast<PoolingType>(OperatorBase::GetSingleArgument<int>(
"pooling_type", static_cast<int>(AVG)))),
functor_(pooling_type_, kernels_.data(), this->strides_.data(),
this->padding_type_, this->paddings_, this->dilations_.data()){};
functor_(pooling_type_,
kernels_.data(),
this->strides_.data(),
this->padding_type_,
this->paddings_,
this->dilations_.data()){};
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -54,7 +54,7 @@ static void Pooling(int iters,
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
......@@ -62,7 +62,7 @@ static void Pooling(int iters,
BENCHMARK( \
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE)
#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU);
// BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, NEON);
......
......@@ -198,7 +198,8 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
......@@ -333,7 +334,8 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
......
......@@ -15,7 +15,7 @@ class ReshapeOp : public Operator<D, T> {
public:
ReshapeOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")){}
shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......@@ -38,9 +38,11 @@ class ReshapeOp : public Operator<D, T> {
}
if (unknown_idx != -1) {
MACE_CHECK(product != 0) << "Cannot infer shape if there is zero shape size.";
MACE_CHECK(product != 0)
<< "Cannot infer shape if there is zero shape size.";
const index_t missing = input->size() / product;
MACE_CHECK(missing * product == input->size()) << "Input size not match reshaped tensor size";
MACE_CHECK(missing * product == input->size())
<< "Input size not match reshaped tensor size";
out_shape[unknown_idx] = missing;
}
......
......@@ -13,7 +13,6 @@ class ReshapeTest : public OpsTestBase {};
void TestReshape(const std::vector<index_t> &org_shape,
const std::vector<int> &output_shape,
const std::vector<index_t> &res_shape) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Reshape", "ReshapeTest")
......
......@@ -20,9 +20,9 @@ void Register_Softmax(OperatorRegistry *op_registry) {
SoftmaxOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::OPENCL, half>);
}
......
......@@ -14,11 +14,10 @@ template <DeviceType D, class T>
class SoftmaxOp : public Operator<D, T> {
public:
SoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
}
: Operator<D, T>(operator_def, ws) {}
bool Run(StatsFuture *future) override {
const Tensor *logits= this->Input(LOGITS);
const Tensor *logits = this->Input(LOGITS);
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(logits);
......
......@@ -14,7 +14,8 @@ void Simple() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4}, {1, 1, 1, 1, 1, 2, 3, 4});
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage",
......@@ -41,18 +42,15 @@ void Simple() {
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 2, 4}, {0.25, 0.25, 0.25, 0.25,
0.0320586, 0.08714432, 0.23688282, 0.64391426});
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
}
TEST_F(SoftmaxOpTest, CPUSimple) {
Simple<DeviceType::CPU>();
}
TEST_F(SoftmaxOpTest, OPENCLSimple) {
Simple<DeviceType::OPENCL>();
}
TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
template <DeviceType D>
void Complex(const std::vector<index_t> &logits_shape) {
......
......@@ -12,7 +12,7 @@
namespace mace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
class SpaceToBatchNDOp : public Operator<D, T> {
public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
......@@ -28,17 +28,19 @@ class SpaceToBatchNDOp : public Operator<D, T> {
std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(space_tensor, batch_tensor, output_shape.data());
functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor, future);
functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor,
future);
return true;
}
private:
inline void CalculateOutputShape(const Tensor *input_tensor,
Tensor *output,
index_t *output_shape) {
auto paddings = OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
auto paddings =
OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
auto block_shape =
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
......@@ -46,13 +48,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
const index_t block_dims = block_shape.size();
index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1");
MACE_CHECK(block_shape[block_dim] > 1,
"block_shape's value should be great to 1");
const index_t block_shape_value = block_shape[block_dim];
const index_t padded_input_size = input_tensor->dim(block_dim + 1)
+ paddings[block_dim * 2]
+ paddings[block_dim * 2 + 1];
MACE_CHECK(padded_input_size % block_shape_value == 0,
"padded input ", padded_input_size, " is not divisible by block_shape");
const index_t padded_input_size = input_tensor->dim(block_dim + 1) +
paddings[block_dim * 2] +
paddings[block_dim * 2 + 1];
MACE_CHECK(padded_input_size % block_shape_value == 0, "padded input ",
padded_input_size, " is not divisible by block_shape");
block_shape_product *= block_shape_value;
output_shape[block_dim + 1] = padded_input_size / block_shape_value;
}
......
......@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
......
......@@ -4,8 +4,8 @@
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -21,7 +21,9 @@ void TransposeFilter(const std::vector<float> &input,
for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t oc = 0; oc < input_shape[2]; ++oc) {
for (index_t ic = 0; ic < input_shape[3]; ++ic) {
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) * input_shape[1] + w;
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
input_shape[1] +
w;
output[offset] = *input_ptr;
++input_ptr;
}
......@@ -30,7 +32,7 @@ void TransposeFilter(const std::vector<float> &input,
}
}
template<DeviceType D, typename T>
template <DeviceType D, typename T>
void WinogradConvolution(const index_t batch,
const index_t height,
const index_t width,
......@@ -53,8 +55,7 @@ void WinogradConvolution(const index_t batch,
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -78,8 +79,10 @@ void WinogradConvolution(const index_t batch,
// transform filter
std::vector<float> wino_filter_data;
TransposeFilter(filter_data, filter_shape, wino_filter_data);
net.AddInputFromArray<D, float>("WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter", kernels::BufferType::WINOGRAD_FILTER);
net.AddInputFromArray<D, float>(
"WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
......@@ -126,18 +129,23 @@ void WinogradConvolution(const index_t batch,
}
TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::SAME);
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
Padding::SAME);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37, Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31, Padding::SAME);
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37,
Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31,
Padding::SAME);
}
TEST_F(WinogradConvlutionTest, BatchConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32, Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31, Padding::SAME);
WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32,
Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
Padding::SAME);
}
}
......@@ -8,12 +8,12 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/winograd_transform.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/winograd_transform.h"
namespace mace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
class WinogradInverseTransformOp : public Operator<D, T> {
public:
WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws)
......
......@@ -12,14 +12,14 @@
namespace mace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
class WinogradTransformOp : public Operator<D, T> {
public:
WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(static_cast<Padding>(OperatorBase::GetSingleArgument<int>(
"padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgument<int>("padding_values")) {}
"padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgument<int>("padding_values")) {}
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -16,7 +16,7 @@ static void BMWinogradTransform(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("OutputImage")
......@@ -36,17 +36,15 @@ static void BMWinogradTransform(
net.Sync();
}
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK( \
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
......@@ -88,16 +86,16 @@ static void BMWinogradInverseTransform(
net.Sync();
}
#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK( \
#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK( \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
......
......@@ -6,10 +6,10 @@
#define MACE_CORE_MACE_H_
#include <cstdint>
#include <vector>
#include <string>
#include <memory>
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace mace {
......@@ -25,13 +25,11 @@ namespace mace {
#define MACE_STR(x) MACE_STR_HELPER(x)
// e.g. "0.5.0" or "0.6.0-alpha".
#define MACE_VERSION_STRING \
#define MACE_VERSION_STRING \
(MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \
MACE_PATCH_VERSION) MACE_VERSION_SUFFIX)
inline const char *MaceVersion() {
return MACE_VERSION_STRING;
}
inline const char *MaceVersion() { return MACE_VERSION_STRING; }
extern const char *MaceGitVersion();
......@@ -43,17 +41,9 @@ extern const char *MaceGitVersion();
classname &operator=(const classname &) = delete
#endif
enum NetMode {
INIT = 0,
NORMAL = 1
};
enum NetMode { INIT = 0, NORMAL = 1 };
enum DeviceType {
CPU = 0,
NEON = 1,
OPENCL = 2,
HEXAGON = 3
};
enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
enum DataType {
DT_INVALID = 0,
......@@ -104,6 +94,7 @@ class Argument {
public:
Argument();
void CopyFrom(const Argument &from);
public:
const std::string &name() const;
void set_name(const std::string &value);
......@@ -147,11 +138,13 @@ class NodeInput {
NodeInput() {}
NodeInput(int node_id, int output_port);
void CopyFrom(const NodeInput &from);
public:
int node_id() const;
void set_node_id(int node_id);
int output_port() const;
void set_output_port(int output_port);
private:
int node_id_;
int output_port_;
......@@ -162,8 +155,10 @@ class OutputShape {
OutputShape();
OutputShape(const std::vector<int64_t> &dims);
void CopyFrom(const OutputShape &from);
public:
const std::vector<int64_t> &dims() const;
private:
std::vector<int64_t> dims_;
};
......@@ -240,10 +235,12 @@ class OperatorDef {
class MemoryBlock {
public:
MemoryBlock(int mem_id, uint32_t x, uint32_t y);
public:
int mem_id() const;
uint32_t x() const;
uint32_t y() const;
private:
int mem_id_;
uint32_t x_;
......@@ -255,9 +252,9 @@ class MemoryArena {
const std::vector<MemoryBlock> &mem_block() const;
std::vector<MemoryBlock> &mutable_mem_block();
int mem_block_size() const;
private:
std::vector<MemoryBlock> mem_block_;
};
// for hexagon mace-nnlib
......@@ -268,10 +265,11 @@ class InputInfo {
int32_t max_byte_size() const;
DataType data_type() const;
const std::vector<int32_t> &dims() const;
private:
std::string name_;
int32_t node_id_;
int32_t max_byte_size_; // only support 32-bit len
int32_t max_byte_size_; // only support 32-bit len
DataType data_type_;
std::vector<int32_t> dims_;
};
......@@ -285,10 +283,11 @@ class OutputInfo {
void set_data_type(DataType data_type);
const std::vector<int32_t> &dims() const;
void set_dims(const std::vector<int32_t> &dims);
private:
std::string name_;
int32_t node_id_;
int32_t max_byte_size_; // only support 32-bit len
int32_t max_byte_size_; // only support 32-bit len
DataType data_type_;
std::vector<int32_t> dims_;
};
......@@ -299,6 +298,7 @@ class NetDef {
int op_size() const;
const OperatorDef &op(const int idx) const;
public:
const std::string &name() const;
bool has_name() const;
......@@ -359,7 +359,6 @@ struct RunMetadata {
std::vector<OperatorStats> op_stats;
};
class Workspace;
class NetBase;
class OperatorRegistry;
......@@ -374,8 +373,7 @@ struct MaceInputInfo {
class MaceEngine {
public:
// Single input and output
explicit MaceEngine(const NetDef *net_def,
DeviceType device_type);
explicit MaceEngine(const NetDef *net_def, DeviceType device_type);
// Multiple input or output
explicit MaceEngine(const NetDef *net_def,
DeviceType device_type,
......@@ -394,7 +392,7 @@ class MaceEngine {
// Multiple input or output
bool Run(const std::vector<MaceInputInfo> &input,
std::map<std::string, float *> &output,
RunMetadata *run_metadata=nullptr);
RunMetadata *run_metadata = nullptr);
MaceEngine(const MaceEngine &) = delete;
MaceEngine &operator=(const MaceEngine &) = delete;
......
......@@ -45,7 +45,7 @@ class Flags {
// Return a usage message with command line cmdline, and the
// usage_text strings in flag_list[].
static std::string Usage(const std::string &cmdline,
const std::vector<Flag> &flag_list);
const std::vector<Flag> &flag_list);
};
} // namespace mace
......
......@@ -9,7 +9,6 @@
#include <sys/time.h>
#include <time.h>
namespace mace {
inline int64_t NowMicros() {
......
......@@ -10,8 +10,8 @@
#include <string>
#include <vector>
#include "mace/utils/env_time.h"
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/string_util.h"
#undef ERROR
......
......@@ -27,7 +27,7 @@ inline void MakeStringInternal(std::stringstream &ss,
MakeStringInternal(ss, args...);
}
} // namespace
} // namespace
template <typename... Args>
std::string MakeString(const Args &... args) {
......
......@@ -24,13 +24,9 @@ class WallClockTimer : public Timer {
public:
WallClockTimer() : accumulated_micros_(0) {}
void StartTiming() override {
start_micros_ = NowMicros();
}
void StartTiming() override { start_micros_ = NowMicros(); }
void StopTiming() override {
stop_micros_ = NowMicros();
}
void StopTiming() override { stop_micros_ = NowMicros(); }
void AccumulateTiming() override {
StopTiming();
......@@ -43,13 +39,9 @@ class WallClockTimer : public Timer {
accumulated_micros_ = 0;
}
double ElapsedMicros() override {
return stop_micros_ - start_micros_;
}
double ElapsedMicros() override { return stop_micros_ - start_micros_; }
double AccumulatedMicros() override {
return accumulated_micros_;
}
double AccumulatedMicros() override { return accumulated_micros_; }
private:
double start_micros_;
......
......@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
WallClockTimer timer;
std::vector<unsigned int> default_params(1, 1);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
nullptr,
TunerFunc,
&timer);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res);
default_params[0] = 2;
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
nullptr,
TunerFunc,
&timer);
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect + 1, res);
}
......@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
};
// tune
WallClockTimer timer;
int res =
Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
*params_generator,
TunerFunc,
&timer);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, *params_generator, TunerFunc, &timer);
EXPECT_EQ(expect, res);
// run
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
nullptr,
TunerFunc,
&timer);
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res);
}
......
......@@ -62,9 +62,9 @@ inline std::string ObfuscateSymbol(const std::string &src) {
if (dest.empty()) {
return dest;
}
dest[0] = src[0]; // avoid invalid symbol which starts from 0-9
dest[0] = src[0]; // avoid invalid symbol which starts from 0-9
const std::string encode_dict =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
for (size_t i = 1; i < src.size(); i++) {
char ch = src[i];
int idx;
......
......@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
WallClockTimer timer;
std::vector<unsigned int> default_params(1, 1);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
nullptr,
TunerFunc,
&timer);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res);
default_params[0] = 2;
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
nullptr,
TunerFunc,
&timer);
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect + 1, res);
}
......@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
};
// tune
WallClockTimer timer;
int res =
Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
*params_generator,
TunerFunc,
&timer);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, *params_generator, TunerFunc, &timer);
EXPECT_EQ(expect, res);
// run
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
default_params,
nullptr,
TunerFunc,
&timer);
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册