提交 6b9aafd4 编写于 作者: 叶剑武

Merge branch 'cpplint' into 'master'

Reformatting code and enable cpplint

See merge request !273
stages: stages:
- ops_test - ops_test
- ops_benchmark - ops_benchmark
- cpplint
cpplint:
stage: cpplint
only:
- master
script:
- curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
- python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc | grep -vE "half.h")
ops_test: ops_test:
stage: ops_test stage: ops_test
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#include <malloc.h> #include <malloc.h>
#include "mace/core/registry.h" #include "mace/core/registry.h"
#include "mace/public/mace.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator { ...@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
free(data); free(data);
}; };
void *Map(void *buffer, size_t offset, size_t nbytes) const override { void *Map(void *buffer, size_t offset, size_t nbytes) const override {
return (char*)buffer + offset; return (char *)buffer + offset;
} }
void *MapImage(void *buffer, void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
......
...@@ -5,9 +5,9 @@ ...@@ -5,9 +5,9 @@
#ifndef MACE_CORE_BUFFER_H_ #ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_ #define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector> #include <vector>
#include "mace/core/allocator.h"
#include "mace/core/types.h"
namespace mace { namespace mace {
...@@ -39,23 +39,19 @@ class BufferBase { ...@@ -39,23 +39,19 @@ class BufferBase {
virtual bool OnHost() const = 0; virtual bool OnHost() const = 0;
virtual index_t offset() const { virtual index_t offset() const { return 0; };
return 0;
};
template<typename T> template <typename T>
const T *data() const { const T *data() const {
return reinterpret_cast<const T *>(raw_data()); return reinterpret_cast<const T *>(raw_data());
} }
template<typename T> template <typename T>
T *mutable_data() { T *mutable_data() {
return reinterpret_cast<T *>(raw_mutable_data()); return reinterpret_cast<T *>(raw_mutable_data());
} }
index_t size() const { index_t size() const { return size_; }
return size_;
}
protected: protected:
index_t size_; index_t size_;
...@@ -155,12 +151,10 @@ class Buffer : public BufferBase { ...@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
void Copy(void *src, index_t offset, index_t length) { void Copy(void *src, index_t offset, index_t length) {
MACE_CHECK_NOTNULL(mapped_buf_); MACE_CHECK_NOTNULL(mapped_buf_);
MACE_CHECK(length <= size_, "out of buffer"); MACE_CHECK(length <= size_, "out of buffer");
memcpy(mapped_buf_, (char *) src + offset, length); memcpy(mapped_buf_, (char *)src + offset, length);
} }
bool OnHost() const { bool OnHost() const { return allocator_->OnHost(); }
return allocator_->OnHost();
}
private: private:
Allocator *allocator_; Allocator *allocator_;
...@@ -180,9 +174,10 @@ class Image : public BufferBase { ...@@ -180,9 +174,10 @@ class Image : public BufferBase {
mapped_buf_(nullptr) {} mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type) Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(std::accumulate(shape.begin(), shape.end(), : BufferBase(
1, std::multiplies<index_t>()) std::accumulate(
* GetEnumTypeSize(data_type)), shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)), allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) { mapped_buf_(nullptr) {
shape_ = shape; shape_ = shape;
...@@ -214,9 +209,7 @@ class Image : public BufferBase { ...@@ -214,9 +209,7 @@ class Image : public BufferBase {
return mapped_buf_; return mapped_buf_;
} }
std::vector<size_t> image_shape() const { std::vector<size_t> image_shape() const { return shape_; }
return shape_;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const { void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -241,17 +234,11 @@ class Image : public BufferBase { ...@@ -241,17 +234,11 @@ class Image : public BufferBase {
mapped_buf_ = nullptr; mapped_buf_ = nullptr;
}; };
void Resize(index_t size) { void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
MACE_NOT_IMPLEMENTED;
}
bool OnHost() const { bool OnHost() const { return allocator_->OnHost(); }
return allocator_->OnHost();
}
private: private:
Allocator *allocator_; Allocator *allocator_;
...@@ -266,10 +253,7 @@ class Image : public BufferBase { ...@@ -266,10 +253,7 @@ class Image : public BufferBase {
class BufferSlice : public BufferBase { class BufferSlice : public BufferBase {
public: public:
BufferSlice() BufferSlice()
: buffer_(nullptr), : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
mapped_buf_(nullptr),
offset_(0),
length_(0) {}
BufferSlice(BufferBase *buffer, index_t offset, index_t length) BufferSlice(BufferBase *buffer, index_t offset, index_t length)
: BufferBase(buffer->size()), : BufferBase(buffer->size()),
buffer_(buffer), buffer_(buffer),
...@@ -277,17 +261,11 @@ class BufferSlice : public BufferBase { ...@@ -277,17 +261,11 @@ class BufferSlice : public BufferBase {
offset_(offset), offset_(offset),
length_(length) { length_(length) {
MACE_CHECK(offset >= 0, "buffer slice offset should >= 0"); MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
MACE_CHECK(offset + length <= size_, MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
"buffer slice offset + length (", offset, " + ", length, ") should <= ", size_);
offset, }
" + ", BufferSlice(const BufferSlice &other)
length, : BufferSlice(other.buffer_, other.offset_, other.length_) {}
") should <= ",
size_);
}
BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
other.offset_,
other.length_) {}
~BufferSlice() { ~BufferSlice() {
if (buffer_ != nullptr && mapped_buf_ != nullptr) { if (buffer_ != nullptr && mapped_buf_ != nullptr) {
...@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase { ...@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
const void *raw_data() const { const void *raw_data() const {
if (OnHost()) { if (OnHost()) {
MACE_CHECK_NOTNULL(buffer_); MACE_CHECK_NOTNULL(buffer_);
return (char *) buffer_->raw_data() + offset_; return (char *)buffer_->raw_data() + offset_;
} else { } else {
MACE_CHECK_NOTNULL(mapped_buf_); MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_; return mapped_buf_;
...@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase { ...@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
return nullptr; return nullptr;
} }
void UnMap(void *mapped_ptr) const { void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }
MACE_NOT_IMPLEMENTED;
}
void Map(std::vector<size_t> *pitch) { void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_); MACE_CHECK_NOTNULL(buffer_);
...@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase { ...@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
mapped_buf_ = nullptr; mapped_buf_ = nullptr;
}; };
void Resize(index_t size) { void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
MACE_NOT_IMPLEMENTED;
}
index_t offset() const { index_t offset() const { return offset_; }
return offset_;
}
bool OnHost() const { bool OnHost() const { return buffer_->OnHost(); }
return buffer_->OnHost();
}
private: private:
BufferBase *buffer_; BufferBase *buffer_;
...@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase { ...@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
index_t offset_; index_t offset_;
index_t length_; index_t length_;
}; };
} }
#endif // MACE_CORE_BUFFER_H_ #endif // MACE_CORE_BUFFER_H_
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
// //
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/types.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/types.h"
namespace mace { namespace mace {
...@@ -13,11 +13,11 @@ ConstTensor::ConstTensor(const std::string &name, ...@@ -13,11 +13,11 @@ ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data, const unsigned char *data,
const std::vector<int64_t> &dims, const std::vector<int64_t> &dims,
const DataType data_type, const DataType data_type,
uint32_t node_id) : uint32_t node_id)
name_(name), : name_(name),
data_(data), data_(data),
data_size_(std::accumulate(dims.begin(), dims.end(), 1, data_size_(std::accumulate(
std::multiplies<int64_t>())), dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()), dims_(dims.begin(), dims.end()),
data_type_(data_type), data_type_(data_type),
node_id_(node_id) {} node_id_(node_id) {}
...@@ -26,33 +26,21 @@ ConstTensor::ConstTensor(const std::string &name, ...@@ -26,33 +26,21 @@ ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data, const unsigned char *data,
const std::vector<int64_t> &dims, const std::vector<int64_t> &dims,
const int data_type, const int data_type,
uint32_t node_id) : uint32_t node_id)
name_(name), : name_(name),
data_(data), data_(data),
data_size_(std::accumulate(dims.begin(), dims.end(), 1, data_size_(std::accumulate(
std::multiplies<int64_t>())), dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()), dims_(dims.begin(), dims.end()),
data_type_(static_cast<DataType>(data_type)), data_type_(static_cast<DataType>(data_type)),
node_id_(node_id) {} node_id_(node_id) {}
const std::string &ConstTensor::name() const { const std::string &ConstTensor::name() const { return name_; }
return name_; const unsigned char *ConstTensor::data() const { return data_; }
} int64_t ConstTensor::data_size() const { return data_size_; }
const unsigned char *ConstTensor::data() const { const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
return data_; DataType ConstTensor::data_type() const { return data_type_; }
} uint32_t ConstTensor::node_id() const { return node_id_; }
int64_t ConstTensor::data_size() const {
return data_size_;
}
const std::vector<int64_t> &ConstTensor::dims() const {
return dims_;
}
DataType ConstTensor::data_type() const {
return data_type_;
}
uint32_t ConstTensor::node_id() const {
return node_id_;
}
Argument::Argument() : has_bits_(0) {} Argument::Argument() : has_bits_(0) {}
...@@ -73,74 +61,42 @@ void Argument::CopyFrom(const Argument &from) { ...@@ -73,74 +61,42 @@ void Argument::CopyFrom(const Argument &from) {
this->has_bits_ = from.has_bits_; this->has_bits_ = from.has_bits_;
} }
const std::string &Argument::name() const { const std::string &Argument::name() const { return name_; }
return name_; void Argument::set_name(const std::string &value) { name_ = value; }
} bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
void Argument::set_name(const std::string &value) { void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
name_ = value; float Argument::f() const { return f_; }
}
bool Argument::has_f() const {
return (has_bits_ & 0x00000001u) != 0;
}
void Argument::set_has_f() {
has_bits_ |= 0x00000001u;
}
float Argument::f() const {
return f_;
}
void Argument::set_f(float value) { void Argument::set_f(float value) {
set_has_f(); set_has_f();
f_ = value; f_ = value;
} }
bool Argument::has_i() const { bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
return (has_bits_ & 0x00000002u) != 0; void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
} int64_t Argument::i() const { return i_; }
void Argument::set_has_i() {
has_bits_ |= 0x00000002u;
}
int64_t Argument::i() const {
return i_;
}
void Argument::set_i(int64_t value) { void Argument::set_i(int64_t value) {
set_has_i(); set_has_i();
i_ = value; i_ = value;
} }
bool Argument::has_s() const { bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
return (has_bits_ & 0x00000004u) != 0; void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
} std::string Argument::s() const { return s_; }
void Argument::set_has_s() {
has_bits_ |= 0x00000004u;
}
std::string Argument::s() const {
return s_;
}
void Argument::set_s(const std::string &value) { void Argument::set_s(const std::string &value) {
set_has_s(); set_has_s();
s_ = value; s_ = value;
} }
const std::vector<float> &Argument::floats() const { const std::vector<float> &Argument::floats() const { return floats_; }
return floats_; void Argument::add_floats(float value) { floats_.push_back(value); }
}
void Argument::add_floats(float value) {
floats_.push_back(value);
}
void Argument::set_floats(const std::vector<float> &value) { void Argument::set_floats(const std::vector<float> &value) {
floats_.resize(value.size()); floats_.resize(value.size());
std::copy(value.begin(), value.end(), floats_.begin()); std::copy(value.begin(), value.end(), floats_.begin());
} }
const std::vector<int64_t> &Argument::ints() const { const std::vector<int64_t> &Argument::ints() const { return ints_; }
return ints_; void Argument::add_ints(int64_t value) { ints_.push_back(value); }
}
void Argument::add_ints(int64_t value) {
ints_.push_back(value);
}
void Argument::set_ints(const std::vector<int64_t> &value) { void Argument::set_ints(const std::vector<int64_t> &value) {
ints_.resize(value.size()); ints_.resize(value.size());
std::copy(value.begin(), value.end(), ints_.begin()); std::copy(value.begin(), value.end(), ints_.begin());
} }
const std::vector<std::string> &Argument::strings() const { const std::vector<std::string> &Argument::strings() const { return strings_; }
return strings_;
}
void Argument::add_strings(const ::std::string &value) { void Argument::add_strings(const ::std::string &value) {
strings_.push_back(value); strings_.push_back(value);
} }
...@@ -156,31 +112,21 @@ void NodeInput::CopyFrom(const NodeInput &from) { ...@@ -156,31 +112,21 @@ void NodeInput::CopyFrom(const NodeInput &from) {
node_id_ = from.node_id(); node_id_ = from.node_id();
output_port_ = from.output_port(); output_port_ = from.output_port();
} }
int NodeInput::node_id() const { int NodeInput::node_id() const { return node_id_; }
return node_id_; void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
} int NodeInput::output_port() const { return output_port_; }
void NodeInput::set_node_id(int node_id) { void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
node_id_ = node_id;
}
int NodeInput::output_port() const {
return output_port_;
}
void NodeInput::set_output_port(int output_port) {
output_port_ = output_port;
}
// OutputShape // OutputShape
OutputShape::OutputShape() {} OutputShape::OutputShape() {}
OutputShape::OutputShape(const std::vector<int64_t> &dims) : OutputShape::OutputShape(const std::vector<int64_t> &dims)
dims_(dims.begin(), dims.end()) {} : dims_(dims.begin(), dims.end()) {}
void OutputShape::CopyFrom(const OutputShape &from) { void OutputShape::CopyFrom(const OutputShape &from) {
auto from_dims = from.dims(); auto from_dims = from.dims();
dims_.resize(from_dims.size()); dims_.resize(from_dims.size());
std::copy(from_dims.begin(), from_dims.end(), dims_.begin()); std::copy(from_dims.begin(), from_dims.end(), dims_.begin());
} }
const std::vector<int64_t> &OutputShape::dims() const { const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
return dims_;
}
// Operator Def // Operator Def
void OperatorDef::CopyFrom(const OperatorDef &from) { void OperatorDef::CopyFrom(const OperatorDef &from) {
...@@ -220,68 +166,38 @@ void OperatorDef::CopyFrom(const OperatorDef &from) { ...@@ -220,68 +166,38 @@ void OperatorDef::CopyFrom(const OperatorDef &from) {
} }
auto from_out_max_byte_size = from.out_max_byte_size(); auto from_out_max_byte_size = from.out_max_byte_size();
out_max_byte_size_.resize(from_out_max_byte_size.size()); out_max_byte_size_.resize(from_out_max_byte_size.size());
std::copy(from_out_max_byte_size.begin(), std::copy(from_out_max_byte_size.begin(), from_out_max_byte_size.end(),
from_out_max_byte_size.end(),
out_max_byte_size_.begin()); out_max_byte_size_.begin());
has_bits_ = from.has_bits_; has_bits_ = from.has_bits_;
} }
const std::string &OperatorDef::name() const { const std::string &OperatorDef::name() const { return name_; }
return name_;
}
void OperatorDef::set_name(const std::string &name_) { void OperatorDef::set_name(const std::string &name_) {
set_has_name(); set_has_name();
OperatorDef::name_ = name_; OperatorDef::name_ = name_;
} }
bool OperatorDef::has_name() const { bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
return (has_bits_ & 0x00000001u) != 0; void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
} const std::string &OperatorDef::type() const { return type_; }
void OperatorDef::set_has_name() {
has_bits_ |= 0x00000001u;
}
const std::string &OperatorDef::type() const {
return type_;
}
void OperatorDef::set_type(const std::string &type_) { void OperatorDef::set_type(const std::string &type_) {
set_has_type(); set_has_type();
OperatorDef::type_ = type_; OperatorDef::type_ = type_;
} }
bool OperatorDef::has_type() const { bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
return (has_bits_ & 0x00000002u) != 0; void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
} int OperatorDef::mem_id() const { return mem_id_; }
void OperatorDef::set_has_type() {
has_bits_ |= 0x00000002u;
}
int OperatorDef::mem_id() const {
return mem_id_;
}
void OperatorDef::set_mem_id(const int mem_id) { void OperatorDef::set_mem_id(const int mem_id) {
set_has_mem_id(); set_has_mem_id();
mem_id_ = mem_id; mem_id_ = mem_id;
} }
bool OperatorDef::has_mem_id() const { bool OperatorDef::has_mem_id() const { return (has_bits_ & 0x00000004u) != 0; }
return (has_bits_ & 0x00000004u) != 0; void OperatorDef::set_has_mem_id() { has_bits_ |= 0x00000004u; }
} uint32_t OperatorDef::node_id() const { return node_id_; }
void OperatorDef::set_has_mem_id() { void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
has_bits_ |= 0x00000004u; uint32_t OperatorDef::op_id() const { return op_id_; }
} uint32_t OperatorDef::padding() const { return padding_; }
uint32_t OperatorDef::node_id() const { void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
return node_id_;
}
void OperatorDef::set_node_id(uint32_t node_id) {
node_id_ = node_id;
}
uint32_t OperatorDef::op_id() const {
return op_id_;
}
uint32_t OperatorDef::padding() const {
return padding_;
}
void OperatorDef::set_padding(uint32_t padding) {
padding_ = padding;
}
const std::vector<NodeInput> &OperatorDef::node_input() const { const std::vector<NodeInput> &OperatorDef::node_input() const {
return node_input_; return node_input_;
} }
...@@ -294,9 +210,7 @@ const std::vector<int> &OperatorDef::out_max_byte_size() const { ...@@ -294,9 +210,7 @@ const std::vector<int> &OperatorDef::out_max_byte_size() const {
void OperatorDef::add_out_max_byte_size(int value) { void OperatorDef::add_out_max_byte_size(int value) {
out_max_byte_size_.push_back(value); out_max_byte_size_.push_back(value);
} }
const std::vector<std::string> &OperatorDef::input() const { const std::vector<std::string> &OperatorDef::input() const { return input_; }
return input_;
}
const std::string &OperatorDef::input(int index) const { const std::string &OperatorDef::input(int index) const {
MACE_CHECK(0 <= index && index <= input_.size()); MACE_CHECK(0 <= index && index <= input_.size());
return input_[index]; return input_[index];
...@@ -308,16 +222,12 @@ std::string *OperatorDef::add_input() { ...@@ -308,16 +222,12 @@ std::string *OperatorDef::add_input() {
void OperatorDef::add_input(const ::std::string &value) { void OperatorDef::add_input(const ::std::string &value) {
input_.push_back(value); input_.push_back(value);
} }
void OperatorDef::add_input(::std::string &&value) { void OperatorDef::add_input(::std::string &&value) { input_.push_back(value); }
input_.push_back(value);
}
void OperatorDef::set_input(const std::vector<std::string> &value) { void OperatorDef::set_input(const std::vector<std::string> &value) {
input_.resize(value.size()); input_.resize(value.size());
std::copy(value.begin(), value.end(), input_.begin()); std::copy(value.begin(), value.end(), input_.begin());
} }
const std::vector<std::string> &OperatorDef::output() const { const std::vector<std::string> &OperatorDef::output() const { return output_; }
return output_;
}
const std::string &OperatorDef::output(int index) const { const std::string &OperatorDef::output(int index) const {
MACE_CHECK(0 <= index && index <= output_.size()); MACE_CHECK(0 <= index && index <= output_.size());
return output_[index]; return output_[index];
...@@ -336,9 +246,7 @@ void OperatorDef::set_output(const std::vector<std::string> &value) { ...@@ -336,9 +246,7 @@ void OperatorDef::set_output(const std::vector<std::string> &value) {
output_.resize(value.size()); output_.resize(value.size());
std::copy(value.begin(), value.end(), output_.begin()); std::copy(value.begin(), value.end(), output_.begin());
} }
const std::vector<Argument> &OperatorDef::arg() const { const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
return arg_;
}
Argument *OperatorDef::add_arg() { Argument *OperatorDef::add_arg() {
arg_.emplace_back(Argument()); arg_.emplace_back(Argument());
return &arg_.back(); return &arg_.back();
...@@ -358,18 +266,12 @@ void OperatorDef::set_output_type(const std::vector<DataType> &value) { ...@@ -358,18 +266,12 @@ void OperatorDef::set_output_type(const std::vector<DataType> &value) {
} }
// MemoryBlock // MemoryBlock
MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y) : MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
mem_id_(mem_id), x_(x), y_(y) {} : mem_id_(mem_id), x_(x), y_(y) {}
int MemoryBlock::mem_id() const { int MemoryBlock::mem_id() const { return mem_id_; }
return mem_id_; uint32_t MemoryBlock::x() const { return x_; }
} uint32_t MemoryBlock::y() const { return y_; }
uint32_t MemoryBlock::x() const {
return x_;
}
uint32_t MemoryBlock::y() const {
return y_;
}
// MemoryArena // MemoryArena
const std::vector<MemoryBlock> &MemoryArena::mem_block() const { const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
...@@ -378,131 +280,69 @@ const std::vector<MemoryBlock> &MemoryArena::mem_block() const { ...@@ -378,131 +280,69 @@ const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() { std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
return mem_block_; return mem_block_;
} }
int MemoryArena::mem_block_size() const { int MemoryArena::mem_block_size() const { return mem_block_.size(); }
return mem_block_.size();
}
// InputInfo // InputInfo
const std::string &InputInfo::name() const { const std::string &InputInfo::name() const { return name_; }
return name_; int32_t InputInfo::node_id() const { return node_id_; }
} int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
int32_t InputInfo::node_id() const { DataType InputInfo::data_type() const { return data_type_; }
return node_id_; const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
}
int32_t InputInfo::max_byte_size() const {
return max_byte_size_;
}
DataType InputInfo::data_type() const {
return data_type_;
}
const std::vector<int32_t> &InputInfo::dims() const {
return dims_;
}
// OutputInfo // OutputInfo
const std::string &OutputInfo::name() const { const std::string &OutputInfo::name() const { return name_; }
return name_; int32_t OutputInfo::node_id() const { return node_id_; }
} int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
int32_t OutputInfo::node_id() const { DataType OutputInfo::data_type() const { return data_type_; }
return node_id_; void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
} const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
int32_t OutputInfo::max_byte_size() const { void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
return max_byte_size_;
}
DataType OutputInfo::data_type() const {
return data_type_;
}
void OutputInfo::set_data_type(DataType data_type) {
data_type_ = data_type;
}
const std::vector<int32_t> &OutputInfo::dims() const {
return dims_;
}
void OutputInfo::set_dims(const std::vector<int32_t> &dims) {
dims_ = dims;
}
// NetDef // NetDef
NetDef::NetDef() : has_bits_(0) {} NetDef::NetDef() : has_bits_(0) {}
const std::string &NetDef::name() const { const std::string &NetDef::name() const { return name_; }
return name_;
}
void NetDef::set_name(const std::string &value) { void NetDef::set_name(const std::string &value) {
set_has_name(); set_has_name();
name_ = value; name_ = value;
} }
bool NetDef::has_name() const { bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
return (has_bits_ & 0x00000001u) != 0; void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
} const std::string &NetDef::version() const { return version_; }
void NetDef::set_has_name() {
has_bits_ |= 0x00000001u;
}
const std::string &NetDef::version() const {
return version_;
}
void NetDef::set_version(const std::string &value) { void NetDef::set_version(const std::string &value) {
set_has_version(); set_has_version();
version_ = value; version_ = value;
} }
bool NetDef::has_version() const { bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
return (has_bits_ & 0x00000002u) != 0; void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
} const std::vector<OperatorDef> &NetDef::op() const { return op_; }
void NetDef::set_has_version() {
has_bits_ |= 0x00000002u;
}
const std::vector<OperatorDef> &NetDef::op() const {
return op_;
}
OperatorDef *NetDef::add_op() { OperatorDef *NetDef::add_op() {
op_.emplace_back(OperatorDef()); op_.emplace_back(OperatorDef());
return &op_.back(); return &op_.back();
} }
std::vector<OperatorDef> &NetDef::mutable_op() { std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
return op_; const std::vector<Argument> &NetDef::arg() const { return arg_; }
}
const std::vector<Argument> &NetDef::arg() const {
return arg_;
}
Argument *NetDef::add_arg() { Argument *NetDef::add_arg() {
arg_.emplace_back(Argument()); arg_.emplace_back(Argument());
return &arg_.back(); return &arg_.back();
} }
std::vector<Argument> &NetDef::mutable_arg() { std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
return arg_; const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
} std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
const std::vector<ConstTensor> &NetDef::tensors() const { const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
return tensors_;
}
std::vector<ConstTensor> &NetDef::mutable_tensors() {
return tensors_;
}
const MemoryArena &NetDef::mem_arena() const {
return mem_arena_;
}
MemoryArena &NetDef::mutable_mem_arena() { MemoryArena &NetDef::mutable_mem_arena() {
set_has_mem_arena(); set_has_mem_arena();
return mem_arena_; return mem_arena_;
} }
bool NetDef::has_mem_arena() const { bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
return (has_bits_ & 0x00000004u) != 0; void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
} const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
void NetDef::set_has_mem_arena() {
has_bits_ |= 0x00000004u;
}
const std::vector<InputInfo> &NetDef::input_info() const {
return input_info_;
}
const std::vector<OutputInfo> &NetDef::output_info() const { const std::vector<OutputInfo> &NetDef::output_info() const {
return output_info_; return output_info_;
} }
std::vector<OutputInfo> &NetDef::mutable_output_info() { std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
return output_info_;
}
int NetDef::op_size() const { int NetDef::op_size() const { return op_.size(); }
return op_.size();
}
const OperatorDef &NetDef::op(const int idx) const { const OperatorDef &NetDef::op(const int idx) const {
MACE_CHECK(0 <= idx && idx < op_size()); MACE_CHECK(0 <= idx && idx < op_size());
...@@ -510,14 +350,15 @@ const OperatorDef &NetDef::op(const int idx) const { ...@@ -510,14 +350,15 @@ const OperatorDef &NetDef::op(const int idx) const {
} }
// Mace Engine // Mace Engine
MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type)
op_registry_(new OperatorRegistry()), device_type_(device_type), : op_registry_(new OperatorRegistry()),
ws_(new Workspace()), net_(nullptr), hexagon_controller_(nullptr) { device_type_(device_type),
ws_->CreateTensor("mace_input_node:0", ws_(new Workspace()),
GetDeviceAllocator(device_type_), net_(nullptr),
hexagon_controller_(nullptr) {
ws_->CreateTensor("mace_input_node:0", GetDeviceAllocator(device_type_),
DT_FLOAT); DT_FLOAT);
ws_->CreateTensor("mace_output_node:0", ws_->CreateTensor("mace_output_node:0", GetDeviceAllocator(device_type_),
GetDeviceAllocator(device_type_),
DT_FLOAT); DT_FLOAT);
if (device_type == HEXAGON) { if (device_type == HEXAGON) {
hexagon_controller_.reset(new HexagonControlWrapper()); hexagon_controller_.reset(new HexagonControlWrapper());
...@@ -525,8 +366,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : ...@@ -525,8 +366,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
hexagon_controller_->SetDebugLevel( hexagon_controller_->SetDebugLevel(
static_cast<int>(mace::logging::LogMessage::MinVLogLevel())); static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>( int dsp_mode =
*net_def, "dsp_mode", 0); ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
hexagon_controller_->SetGraphMode(dsp_mode); hexagon_controller_->SetGraphMode(dsp_mode);
MACE_CHECK(hexagon_controller_->SetupGraph(*net_def), MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
"hexagon setup graph error"); "hexagon setup graph error");
...@@ -537,8 +378,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : ...@@ -537,8 +378,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
ws_->LoadModelTensor(*net_def, device_type); ws_->LoadModelTensor(*net_def, device_type);
// Init model // Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
device_type, NetMode::INIT); NetMode::INIT);
if (!net->Run()) { if (!net->Run()) {
LOG(FATAL) << "Net init run failed"; LOG(FATAL) << "Net init run failed";
} }
...@@ -548,18 +389,19 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : ...@@ -548,18 +389,19 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
MaceEngine::MaceEngine(const NetDef *net_def, MaceEngine::MaceEngine(const NetDef *net_def,
DeviceType device_type, DeviceType device_type,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes) : const std::vector<std::string> &output_nodes)
op_registry_(new OperatorRegistry()), device_type_(device_type), : op_registry_(new OperatorRegistry()),
ws_(new Workspace()), net_(nullptr), hexagon_controller_(nullptr) { device_type_(device_type),
ws_(new Workspace()),
net_(nullptr),
hexagon_controller_(nullptr) {
for (auto input_name : input_nodes) { for (auto input_name : input_nodes) {
ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"), ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
GetDeviceAllocator(device_type_), GetDeviceAllocator(device_type_), DT_FLOAT);
DT_FLOAT);
} }
for (auto output_name : output_nodes) { for (auto output_name : output_nodes) {
ws_->CreateTensor(MakeString("mace_output_node_", output_name, ":0"), ws_->CreateTensor(MakeString("mace_output_node_", output_name, ":0"),
GetDeviceAllocator(device_type_), GetDeviceAllocator(device_type_), DT_FLOAT);
DT_FLOAT);
} }
if (device_type == HEXAGON) { if (device_type == HEXAGON) {
hexagon_controller_.reset(new HexagonControlWrapper()); hexagon_controller_.reset(new HexagonControlWrapper());
...@@ -567,8 +409,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, ...@@ -567,8 +409,8 @@ MaceEngine::MaceEngine(const NetDef *net_def,
MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
hexagon_controller_->SetDebugLevel( hexagon_controller_->SetDebugLevel(
static_cast<int>(mace::logging::LogMessage::MinVLogLevel())); static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>( int dsp_mode =
*net_def, "dsp_mode", 0); ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
hexagon_controller_->SetGraphMode(dsp_mode); hexagon_controller_->SetGraphMode(dsp_mode);
MACE_CHECK(hexagon_controller_->SetupGraph(*net_def), MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
"hexagon setup graph error"); "hexagon setup graph error");
...@@ -579,14 +421,13 @@ MaceEngine::MaceEngine(const NetDef *net_def, ...@@ -579,14 +421,13 @@ MaceEngine::MaceEngine(const NetDef *net_def,
ws_->LoadModelTensor(*net_def, device_type); ws_->LoadModelTensor(*net_def, device_type);
// Init model // Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
device_type, NetMode::INIT); NetMode::INIT);
if (!net->Run()) { if (!net->Run()) {
LOG(FATAL) << "Net init run failed"; LOG(FATAL) << "Net init run failed";
} }
net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type)); net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
} }
} }
MaceEngine::~MaceEngine() { MaceEngine::~MaceEngine() {
if (device_type_ == HEXAGON) { if (device_type_ == HEXAGON) {
...@@ -643,10 +484,11 @@ bool MaceEngine::Run(const float *input, ...@@ -643,10 +484,11 @@ bool MaceEngine::Run(const float *input,
bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs, bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
std::map<std::string, float *> &outputs, std::map<std::string, float *> &outputs,
RunMetadata *run_metadata) { RunMetadata *run_metadata) {
MACE_CHECK(device_type_ != HEXAGON,
MACE_CHECK(device_type_ != HEXAGON, "HEXAGON not supports multiple outputs now"); "HEXAGON not supports multiple outputs now");
for (auto input : inputs) { for (auto input : inputs) {
Tensor *input_tensor = ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0")); Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0"));
input_tensor->Resize(input.shape); input_tensor->Resize(input.shape);
{ {
Tensor::MappingGuard input_guard(input_tensor); Tensor::MappingGuard input_guard(input_tensor);
...@@ -658,7 +500,8 @@ bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs, ...@@ -658,7 +500,8 @@ bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
LOG(FATAL) << "Net run failed"; LOG(FATAL) << "Net run failed";
} }
for (auto output : outputs) { for (auto output : outputs) {
Tensor *output_tensor = ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0")); Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
// save output // save output
if (output_tensor != nullptr && output.second != nullptr) { if (output_tensor != nullptr && output.second != nullptr) {
Tensor::MappingGuard output_guard(output_tensor); Tensor::MappingGuard output_guard(output_tensor);
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
// //
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/utils/utils.h"
#include "mace/utils/timer.h"
#include "mace/utils/memory_logging.h" #include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry, ...@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
const NetMode mode) const NetMode mode)
: NetBase(op_registry, net_def, ws, type), : NetBase(op_registry, net_def, ws, type), device_type_(type) {
device_type_(type) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name()); MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); const auto &operator_def = net_def->op(idx);
...@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) { ...@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter; auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
"(", op->debug_def().type(), ")"); op->debug_def().type(), ")");
bool future_wait = (device_type_ == DeviceType::OPENCL && bool future_wait = (device_type_ == DeviceType::OPENCL &&
(run_metadata != nullptr || (run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1)); std::distance(iter, operators_.end()) == 1));
...@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
const NetMode mode) { const NetMode mode) {
std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode)); std::unique_ptr<NetBase> net(
new SerialNet(op_registry, net_def, ws, type, mode));
return net; return net;
} }
......
...@@ -7,10 +7,10 @@ ...@@ -7,10 +7,10 @@
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/registry.h" #include "mace/core/registry.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
......
...@@ -2,19 +2,19 @@ ...@@ -2,19 +2,19 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include <vector>
#include <thread>
#include <sys/time.h> #include <sys/time.h>
#include <thread>
#include <vector>
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_nn_ops.h" #include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
namespace { namespace {
inline int64_t NowMicros() { inline int64_t NowMicros() {
struct timeval tv; struct timeval tv;
gettimeofday(&tv, nullptr); gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
} }
} }
namespace mace { namespace mace {
...@@ -63,7 +63,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -63,7 +63,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// const node // const node
std::thread const_thread([&]() { std::thread const_thread([&]() {
std::vector<hexagon_nn_const_node> const_node_list; std::vector<hexagon_nn_const_node> const_node_list;
for (const ConstTensor &const_tensor: net_def.tensors()) { for (const ConstTensor &const_tensor : net_def.tensors()) {
std::vector<int> tensor_shape(const_tensor.dims().begin(), std::vector<int> tensor_shape(const_tensor.dims().begin(),
const_tensor.dims().end()); const_tensor.dims().end());
while (tensor_shape.size() < 4) { while (tensor_shape.size() < 4) {
...@@ -77,30 +77,30 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -77,30 +77,30 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
const_node.tensor.width = tensor_shape[2]; const_node.tensor.width = tensor_shape[2];
const_node.tensor.depth = tensor_shape[3]; const_node.tensor.depth = tensor_shape[3];
if (const_tensor.data_type() == DataType::DT_INT32 if (const_tensor.data_type() == DataType::DT_INT32 &&
&& const_tensor.data_size() == 0) { const_tensor.data_size() == 0) {
const_node.tensor.data = NULL; const_node.tensor.data = NULL;
const_node.tensor.dataLen = 0; const_node.tensor.dataLen = 0;
} else { } else {
const_node.tensor.data = const_node.tensor.data =
const_cast<unsigned char *>(const_tensor.data()); const_cast<unsigned char *>(const_tensor.data());
const_node.tensor.dataLen = const_node.tensor.dataLen = const_tensor.data_size() *
const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type()); GetEnumTypeSize(const_tensor.data_type());
} }
const_node_list.push_back(const_node); const_node_list.push_back(const_node);
// 255 is magic number: why fastrpc limits sequence length to that? // 255 is magic number: why fastrpc limits sequence length to that?
if (const_node_list.size() >= 250) { if (const_node_list.size() >= 250) {
MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_, MACE_CHECK(
const_node_list.data(), hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
const_node_list.size()) const_node_list.size()) == 0,
== 0, "append const node error"); "append const node error");
const_node_list.clear(); const_node_list.clear();
} }
} }
if (!const_node_list.empty()) { if (!const_node_list.empty()) {
MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_, MACE_CHECK(
const_node_list.data(), hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
const_node_list.size()) == 0, const_node_list.size()) == 0,
"append const node error"); "append const node error");
} }
...@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
std::vector<hexagon_nn_input> inputs; std::vector<hexagon_nn_input> inputs;
std::vector<hexagon_nn_output> outputs; std::vector<hexagon_nn_output> outputs;
for (const OperatorDef &op: net_def.op()) { for (const OperatorDef &op : net_def.op()) {
int op_id = op_map.GetOpId(op.type()); int op_id = op_map.GetOpId(op.type());
inputs.resize(op.node_input().size()); inputs.resize(op.node_input().size());
for (size_t i = 0; i < op.node_input().size(); ++i) { for (size_t i = 0; i < op.node_input().size(); ++i) {
...@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
cached_inputs.push_back(inputs); cached_inputs.push_back(inputs);
cached_outputs.push_back(outputs); cached_outputs.push_back(outputs);
hexagon_nn_padding_type hexagon_nn_padding_type padding_type =
padding_type = static_cast<hexagon_nn_padding_type>( static_cast<hexagon_nn_padding_type>(op.padding());
op.padding());
hexagon_nn_op_node op_node; hexagon_nn_op_node op_node;
op_node.node_id = node_id(op.node_id()); op_node.node_id = node_id(op.node_id());
...@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
op_node_list.push_back(op_node); op_node_list.push_back(op_node);
if (op_node_list.size() >= 125) { if (op_node_list.size() >= 125) {
MACE_CHECK(hexagon_nn_append_node_list(nn_id_, MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
op_node_list.data(),
op_node_list.size()) == 0, op_node_list.size()) == 0,
"append node error"); "append node error");
op_node_list.clear(); op_node_list.clear();
...@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
} }
if (!op_node_list.empty()) { if (!op_node_list.empty()) {
MACE_CHECK(hexagon_nn_append_node_list(nn_id_, MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
op_node_list.data(),
op_node_list.size()) == 0, op_node_list.size()) == 0,
"append node error"); "append node error");
} }
...@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// input info // input info
num_inputs_ = 0; num_inputs_ = 0;
for (const InputInfo &input_info: net_def.input_info()) { for (const InputInfo &input_info : net_def.input_info()) {
std::vector<index_t> input_shape; std::vector<index_t> input_shape;
input_shape.insert(input_shape.begin(), input_shape.insert(input_shape.begin(), input_info.dims().begin(),
input_info.dims().begin(), input_info.dims().end()); input_info.dims().end());
while (input_shape.size() < 4) { while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1); input_shape.insert(input_shape.begin(), 1);
} }
...@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) { ...@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// output info // output info
num_outputs_ = 0; num_outputs_ = 0;
for (const OutputInfo &output_info: net_def.output_info()) { for (const OutputInfo &output_info : net_def.output_info()) {
std::vector<index_t> output_shape; std::vector<index_t> output_shape;
output_shape.insert(output_shape.begin(), output_shape.insert(output_shape.begin(), output_info.dims().begin(),
output_info.dims().begin(), output_info.dims().end()); output_info.dims().end());
while (output_shape.size() < 4) { while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1); output_shape.insert(output_shape.begin(), 1);
} }
...@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() { ...@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
return hexagon_nn_teardown(nn_id_) == 0; return hexagon_nn_teardown(nn_id_) == 0;
} }
#define PRINT_BUFSIZE (2*1024*1024) #define PRINT_BUFSIZE (2 * 1024 * 1024)
void HexagonControlWrapper::PrintLog() { void HexagonControlWrapper::PrintLog() {
char *buf; char *buf;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return; if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
MACE_CHECK(hexagon_nn_getlog(nn_id_, MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
reinterpret_cast<unsigned char *>(buf), PRINT_BUFSIZE) == 0,
PRINT_BUFSIZE) == 0, "print log error"); "print log error");
LOG(INFO) << std::string(buf); LOG(INFO) << std::string(buf);
delete[]buf; delete[] buf;
} }
void HexagonControlWrapper::PrintGraph() { void HexagonControlWrapper::PrintGraph() {
LOG(INFO) << "Print Graph"; LOG(INFO) << "Print Graph";
char *buf; char *buf;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return; if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
MACE_CHECK(hexagon_nn_snpprint(nn_id_, MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
reinterpret_cast<unsigned char *>(buf), PRINT_BUFSIZE) == 0,
PRINT_BUFSIZE) == 0, "print graph error"); "print graph error");
LOG(INFO) << std::string(buf); LOG(INFO) << std::string(buf);
delete[]buf; delete[] buf;
} }
void HexagonControlWrapper::SetDebugLevel(int level) { void HexagonControlWrapper::SetDebugLevel(int level) {
...@@ -256,8 +253,8 @@ void HexagonControlWrapper::GetPerfInfo() { ...@@ -256,8 +253,8 @@ void HexagonControlWrapper::GetPerfInfo() {
LOG(INFO) << "Get perf info"; LOG(INFO) << "Get perf info";
std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE); std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
unsigned int n_items = 0; unsigned int n_items = 0;
MACE_CHECK( MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE, &n_items) == 0, &n_items) == 0,
"get perf info error"); "get perf info error");
std::unordered_map<uint32_t, float> node_id_counters; std::unordered_map<uint32_t, float> node_id_counters;
...@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() { ...@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
unsigned int node_id = perf_info[i].node_id; unsigned int node_id = perf_info[i].node_id;
unsigned int node_type_id = perf_info[i].node_type; unsigned int node_type_id = perf_info[i].node_type;
node_id_counters[node_id] = node_id_counters[node_id] =
((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) ((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) +
+ perf_info[i].counter_lo) * 1.0f / perf_info[i].executions; perf_info[i].counter_lo) *
1.0f / perf_info[i].executions;
char node_type_buf[MAX_NODE]; char node_type_buf[MAX_NODE];
hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE); hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
...@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() { ...@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
total_duration += node_id_counters[node_id]; total_duration += node_id_counters[node_id];
} }
for (auto &node_type_counter: node_type_counters) { for (auto &node_type_counter : node_type_counters) {
LOG(INFO) << "node type: " << node_type_counter.first LOG(INFO) << "node type: " << node_type_counter.first
<< ", time: " << node_type_counter.second.first << ", time: " << node_type_counter.second.first
<< ", duration: " << node_type_counter.second.second; << ", duration: " << node_type_counter.second.second;
...@@ -312,32 +310,24 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, ...@@ -312,32 +310,24 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
output_tensor->Resize(output_shapes_[0]); output_tensor->Resize(output_shapes_[0]);
std::vector<uint32_t> output_shape(4); std::vector<uint32_t> output_shape(4);
uint32_t output_bytes; uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_, int res = hexagon_nn_execute(
input_tensor.shape()[0], nn_id_, input_tensor.shape()[0], input_tensor.shape()[1],
input_tensor.shape()[1], input_tensor.shape()[2], input_tensor.shape()[3],
input_tensor.shape()[2], reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
input_tensor.shape()[3], input_tensor.raw_size(), &output_shape[0], &output_shape[1],
reinterpret_cast<const unsigned char *>( &output_shape[2], &output_shape[3],
input_tensor.raw_data()), reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
input_tensor.raw_size(), output_tensor->raw_size(), &output_bytes);
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
MACE_CHECK(res == 0, "execute error"); MACE_CHECK(res == 0, "execute error");
MACE_ASSERT(output_shape == output_shapes_[0], MACE_ASSERT(output_shape == output_shapes_[0], "wrong output shape inferred");
"wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(), MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred."); "wrong output bytes inferred.");
return res == 0; return res == 0;
}; };
bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_tensors, bool HexagonControlWrapper::ExecuteGraphNew(
const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors) { std::vector<Tensor> *output_tensors) {
LOG(INFO) << "Execute graph new: " << nn_id_; LOG(INFO) << "Execute graph new: " << nn_id_;
int num_inputs = input_tensors.size(); int num_inputs = input_tensors.size();
...@@ -369,8 +359,8 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten ...@@ -369,8 +359,8 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
outputs[i].dataLen = (*output_tensors)[i].raw_size(); outputs[i].dataLen = (*output_tensors)[i].raw_size();
} }
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs, int res =
outputs, num_outputs); hexagon_nn_execute_new(nn_id_, inputs, num_inputs, outputs, num_outputs);
for (int i = 0; i < num_outputs; ++i) { for (int i = 0; i < num_outputs; ++i) {
std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height, std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
...@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor, ...@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
float *min_in_data = input_tensors[1].mutable_data<float>(); float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1}); input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>(); float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor, quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data,
&input_tensors[0],
min_in_data,
max_in_data); max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) { if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false; return false;
...@@ -409,9 +397,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor, ...@@ -409,9 +397,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
const float *min_out_data = output_tensors[1].data<float>(); const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>(); const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0], quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data,
*min_out_data,
*max_out_data,
output_tensor); output_tensor);
return true; return true;
} }
......
...@@ -16,16 +16,17 @@ namespace mace { ...@@ -16,16 +16,17 @@ namespace mace {
class HexagonControlWrapper { class HexagonControlWrapper {
public: public:
HexagonControlWrapper() {}; HexagonControlWrapper(){};
int GetVersion(); int GetVersion();
bool Config(); bool Config();
bool Init(); bool Init();
bool Finalize(); bool Finalize();
bool SetupGraph(const NetDef& net_def); bool SetupGraph(const NetDef &net_def);
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor); bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
bool ExecuteGraphNew(const std::vector<Tensor>& input_tensors, bool ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors); std::vector<Tensor> *output_tensors);
bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor); bool ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor);
bool TeardownGraph(); bool TeardownGraph();
void PrintLog(); void PrintLog();
...@@ -38,9 +39,7 @@ class HexagonControlWrapper { ...@@ -38,9 +39,7 @@ class HexagonControlWrapper {
private: private:
static constexpr int NODE_ID_OFFSET = 10000; static constexpr int NODE_ID_OFFSET = 10000;
inline uint32_t node_id(uint32_t nodeid) { inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
return NODE_ID_OFFSET + nodeid;
}
int nn_id_; int nn_id_;
Quantizer quantizer_; Quantizer quantizer_;
...@@ -54,7 +53,6 @@ class HexagonControlWrapper { ...@@ -54,7 +53,6 @@ class HexagonControlWrapper {
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
}; };
} }
#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_ #endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
...@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs, ...@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
return 0; return 0;
} }
int hexagon_controller_DeInitHexagon() { int hexagon_controller_DeInitHexagon() { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0; return 0;
} }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
...@@ -30,7 +30,7 @@ extern "C" { ...@@ -30,7 +30,7 @@ extern "C" {
#define __QAIC_STRING1_OBJECT_DEFINED__ #define __QAIC_STRING1_OBJECT_DEFINED__
#define __STRING1_OBJECT__ #define __STRING1_OBJECT__
typedef struct _cstring1_s { typedef struct _cstring1_s {
char* data; char *data;
int dataLen; int dataLen;
} _cstring1_t; } _cstring1_t;
...@@ -71,7 +71,7 @@ struct hexagon_nn_tensordef { ...@@ -71,7 +71,7 @@ struct hexagon_nn_tensordef {
unsigned int height; unsigned int height;
unsigned int width; unsigned int width;
unsigned int depth; unsigned int depth;
unsigned char* data; unsigned char *data;
int dataLen; int dataLen;
unsigned int data_valid_len; unsigned int data_valid_len;
unsigned int unused; unsigned int unused;
...@@ -81,9 +81,9 @@ struct hexagon_nn_op_node { ...@@ -81,9 +81,9 @@ struct hexagon_nn_op_node {
unsigned int node_id; unsigned int node_id;
unsigned int operation; unsigned int operation;
hexagon_nn_padding_type padding; hexagon_nn_padding_type padding;
hexagon_nn_input* inputs; hexagon_nn_input *inputs;
int inputsLen; int inputsLen;
hexagon_nn_output* outputs; hexagon_nn_output *outputs;
int outputsLen; int outputsLen;
}; };
typedef struct hexagon_nn_const_node hexagon_nn_const_node; typedef struct hexagon_nn_const_node hexagon_nn_const_node;
...@@ -91,30 +91,98 @@ struct hexagon_nn_const_node { ...@@ -91,30 +91,98 @@ struct hexagon_nn_const_node {
unsigned int node_id; unsigned int node_id;
hexagon_nn_tensordef tensor; hexagon_nn_tensordef tensor;
}; };
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE; hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE; hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE; unsigned char *buf,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE; int bufLen)
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE; unsigned char *buf,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE; int bufLen)
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE; hexagon_nn_nn_id id,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE; unsigned int node_id,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE; unsigned int operation,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE; hexagon_nn_padding_type padding,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE; const hexagon_nn_input *inputs,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE; int inputsLen,
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE; const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#ifndef LIBMACE_HEXAGON_NN_OPS_H #ifndef LIBMACE_HEXAGON_NN_OPS_H
#define LIBMACE_HEXAGON_NN_OPS_H #define LIBMACE_HEXAGON_NN_OPS_H
#include "mace/utils/logging.h"
#include <unordered_map> #include <unordered_map>
#include "mace/utils/logging.h"
namespace mace { namespace mace {
...@@ -24,8 +24,7 @@ typedef enum op_type_enum { ...@@ -24,8 +24,7 @@ typedef enum op_type_enum {
class OpMap { class OpMap {
public: public:
void Init() { void Init() {
#define DEF_OP(NAME) \ #define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
op_map_[#NAME] = OP_##NAME;
#include "mace/core/runtime/hexagon/ops.h" #include "mace/core/runtime/hexagon/ops.h"
...@@ -40,6 +39,7 @@ class OpMap { ...@@ -40,6 +39,7 @@ class OpMap {
return OP_INVALID; return OP_INVALID;
} }
} }
private: private:
std::unordered_map<std::string, int> op_map_; std::unordered_map<std::string, int> op_map_;
}; };
......
...@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8) ...@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
#undef __SELF_DEF_OP_WREF #undef __SELF_DEF_OP_WREF
#undef DEF_OP_WREF #undef DEF_OP_WREF
#endif #endif
...@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor, ...@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
float *max_out) { float *max_out) {
float stepsize; float stepsize;
float recip_stepsize; float recip_stepsize;
QuantizeAdjustRange(min_in, max_in, QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
min_out, max_out, &recip_stepsize);
&stepsize, &recip_stepsize);
const float *in = in_tensor.data<float>(); const float *in = in_tensor.data<float>();
uint8_t *out = out_tensor->mutable_data<uint8_t>(); uint8_t *out = out_tensor->mutable_data<uint8_t>();
for (int i = 0; i < in_tensor.size(); i++) { for (int i = 0; i < in_tensor.size(); i++) {
const float inval = in[i]; const float inval = in[i];
float ival = static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f); float ival =
static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
if (ival < 0) ival = 0; if (ival < 0) ival = 0;
if (ival > 255) ival = 255; if (ival > 255) ival = 255;
out[i] = static_cast<uint8_t>(ival); out[i] = static_cast<uint8_t>(ival);
......
...@@ -16,13 +16,17 @@ class Quantizer { ...@@ -16,13 +16,17 @@ class Quantizer {
void Quantize(const Tensor &in_tensor, void Quantize(const Tensor &in_tensor,
Tensor *out_tensor, Tensor *out_tensor,
float *min_out, float *max_out); float *min_out,
float *max_out);
void Quantize(const Tensor &in_tensor, void Quantize(const Tensor &in_tensor,
const float min_in, const float max_in, const float min_in,
const float max_in,
Tensor *out_tensor, Tensor *out_tensor,
float *min_out, float *max_out); float *min_out,
float *max_out);
void DeQuantize(const Tensor &in_tensor, void DeQuantize(const Tensor &in_tensor,
const float min_in, const float max_in, const float min_in,
const float max_in,
Tensor *out_tensor); Tensor *out_tensor);
private: private:
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_allocator.h" #include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
...@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { ...@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
return 0; return 0;
} }
} }
} }
OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::OpenCLAllocator() {}
...@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const { ...@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const { const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
<< image_shape[1];
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt)); cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error; cl_int error;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(), new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
img_format, image_shape[0], image_shape[1], 0, nullptr, &error);
image_shape[0], image_shape[1],
0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: [" MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1] << image_shape[0] << ", " << image_shape[1]
<< "]"; << "]";
...@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { ...@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
// TODO(heliangliang) Non-blocking call // TODO(heliangliang) Non-blocking call
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset, queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
nbytes, nullptr, nullptr, &error); offset, nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr; return mapped_ptr;
} }
...@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer, ...@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch->resize(2); mapped_image_pitch->resize(2);
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image, *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
origin, region, nullptr, &error);
mapped_image_pitch->data(),
mapped_image_pitch->data() + 1,
nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error; MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr; return mapped_ptr;
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include <vector> #include <vector>
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name, ...@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device, cl::Device &device,
cl::Program *program, cl::Program *program,
bool *is_binary) { bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kEncryptedProgramMap; extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
*is_binary = false; *is_binary = false;
auto it_source = kEncryptedProgramMap.find(program_name); auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source == kEncryptedProgramMap.end()) { if (it_source == kEncryptedProgramMap.end()) {
......
...@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name, ...@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device, cl::Device &device,
cl::Program *program, cl::Program *program,
bool *is_binary) { bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kCompiledProgramMap; extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
*is_binary = true; *is_binary = true;
auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix); auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
if (it_binary == kCompiledProgramMap.end()) { if (it_binary == kCompiledProgramMap.end()) {
......
...@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() { ...@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0; return (stop_nanos_ - start_nanos_) / 1000.0;
} }
double OpenCLProfilingTimer::AccumulatedMicros() { double OpenCLProfilingTimer::AccumulatedMicros() { return accumulated_micros_; }
return accumulated_micros_;
}
void OpenCLProfilingTimer::AccumulateTiming(){ void OpenCLProfilingTimer::AccumulateTiming() {
StopTiming(); StopTiming();
accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0; accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
} }
...@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() { ...@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
cl::CommandQueue command_queue(context, gpu_device, properties); cl::CommandQueue command_queue(context, gpu_device, properties);
const char *kernel_path = getenv("MACE_KERNEL_PATH"); const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/"; this->kernel_path_ =
std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->device_ = new cl::Device(gpu_device); this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context); this->context_ = new cl::Context(context);
...@@ -166,15 +165,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, ...@@ -166,15 +165,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
GenerateCLBinaryFilenamePrefix(built_program_key); GenerateCLBinaryFilenamePrefix(built_program_key);
std::vector<unsigned char> program_vec; std::vector<unsigned char> program_vec;
bool is_opencl_binary; bool is_opencl_binary;
const bool found = GetSourceOrBinaryProgram(program_name, const bool found =
binary_file_name_prefix, GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
context(), device(), program, &is_opencl_binary);
device(),
program,
&is_opencl_binary);
MACE_CHECK(found, "Program not found for ", MACE_CHECK(found, "Program not found for ",
is_opencl_binary ? "binary: " : "source: ", is_opencl_binary ? "binary: " : "source: ", built_program_key);
built_program_key);
// Build program // Build program
std::string build_options_str = std::string build_options_str =
...@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, ...@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
} }
LOG(FATAL) << "Build program from " LOG(FATAL) << "Build program from "
<< (is_opencl_binary ? "binary: " : "source: ") << (is_opencl_binary ? "binary: " : "source: ")
<< built_program_key << built_program_key << " failed: " << ret;
<< " failed: " << ret;
} }
if (!is_opencl_binary) { if (!is_opencl_binary) {
// Write binary if necessary // Write binary if necessary
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin"; std::string binary_filename =
kernel_path_ + binary_file_name_prefix + ".bin";
size_t device_list_size = 1; size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes( std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]); new size_t[device_list_size]);
...@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel( ...@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
if (built_program_it != built_program_map_.end()) { if (built_program_it != built_program_map_.end()) {
program = built_program_it->second; program = built_program_it->second;
} else { } else {
this->BuildProgram(program_name, built_program_key, this->BuildProgram(program_name, built_program_key, build_options_str,
build_options_str, &program); &program);
built_program_map_.emplace(built_program_key, program); built_program_map_.emplace(built_program_key, program);
} }
return cl::Kernel(program, kernel_name.c_str()); return cl::Kernel(program, kernel_name.c_str());
......
...@@ -19,7 +19,8 @@ namespace mace { ...@@ -19,7 +19,8 @@ namespace mace {
class OpenCLProfilingTimer : public Timer { class OpenCLProfilingTimer : public Timer {
public: public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {}; explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0){};
void StartTiming() override; void StartTiming() override;
void StopTiming() override; void StopTiming() override;
void AccumulateTiming() override; void AccumulateTiming() override;
...@@ -48,6 +49,7 @@ class OpenCLRuntime { ...@@ -48,6 +49,7 @@ class OpenCLRuntime {
cl::Kernel BuildKernel(const std::string &program_name, cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name, const std::string &kernel_name,
const std::set<std::string> &build_options); const std::set<std::string> &build_options);
private: private:
OpenCLRuntime(); OpenCLRuntime();
~OpenCLRuntime(); ~OpenCLRuntime();
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
namespace mace { namespace mace {
// These functions are not thread-safe. // These functions are not thread-safe.
void LoadOpenCLLibrary(); void LoadOpenCLLibrary();
void UnloadOpenCLLibrary(); void UnloadOpenCLLibrary();
} // namespace mace } // namespace mace
......
...@@ -69,13 +69,10 @@ class Tensor { ...@@ -69,13 +69,10 @@ class Tensor {
dtype_(type), dtype_(type),
buffer_(nullptr), buffer_(nullptr),
is_buffer_owner_(true), is_buffer_owner_(true),
name_("") {}; name_(""){};
Tensor(BufferBase *buffer, DataType dtype) Tensor(BufferBase *buffer, DataType dtype)
: dtype_(dtype), : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), name_("") {}
buffer_(buffer),
is_buffer_owner_(false),
name_("") {}
Tensor(const BufferSlice &buffer_slice, DataType dtype) Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype), : dtype_(dtype),
...@@ -102,8 +99,8 @@ class Tensor { ...@@ -102,8 +99,8 @@ class Tensor {
inline index_t dim_size() const { return shape_.size(); } inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const { inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ", MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
index, " >= ", shape_.size()); shape_.size());
return shape_[index]; return shape_[index];
} }
...@@ -112,40 +109,35 @@ class Tensor { ...@@ -112,40 +109,35 @@ class Tensor {
std::multiplies<int64_t>()); std::multiplies<int64_t>());
} }
inline index_t raw_size() const { inline index_t raw_size() const { return size() * SizeOfType(); }
return size() * SizeOfType();
}
inline bool has_opencl_image() const { inline bool has_opencl_image() const {
return buffer_ != nullptr && !buffer_->OnHost() return buffer_ != nullptr && !buffer_->OnHost() &&
&& typeid(*buffer_) == typeid(Image); typeid(*buffer_) == typeid(Image);
} }
inline bool has_opencl_buffer() const { inline bool has_opencl_buffer() const {
return buffer_ != nullptr && !buffer_->OnHost() return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
&& !has_opencl_image();
} }
inline cl::Image *opencl_image() const { inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), "do not have image"); MACE_CHECK(has_opencl_image(), "do not have image");
return static_cast<cl::Image*>(buffer_->buffer()); return static_cast<cl::Image *>(buffer_->buffer());
} }
inline cl::Buffer *opencl_buffer() const { inline cl::Buffer *opencl_buffer() const {
MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer"); MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
return static_cast<cl::Buffer*>(buffer_->buffer()); return static_cast<cl::Buffer *>(buffer_->buffer());
} }
inline index_t buffer_offset() const { inline index_t buffer_offset() const { return buffer_->offset(); }
return buffer_->offset();
}
inline const void *raw_data() const { inline const void *raw_data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null"); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->raw_data(); return buffer_->raw_data();
} }
template<typename T> template <typename T>
inline const T *data() const { inline const T *data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null"); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>(); return buffer_->data<T>();
...@@ -156,7 +148,7 @@ class Tensor { ...@@ -156,7 +148,7 @@ class Tensor {
return buffer_->raw_mutable_data(); return buffer_->raw_mutable_data();
} }
template<typename T> template <typename T>
inline T *mutable_data() { inline T *mutable_data() {
MACE_CHECK(buffer_ != nullptr, "buffer is null"); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data()); return static_cast<T *>(buffer_->raw_mutable_data());
...@@ -188,25 +180,17 @@ class Tensor { ...@@ -188,25 +180,17 @@ class Tensor {
is_buffer_owner_ = true; is_buffer_owner_ = true;
} else { } else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
Image *image = dynamic_cast<Image*>(buffer_); Image *image = dynamic_cast<Image *>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0] MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
&& image_shape[1] <= image->image_shape()[1], image_shape[1] <= image->image_shape()[1],
"tensor (source op ", "tensor (source op ", name_,
name_, "): current physical image shape: ", image->image_shape()[0],
"): current physical image shape: ", ", ", image->image_shape()[1], " < logical image shape: ",
image->image_shape()[0], image_shape[0], ", ", image_shape[1]);
", ",
image->image_shape()[1],
" < logical image shape: ",
image_shape[0],
", ",
image_shape[1]);
} }
} }
inline void ResizeLike(const Tensor &other) { inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
ResizeLike(&other);
}
inline void ResizeLike(const Tensor *other) { inline void ResizeLike(const Tensor *other) {
if (other->has_opencl_image()) { if (other->has_opencl_image()) {
...@@ -229,7 +213,7 @@ class Tensor { ...@@ -229,7 +213,7 @@ class Tensor {
memcpy(buffer_->raw_mutable_data(), src, size); memcpy(buffer_->raw_mutable_data(), src, size);
} }
template<typename T> template <typename T>
inline void Copy(const T *src, index_t length) { inline void Copy(const T *src, index_t length) {
MACE_CHECK(length == size(), "copy src and dst with different size."); MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * length); CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
...@@ -248,13 +232,9 @@ class Tensor { ...@@ -248,13 +232,9 @@ class Tensor {
return type_size; return type_size;
} }
inline BufferBase *UnderlyingBuffer() const { inline BufferBase *UnderlyingBuffer() const { return buffer_; }
return buffer_;
}
inline void SetSourceOpName(const std::string name) { inline void SetSourceOpName(const std::string name) { name_ = name; }
name_ = name;
}
inline void DebugPrint() const { inline void DebugPrint() const {
using namespace numerical_chars; using namespace numerical_chars;
...@@ -272,8 +252,9 @@ class Tensor { ...@@ -272,8 +252,9 @@ class Tensor {
} }
CASES(dtype_, (os << (this->data<T>()[i]) << ", ")); CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
} }
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str(); << ", " << dim(3) << "], content:\n"
<< os.str();
} }
class MappingGuard { class MappingGuard {
...@@ -308,7 +289,7 @@ class Tensor { ...@@ -308,7 +289,7 @@ class Tensor {
Allocator *allocator_; Allocator *allocator_;
DataType dtype_; DataType dtype_;
std::vector<index_t> shape_; std::vector<index_t> shape_;
std::vector<size_t > image_shape_; std::vector<size_t> image_shape_;
BufferBase *buffer_; BufferBase *buffer_;
BufferSlice buffer_slice_; BufferSlice buffer_slice_;
bool is_buffer_owner_; bool is_buffer_owner_;
......
...@@ -99,9 +99,7 @@ void RestartTiming() { ...@@ -99,9 +99,7 @@ void RestartTiming() {
accum_time = 0; accum_time = 0;
start_time = NowMicros(); start_time = NowMicros();
} }
void StartTiming() { void StartTiming() { start_time = NowMicros(); }
start_time = NowMicros();
}
void StopTiming() { void StopTiming() {
if (start_time != 0) { if (start_time != 0) {
accum_time += (NowMicros() - start_time); accum_time += (NowMicros() - start_time);
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_ #ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_ #define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \ #define BENCHMARK(n) \
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include <map>
#include <cstdint> #include <cstdint>
#include <map>
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
...@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) { ...@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
std::string DataTypeToString(const DataType dt) { std::string DataTypeToString(const DataType dt) {
static std::map<DataType, std::string> dtype_string_map = { static std::map<DataType, std::string> dtype_string_map = {
{DT_FLOAT, "DT_FLOAT"}, {DT_FLOAT, "DT_FLOAT"}, {DT_HALF, "DT_HALF"},
{DT_HALF, "DT_HALF"}, {DT_DOUBLE, "DT_DOUBLE"}, {DT_UINT8, "DT_UINT8"},
{DT_DOUBLE, "DT_DOUBLE"}, {DT_INT8, "DT_INT8"}, {DT_INT32, "DT_INT32"},
{DT_UINT8, "DT_UINT8"}, {DT_UINT32, "DT_UINT32"}, {DT_UINT16, "DT_UINT16"},
{DT_INT8, "DT_INT8"}, {DT_INT64, "DT_INT64"}, {DT_BOOL, "DT_BOOL"},
{DT_INT32, "DT_INT32"}, {DT_STRING, "DT_STRING"}};
{DT_UINT32, "DT_UINT32"},
{DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"},
{DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}
};
MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type"; MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
return dtype_string_map[dt]; return dtype_string_map[dt];
} }
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/core/workspace.h"
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/workspace.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
namespace mace { namespace mace {
...@@ -51,19 +51,19 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -51,19 +51,19 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
index_t model_data_size = 0; index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr; unsigned char *model_data_ptr = nullptr;
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr if (model_data_ptr == nullptr ||
|| reinterpret_cast<long long>(const_tensor.data()) reinterpret_cast<long long>(const_tensor.data()) <
< reinterpret_cast<long long>(model_data_ptr)) { reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data()); model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
} }
} }
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size, model_data_size = std::max(
static_cast<index_t>( model_data_size,
(reinterpret_cast<long long>(const_tensor.data()) static_cast<index_t>((reinterpret_cast<long long>(const_tensor.data()) -
- reinterpret_cast<long long>(model_data_ptr)) reinterpret_cast<long long>(model_data_ptr)) +
+ const_tensor.data_size() const_tensor.data_size() *
* GetEnumTypeSize(const_tensor.data_type()))); GetEnumTypeSize(const_tensor.data_type())));
} }
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
...@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name() VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type() << ", data type: " << const_tensor.data_type() << ", shape: "
<< ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(), << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end())); const_tensor.dims().end()));
std::vector<index_t> dims; std::vector<index_t> dims;
...@@ -90,13 +89,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -90,13 +89,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
dims.push_back(d); dims.push_back(d);
} }
index_t index_t offset = (long long)const_tensor.data() - (long long)model_data_ptr;
offset = (long long) const_tensor.data() - (long long) model_data_ptr;
std::unique_ptr<Tensor> tensor( std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(), new Tensor(BufferSlice(tensor_buffer_.get(), offset,
offset, const_tensor.data_size() *
const_tensor.data_size() GetEnumTypeSize(const_tensor.data_type())),
* GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type())); const_tensor.data_type()));
tensor->Reshape(dims); tensor->Reshape(dims);
...@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
// as GPU have consistent data type for each layer for now. // as GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op, // As DSP may have different data output type for each op,
// we stick to the same concept. // we stick to the same concept.
for (auto &op: net_def.op()) { for (auto &op : net_def.op()) {
if (op.has_mem_id()) { if (op.has_mem_id()) {
const DataType op_dtype = static_cast<DataType>( const DataType op_dtype = static_cast<DataType>(
ArgumentHelper::GetSingleArgument<OperatorDef, int>( ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, op, "T", static_cast<int>(DT_FLOAT)));
"T",
static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) { if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype; dtype = op_dtype;
// find first valid data type, break // find first valid data type, break
...@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
} }
} }
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) { for (auto &mem_block : net_def.mem_arena().mem_block()) {
std::unique_ptr<BufferBase> std::unique_ptr<BufferBase> image_buf(
image_buf(new Image({mem_block.x(), mem_block.y()}, dtype)); new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
} }
VLOG(3) << "Preallocate image to tensors"; VLOG(3) << "Preallocate image to tensors";
for (auto &op: net_def.op()) { for (auto &op : net_def.op()) {
if (op.has_mem_id()) { if (op.has_mem_id()) {
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor(
(new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype)); new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
tensor->SetSourceOpName(op.name()); tensor->SetSourceOpName(op.name());
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << "; Mem: " VLOG(3)
<< op.mem_id() << "; Image shape: " << "Tensor: " << op.name() << "(" << op.type() << ")"
<< "; Mem: " << op.mem_id() << "; Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0] << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
<< ", " << ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[1]; << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor_map_[op.output(0)] = std::move(tensor); tensor_map_[op.output(0)] = std::move(tensor);
} }
} }
......
...@@ -5,9 +5,9 @@ ...@@ -5,9 +5,9 @@
#ifndef MACE_CORE_WORKSPACE_H_ #ifndef MACE_CORE_WORKSPACE_H_
#define MACE_CORE_WORKSPACE_H_ #define MACE_CORE_WORKSPACE_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/preallocated_pooled_allocator.h"
namespace mace { namespace mace {
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#define MACE_KERNELS_ACTIVATION_H_ #define MACE_KERNELS_ACTIVATION_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -99,15 +99,13 @@ void PReLUActivation(const T *input_ptr, ...@@ -99,15 +99,13 @@ void PReLUActivation(const T *input_ptr,
output_ptr[i] = in; output_ptr[i] = in;
} }
} }
} }
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ActivationFunctor { class ActivationFunctor {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), : activation_(type), relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit){}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
...@@ -118,9 +116,11 @@ class ActivationFunctor { ...@@ -118,9 +116,11 @@ class ActivationFunctor {
if (activation_ == PRELU) { if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha); MACE_CHECK_NOTNULL(alpha);
const T *alpha_ptr = alpha->data<T>(); const T *alpha_ptr = alpha->data<T>();
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, output_ptr); PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
output_ptr);
} else { } else {
DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_); DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
} }
} }
...@@ -131,14 +131,16 @@ class ActivationFunctor { ...@@ -131,14 +131,16 @@ class ActivationFunctor {
template <> template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()( void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future); const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
template <typename T> template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> { class ActivationFunctor<DeviceType::OPENCL, T> {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), : activation_(type), relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit){}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
......
...@@ -18,7 +18,7 @@ namespace mace { ...@@ -18,7 +18,7 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
constexpr int kCostPerGroup = 1024; constexpr int kCostPerGroup = 1024;
} // namespace } // namespace
template <DeviceType D, typename T> template <DeviceType D, typename T>
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#endif #endif
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -24,7 +24,7 @@ struct BatchNormFunctorBase { ...@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
const float relux_max_limit) const float relux_max_limit)
: folded_constant_(folded_constant), : folded_constant_(folded_constant),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit){} relux_max_limit_(relux_max_limit) {}
const bool folded_constant_; const bool folded_constant_;
const ActivationType activation_; const ActivationType activation_;
...@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { ...@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase( : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
...@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase( : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#define MACE_KERNELS_BIAS_ADD_H_ #define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -32,7 +32,6 @@ struct BiasAddFunctor { ...@@ -32,7 +32,6 @@ struct BiasAddFunctor {
const T *bias_ptr = bias->data<T>(); const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for collapse(4) #pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) { for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) { for (index_t h = 0; h < height; ++h) {
...@@ -44,7 +43,6 @@ struct BiasAddFunctor { ...@@ -44,7 +43,6 @@ struct BiasAddFunctor {
} }
} }
} }
} }
}; };
......
...@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase { ...@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
bool i2b_; bool i2b_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase{ struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{ ...@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
} }
}; };
template<typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -16,8 +16,10 @@ class ChannelShuffleFunctor { ...@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
public: public:
ChannelShuffleFunctor(const int group) : group_(group) {} ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape, void operator()(const T *input,
T *output, StatsFuture *future) { const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0]; index_t batch = input_shape[0];
index_t channels = input_shape[1]; index_t channels = input_shape[1];
index_t height = input_shape[2]; index_t height = input_shape[2];
......
...@@ -6,23 +6,23 @@ ...@@ -6,23 +6,23 @@
#define MACE_KERNELS_CONCAT_H_ #define MACE_KERNELS_CONCAT_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ConcatFunctorBase { struct ConcatFunctorBase {
ConcatFunctorBase(const int32_t axis): axis_(axis){} ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase { struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){} ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
...@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
} }
}; };
template<typename T> template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){} ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, StatsFuture *future); Tensor *output,
StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
} // namepsace kernels } // namepsace kernels
......
...@@ -116,8 +116,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -116,8 +116,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
sum[sum_idx] += vaddvq_f32(tmp); sum[sum_idx] += vaddvq_f32(tmp);
#else #else
for (int inci = 0; inci < inc_tile_size; ++inci) { for (int inci = 0; inci < inc_tile_size; ++inci) {
sum[sum_idx] += sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
in[in_idx * inc_tile_size + inci] *
weights[weights_idx * inc_tile_size + inci]; weights[weights_idx * inc_tile_size + inci];
} }
#endif #endif
...@@ -188,7 +187,7 @@ struct Conv2dFunctorBase { ...@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit){} relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w] const int *strides_; // [stride_h, stride_w]
const Padding padding_type_; const Padding padding_type_;
...@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(), CalcOutputSize(input->shape().data(), filter->shape().data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
output->Resize(output_shape); output->Resize(output_shape);
......
...@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC ...@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
*/ */
output_shape[0] = input_shape[0]; output_shape[0] = input_shape[0];
if (round_type == FLOOR) { if (round_type == FLOOR) {
output_shape[1] = static_cast<index_t>(std::floor(1.0 * (input_shape[1] + padding_size[0] output_shape[1] = static_cast<index_t>(
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1); std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
output_shape[2] = static_cast<index_t>(std::floor(1.0 * (input_shape[2] + padding_size[1] (filter_shape[0] - 1) * (dilations[0] - 1)) /
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1); strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} else { } else {
output_shape[1] = static_cast<index_t>(std::ceil(1.0 * (input_shape[1] + padding_size[0] output_shape[1] = static_cast<index_t>(
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1); std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
output_shape[2] = static_cast<index_t>(std::ceil(1.0 * (input_shape[2] + padding_size[1] (filter_shape[0] - 1) * (dilations[0] - 1)) /
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1); strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} }
output_shape[3] = filter_shape[2]; output_shape[3] = filter_shape[2];
} }
void CalPaddingSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW
......
...@@ -15,7 +15,7 @@ enum Padding { ...@@ -15,7 +15,7 @@ enum Padding {
FULL = 2, // Pads with one less than the filter size on both sides FULL = 2, // Pads with one less than the filter size on both sides
}; };
enum RoundType{ enum RoundType {
FLOOR = 0, FLOOR = 0,
CEIL = 1, CEIL = 1,
}; };
......
...@@ -10,9 +10,9 @@ ...@@ -10,9 +10,9 @@
#endif #endif
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase { ...@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit){} relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w] const int *strides_; // [stride_h, stride_w]
const Padding padding_type_; const Padding padding_type_;
...@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { ...@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(), CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
auto input_shape = fake_filter_shape; auto input_shape = fake_filter_shape;
output->Resize(output_shape); output->Resize(output_shape);
......
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
#define MACE_KERNELS_ELTWISE_H_ #define MACE_KERNELS_ELTWISE_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
enum EltwiseType{ enum EltwiseType {
PROD = 0, PROD = 0,
SUM = 1, SUM = 1,
MAX = 2, MAX = 2,
...@@ -19,8 +19,7 @@ enum EltwiseType{ ...@@ -19,8 +19,7 @@ enum EltwiseType{
}; };
struct EltwiseFunctorBase { struct EltwiseFunctorBase {
EltwiseFunctorBase(const EltwiseType type, EltwiseFunctorBase(const EltwiseType type, const std::vector<float> &coeff)
const std::vector<float> &coeff)
: type_(type), coeff_(coeff) {} : type_(type), coeff_(coeff) {}
EltwiseType type_; EltwiseType type_;
...@@ -29,8 +28,7 @@ struct EltwiseFunctorBase { ...@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase { struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {} : EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0, void operator()(const Tensor *input0,
...@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
switch (type_) { switch (type_) {
case PROD: case PROD:
#pragma omp parallel for #pragma omp parallel for
for(index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = input0_ptr[i] * input1_ptr[i]; output_ptr[i] = input0_ptr[i] * input1_ptr[i];
} }
break; break;
...@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} else { } else {
#pragma omp parallel for #pragma omp parallel for
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i]; output_ptr[i] =
coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
} }
} }
break; break;
case MAX: case MAX:
#pragma omp parallel for #pragma omp parallel for
for(index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]); output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]);
} }
break; break;
case MIN: case MIN:
#pragma omp parallel for #pragma omp parallel for
for(index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]); output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]);
} }
break; break;
...@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} }
}; };
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T>: EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {} : EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0, void operator()(const Tensor *input0,
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#define MACE_KERNELS_FULLY_CONNECTED_H_ #define MACE_KERNELS_FULLY_CONNECTED_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
namespace mace { namespace mace {
...@@ -16,25 +16,23 @@ namespace kernels { ...@@ -16,25 +16,23 @@ namespace kernels {
struct FullyConnectedBase { struct FullyConnectedBase {
FullyConnectedBase(const ActivationType activation, FullyConnectedBase(const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: activation_(activation), : activation_(activation), relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit){}
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase { struct FullyConnectedFunctor : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) : const float relux_max_limit)
FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
output->Resize(output_shape); output->Resize(output_shape);
const index_t N = output->dim(0); const index_t N = output->dim(0);
...@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase { ...@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
} }
}; };
template<typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) : const float relux_max_limit)
FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
......
...@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor { ...@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {
template <> template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()( void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, const float *input,
float *output, StatsFuture *future); const index_t *input_shape,
float *output,
StatsFuture *future);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -6,20 +6,18 @@ ...@@ -6,20 +6,18 @@
#define MACE_KERNELS_MATMUL_H_ #define MACE_KERNELS_MATMUL_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct MatMulFunctor { struct MatMulFunctor {
void operator()(const Tensor *A, void operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
C->Resize(c_shape); C->Resize(c_shape);
const index_t N = C->dim(0); const index_t N = C->dim(0);
...@@ -52,7 +50,6 @@ struct MatMulFunctor { ...@@ -52,7 +50,6 @@ struct MatMulFunctor {
} }
}; };
template <typename T> template <typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> { struct MatMulFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *A, void operator()(const Tensor *A,
......
...@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()( ...@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (index_t i = 0; i < n; ++i) { for (index_t i = 0; i < n; ++i) {
for (index_t j = 0; j < sample_size; ++j) { for (index_t j = 0; j < sample_size; ++j) {
const float *input_sample_ptr = input_ptr + (i * sample_size + j) * channel; const float *input_sample_ptr =
input_ptr + (i * sample_size + j) * channel;
float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel; float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
const float *new_scale_ptr = new_scale.data(); const float *new_scale_ptr = new_scale.data();
const float *new_offset_ptr = new_offset.data(); const float *new_offset_ptr = new_offset.data();
......
...@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4); std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2); std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize( kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_, input->shape().data(), filter->shape().data(), dilations_, strides_,
strides_, paddings_, output_shape_vec.data(), paddings.data()); paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec); output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)( typedef void (*Conv2dNeonFunction)(
...@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
auto output_shape = output->shape().data(); auto output_shape = output->shape().data();
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_data, input_shape, filter_data, nullptr, conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
bias_data, output_data, output_shape); output_data, output_shape);
} }
} // namespace kernels } // namespace kernels
......
...@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW ...@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
int input_channels = input_shape[1]; int input_channels = input_shape[1];
int input_height = input_shape[2]; int input_height = input_shape[2];
int input_width = input_shape[3]; int input_width = input_shape[3];
int multiplier = int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
filter_shape == nullptr ? 0 : filter_shape[0]; int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) { for (int b = 0; b < output_batch; ++b) {
for (int oc = 0; oc < output_channels; ++oc) { for (int oc = 0; oc < output_channels; ++oc) {
...@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW ...@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
int input_channels = input_shape[1]; int input_channels = input_shape[1];
int input_height = input_shape[2]; int input_height = input_shape[2];
int input_width = input_shape[3]; int input_width = input_shape[3];
int multiplier = int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
filter_shape == nullptr ? 0 : filter_shape[0]; int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) { for (int b = 0; b < output_batch; ++b) {
......
...@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()( ...@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, DepthwiseConv2dFunctor<DeviceType::CPU, float>(
dilations_)( strides_, paddings_, dilations_)(input, filter, bias, output, future);
input, filter, bias, output, future);
return; return;
} }
...@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()( ...@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
input_shape = padded_input.shape().data(); input_shape = padded_input.shape().data();
} }
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr, output_ptr, conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
output_shape); output_ptr, output_shape);
} }
} // namespace kernels } // namespace kernels
......
...@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
default: default:
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
runtime->BuildKernel("activation", kernel_name, built_options);
int idx = 0; int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) { if (activation_ == PRELU) {
...@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(2), output->dim(3)); output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
} }
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include "mace/kernels/addn.h" #include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
for (auto input : input_tensors) { for (auto input : input_tensors) {
kernel_.setArg(idx++, kernel_.setArg(idx++, *(input->opencl_image()));
*(input->opencl_image()));
} }
kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image()));
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(width_pixels), static_cast<uint32_t>(batch_height_pixels)};
static_cast<uint32_t>(batch_height_pixels)
};
const std::vector<uint32_t> lws = {64, 16, 1}; const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "addn_opencl_kernel_" ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< output_shape[0] << "_" << "_" << output_shape[2] << "_" << output_shape[3];
<< output_shape[1] << "_"
<< output_shape[2] << "_"
<< output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; };
template template struct AddNFunctor<DeviceType::OPENCL, float>;
struct AddNFunctor<DeviceType::OPENCL, float>;
template template struct AddNFunctor<DeviceType::OPENCL, half>;
struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
runtime->BuildKernel("batch_norm", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, kernel_.setArg(idx++, *(offset->opencl_image()));
*(offset->opencl_image()));
if (!folded_constant_) { if (!folded_constant_) {
kernel_.setArg(idx++, kernel_.setArg(idx++, *(mean->opencl_image()));
*(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image())); kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon); kernel_.setArg(idx++, epsilon);
} }
......
...@@ -12,8 +12,7 @@ namespace mace { ...@@ -12,8 +12,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()( void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
...@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template template struct BiasAddFunctor<DeviceType::OPENCL, float>;
struct BiasAddFunctor<DeviceType::OPENCL, float>; template struct BiasAddFunctor<DeviceType::OPENCL, half>;
template
struct BiasAddFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -9,36 +9,33 @@ ...@@ -9,36 +9,33 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
const BufferType type, Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
Tensor *image,
StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape); CalImage2DShape(buffer->shape(), type, image_shape);
if(type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape); image->ResizeImage(new_shape, image_shape);
} else { } else {
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
} }
} else { } else {
Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer()); Image *image_buf = dynamic_cast<Image *>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape(); image_shape = image_buf->image_shape();
buffer->Resize(image->shape()); buffer->Resize(image->shape());
} }
size_t gws[2] = {image_shape[0], size_t gws[2] = {image_shape[0], image_shape[1]};
image_shape[1]};
std::string kernel_name; std::string kernel_name;
switch (type) { switch (type) {
case CONV2D_FILTER: case CONV2D_FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image"; kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break; break;
case DW_CONV2D_FILTER: case DW_CONV2D_FILTER:
kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image"; kernel_name =
i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
break; break;
case IN_OUT_CHANNEL: case IN_OUT_CHANNEL:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
...@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break; break;
case IN_OUT_HEIGHT: case IN_OUT_HEIGHT:
case WEIGHT_HEIGHT: case WEIGHT_HEIGHT:
kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image"; kernel_name = i2b_ ? "in_out_height_image_to_buffer"
: "in_out_height_buffer_to_image";
break; break;
case IN_OUT_WIDTH: case IN_OUT_WIDTH:
MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now"; MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
...@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER:
gws[1] /= 16; gws[1] /= 16;
kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image"; kernel_name = i2b_ ? "winograd_filter_image_to_buffer"
: "winograd_filter_buffer_to_image";
break; break;
} }
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
...@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
auto b2f_kernel = runtime->BuildKernel("buffer_to_image", auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, obfuscated_kernel_name, built_options);
built_options);
uint32_t idx = 0; uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (!i2b_) { if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned"); MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype()))); "buffer offset not aligned");
b2f_kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
} }
if (type == ARGUMENT) { if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) { } else if (type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, 1); b2f_kernel.setArg(idx++, 1);
...@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const std::vector<uint32_t> lws = {16, 64}; const std::vector<uint32_t> lws = {16, 64};
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event);
cl::NDRange(lws[0], lws[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) { if (future != nullptr) {
......
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
inline DATA_TYPE4 do_activation(DATA_TYPE4 in, inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#ifdef USE_PRELU #ifdef USE_PRELU
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include "mace/kernels/concat.h" #include "mace/kernels/concat.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel, ...@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input0->opencl_image()))); kernel->setArg(idx++,
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input1->opencl_image()))); *(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3))); kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->opencl_image()))); kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
} }
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_opencl_kernel_" ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
} }
...@@ -97,22 +96,20 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -97,22 +96,20 @@ static void ConcatN(cl::Kernel *kernel,
index_t input_channel_blk = input->dim(3) / 4; index_t input_channel_blk = input->dim(3) / 4;
chan_blk_offset += input_channel_blk; chan_blk_offset += input_channel_blk;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_n_opencl_kernel_" ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< input_channel_blk << "_"
<< width << "_"
<< batch * height; << batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
} }
} }
template<typename T> template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list, void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const int inputs_count = input_list.size(); const int inputs_count = input_list.size();
...@@ -137,7 +134,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te ...@@ -137,7 +134,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
} }
output_shape[axis_] += input->dim(axis_); output_shape[axis_] += input->dim(axis_);
} }
MACE_CHECK(inputs_count == 2 || divisible_four, MACE_CHECK(
inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."); "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
...@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te ...@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future); ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
} } else {
else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
} }
}; };
template template struct ConcatFunctor<DeviceType::OPENCL, float>;
struct ConcatFunctor<DeviceType::OPENCL, float>; template struct ConcatFunctor<DeviceType::OPENCL, half>;
template
struct ConcatFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
template<typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)( typedef void (*Conv2dOpenclFunction)(
cl::Kernel *kernel, cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, const Tensor *bias, const int stride, const int *padding,
const int *padding, const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt, Tensor *output,
Tensor *output, StatsFuture *future); StatsFuture *future);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = static const Conv2dOpenclFunction selector[5] = {
{Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
...@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(), CalcOutputSize(input->shape().data(), filter->shape().data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
...@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) { selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
relux_max_limit_, DataTypeToEnum<T>::value, output, future); dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
activation_, relux_max_limit_, DataTypeToEnum<T>::value, output, future); dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
} }
} }
template template struct Conv2dFunctor<DeviceType::OPENCL, float>;
struct Conv2dFunctor<DeviceType::OPENCL, float>; template struct Conv2dFunctor<DeviceType::OPENCL, half>;
template
struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg(idx++, kernel->setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel->setArg(idx++, kernel->setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
// FIXME handle flexable data type: half not supported // FIXME handle flexable data type: half not supported
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input_height)); kernel->setArg(idx++, static_cast<int>(input_height));
...@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
} }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg(idx++, kernel->setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel->setArg(idx++, kernel->setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input->dim(1))); kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2))); kernel->setArg(idx++, static_cast<int>(input->dim(2)));
......
...@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
runtime->BuildKernel("conv_2d", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg(idx++, kernel->setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel->setArg(idx++, kernel->setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1))); kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2))); kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
......
...@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
if(kernel->get() == nullptr) { if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0); const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1); const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2); const index_t input_width = input->dim(2);
...@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
*kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); *kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg( kernel->setArg(idx++, *(filter->opencl_image()));
idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg( kernel->setArg(idx++, *(bias->opencl_image()));
idx++, *(bias->opencl_image()));
} }
kernel->setArg( kernel->setArg(idx++, *(output->opencl_image()));
idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<short>(input_height)); kernel->setArg(idx++, static_cast<short>(input_height));
kernel->setArg(idx++, static_cast<short>(input_width)); kernel->setArg(idx++, static_cast<short>(input_width));
...@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(), CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, output, future);
} }
......
...@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input0->dim(0); const index_t batch = input0->dim(0);
const index_t height = input0->dim(1); const index_t height = input0->dim(1);
const index_t width = input0->dim(2); const index_t width = input0->dim(2);
...@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, kernel_.setArg(idx++, *(input0->opencl_image()));
*(input0->opencl_image())); kernel_.setArg(idx++, *(input1->opencl_image()));
kernel_.setArg(idx++,
*(input1->opencl_image()));
if (!coeff_.empty()) { if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]); kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]); kernel_.setArg(idx++, coeff_[1]);
...@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(width_pixels), static_cast<uint32_t>(batch_height_pixels)};
static_cast<uint32_t>(batch_height_pixels)
};
const std::vector<uint32_t> lws = {64, 16, 1}; const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "eltwise_opencl_kernel_" ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
...@@ -10,14 +10,13 @@ ...@@ -10,14 +10,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
...@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
default: default:
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = runtime->BuildKernel("fully_connected", kernel_name, built_options); kernel_ =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, kernel_.setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++,
*(weight->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel_.setArg(idx++, kernel_.setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel_.setArg(idx++, kernel_.setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(input->dim(1))); kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2))); kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(input->dim(3))); kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
...@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
static_cast<uint32_t>(output_blocks),
}; };
const std::vector<uint32_t> lws = {16, 64, 1}; const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss; std::stringstream ss;
ss << "fc_opencl_kernel_" ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(0) << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; };
template template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
// //
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -28,7 +28,8 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */ ...@@ -28,7 +28,8 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
} }
// [H * W * M, (Ic + 3) / 4] // [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */ void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) { std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape.resize(2);
...@@ -47,7 +48,8 @@ void CalArgImageShape(const std::vector<index_t> &shape, ...@@ -47,7 +48,8 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// Only support 3x3 now // Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc] // [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(const std::vector<index_t> &shape, /* Oc, Ic, H, W*/ void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) { std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape.resize(2);
...@@ -115,15 +117,12 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -115,15 +117,12 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
} }
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) { const BufferType type) {
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1], 1}; return {16, shape[0], shape[1], 1};
}else if (type == IN_OUT_HEIGHT) { } else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
((shape[1] - 1) / 2) *
((shape[2] - 1) / 2);
return {16, shape[3], out_width, 1}; return {16, shape[3], out_width, 1};
} else { } else {
LOG(FATAL) << "Mace not supported yet."; LOG(FATAL) << "Mace not supported yet.";
...@@ -188,8 +187,8 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -188,8 +187,8 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size); local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(gws[2], local_ws[2] =
kwg_size / (local_ws[0] * local_ws[1])); std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return { return {
// TODO tuning these magic numbers // TODO tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 1}, {local_ws[0], local_ws[1], local_ws[2], 1},
...@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int { std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D"; MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
if (timer == nullptr) { if (timer == nullptr) {
uint32_t num_blocks = params[3]; uint32_t num_blocks = params[3];
const uint32_t block_size = gws[2] / num_blocks; const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
...@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) { if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros(); double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming(); timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]); uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
(*tuning_result)[3] = num_blocks; (*tuning_result)[3] = num_blocks;
const uint32_t block_size = gws[2] / num_blocks; const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
...@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
{kwg_size / 256, 256, 1}, {kwg_size / 256, 256, 1},
{kwg_size / 512, 512, 1}, {kwg_size / 512, 512, 1},
{kwg_size, 1, 1}, {kwg_size, 1, 1},
{1, kwg_size, 1} {1, kwg_size, 1}};
};
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int { std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d"; MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
if (timer == nullptr) { if (timer == nullptr) {
uint32_t num_blocks = params[2]; uint32_t num_blocks = params[2];
const uint32_t block_size = gws[1] / num_blocks; const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++; if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(0, i * block_size), cl::NDRange(params[0], params[1]), nullptr, &event);
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
} }
} else { } else {
timer->ClearTiming(); timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming(); timer->AccumulateTiming();
...@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) { if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros(); double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming(); timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]); uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
(*tuning_result)[2] = num_blocks; (*tuning_result)[2] = num_blocks;
const uint32_t block_size = gws[1] / num_blocks; const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++; if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming(); timer->AccumulateTiming();
...@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
return error; return error;
}; };
OpenCLProfilingTimer timer(&event); OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key, Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
lws, tuning_key, lws, params_generator, func, &timer);
params_generator,
func,
&timer);
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
...@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
} }
}; };
} }
} }
} // namespace kernels } // namespace kernels
......
...@@ -5,11 +5,11 @@ ...@@ -5,11 +5,11 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_ #ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/core/future.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
...@@ -72,12 +71,12 @@ inline bool LimitKernelTime() { ...@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
} }
namespace { namespace {
template<typename T> template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v; (*ss) << v;
} }
template<typename T, typename... Args> template <typename T, typename... Args>
void AppendToStream(std::stringstream *ss, void AppendToStream(std::stringstream *ss,
const std::string &delimiter, const std::string &delimiter,
T first, T first,
...@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss, ...@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
} }
} // namespace } // namespace
template<typename... Args> template <typename... Args>
std::string Concat(Args... args) { std::string Concat(Args... args) {
std::stringstream ss; std::stringstream ss;
AppendToStream(&ss, "_", args...); AppendToStream(&ss, "_", args...);
......
...@@ -11,12 +11,10 @@ namespace mace { ...@@ -11,12 +11,10 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()( void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape; std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape); CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
...@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, kernel_.setArg(idx++, *(B->opencl_image()));
*(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image())); kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height)); kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width)); kernel_.setArg(idx++, static_cast<int>(width));
...@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
}; };
const std::vector<uint32_t> lws = {16, 64, 1}; const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss; std::stringstream ss;
ss << "matmul_opencl_kernel_" ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(0) << "_" << C->dim(2) << "_" << C->dim(3);
<< C->dim(1) << "_"
<< C->dim(2) << "_"
<< C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; };
template template struct MatMulFunctor<DeviceType::OPENCL, float>;
struct MatMulFunctor<DeviceType::OPENCL, float>;
template template struct MatMulFunctor<DeviceType::OPENCL, half>;
struct MatMulFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -11,17 +11,15 @@ ...@@ -11,17 +11,15 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = { std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
kernels_[0], kernels_[1], input->dim(3)};
input->dim(3), input->dim(3)
};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
...@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height), static_cast<uint32_t>(batch * out_height),
}; };
std::vector<uint32_t> lws = {8, 16, 8, 1}; std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "pooling_opencl_kernel_" ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template struct PoolingFunctor<DeviceType::OPENCL, float>;
struct PoolingFunctor<DeviceType::OPENCL, float>; template struct PoolingFunctor<DeviceType::OPENCL, half>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/kernels/resize_bilinear.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape{batch, out_height, out_width, channels}; std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
BufferType::IN_OUT_CHANNEL,
output_image_shape); output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_); float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
...@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
...@@ -6,13 +6,13 @@ ...@@ -6,13 +6,13 @@
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "softmax_opencl_kernel_" ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
struct SoftmaxFunctor<DeviceType::OPENCL, float>; template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
template
struct SoftmaxFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -5,17 +5,18 @@ ...@@ -5,17 +5,18 @@
#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h" #include "mace/kernels/space_to_batch.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor, void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
Tensor *batch_tensor, Tensor *batch_tensor,
StatsFuture *future) { StatsFuture *future) {
...@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options); DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
if (b2s_) { if (b2s_) {
...@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
} }
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3)); const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
const uint32_t gws[3] = {chan_blk, const uint32_t gws[3] = {
static_cast<uint32_t>(batch_tensor->dim(2)), chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))}; static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << kernel_name << "_" ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(1) << "_"
<< batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3); << batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
...@@ -11,21 +11,21 @@ ...@@ -11,21 +11,21 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor, void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *output_tensor, const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
StatsFuture *future) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1}; std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(), strides_.data(), input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
padding_type_, output_shape.data(), paddings.data()); strides_.data(), padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(), CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
dilations_.data(), strides_.data(), RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
} }
const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_h = (output_shape[1] + 1) / 2;
...@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i ...@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
obfuscated_kernel_name,
built_options); built_options);
uint32_t idx = 0; uint32_t idx = 0;
...@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i ...@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
} }
const uint32_t gws[2] = {static_cast<uint32_t>(out_width), const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))}; static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const std::vector<uint32_t> lws = {128, 8, 1}; const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "winograd_transform_kernel_" ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
<< input_tensor->dim(3); << input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
} }
template<typename T> template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor, void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor,
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)}; std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name); built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value)); obfuscated_kernel_name);
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) { switch (activation_) {
case NOOP: case NOOP:
...@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te ...@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
obfuscated_kernel_name,
built_options); built_options);
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->opencl_image()))); kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
if (bias != nullptr) { if (bias != nullptr) {
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->opencl_image()))); kernel_.setArg(idx++,
*(static_cast<const cl::Image2D *>(bias->opencl_image())));
} }
kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image()))); kernel_.setArg(
idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
...@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te ...@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
} }
const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)), const uint32_t gws[2] = {
static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))}; static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
const std::vector<uint32_t> lws = {128, 8, 1}; const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "winograd_inverse_transform_kernel_" ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
<< input_tensor->dim(3); << input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
struct WinogradTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
#include <limits> #include <limits>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
...@@ -42,7 +42,7 @@ struct PoolingFunctorBase { ...@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
const int *dilations_; const int *dilations_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase { struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(const PoolingType pooling_type,
const int *kernels, const int *kernels,
...@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase(pooling_type, kernels, : PoolingFunctorBase(
strides, padding_type, pooling_type, kernels, strides, padding_type, paddings, dilations) {
paddings, dilations) {} }
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = { std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1], kernels_[0], kernels_[1], input_tensor->dim(3), input_tensor->dim(3)};
input_tensor->dim(3), input_tensor->dim(3)
};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_, strides_, input_tensor->shape().data(), filter_shape.data(), dilations_,
padding_type_, output_shape.data(), paddings.data()); strides_, padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(), CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
} }
output_tensor->Resize(output_shape); output_tensor->Resize(output_shape);
...@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c; index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c; index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest(); T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
res = std::max(res, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
...@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c; index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c; index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
...@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
...@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
} }
} }
} }
}; };
template<> template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);
Tensor *output_tensor,
StatsFuture *future);
template<typename T> template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(const PoolingType pooling_type,
const int *kernels, const int *kernels,
...@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase(pooling_type, kernels, : PoolingFunctorBase(
strides, padding_type, pooling_type, kernels, strides, padding_type, paddings, dilations) {
paddings, dilations) {} }
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future); StatsFuture *future);
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESHAPE_H_ #define MACE_KERNELS_RESHAPE_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -25,7 +25,6 @@ struct ReshapeFunctor { ...@@ -25,7 +25,6 @@ struct ReshapeFunctor {
} }
}; };
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_ #define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { ...@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
} }
}; };
template<typename T> template <typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase { struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
: ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {} : ResizeBilinearFunctorBase(size, align_corners) {}
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#define MACE_KERNELS_CONV_2D_H_ #define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -16,11 +16,10 @@ namespace kernels { ...@@ -16,11 +16,10 @@ namespace kernels {
struct SpaceToBatchFunctorBase { struct SpaceToBatchFunctorBase {
SpaceToBatchFunctorBase(const std::vector<int> &paddings, SpaceToBatchFunctorBase(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s): bool b2s)
paddings_(paddings.begin(), paddings.end()), : paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()), block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s) b2s_(b2s) {}
{}
std::vector<int> paddings_; std::vector<int> paddings_;
std::vector<int> block_shape_; std::vector<int> block_shape_;
...@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase { ...@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){} bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor, void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
...@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{ ...@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
}; };
template <typename T> template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){} bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor, void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
...@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{ ...@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_ #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/kernels/conv_pool_2d_util.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -17,8 +17,10 @@ namespace kernels { ...@@ -17,8 +17,10 @@ namespace kernels {
struct WinogradTransformFunctorBase { struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type, WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
: strides_({1, 1}), dilations_({1, 1}), : strides_({1, 1}),
padding_type_(padding_type), paddings_(paddings) {} dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings) {}
const std::vector<int> strides_; // [stride_h, stride_w] const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w] const std::vector<int> dilations_; // [dilation_h, dilation_w]
...@@ -26,29 +28,25 @@ struct WinogradTransformFunctorBase { ...@@ -26,29 +28,25 @@ struct WinogradTransformFunctorBase {
std::vector<int> paddings_; std::vector<int> paddings_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase { struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {} : WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input, void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
Tensor *output,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
}; };
template<typename T> template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFunctorBase { struct WinogradTransformFunctor<DeviceType::OPENCL, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {} : WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input, void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
...@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase { ...@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
const float relux_max_limit_; const float relux_max_limit_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch, WinogradInverseTransformFunctor(const int batch,
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {} : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
StatsFuture *future) { StatsFuture *future) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
}; };
template<typename T> template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
: WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch, WinogradInverseTransformFunctor(const int batch,
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {} : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
......
...@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> { ...@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr; const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->outputs_[0]; Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor); output_tensor->ResizeLike(input_tensor);
......
...@@ -214,9 +214,7 @@ void TestSimplePrelu() { ...@@ -214,9 +214,7 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
"Alpha", {2},
{2.0, 3.0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
...@@ -250,7 +248,8 @@ void TestSimplePrelu() { ...@@ -250,7 +248,8 @@ void TestSimplePrelu() {
} }
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0}); {2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
......
...@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> { ...@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
inputs[i] = this->Input(i); inputs[i] = this->Input(i);
MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size()); MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
MACE_CHECK(inputs[0]->size() == inputs[i]->size()) << "Input 0: " MACE_CHECK(inputs[0]->size() == inputs[i]->size())
<< MakeString(inputs[0]->shape()) << "Input 0: " << MakeString(inputs[0]->shape())
<< ", size: " << inputs[0]->size() << ", size: " << inputs[0]->size() << ". Input " << i << ": "
<< ". Input " << i << ": " << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
<< MakeString(inputs[i]->shape())
<< ", size: " << inputs[i]->size();
} }
functor_(inputs, output_tensor, future); functor_(inputs, output_tensor, future);
......
...@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
{n, h, w, c});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
......
...@@ -12,13 +12,12 @@ ...@@ -12,13 +12,12 @@
namespace mace { namespace mace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> { class BatchToSpaceNDOp : public Operator<D, T> {
public: public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_( functor_(OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}), OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
true) {} true) {}
...@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(batch_tensor, space_tensor, output_shape.data()); CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor), future); functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
future);
return true; return true;
} }
...@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
Tensor *output, Tensor *output,
index_t *output_shape) { index_t *output_shape) {
auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}); auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}); auto block_shape =
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D"); MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D"); MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D");
...@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
const index_t block_dims = block_shape.size(); const index_t block_dims = block_shape.size();
index_t block_shape_product = 1; index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) { for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1"); MACE_CHECK(block_shape[block_dim] > 1,
"block_shape's value should be great to 1");
const index_t block_shape_value = block_shape[block_dim]; const index_t block_shape_value = block_shape[block_dim];
const index_t cropped_input_size = input_tensor->dim(block_dim + 1) * block_shape_value const index_t cropped_input_size =
- crops[block_dim * 2] input_tensor->dim(block_dim + 1) * block_shape_value -
- crops[block_dim * 2 + 1]; crops[block_dim * 2] - crops[block_dim * 2 + 1];
MACE_CHECK(cropped_input_size >= 0, MACE_CHECK(cropped_input_size >= 0, "cropped size must be non-negative");
"cropped size must be non-negative");
block_shape_product *= block_shape_value; block_shape_product *= block_shape_value;
output_shape[block_dim + 1] = cropped_input_size; output_shape[block_dim + 1] = cropped_input_size;
} }
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> { class BufferToImageOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws) BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, ws) {}
...@@ -19,7 +19,8 @@ class BufferToImageOp: public Operator<D, T> { ...@@ -19,7 +19,8 @@ class BufferToImageOp: public Operator<D, T> {
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>( kernels::BufferType type =
static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER))); "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
......
...@@ -7,8 +7,9 @@ ...@@ -7,8 +7,9 @@
using namespace mace; using namespace mace;
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) { void TestBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input") .Input("Input")
...@@ -34,7 +35,8 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -34,7 +35,8 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
net.RunOp(D); net.RunOp(D);
// Check // Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-5); ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
1e-5);
} }
TEST(BufferToImageTest, ArgSmall) { TEST(BufferToImageTest, ArgSmall) {
...@@ -54,51 +56,63 @@ TEST(BufferToImageTest, ArgLarge) { ...@@ -54,51 +56,63 @@ TEST(BufferToImageTest, ArgLarge) {
} }
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 3}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 2, 3, 3}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedia) { TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 64, 64, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 3, 5}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{1, 1, 3, 5});
} }
TEST(BufferToImageTest, Filter1x1Media) { TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 13, 17}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{1, 1, 13, 17});
} }
TEST(BufferToImageTest, Filter1x1Large) { TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 128, 512}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{1, 1, 128, 512});
} }
TEST(BufferToImageTest, Filter3x3Small) { TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 3, 5}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{3, 3, 3, 5});
} }
TEST(BufferToImageTest, Filter3x3Meida) { TEST(BufferToImageTest, Filter3x3Meida) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 13, 17}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{3, 3, 13, 17});
} }
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 128, 256}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
{3, 3, 128, 256});
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) { void TestDiffTypeBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input") .Input("Input")
...@@ -123,14 +137,16 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> ...@@ -123,14 +137,16 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t>
net.RunOp(D); net.RunOp(D);
// Check // Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2); ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
1e-2);
} }
TEST(BufferToImageTest, ArgFloatToHalfSmall) { TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11}); TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
{11});
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type, void TestStringHalfBidirectionTransform(const int type,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const unsigned char *input_data) { const unsigned char *input_data) {
...@@ -142,9 +158,10 @@ void TestStringHalfBidirectionTransform(const int type, ...@@ -142,9 +158,10 @@ void TestStringHalfBidirectionTransform(const int type,
.AddIntArg("T", DataTypeToEnum<T>::value) .AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
const half *h_data = reinterpret_cast<const half*>(input_data); const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape, std::vector<half>(h_data, h_data+2)); net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2));
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -160,12 +177,14 @@ void TestStringHalfBidirectionTransform(const int type, ...@@ -160,12 +177,14 @@ void TestStringHalfBidirectionTransform(const int type,
net.RunOp(D); net.RunOp(D);
// Check // Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2); ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
1e-2);
} }
TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {0xCD, 0x3C, 0x33, 0x40,}; const unsigned char input_data[] = {
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, 0xCD, 0x3C, 0x33, 0x40,
{2}, };
input_data); TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
kernels::ARGUMENT, {2}, input_data);
} }
...@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> { ...@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
input->shape()[1]); input->shape()[1]);
output->ResizeLike(input); output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(), functor_(input->data<T>(), input->shape().data(), output->mutable_data<T>(),
output->mutable_data<T>(), future); future);
return true; return true;
} }
......
...@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> { ...@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
public: public:
ConcatOp(const OperatorDef &op_def, Workspace *ws) ConcatOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){} functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat"; MACE_CHECK(this->InputSize() >= 2)
<< "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs(); const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3); const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
const int32_t input_dims = input_list[0]->dim_size(); const int32_t input_dims = input_list[0]->dim_size();
......
...@@ -38,8 +38,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) { ...@@ -38,8 +38,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
} }
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \ #define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1( \ static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \ ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \ } \
BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1) BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
...@@ -90,10 +89,8 @@ static void OpenclConcatHelper(int iters, ...@@ -90,10 +89,8 @@ static void OpenclConcatHelper(int iters,
} }
} }
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ #define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE( \ static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \ std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \ OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \ } \
......
...@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) { ...@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
concat_axis_size += input_shapes[i][axis]; concat_axis_size += input_shapes[i][axis];
GenerateRandomRealTypeData(input_shapes[i], inputs[i]); GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(MakeString("Input", i),
MakeString("Input", i), input_shapes[i], inputs[i]); input_shapes[i], inputs[i]);
} }
// Run // Run
...@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) { ...@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
} }
TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) { TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 32}, OpenclRandomTest<float>(
{3, 32, 32, 32}, {3, 32, 32, 32}}, 3); {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
} }
\ No newline at end of file
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include <fstream>
#include "mace/ops/conv_2d.h" #include "mace/ops/conv_2d.h"
#include <fstream>
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
using namespace mace; using namespace mace;
...@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); } ...@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); } TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int stride) { static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
const int stride) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) { Padding type) {
...@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s ...@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
} }
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
1); TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
2);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1);
1); TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7},
2);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3);
3); TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17},
4);
} }
template<DeviceType D> template <DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape, const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) { const std::vector<int> &dilations) {
...@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
{1, 1, 32, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64},
{3, 3, 32, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2},
{15, 1, 256, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2},
{1, 15, 256, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64},
{7, 7, 3, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7},
{1, 1, 5, 7},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7},
{3, 3, 5, 7},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16},
{5, 5, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16},
{7, 7, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16},
{7, 7, 16, 16},
{4, 4}); {4, 4});
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) { static void TestDilationConvNxN(const std::vector<index_t> &shape,
const int dilation_rate) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) { Padding type) {
...@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil ...@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, T>(net, "Input", "InputImage",
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil ...@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil ...@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
} }
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) { TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
2);
} }
TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) { TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
4);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) { TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
4);
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std::vector<int> &paddings) { static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) { auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
srand(time(NULL)); srand(time(NULL));
...@@ -698,9 +686,12 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std ...@@ -698,9 +686,12 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, T>(net, "Input", "InputImage",
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -714,7 +705,8 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std ...@@ -714,7 +705,8 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -726,8 +718,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std ...@@ -726,8 +718,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
} }
TEST_F(Conv2dOpTest, OPENCLAlignedPad1) { TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
{1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
...@@ -736,6 +727,5 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { ...@@ -736,6 +727,5 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
{4, 4});
} }
...@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> { ...@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
functor_(static_cast<kernels::EltwiseType>( functor_(static_cast<kernels::EltwiseType>(
OperatorBase::GetSingleArgument<int>( OperatorBase::GetSingleArgument<int>(
"type", static_cast<int>(kernels::EltwiseType::SUM))), "type", static_cast<int>(kernels::EltwiseType::SUM))),
OperatorBase::GetRepeatedArgument<float>("coeff")){} OperatorBase::GetRepeatedArgument<float>("coeff")) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0); const Tensor *input0 = this->Input(0);
const Tensor *input1 = this->Input(1); const Tensor *input1 = this->Input(1);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape"; MACE_CHECK(input0->dim_size() == input1->dim_size())
for(int i = 0; i < input0->dim_size(); ++i) { << "Inputs of Eltwise op must be same shape";
MACE_CHECK(input0->dim(i) == input1->dim(i)) << "Inputs of Eltwise op must be same shape"; for (int i = 0; i < input0->dim_size(); ++i) {
MACE_CHECK(input0->dim(i) == input1->dim(i))
<< "Inputs of Eltwise op must be same shape";
} }
output->ResizeLike(input0); output->ResizeLike(input0);
......
...@@ -2,15 +2,15 @@ ...@@ -2,15 +2,15 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/kernels/eltwise.h"
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/kernels/eltwise.h"
namespace mace { namespace mace {
class EltwiseOpTest : public OpsTestBase {}; class EltwiseOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void Simple(const kernels::EltwiseType type, void Simple(const kernels::EltwiseType type,
const std::vector<index_t> &shape, const std::vector<index_t> &shape,
const std::vector<float> &input0, const std::vector<float> &input0,
...@@ -36,8 +36,10 @@ void Simple(const kernels::EltwiseType type, ...@@ -36,8 +36,10 @@ void Simple(const kernels::EltwiseType type,
// Run // Run
net.RunOp(D); net.RunOp(D);
} else { } else {
BufferToImage<D, half>(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, half>(net, "Input1", "InputImg1",
BufferToImage<D, half>(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Input2", "InputImg2",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg1") .Input("InputImg1")
.Input("InputImg2") .Input("InputImg2")
...@@ -49,7 +51,8 @@ void Simple(const kernels::EltwiseType type, ...@@ -49,7 +51,8 @@ void Simple(const kernels::EltwiseType type,
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL); ImageToBuffer<D, float>(net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(shape, output); auto expected = CreateTensor<float>(shape, output);
...@@ -58,64 +61,42 @@ void Simple(const kernels::EltwiseType type, ...@@ -58,64 +61,42 @@ void Simple(const kernels::EltwiseType type,
} }
TEST_F(EltwiseOpTest, CPUSimple) { TEST_F(EltwiseOpTest, CPUSimple) {
Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36}); {1, 4, 9, 16, 25, 36});
Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12}); {2, 4, 6, 8, 10, 12});
Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6}, {3, 6, 9, 12, 15, 18}, {2, 1});
{1, 2, 3, 4, 5, 6}, Simple<DeviceType::CPU>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
{3, 6, 9, 12, 15, 18}, {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{2, 1});
Simple<DeviceType::CPU>(kernels::EltwiseType::MAX,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
{1, 2, 3, 4, 6, 6}); {1, 2, 3, 4, 6, 6});
Simple<DeviceType::CPU>(kernels::EltwiseType::MIN, Simple<DeviceType::CPU>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
{1, 1, 3, 3, 5, 6}); {1, 1, 3, 3, 5, 6});
} }
TEST_F(EltwiseOpTest, GPUSimple) { TEST_F(EltwiseOpTest, GPUSimple) {
Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD, Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36}); {1, 4, 9, 16, 25, 36});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12}); {2, 4, 6, 8, 10, 12});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
{1, 2, 3, 4, 5, 6}, {3, 6, 9, 12, 15, 18}, {2, 1});
{1, 2, 3, 4, 5, 6}, Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
{3, 6, 9, 12, 15, 18}, {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{2, 1});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX,
{1, 1, 2, 3},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
{1, 2, 3, 4, 6, 6}); {1, 2, 3, 4, 6, 6});
Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN, Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
{1, 2, 3, 4, 5, 6},
{1, 1, 3, 3, 6, 6},
{1, 1, 3, 3, 5, 6}); {1, 1, 3, 3, 5, 6});
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void RandomTest(const kernels::EltwiseType type, void RandomTest(const kernels::EltwiseType type,
const std::vector<index_t> &shape) { const std::vector<index_t> &shape) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
...@@ -139,8 +120,10 @@ void RandomTest(const kernels::EltwiseType type, ...@@ -139,8 +120,10 @@ void RandomTest(const kernels::EltwiseType type,
// Run // Run
net.RunOp(); net.RunOp();
BufferToImage<D, T>(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, T>(net, "Input1", "InputImg1",
BufferToImage<D, T>(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Input2", "InputImg2",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg1") .Input("InputImg1")
.Input("InputImg2") .Input("InputImg2")
...@@ -153,12 +136,15 @@ void RandomTest(const kernels::EltwiseType type, ...@@ -153,12 +136,15 @@ void RandomTest(const kernels::EltwiseType type,
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-3); ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3);
} else { } else {
ExpectTensorNear<float>(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-1); ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-1);
} }
} }
......
...@@ -7,22 +7,19 @@ ...@@ -7,22 +7,19 @@
namespace mace { namespace mace {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU) .Device(DeviceType::CPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::CPU, float>); FoldedBatchNormOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::OPENCL)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>); FoldedBatchNormOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::OPENCL)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
......
...@@ -17,7 +17,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma, ...@@ -17,7 +17,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
std::vector<float> &scale, std::vector<float> &scale,
std::vector<float> &offset) { std::vector<float> &offset) {
size_t size = gamma.size(); size_t size = gamma.size();
for (int i = 0 ; i < size; ++i) { for (int i = 0; i < size; ++i) {
scale[i] = gamma[i] / std::sqrt(var[i] + epsilon); scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
offset[i] = offset[i] - mean[i] * scale[i]; offset[i] = offset[i] - mean[i] * scale[i];
} }
......
...@@ -15,8 +15,7 @@ class FullyConnectedOp : public Operator<D, T> { ...@@ -15,8 +15,7 @@ class FullyConnectedOp : public Operator<D, T> {
public: public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, ws),
functor_( functor_(kernels::StringToActivationType(
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation", OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {} OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
......
...@@ -17,7 +17,8 @@ static void FCBenchmark( ...@@ -17,7 +17,8 @@ static void FCBenchmark(
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channel}); net.AddRandomInput<D, float>("Input", {batch, height, width, channel});
net.AddRandomInput<D, float>("Weight", {out_channel, height * width * channel}); net.AddRandomInput<D, float>("Weight",
{out_channel, height * width * channel});
net.AddRandomInput<D, float>("Bias", {out_channel}); net.AddRandomInput<D, float>("Bias", {out_channel});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
...@@ -58,9 +59,12 @@ static void FCBenchmark( ...@@ -58,9 +59,12 @@ static void FCBenchmark(
} }
#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ #define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE(int iters) { \ static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \ const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \ mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \ FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
......
...@@ -10,7 +10,7 @@ namespace mace { ...@@ -10,7 +10,7 @@ namespace mace {
class FullyConnectedOpTest : public OpsTestBase {}; class FullyConnectedOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void Simple(const std::vector<index_t> &input_shape, void Simple(const std::vector<index_t> &input_shape,
const std::vector<float> &input_value, const std::vector<float> &input_value,
const std::vector<index_t> &weight_shape, const std::vector<index_t> &weight_shape,
...@@ -58,83 +58,52 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -58,83 +58,52 @@ void Simple(const std::vector<index_t> &input_shape,
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(output_shape, output_value);
CreateTensor<float>(output_shape, output_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
// FullyConnected on CPU: fixed-value inputs with hand-computed expected
// outputs (input, input values, weight shape, weight values, bias shape,
// bias values, output shape, output values).
TEST_F(FullyConnectedOpTest, SimpleCPU) {
  Simple<DeviceType::CPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
                          {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
                          {206});
  Simple<DeviceType::CPU>(
      {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
      {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
  Simple<DeviceType::CPU>(
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
      {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
       4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
      {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
}
// FullyConnected on CPU with batch = 2: each batch element is reduced
// independently against the shared weight/bias.
TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
  Simple<DeviceType::CPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
                          {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}
// Same fixed-value cases as SimpleCPU, run through the OpenCL kernel so
// the two backends can be compared against identical expectations.
TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
  Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
                             {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
                             {206});
  Simple<DeviceType::OPENCL>(
      {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
      {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
  Simple<DeviceType::OPENCL>(
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
      {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
       4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
      {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
}
// Batched (batch = 2) FullyConnected on the OpenCL path; mirrors the
// SimpleCPUWithBatch case above.
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
  Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
                             {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}
template<typename T> template <typename T>
void Complex(const index_t batch, void Complex(const index_t batch,
const index_t height, const index_t height,
const index_t width, const index_t width,
...@@ -156,8 +125,7 @@ void Complex(const index_t batch, ...@@ -156,8 +125,7 @@ void Complex(const index_t batch,
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::OPENCL, float>(
"Weight", {out_channel, height * width * channels}); "Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
"Bias", {out_channel});
// run cpu // run cpu
net.RunOp(); net.RunOp();
...@@ -215,6 +183,4 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) { ...@@ -215,6 +183,4 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(16, 13, 12, 31, 113); Complex<half>(16, 13, 12, 31, 113);
Complex<half>(31, 21, 11, 23, 103); Complex<half>(31, 21, 11, 23, 103);
} }
} }
...@@ -511,8 +511,9 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) { ...@@ -511,8 +511,9 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64}); TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64});
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) { static void TestAtrousConvNxN(const std::vector<index_t> &shape,
const int dilation) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) { Padding type) {
...@@ -550,9 +551,12 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat ...@@ -550,9 +551,12 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, T>(net, "Input", "InputImage",
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -567,7 +571,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat ...@@ -567,7 +571,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
...@@ -591,7 +596,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) { ...@@ -591,7 +596,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2); TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
} }
template<DeviceType D> template <DeviceType D>
static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape, const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) { const std::vector<int> &dilations) {
...@@ -620,7 +625,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -620,7 +625,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>( net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels}); "Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
...@@ -632,9 +638,12 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -632,9 +638,12 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, half>(net, "Input", "InputImage",
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, half>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest") OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -649,7 +658,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -649,7 +658,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
}; };
...@@ -658,13 +668,11 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -658,13 +668,11 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
} }
// Fused atrous conv, half precision: 32x32 image, 7x7 kernel,
// 3 -> 16 channels, dilation 2.
TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
                                                {2, 2});
}
// Fused atrous conv, half precision: unaligned 63x71 image, 15x15 kernel,
// 16 -> 16 channels, dilation 2.
TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
                                                {2, 2});
}
// DMACE_ENABLE_NEON
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ImageToBufferOp: public Operator<D, T> { class ImageToBufferOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {} : Operator<D, T>(op_def, ws), functor_(true) {}
...@@ -20,7 +20,8 @@ class ImageToBufferOp: public Operator<D, T> { ...@@ -20,7 +20,8 @@ class ImageToBufferOp: public Operator<D, T> {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>( kernels::BufferType type =
static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER))); "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor), future); functor_(output, type, const_cast<Tensor *>(input_tensor), future);
return true; return true;
......
...@@ -10,7 +10,7 @@ namespace mace { ...@@ -10,7 +10,7 @@ namespace mace {
class MatMulOpTest : public OpsTestBase {}; class MatMulOpTest : public OpsTestBase {};
template<DeviceType D> template <DeviceType D>
void Simple(const std::vector<index_t> &A_shape, void Simple(const std::vector<index_t> &A_shape,
const std::vector<float> &A_value, const std::vector<float> &A_value,
const std::vector<index_t> &B_shape, const std::vector<index_t> &B_shape,
...@@ -51,29 +51,24 @@ void Simple(const std::vector<index_t> &A_shape, ...@@ -51,29 +51,24 @@ void Simple(const std::vector<index_t> &A_shape,
} }
// Check // Check
auto expected = auto expected = CreateTensor<float>(C_shape, C_value);
CreateTensor<float>(C_shape, C_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
// MatMul on CPU: a 2x3 * 3x2 product and a 5x5 * 5x5 product with
// hand-computed expected values.
TEST_F(MatMulOpTest, SimpleCPU) {
  Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
                          {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
  Simple<DeviceType::CPU>(
      {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
      {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
      {1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
                     650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
                     1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleCPUWithBatch) { TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
{2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
...@@ -81,19 +76,17 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) { ...@@ -81,19 +76,17 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
} }
// Same two MatMul cases as SimpleCPU, run on the OpenCL backend.
TEST_F(MatMulOpTest, SimpleOPENCL) {
  Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
                             {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1},
                             {22, 28, 49, 64});
  Simple<DeviceType::OPENCL>(
      {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
      {1, 5, 5, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
      {1, 5, 5, 1}, {215, 230, 245, 260, 275, 490, 530, 570, 610,
                     650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
                     1310, 1400, 1315, 1430, 1545, 1660, 1775});
}
TEST_F(MatMulOpTest, SimpleGPUWithBatch) { TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
...@@ -118,8 +111,8 @@ void Complex(const index_t batch, ...@@ -118,8 +111,8 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::OPENCL, float>("A",
"A", {batch, height, channels, 1}); {batch, height, channels, 1});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::OPENCL, float>(
"B", {batch, channels, out_width, 1}); "B", {batch, channels, out_width, 1});
...@@ -177,5 +170,4 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) { ...@@ -177,5 +170,4 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(16, 32, 64, 64); Complex<half>(16, 32, 64, 64);
Complex<half>(31, 31, 61, 67); Complex<half>(31, 31, 61, 67);
} }
} }
...@@ -95,7 +95,7 @@ class OpDefBuilder { ...@@ -95,7 +95,7 @@ class OpDefBuilder {
class OpsTestNet { class OpsTestNet {
public: public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {}; OpsTestNet() : op_registry_(new OperatorRegistry()){};
template <DeviceType D, typename T> template <DeviceType D, typename T>
void AddInputFromArray(const std::string &name, void AddInputFromArray(const std::string &name,
...@@ -334,9 +334,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> { ...@@ -334,9 +334,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
for (int h = 0; h < x.dim(1); ++h) { for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) { for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) { for (int c = 0; c < x.dim(3); ++c) {
EXPECT_NEAR(*a, *b, abs_err) << "with index = [" EXPECT_NEAR(*a, *b, abs_err) << "with index = [" << n << ", " << h
<< n << ", " << h << ", " << ", " << w << ", " << c << "]";
<< w << ", " << c << "]";
a++; a++;
b++; b++;
} }
......
...@@ -20,8 +20,12 @@ class PoolingOp : public ConvPool2dOpBase<D, T> { ...@@ -20,8 +20,12 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
pooling_type_( pooling_type_(
static_cast<PoolingType>(OperatorBase::GetSingleArgument<int>( static_cast<PoolingType>(OperatorBase::GetSingleArgument<int>(
"pooling_type", static_cast<int>(AVG)))), "pooling_type", static_cast<int>(AVG)))),
functor_(pooling_type_, kernels_.data(), this->strides_.data(), functor_(pooling_type_,
this->padding_type_, this->paddings_, this->dilations_.data()){}; kernels_.data(),
this->strides_.data(),
this->padding_type_,
this->paddings_,
this->dilations_.data()){};
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -198,7 +198,8 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -198,7 +198,8 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
...@@ -333,7 +334,8 @@ static void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -333,7 +334,8 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage<D, T>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest") OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
......
...@@ -15,7 +15,7 @@ class ReshapeOp : public Operator<D, T> { ...@@ -15,7 +15,7 @@ class ReshapeOp : public Operator<D, T> {
public: public:
ReshapeOp(const OperatorDef &op_def, Workspace *ws) ReshapeOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")){} shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -38,9 +38,11 @@ class ReshapeOp : public Operator<D, T> { ...@@ -38,9 +38,11 @@ class ReshapeOp : public Operator<D, T> {
} }
if (unknown_idx != -1) { if (unknown_idx != -1) {
MACE_CHECK(product != 0) << "Cannot infer shape if there is zero shape size."; MACE_CHECK(product != 0)
<< "Cannot infer shape if there is zero shape size.";
const index_t missing = input->size() / product; const index_t missing = input->size() / product;
MACE_CHECK(missing * product == input->size()) << "Input size not match reshaped tensor size"; MACE_CHECK(missing * product == input->size())
<< "Input size not match reshaped tensor size";
out_shape[unknown_idx] = missing; out_shape[unknown_idx] = missing;
} }
......
...@@ -13,7 +13,6 @@ class ReshapeTest : public OpsTestBase {}; ...@@ -13,7 +13,6 @@ class ReshapeTest : public OpsTestBase {};
void TestReshape(const std::vector<index_t> &org_shape, void TestReshape(const std::vector<index_t> &org_shape,
const std::vector<int> &output_shape, const std::vector<int> &output_shape,
const std::vector<index_t> &res_shape) { const std::vector<index_t> &res_shape) {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
OpDefBuilder("Reshape", "ReshapeTest") OpDefBuilder("Reshape", "ReshapeTest")
......
...@@ -14,11 +14,10 @@ template <DeviceType D, class T> ...@@ -14,11 +14,10 @@ template <DeviceType D, class T>
class SoftmaxOp : public Operator<D, T> { class SoftmaxOp : public Operator<D, T> {
public: public:
SoftmaxOp(const OperatorDef &operator_def, Workspace *ws) SoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) { : Operator<D, T>(operator_def, ws) {}
}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *logits= this->Input(LOGITS); const Tensor *logits = this->Input(LOGITS);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
output->ResizeLike(logits); output->ResizeLike(logits);
......
...@@ -14,7 +14,8 @@ void Simple() { ...@@ -14,7 +14,8 @@ void Simple() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4}, {1, 1, 1, 1, 1, 2, 3, 4}); net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
...@@ -41,18 +42,15 @@ void Simple() { ...@@ -41,18 +42,15 @@ void Simple() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 1, 2, 4}, {0.25, 0.25, 0.25, 0.25, auto expected = CreateTensor<float>(
0.0320586, 0.08714432, 0.23688282, 0.64391426}); {1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
} }
// Thin wrappers dispatching the shared Simple<D>() softmax check to each
// backend.
TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }

TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
template <DeviceType D> template <DeviceType D>
void Complex(const std::vector<index_t> &logits_shape) { void Complex(const std::vector<index_t> &logits_shape) {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
namespace mace { namespace mace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
class SpaceToBatchNDOp : public Operator<D, T> { class SpaceToBatchNDOp : public Operator<D, T> {
public: public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
...@@ -28,17 +28,19 @@ class SpaceToBatchNDOp : public Operator<D, T> { ...@@ -28,17 +28,19 @@ class SpaceToBatchNDOp : public Operator<D, T> {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(space_tensor, batch_tensor, output_shape.data()); CalculateOutputShape(space_tensor, batch_tensor, output_shape.data());
functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor, future); functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor,
future);
return true; return true;
} }
private: private:
inline void CalculateOutputShape(const Tensor *input_tensor, inline void CalculateOutputShape(const Tensor *input_tensor,
Tensor *output, Tensor *output,
index_t *output_shape) { index_t *output_shape) {
auto paddings = OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0}); auto paddings =
auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}); OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
auto block_shape =
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D"); MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D"); MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
...@@ -46,13 +48,14 @@ class SpaceToBatchNDOp : public Operator<D, T> { ...@@ -46,13 +48,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
const index_t block_dims = block_shape.size(); const index_t block_dims = block_shape.size();
index_t block_shape_product = 1; index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) { for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1"); MACE_CHECK(block_shape[block_dim] > 1,
"block_shape's value should be great to 1");
const index_t block_shape_value = block_shape[block_dim]; const index_t block_shape_value = block_shape[block_dim];
const index_t padded_input_size = input_tensor->dim(block_dim + 1) const index_t padded_input_size = input_tensor->dim(block_dim + 1) +
+ paddings[block_dim * 2] paddings[block_dim * 2] +
+ paddings[block_dim * 2 + 1]; paddings[block_dim * 2 + 1];
MACE_CHECK(padded_input_size % block_shape_value == 0, MACE_CHECK(padded_input_size % block_shape_value == 0, "padded input ",
"padded input ", padded_input_size, " is not divisible by block_shape"); padded_input_size, " is not divisible by block_shape");
block_shape_product *= block_shape_value; block_shape_product *= block_shape_value;
output_shape[block_dim + 1] = padded_input_size / block_shape_value; output_shape[block_dim + 1] = padded_input_size / block_shape_value;
} }
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
#include <fstream> #include <fstream>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -21,7 +21,9 @@ void TransposeFilter(const std::vector<float> &input, ...@@ -21,7 +21,9 @@ void TransposeFilter(const std::vector<float> &input,
for (index_t w = 0; w < input_shape[1]; ++w) { for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t oc = 0; oc < input_shape[2]; ++oc) { for (index_t oc = 0; oc < input_shape[2]; ++oc) {
for (index_t ic = 0; ic < input_shape[3]; ++ic) { for (index_t ic = 0; ic < input_shape[3]; ++ic) {
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) * input_shape[1] + w; int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
input_shape[1] +
w;
output[offset] = *input_ptr; output[offset] = *input_ptr;
++input_ptr; ++input_ptr;
} }
...@@ -30,7 +32,7 @@ void TransposeFilter(const std::vector<float> &input, ...@@ -30,7 +32,7 @@ void TransposeFilter(const std::vector<float> &input,
} }
} }
template<DeviceType D, typename T> template <DeviceType D, typename T>
void WinogradConvolution(const index_t batch, void WinogradConvolution(const index_t batch,
const index_t height, const index_t height,
const index_t width, const index_t width,
...@@ -53,8 +55,7 @@ void WinogradConvolution(const index_t batch, ...@@ -53,8 +55,7 @@ void WinogradConvolution(const index_t batch,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(net, "Filter", "FilterImage", BufferToImage<D, T>(net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER); kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -78,8 +79,10 @@ void WinogradConvolution(const index_t batch, ...@@ -78,8 +79,10 @@ void WinogradConvolution(const index_t batch,
// transform filter // transform filter
std::vector<float> wino_filter_data; std::vector<float> wino_filter_data;
TransposeFilter(filter_data, filter_shape, wino_filter_data); TransposeFilter(filter_data, filter_shape, wino_filter_data);
net.AddInputFromArray<D, float>("WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data); net.AddInputFromArray<D, float>(
BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter", kernels::BufferType::WINOGRAD_FILTER); "WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER);
// transform input // transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest") OpDefBuilder("WinogradTransform", "WinogradTransformTest")
...@@ -126,18 +129,23 @@ void WinogradConvolution(const index_t batch, ...@@ -126,18 +129,23 @@ void WinogradConvolution(const index_t batch,
} }
TEST_F(WinogradConvlutionTest, AlignedConvolution) { TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::VALID); WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::SAME); Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
Padding::SAME);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37, Padding::VALID); WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37,
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31, Padding::SAME); Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31,
Padding::SAME);
} }
TEST_F(WinogradConvlutionTest, BatchConvolution) { TEST_F(WinogradConvlutionTest, BatchConvolution) {
WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32, Padding::VALID); WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32,
WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31, Padding::SAME); Padding::VALID);
WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
Padding::SAME);
} }
} }
...@@ -8,12 +8,12 @@ ...@@ -8,12 +8,12 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/winograd_transform.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/winograd_transform.h"
namespace mace { namespace mace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
class WinogradInverseTransformOp : public Operator<D, T> { class WinogradInverseTransformOp : public Operator<D, T> {
public: public:
WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws) WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
namespace mace { namespace mace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
class WinogradTransformOp : public Operator<D, T> { class WinogradTransformOp : public Operator<D, T> {
public: public:
WinogradTransformOp(const OperatorDef &op_def, Workspace *ws) WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
......
...@@ -37,16 +37,14 @@ static void BMWinogradTransform( ...@@ -37,16 +37,14 @@ static void BMWinogradTransform(
} }
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \ static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \ } \
BENCHMARK( \ BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \ #define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL); BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
......
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
#define MACE_CORE_MACE_H_ #define MACE_CORE_MACE_H_
#include <cstdint> #include <cstdint>
#include <vector>
#include <string>
#include <memory>
#include <map> #include <map>
#include <memory>
#include <string>
#include <vector>
namespace mace { namespace mace {
...@@ -29,9 +29,7 @@ namespace mace { ...@@ -29,9 +29,7 @@ namespace mace {
(MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \ (MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \
MACE_PATCH_VERSION) MACE_VERSION_SUFFIX) MACE_PATCH_VERSION) MACE_VERSION_SUFFIX)
inline const char *MaceVersion() { inline const char *MaceVersion() { return MACE_VERSION_STRING; }
return MACE_VERSION_STRING;
}
extern const char *MaceGitVersion(); extern const char *MaceGitVersion();
...@@ -43,17 +41,9 @@ extern const char *MaceGitVersion(); ...@@ -43,17 +41,9 @@ extern const char *MaceGitVersion();
classname &operator=(const classname &) = delete classname &operator=(const classname &) = delete
#endif #endif
enum NetMode { enum NetMode { INIT = 0, NORMAL = 1 };
INIT = 0,
NORMAL = 1
};
enum DeviceType { enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
CPU = 0,
NEON = 1,
OPENCL = 2,
HEXAGON = 3
};
enum DataType { enum DataType {
DT_INVALID = 0, DT_INVALID = 0,
...@@ -104,6 +94,7 @@ class Argument { ...@@ -104,6 +94,7 @@ class Argument {
public: public:
Argument(); Argument();
void CopyFrom(const Argument &from); void CopyFrom(const Argument &from);
public: public:
const std::string &name() const; const std::string &name() const;
void set_name(const std::string &value); void set_name(const std::string &value);
...@@ -147,11 +138,13 @@ class NodeInput { ...@@ -147,11 +138,13 @@ class NodeInput {
NodeInput() {} NodeInput() {}
NodeInput(int node_id, int output_port); NodeInput(int node_id, int output_port);
void CopyFrom(const NodeInput &from); void CopyFrom(const NodeInput &from);
public: public:
int node_id() const; int node_id() const;
void set_node_id(int node_id); void set_node_id(int node_id);
int output_port() const; int output_port() const;
void set_output_port(int output_port); void set_output_port(int output_port);
private: private:
int node_id_; int node_id_;
int output_port_; int output_port_;
...@@ -162,8 +155,10 @@ class OutputShape { ...@@ -162,8 +155,10 @@ class OutputShape {
OutputShape(); OutputShape();
OutputShape(const std::vector<int64_t> &dims); OutputShape(const std::vector<int64_t> &dims);
void CopyFrom(const OutputShape &from); void CopyFrom(const OutputShape &from);
public: public:
const std::vector<int64_t> &dims() const; const std::vector<int64_t> &dims() const;
private: private:
std::vector<int64_t> dims_; std::vector<int64_t> dims_;
}; };
...@@ -240,10 +235,12 @@ class OperatorDef { ...@@ -240,10 +235,12 @@ class OperatorDef {
class MemoryBlock { class MemoryBlock {
public: public:
MemoryBlock(int mem_id, uint32_t x, uint32_t y); MemoryBlock(int mem_id, uint32_t x, uint32_t y);
public: public:
int mem_id() const; int mem_id() const;
uint32_t x() const; uint32_t x() const;
uint32_t y() const; uint32_t y() const;
private: private:
int mem_id_; int mem_id_;
uint32_t x_; uint32_t x_;
...@@ -255,9 +252,9 @@ class MemoryArena { ...@@ -255,9 +252,9 @@ class MemoryArena {
const std::vector<MemoryBlock> &mem_block() const; const std::vector<MemoryBlock> &mem_block() const;
std::vector<MemoryBlock> &mutable_mem_block(); std::vector<MemoryBlock> &mutable_mem_block();
int mem_block_size() const; int mem_block_size() const;
private: private:
std::vector<MemoryBlock> mem_block_; std::vector<MemoryBlock> mem_block_;
}; };
// for hexagon mace-nnlib // for hexagon mace-nnlib
...@@ -268,6 +265,7 @@ class InputInfo { ...@@ -268,6 +265,7 @@ class InputInfo {
int32_t max_byte_size() const; int32_t max_byte_size() const;
DataType data_type() const; DataType data_type() const;
const std::vector<int32_t> &dims() const; const std::vector<int32_t> &dims() const;
private: private:
std::string name_; std::string name_;
int32_t node_id_; int32_t node_id_;
...@@ -285,6 +283,7 @@ class OutputInfo { ...@@ -285,6 +283,7 @@ class OutputInfo {
void set_data_type(DataType data_type); void set_data_type(DataType data_type);
const std::vector<int32_t> &dims() const; const std::vector<int32_t> &dims() const;
void set_dims(const std::vector<int32_t> &dims); void set_dims(const std::vector<int32_t> &dims);
private: private:
std::string name_; std::string name_;
int32_t node_id_; int32_t node_id_;
...@@ -299,6 +298,7 @@ class NetDef { ...@@ -299,6 +298,7 @@ class NetDef {
int op_size() const; int op_size() const;
const OperatorDef &op(const int idx) const; const OperatorDef &op(const int idx) const;
public: public:
const std::string &name() const; const std::string &name() const;
bool has_name() const; bool has_name() const;
...@@ -359,7 +359,6 @@ struct RunMetadata { ...@@ -359,7 +359,6 @@ struct RunMetadata {
std::vector<OperatorStats> op_stats; std::vector<OperatorStats> op_stats;
}; };
class Workspace; class Workspace;
class NetBase; class NetBase;
class OperatorRegistry; class OperatorRegistry;
...@@ -374,8 +373,7 @@ struct MaceInputInfo { ...@@ -374,8 +373,7 @@ struct MaceInputInfo {
class MaceEngine { class MaceEngine {
public: public:
// Single input and output // Single input and output
explicit MaceEngine(const NetDef *net_def, explicit MaceEngine(const NetDef *net_def, DeviceType device_type);
DeviceType device_type);
// Multiple input or output // Multiple input or output
explicit MaceEngine(const NetDef *net_def, explicit MaceEngine(const NetDef *net_def,
DeviceType device_type, DeviceType device_type,
...@@ -394,7 +392,7 @@ class MaceEngine { ...@@ -394,7 +392,7 @@ class MaceEngine {
// Multiple input or output // Multiple input or output
bool Run(const std::vector<MaceInputInfo> &input, bool Run(const std::vector<MaceInputInfo> &input,
std::map<std::string, float *> &output, std::map<std::string, float *> &output,
RunMetadata *run_metadata=nullptr); RunMetadata *run_metadata = nullptr);
MaceEngine(const MaceEngine &) = delete; MaceEngine(const MaceEngine &) = delete;
MaceEngine &operator=(const MaceEngine &) = delete; MaceEngine &operator=(const MaceEngine &) = delete;
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
namespace mace { namespace mace {
inline int64_t NowMicros() { inline int64_t NowMicros() {
......
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/utils/env_time.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/string_util.h" #include "mace/utils/string_util.h"
#undef ERROR #undef ERROR
......
...@@ -24,13 +24,9 @@ class WallClockTimer : public Timer { ...@@ -24,13 +24,9 @@ class WallClockTimer : public Timer {
public: public:
WallClockTimer() : accumulated_micros_(0) {} WallClockTimer() : accumulated_micros_(0) {}
void StartTiming() override { void StartTiming() override { start_micros_ = NowMicros(); }
start_micros_ = NowMicros();
}
void StopTiming() override { void StopTiming() override { stop_micros_ = NowMicros(); }
stop_micros_ = NowMicros();
}
void AccumulateTiming() override { void AccumulateTiming() override {
StopTiming(); StopTiming();
...@@ -43,13 +39,9 @@ class WallClockTimer : public Timer { ...@@ -43,13 +39,9 @@ class WallClockTimer : public Timer {
accumulated_micros_ = 0; accumulated_micros_ = 0;
} }
double ElapsedMicros() override { double ElapsedMicros() override { return stop_micros_ - start_micros_; }
return stop_micros_ - start_micros_;
}
double AccumulatedMicros() override { double AccumulatedMicros() override { return accumulated_micros_; }
return accumulated_micros_;
}
private: private:
double start_micros_; double start_micros_;
......
...@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) { ...@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
WallClockTimer timer; WallClockTimer timer;
std::vector<unsigned int> default_params(1, 1); std::vector<unsigned int> default_params(1, 1);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
default_params, "SimpleRun", default_params, nullptr, TunerFunc, &timer);
nullptr,
TunerFunc,
&timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
default_params[0] = 2; default_params[0] = 2;
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
default_params, "SimpleRun", default_params, nullptr, TunerFunc, &timer);
nullptr,
TunerFunc,
&timer);
EXPECT_EQ(expect + 1, res); EXPECT_EQ(expect + 1, res);
} }
...@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) { ...@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
}; };
// tune // tune
WallClockTimer timer; WallClockTimer timer;
int res = int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
default_params,
*params_generator,
TunerFunc,
&timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
// run // run
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
default_params, "SimpleRun", default_params, nullptr, TunerFunc, &timer);
nullptr,
TunerFunc,
&timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
} }
......
...@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) { ...@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
WallClockTimer timer; WallClockTimer timer;
std::vector<unsigned int> default_params(1, 1); std::vector<unsigned int> default_params(1, 1);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
default_params, "SimpleRun", default_params, nullptr, TunerFunc, &timer);
nullptr,
TunerFunc,
&timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
default_params[0] = 2; default_params[0] = 2;
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
default_params, "SimpleRun", default_params, nullptr, TunerFunc, &timer);
nullptr,
TunerFunc,
&timer);
EXPECT_EQ(expect + 1, res); EXPECT_EQ(expect + 1, res);
} }
...@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) { ...@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
}; };
// tune // tune
WallClockTimer timer; WallClockTimer timer;
int res = int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
default_params,
*params_generator,
TunerFunc,
&timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
// run // run
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun", res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
default_params, "SimpleRun", default_params, nullptr, TunerFunc, &timer);
nullptr,
TunerFunc,
&timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册