Commit 6b9aafd4 authored by 叶剑武

Merge branch 'cpplint' into 'master'

Reformat code and enable cpplint

See merge request !273
stages:
  - ops_test
  - ops_benchmark
  - cpplint

cpplint:
  stage: cpplint
  only:
    - master
  script:
    - curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
    - python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc | grep -vE "half.h")

ops_test:
  stage: ops_test
...
@@ -9,8 +9,8 @@
#include <malloc.h>

#include "mace/core/registry.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"

namespace mace {
@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
    free(data);
  };
  void *Map(void *buffer, size_t offset, size_t nbytes) const override {
    return (char *)buffer + offset;
  }
  void *MapImage(void *buffer,
                 const std::vector<size_t> &image_shape,
...
@@ -83,12 +83,12 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                     \
                                          enforce_lossless_conversion)      \
  template <>                                                               \
  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                    \
      const string &name, const std::vector<T> &default_value) const {      \
    if (arg_map_.count(name) == 0) {                                        \
      return default_value;                                                 \
    }                                                                       \
    std::vector<T> values;                                                  \
    for (const auto &v : arg_map_.at(name).fieldname()) {                   \
      if (enforce_lossless_conversion) {                                    \
        auto supportsConversion =                                           \
...
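For orientation, a minimal sketch of how a repeated argument is typically read through this helper. It is not part of this commit; the ArgumentHelper(const OperatorDef &) constructor is an assumption, and only GetRepeatedArgument's signature above comes from the diff.

#include <vector>
#include "mace/core/arg_helper.h"

// Hypothetical usage sketch: read a repeated int argument from an OperatorDef,
// falling back to a default when the argument is absent.
std::vector<int> GetStrides(const mace::OperatorDef &op_def) {
  mace::ArgumentHelper helper(op_def);  // assumed constructor
  return helper.GetRepeatedArgument<int>("strides", std::vector<int>{1, 1});
}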
@@ -5,9 +5,9 @@
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_

#include <vector>

#include "mace/core/allocator.h"
#include "mace/core/types.h"

namespace mace {
@@ -39,23 +39,19 @@ class BufferBase {
  virtual bool OnHost() const = 0;

  virtual index_t offset() const { return 0; };

  template <typename T>
  const T *data() const {
    return reinterpret_cast<const T *>(raw_data());
  }

  template <typename T>
  T *mutable_data() {
    return reinterpret_cast<T *>(raw_mutable_data());
  }

  index_t size() const { return size_; }

 protected:
  index_t size_;
@@ -64,26 +60,26 @@ class BufferBase {
class Buffer : public BufferBase {
 public:
  Buffer(Allocator *allocator)
      : BufferBase(0),
        allocator_(allocator),
        buf_(nullptr),
        mapped_buf_(nullptr),
        is_data_owner_(true) {}

  Buffer(Allocator *allocator, index_t size)
      : BufferBase(size),
        allocator_(allocator),
        mapped_buf_(nullptr),
        is_data_owner_(true) {
    buf_ = allocator->New(size);
  }

  Buffer(Allocator *allocator, void *data, index_t size)
      : BufferBase(size),
        allocator_(allocator),
        buf_(data),
        mapped_buf_(nullptr),
        is_data_owner_(false) {}

  virtual ~Buffer() {
    if (mapped_buf_ != nullptr) {
@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
  void Copy(void *src, index_t offset, index_t length) {
    MACE_CHECK_NOTNULL(mapped_buf_);
    MACE_CHECK(length <= size_, "out of buffer");
    memcpy(mapped_buf_, (char *)src + offset, length);
  }

  bool OnHost() const { return allocator_->OnHost(); }

 private:
  Allocator *allocator_;
@@ -168,23 +162,24 @@ class Buffer : public BufferBase {
  void *mapped_buf_;
  bool is_data_owner_;

  DISABLE_COPY_AND_ASSIGN(Buffer);
};

class Image : public BufferBase {
 public:
  Image()
      : BufferBase(0),
        allocator_(GetDeviceAllocator(OPENCL)),
        buf_(nullptr),
        mapped_buf_(nullptr) {}

  Image(std::vector<size_t> shape, DataType data_type)
      : BufferBase(
            std::accumulate(
                shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
            GetEnumTypeSize(data_type)),
        allocator_(GetDeviceAllocator(OPENCL)),
        mapped_buf_(nullptr) {
    shape_ = shape;
    data_type_ = data_type;
    buf_ = allocator_->NewImage(shape, data_type);
@@ -214,9 +209,7 @@ class Image : public BufferBase {
    return mapped_buf_;
  }

  std::vector<size_t> image_shape() const { return shape_; }

  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
    MACE_NOT_IMPLEMENTED;
@@ -241,17 +234,11 @@ class Image : public BufferBase {
    mapped_buf_ = nullptr;
  };

  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }

  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }

  bool OnHost() const { return allocator_->OnHost(); }

 private:
  Allocator *allocator_;
@@ -260,34 +247,25 @@ class Image : public BufferBase {
  void *buf_;
  void *mapped_buf_;

  DISABLE_COPY_AND_ASSIGN(Image);
};

class BufferSlice : public BufferBase {
 public:
  BufferSlice()
      : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
  BufferSlice(BufferBase *buffer, index_t offset, index_t length)
      : BufferBase(buffer->size()),
        buffer_(buffer),
        mapped_buf_(nullptr),
        offset_(offset),
        length_(length) {
    MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
    MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
               offset, " + ", length, ") should <= ", size_);
  }
  BufferSlice(const BufferSlice &other)
      : BufferSlice(other.buffer_, other.offset_, other.length_) {}

  ~BufferSlice() {
    if (buffer_ != nullptr && mapped_buf_ != nullptr) {
@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
  const void *raw_data() const {
    if (OnHost()) {
      MACE_CHECK_NOTNULL(buffer_);
      return (char *)buffer_->raw_data() + offset_;
    } else {
      MACE_CHECK_NOTNULL(mapped_buf_);
      return mapped_buf_;
@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
    return nullptr;
  }

  void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }

  void Map(std::vector<size_t> *pitch) {
    MACE_CHECK_NOTNULL(buffer_);
@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
    mapped_buf_ = nullptr;
  };

  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }

  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }

  index_t offset() const { return offset_; }

  bool OnHost() const { return buffer_->OnHost(); }

 private:
  BufferBase *buffer_;
@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
  index_t offset_;
  index_t length_;
};
}

#endif  // MACE_CORE_BUFFER_H_
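To make the relationship between these classes concrete, a minimal usage sketch. It is not part of this commit; GetDeviceAllocator(DeviceType::CPU) returning the host allocator is an assumption (only OPENCL appears in this diff), and only the Buffer and BufferSlice constructors above are taken from the diff.

#include "mace/core/buffer.h"

// Hypothetical usage sketch: back a 1 KB buffer with the host allocator and
// view its second half through a BufferSlice.
void BufferSliceExample() {
  mace::Allocator *allocator =
      mace::GetDeviceAllocator(mace::DeviceType::CPU);  // assumed device type
  mace::Buffer buffer(allocator, 1024);        // calls allocator->New(1024)
  mace::BufferSlice slice(&buffer, 512, 512);  // checked: offset + length <= size
  const float *half = slice.data<float>();     // on host: raw_data() + offset_
  (void)half;
}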
This diff is collapsed.
@@ -3,9 +3,9 @@
//

#include "mace/core/net.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"

namespace mace {
@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
                     Workspace *ws,
                     DeviceType type,
                     const NetMode mode)
    : NetBase(op_registry, net_def, ws, type), device_type_(type) {
  MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto &operator_def = net_def->op(idx);
@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
  MACE_LATENCY_LOGGER(1, "Running net");
  for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
    auto &op = *iter;
    MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                        op->debug_def().type(), ")");
    bool future_wait = (device_type_ == DeviceType::OPENCL &&
                        (run_metadata != nullptr ||
                         std::distance(iter, operators_.end()) == 1));
@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
    Workspace *ws,
    DeviceType type,
    const NetMode mode) {
  std::unique_ptr<NetBase> net(
      new SerialNet(op_registry, net_def, ws, type, mode));
  return net;
}
...
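As a quick orientation, a sketch of how a net created through CreateNet might be driven. It is not part of this commit; DeviceType::CPU and NetMode::NORMAL are assumptions, and only the CreateNet and Run signatures above come from the diff.

// Hypothetical usage sketch: op_registry, net_def, and ws are assumed to be
// built elsewhere before this point.
std::unique_ptr<mace::NetBase> net = mace::CreateNet(
    op_registry, net_def, ws, mace::DeviceType::CPU,
    mace::NetMode::NORMAL);    // NetMode::NORMAL is an assumption
if (!net->Run(nullptr)) {      // nullptr: no RunMetadata collected
  LOG(ERROR) << "Net run failed";
}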
@@ -7,10 +7,10 @@
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/public/mace.h"

namespace mace {
@@ -147,7 +147,7 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
class OperatorRegistry {
 public:
  typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
      RegistryType;
  OperatorRegistry();
  ~OperatorRegistry() = default;
  RegistryType *registry() { return &registry_; };
...
@@ -36,6 +36,6 @@ class PreallocatedPooledAllocator {
  std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
};
}  // namespace mace

#endif  // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
@@ -2,19 +2,19 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <sys/time.h>
#include <thread>
#include <vector>

#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"

namespace {
inline int64_t NowMicros() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
}

namespace mace {
@@ -63,9 +63,9 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  // const node
  std::thread const_thread([&]() {
    std::vector<hexagon_nn_const_node> const_node_list;
    for (const ConstTensor &const_tensor : net_def.tensors()) {
      std::vector<int> tensor_shape(const_tensor.dims().begin(),
                                    const_tensor.dims().end());
      while (tensor_shape.size() < 4) {
        tensor_shape.insert(tensor_shape.begin(), 1);
      }
@@ -77,32 +77,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
      const_node.tensor.width = tensor_shape[2];
      const_node.tensor.depth = tensor_shape[3];

      if (const_tensor.data_type() == DataType::DT_INT32 &&
          const_tensor.data_size() == 0) {
        const_node.tensor.data = NULL;
        const_node.tensor.dataLen = 0;
      } else {
        const_node.tensor.data =
            const_cast<unsigned char *>(const_tensor.data());
        const_node.tensor.dataLen = const_tensor.data_size() *
                                    GetEnumTypeSize(const_tensor.data_type());
      }
      const_node_list.push_back(const_node);
      // 255 is magic number: why fastrpc limits sequence length to that?
      if (const_node_list.size() >= 250) {
        MACE_CHECK(
            hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
                                              const_node_list.size()) == 0,
            "append const node error");
        const_node_list.clear();
      }
    }
    if (!const_node_list.empty()) {
      MACE_CHECK(
          hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
                                            const_node_list.size()) == 0,
          "append const node error");
    }
    const_node_list.clear();
  });
@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  std::vector<hexagon_nn_input> inputs;
  std::vector<hexagon_nn_output> outputs;

  for (const OperatorDef &op : net_def.op()) {
    int op_id = op_map.GetOpId(op.type());
    inputs.resize(op.node_input().size());
    for (size_t i = 0; i < op.node_input().size(); ++i) {
@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
    cached_inputs.push_back(inputs);
    cached_outputs.push_back(outputs);

    hexagon_nn_padding_type padding_type =
        static_cast<hexagon_nn_padding_type>(op.padding());

    hexagon_nn_op_node op_node;
    op_node.node_id = node_id(op.node_id());
@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
    op_node_list.push_back(op_node);
    if (op_node_list.size() >= 125) {
      MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
                                             op_node_list.size()) == 0,
                 "append node error");
      op_node_list.clear();
@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  }

  if (!op_node_list.empty()) {
    MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
                                           op_node_list.size()) == 0,
               "append node error");
  }
@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  // input info
  num_inputs_ = 0;
  for (const InputInfo &input_info : net_def.input_info()) {
    std::vector<index_t> input_shape;
    input_shape.insert(input_shape.begin(), input_info.dims().begin(),
                       input_info.dims().end());
    while (input_shape.size() < 4) {
      input_shape.insert(input_shape.begin(), 1);
    }
@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  // output info
  num_outputs_ = 0;
  for (const OutputInfo &output_info : net_def.output_info()) {
    std::vector<index_t> output_shape;
    output_shape.insert(output_shape.begin(), output_info.dims().begin(),
                        output_info.dims().end());
    while (output_shape.size() < 4) {
      output_shape.insert(output_shape.begin(), 1);
    }
@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
  return hexagon_nn_teardown(nn_id_) == 0;
}

#define PRINT_BUFSIZE (2 * 1024 * 1024)

void HexagonControlWrapper::PrintLog() {
  char *buf;
  if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
  MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
                               PRINT_BUFSIZE) == 0,
             "print log error");
  LOG(INFO) << std::string(buf);
  delete[] buf;
}

void HexagonControlWrapper::PrintGraph() {
  LOG(INFO) << "Print Graph";
  char *buf;
  if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
  MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
                                 PRINT_BUFSIZE) == 0,
             "print graph error");
  LOG(INFO) << std::string(buf);
  delete[] buf;
}

void HexagonControlWrapper::SetDebugLevel(int level) {
@@ -256,9 +253,9 @@ void HexagonControlWrapper::GetPerfInfo() {
  LOG(INFO) << "Get perf info";
  std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
  unsigned int n_items = 0;
  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
                                     &n_items) == 0,
             "get perf info error");

  std::unordered_map<uint32_t, float> node_id_counters;
  std::unordered_map<std::string, std::pair<int, float>> node_type_counters;
@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
    unsigned int node_id = perf_info[i].node_id;
    unsigned int node_type_id = perf_info[i].node_type;
    node_id_counters[node_id] =
        ((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) +
         perf_info[i].counter_lo) *
        1.0f / perf_info[i].executions;

    char node_type_buf[MAX_NODE];
    hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
    total_duration += node_id_counters[node_id];
  }

  for (auto &node_type_counter : node_type_counters) {
    LOG(INFO) << "node type: " << node_type_counter.first
              << ", time: " << node_type_counter.second.first
              << ", duration: " << node_type_counter.second.second;
@@ -312,33 +310,25 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
  output_tensor->Resize(output_shapes_[0]);
  std::vector<uint32_t> output_shape(4);
  uint32_t output_bytes;
  int res = hexagon_nn_execute(
      nn_id_, input_tensor.shape()[0], input_tensor.shape()[1],
      input_tensor.shape()[2], input_tensor.shape()[3],
      reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
      input_tensor.raw_size(), &output_shape[0], &output_shape[1],
      &output_shape[2], &output_shape[3],
      reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
      output_tensor->raw_size(), &output_bytes);
  MACE_CHECK(res == 0, "execute error");

  MACE_ASSERT(output_shape == output_shapes_[0], "wrong output shape inferred");
  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
  return res == 0;
};

bool HexagonControlWrapper::ExecuteGraphNew(
    const std::vector<Tensor> &input_tensors,
    std::vector<Tensor> *output_tensors) {
  LOG(INFO) << "Execute graph new: " << nn_id_;
  int num_inputs = input_tensors.size();
  int num_outputs = output_tensors->size();
@@ -355,7 +345,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
    inputs[i].width = input_shape[2];
    inputs[i].depth = input_shape[3];
    inputs[i].data = const_cast<unsigned char *>(
        reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
    inputs[i].dataLen = input_tensors[i].raw_size();
    inputs[i].data_valid_len = input_tensors[i].raw_size();
    inputs[i].unused = 0;
@@ -365,16 +355,16 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
    (*output_tensors)[i].SetDtype(output_data_types_[i]);
    (*output_tensors)[i].Resize(output_shapes_[i]);
    outputs[i].data = reinterpret_cast<unsigned char *>(
        (*output_tensors)[i].raw_mutable_data());
    outputs[i].dataLen = (*output_tensors)[i].raw_size();
  }

  int res =
      hexagon_nn_execute_new(nn_id_, inputs, num_inputs, outputs, num_outputs);

  for (int i = 0; i < num_outputs; ++i) {
    std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
                                       outputs[i].width, outputs[i].depth};
    MACE_ASSERT(output_shape == output_shapes_[i],
                "wrong output shape inferred");
    MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
  float *min_in_data = input_tensors[1].mutable_data<float>();
  input_tensors[2].Resize({1, 1, 1, 1});
  float *max_in_data = input_tensors[2].mutable_data<float>();
  quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data,
                      max_in_data);
  if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
    return false;
@@ -409,11 +397,9 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
  const float *min_out_data = output_tensors[1].data<float>();
  const float *max_out_data = output_tensors[2].data<float>();
  quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data,
                        output_tensor);

  return true;
}

}  // namespace mace
@@ -16,16 +16,17 @@ namespace mace {
class HexagonControlWrapper {
 public:
  HexagonControlWrapper(){};
  int GetVersion();
  bool Config();
  bool Init();
  bool Finalize();
  bool SetupGraph(const NetDef &net_def);
  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
  bool ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
                       std::vector<Tensor> *output_tensors);
  bool ExecuteGraphPreQuantize(const Tensor &input_tensor,
                               Tensor *output_tensor);

  bool TeardownGraph();
  void PrintLog();
@@ -38,9 +39,7 @@ class HexagonControlWrapper {
 private:
  static constexpr int NODE_ID_OFFSET = 10000;

  inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }

  int nn_id_;
  Quantizer quantizer_;
@@ -52,9 +51,8 @@ class HexagonControlWrapper {
  uint32_t num_inputs_;
  uint32_t num_outputs_;

  DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
};
}

#endif  // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
  return 0;
}

int hexagon_controller_DeInitHexagon() { return 0; }

__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
@@ -2,27 +2,27 @@
#define _HEXAGON_NN_H
#ifndef __QAIC_HEADER
#define __QAIC_HEADER(ff) ff
#endif  //__QAIC_HEADER

#ifndef __QAIC_HEADER_EXPORT
#define __QAIC_HEADER_EXPORT
#endif  // __QAIC_HEADER_EXPORT

#ifndef __QAIC_HEADER_ATTRIBUTE
#define __QAIC_HEADER_ATTRIBUTE
#endif  // __QAIC_HEADER_ATTRIBUTE

#ifndef __QAIC_IMPL
#define __QAIC_IMPL(ff) ff
#endif  //__QAIC_IMPL

#ifndef __QAIC_IMPL_EXPORT
#define __QAIC_IMPL_EXPORT
#endif  // __QAIC_IMPL_EXPORT

#ifndef __QAIC_IMPL_ATTRIBUTE
#define __QAIC_IMPL_ATTRIBUTE
#endif  // __QAIC_IMPL_ATTRIBUTE

#ifdef __cplusplus
extern "C" {
#endif
@@ -30,92 +30,160 @@ extern "C" {
#define __QAIC_STRING1_OBJECT_DEFINED__
#define __STRING1_OBJECT__
typedef struct _cstring1_s {
  char *data;
  int dataLen;
} _cstring1_t;
#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */
typedef struct hexagon_nn_input hexagon_nn_input;
struct hexagon_nn_input {
  unsigned int src_id;
  unsigned int output_idx;
};
typedef struct hexagon_nn_output hexagon_nn_output;
struct hexagon_nn_output {
  unsigned int max_size;
  unsigned int unused;
};
typedef struct hexagon_nn_perfinfo hexagon_nn_perfinfo;
struct hexagon_nn_perfinfo {
  unsigned int node_id;
  unsigned int node_type;
  unsigned int executions;
  unsigned int unused;
  unsigned int counter_lo;
  unsigned int counter_hi;
};
typedef int hexagon_nn_nn_id;
enum hexagon_nn_padding_type {
  NN_PAD_NA,
  NN_PAD_SAME,
  NN_PAD_VALID,
  NN_PAD_MIRROR_REFLECT,
  NN_PAD_MIRROR_SYMMETRIC,
  NN_PAD_SAME_CAFFE,
  _32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
};
typedef enum hexagon_nn_padding_type hexagon_nn_padding_type;
typedef struct hexagon_nn_tensordef hexagon_nn_tensordef;
struct hexagon_nn_tensordef {
  unsigned int batches;
  unsigned int height;
  unsigned int width;
  unsigned int depth;
  unsigned char *data;
  int dataLen;
  unsigned int data_valid_len;
  unsigned int unused;
};
typedef struct hexagon_nn_op_node hexagon_nn_op_node;
struct hexagon_nn_op_node {
  unsigned int node_id;
  unsigned int operation;
  hexagon_nn_padding_type padding;
  hexagon_nn_input *inputs;
  int inputsLen;
  hexagon_nn_output *outputs;
  int outputsLen;
};
typedef struct hexagon_nn_const_node hexagon_nn_const_node;
struct hexagon_nn_const_node {
  unsigned int node_id;
  hexagon_nn_tensordef tensor;
};
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
#ifdef __cplusplus
}
#endif
#endif  //_HEXAGON_NN_H
@@ -5,8 +5,8 @@
#ifndef LIBMACE_HEXAGON_NN_OPS_H
#define LIBMACE_HEXAGON_NN_OPS_H

#include <unordered_map>

#include "mace/utils/logging.h"

namespace mace {
@@ -24,8 +24,7 @@ typedef enum op_type_enum {
class OpMap {
 public:
  void Init() {
#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;

#include "mace/core/runtime/hexagon/ops.h"
@@ -40,9 +39,10 @@ class OpMap {
      return OP_INVALID;
    }
  }

 private:
  std::unordered_map<std::string, int> op_map_;
};
}  // namespace mace

#endif  // LIBMACE_HEXAGON_NN_OPS_H
@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
#undef __SELF_DEF_OP_WREF
#undef DEF_OP_WREF

#endif
@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
                         float *max_out) {
  float stepsize;
  float recip_stepsize;
  QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
                      &recip_stepsize);

  const float *in = in_tensor.data<float>();
  uint8_t *out = out_tensor->mutable_data<uint8_t>();

  for (int i = 0; i < in_tensor.size(); i++) {
    const float inval = in[i];
    float ival =
        static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
    if (ival < 0) ival = 0;
    if (ival > 255) ival = 255;
    out[i] = static_cast<uint8_t>(ival);
@@ -93,4 +93,4 @@ void Quantizer::DeQuantize(const Tensor &in_tensor,
  }
}

}  // namespace mace
\ No newline at end of file
...@@ -16,13 +16,17 @@ class Quantizer { ...@@ -16,13 +16,17 @@ class Quantizer {
void Quantize(const Tensor &in_tensor, void Quantize(const Tensor &in_tensor,
Tensor *out_tensor, Tensor *out_tensor,
float *min_out, float *max_out); float *min_out,
float *max_out);
void Quantize(const Tensor &in_tensor, void Quantize(const Tensor &in_tensor,
const float min_in, const float max_in, const float min_in,
const float max_in,
Tensor *out_tensor, Tensor *out_tensor,
float *min_out, float *max_out); float *min_out,
float *max_out);
void DeQuantize(const Tensor &in_tensor, void DeQuantize(const Tensor &in_tensor,
const float min_in, const float max_in, const float min_in,
const float max_in,
Tensor *out_tensor); Tensor *out_tensor);
private: private:
...@@ -33,9 +37,9 @@ class Quantizer { ...@@ -33,9 +37,9 @@ class Quantizer {
float *stepsize, float *stepsize,
float *recip_stepsize); float *recip_stepsize);
DISABLE_COPY_AND_ASSIGN(Quantizer); DISABLE_COPY_AND_ASSIGN(Quantizer);
}; };
} // mace } // mace
#endif // MACE_DSP_UTIL_QUANTIZE_H_ #endif // MACE_DSP_UTIL_QUANTIZE_H_
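For reference, the Quantizer::Quantize hunk above applies an affine float-to-uint8 mapping: subtract the output minimum, scale by the reciprocal step size, round by adding 0.5, then clamp to [0, 255]. QuantizeAdjustRange's body is not part of this diff, so the sketch below simply assumes stepsize = (max - min) / 255; the helper name is illustrative, not MACE's API:

#include <algorithm>
#include <cstdint>
#include <vector>

// Minimal sketch of 8-bit affine quantization, assuming
// stepsize = (max - min) / 255 (QuantizeAdjustRange may adjust the
// range differently; its implementation is not shown in this diff).
inline std::vector<uint8_t> QuantizeSketch(const std::vector<float> &in,
                                           float min_val, float max_val) {
  const float stepsize = (max_val - min_val) / 255.0f;
  const float recip_stepsize = 1.0f / stepsize;
  std::vector<uint8_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    float ival = (in[i] - min_val) * recip_stepsize + 0.5f;  // round half up
    ival = std::min(255.0f, std::max(0.0f, ival));           // clamp to [0, 255]
    out[i] = static_cast<uint8_t>(ival);
  }
  return out;
}

Note that the sketch clamps before the narrowing cast; doing the cast first would wrap out-of-range values instead of saturating them.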
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_allocator.h" #include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
...@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { ...@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
return 0; return 0;
} }
} }
} }
OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::OpenCLAllocator() {}
...@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const { ...@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const { const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
<< image_shape[1];
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt)); cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error; cl_int error;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(), new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
img_format, image_shape[0], image_shape[1], 0, nullptr, &error);
image_shape[0], image_shape[1],
0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: [" MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1] << image_shape[0] << ", " << image_shape[1]
<< "]"; << "]";
...@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { ...@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
// TODO(heliangliang) Non-blocking call // TODO(heliangliang) Non-blocking call
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset, queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
nbytes, nullptr, nullptr, &error); offset, nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr; return mapped_ptr;
} }
...@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer, ...@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch->resize(2); mapped_image_pitch->resize(2);
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image, *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
origin, region, nullptr, &error);
mapped_image_pitch->data(),
mapped_image_pitch->data() + 1,
nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error; MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr; return mapped_ptr;
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include <vector> #include <vector>
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name, ...@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device, cl::Device &device,
cl::Program *program, cl::Program *program,
bool *is_binary) { bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kEncryptedProgramMap; extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
*is_binary = false; *is_binary = false;
auto it_source = kEncryptedProgramMap.find(program_name); auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source == kEncryptedProgramMap.end()) { if (it_source == kEncryptedProgramMap.end()) {
......
...@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name, ...@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device, cl::Device &device,
cl::Program *program, cl::Program *program,
bool *is_binary) { bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kCompiledProgramMap; extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
*is_binary = true; *is_binary = true;
auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix); auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
if (it_binary == kCompiledProgramMap.end()) { if (it_binary == kCompiledProgramMap.end()) {
......
...@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() { ...@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0; return (stop_nanos_ - start_nanos_) / 1000.0;
} }
double OpenCLProfilingTimer::AccumulatedMicros() { double OpenCLProfilingTimer::AccumulatedMicros() { return accumulated_micros_; }
return accumulated_micros_;
}
void OpenCLProfilingTimer::AccumulateTiming(){ void OpenCLProfilingTimer::AccumulateTiming() {
StopTiming(); StopTiming();
accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0; accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
} }
...@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() { ...@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
cl::CommandQueue command_queue(context, gpu_device, properties); cl::CommandQueue command_queue(context, gpu_device, properties);
const char *kernel_path = getenv("MACE_KERNEL_PATH"); const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/"; this->kernel_path_ =
std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->device_ = new cl::Device(gpu_device); this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context); this->context_ = new cl::Context(context);
...@@ -163,18 +162,14 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, ...@@ -163,18 +162,14 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
MACE_CHECK_NOTNULL(program); MACE_CHECK_NOTNULL(program);
std::string binary_file_name_prefix = std::string binary_file_name_prefix =
GenerateCLBinaryFilenamePrefix(built_program_key); GenerateCLBinaryFilenamePrefix(built_program_key);
std::vector<unsigned char> program_vec; std::vector<unsigned char> program_vec;
bool is_opencl_binary; bool is_opencl_binary;
const bool found = GetSourceOrBinaryProgram(program_name, const bool found =
binary_file_name_prefix, GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
context(), device(), program, &is_opencl_binary);
device(),
program,
&is_opencl_binary);
MACE_CHECK(found, "Program not found for ", MACE_CHECK(found, "Program not found for ",
is_opencl_binary ? "binary: " : "source: ", is_opencl_binary ? "binary: " : "source: ", built_program_key);
built_program_key);
// Build program // Build program
std::string build_options_str = std::string build_options_str =
...@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, ...@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
} }
LOG(FATAL) << "Build program from " LOG(FATAL) << "Build program from "
<< (is_opencl_binary ? "binary: " : "source: ") << (is_opencl_binary ? "binary: " : "source: ")
<< built_program_key << built_program_key << " failed: " << ret;
<< " failed: " << ret;
} }
if (!is_opencl_binary) { if (!is_opencl_binary) {
// Write binary if necessary // Write binary if necessary
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin"; std::string binary_filename =
kernel_path_ + binary_file_name_prefix + ".bin";
size_t device_list_size = 1; size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes( std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]); new size_t[device_list_size]);
...@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel( ...@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
if (built_program_it != built_program_map_.end()) { if (built_program_it != built_program_map_.end()) {
program = built_program_it->second; program = built_program_it->second;
} else { } else {
this->BuildProgram(program_name, built_program_key, this->BuildProgram(program_name, built_program_key, build_options_str,
build_options_str, &program); &program);
built_program_map_.emplace(built_program_key, program); built_program_map_.emplace(built_program_key, program);
} }
return cl::Kernel(program, kernel_name.c_str()); return cl::Kernel(program, kernel_name.c_str());
...@@ -250,9 +245,9 @@ cl::Kernel OpenCLRuntime::BuildKernel( ...@@ -250,9 +245,9 @@ cl::Kernel OpenCLRuntime::BuildKernel(
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) { void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) { if (stats != nullptr) {
stats->start_micros = stats->start_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000; event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
stats->end_micros = stats->end_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000; event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
} }
} }
......
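The BuildKernel/BuildProgram hunks above follow a build-once, cache-by-key pattern: the key combines program name and build options, a cache hit returns the already-built program, and a miss triggers the expensive source or binary build before inserting into built_program_map_. A condensed, OpenCL-free sketch of that pattern (ProgramCache is illustrative, not a MACE class):

#include <functional>
#include <string>
#include <unordered_map>

// Generic memoization of an expensive build step, keyed by a string
// that encodes everything the build depends on.
template <typename Program>
class ProgramCache {
 public:
  Program GetOrBuild(const std::string &key,
                     const std::function<Program()> &build) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;  // already built
    Program program = build();                  // compile from source or binary
    cache_.emplace(key, program);
    return program;
  }

 private:
  std::unordered_map<std::string, Program> cache_;
};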
...@@ -19,7 +19,8 @@ namespace mace { ...@@ -19,7 +19,8 @@ namespace mace {
class OpenCLProfilingTimer : public Timer { class OpenCLProfilingTimer : public Timer {
public: public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {}; explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0){};
void StartTiming() override; void StartTiming() override;
void StopTiming() override; void StopTiming() override;
void AccumulateTiming() override; void AccumulateTiming() override;
...@@ -48,6 +49,7 @@ class OpenCLRuntime { ...@@ -48,6 +49,7 @@ class OpenCLRuntime {
cl::Kernel BuildKernel(const std::string &program_name, cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name, const std::string &kernel_name,
const std::set<std::string> &build_options); const std::set<std::string> &build_options);
private: private:
OpenCLRuntime(); OpenCLRuntime();
~OpenCLRuntime(); ~OpenCLRuntime();
......
...@@ -7,10 +7,10 @@ ...@@ -7,10 +7,10 @@
namespace mace { namespace mace {
// These functions are not thread-safe. // These functions are not thread-safe.
void LoadOpenCLLibrary(); void LoadOpenCLLibrary();
void UnloadOpenCLLibrary(); void UnloadOpenCLLibrary();
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_ #endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
...@@ -65,23 +65,20 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { ...@@ -65,23 +65,20 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
class Tensor { class Tensor {
public: public:
Tensor(Allocator *alloc, DataType type) Tensor(Allocator *alloc, DataType type)
: allocator_(alloc), : allocator_(alloc),
dtype_(type), dtype_(type),
buffer_(nullptr), buffer_(nullptr),
is_buffer_owner_(true), is_buffer_owner_(true),
name_("") {}; name_(""){};
Tensor(BufferBase *buffer, DataType dtype) Tensor(BufferBase *buffer, DataType dtype)
: dtype_(dtype), : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), name_("") {}
buffer_(buffer),
is_buffer_owner_(false),
name_("") {}
Tensor(const BufferSlice &buffer_slice, DataType dtype) Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype), : dtype_(dtype),
buffer_slice_(buffer_slice), buffer_slice_(buffer_slice),
is_buffer_owner_(false), is_buffer_owner_(false),
name_("") { name_("") {
buffer_ = &buffer_slice_; buffer_ = &buffer_slice_;
} }
...@@ -102,8 +99,8 @@ class Tensor { ...@@ -102,8 +99,8 @@ class Tensor {
inline index_t dim_size() const { return shape_.size(); } inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const { inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ", MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
index, " >= ", shape_.size()); shape_.size());
return shape_[index]; return shape_[index];
} }
...@@ -112,40 +109,35 @@ class Tensor { ...@@ -112,40 +109,35 @@ class Tensor {
std::multiplies<int64_t>()); std::multiplies<int64_t>());
} }
inline index_t raw_size() const { inline index_t raw_size() const { return size() * SizeOfType(); }
return size() * SizeOfType();
}
inline bool has_opencl_image() const { inline bool has_opencl_image() const {
return buffer_ != nullptr && !buffer_->OnHost() return buffer_ != nullptr && !buffer_->OnHost() &&
&& typeid(*buffer_) == typeid(Image); typeid(*buffer_) == typeid(Image);
} }
inline bool has_opencl_buffer() const { inline bool has_opencl_buffer() const {
return buffer_ != nullptr && !buffer_->OnHost() return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
&& !has_opencl_image();
} }
inline cl::Image *opencl_image() const { inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), "do not have image"); MACE_CHECK(has_opencl_image(), "do not have image");
return static_cast<cl::Image*>(buffer_->buffer()); return static_cast<cl::Image *>(buffer_->buffer());
} }
inline cl::Buffer *opencl_buffer() const { inline cl::Buffer *opencl_buffer() const {
MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer"); MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
return static_cast<cl::Buffer*>(buffer_->buffer()); return static_cast<cl::Buffer *>(buffer_->buffer());
} }
inline index_t buffer_offset() const { inline index_t buffer_offset() const { return buffer_->offset(); }
return buffer_->offset();
}
inline const void *raw_data() const { inline const void *raw_data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null"); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->raw_data(); return buffer_->raw_data();
} }
template<typename T> template <typename T>
inline const T *data() const { inline const T *data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null"); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>(); return buffer_->data<T>();
...@@ -156,7 +148,7 @@ class Tensor { ...@@ -156,7 +148,7 @@ class Tensor {
return buffer_->raw_mutable_data(); return buffer_->raw_mutable_data();
} }
template<typename T> template <typename T>
inline T *mutable_data() { inline T *mutable_data() {
MACE_CHECK(buffer_ != nullptr, "buffer is null"); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data()); return static_cast<T *>(buffer_->raw_mutable_data());
...@@ -188,25 +180,17 @@ class Tensor { ...@@ -188,25 +180,17 @@ class Tensor {
is_buffer_owner_ = true; is_buffer_owner_ = true;
} else { } else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
Image *image = dynamic_cast<Image*>(buffer_); Image *image = dynamic_cast<Image *>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0] MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
&& image_shape[1] <= image->image_shape()[1], image_shape[1] <= image->image_shape()[1],
"tensor (source op ", "tensor (source op ", name_,
name_, "): current physical image shape: ", image->image_shape()[0],
"): current physical image shape: ", ", ", image->image_shape()[1], " < logical image shape: ",
image->image_shape()[0], image_shape[0], ", ", image_shape[1]);
", ",
image->image_shape()[1],
" < logical image shape: ",
image_shape[0],
", ",
image_shape[1]);
} }
} }
inline void ResizeLike(const Tensor &other) { inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
ResizeLike(&other);
}
inline void ResizeLike(const Tensor *other) { inline void ResizeLike(const Tensor *other) {
if (other->has_opencl_image()) { if (other->has_opencl_image()) {
...@@ -229,7 +213,7 @@ class Tensor { ...@@ -229,7 +213,7 @@ class Tensor {
memcpy(buffer_->raw_mutable_data(), src, size); memcpy(buffer_->raw_mutable_data(), src, size);
} }
template<typename T> template <typename T>
inline void Copy(const T *src, index_t length) { inline void Copy(const T *src, index_t length) {
MACE_CHECK(length == size(), "copy src and dst with different size."); MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * length); CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
...@@ -248,13 +232,9 @@ class Tensor { ...@@ -248,13 +232,9 @@ class Tensor {
return type_size; return type_size;
} }
inline BufferBase *UnderlyingBuffer() const { inline BufferBase *UnderlyingBuffer() const { return buffer_; }
return buffer_;
}
inline void SetSourceOpName(const std::string name) { inline void SetSourceOpName(const std::string name) { name_ = name; }
name_ = name;
}
inline void DebugPrint() const { inline void DebugPrint() const {
using namespace numerical_chars; using namespace numerical_chars;
...@@ -272,8 +252,9 @@ class Tensor { ...@@ -272,8 +252,9 @@ class Tensor {
} }
CASES(dtype_, (os << (this->data<T>()[i]) << ", ")); CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
} }
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str(); << ", " << dim(3) << "], content:\n"
<< os.str();
} }
class MappingGuard { class MappingGuard {
...@@ -301,20 +282,20 @@ class Tensor { ...@@ -301,20 +282,20 @@ class Tensor {
const Tensor *tensor_; const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_; std::vector<size_t> mapped_image_pitch_;
DISABLE_COPY_AND_ASSIGN(MappingGuard); DISABLE_COPY_AND_ASSIGN(MappingGuard);
}; };
private: private:
Allocator *allocator_; Allocator *allocator_;
DataType dtype_; DataType dtype_;
std::vector<index_t> shape_; std::vector<index_t> shape_;
std::vector<size_t > image_shape_; std::vector<size_t> image_shape_;
BufferBase *buffer_; BufferBase *buffer_;
BufferSlice buffer_slice_; BufferSlice buffer_slice_;
bool is_buffer_owner_; bool is_buffer_owner_;
std::string name_; std::string name_;
DISABLE_COPY_AND_ASSIGN(Tensor); DISABLE_COPY_AND_ASSIGN(Tensor);
}; };
} // namespace tensor } // namespace tensor
......
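The Tensor hunk above pairs explicit Map/UnMap calls on device buffers with a MappingGuard helper, so host-side reads and writes stay balanced even when a function returns early. MappingGuard's exact constructor is not shown in this diff; the sketch below only illustrates the RAII idea with a hypothetical Mappable interface, it is not MACE's class:

// Generic RAII wrapper: map on construction, unmap on destruction.
// Mappable is an assumption standing in for anything exposing
// Map()/UnMap(), e.g. a buffer or tensor.
template <typename Mappable>
class ScopedMap {
 public:
  explicit ScopedMap(Mappable *m) : m_(m) { m_->Map(); }
  ~ScopedMap() { m_->UnMap(); }
  ScopedMap(const ScopedMap &) = delete;
  ScopedMap &operator=(const ScopedMap &) = delete;

 private:
  Mappable *m_;
};

This mirrors why MappingGuard disables copy and assignment: a copied guard would unmap the same buffer twice.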
...@@ -99,9 +99,7 @@ void RestartTiming() { ...@@ -99,9 +99,7 @@ void RestartTiming() {
accum_time = 0; accum_time = 0;
start_time = NowMicros(); start_time = NowMicros();
} }
void StartTiming() { void StartTiming() { start_time = NowMicros(); }
start_time = NowMicros();
}
void StopTiming() { void StopTiming() {
if (start_time != 0) { if (start_time != 0) {
accum_time += (NowMicros() - start_time); accum_time += (NowMicros() - start_time);
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_ #ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_ #define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \ #define BENCHMARK(n) \
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include <map>
#include <cstdint> #include <cstdint>
#include <map>
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
...@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) { ...@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
std::string DataTypeToString(const DataType dt) { std::string DataTypeToString(const DataType dt) {
static std::map<DataType, std::string> dtype_string_map = { static std::map<DataType, std::string> dtype_string_map = {
{DT_FLOAT, "DT_FLOAT"}, {DT_FLOAT, "DT_FLOAT"}, {DT_HALF, "DT_HALF"},
{DT_HALF, "DT_HALF"}, {DT_DOUBLE, "DT_DOUBLE"}, {DT_UINT8, "DT_UINT8"},
{DT_DOUBLE, "DT_DOUBLE"}, {DT_INT8, "DT_INT8"}, {DT_INT32, "DT_INT32"},
{DT_UINT8, "DT_UINT8"}, {DT_UINT32, "DT_UINT32"}, {DT_UINT16, "DT_UINT16"},
{DT_INT8, "DT_INT8"}, {DT_INT64, "DT_INT64"}, {DT_BOOL, "DT_BOOL"},
{DT_INT32, "DT_INT32"}, {DT_STRING, "DT_STRING"}};
{DT_UINT32, "DT_UINT32"},
{DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"},
{DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}
};
MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type"; MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
return dtype_string_map[dt]; return dtype_string_map[dt];
} }
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/core/workspace.h"
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/workspace.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
namespace mace { namespace mace {
...@@ -19,7 +19,7 @@ Tensor *Workspace::CreateTensor(const std::string &name, ...@@ -19,7 +19,7 @@ Tensor *Workspace::CreateTensor(const std::string &name,
} else { } else {
VLOG(3) << "Creating Tensor " << name; VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type))); std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
} }
return GetTensor(name); return GetTensor(name);
} }
...@@ -35,7 +35,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { ...@@ -35,7 +35,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
Tensor *Workspace::GetTensor(const std::string &name) { Tensor *Workspace::GetTensor(const std::string &name) {
return const_cast<Tensor *>( return const_cast<Tensor *>(
static_cast<const Workspace *>(this)->GetTensor(name)); static_cast<const Workspace *>(this)->GetTensor(name));
} }
std::vector<std::string> Workspace::Tensors() const { std::vector<std::string> Workspace::Tensors() const {
...@@ -51,28 +51,28 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -51,28 +51,28 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
index_t model_data_size = 0; index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr; unsigned char *model_data_ptr = nullptr;
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr if (model_data_ptr == nullptr ||
|| reinterpret_cast<long long>(const_tensor.data()) reinterpret_cast<long long>(const_tensor.data()) <
< reinterpret_cast<long long>(model_data_ptr)) { reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data()); model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
} }
} }
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size, model_data_size = std::max(
static_cast<index_t>( model_data_size,
(reinterpret_cast<long long>(const_tensor.data()) static_cast<index_t>((reinterpret_cast<long long>(const_tensor.data()) -
- reinterpret_cast<long long>(model_data_ptr)) reinterpret_cast<long long>(model_data_ptr)) +
+ const_tensor.data_size() const_tensor.data_size() *
* GetEnumTypeSize(const_tensor.data_type()))); GetEnumTypeSize(const_tensor.data_type())));
} }
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU) { if (type == DeviceType::CPU) {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>( tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size))); new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
} else { } else {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>( tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size))); new Buffer(GetDeviceAllocator(type), model_data_size)));
tensor_buffer_->Map(nullptr); tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size); tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap(); tensor_buffer_->UnMap();
...@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name() VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type() << ", data type: " << const_tensor.data_type() << ", shape: "
<< ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(), << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end())); const_tensor.dims().end()));
std::vector<index_t> dims; std::vector<index_t> dims;
...@@ -90,14 +89,12 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -90,14 +89,12 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
dims.push_back(d); dims.push_back(d);
} }
index_t index_t offset = (long long)const_tensor.data() - (long long)model_data_ptr;
offset = (long long) const_tensor.data() - (long long) model_data_ptr;
std::unique_ptr<Tensor> tensor( std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(), new Tensor(BufferSlice(tensor_buffer_.get(), offset,
offset, const_tensor.data_size() *
const_tensor.data_size() GetEnumTypeSize(const_tensor.data_type())),
* GetEnumTypeSize(const_tensor.data_type())), const_tensor.data_type()));
const_tensor.data_type()));
tensor->Reshape(dims); tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor); tensor_map_[const_tensor.name()] = std::move(tensor);
...@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
// as GPU have consistent data type for each layer for now. // as GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op, // As DSP may have different data output type for each op,
// we stick to the same concept. // we stick to the same concept.
for (auto &op: net_def.op()) { for (auto &op : net_def.op()) {
if (op.has_mem_id()) { if (op.has_mem_id()) {
const DataType op_dtype = static_cast<DataType>( const DataType op_dtype = static_cast<DataType>(
ArgumentHelper::GetSingleArgument<OperatorDef, int>( ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, op, "T", static_cast<int>(DT_FLOAT)));
"T",
static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) { if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype; dtype = op_dtype;
// find first valid data type, break // find first valid data type, break
...@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
} }
} }
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) { for (auto &mem_block : net_def.mem_arena().mem_block()) {
std::unique_ptr<BufferBase> std::unique_ptr<BufferBase> image_buf(
image_buf(new Image({mem_block.x(), mem_block.y()}, dtype)); new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
} }
VLOG(3) << "Preallocate image to tensors"; VLOG(3) << "Preallocate image to tensors";
for (auto &op: net_def.op()) { for (auto &op : net_def.op()) {
if (op.has_mem_id()) { if (op.has_mem_id()) {
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor(
(new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype)); new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
tensor->SetSourceOpName(op.name()); tensor->SetSourceOpName(op.name());
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << "; Mem: " VLOG(3)
<< op.mem_id() << "; Image shape: " << "Tensor: " << op.name() << "(" << op.type() << ")"
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0] << "; Mem: " << op.mem_id() << "; Image shape: "
<< ", " << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[1]; << ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor_map_[op.output(0)] = std::move(tensor); tensor_map_[op.output(0)] = std::move(tensor);
} }
} }
......
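The Workspace::LoadModelTensor hunk above treats all constant tensors as slices of one contiguous model-data blob: the blob base is the smallest data pointer across the tensors, and the blob size is the largest offset-plus-byte-length. A compact sketch of that extent computation (ConstTensorView is an illustrative stand-in for the generated const-tensor accessors):

#include <algorithm>
#include <cstdint>
#include <vector>

typedef int64_t index_t;

struct ConstTensorView {
  const unsigned char *data;  // pointer into the model-data blob
  index_t data_size;          // element count
  index_t type_size;          // bytes per element
};

inline index_t ModelDataExtent(const std::vector<ConstTensorView> &tensors,
                               const unsigned char **base_out) {
  const unsigned char *base = nullptr;
  for (const auto &t : tensors) {
    if (base == nullptr || t.data < base) base = t.data;  // blob start
  }
  index_t extent = 0;
  for (const auto &t : tensors) {
    extent = std::max(extent, static_cast<index_t>(t.data - base) +
                                  t.data_size * t.type_size);
  }
  *base_out = base;
  return extent;  // bytes to map or copy for the whole blob
}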
...@@ -5,9 +5,9 @@ ...@@ -5,9 +5,9 @@
#ifndef MACE_CORE_WORKSPACE_H_ #ifndef MACE_CORE_WORKSPACE_H_
#define MACE_CORE_WORKSPACE_H_ #define MACE_CORE_WORKSPACE_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/preallocated_pooled_allocator.h"
namespace mace { namespace mace {
...@@ -43,7 +43,7 @@ class Workspace { ...@@ -43,7 +43,7 @@ class Workspace {
PreallocatedPooledAllocator preallocated_allocator_; PreallocatedPooledAllocator preallocated_allocator_;
DISABLE_COPY_AND_ASSIGN(Workspace); DISABLE_COPY_AND_ASSIGN(Workspace);
}; };
} // namespace mace } // namespace mace
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#define MACE_KERNELS_ACTIVATION_H_ #define MACE_KERNELS_ACTIVATION_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -99,17 +99,15 @@ void PReLUActivation(const T *input_ptr, ...@@ -99,17 +99,15 @@ void PReLUActivation(const T *input_ptr,
output_ptr[i] = in; output_ptr[i] = in;
} }
} }
} }
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ActivationFunctor { class ActivationFunctor {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), : activation_(type), relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit){}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -118,9 +116,11 @@ class ActivationFunctor { ...@@ -118,9 +116,11 @@ class ActivationFunctor {
if (activation_ == PRELU) { if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha); MACE_CHECK_NOTNULL(alpha);
const T *alpha_ptr = alpha->data<T>(); const T *alpha_ptr = alpha->data<T>();
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, output_ptr); PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
output_ptr);
} else { } else {
DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_); DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
} }
} }
...@@ -131,14 +131,16 @@ class ActivationFunctor { ...@@ -131,14 +131,16 @@ class ActivationFunctor {
template <> template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()( void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future); const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
template <typename T> template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> { class ActivationFunctor<DeviceType::OPENCL, T> {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), : activation_(type), relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit){}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
......
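The ActivationFunctor hunk above dispatches PReLU separately because it needs a per-channel alpha tensor (passed as input->dim(3), the NHWC channel count). The full loop body is elided in this diff; the sketch below assumes the usual flattened-NHWC indexing where the channel of flat index i is i % channels:

#include <cstdint>

typedef int64_t index_t;

// Channel-wise PReLU sketch: negative inputs are scaled by their
// channel's alpha, non-negative inputs pass through unchanged.
template <typename T>
void PReLUSketch(const T *input, index_t size, index_t channels,
                 const T *alpha, T *output) {
  for (index_t i = 0; i < size; ++i) {
    const T in = input[i];
    output[i] = in < 0 ? in * alpha[i % channels] : in;
  }
}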
...@@ -18,7 +18,7 @@ namespace mace { ...@@ -18,7 +18,7 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
constexpr int kCostPerGroup = 1024; constexpr int kCostPerGroup = 1024;
} // namespace } // namespace
template <DeviceType D, typename T> template <DeviceType D, typename T>
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#endif #endif
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -24,7 +24,7 @@ struct BatchNormFunctorBase { ...@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
const float relux_max_limit) const float relux_max_limit)
: folded_constant_(folded_constant), : folded_constant_(folded_constant),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit){} relux_max_limit_(relux_max_limit) {}
const bool folded_constant_; const bool folded_constant_;
const ActivationType activation_; const ActivationType activation_;
...@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { ...@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase( : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
...@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase( : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#define MACE_KERNELS_BIAS_ADD_H_ #define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -32,7 +32,6 @@ struct BiasAddFunctor { ...@@ -32,7 +32,6 @@ struct BiasAddFunctor {
const T *bias_ptr = bias->data<T>(); const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for collapse(4) #pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) { for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) { for (index_t h = 0; h < height; ++h) {
...@@ -44,7 +43,6 @@ struct BiasAddFunctor { ...@@ -44,7 +43,6 @@ struct BiasAddFunctor {
} }
} }
} }
} }
}; };
......
...@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase { ...@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
bool i2b_; bool i2b_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase{ struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{ ...@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
} }
}; };
template<typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -16,8 +16,10 @@ class ChannelShuffleFunctor { ...@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
public: public:
ChannelShuffleFunctor(const int group) : group_(group) {} ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape, void operator()(const T *input,
T *output, StatsFuture *future) { const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0]; index_t batch = input_shape[0];
index_t channels = input_shape[1]; index_t channels = input_shape[1];
index_t height = input_shape[2]; index_t height = input_shape[2];
......
...@@ -6,23 +6,23 @@ ...@@ -6,23 +6,23 @@
#define MACE_KERNELS_CONCAT_H_ #define MACE_KERNELS_CONCAT_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ConcatFunctorBase { struct ConcatFunctorBase {
ConcatFunctorBase(const int32_t axis): axis_(axis){} ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase { struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){} ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
...@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
} }
}; };
template<typename T> template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){} ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, StatsFuture *future); Tensor *output,
StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
} // namepsace kernels } // namepsace kernels
......
...@@ -116,9 +116,8 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -116,9 +116,8 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
sum[sum_idx] += vaddvq_f32(tmp); sum[sum_idx] += vaddvq_f32(tmp);
#else #else
for (int inci = 0; inci < inc_tile_size; ++inci) { for (int inci = 0; inci < inc_tile_size; ++inci) {
sum[sum_idx] += sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
in[in_idx * inc_tile_size + inci] * weights[weights_idx * inc_tile_size + inci];
weights[weights_idx * inc_tile_size + inci];
} }
#endif #endif
} }
...@@ -188,7 +187,7 @@ struct Conv2dFunctorBase { ...@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit){} relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w] const int *strides_; // [stride_h, stride_w]
const Padding padding_type_; const Padding padding_type_;
...@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(), CalcOutputSize(input->shape().data(), filter->shape().data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
output->Resize(output_shape); output->Resize(output_shape);
......
...@@ -145,7 +145,7 @@ void CalcOutputSize(const index_t *input_shape, // NHWC ...@@ -145,7 +145,7 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1"); "Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) && MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1), (dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1"); "If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(output_shape); MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size); MACE_CHECK_NOTNULL(padding_size);
...@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC ...@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
*/ */
output_shape[0] = input_shape[0]; output_shape[0] = input_shape[0];
if (round_type == FLOOR) { if (round_type == FLOOR) {
output_shape[1] = static_cast<index_t>(std::floor(1.0 * (input_shape[1] + padding_size[0] output_shape[1] = static_cast<index_t>(
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1); std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
output_shape[2] = static_cast<index_t>(std::floor(1.0 * (input_shape[2] + padding_size[1] (filter_shape[0] - 1) * (dilations[0] - 1)) /
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1); strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} else { } else {
output_shape[1] = static_cast<index_t>(std::ceil(1.0 * (input_shape[1] + padding_size[0] output_shape[1] = static_cast<index_t>(
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1); std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
output_shape[2] = static_cast<index_t>(std::ceil(1.0 * (input_shape[2] + padding_size[1] (filter_shape[0] - 1) * (dilations[0] - 1)) /
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1); strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} }
output_shape[3] = filter_shape[2]; output_shape[3] = filter_shape[2];
} }
void CalPaddingSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW
......
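The CalcOutputSize hunk above encodes the standard dilated-convolution output formula: a kernel of size k with dilation d covers k + (k - 1) * (d - 1) input positions, and the output extent is (in + pad - dilated_k) / stride + 1, rounded down for FLOOR or up for CEIL. A condensed per-dimension form of the same arithmetic:

#include <cmath>
#include <cstdint>

typedef int64_t index_t;

// One spatial dimension of CalcOutputSize, with round_type folded into
// a boolean for brevity.
inline index_t ConvOutDim(index_t in, index_t k, index_t pad, index_t stride,
                          index_t dilation, bool ceil_mode) {
  const double span = in + pad - k - (k - 1) * (dilation - 1);
  const double scaled = span / stride;
  return static_cast<index_t>(ceil_mode ? std::ceil(scaled)
                                        : std::floor(scaled)) + 1;
}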
...@@ -15,7 +15,7 @@ enum Padding { ...@@ -15,7 +15,7 @@ enum Padding {
FULL = 2, // Pads with one less than the filter size on both sides FULL = 2, // Pads with one less than the filter size on both sides
}; };
enum RoundType{ enum RoundType {
FLOOR = 0, FLOOR = 0,
CEIL = 1, CEIL = 1,
}; };
......
...@@ -10,9 +10,9 @@ ...@@ -10,9 +10,9 @@
#endif #endif
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase { ...@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit){} relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w] const int *strides_; // [stride_h, stride_w]
const Padding padding_type_; const Padding padding_type_;
...@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { ...@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(), CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
auto input_shape = fake_filter_shape; auto input_shape = fake_filter_shape;
output->Resize(output_shape); output->Resize(output_shape);
......
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
#define MACE_KERNELS_ELTWISE_H_ #define MACE_KERNELS_ELTWISE_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
enum EltwiseType{ enum EltwiseType {
PROD = 0, PROD = 0,
SUM = 1, SUM = 1,
MAX = 2, MAX = 2,
...@@ -19,8 +19,7 @@ enum EltwiseType{ ...@@ -19,8 +19,7 @@ enum EltwiseType{
}; };
struct EltwiseFunctorBase { struct EltwiseFunctorBase {
EltwiseFunctorBase(const EltwiseType type, EltwiseFunctorBase(const EltwiseType type, const std::vector<float> &coeff)
const std::vector<float> &coeff)
: type_(type), coeff_(coeff) {} : type_(type), coeff_(coeff) {}
EltwiseType type_; EltwiseType type_;
...@@ -29,8 +28,7 @@ struct EltwiseFunctorBase { ...@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase { struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {} : EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0, void operator()(const Tensor *input0,
...@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
switch (type_) { switch (type_) {
case PROD: case PROD:
#pragma omp parallel for #pragma omp parallel for
for(index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = input0_ptr[i] * input1_ptr[i]; output_ptr[i] = input0_ptr[i] * input1_ptr[i];
} }
break; break;
...@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} else { } else {
#pragma omp parallel for #pragma omp parallel for
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i]; output_ptr[i] =
coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
} }
} }
break; break;
case MAX: case MAX:
#pragma omp parallel for #pragma omp parallel for
for(index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]); output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]);
} }
break; break;
case MIN: case MIN:
#pragma omp parallel for #pragma omp parallel for
for(index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]); output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]);
} }
break; break;
...@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} }
}; };
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T>: EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {} : EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0, void operator()(const Tensor *input0,
......
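In the EltwiseFunctor hunk above, SUM is the only type that consults coeff_: the guard condition is elided by the diff, but the weighted branch computes coeff[0]*a + coeff[1]*b. A sketch of that branch, assuming an empty coeff vector means a plain unweighted add:

#include <cstdint>
#include <vector>

typedef int64_t index_t;

// SUM branch of an elementwise op: plain add, or weighted sum when two
// coefficients are supplied.
inline void EltwiseSum(const float *a, const float *b, index_t size,
                       const std::vector<float> &coeff, float *out) {
  if (coeff.empty()) {
    for (index_t i = 0; i < size; ++i) out[i] = a[i] + b[i];
  } else {
    for (index_t i = 0; i < size; ++i)
      out[i] = coeff[0] * a[i] + coeff[1] * b[i];
  }
}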
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#define MACE_KERNELS_FULLY_CONNECTED_H_ #define MACE_KERNELS_FULLY_CONNECTED_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
namespace mace { namespace mace {
...@@ -16,25 +16,23 @@ namespace kernels { ...@@ -16,25 +16,23 @@ namespace kernels {
struct FullyConnectedBase { struct FullyConnectedBase {
FullyConnectedBase(const ActivationType activation, FullyConnectedBase(const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: activation_(activation), : activation_(activation), relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit){}
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase { struct FullyConnectedFunctor : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) : const float relux_max_limit)
FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
output->Resize(output_shape); output->Resize(output_shape);
const index_t N = output->dim(0); const index_t N = output->dim(0);
...@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase { ...@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
} }
}; };
template<typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) : const float relux_max_limit)
FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
......
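The FullyConnectedFunctor hunk above resizes the output to {batch, 1, 1, weight->dim(0)}, which implies a weight layout of [out_dim, in_dim]. The CPU inner loop is not part of this diff; the sketch below is only the obvious reference implementation matching those shapes, not MACE's tuned code:

#include <cstdint>

typedef int64_t index_t;

// Naive fully-connected layer: out[n][o] = bias[o] + sum_i in[n][i] * W[o][i].
inline void FullyConnectedSketch(const float *input, const float *weight,
                                 const float *bias, index_t batch,
                                 index_t in_dim, index_t out_dim,
                                 float *output) {
  for (index_t n = 0; n < batch; ++n) {
    for (index_t o = 0; o < out_dim; ++o) {
      float sum = bias != nullptr ? bias[o] : 0.f;
      for (index_t i = 0; i < in_dim; ++i)
        sum += input[n * in_dim + i] * weight[o * in_dim + i];
      output[n * out_dim + o] = sum;
    }
  }
}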
...@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor { ...@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {
template <> template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()( void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, const float *input,
float *output, StatsFuture *future); const index_t *input_shape,
float *output,
StatsFuture *future);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -6,20 +6,18 @@ ...@@ -6,20 +6,18 @@
#define MACE_KERNELS_MATMUL_H_ #define MACE_KERNELS_MATMUL_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct MatMulFunctor { struct MatMulFunctor {
void operator()(const Tensor *A, void operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
C->Resize(c_shape); C->Resize(c_shape);
const index_t N = C->dim(0); const index_t N = C->dim(0);
...@@ -52,7 +50,6 @@ struct MatMulFunctor { ...@@ -52,7 +50,6 @@ struct MatMulFunctor {
} }
}; };
template <typename T> template <typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> { struct MatMulFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *A, void operator()(const Tensor *A,
......
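The MatMulFunctor hunk above derives C's shape as {A->dim(0), A->dim(1), B->dim(2), 1}, i.e. a batched product of A = [batch, H, K, 1] and B = [batch, K, W, 1]. The inner loops are elided by the diff; the sketch below is the textbook triple loop for those shapes, not the optimized kernel:

#include <cstdint>

typedef int64_t index_t;

// Batched naive matmul: C[n] = A[n] * B[n] for each batch element.
inline void MatMulSketch(const float *A, const float *B, float *C,
                         index_t batch, index_t H, index_t K, index_t W) {
  for (index_t n = 0; n < batch; ++n) {
    const float *a = A + n * H * K;
    const float *b = B + n * K * W;
    float *c = C + n * H * W;
    for (index_t i = 0; i < H; ++i) {
      for (index_t j = 0; j < W; ++j) {
        float sum = 0.f;
        for (index_t k = 0; k < K; ++k) sum += a[i * K + k] * b[k * W + j];
        c[i * W + j] = sum;
      }
    }
  }
}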
...@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()( ...@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (index_t i = 0; i < n; ++i) { for (index_t i = 0; i < n; ++i) {
for (index_t j = 0; j < sample_size; ++j) { for (index_t j = 0; j < sample_size; ++j) {
const float *input_sample_ptr = input_ptr + (i * sample_size + j) * channel; const float *input_sample_ptr =
input_ptr + (i * sample_size + j) * channel;
float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel; float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
const float *new_scale_ptr = new_scale.data(); const float *new_scale_ptr = new_scale.data();
const float *new_offset_ptr = new_offset.data(); const float *new_offset_ptr = new_offset.data();
......
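The NEON batch-norm hunk above only rewraps the pointer arithmetic; the new_scale/new_offset buffers it reads are computed elsewhere and are not shown in this diff. As a hedged reminder, a kernel of this shape typically assumes the standard batch-norm folding below; the function and variable names here are illustrative, not MACE's:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Assumed folding (standard batch norm, not taken from this diff):
    //   new_scale  = scale / sqrt(var + epsilon)
    //   new_offset = offset - mean * new_scale
    // so that per element: output = input * new_scale[c] + new_offset[c].
    void FoldBatchNorm(const std::vector<float> &scale,
                       const std::vector<float> &offset,
                       const std::vector<float> &mean,
                       const std::vector<float> &var, float epsilon,
                       std::vector<float> *new_scale,
                       std::vector<float> *new_offset) {
      const std::size_t channels = scale.size();
      new_scale->resize(channels);
      new_offset->resize(channels);
      for (std::size_t c = 0; c < channels; ++c) {
        (*new_scale)[c] = scale[c] / std::sqrt(var[c] + epsilon);
        (*new_offset)[c] = offset[c] - mean[c] * (*new_scale)[c];
      }
    }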
...@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4); std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2); std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize( kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_, input->shape().data(), filter->shape().data(), dilations_, strides_,
strides_, paddings_, output_shape_vec.data(), paddings.data()); paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec); output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)( typedef void (*Conv2dNeonFunction)(
...@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
auto output_shape = output->shape().data(); auto output_shape = output->shape().data();
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_data, input_shape, filter_data, nullptr, conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
bias_data, output_data, output_shape); output_data, output_shape);
} }
} // namespace kernels } // namespace kernels
......
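The Conv2dFunctor hunk above reflows the call to kernels::CalcPaddingAndOutputSize, which derives output_shape_vec and paddings from the input/filter shapes, strides and dilations. In sketch form, the per-axis arithmetic such a helper performs is the standard convolution output formula (not the exact MACE implementation; names are illustrative):

    #include <cstdint>

    // Output extent of a convolution along one spatial axis, floor rounding.
    int64_t ConvOutputDim(int64_t input, int64_t filter, int64_t stride,
                          int64_t dilation, int64_t pad_total) {
      const int64_t effective_filter = (filter - 1) * dilation + 1;  // dilated size
      return (input + pad_total - effective_filter) / stride + 1;
    }

    // Example: 224 input, 3x3 filter, stride 2, dilation 1, total padding 2
    // -> ConvOutputDim(224, 3, 2, 1, 2) == 112.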
...@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW ...@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
int input_channels = input_shape[1]; int input_channels = input_shape[1];
int input_height = input_shape[2]; int input_height = input_shape[2];
int input_width = input_shape[3]; int input_width = input_shape[3];
int multiplier = int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
filter_shape == nullptr ? 0 : filter_shape[0]; int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) { for (int b = 0; b < output_batch; ++b) {
for (int oc = 0; oc < output_channels; ++oc) { for (int oc = 0; oc < output_channels; ++oc) {
...@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW ...@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
int input_channels = input_shape[1]; int input_channels = input_shape[1];
int input_height = input_shape[2]; int input_height = input_shape[2];
int input_width = input_shape[3]; int input_width = input_shape[3];
int multiplier = int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
filter_shape == nullptr ? 0 : filter_shape[0]; int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) { for (int b = 0; b < output_batch; ++b) {
......
...@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()( ...@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, DepthwiseConv2dFunctor<DeviceType::CPU, float>(
dilations_)( strides_, paddings_, dilations_)(input, filter, bias, output, future);
input, filter, bias, output, future);
return; return;
} }
...@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()( ...@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
input_shape = padded_input.shape().data(); input_shape = padded_input.shape().data();
} }
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr, output_ptr, conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
output_shape); output_ptr, output_shape);
} }
} // namespace kernels } // namespace kernels
......
...@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
default: default:
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
runtime->BuildKernel("activation", kernel_name, built_options);
int idx = 0; int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) { if (activation_ == PRELU) {
...@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(2), output->dim(3)); output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
} }
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include "mace/kernels/addn.h" #include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
for (auto input : input_tensors) { for (auto input : input_tensors) {
kernel_.setArg(idx++, kernel_.setArg(idx++, *(input->opencl_image()));
*(input->opencl_image()));
} }
kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image()));
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(width_pixels), static_cast<uint32_t>(batch_height_pixels)};
static_cast<uint32_t>(batch_height_pixels)
};
const std::vector<uint32_t> lws = {64, 16, 1}; const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "addn_opencl_kernel_" ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< output_shape[0] << "_" << "_" << output_shape[2] << "_" << output_shape[3];
<< output_shape[1] << "_"
<< output_shape[2] << "_"
<< output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; };
template template struct AddNFunctor<DeviceType::OPENCL, float>;
struct AddNFunctor<DeviceType::OPENCL, float>;
template template struct AddNFunctor<DeviceType::OPENCL, half>;
struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
runtime->BuildKernel("batch_norm", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, kernel_.setArg(idx++, *(offset->opencl_image()));
*(offset->opencl_image()));
if (!folded_constant_) { if (!folded_constant_) {
kernel_.setArg(idx++, kernel_.setArg(idx++, *(mean->opencl_image()));
*(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image())); kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon); kernel_.setArg(idx++, epsilon);
} }
......
...@@ -12,11 +12,10 @@ namespace mace { ...@@ -12,11 +12,10 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()( void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *input, const Tensor *bias,
const Tensor *bias, Tensor *output,
Tensor *output, StatsFuture *future) {
StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t height = input->dim(1);
const index_t width = input->dim(2); const index_t width = input->dim(2);
...@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
...@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template template struct BiasAddFunctor<DeviceType::OPENCL, float>;
struct BiasAddFunctor<DeviceType::OPENCL, float>; template struct BiasAddFunctor<DeviceType::OPENCL, half>;
template
struct BiasAddFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
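The BiasAdd functor above follows the pattern used throughout these OpenCL kernels: enqueue with a cl::Event and, when a StatsFuture is supplied, store a wait_fn that blocks on the event and optionally fills CallStats. A dependency-free sketch of that future idea, with simplified stand-ins for MACE's CallStats/StatsFuture types (the fields and stub timing below are assumptions, not MACE's definitions):

    #include <cstdint>
    #include <functional>
    #include <iostream>

    struct CallStats {          // simplified stand-in
      int64_t start_micros = 0;
      int64_t end_micros = 0;
    };

    struct StatsFuture {        // simplified stand-in
      std::function<void(CallStats *)> wait_fn;
    };

    int main() {
      StatsFuture future;
      // An enqueue site would capture its event here; this stub just fills stats.
      future.wait_fn = [](CallStats *stats) {
        if (stats != nullptr) {
          stats->start_micros = 0;
          stats->end_micros = 42;  // placeholder timing
        }
      };

      CallStats stats;
      future.wait_fn(&stats);  // the caller decides when to synchronize
      std::cout << "elapsed micros: " << (stats.end_micros - stats.start_micros)
                << std::endl;
      return 0;
    }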
...@@ -9,36 +9,33 @@ ...@@ -9,36 +9,33 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
const BufferType type, Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
Tensor *image,
StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape); CalImage2DShape(buffer->shape(), type, image_shape);
if(type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape); image->ResizeImage(new_shape, image_shape);
} else { } else {
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
} }
} else { } else {
Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer()); Image *image_buf = dynamic_cast<Image *>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape(); image_shape = image_buf->image_shape();
buffer->Resize(image->shape()); buffer->Resize(image->shape());
} }
size_t gws[2] = {image_shape[0], size_t gws[2] = {image_shape[0], image_shape[1]};
image_shape[1]};
std::string kernel_name; std::string kernel_name;
switch (type) { switch (type) {
case CONV2D_FILTER: case CONV2D_FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image"; kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break; break;
case DW_CONV2D_FILTER: case DW_CONV2D_FILTER:
kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image"; kernel_name =
i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
break; break;
case IN_OUT_CHANNEL: case IN_OUT_CHANNEL:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
...@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break; break;
case IN_OUT_HEIGHT: case IN_OUT_HEIGHT:
case WEIGHT_HEIGHT: case WEIGHT_HEIGHT:
kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image"; kernel_name = i2b_ ? "in_out_height_image_to_buffer"
: "in_out_height_buffer_to_image";
break; break;
case IN_OUT_WIDTH: case IN_OUT_WIDTH:
MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now"; MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
...@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER:
gws[1] /= 16; gws[1] /= 16;
kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image"; kernel_name = i2b_ ? "winograd_filter_image_to_buffer"
: "winograd_filter_buffer_to_image";
break; break;
} }
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
...@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
auto b2f_kernel = runtime->BuildKernel("buffer_to_image", auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, obfuscated_kernel_name, built_options);
built_options);
uint32_t idx = 0; uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (!i2b_) { if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned"); MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype()))); "buffer offset not aligned");
b2f_kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
} }
if (type == ARGUMENT) { if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) { } else if (type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, 1); b2f_kernel.setArg(idx++, 1);
...@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const std::vector<uint32_t> lws = {16, 64}; const std::vector<uint32_t> lws = {16, 64};
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event);
cl::NDRange(lws[0], lws[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) { if (future != nullptr) {
......
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
inline DATA_TYPE4 do_activation(DATA_TYPE4 in, inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#ifdef USE_PRELU #ifdef USE_PRELU
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include "mace/kernels/concat.h" #include "mace/kernels/concat.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel, ...@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input0->opencl_image()))); kernel->setArg(idx++,
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input1->opencl_image()))); *(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3))); kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->opencl_image()))); kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
} }
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_opencl_kernel_" ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
} }
...@@ -97,27 +96,25 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -97,27 +96,25 @@ static void ConcatN(cl::Kernel *kernel,
index_t input_channel_blk = input->dim(3) / 4; index_t input_channel_blk = input->dim(3) / 4;
chan_blk_offset += input_channel_blk; chan_blk_offset += input_channel_blk;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "concat_n_opencl_kernel_" ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< input_channel_blk << "_"
<< width << "_"
<< batch * height; << batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
} }
} }
template<typename T> template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list, void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *output, const std::vector<const Tensor *> &input_list,
StatsFuture *future) { Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size(); const int inputs_count = input_list.size();
MACE_CHECK(inputs_count >= 2 && axis_ == 3) MACE_CHECK(inputs_count >= 2 && axis_ == 3)
<< "Concat opencl kernel only support >=2 elements with axis == 3"; << "Concat opencl kernel only support >=2 elements with axis == 3";
const Tensor *input0 = input_list[0]; const Tensor *input0 = input_list[0];
bool divisible_four = input0->dim(axis_) % 4 == 0; bool divisible_four = input0->dim(axis_) % 4 == 0;
...@@ -137,8 +134,9 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te ...@@ -137,8 +134,9 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
} }
output_shape[axis_] += input->dim(axis_); output_shape[axis_] += input->dim(axis_);
} }
MACE_CHECK(inputs_count == 2 || divisible_four, MACE_CHECK(
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."); inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output->ResizeImage(output_shape, image_shape); output->ResizeImage(output_shape, image_shape);
...@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te ...@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
default: default:
if (divisible_four) { if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future); ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
} } else {
else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
} }
}; };
template template struct ConcatFunctor<DeviceType::OPENCL, float>;
struct ConcatFunctor<DeviceType::OPENCL, float>; template struct ConcatFunctor<DeviceType::OPENCL, half>;
template
struct ConcatFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
template<typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)( typedef void (*Conv2dOpenclFunction)(
cl::Kernel *kernel, cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, const Tensor *bias, const int stride, const int *padding,
const int *padding, const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt, Tensor *output,
Tensor *output, StatsFuture *future); StatsFuture *future);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = static const Conv2dOpenclFunction selector[5] = {
{Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
...@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(), CalcOutputSize(input->shape().data(), filter->shape().data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
...@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) { selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
relux_max_limit_, DataTypeToEnum<T>::value, output, future); dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
activation_, relux_max_limit_, DataTypeToEnum<T>::value, output, future); dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
} }
} }
template template struct Conv2dFunctor<DeviceType::OPENCL, float>;
struct Conv2dFunctor<DeviceType::OPENCL, float>; template struct Conv2dFunctor<DeviceType::OPENCL, half>;
template
struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
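The Conv2dFunctor above dispatches through a fixed selector table of function pointers indexed by kernel size (Conv2dOpenclK1x1 in slot 0, Conv2dOpenclK3x3 in slot 2, nullptr elsewhere) and falls back to the generic Conv2dOpencl when the slot is empty or the kernel is larger than 5. A toy sketch of that dispatch shape, with made-up function names:

    #include <iostream>

    using ConvFn = void (*)(int kernel_size);

    void Conv1x1(int k) { std::cout << "specialised 1x1 path\n"; }
    void Conv3x3(int k) { std::cout << "specialised 3x3 path\n"; }
    void ConvGeneric(int k) { std::cout << "generic path for " << k << "x" << k << "\n"; }

    void Dispatch(int kernel_size) {
      // Specialised kernels registered by (kernel_size - 1); empty slots fall back.
      static const ConvFn selector[5] = {Conv1x1, nullptr, Conv3x3, nullptr, nullptr};
      if (kernel_size <= 5 && selector[kernel_size - 1] != nullptr) {
        selector[kernel_size - 1](kernel_size);
      } else {
        ConvGeneric(kernel_size);
      }
    }

    int main() {
      Dispatch(1);  // specialised
      Dispatch(3);  // specialised
      Dispatch(5);  // generic (slot is nullptr)
      return 0;
    }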
...@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg(idx++, kernel->setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel->setArg(idx++, kernel->setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
// FIXME handle flexable data type: half not supported // FIXME handle flexable data type: half not supported
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input_height)); kernel->setArg(idx++, static_cast<int>(input_height));
...@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
} }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg(idx++, kernel->setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel->setArg(idx++, kernel->setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input->dim(1))); kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2))); kernel->setArg(idx++, static_cast<int>(input->dim(2)));
......
...@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
runtime->BuildKernel("conv_2d", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg(idx++, kernel->setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel->setArg(idx++, kernel->setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1))); kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2))); kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
......
...@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
if(kernel->get() == nullptr) { if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0); const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1); const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2); const index_t input_width = input->dim(2);
...@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
*kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); *kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg( kernel->setArg(idx++, *(filter->opencl_image()));
idx++, *(filter->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel->setArg( kernel->setArg(idx++, *(bias->opencl_image()));
idx++, *(bias->opencl_image()));
} }
kernel->setArg( kernel->setArg(idx++, *(output->opencl_image()));
idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<short>(input_height)); kernel->setArg(idx++, static_cast<short>(input_height));
kernel->setArg(idx++, static_cast<short>(input_width)); kernel->setArg(idx++, static_cast<short>(input_width));
...@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(), CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, output, future);
} }
......
...@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input0->dim(0); const index_t batch = input0->dim(0);
const index_t height = input0->dim(1); const index_t height = input0->dim(1);
const index_t width = input0->dim(2); const index_t width = input0->dim(2);
...@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, kernel_.setArg(idx++, *(input0->opencl_image()));
*(input0->opencl_image())); kernel_.setArg(idx++, *(input1->opencl_image()));
kernel_.setArg(idx++,
*(input1->opencl_image()));
if (!coeff_.empty()) { if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]); kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]); kernel_.setArg(idx++, coeff_[1]);
...@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(width_pixels), static_cast<uint32_t>(batch_height_pixels)};
static_cast<uint32_t>(batch_height_pixels)
};
const std::vector<uint32_t> lws = {64, 16, 1}; const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss; std::stringstream ss;
ss << "eltwise_opencl_kernel_" ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
...@@ -10,14 +10,13 @@ ...@@ -10,14 +10,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
...@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
default: default:
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = runtime->BuildKernel("fully_connected", kernel_name, built_options); kernel_ =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, kernel_.setArg(idx++, *(input->opencl_image()));
*(input->opencl_image())); kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++,
*(weight->opencl_image()));
if (bias != nullptr) { if (bias != nullptr) {
kernel_.setArg(idx++, kernel_.setArg(idx++, *(bias->opencl_image()));
*(bias->opencl_image()));
} }
kernel_.setArg(idx++, kernel_.setArg(idx++, *(output->opencl_image()));
*(output->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(input->dim(1))); kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2))); kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(input->dim(3))); kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
...@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
static_cast<uint32_t>(output_blocks),
}; };
const std::vector<uint32_t> lws = {16, 64, 1}; const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss; std::stringstream ss;
ss << "fc_opencl_kernel_" ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(0) << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; };
template template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
// //
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -28,8 +28,9 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */ ...@@ -28,8 +28,9 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
} }
// [H * W * M, (Ic + 3) / 4] // [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */ void CalDepthwiseConv2dFilterImageShape(
std::vector<size_t> &image_shape) { const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[3]; image_shape[0] = shape[0] * shape[1] * shape[3];
...@@ -47,8 +48,9 @@ void CalArgImageShape(const std::vector<index_t> &shape, ...@@ -47,8 +48,9 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// Only support 3x3 now // Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc] // [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(const std::vector<index_t> &shape, /* Oc, Ic, H, W*/ void CalWinogradFilterImageShape(
std::vector<size_t> &image_shape) { const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[1]); image_shape[0] = RoundUpDiv4(shape[1]);
...@@ -115,19 +117,16 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -115,19 +117,16 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
} }
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) { const BufferType type) {
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1], 1}; return {16, shape[0], shape[1], 1};
}else if (type == IN_OUT_HEIGHT) { } else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
((shape[1] - 1) / 2) *
((shape[2] - 1) / 2);
return {16, shape[3], out_width, 1}; return {16, shape[3], out_width, 1};
} else { } else {
LOG(FATAL) << "Mace not supported yet."; LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>(); return std::vector<index_t>();
} }
} }
...@@ -188,10 +187,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -188,10 +187,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size); local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(gws[2], local_ws[2] =
kwg_size / (local_ws[0] * local_ws[1])); std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return { return {
// TODO tuning these magic numbers // TODO tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 1}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1}, {kwg_size / 32, 4, 8, 1},
...@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int { std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D"; MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
if (timer == nullptr) { if (timer == nullptr) {
uint32_t num_blocks = params[3]; uint32_t num_blocks = params[3];
const uint32_t block_size = gws[2] / num_blocks; const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
...@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) { if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros(); double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming(); timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]); uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
(*tuning_result)[3] = num_blocks; (*tuning_result)[3] = num_blocks;
const uint32_t block_size = gws[2] / num_blocks; const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2), cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
...@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
{kwg_size / 256, 256, 1}, {kwg_size / 256, 256, 1},
{kwg_size / 512, 512, 1}, {kwg_size / 512, 512, 1},
{kwg_size, 1, 1}, {kwg_size, 1, 1},
{1, kwg_size, 1} {1, kwg_size, 1}};
};
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int { std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d"; MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS; cl_int error = CL_SUCCESS;
if (timer == nullptr) { if (timer == nullptr) {
uint32_t num_blocks = params[2]; uint32_t num_blocks = params[2];
const uint32_t block_size = gws[1] / num_blocks; const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++; if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(0, i * block_size), cl::NDRange(params[0], params[1]), nullptr, &event);
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
} }
} else { } else {
timer->ClearTiming(); timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming(); timer->AccumulateTiming();
...@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) { if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros(); double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming(); timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]); uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
(*tuning_result)[2] = num_blocks; (*tuning_result)[2] = num_blocks;
const uint32_t block_size = gws[1] / num_blocks; const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++; if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) { for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event); cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming(); timer->AccumulateTiming();
...@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
return error; return error;
}; };
OpenCLProfilingTimer timer(&event); OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key, Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
lws, tuning_key, lws, params_generator, func, &timer);
params_generator,
func,
&timer);
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
...@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel, ...@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
} }
}; };
} }
} }
} // namespace kernels } // namespace kernels
......
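The TuningOrRun2DKernel/TuningOrRun3DKernel hunks above mostly reflow the launch-splitting logic: when profiling shows a kernel exceeding kMaxKernelExeTime, the last global-work-size axis is split into num_blocks launches, the final one absorbing the remainder. A self-contained sketch of just that splitting arithmetic (OpenCL calls omitted; the function name is illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Mirror of the splitting above: pick num_blocks from the measured time,
    // then launch num_blocks ranges of block_size, the last carrying the tail.
    std::vector<uint32_t> SplitGws(uint32_t gws, double elapse_micros,
                                   double max_kernel_micros) {
      uint32_t num_blocks = std::min(
          static_cast<uint32_t>(elapse_micros / max_kernel_micros) + 1, gws);
      const uint32_t block_size = gws / num_blocks;
      if (gws % num_blocks > 0) num_blocks++;  // one extra launch for the tail
      std::vector<uint32_t> ranges;
      for (uint32_t i = 0; i < num_blocks; ++i) {
        ranges.push_back((i == num_blocks - 1) ? (gws - i * block_size)
                                               : block_size);
      }
      return ranges;
    }

    int main() {
      // e.g. gws = 100, a 3.5 ms kernel against a 1 ms budget -> four launches of 25.
      for (uint32_t r : SplitGws(100, 3500.0, 1000.0)) std::cout << r << " ";
      std::cout << std::endl;
      return 0;
    }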
...@@ -5,16 +5,16 @@ ...@@ -5,16 +5,16 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_ #ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/core/future.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType { enum BufferType {
CONV2D_FILTER = 0, CONV2D_FILTER = 0,
...@@ -31,7 +31,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -31,7 +31,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> &image_shape);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type); const BufferType type);
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
...@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
...@@ -72,12 +71,12 @@ inline bool LimitKernelTime() { ...@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
} }
namespace { namespace {
template<typename T> template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v; (*ss) << v;
} }
template<typename T, typename... Args> template <typename T, typename... Args>
void AppendToStream(std::stringstream *ss, void AppendToStream(std::stringstream *ss,
const std::string &delimiter, const std::string &delimiter,
T first, T first,
...@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss, ...@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
} }
} // namespace } // namespace
template<typename... Args> template <typename... Args>
std::string Concat(Args... args) { std::string Concat(Args... args) {
std::stringstream ss; std::stringstream ss;
AppendToStream(&ss, "_", args...); AppendToStream(&ss, "_", args...);
......
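The helper.h hunk above only re-spaces the variadic Concat/AppendToStream templates that build tuning keys such as Concat(tuning_key_prefix_, dims...). A standalone re-implementation of that joining behaviour, for illustration only (the multi-argument AppendToStream body is elided in the diff, so the recursion below is an assumption):

    #include <iostream>
    #include <sstream>
    #include <string>

    namespace sketch {

    template <typename T>
    void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
      (*ss) << v;  // last value: no trailing delimiter
    }

    template <typename T, typename... Args>
    void AppendToStream(std::stringstream *ss, const std::string &delimiter,
                        T first, Args... args) {
      (*ss) << first << delimiter;
      AppendToStream(ss, delimiter, args...);
    }

    template <typename... Args>
    std::string Concat(Args... args) {
      std::stringstream ss;
      AppendToStream(&ss, "_", args...);
      return ss.str();
    }

    }  // namespace sketch

    int main() {
      // Produces keys shaped like the ones above, e.g. "activation_opencl_kernel_1_32_32_64".
      std::cout << sketch::Concat("activation_opencl_kernel", 1, 32, 32, 64)
                << std::endl;
      return 0;
    }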
...@@ -11,12 +11,10 @@ namespace mace { ...@@ -11,12 +11,10 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()( void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
const Tensor *A, const Tensor *B,
const Tensor *B, Tensor *C,
Tensor *C, StatsFuture *future) {
StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape; std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape); CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
...@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, kernel_.setArg(idx++, *(B->opencl_image()));
*(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image())); kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height)); kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width)); kernel_.setArg(idx++, static_cast<int>(width));
...@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
}; };
const std::vector<uint32_t> lws = {16, 64, 1}; const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss; std::stringstream ss;
ss << "matmul_opencl_kernel_" ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(0) << "_" << C->dim(2) << "_" << C->dim(3);
<< C->dim(1) << "_"
<< C->dim(2) << "_"
<< C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; };
template template struct MatMulFunctor<DeviceType::OPENCL, float>;
struct MatMulFunctor<DeviceType::OPENCL, float>;
template template struct MatMulFunctor<DeviceType::OPENCL, half>;
struct MatMulFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -11,17 +11,15 @@ ...@@ -11,17 +11,15 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = { std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
kernels_[0], kernels_[1], input->dim(3)};
input->dim(3), input->dim(3)
};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
...@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height), static_cast<uint32_t>(batch * out_height),
}; };
std::vector<uint32_t> lws = {8, 16, 8, 1}; std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "pooling_opencl_kernel_" ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template struct PoolingFunctor<DeviceType::OPENCL, float>;
struct PoolingFunctor<DeviceType::OPENCL, float>; template struct PoolingFunctor<DeviceType::OPENCL, half>;
template
struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/kernels/resize_bilinear.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape{batch, out_height, out_width, channels}; std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
BufferType::IN_OUT_CHANNEL,
output_image_shape); output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_); float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
...@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
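CalculateResizeScale used above is defined elsewhere; the sketch below assumes the common align_corners convention ((in - 1) / (out - 1) when aligned, in / out otherwise), which is an assumption rather than the project's definition.

#include <iostream>

// Assumed behaviour of a helper like CalculateResizeScale: the scale that
// maps output coordinates back into input coordinates.
float CalculateResizeScaleSketch(int in_size, int out_size, bool align_corners) {
  if (align_corners && out_size > 1) {
    return static_cast<float>(in_size - 1) / (out_size - 1);
  }
  return static_cast<float>(in_size) / out_size;
}

int main() {
  // Hypothetical upscale from 240 rows to 480 rows.
  std::cout << CalculateResizeScaleSketch(240, 480, false) << std::endl;  // 0.5
  std::cout << CalculateResizeScaleSketch(240, 480, true) << std::endl;   // ~0.499
}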
...@@ -6,13 +6,13 @@ ...@@ -6,13 +6,13 @@
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "softmax_opencl_kernel_" ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< output->dim(0) << "_" << "_" << output->dim(2) << "_" << output->dim(3);
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
struct SoftmaxFunctor<DeviceType::OPENCL, float>; template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
template
struct SoftmaxFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -5,20 +5,21 @@ ...@@ -5,20 +5,21 @@
#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h" #include "mace/kernels/space_to_batch.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor, void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<index_t> &output_shape, Tensor *space_tensor,
Tensor *batch_tensor, const std::vector<index_t> &output_shape,
StatsFuture *future) { Tensor *batch_tensor,
StatsFuture *future) {
const char *kernel_name = nullptr; const char *kernel_name = nullptr;
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
...@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DCMD_DATA_TYPE=" +
kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options); DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
if (b2s_) { if (b2s_) {
...@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
} }
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3)); const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
const uint32_t gws[3] = {chan_blk, const uint32_t gws[3] = {
static_cast<uint32_t>(batch_tensor->dim(2)), chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))}; static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const std::vector<uint32_t> lws = {8, 16, 8, 1}; const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << kernel_name << "_" ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(1) << "_"
<< batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3); << batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
......
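The chan_blk value above comes from RoundUpDiv4; assuming it is the usual rounding-up integer division by four (its definition is outside this diff), the global work size arithmetic looks like this small sketch with hypothetical tensor dimensions.

#include <cstdint>
#include <iostream>

// Assumed behaviour of RoundUpDiv4: ceil(v / 4) in integer arithmetic,
// used to pack channels into 4-wide image blocks.
template <typename T>
T RoundUpDiv4Sketch(T v) {
  return (v + 3) / 4;
}

int main() {
  const uint32_t channels = 38;            // hypothetical batch_tensor->dim(3)
  const uint32_t width = 14;               // hypothetical batch_tensor->dim(2)
  const uint32_t batch_times_height = 56;  // hypothetical dim(0) * dim(1)

  const uint32_t chan_blk = RoundUpDiv4Sketch(channels);  // 10 blocks for 38 channels
  const uint32_t gws[3] = {chan_blk, width, batch_times_height};
  std::cout << gws[0] << " " << gws[1] << " " << gws[2] << std::endl;
}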
...@@ -11,21 +11,21 @@ ...@@ -11,21 +11,21 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<typename T> template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor, void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *output_tensor, const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
StatsFuture *future) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1}; std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(), strides_.data(), input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
padding_type_, output_shape.data(), paddings.data()); strides_.data(), padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(), CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
dilations_.data(), strides_.data(), RoundType::FLOOR, output_shape.data()); paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
} }
const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_h = (output_shape[1] + 1) / 2;
...@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i ...@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
obfuscated_kernel_name,
built_options); built_options);
uint32_t idx = 0; uint32_t idx = 0;
...@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i ...@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
} }
const uint32_t gws[2] = {static_cast<uint32_t>(out_width), const uint32_t gws[2] = {
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))}; static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const std::vector<uint32_t> lws = {128, 8, 1}; const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "winograd_transform_kernel_" ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
<< input_tensor->dim(3); << input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
} }
template<typename T> template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor, void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias, const Tensor *input_tensor,
Tensor *output_tensor, const Tensor *bias,
StatsFuture *future) { Tensor *output_tensor,
std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)}; StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name); built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value)); obfuscated_kernel_name);
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value)); built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) { switch (activation_) {
case NOOP: case NOOP:
...@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te ...@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
obfuscated_kernel_name,
built_options); built_options);
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->opencl_image()))); kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
if (bias != nullptr) { if (bias != nullptr) {
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->opencl_image()))); kernel_.setArg(idx++,
*(static_cast<const cl::Image2D *>(bias->opencl_image())));
} }
kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image()))); kernel_.setArg(
idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
...@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te ...@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
} }
const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)), const uint32_t gws[2] = {
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))}; static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
const std::vector<uint32_t> lws = {128, 8, 1}; const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss; std::stringstream ss;
ss << "winograd_inverse_transform_kernel_" ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
<< input_tensor->dim(3); << input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
struct WinogradTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
#include <limits> #include <limits>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
...@@ -42,7 +42,7 @@ struct PoolingFunctorBase { ...@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
const int *dilations_; const int *dilations_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase { struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(const PoolingType pooling_type,
const int *kernels, const int *kernels,
...@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase(pooling_type, kernels, : PoolingFunctorBase(
strides, padding_type, pooling_type, kernels, strides, padding_type, paddings, dilations) {
paddings, dilations) {} }
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = { std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1], kernels_[0], kernels_[1], input_tensor->dim(3), input_tensor->dim(3)};
input_tensor->dim(3), input_tensor->dim(3)
};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize( kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_, strides_, input_tensor->shape().data(), filter_shape.data(), dilations_,
padding_type_, output_shape.data(), paddings.data()); strides_, padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(), CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
} }
output_tensor->Resize(output_shape); output_tensor->Resize(output_shape);
...@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c; index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c; index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest(); T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
...@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
res = std::max(res, input[input_offset]); res = std::max(res, input[input_offset]);
} }
} }
...@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c; index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c; index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0; T sum = 0;
int block_size = 0; int block_size = 0;
...@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw; int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 && if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) { inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset]; sum += input[input_offset];
block_size += 1; block_size += 1;
} }
...@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase { ...@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
} }
} }
} }
}; };
template<> template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()( void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);
Tensor *output_tensor,
StatsFuture *future);
template<typename T> template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(const PoolingType pooling_type,
const int *kernels, const int *kernels,
...@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase(pooling_type, kernels, : PoolingFunctorBase(
strides, padding_type, pooling_type, kernels, strides, padding_type, paddings, dilations) {
paddings, dilations) {} }
void operator()(const Tensor *input_tensor, void operator()(const Tensor *input_tensor,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future); StatsFuture *future);
......
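The out_offset and input_offset expressions in the pooling loops above are plain NHWC flattening; a small worked illustration (dimension values here are hypothetical):

#include <cstdint>
#include <iostream>

using index_t = int64_t;

// NHWC flattening as used in the pooling loops above:
// offset(b, h, w, c) = (((b * H) + h) * W + w) * C + c
index_t OffsetNHWC(index_t b, index_t h, index_t w, index_t c,
                   index_t H, index_t W, index_t C) {
  return (((b * H) + h) * W + w) * C + c;
}

int main() {
  const index_t H = 4, W = 5, C = 3;  // hypothetical output dims
  // Element (b=1, h=2, w=3, c=1) lands at ((1*4+2)*5+3)*3+1 = 100.
  std::cout << OffsetNHWC(1, 2, 3, 1, H, W, C) << std::endl;  // prints 100
}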
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESHAPE_H_ #define MACE_KERNELS_RESHAPE_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -25,7 +25,6 @@ struct ReshapeFunctor { ...@@ -25,7 +25,6 @@ struct ReshapeFunctor {
} }
}; };
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_ #define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -74,9 +74,9 @@ void ResizeImage(const T *images, ...@@ -74,9 +74,9 @@ void ResizeImage(const T *images,
const T *batch_input_ptr = images + in_batch_num_values * b; const T *batch_input_ptr = images + in_batch_num_values * b;
T *batch_output_ptr = output + out_batch_num_values * b; T *batch_output_ptr = output + out_batch_num_values * b;
const T *y_lower_input_ptr = const T *y_lower_input_ptr =
batch_input_ptr + ys[y].lower * in_width * channels; batch_input_ptr + ys[y].lower * in_width * channels;
const T *y_upper_input_ptr = const T *y_upper_input_ptr =
batch_input_ptr + ys[y].upper * in_width * channels; batch_input_ptr + ys[y].upper * in_width * channels;
T *y_output_ptr = batch_output_ptr + y * out_width * channels; T *y_output_ptr = batch_output_ptr + y * out_width * channels;
const float ys_lerp = ys[y].lerp; const float ys_lerp = ys[y].lerp;
...@@ -95,7 +95,7 @@ void ResizeImage(const T *images, ...@@ -95,7 +95,7 @@ void ResizeImage(const T *images,
const T bottom_right = bottom_right_ptr[c]; const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left, output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp); bottom_right, xs_lerp, ys_lerp);
} }
} }
} }
...@@ -107,10 +107,10 @@ struct ResizeBilinearFunctorBase { ...@@ -107,10 +107,10 @@ struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size, ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners) bool align_corners)
: align_corners_(align_corners) { : align_corners_(align_corners) {
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
out_height_ = size[0]; out_height_ = size[0];
out_width_ = size[1]; out_width_ = size[1];
} }
protected: protected:
bool align_corners_; bool align_corners_;
...@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { ...@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
} }
}; };
template<typename T> template <typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase { struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
: ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {} : ResizeBilinearFunctorBase(size, align_corners) {}
......
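ComputeLerp itself is not shown in this diff; the sketch below assumes the conventional bilinear blend (interpolate along x on the top and bottom rows, then along y), which matches how the four neighbours and the two lerp factors are passed above.

#include <iostream>

// Assumed conventional bilinear blend for a helper like ComputeLerp.
float ComputeLerpSketch(float top_left, float top_right,
                        float bottom_left, float bottom_right,
                        float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return top + (bottom - top) * y_lerp;
}

int main() {
  // Half way between all four neighbours gives their average.
  std::cout << ComputeLerpSketch(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f) << std::endl;
  // prints 1.5
}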
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
#define MACE_KERNELS_CONV_2D_H_ #define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -16,11 +16,10 @@ namespace kernels { ...@@ -16,11 +16,10 @@ namespace kernels {
struct SpaceToBatchFunctorBase { struct SpaceToBatchFunctorBase {
SpaceToBatchFunctorBase(const std::vector<int> &paddings, SpaceToBatchFunctorBase(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s): bool b2s)
paddings_(paddings.begin(), paddings.end()), : paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()), block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s) b2s_(b2s) {}
{}
std::vector<int> paddings_; std::vector<int> paddings_;
std::vector<int> block_shape_; std::vector<int> block_shape_;
...@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase { ...@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){} bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor, void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
...@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{ ...@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
}; };
template <typename T> template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){} bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor, void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
...@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{ ...@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_ #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/kernels/conv_pool_2d_util.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -17,38 +17,36 @@ namespace kernels { ...@@ -17,38 +17,36 @@ namespace kernels {
struct WinogradTransformFunctorBase { struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type, WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
: strides_({1, 1}), dilations_({1, 1}), : strides_({1, 1}),
padding_type_(padding_type), paddings_(paddings) {} dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings) {}
const std::vector<int> strides_; // [stride_h, stride_w] const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w] const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_; Padding padding_type_;
std::vector<int> paddings_; std::vector<int> paddings_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase { struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {} : WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input, void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
Tensor *output,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
}; };
template<typename T> template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFunctorBase { struct WinogradTransformFunctor<DeviceType::OPENCL, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {} : WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input, void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
...@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase { ...@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
const float relux_max_limit_; const float relux_max_limit_;
}; };
template<DeviceType D, typename T> template <DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch, WinogradInverseTransformFunctor(const int batch,
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {} : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
StatsFuture *future) { StatsFuture *future) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
}; };
template<typename T> template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
: WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch, WinogradInverseTransformFunctor(const int batch,
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {} : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
......
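The round_h = (H + 1) / 2 and round_w = (W + 1) / 2 expressions in the Winograd transforms above count 2x2 output tiles (the F(2x2, 3x3) variant that the winograd_transform_2x2 kernel name and the 3x3 filter shape suggest). A small arithmetic sketch with hypothetical dimensions:

#include <cstdint>
#include <iostream>

using index_t = int64_t;

int main() {
  // Hypothetical output spatial size of a 3x3 convolution.
  const index_t out_height = 57;
  const index_t out_width = 33;

  // Each F(2x2, 3x3) tile produces a 2x2 output patch, so the transform
  // operates on ceil(H/2) * ceil(W/2) tiles, as computed in the diff above.
  const index_t round_h = (out_height + 1) / 2;  // 29
  const index_t round_w = (out_width + 1) / 2;   // 17
  std::cout << round_h * round_w << " tiles" << std::endl;  // 493 tiles
}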
...@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> { ...@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0); const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr; const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->outputs_[0]; Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor); output_tensor->ResizeLike(input_tensor);
......
...@@ -214,9 +214,7 @@ void TestSimplePrelu() { ...@@ -214,9 +214,7 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
"Alpha", {2},
{2.0, 3.0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", BufferToImage<D, float>(net, "Input", "InputImage",
...@@ -250,7 +248,8 @@ void TestSimplePrelu() { ...@@ -250,7 +248,8 @@ void TestSimplePrelu() {
} }
auto expected = CreateTensor<float>( auto expected = CreateTensor<float>(
{2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0}); {2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
......
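The expected tensor in TestSimplePrelu above is consistent with the usual per-channel parametric ReLU (out = x for x > 0, out = alpha[c] * x otherwise, with NHWC layout and two channels as in the test); the sketch below reproduces two of those expected values.

#include <iostream>
#include <vector>

// Per-channel PReLU as the test values above imply:
// positive inputs pass through, negative inputs are scaled by alpha[c].
float PreluSketch(float x, float alpha_c) {
  return x > 0.f ? x : alpha_c * x;
}

int main() {
  const std::vector<float> alpha = {2.0f, 3.0f};  // per-channel slopes from the test
  // Channel 0: -7 -> -14, matching the first expected value.
  std::cout << PreluSketch(-7.f, alpha[0]) << std::endl;  // -14
  // Channel 1: -5 -> -15, matching the "-15" entry in the expected tensor.
  std::cout << PreluSketch(-5.f, alpha[1]) << std::endl;  // -15
}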
...@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> { ...@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
inputs[i] = this->Input(i); inputs[i] = this->Input(i);
MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size()); MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
MACE_CHECK(inputs[0]->size() == inputs[i]->size()) << "Input 0: " MACE_CHECK(inputs[0]->size() == inputs[i]->size())
<< MakeString(inputs[0]->shape()) << "Input 0: " << MakeString(inputs[0]->shape())
<< ", size: " << inputs[0]->size() << ", size: " << inputs[0]->size() << ". Input " << i << ": "
<< ". Input " << i << ": " << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
<< MakeString(inputs[i]->shape())
<< ", size: " << inputs[i]->size();
} }
functor_(inputs, output_tensor, future); functor_(inputs, output_tensor, future);
......
...@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
{n, h, w, c});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
......
...@@ -76,7 +76,7 @@ static void BatchNorm( ...@@ -76,7 +76,7 @@ static void BatchNorm(
static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \ BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
......
...@@ -12,15 +12,14 @@ ...@@ -12,15 +12,14 @@
namespace mace { namespace mace {
template<DeviceType D, typename T> template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> { class BatchToSpaceNDOp : public Operator<D, T> {
public: public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_( functor_(OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}), true) {}
true) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT); const Tensor *batch_tensor = this->Input(INPUT);
...@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(batch_tensor, space_tensor, output_shape.data()); CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor), future); functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
future);
return true; return true;
} }
...@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
Tensor *output, Tensor *output,
index_t *output_shape) { index_t *output_shape) {
auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}); auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}); auto block_shape =
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D"); MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D"); MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D"); MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D");
...@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
const index_t block_dims = block_shape.size(); const index_t block_dims = block_shape.size();
index_t block_shape_product = 1; index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) { for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1"); MACE_CHECK(block_shape[block_dim] > 1,
"block_shape's value should be great to 1");
const index_t block_shape_value = block_shape[block_dim]; const index_t block_shape_value = block_shape[block_dim];
const index_t cropped_input_size = input_tensor->dim(block_dim + 1) * block_shape_value const index_t cropped_input_size =
- crops[block_dim * 2] input_tensor->dim(block_dim + 1) * block_shape_value -
- crops[block_dim * 2 + 1]; crops[block_dim * 2] - crops[block_dim * 2 + 1];
MACE_CHECK(cropped_input_size >= 0, MACE_CHECK(cropped_input_size >= 0, "cropped size must be non-negative");
"cropped size must be non-negative");
block_shape_product *= block_shape_value; block_shape_product *= block_shape_value;
output_shape[block_dim + 1] = cropped_input_size; output_shape[block_dim + 1] = cropped_input_size;
} }
......
...@@ -41,7 +41,7 @@ static void BMBatchToSpace( ...@@ -41,7 +41,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \ BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \ BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \ } \
......
...@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \ BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
......
...@@ -11,16 +11,17 @@ ...@@ -11,16 +11,17 @@
namespace mace { namespace mace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> { class BufferToImageOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws) BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, ws) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>( kernels::BufferType type =
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER))); static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output, future); functor_(const_cast<Tensor *>(input_tensor), type, output, future);
......
This diff is collapsed.
...@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> { ...@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
input->shape()[1]); input->shape()[1]);
output->ResizeLike(input); output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(), functor_(input->data<T>(), input->shape().data(), output->mutable_data<T>(),
output->mutable_data<T>(), future); future);
return true; return true;
} }
......
...@@ -41,7 +41,7 @@ static void ChannelShuffle( ...@@ -41,7 +41,7 @@ static void ChannelShuffle(
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \ static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \ mace::testing::BytesProcessed(tot *(sizeof(float))); \
ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \ ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \
} \ } \
......
...@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> { ...@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
public: public:
ConcatOp(const OperatorDef &op_def, Workspace *ws) ConcatOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){} functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat"; MACE_CHECK(this->InputSize() >= 2)
<< "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs(); const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3); const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
const int32_t input_dims = input_list[0]->dim_size(); const int32_t input_dims = input_list[0]->dim_size();
......
...@@ -37,11 +37,10 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) { ...@@ -37,11 +37,10 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
} }
} }
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \ #define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1( \ static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \ ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \ } \
BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1) BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
BM_CONCAT_CPU_MACRO(0, 1000); BM_CONCAT_CPU_MACRO(0, 1000);
...@@ -90,13 +89,11 @@ static void OpenclConcatHelper(int iters, ...@@ -90,13 +89,11 @@ static void OpenclConcatHelper(int iters,
} }
} }
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE( \ std::vector<index_t> shape = {N, H, W, C}; \
int iters) { \ OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
std::vector<index_t> shape = {N, H, W, C}; \ } \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float); BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
......
...@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) { ...@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
concat_axis_size += input_shapes[i][axis]; concat_axis_size += input_shapes[i][axis];
GenerateRandomRealTypeData(input_shapes[i], inputs[i]); GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::CPU, float>( net.AddInputFromArray<DeviceType::CPU, float>(MakeString("Input", i),
MakeString("Input", i), input_shapes[i], inputs[i]); input_shapes[i], inputs[i]);
} }
// Run // Run
...@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) { ...@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
} }
TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) { TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 32}, OpenclRandomTest<float>(
{3, 32, 32, 32}, {3, 32, 32, 32}}, 3); {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
} }
\ No newline at end of file
This diff is collapsed.
...@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> { ...@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
functor_(static_cast<kernels::EltwiseType>( functor_(static_cast<kernels::EltwiseType>(
OperatorBase::GetSingleArgument<int>( OperatorBase::GetSingleArgument<int>(
"type", static_cast<int>(kernels::EltwiseType::SUM))), "type", static_cast<int>(kernels::EltwiseType::SUM))),
OperatorBase::GetRepeatedArgument<float>("coeff")){} OperatorBase::GetRepeatedArgument<float>("coeff")) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0); const Tensor *input0 = this->Input(0);
const Tensor *input1 = this->Input(1); const Tensor *input1 = this->Input(1);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape"; MACE_CHECK(input0->dim_size() == input1->dim_size())
for(int i = 0; i < input0->dim_size(); ++i) { << "Inputs of Eltwise op must be same shape";
MACE_CHECK(input0->dim(i) == input1->dim(i)) << "Inputs of Eltwise op must be same shape"; for (int i = 0; i < input0->dim_size(); ++i) {
MACE_CHECK(input0->dim(i) == input1->dim(i))
<< "Inputs of Eltwise op must be same shape";
} }
output->ResizeLike(input0); output->ResizeLike(input0);
......
...@@ -61,7 +61,7 @@ static void EltwiseBenchmark( ...@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \ EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \ iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
The diffs for the remaining changed files are collapsed in this view.