提交 6b9aafd4 编写于 作者: 叶剑武

Merge branch 'cpplint' into 'master'

Reformatting code and enable cpplint

See merge request !273
stages:
- ops_test
- ops_benchmark
- cpplint
# Lint job: downloads Google's cpplint and runs it over every .h/.cc file
# under mace/ (except half.h). Runs only for the master branch.
cpplint:
  stage: cpplint
  only:
    - master
  script:
    # --fail makes the job stop here on an HTTP error instead of saving the
    # error page as cpplint.py and failing later with a confusing message.
    - curl -fsSL -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
    # The -name patterns are quoted so the shell passes them to find verbatim;
    # unquoted, *.h / *.cc are expanded by the shell whenever a matching file
    # exists in the working directory, which breaks the find expression.
    - python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name "*.h" -or -name "*.cc" | grep -vE "half.h")
ops_test:
stage: ops_test
......
......@@ -9,8 +9,8 @@
#include <malloc.h>
#include "mace/core/registry.h"
#include "mace/public/mace.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
free(data);
};
void *Map(void *buffer, size_t offset, size_t nbytes) const override {
return (char*)buffer + offset;
return (char *)buffer + offset;
}
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
......
......@@ -83,12 +83,12 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname, \
enforce_lossless_conversion) \
template <> \
std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
const string &name, const std::vector<T> &default_value) const { \
if (arg_map_.count(name) == 0) { \
return default_value; \
} \
std::vector<T> values; \
std::vector<T> values; \
for (const auto &v : arg_map_.at(name).fieldname()) { \
if (enforce_lossless_conversion) { \
auto supportsConversion = \
......
......@@ -5,9 +5,9 @@
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector>
#include "mace/core/allocator.h"
#include "mace/core/types.h"
namespace mace {
......@@ -39,23 +39,19 @@ class BufferBase {
virtual bool OnHost() const = 0;
virtual index_t offset() const {
return 0;
};
virtual index_t offset() const { return 0; };
template<typename T>
template <typename T>
const T *data() const {
return reinterpret_cast<const T *>(raw_data());
}
template<typename T>
template <typename T>
T *mutable_data() {
return reinterpret_cast<T *>(raw_mutable_data());
}
index_t size() const {
return size_;
}
index_t size() const { return size_; }
protected:
index_t size_;
......@@ -64,26 +60,26 @@ class BufferBase {
class Buffer : public BufferBase {
public:
Buffer(Allocator *allocator)
: BufferBase(0),
allocator_(allocator),
buf_(nullptr),
mapped_buf_(nullptr),
is_data_owner_(true) {}
: BufferBase(0),
allocator_(allocator),
buf_(nullptr),
mapped_buf_(nullptr),
is_data_owner_(true) {}
Buffer(Allocator *allocator, index_t size)
: BufferBase(size),
allocator_(allocator),
mapped_buf_(nullptr),
is_data_owner_(true) {
: BufferBase(size),
allocator_(allocator),
mapped_buf_(nullptr),
is_data_owner_(true) {
buf_ = allocator->New(size);
}
Buffer(Allocator *allocator, void *data, index_t size)
: BufferBase(size),
allocator_(allocator),
buf_(data),
mapped_buf_(nullptr),
is_data_owner_(false) {}
: BufferBase(size),
allocator_(allocator),
buf_(data),
mapped_buf_(nullptr),
is_data_owner_(false) {}
virtual ~Buffer() {
if (mapped_buf_ != nullptr) {
......@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
void Copy(void *src, index_t offset, index_t length) {
MACE_CHECK_NOTNULL(mapped_buf_);
MACE_CHECK(length <= size_, "out of buffer");
memcpy(mapped_buf_, (char *) src + offset, length);
memcpy(mapped_buf_, (char *)src + offset, length);
}
bool OnHost() const {
return allocator_->OnHost();
}
bool OnHost() const { return allocator_->OnHost(); }
private:
Allocator *allocator_;
......@@ -168,23 +162,24 @@ class Buffer : public BufferBase {
void *mapped_buf_;
bool is_data_owner_;
DISABLE_COPY_AND_ASSIGN(Buffer);
DISABLE_COPY_AND_ASSIGN(Buffer);
};
class Image : public BufferBase {
public:
Image()
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(std::accumulate(shape.begin(), shape.end(),
1, std::multiplies<index_t>())
* GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
: BufferBase(
std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
buf_ = allocator_->NewImage(shape, data_type);
......@@ -214,9 +209,7 @@ class Image : public BufferBase {
return mapped_buf_;
}
std::vector<size_t> image_shape() const {
return shape_;
}
std::vector<size_t> image_shape() const { return shape_; }
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
......@@ -241,17 +234,11 @@ class Image : public BufferBase {
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
MACE_NOT_IMPLEMENTED;
}
void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
bool OnHost() const {
return allocator_->OnHost();
}
bool OnHost() const { return allocator_->OnHost(); }
private:
Allocator *allocator_;
......@@ -260,34 +247,25 @@ class Image : public BufferBase {
void *buf_;
void *mapped_buf_;
DISABLE_COPY_AND_ASSIGN(Image);
DISABLE_COPY_AND_ASSIGN(Image);
};
class BufferSlice : public BufferBase {
public:
BufferSlice()
: buffer_(nullptr),
mapped_buf_(nullptr),
offset_(0),
length_(0) {}
: buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
BufferSlice(BufferBase *buffer, index_t offset, index_t length)
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
MACE_CHECK(offset + length <= size_,
"buffer slice offset + length (",
offset,
" + ",
length,
") should <= ",
size_);
MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
offset, " + ", length, ") should <= ", size_);
}
BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
other.offset_,
other.length_) {}
BufferSlice(const BufferSlice &other)
: BufferSlice(other.buffer_, other.offset_, other.length_) {}
~BufferSlice() {
if (buffer_ != nullptr && mapped_buf_ != nullptr) {
......@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
const void *raw_data() const {
if (OnHost()) {
MACE_CHECK_NOTNULL(buffer_);
return (char *) buffer_->raw_data() + offset_;
return (char *)buffer_->raw_data() + offset_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
......@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_NOT_IMPLEMENTED;
}
void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_);
......@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
MACE_NOT_IMPLEMENTED;
}
void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
index_t offset() const {
return offset_;
}
index_t offset() const { return offset_; }
bool OnHost() const {
return buffer_->OnHost();
}
bool OnHost() const { return buffer_->OnHost(); }
private:
BufferBase *buffer_;
......@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
index_t offset_;
index_t length_;
};
}
#endif // MACE_CORE_BUFFER_H_
#endif // MACE_CORE_BUFFER_H_
此差异已折叠。
......@@ -3,9 +3,9 @@
//
#include "mace/core/net.h"
#include "mace/utils/utils.h"
#include "mace/utils/timer.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
namespace mace {
......@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
Workspace *ws,
DeviceType type,
const NetMode mode)
: NetBase(op_registry, net_def, ws, type),
device_type_(type) {
: NetBase(op_registry, net_def, ws, type), device_type_(type) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
......@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_LATENCY_LOGGER(1, "Running net");
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
"(", op->debug_def().type(), ")");
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), ")");
bool future_wait = (device_type_ == DeviceType::OPENCL &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
......@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
Workspace *ws,
DeviceType type,
const NetMode mode) {
std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
std::unique_ptr<NetBase> net(
new SerialNet(op_registry, net_def, ws, type, mode));
return net;
}
......
......@@ -7,10 +7,10 @@
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -147,7 +147,7 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
class OperatorRegistry {
public:
typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
RegistryType;
RegistryType;
OperatorRegistry();
~OperatorRegistry() = default;
RegistryType *registry() { return &registry_; };
......
......@@ -36,6 +36,6 @@ class PreallocatedPooledAllocator {
std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
};
} // namespace mace
} // namespace mace
#endif // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#endif // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
......@@ -2,19 +2,19 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <vector>
#include <thread>
#include <sys/time.h>
#include <thread>
#include <vector>
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
namespace {
inline int64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
inline int64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
}
namespace mace {
......@@ -63,9 +63,9 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// const node
std::thread const_thread([&]() {
std::vector<hexagon_nn_const_node> const_node_list;
for (const ConstTensor &const_tensor: net_def.tensors()) {
for (const ConstTensor &const_tensor : net_def.tensors()) {
std::vector<int> tensor_shape(const_tensor.dims().begin(),
const_tensor.dims().end());
const_tensor.dims().end());
while (tensor_shape.size() < 4) {
tensor_shape.insert(tensor_shape.begin(), 1);
}
......@@ -77,32 +77,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
const_node.tensor.width = tensor_shape[2];
const_node.tensor.depth = tensor_shape[3];
if (const_tensor.data_type() == DataType::DT_INT32
&& const_tensor.data_size() == 0) {
if (const_tensor.data_type() == DataType::DT_INT32 &&
const_tensor.data_size() == 0) {
const_node.tensor.data = NULL;
const_node.tensor.dataLen = 0;
} else {
const_node.tensor.data =
const_cast<unsigned char *>(const_tensor.data());
const_node.tensor.dataLen =
const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type());
const_cast<unsigned char *>(const_tensor.data());
const_node.tensor.dataLen = const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type());
}
const_node_list.push_back(const_node);
// 255 is magic number: why fastrpc limits sequence length to that?
if (const_node_list.size() >= 250) {
MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_,
const_node_list.data(),
const_node_list.size())
== 0, "append const node error");
MACE_CHECK(
hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
const_node_list.size()) == 0,
"append const node error");
const_node_list.clear();
}
}
if (!const_node_list.empty()) {
MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_,
const_node_list.data(),
const_node_list.size()) == 0,
"append const node error");
MACE_CHECK(
hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
const_node_list.size()) == 0,
"append const node error");
}
const_node_list.clear();
});
......@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
std::vector<hexagon_nn_input> inputs;
std::vector<hexagon_nn_output> outputs;
for (const OperatorDef &op: net_def.op()) {
for (const OperatorDef &op : net_def.op()) {
int op_id = op_map.GetOpId(op.type());
inputs.resize(op.node_input().size());
for (size_t i = 0; i < op.node_input().size(); ++i) {
......@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
cached_inputs.push_back(inputs);
cached_outputs.push_back(outputs);
hexagon_nn_padding_type
padding_type = static_cast<hexagon_nn_padding_type>(
op.padding());
hexagon_nn_padding_type padding_type =
static_cast<hexagon_nn_padding_type>(op.padding());
hexagon_nn_op_node op_node;
op_node.node_id = node_id(op.node_id());
......@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
op_node_list.push_back(op_node);
if (op_node_list.size() >= 125) {
MACE_CHECK(hexagon_nn_append_node_list(nn_id_,
op_node_list.data(),
MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
op_node_list.size()) == 0,
"append node error");
op_node_list.clear();
......@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
}
if (!op_node_list.empty()) {
MACE_CHECK(hexagon_nn_append_node_list(nn_id_,
op_node_list.data(),
MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
op_node_list.size()) == 0,
"append node error");
}
......@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// input info
num_inputs_ = 0;
for (const InputInfo &input_info: net_def.input_info()) {
for (const InputInfo &input_info : net_def.input_info()) {
std::vector<index_t> input_shape;
input_shape.insert(input_shape.begin(),
input_info.dims().begin(), input_info.dims().end());
input_shape.insert(input_shape.begin(), input_info.dims().begin(),
input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
......@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// output info
num_outputs_ = 0;
for (const OutputInfo &output_info: net_def.output_info()) {
for (const OutputInfo &output_info : net_def.output_info()) {
std::vector<index_t> output_shape;
output_shape.insert(output_shape.begin(),
output_info.dims().begin(), output_info.dims().end());
output_shape.insert(output_shape.begin(), output_info.dims().begin(),
output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
......@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
return hexagon_nn_teardown(nn_id_) == 0;
}
#define PRINT_BUFSIZE (2*1024*1024)
#define PRINT_BUFSIZE (2 * 1024 * 1024)
// Fetches the log text for graph nn_id_ via hexagon_nn_getlog() and writes it
// to the INFO log. Aborts through MACE_CHECK if the call reports a non-zero
// status.
void HexagonControlWrapper::PrintLog() {
  // std::vector value-initialises its elements, so the buffer starts
  // zero-filled and is released automatically on every exit path. (The old
  // `new char[] == NULL` test could never fire: plain new throws on failure.)
  std::vector<char> buf(PRINT_BUFSIZE);
  MACE_CHECK(hexagon_nn_getlog(nn_id_,
                               reinterpret_cast<unsigned char *>(buf.data()),
                               PRINT_BUFSIZE) == 0,
             "print log error");
  // Guarantee termination even if the callee filled the whole buffer, so the
  // std::string construction below cannot read past the end.
  buf.back() = '\0';
  LOG(INFO) << std::string(buf.data());
}
// Fetches the textual dump of graph nn_id_ via hexagon_nn_snpprint() and
// writes it to the INFO log. Aborts through MACE_CHECK if the call reports a
// non-zero status.
void HexagonControlWrapper::PrintGraph() {
  LOG(INFO) << "Print Graph";
  // std::vector value-initialises its elements, so the buffer starts
  // zero-filled and is released automatically on every exit path. (The old
  // `new char[] == NULL` test could never fire: plain new throws on failure.)
  std::vector<char> buf(PRINT_BUFSIZE);
  MACE_CHECK(hexagon_nn_snpprint(nn_id_,
                                 reinterpret_cast<unsigned char *>(buf.data()),
                                 PRINT_BUFSIZE) == 0,
             "print graph error");
  // Guarantee termination even if the callee filled the whole buffer, so the
  // std::string construction below cannot read past the end.
  buf.back() = '\0';
  LOG(INFO) << std::string(buf.data());
}
void HexagonControlWrapper::SetDebugLevel(int level) {
......@@ -256,9 +253,9 @@ void HexagonControlWrapper::GetPerfInfo() {
LOG(INFO) << "Get perf info";
std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
unsigned int n_items = 0;
MACE_CHECK(
hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE, &n_items) == 0,
"get perf info error");
MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
&n_items) == 0,
"get perf info error");
std::unordered_map<uint32_t, float> node_id_counters;
std::unordered_map<std::string, std::pair<int, float>> node_type_counters;
......@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
unsigned int node_id = perf_info[i].node_id;
unsigned int node_type_id = perf_info[i].node_type;
node_id_counters[node_id] =
((static_cast<uint64_t>(perf_info[i].counter_hi) << 32)
+ perf_info[i].counter_lo) * 1.0f / perf_info[i].executions;
((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) +
perf_info[i].counter_lo) *
1.0f / perf_info[i].executions;
char node_type_buf[MAX_NODE];
hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
......@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
total_duration += node_id_counters[node_id];
}
for (auto &node_type_counter: node_type_counters) {
for (auto &node_type_counter : node_type_counters) {
LOG(INFO) << "node type: " << node_type_counter.first
<< ", time: " << node_type_counter.second.first
<< ", duration: " << node_type_counter.second.second;
......@@ -312,33 +310,25 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
output_tensor->Resize(output_shapes_[0]);
std::vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
int res = hexagon_nn_execute(
nn_id_, input_tensor.shape()[0], input_tensor.shape()[1],
input_tensor.shape()[2], input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
input_tensor.raw_size(), &output_shape[0], &output_shape[1],
&output_shape[2], &output_shape[3],
reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
output_tensor->raw_size(), &output_bytes);
MACE_CHECK(res == 0, "execute error");
MACE_ASSERT(output_shape == output_shapes_[0],
"wrong output shape inferred");
MACE_ASSERT(output_shape == output_shapes_[0], "wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors) {
bool HexagonControlWrapper::ExecuteGraphNew(
const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors) {
LOG(INFO) << "Execute graph new: " << nn_id_;
int num_inputs = input_tensors.size();
int num_outputs = output_tensors->size();
......@@ -355,7 +345,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
......@@ -365,16 +355,16 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
(*output_tensors)[i].SetDtype(output_data_types_[i]);
(*output_tensors)[i].Resize(output_shapes_[i]);
outputs[i].data = reinterpret_cast<unsigned char *>(
(*output_tensors)[i].raw_mutable_data());
(*output_tensors)[i].raw_mutable_data());
outputs[i].dataLen = (*output_tensors)[i].raw_size();
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
int res =
hexagon_nn_execute_new(nn_id_, inputs, num_inputs, outputs, num_outputs);
for (int i = 0; i < num_outputs; ++i) {
std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
outputs[i].width, outputs[i].depth};
outputs[i].width, outputs[i].depth};
MACE_ASSERT(output_shape == output_shapes_[i],
"wrong output shape inferred");
MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
......@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor,
&input_tensors[0],
min_in_data,
quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data,
max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false;
......@@ -409,11 +397,9 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0],
*min_out_data,
*max_out_data,
quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data,
output_tensor);
return true;
}
} // namespace mace
} // namespace mace
......@@ -16,16 +16,17 @@ namespace mace {
class HexagonControlWrapper {
public:
HexagonControlWrapper() {};
HexagonControlWrapper(){};
int GetVersion();
bool Config();
bool Init();
bool Finalize();
bool SetupGraph(const NetDef& net_def);
bool SetupGraph(const NetDef &net_def);
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
bool ExecuteGraphNew(const std::vector<Tensor>& input_tensors,
bool ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
std::vector<Tensor> *output_tensors);
bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
bool ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor);
bool TeardownGraph();
void PrintLog();
......@@ -38,9 +39,7 @@ class HexagonControlWrapper {
private:
static constexpr int NODE_ID_OFFSET = 10000;
inline uint32_t node_id(uint32_t nodeid) {
return NODE_ID_OFFSET + nodeid;
}
inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
int nn_id_;
Quantizer quantizer_;
......@@ -52,9 +51,8 @@ class HexagonControlWrapper {
uint32_t num_inputs_;
uint32_t num_outputs_;
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
};
}
#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
......@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
return 0;
}
int hexagon_controller_DeInitHexagon() {
int hexagon_controller_DeInitHexagon() { return 0; }
// No-op implementation of hexagon_nn_config: does nothing and returns 0.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_init: does nothing and returns 0.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_set_debug_level: `id` and `level` are
// ignored and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_set_graph_mode: `id` and `mode` are
// ignored and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_snpprint: returns 0 without touching its
// arguments. Note that `buf` is never written, so a caller that reads it
// afterwards sees whatever the buffer held before the call.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_getlog: returns 0 without touching its
// arguments. Note that `buf` is never written, so a caller that reads it
// afterwards sees whatever the buffer held before the call.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_append_node: every argument (graph id,
// node id, operation, padding, input/output arrays and their lengths) is
// ignored and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_append_node_list: `id`, `ops` and
// `opsLen` are ignored and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_append_const_node: the graph id, node
// id, four dimension arguments and the data pointer/length are all ignored
// and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_append_const_node_list: `id`, `consts`
// and `constsLen` are ignored and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_prepare: `id` is ignored and 0 is
// returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_execute: returns 0 without reading the
// input arguments. None of the output pointers (`batches_out`, `height_out`,
// `width_out`, `depth_out`, `data_out`, `data_len_out`) is written, so their
// targets keep whatever values they had before the call.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_teardown: `id` is ignored and 0 is
// returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_set_powersave_level: `level` is ignored
// and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_get_perfinfo: returns 0 without writing
// to `info_out` or `*n_items`; both keep their pre-call contents.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_reset_perfinfo: `id` and `event` are
// ignored and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_last_execution_cycles: returns 0 without
// writing to `*cycles_lo` or `*cycles_hi`.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_version: returns 0 without writing to
// `*ver`.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_op_name_to_id: `name` is ignored,
// `*node_id` is not written, and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_op_id_to_name: `node_id` is ignored, the
// `name` buffer is not written, and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_disable_dcvs: does nothing and returns 0.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_GetHexagonBinaryVersion: returns 0
// without writing to `*ver`.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_PrintLog: `buf` and `bufLen` are ignored
// (nothing is printed) and 0 is returned.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
// No-op implementation of hexagon_nn_execute_new: returns 0 without reading
// `inputs` or writing to any element of `outputs`.
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE {
return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
......@@ -2,27 +2,27 @@
#define _HEXAGON_NN_H
// Fallback definitions for the __QAIC_* decoration macros. Each macro is
// defined only if the including code has not already provided it:
// __QAIC_HEADER(ff) and __QAIC_IMPL(ff) expand to their argument unchanged,
// while the *_EXPORT and *_ATTRIBUTE macros expand to nothing.
#ifndef __QAIC_HEADER
#define __QAIC_HEADER(ff) ff
#endif //__QAIC_HEADER
#endif //__QAIC_HEADER
#ifndef __QAIC_HEADER_EXPORT
#define __QAIC_HEADER_EXPORT
#endif // __QAIC_HEADER_EXPORT
#endif // __QAIC_HEADER_EXPORT
#ifndef __QAIC_HEADER_ATTRIBUTE
#define __QAIC_HEADER_ATTRIBUTE
#endif // __QAIC_HEADER_ATTRIBUTE
#endif // __QAIC_HEADER_ATTRIBUTE
#ifndef __QAIC_IMPL
#define __QAIC_IMPL(ff) ff
#endif //__QAIC_IMPL
#endif //__QAIC_IMPL
#ifndef __QAIC_IMPL_EXPORT
#define __QAIC_IMPL_EXPORT
#endif // __QAIC_IMPL_EXPORT
#endif // __QAIC_IMPL_EXPORT
#ifndef __QAIC_IMPL_ATTRIBUTE
#define __QAIC_IMPL_ATTRIBUTE
#endif // __QAIC_IMPL_ATTRIBUTE
#endif // __QAIC_IMPL_ATTRIBUTE
#ifdef __cplusplus
extern "C" {
#endif
......@@ -30,92 +30,160 @@ extern "C" {
#define __QAIC_STRING1_OBJECT_DEFINED__
#define __STRING1_OBJECT__
// Pointer-plus-length string record: a char pointer and an int length.
// NOTE(review): whether dataLen counts a terminating NUL is not visible
// here — confirm against the QAIC-generated code that uses it.
typedef struct _cstring1_s {
char* data;
int dataLen;
char *data;
int dataLen;
} _cstring1_t;
#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */
// One entry of a node's input list (see hexagon_nn_op_node::inputs).
// Presumably src_id is the id of the producing node and output_idx selects
// which of its outputs is consumed — TODO confirm against hexagon_nn docs.
typedef struct hexagon_nn_input hexagon_nn_input;
struct hexagon_nn_input {
unsigned int src_id;
unsigned int output_idx;
unsigned int src_id;
unsigned int output_idx;
};
// One entry of a node's output list (see hexagon_nn_op_node::outputs).
// Presumably max_size is an upper bound on the output's size — units are not
// shown here, TODO confirm. `unused` carries no meaning in this header.
typedef struct hexagon_nn_output hexagon_nn_output;
struct hexagon_nn_output {
unsigned int max_size;
unsigned int unused;
unsigned int max_size;
unsigned int unused;
};
// Per-node performance record; an array of these is the `info_out` argument
// of hexagon_nn_get_perfinfo. Holds the node id and type, an execution
// count, and a counter split across counter_lo / counter_hi (presumably the
// low and high halves of one 64-bit value — TODO confirm).
typedef struct hexagon_nn_perfinfo hexagon_nn_perfinfo;
struct hexagon_nn_perfinfo {
unsigned int node_id;
unsigned int node_type;
unsigned int executions;
unsigned int unused;
unsigned int counter_lo;
unsigned int counter_hi;
unsigned int node_id;
unsigned int node_type;
unsigned int executions;
unsigned int unused;
unsigned int counter_lo;
unsigned int counter_hi;
};
// Integer handle passed as the `id` argument of the hexagon_nn_* calls.
typedef int hexagon_nn_nn_id;
// Padding mode carried by each op node (hexagon_nn_op_node::padding).
// The enumerators take consecutive values starting at 0. The trailing
// _32BIT_PLACEHOLDER_ enumerator is fixed at 0x7fffffff; going by its name it
// is there to force a 32-bit enum representation — TODO confirm.
enum hexagon_nn_padding_type {
NN_PAD_NA,
NN_PAD_SAME,
NN_PAD_VALID,
NN_PAD_MIRROR_REFLECT,
NN_PAD_MIRROR_SYMMETRIC,
NN_PAD_SAME_CAFFE,
_32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
NN_PAD_NA,
NN_PAD_SAME,
NN_PAD_VALID,
NN_PAD_MIRROR_REFLECT,
NN_PAD_MIRROR_SYMMETRIC,
NN_PAD_SAME_CAFFE,
_32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
};
typedef enum hexagon_nn_padding_type hexagon_nn_padding_type;
// Tensor descriptor used by hexagon_nn_execute_new and embedded in
// hexagon_nn_const_node: four extents (batches, height, width, depth), a
// data pointer with its int length, and data_valid_len.
// NOTE(review): dataLen looks like the buffer capacity and data_valid_len
// the number of meaningful bytes — confirm against hexagon_nn docs.
typedef struct hexagon_nn_tensordef hexagon_nn_tensordef;
struct hexagon_nn_tensordef {
unsigned int batches;
unsigned int height;
unsigned int width;
unsigned int depth;
unsigned char* data;
int dataLen;
unsigned int data_valid_len;
unsigned int unused;
unsigned int batches;
unsigned int height;
unsigned int width;
unsigned int depth;
unsigned char *data;
int dataLen;
unsigned int data_valid_len;
unsigned int unused;
};
// Description of one operation node, as passed in bulk to
// hexagon_nn_append_node_list: node id, operation code, padding mode, and
// pointer/length pairs for its input and output lists.
typedef struct hexagon_nn_op_node hexagon_nn_op_node;
struct hexagon_nn_op_node {
unsigned int node_id;
unsigned int operation;
hexagon_nn_padding_type padding;
hexagon_nn_input* inputs;
int inputsLen;
hexagon_nn_output* outputs;
int outputsLen;
unsigned int node_id;
unsigned int operation;
hexagon_nn_padding_type padding;
hexagon_nn_input *inputs;
int inputsLen;
hexagon_nn_output *outputs;
int outputsLen;
};
// Description of one constant node, as passed in bulk to
// hexagon_nn_append_const_node_list: a node id plus the tensor it holds
// (stored by value).
typedef struct hexagon_nn_const_node hexagon_nn_const_node;
struct hexagon_nn_const_node {
unsigned int node_id;
hexagon_nn_tensordef tensor;
unsigned int node_id;
hexagon_nn_tensordef tensor;
};
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
unsigned char *buf,
int bufLen)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int operation,
hexagon_nn_padding_type padding,
const hexagon_nn_input *inputs,
int inputsLen,
const hexagon_nn_output *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_op_node *ops,
int opsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
hexagon_nn_nn_id id,
unsigned int node_id,
unsigned int batches,
unsigned int height,
unsigned int width,
unsigned int depth,
const unsigned char *data,
int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
hexagon_nn_nn_id id,
const hexagon_nn_const_node *consts,
int constsLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
hexagon_nn_nn_id id,
unsigned int batches_in,
unsigned int height_in,
unsigned int width_in,
unsigned int depth_in,
const unsigned char *data_in,
int data_inLen,
unsigned int *batches_out,
unsigned int *height_out,
unsigned int *width_out,
unsigned int *depth_out,
unsigned char *data_out,
int data_outLen,
unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
hexagon_nn_nn_id id,
hexagon_nn_perfinfo *info_out,
int info_outLen,
unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
hexagon_nn_nn_id id,
unsigned int *cycles_lo,
unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
__QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
int *ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
hexagon_nn_nn_id id,
const hexagon_nn_tensordef *inputs,
int inputsLen,
hexagon_nn_tensordef *outputs,
int outputsLen) __QAIC_HEADER_ATTRIBUTE;
#ifdef __cplusplus
}
#endif
#endif //_HEXAGON_NN_H
#endif //_HEXAGON_NN_H
......@@ -5,8 +5,8 @@
#ifndef LIBMACE_HEXAGON_NN_OPS_H
#define LIBMACE_HEXAGON_NN_OPS_H
#include "mace/utils/logging.h"
#include <unordered_map>
#include "mace/utils/logging.h"
namespace mace {
......@@ -24,8 +24,7 @@ typedef enum op_type_enum {
class OpMap {
public:
void Init() {
#define DEF_OP(NAME) \
op_map_[#NAME] = OP_##NAME;
#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
#include "mace/core/runtime/hexagon/ops.h"
......@@ -40,9 +39,10 @@ class OpMap {
return OP_INVALID;
}
}
private:
std::unordered_map<std::string, int> op_map_;
};
} // namespace mace
} // namespace mace
#endif // LIBMACE_HEXAGON_NN_OPS_H
#endif // LIBMACE_HEXAGON_NN_OPS_H
......@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
#undef __SELF_DEF_OP_WREF
#undef DEF_OP_WREF
#endif
......@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
float *max_out) {
float stepsize;
float recip_stepsize;
QuantizeAdjustRange(min_in, max_in,
min_out, max_out,
&stepsize, &recip_stepsize);
QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
&recip_stepsize);
const float *in = in_tensor.data<float>();
uint8_t *out = out_tensor->mutable_data<uint8_t>();
// Maps each float of in_tensor to an 8-bit value:
// (value - *min_out) * recip_stepsize, rounded by adding 0.5.
// NOTE(review): the scaled value is passed through static_cast<uint8_t>
// before it is stored in the float `ival`, so `ival` is always in [0, 255]
// and the two range checks below can never fire. A scaled value outside
// that range reaches the cast unclamped, and converting an out-of-range
// float to an integer type is undefined behaviour. The clamp was presumably
// meant to run on the un-cast float — confirm and fix.
for (int i = 0; i < in_tensor.size(); i++) {
const float inval = in[i];
float ival = static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
float ival =
static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
if (ival < 0) ival = 0;
if (ival > 255) ival = 255;
out[i] = static_cast<uint8_t>(ival);
......@@ -93,4 +93,4 @@ void Quantizer::DeQuantize(const Tensor &in_tensor,
}
}
} // namespace mace
\ No newline at end of file
} // namespace mace
\ No newline at end of file
......@@ -16,13 +16,17 @@ class Quantizer {
void Quantize(const Tensor &in_tensor,
Tensor *out_tensor,
float *min_out, float *max_out);
float *min_out,
float *max_out);
void Quantize(const Tensor &in_tensor,
const float min_in, const float max_in,
const float min_in,
const float max_in,
Tensor *out_tensor,
float *min_out, float *max_out);
float *min_out,
float *max_out);
void DeQuantize(const Tensor &in_tensor,
const float min_in, const float max_in,
const float min_in,
const float max_in,
Tensor *out_tensor);
private:
......@@ -33,9 +37,9 @@ class Quantizer {
float *stepsize,
float *recip_stepsize);
DISABLE_COPY_AND_ASSIGN(Quantizer);
DISABLE_COPY_AND_ASSIGN(Quantizer);
};
} // namespace mace
} // namespace mace
#endif // MACE_DSP_UTIL_QUANTIZE_H_
#endif // MACE_DSP_UTIL_QUANTIZE_H_
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
......@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
return 0;
}
}
}
OpenCLAllocator::OpenCLAllocator() {}
......@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
<< image_shape[1];
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
0, nullptr, &error);
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
image_shape[0], image_shape[1], 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1]
<< "]";
......@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
nbytes, nullptr, nullptr, &error);
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
offset, nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr;
}
......@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch->resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
mapped_image_pitch->data(),
mapped_image_pitch->data() + 1,
nullptr, nullptr, &error);
void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr;
......
......@@ -5,8 +5,8 @@
#include <vector>
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
......@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device,
cl::Program *program,
bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kEncryptedProgramMap;
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
*is_binary = false;
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source == kEncryptedProgramMap.end()) {
......
......@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl::Device &device,
cl::Program *program,
bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>> kCompiledProgramMap;
extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
*is_binary = true;
auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
if (it_binary == kCompiledProgramMap.end()) {
......
......@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0;
}
// Returns the running total kept in accumulated_micros_, i.e. the sum of the
// intervals added so far by AccumulateTiming().
double OpenCLProfilingTimer::AccumulatedMicros() {
return accumulated_micros_;
}
double OpenCLProfilingTimer::AccumulatedMicros() { return accumulated_micros_; }
// Stops the current measurement and adds its duration to the running total.
// The interval is (stop_nanos_ - start_nanos_) converted from nanoseconds to
// microseconds by dividing by 1000.0.
void OpenCLProfilingTimer::AccumulateTiming(){
void OpenCLProfilingTimer::AccumulateTiming() {
StopTiming();
accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
}
......@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
cl::CommandQueue command_queue(context, gpu_device, properties);
const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->kernel_path_ =
std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context);
......@@ -163,18 +162,14 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
MACE_CHECK_NOTNULL(program);
std::string binary_file_name_prefix =
GenerateCLBinaryFilenamePrefix(built_program_key);
GenerateCLBinaryFilenamePrefix(built_program_key);
std::vector<unsigned char> program_vec;
bool is_opencl_binary;
const bool found = GetSourceOrBinaryProgram(program_name,
binary_file_name_prefix,
context(),
device(),
program,
&is_opencl_binary);
const bool found =
GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
device(), program, &is_opencl_binary);
MACE_CHECK(found, "Program not found for ",
is_opencl_binary ? "binary: " : "source: ",
built_program_key);
is_opencl_binary ? "binary: " : "source: ", built_program_key);
// Build program
std::string build_options_str =
......@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
}
LOG(FATAL) << "Build program from "
<< (is_opencl_binary ? "binary: " : "source: ")
<< built_program_key
<< " failed: " << ret;
<< built_program_key << " failed: " << ret;
}
if (!is_opencl_binary) {
// Write binary if necessary
std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
std::string binary_filename =
kernel_path_ + binary_file_name_prefix + ".bin";
size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]);
......@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
if (built_program_it != built_program_map_.end()) {
program = built_program_it->second;
} else {
this->BuildProgram(program_name, built_program_key,
build_options_str, &program);
this->BuildProgram(program_name, built_program_key, build_options_str,
&program);
built_program_map_.emplace(built_program_key, program);
}
return cl::Kernel(program, kernel_name.c_str());
......@@ -250,9 +245,9 @@ cl::Kernel OpenCLRuntime::BuildKernel(
// Fills `stats` with the start and end time of the command behind `event`.
// Does nothing when `stats` is null. The OpenCL profiling counters
// CL_PROFILING_COMMAND_START / _END are reported in nanoseconds; the integer
// division by 1000 converts them to whole microseconds.
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) {
stats->start_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
stats->end_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
}
}
......
......@@ -19,7 +19,8 @@ namespace mace {
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {};
explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0){};
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
......@@ -48,6 +49,7 @@ class OpenCLRuntime {
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
private:
OpenCLRuntime();
~OpenCLRuntime();
......
......@@ -7,10 +7,10 @@
namespace mace {
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
......@@ -65,23 +65,20 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
class Tensor {
public:
// Creates a tensor that has no buffer yet (buffer_ is nullptr) and is marked
// as the owner of its buffer; `alloc` is stored in allocator_ and `type` in
// dtype_. The source-op name starts out empty.
Tensor(Allocator *alloc, DataType type)
: allocator_(alloc),
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true),
name_("") {};
: allocator_(alloc),
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true),
name_(""){};
// Wraps an existing buffer without taking ownership of it
// (is_buffer_owner_ is false).
// allocator_ is initialised to nullptr explicitly: the member has no default
// initializer, so omitting it from this list left it indeterminate.
Tensor(BufferBase *buffer, DataType dtype)
    : allocator_(nullptr),
      dtype_(dtype),
      buffer_(buffer),
      is_buffer_owner_(false),
      name_("") {}
Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false),
name_("") {
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false),
name_("") {
buffer_ = &buffer_slice_;
}
......@@ -102,8 +99,8 @@ class Tensor {
// Number of dimensions in the tensor's shape.
inline index_t dim_size() const { return shape_.size(); }
// Extent of dimension `index`. MACE_CHECK enforces index < shape_.size()
// before shape_ is indexed.
inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ",
index, " >= ", shape_.size());
MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
shape_.size());
return shape_[index];
}
......@@ -112,40 +109,35 @@ class Tensor {
std::multiplies<int64_t>());
}
// Returns size() multiplied by SizeOfType() — presumably the tensor's
// payload size in bytes (element count times bytes per element).
inline index_t raw_size() const {
return size() * SizeOfType();
}
inline index_t raw_size() const { return size() * SizeOfType(); }
// True when a buffer is attached, it does not live on the host, and its
// dynamic type is exactly Image. The typeid comparison means a class derived
// from Image would not match.
inline bool has_opencl_image() const {
return buffer_ != nullptr && !buffer_->OnHost()
&& typeid(*buffer_) == typeid(Image);
return buffer_ != nullptr && !buffer_->OnHost() &&
typeid(*buffer_) == typeid(Image);
}
// True when a buffer is attached, it does not live on the host, and it is
// not an Image (see has_opencl_image()).
inline bool has_opencl_buffer() const {
return buffer_ != nullptr && !buffer_->OnHost()
&& !has_opencl_image();
return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
}
// Returns the underlying handle (buffer_->buffer()) cast to cl::Image*.
// MACE_CHECK requires has_opencl_image() to hold first.
inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), "do not have image");
return static_cast<cl::Image*>(buffer_->buffer());
return static_cast<cl::Image *>(buffer_->buffer());
}
// Returns the underlying handle (buffer_->buffer()) cast to cl::Buffer*.
// MACE_CHECK requires has_opencl_buffer() to hold first.
inline cl::Buffer *opencl_buffer() const {
MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
return static_cast<cl::Buffer*>(buffer_->buffer());
return static_cast<cl::Buffer *>(buffer_->buffer());
}
// Forwards to buffer_->offset().
// NOTE(review): unlike raw_data(), there is no null check on buffer_ here.
inline index_t buffer_offset() const {
return buffer_->offset();
}
inline index_t buffer_offset() const { return buffer_->offset(); }
// Read-only untyped pointer to the tensor's data, obtained from
// buffer_->raw_data(). MACE_CHECK rejects a tensor with no buffer.
inline const void *raw_data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->raw_data();
}
template<typename T>
template <typename T>
inline const T *data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>();
......@@ -156,7 +148,7 @@ class Tensor {
return buffer_->raw_mutable_data();
}
template<typename T>
template <typename T>
inline T *mutable_data() {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data());
......@@ -188,25 +180,17 @@ class Tensor {
is_buffer_owner_ = true;
} else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
Image *image = dynamic_cast<Image*>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0]
&& image_shape[1] <= image->image_shape()[1],
"tensor (source op ",
name_,
"): current physical image shape: ",
image->image_shape()[0],
", ",
image->image_shape()[1],
" < logical image shape: ",
image_shape[0],
", ",
image_shape[1]);
Image *image = dynamic_cast<Image *>(buffer_);
MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
image_shape[1] <= image->image_shape()[1],
"tensor (source op ", name_,
"): current physical image shape: ", image->image_shape()[0],
", ", image->image_shape()[1], " < logical image shape: ",
image_shape[0], ", ", image_shape[1]);
}
}
// Reference overload: forwards to ResizeLike(const Tensor *) with the
// address of `other`.
inline void ResizeLike(const Tensor &other) {
ResizeLike(&other);
}
inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
inline void ResizeLike(const Tensor *other) {
if (other->has_opencl_image()) {
......@@ -229,7 +213,7 @@ class Tensor {
memcpy(buffer_->raw_mutable_data(), src, size);
}
template<typename T>
template <typename T>
inline void Copy(const T *src, index_t length) {
MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
......@@ -248,13 +232,9 @@ class Tensor {
return type_size;
}
// Returns the raw buffer_ pointer (may be null); ownership is not
// transferred to the caller.
inline BufferBase *UnderlyingBuffer() const {
return buffer_;
}
inline BufferBase *UnderlyingBuffer() const { return buffer_; }
// Records the name of the operator that produced this tensor; name_ is
// reported in diagnostics such as the ResizeImage shape check.
// The argument is taken by const reference: the previous `const std::string`
// by-value parameter cost one copy at the call and a second into name_.
inline void SetSourceOpName(const std::string &name) { name_ = name; }
inline void DebugPrint() const {
using namespace numerical_chars;
......@@ -272,8 +252,9 @@ class Tensor {
}
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
}
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", "
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str();
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
<< ", " << dim(3) << "], content:\n"
<< os.str();
}
class MappingGuard {
......@@ -301,20 +282,20 @@ class Tensor {
const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_;
DISABLE_COPY_AND_ASSIGN(MappingGuard);
DISABLE_COPY_AND_ASSIGN(MappingGuard);
};
private:
Allocator *allocator_;
DataType dtype_;
std::vector<index_t> shape_;
std::vector<size_t > image_shape_;
std::vector<size_t> image_shape_;
BufferBase *buffer_;
BufferSlice buffer_slice_;
bool is_buffer_owner_;
std::string name_;
DISABLE_COPY_AND_ASSIGN(Tensor);
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace tensor
......
......@@ -99,9 +99,7 @@ void RestartTiming() {
accum_time = 0;
start_time = NowMicros();
}
// Marks the beginning of a timed interval by storing the current
// NowMicros() value in start_time. StopTiming() later adds
// NowMicros() - start_time to accum_time.
void StartTiming() {
start_time = NowMicros();
}
void StartTiming() { start_time = NowMicros(); }
void StopTiming() {
if (start_time != 0) {
accum_time += (NowMicros() - start_time);
......
......@@ -6,9 +6,9 @@
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <string>
#include <utility>
#include <vector>
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
......
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <map>
#include <cstdint>
#include <map>
#include "mace/core/types.h"
#include "mace/utils/logging.h"
......@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
std::string DataTypeToString(const DataType dt) {
static std::map<DataType, std::string> dtype_string_map = {
{DT_FLOAT, "DT_FLOAT"},
{DT_HALF, "DT_HALF"},
{DT_DOUBLE, "DT_DOUBLE"},
{DT_UINT8, "DT_UINT8"},
{DT_INT8, "DT_INT8"},
{DT_INT32, "DT_INT32"},
{DT_UINT32, "DT_UINT32"},
{DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"},
{DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}
};
{DT_FLOAT, "DT_FLOAT"}, {DT_HALF, "DT_HALF"},
{DT_DOUBLE, "DT_DOUBLE"}, {DT_UINT8, "DT_UINT8"},
{DT_INT8, "DT_INT8"}, {DT_INT32, "DT_INT32"},
{DT_UINT32, "DT_UINT32"}, {DT_UINT16, "DT_UINT16"},
{DT_INT64, "DT_INT64"}, {DT_BOOL, "DT_BOOL"},
{DT_STRING, "DT_STRING"}};
MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
return dtype_string_map[dt];
}
......
......@@ -5,8 +5,8 @@
#include <string>
#include <vector>
#include "mace/core/workspace.h"
#include "mace/core/arg_helper.h"
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
namespace mace {
......@@ -19,7 +19,7 @@ Tensor *Workspace::CreateTensor(const std::string &name,
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
}
return GetTensor(name);
}
......@@ -35,7 +35,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
// Non-const lookup. Forwards to the const GetTensor overload and casts the
// constness off the result, so both overloads share one implementation.
Tensor *Workspace::GetTensor(const std::string &name) {
return const_cast<Tensor *>(
static_cast<const Workspace *>(this)->GetTensor(name));
static_cast<const Workspace *>(this)->GetTensor(name));
}
std::vector<std::string> Workspace::Tensors() const {
......@@ -51,28 +51,28 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr;
// Pass 1: find the lowest data() address among all constant tensors; it
// becomes the base pointer (model_data_ptr) of the model data region.
// NOTE(review): pointers are compared and subtracted after
// reinterpret_cast<long long>. std::uintptr_t / std::ptrdiff_t are the
// types intended for this, and cpplint's runtime/int rule flags
// `long long` — consider switching.
for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr
|| reinterpret_cast<long long>(const_tensor.data())
< reinterpret_cast<long long>(model_data_ptr)) {
if (model_data_ptr == nullptr ||
reinterpret_cast<long long>(const_tensor.data()) <
reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
}
}
// Pass 2: the region size is the largest end offset of any tensor, i.e.
// (tensor address - base) + data_size() * size of its element type.
for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size,
static_cast<index_t>(
(reinterpret_cast<long long>(const_tensor.data())
- reinterpret_cast<long long>(model_data_ptr))
+ const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())));
model_data_size = std::max(
model_data_size,
static_cast<index_t>((reinterpret_cast<long long>(const_tensor.data()) -
reinterpret_cast<long long>(model_data_ptr)) +
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())));
}
VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU) {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
} else {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size)));
new Buffer(GetDeviceAllocator(type), model_data_size)));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap();
......@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type()
<< ", shape: "
<< ", data type: " << const_tensor.data_type() << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end()));
std::vector<index_t> dims;
......@@ -90,14 +89,12 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
dims.push_back(d);
}
index_t
offset = (long long) const_tensor.data() - (long long) model_data_ptr;
index_t offset = (long long)const_tensor.data() - (long long)model_data_ptr;
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(),
offset,
const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
new Tensor(BufferSlice(tensor_buffer_.get(), offset,
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
......@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
// as GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op: net_def.op()) {
for (auto &op : net_def.op()) {
if (op.has_mem_id()) {
const DataType op_dtype = static_cast<DataType>(
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op,
"T",
static_cast<int>(DT_FLOAT)));
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
......@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) {
std::unique_ptr<BufferBase>
image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
for (auto &mem_block : net_def.mem_arena().mem_block()) {
std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
}
VLOG(3) << "Preallocate image to tensors";
for (auto &op: net_def.op()) {
for (auto &op : net_def.op()) {
if (op.has_mem_id()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
std::unique_ptr<Tensor> tensor(
new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
tensor->SetSourceOpName(op.name());
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << "; Mem: "
<< op.mem_id() << "; Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[1];
VLOG(3)
<< "Tensor: " << op.name() << "(" << op.type() << ")"
<< "; Mem: " << op.mem_id() << "; Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor_map_[op.output(0)] = std::move(tensor);
}
}
......
......@@ -5,9 +5,9 @@
#ifndef MACE_CORE_WORKSPACE_H_
#define MACE_CORE_WORKSPACE_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/preallocated_pooled_allocator.h"
namespace mace {
......@@ -43,7 +43,7 @@ class Workspace {
PreallocatedPooledAllocator preallocated_allocator_;
DISABLE_COPY_AND_ASSIGN(Workspace);
DISABLE_COPY_AND_ASSIGN(Workspace);
};
} // namespace mace
......
......@@ -6,9 +6,9 @@
#define MACE_KERNELS_ACTIVATION_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
......@@ -99,17 +99,15 @@ void PReLUActivation(const T *input_ptr,
output_ptr[i] = in;
}
}
}
template <DeviceType D, typename T>
class ActivationFunctor {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type),
relux_max_limit_(relux_max_limit){}
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
void operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
......@@ -118,9 +116,11 @@ class ActivationFunctor {
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const T *alpha_ptr = alpha->data<T>();
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, output_ptr);
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_);
DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
}
......@@ -131,14 +131,16 @@ class ActivationFunctor {
// Declaration of the explicit specialization of ActivationFunctor::operator()
// for DeviceType::NEON with float elements. Only the declaration appears
// here (it ends in ';'); the definition is not part of this excerpt.
// Parameters mirror the generic operator(): input tensor, alpha tensor,
// output tensor and a StatsFuture pointer.
template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future);
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type),
relux_max_limit_(relux_max_limit){}
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *alpha,
......
......@@ -18,7 +18,7 @@ namespace mace {
namespace kernels {
namespace {
constexpr int kCostPerGroup = 1024;
constexpr int kCostPerGroup = 1024;
} // namespace
template <DeviceType D, typename T>
......
......@@ -10,10 +10,10 @@
#endif
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
......@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
const float relux_max_limit)
: folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit){}
relux_max_limit_(relux_max_limit) {}
const bool folded_constant_;
const ActivationType activation_;
......@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(
folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
......@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(
folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
......
......@@ -6,9 +6,9 @@
#define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
......@@ -32,7 +32,6 @@ struct BiasAddFunctor {
const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
......@@ -44,7 +43,6 @@ struct BiasAddFunctor {
}
}
}
}
};
......
......@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
bool i2b_;
};
template<DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase{
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output,
......@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
}
};
template<typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
public:
ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape,
T *output, StatsFuture *future) {
void operator()(const T *input,
const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......
......@@ -6,23 +6,23 @@
#define MACE_KERNELS_CONCAT_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
// Common base of the ConcatFunctor specializations; it only stores the axis
// value handed to the constructor.
struct ConcatFunctorBase {
ConcatFunctorBase(const int32_t axis): axis_(axis){}
ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
// Index of the dimension along which inputs are joined: the OpenCL functor
// sums each input's dim(axis_) into output_shape[axis_].
int32_t axis_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
}
};
template<typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, StatsFuture *future);
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
};
} // namespace kernels
......
......@@ -116,9 +116,8 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
sum[sum_idx] += vaddvq_f32(tmp);
#else
for (int inci = 0; inci < inc_tile_size; ++inci) {
sum[sum_idx] +=
in[in_idx * inc_tile_size + inci] *
weights[weights_idx * inc_tile_size + inci];
sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
weights[weights_idx * inc_tile_size + inci];
}
#endif
}
......@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit){}
relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_type_;
......@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), filter->shape().data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
......
......@@ -145,7 +145,7 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1),
(dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
......@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
*/
output_shape[0] = input_shape[0];
if (round_type == FLOOR) {
output_shape[1] = static_cast<index_t>(std::floor(1.0 * (input_shape[1] + padding_size[0]
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1);
output_shape[2] = static_cast<index_t>(std::floor(1.0 * (input_shape[2] + padding_size[1]
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1);
output_shape[1] = static_cast<index_t>(
std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
(filter_shape[0] - 1) * (dilations[0] - 1)) /
strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} else {
output_shape[1] = static_cast<index_t>(std::ceil(1.0 * (input_shape[1] + padding_size[0]
- filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1);
output_shape[2] = static_cast<index_t>(std::ceil(1.0 * (input_shape[2] + padding_size[1]
- filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1);
output_shape[1] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
(filter_shape[0] - 1) * (dilations[0] - 1)) /
strides[0]) +
1);
output_shape[2] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
(filter_shape[1] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
}
output_shape[3] = filter_shape[2];
}
void CalPaddingSize(const index_t *input_shape, // NCHW
......
......@@ -15,7 +15,7 @@ enum Padding {
FULL = 2, // Pads with one less than the filter size on both sides
};
// Rounding mode used when an output extent is computed from a division by
// the stride: CalcOutputSize applies std::floor when given FLOOR and
// std::ceil otherwise.
enum RoundType{
enum RoundType {
FLOOR = 0,
CEIL = 1,
};
......
......@@ -10,9 +10,9 @@
#endif
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
......@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit){}
relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_type_;
......@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
auto input_shape = fake_filter_shape;
output->Resize(output_shape);
......
......@@ -5,13 +5,13 @@
#define MACE_KERNELS_ELTWISE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
enum EltwiseType{
enum EltwiseType {
PROD = 0,
SUM = 1,
MAX = 2,
......@@ -19,8 +19,7 @@ enum EltwiseType{
};
struct EltwiseFunctorBase {
EltwiseFunctorBase(const EltwiseType type,
const std::vector<float> &coeff)
EltwiseFunctorBase(const EltwiseType type, const std::vector<float> &coeff)
: type_(type), coeff_(coeff) {}
EltwiseType type_;
......@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff)
EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0,
......@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
switch (type_) {
case PROD:
#pragma omp parallel for
for(index_t i = 0; i < size; ++i) {
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = input0_ptr[i] * input1_ptr[i];
}
break;
......@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} else {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
output_ptr[i] =
coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
}
}
break;
case MAX:
#pragma omp parallel for
for(index_t i = 0; i < size; ++i) {
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]);
}
break;
case MIN:
#pragma omp parallel for
for(index_t i = 0; i < size; ++i) {
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]);
}
break;
......@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
}
};
template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T>: EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff)
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
: EltwiseFunctorBase(type, coeff) {}
void operator()(const Tensor *input0,
......
......@@ -6,8 +6,8 @@
#define MACE_KERNELS_FULLY_CONNECTED_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
namespace mace {
......@@ -16,25 +16,23 @@ namespace kernels {
// State shared by the FullyConnectedFunctor specializations. Both members
// are const, so they are fixed by the constructor arguments for the lifetime
// of the functor.
struct FullyConnectedBase {
FullyConnectedBase(const ActivationType activation,
const float relux_max_limit)
: activation_(activation),
relux_max_limit_(relux_max_limit){}
: activation_(activation), relux_max_limit_(relux_max_limit) {}
// Activation type supplied at construction.
const ActivationType activation_;
// Limit value supplied at construction.
// NOTE(review): presumably the clamp bound for a RELUX-style activation;
// its use is outside this excerpt - confirm.
const float relux_max_limit_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) :
FullyConnectedBase(activation, relux_max_limit) {}
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
output->Resize(output_shape);
const index_t N = output->dim(0);
......@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
}
};
template<typename T>
template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit) :
FullyConnectedBase(activation, relux_max_limit) {}
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *weight,
......
......@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {
// Declaration of the explicit specialization of
// GlobalAvgPoolingFunctor::operator() for DeviceType::NEON with float
// elements. Only the declaration appears here (it ends in ';'); the
// definition is not part of this excerpt.
// Takes raw input/output float pointers, a pointer to the input shape array
// and a StatsFuture pointer.
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape,
float *output, StatsFuture *future);
const float *input,
const index_t *input_shape,
float *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -6,20 +6,18 @@
#define MACE_KERNELS_MATMUL_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct MatMulFunctor {
void operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
C->Resize(c_shape);
const index_t N = C->dim(0);
......@@ -52,7 +50,6 @@ struct MatMulFunctor {
}
};
template <typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *A,
......
......@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < n; ++i) {
for (index_t j = 0; j < sample_size; ++j) {
const float *input_sample_ptr = input_ptr + (i * sample_size + j) * channel;
const float *input_sample_ptr =
input_ptr + (i * sample_size + j) * channel;
float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
const float *new_scale_ptr = new_scale.data();
const float *new_offset_ptr = new_offset.data();
......
......@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape_vec.data(), paddings.data());
input->shape().data(), filter->shape().data(), dilations_, strides_,
paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)(
......@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
auto output_shape = output->shape().data();
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
bias_data, output_data, output_shape);
conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
output_data, output_shape);
}
} // namespace kernels
......
......@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
int input_channels = input_shape[1];
int input_height = input_shape[2];
int input_width = input_shape[3];
int multiplier =
filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) {
for (int oc = 0; oc < output_channels; ++oc) {
......@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
int input_channels = input_shape[1];
int input_height = input_shape[2];
int input_width = input_shape[3];
int multiplier =
filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels =
filter_shape == nullptr ? input_channels : 1;
int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) {
......
......@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
dilations_)(
input, filter, bias, output, future);
DepthwiseConv2dFunctor<DeviceType::CPU, float>(
strides_, paddings_, dilations_)(input, filter, bias, output, future);
return;
}
......@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr, output_ptr,
output_shape);
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
output_ptr, output_shape);
}
} // namespace kernels
......
......@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ =
runtime->BuildKernel("activation", kernel_name, built_options);
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
......@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
}
......
......@@ -5,8 +5,8 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0;
for (auto input : input_tensors) {
kernel_.setArg(idx++,
*(input->opencl_image()));
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss;
ss << "addn_opencl_kernel_"
<< output_shape[0] << "_"
<< output_shape[1] << "_"
<< output_shape[2] << "_"
<< output_shape[3];
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
// Explicit instantiations of the OpenCL AddNFunctor for the float and half
// element types.
template
struct AddNFunctor<DeviceType::OPENCL, float>;
template struct AddNFunctor<DeviceType::OPENCL, float>;
template
struct AddNFunctor<DeviceType::OPENCL, half>;
template struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ =
runtime->BuildKernel("batch_norm", kernel_name, built_options);
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++,
*(offset->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++,
*(mean->opencl_image()));
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
......
......@@ -12,11 +12,10 @@ namespace mace {
namespace kernels {
template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
......@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
}
}
// Explicit instantiations of the OpenCL BiasAddFunctor for the float and
// half element types.
template
struct BiasAddFunctor<DeviceType::OPENCL, float>;
template
struct BiasAddFunctor<DeviceType::OPENCL, half>;
template struct BiasAddFunctor<DeviceType::OPENCL, float>;
template struct BiasAddFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -9,36 +9,33 @@
namespace mace {
namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image,
StatsFuture *future) {
template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
if(type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape);
} else {
image->ResizeImage(buffer->shape(), image_shape);
}
} else {
Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
Image *image_buf = dynamic_cast<Image *>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape();
buffer->Resize(image->shape());
}
size_t gws[2] = {image_shape[0],
image_shape[1]};
size_t gws[2] = {image_shape[0], image_shape[1]};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
kernel_name =
i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
......@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break;
case IN_OUT_HEIGHT:
case WEIGHT_HEIGHT:
kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image";
kernel_name = i2b_ ? "in_out_height_image_to_buffer"
: "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
......@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image";
kernel_name = i2b_ ? "winograd_filter_image_to_buffer"
: "winograd_filter_buffer_to_image";
break;
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
......@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace(kernel_name_ss.str());
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Global();
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options);
obfuscated_kernel_name, built_options);
uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
"buffer offset not aligned");
b2f_kernel.setArg(idx++,
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
}
if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) {
} else if (type == WEIGHT_HEIGHT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, 1);
......@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const std::vector<uint32_t> lws = {16, 64};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]),
nullptr, &event);
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
......
......@@ -18,8 +18,8 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__constant sampler_t SAMPLER =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#ifdef USE_PRELU
......
......@@ -5,8 +5,8 @@
#include "mace/kernels/concat.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->opencl_image())));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "concat_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
}
......@@ -97,27 +96,25 @@ static void ConcatN(cl::Kernel *kernel,
index_t input_channel_blk = input->dim(3) / 4;
chan_blk_offset += input_channel_blk;
const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "concat_n_opencl_kernel_"
<< input_channel_blk << "_"
<< width << "_"
ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
<< batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
}
}
template<typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count >= 2 && axis_ == 3)
<< "Concat opencl kernel only support >=2 elements with axis == 3";
<< "Concat opencl kernel only support >=2 elements with axis == 3";
const Tensor *input0 = input_list[0];
bool divisible_four = input0->dim(axis_) % 4 == 0;
......@@ -137,8 +134,9 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
}
output_shape[axis_] += input->dim(axis_);
}
MACE_CHECK(inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
MACE_CHECK(
inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output->ResizeImage(output_shape, image_shape);
......@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
default:
if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
}
else {
} else {
MACE_NOT_IMPLEMENTED;
}
}
};
// Explicit instantiation definitions of the OPENCL ConcatFunctor for float
// and half. Each specialization is instantiated exactly once; repeating an
// explicit instantiation definition is ill-formed.
template struct ConcatFunctor<DeviceType::OPENCL, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Tensor *output,
StatsFuture *future);
template<typename T>
template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(
cl::Kernel *kernel,
const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride,
const int *padding, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt,
Tensor *output, StatsFuture *future);
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] =
{Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
......@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), filter->shape().data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
......@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_,
relux_max_limit_, DataTypeToEnum<T>::value, output, future);
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, output, future);
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
}
}
// Explicit instantiation definitions of the OPENCL Conv2dFunctor for float
// and half. Each specialization is instantiated exactly once; repeating an
// explicit instantiation definition is ill-formed.
template struct Conv2dFunctor<DeviceType::OPENCL, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
}
auto runtime = OpenCLRuntime::Global();
*kernel =
runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++,
*(input->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(idx++,
*(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++,
*(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
// FIXME handle flexible data type: half not supported
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input_height));
......@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
}
} // namespace kernels
} // namespace mace
......@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
}
auto runtime = OpenCLRuntime::Global();
*kernel =
runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++,
*(input->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(idx++,
*(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++,
*(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2)));
......
......@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
}
auto runtime = OpenCLRuntime::Global();
*kernel =
runtime->BuildKernel("conv_2d", kernel_name, built_options);
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++,
*(input->opencl_image()));
kernel->setArg(idx++,
*(filter->opencl_image()));
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(idx++,
*(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++,
*(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
......
......@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
if(kernel->get() == nullptr) {
if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
......@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
*kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(
idx++, *(filter->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel->setArg(
idx++, *(bias->opencl_image()));
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(
idx++, *(output->opencl_image()));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<short>(input_height));
kernel->setArg(idx++, static_cast<short>(input_width));
......@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::FLOOR, output_shape.data());
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_,
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
}
......
......@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
const index_t batch = input0->dim(0);
const index_t height = input0->dim(1);
const index_t width = input0->dim(2);
......@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++,
*(input0->opencl_image()));
kernel_.setArg(idx++,
*(input1->opencl_image()));
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]);
......@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, *(output->opencl_image()));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
const std::vector<uint32_t> lws = {64, 16, 1};
std::stringstream ss;
ss << "eltwise_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -10,14 +10,13 @@
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
......@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("fully_connected", kernel_name, built_options);
kernel_ =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++,
*(input->opencl_image()));
kernel_.setArg(idx++,
*(weight->opencl_image()));
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++,
*(bias->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++,
*(output->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
......@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
}
const uint32_t gws[2] = {
static_cast<uint32_t>(batch),
static_cast<uint32_t>(output_blocks),
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
};
const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss;
ss << "fc_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
// Explicit instantiation definitions of the OPENCL FullyConnectedFunctor for
// float and half. Each specialization is instantiated exactly once; repeating
// an explicit instantiation definition is ill-formed.
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -3,8 +3,8 @@
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -28,8 +28,9 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[3];
......@@ -47,8 +48,9 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[1]);
......@@ -115,19 +117,16 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
}
// Computes the 4-element shape used for Winograd buffers of the given type.
//
//   WINOGRAD_FILTER -> {16, shape[0], shape[1], 1}
//   IN_OUT_HEIGHT   -> {16, shape[3], out_width, 1}, where
//                      out_width = shape[0] * ((shape[1] - 1) / 2)
//                                           * ((shape[2] - 1) / 2)
//   anything else   -> logs at FATAL severity and returns an empty vector.
//
// NOTE(review): `shape` is indexed up to [3] without a size check; callers
// presumably pass a 4-D shape -- confirm.
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                      const BufferType type) {
  if (type == WINOGRAD_FILTER) {
    return {16, shape[0], shape[1], 1};
  } else if (type == IN_OUT_HEIGHT) {
    // Single IN_OUT_HEIGHT branch: compute the width and return it, so no
    // path falls off the end of a value-returning function.
    index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
    return {16, shape[3], out_width, 1};
  } else {
    LOG(FATAL) << "Mace not supported yet.";
    // Keeps every control path returning a value.
    return std::vector<index_t>();
  }
}
......@@ -188,10 +187,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(gws[2],
kwg_size / (local_ws[0] * local_ws[1]));
local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return {
// TODO tuning these magic numbers
// TODO tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
......@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params,
Timer *timer,
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D";
MACE_CHECK(params.size() == 4)
<< "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params[3];
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, 0, i * block_size),
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
......@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
(*tuning_result)[3] = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
uint32_t gws2 =
(i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, 0, i * block_size),
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
......@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
{kwg_size / 256, 256, 1},
{kwg_size / 512, 512, 1},
{kwg_size, 1, 1},
{1, kwg_size, 1}
};
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params,
Timer *timer,
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d";
MACE_CHECK(params.size() == 3)
<< "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params[2];
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
......@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
uint32_t num_blocks = std::min(
static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
(*tuning_result)[2] = num_blocks;
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
uint32_t gws1 =
(i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
......@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
return error;
};
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
lws,
params_generator,
func,
&timer);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
......@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
}
};
}
}
} // namespace kernels
......
......@@ -5,16 +5,16 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
#include "mace/utils/utils.h"
#include "mace/core/future.h"
namespace mace {
namespace kernels {
// Time budget for one kernel launch, in microseconds. Defined exactly once;
// a second definition in the same scope is a redefinition error.
const float kMaxKernelExeTime = 1000.0;  // microseconds
enum BufferType {
CONV2D_FILTER = 0,
......@@ -31,7 +31,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape);
// Returns the shape derived from `shape` for a Winograd buffer of the given
// `type`. Declared once with a single, well-formed declarator.
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                      const BufferType type);
std::string DtToCLCMDDt(const DataType dt);
......@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::vector<uint32_t> &lws,
StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
......@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
}
namespace {
// Streams the single value `v` into `*ss`.
//
// `delimiter` is accepted so the signature lines up with the multi-value
// overload, but this overload does not write it.
//
// A single template header is used: two consecutive `template <typename T>`
// headers on a non-member function template do not compile.
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
  (*ss) << v;
}
template<typename T, typename... Args>
template <typename T, typename... Args>
void AppendToStream(std::stringstream *ss,
const std::string &delimiter,
T first,
......@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
}
} // namespace
template<typename... Args>
template <typename... Args>
std::string Concat(Args... args) {
std::stringstream ss;
AppendToStream(&ss, "_", args...);
......
......@@ -11,12 +11,10 @@ namespace mace {
namespace kernels {
template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
......@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++,
*(B->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
......@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
};
const std::vector<uint32_t> lws = {16, 64, 1};
std::stringstream ss;
ss << "matmul_opencl_kernel_"
<< C->dim(0) << "_"
<< C->dim(1) << "_"
<< C->dim(2) << "_"
<< C->dim(3);
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
// Explicit instantiation definitions of the OPENCL MatMulFunctor for float
// and half. Each specialization is instantiated exactly once; repeating an
// explicit instantiation definition is ill-formed.
template struct MatMulFunctor<DeviceType::OPENCL, float>;
template struct MatMulFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -11,17 +11,15 @@
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input->dim(3), input->dim(3)
};
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
......@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "pooling_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
// Explicit instantiation definitions of the OPENCL PoolingFunctor for float
// and half. Each specialization is instantiated exactly once; repeating an
// explicit instantiation definition is ill-formed.
template struct PoolingFunctor<DeviceType::OPENCL, float>;
template struct PoolingFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -2,12 +2,12 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/resize_bilinear.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
......@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape,
BufferType::IN_OUT_CHANNEL,
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
......@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
......@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(out_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -6,13 +6,13 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template<typename T>
template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
Tensor *output,
StatsFuture *future) {
......@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "softmax_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
// Explicit instantiation definitions of the OPENCL SoftmaxFunctor for float
// and half. Each specialization is instantiated exactly once; repeating an
// explicit instantiation definition is ill-formed.
template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -5,20 +5,21 @@
#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *space_tensor,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
const char *kernel_name = nullptr;
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
......@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options);
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
uint32_t idx = 0;
if (b2s_) {
......@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
}
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
const uint32_t gws[3] = {chan_blk,
static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << kernel_name << "_"
<< batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_"
<< batch_tensor->dim(2) << "_"
ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -11,21 +11,21 @@
namespace mace {
namespace kernels {
template<typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(), strides_.data(),
padding_type_, output_shape.data(), paddings.data());
input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
strides_.data(), padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(),
dilations_.data(), strides_.data(), RoundType::FLOOR, output_shape.data());
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
const index_t round_h = (output_shape[1] + 1) / 2;
......@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
uint32_t idx = 0;
......@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
}
const uint32_t gws[2] = {static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss;
ss << "winograd_transform_kernel_"
<< input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}
template<typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)};
template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor,
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) {
case NOOP:
......@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
}
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0;
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
if (bias != nullptr) {
kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->opencl_image())));
kernel_.setArg(idx++,
*(static_cast<const cl::Image2D *>(bias->opencl_image())));
}
kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(
idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
......@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
kernel_.setArg(idx++, relux_max_limit_);
}
const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
const uint32_t gws[2] = {
static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
const std::vector<uint32_t> lws = {128, 8, 1};
std::stringstream ss;
ss << "winograd_inverse_transform_kernel_"
<< input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_"
<< input_tensor->dim(2) << "_"
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}
template
struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template
struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -7,9 +7,9 @@
#include <limits>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
......@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
const int *dilations_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
......@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding_type,
paddings, dilations) {}
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1],
input_tensor->dim(3), input_tensor->dim(3)
};
kernels_[0], kernels_[1], input_tensor->dim(3), input_tensor->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
input_tensor->shape().data(), filter_shape.data(), dilations_,
strides_, padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
}
output_tensor->Resize(output_shape);
......@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
res = std::max(res, input[input_offset]);
}
}
......@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0;
int block_size = 0;
......@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset];
block_size += 1;
}
......@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
};
template<>
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future);
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);
template<typename T>
template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
......@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(pooling_type, kernels,
strides, padding_type,
paddings, dilations) {}
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future);
......
......@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESHAPE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
......@@ -25,7 +25,6 @@ struct ReshapeFunctor {
}
};
} // namespace kernels
} // namespace mace
......
......@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
......@@ -74,9 +74,9 @@ void ResizeImage(const T *images,
const T *batch_input_ptr = images + in_batch_num_values * b;
T *batch_output_ptr = output + out_batch_num_values * b;
const T *y_lower_input_ptr =
batch_input_ptr + ys[y].lower * in_width * channels;
batch_input_ptr + ys[y].lower * in_width * channels;
const T *y_upper_input_ptr =
batch_input_ptr + ys[y].upper * in_width * channels;
batch_input_ptr + ys[y].upper * in_width * channels;
T *y_output_ptr = batch_output_ptr + y * out_width * channels;
const float ys_lerp = ys[y].lerp;
......@@ -95,7 +95,7 @@ void ResizeImage(const T *images,
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
bottom_right, xs_lerp, ys_lerp);
}
}
}
......@@ -107,10 +107,10 @@ struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners) {
MACE_CHECK(size.size() == 2);
out_height_ = size[0];
out_width_ = size[1];
}
MACE_CHECK(size.size() == 2);
out_height_ = size[0];
out_width_ = size[1];
}
protected:
bool align_corners_;
......@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
}
};
template<typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
template <typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
: ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
......
......@@ -6,9 +6,9 @@
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace mace {
namespace kernels {
......@@ -16,11 +16,10 @@ namespace kernels {
struct SpaceToBatchFunctorBase {
SpaceToBatchFunctorBase(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s):
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s)
{}
bool b2s)
: paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s) {}
std::vector<int> paddings_;
std::vector<int> block_shape_;
......@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
};
template <DeviceType D, typename T>
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){}
bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape,
......@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
};
template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape,
bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){}
bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
void operator()(Tensor *space_tensor,
const std::vector<index_t> &output_shape,
......@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
StatsFuture *future);
cl::Kernel kernel_;
};
} // namespace kernels
......
......@@ -6,10 +6,10 @@
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
......@@ -17,38 +17,36 @@ namespace kernels {
struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings)
: strides_({1, 1}), dilations_({1, 1}),
padding_type_(padding_type), paddings_(paddings) {}
: strides_({1, 1}),
dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings) {}
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_;
std::vector<int> paddings_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
};
template<typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFunctorBase {
template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
};
......@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
const float relux_max_limit_;
};
template<DeviceType D, typename T>
template <DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch,
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *bias,
......@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
};
template<typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase {
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
: WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch,
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
: WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *bias,
......
......@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr;
const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor);
......
......@@ -214,9 +214,7 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>(
"Alpha", {2},
{2.0, 3.0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage",
......@@ -250,7 +248,8 @@ void TestSimplePrelu() {
}
auto expected = CreateTensor<float>(
{2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
......
......@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
for (int i = 1; i < n; ++i) {
inputs[i] = this->Input(i);
MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
MACE_CHECK(inputs[0]->size() == inputs[i]->size()) << "Input 0: "
<< MakeString(inputs[0]->shape())
<< ", size: " << inputs[0]->size()
<< ". Input " << i << ": "
<< MakeString(inputs[i]->shape())
<< ", size: " << inputs[i]->size();
MACE_CHECK(inputs[0]->size() == inputs[i]->size())
<< "Input 0: " << MakeString(inputs[0]->shape())
<< ", size: " << inputs[0]->size() << ". Input " << i << ": "
<< MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
}
functor_(inputs, output_tensor, future);
......
......@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
OpsTestNet net;
// Add input data
for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(),
{n, h, w, c});
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
}
if (D == DeviceType::OPENCL) {
......
......@@ -76,7 +76,7 @@ static void BatchNorm(
static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -12,15 +12,14 @@
namespace mace {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> {
public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(
OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
true) {}
functor_(OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
true) {}
bool Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT);
......@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor), future);
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
future);
return true;
}
......@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
Tensor *output,
index_t *output_shape) {
auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
auto block_shape =
OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D");
......@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
const index_t block_dims = block_shape.size();
index_t block_shape_product = 1;
for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1");
MACE_CHECK(block_shape[block_dim] > 1,
"block_shape's value should be great to 1");
const index_t block_shape_value = block_shape[block_dim];
const index_t cropped_input_size = input_tensor->dim(block_dim + 1) * block_shape_value
- crops[block_dim * 2]
- crops[block_dim * 2 + 1];
MACE_CHECK(cropped_input_size >= 0,
"cropped size must be non-negative");
const index_t cropped_input_size =
input_tensor->dim(block_dim + 1) * block_shape_value -
crops[block_dim * 2] - crops[block_dim * 2 + 1];
MACE_CHECK(cropped_input_size >= 0, "cropped size must be non-negative");
block_shape_product *= block_shape_value;
output_shape[block_dim + 1] = cropped_input_size;
}
......
......@@ -41,7 +41,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
......
......@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -11,16 +11,17 @@
namespace mace {
template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> {
class BufferToImageOp : public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws) {}
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
kernels::BufferType type =
static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output, future);
......
此差异已折叠。
......@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
input->shape()[1]);
output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(),
output->mutable_data<T>(), future);
functor_(input->data<T>(), input->shape().data(), output->mutable_data<T>(),
future);
return true;
}
......
......@@ -41,7 +41,7 @@ static void ChannelShuffle(
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \
} \
......
......@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
public:
ConcatOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
MACE_CHECK(this->InputSize() >= 2)
<< "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
const int32_t input_dims = input_list[0]->dim_size();
......
......@@ -37,11 +37,10 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
}
}
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1( \
int iters) { \
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \
} \
BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
BM_CONCAT_CPU_MACRO(0, 1000);
......@@ -90,13 +89,11 @@ static void OpenclConcatHelper(int iters,
}
}
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE( \
int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
......
......@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
concat_axis_size += input_shapes[i][axis];
GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::CPU, float>(
MakeString("Input", i), input_shapes[i], inputs[i]);
net.AddInputFromArray<DeviceType::CPU, float>(MakeString("Input", i),
input_shapes[i], inputs[i]);
}
// Run
......@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
}
TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 32},
{3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
OpenclRandomTest<float>(
{{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
}
\ No newline at end of file
此差异已折叠。
......@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
functor_(static_cast<kernels::EltwiseType>(
OperatorBase::GetSingleArgument<int>(
"type", static_cast<int>(kernels::EltwiseType::SUM))),
OperatorBase::GetRepeatedArgument<float>("coeff")){}
OperatorBase::GetRepeatedArgument<float>("coeff")) {}
bool Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0);
const Tensor *input1 = this->Input(1);
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape";
for(int i = 0; i < input0->dim_size(); ++i) {
MACE_CHECK(input0->dim(i) == input1->dim(i)) << "Inputs of Eltwise op must be same shape";
MACE_CHECK(input0->dim_size() == input1->dim_size())
<< "Inputs of Eltwise op must be same shape";
for (int i = 0; i < input0->dim_size(); ++i) {
MACE_CHECK(input0->dim(i) == input1->dim(i))
<< "Inputs of Eltwise op must be same shape";
}
output->ResizeLike(input0);
......
......@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册