Commit 08a362e7 authored by yejianwu

merge with origin master

@@ -8,6 +8,7 @@ package(
 cc_library(
     name = "generated_models",
     srcs = glob(["models/*/*.cc"]),
+    hdrs = glob(["models/*/*.h"]),
    linkstatic = 1,
    deps = [
        "//mace/core",
@@ -33,7 +34,6 @@ cc_library(
     linkstatic = 1,
 )
 
 cc_library(
     name = "generated_version",
     srcs = ["version/version.cc"],
......
@@ -10,6 +10,7 @@
 #include <map>
 #include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 namespace mace {
......
This diff is collapsed.
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/public/mace_runtime.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
<< ", gpu_priority_hint: " << gpu_priority_hint;
OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
}
void ConfigOmpThreadsAndAffinity(int omp_num_threads,
CPUPowerOption power_option) {
VLOG(1) << "Config CPU Runtime: omp_num_threads: " << omp_num_threads
<< ", cpu_power_option: " << power_option;
SetOmpThreadsAndAffinity(omp_num_threads, power_option);
}
}  // namespace mace
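For context, here is a minimal usage sketch (not part of this commit) of the two configuration entry points above, called before any model is run; the wrapper function names are illustrative:

#include "mace/public/mace_runtime.h"

// Mirror the mace_run defaults: gpu_perf_hint=2 (NORMAL),
// gpu_priority_hint=1 (LOW).
void ConfigureGpuRuntime() {
  mace::ConfigOpenCLRuntime(mace::PERF_NORMAL, mace::PRIORITY_LOW);
}

// Mirror the mace_run defaults: 8 OpenMP threads, DEFAULT power option.
void ConfigureCpuRuntime() {
  mace::ConfigOmpThreadsAndAffinity(8, mace::DEFAULT);
}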
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <memory>
#include <numeric>
#include "mace/public/mace_types.h"
#include "mace/utils/logging.h"
namespace mace {
ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const DataType data_type,
uint32_t node_id)
: name_(name),
data_(data),
      data_size_(std::accumulate(dims.begin(),
                                 dims.end(),
                                 static_cast<int64_t>(1),
                                 std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(data_type),
node_id_(node_id) {}
ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const int data_type,
uint32_t node_id)
: name_(name),
data_(data),
      data_size_(std::accumulate(dims.begin(),
                                 dims.end(),
                                 static_cast<int64_t>(1),
                                 std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(static_cast<DataType>(data_type)),
node_id_(node_id) {}
const std::string &ConstTensor::name() const { return name_; }
const unsigned char *ConstTensor::data() const { return data_; }
int64_t ConstTensor::data_size() const { return data_size_; }
const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
DataType ConstTensor::data_type() const { return data_type_; }
uint32_t ConstTensor::node_id() const { return node_id_; }
Argument::Argument() : has_bits_(0) {}
void Argument::CopyFrom(const Argument &from) {
this->name_ = from.name();
this->f_ = from.f();
this->i_ = from.i();
this->s_ = from.s();
auto floats = from.floats();
this->floats_.resize(floats.size());
std::copy(floats.begin(), floats.end(), this->floats_.begin());
auto ints = from.ints();
this->ints_.resize(ints.size());
std::copy(ints.begin(), ints.end(), this->ints_.begin());
  auto strings = from.strings();
  this->strings_.resize(strings.size());
  std::copy(strings.begin(), strings.end(), this->strings_.begin());
this->has_bits_ = from.has_bits_;
}
const std::string &Argument::name() const { return name_; }
void Argument::set_name(const std::string &value) { name_ = value; }
bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
float Argument::f() const { return f_; }
void Argument::set_f(float value) {
set_has_f();
f_ = value;
}
bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
int64_t Argument::i() const { return i_; }
void Argument::set_i(int64_t value) {
set_has_i();
i_ = value;
}
bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
std::string Argument::s() const { return s_; }
void Argument::set_s(const std::string &value) {
set_has_s();
s_ = value;
}
const std::vector<float> &Argument::floats() const { return floats_; }
void Argument::add_floats(float value) { floats_.push_back(value); }
void Argument::set_floats(const std::vector<float> &value) {
floats_.resize(value.size());
std::copy(value.begin(), value.end(), floats_.begin());
}
const std::vector<int64_t> &Argument::ints() const { return ints_; }
void Argument::add_ints(int64_t value) { ints_.push_back(value); }
void Argument::set_ints(const std::vector<int64_t> &value) {
ints_.resize(value.size());
std::copy(value.begin(), value.end(), ints_.begin());
}
const std::vector<std::string> &Argument::strings() const { return strings_; }
void Argument::add_strings(const ::std::string &value) {
strings_.push_back(value);
}
void Argument::set_strings(const std::vector<std::string> &value) {
strings_.resize(value.size());
std::copy(value.begin(), value.end(), strings_.begin());
}
// Node Input
NodeInput::NodeInput(int node_id, int output_port)
: node_id_(node_id), output_port_(output_port) {}
void NodeInput::CopyFrom(const NodeInput &from) {
node_id_ = from.node_id();
output_port_ = from.output_port();
}
int NodeInput::node_id() const { return node_id_; }
void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
int NodeInput::output_port() const { return output_port_; }
void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
// OutputShape
OutputShape::OutputShape() {}
OutputShape::OutputShape(const std::vector<int64_t> &dims)
: dims_(dims.begin(), dims.end()) {}
void OutputShape::CopyFrom(const OutputShape &from) {
auto from_dims = from.dims();
dims_.resize(from_dims.size());
std::copy(from_dims.begin(), from_dims.end(), dims_.begin());
}
const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
// Operator Def
void OperatorDef::CopyFrom(const OperatorDef &from) {
name_ = from.name();
type_ = from.type();
auto from_input = from.input();
input_.resize(from_input.size());
std::copy(from_input.begin(), from_input.end(), input_.begin());
auto from_output = from.output();
output_.resize(from_output.size());
std::copy(from_output.begin(), from_output.end(), output_.begin());
auto from_arg = from.arg();
arg_.resize(from_arg.size());
for (int i = 0; i < from_arg.size(); ++i) {
arg_[i].CopyFrom(from_arg[i]);
}
auto from_output_shape = from.output_shape();
output_shape_.resize(from_output_shape.size());
for (int i = 0; i < from_output_shape.size(); ++i) {
output_shape_[i].CopyFrom(from_output_shape[i]);
}
auto from_data_type = from.output_type();
output_type_.resize(from_data_type.size());
std::copy(from_data_type.begin(), from_data_type.end(), output_type_.begin());
auto mem_ids = from.mem_id();
mem_id_.resize(mem_ids.size());
std::copy(mem_ids.begin(), mem_ids.end(), mem_id_.begin());
// nnlib
node_id_ = from.node_id();
op_id_ = from.op_id();
padding_ = from.padding();
auto from_node_input = from.node_input();
node_input_.resize(from_node_input.size());
for (int i = 0; i < from_node_input.size(); ++i) {
node_input_[i].CopyFrom(from_node_input[i]);
}
auto from_out_max_byte_size = from.out_max_byte_size();
out_max_byte_size_.resize(from_out_max_byte_size.size());
std::copy(from_out_max_byte_size.begin(), from_out_max_byte_size.end(),
out_max_byte_size_.begin());
has_bits_ = from.has_bits_;
}
const std::string &OperatorDef::name() const { return name_; }
void OperatorDef::set_name(const std::string &name_) {
set_has_name();
OperatorDef::name_ = name_;
}
bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
const std::string &OperatorDef::type() const { return type_; }
void OperatorDef::set_type(const std::string &type_) {
set_has_type();
OperatorDef::type_ = type_;
}
bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
const std::vector<int> &OperatorDef::mem_id() const { return mem_id_; }
void OperatorDef::set_mem_id(const std::vector<int> &value) {
mem_id_.resize(value.size());
std::copy(value.begin(), value.end(), mem_id_.begin());
}
uint32_t OperatorDef::node_id() const { return node_id_; }
void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
uint32_t OperatorDef::op_id() const { return op_id_; }
uint32_t OperatorDef::padding() const { return padding_; }
void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
const std::vector<NodeInput> &OperatorDef::node_input() const {
return node_input_;
}
void OperatorDef::add_node_input(const NodeInput &value) {
node_input_.push_back(value);
}
const std::vector<int> &OperatorDef::out_max_byte_size() const {
return out_max_byte_size_;
}
void OperatorDef::add_out_max_byte_size(int value) {
out_max_byte_size_.push_back(value);
}
const std::vector<std::string> &OperatorDef::input() const { return input_; }
const std::string &OperatorDef::input(int index) const {
  MACE_CHECK(0 <= index && index < static_cast<int>(input_.size()));
return input_[index];
}
std::string *OperatorDef::add_input() {
input_.push_back("");
return &input_.back();
}
void OperatorDef::add_input(const ::std::string &value) {
input_.push_back(value);
}
void OperatorDef::add_input(::std::string &&value) { input_.push_back(value); }
void OperatorDef::set_input(const std::vector<std::string> &value) {
input_.resize(value.size());
std::copy(value.begin(), value.end(), input_.begin());
}
const std::vector<std::string> &OperatorDef::output() const { return output_; }
const std::string &OperatorDef::output(int index) const {
  MACE_CHECK(0 <= index && index < static_cast<int>(output_.size()));
return output_[index];
}
std::string *OperatorDef::add_output() {
output_.push_back("");
return &output_.back();
}
void OperatorDef::add_output(const ::std::string &value) {
output_.push_back(value);
}
void OperatorDef::add_output(::std::string &&value) {
output_.push_back(value);
}
void OperatorDef::set_output(const std::vector<std::string> &value) {
output_.resize(value.size());
std::copy(value.begin(), value.end(), output_.begin());
}
const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
Argument *OperatorDef::add_arg() {
arg_.emplace_back(Argument());
return &arg_.back();
}
const std::vector<OutputShape> &OperatorDef::output_shape() const {
return output_shape_;
}
void OperatorDef::add_output_shape(const OutputShape &value) {
output_shape_.push_back(value);
}
const std::vector<DataType> &OperatorDef::output_type() const {
return output_type_;
}
void OperatorDef::set_output_type(const std::vector<DataType> &value) {
output_type_.resize(value.size());
std::copy(value.begin(), value.end(), output_type_.begin());
}
// MemoryBlock
MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
: mem_id_(mem_id), x_(x), y_(y) {}
int MemoryBlock::mem_id() const { return mem_id_; }
uint32_t MemoryBlock::x() const { return x_; }
uint32_t MemoryBlock::y() const { return y_; }
// MemoryArena
const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
return mem_block_;
}
std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
return mem_block_;
}
int MemoryArena::mem_block_size() const { return mem_block_.size(); }
// InputInfo
const std::string &InputInfo::name() const { return name_; }
int32_t InputInfo::node_id() const { return node_id_; }
int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
DataType InputInfo::data_type() const { return data_type_; }
const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
// OutputInfo
const std::string &OutputInfo::name() const { return name_; }
int32_t OutputInfo::node_id() const { return node_id_; }
int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
DataType OutputInfo::data_type() const { return data_type_; }
void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
// NetDef
NetDef::NetDef() : has_bits_(0) {}
const std::string &NetDef::name() const { return name_; }
void NetDef::set_name(const std::string &value) {
set_has_name();
name_ = value;
}
bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
const std::string &NetDef::version() const { return version_; }
void NetDef::set_version(const std::string &value) {
set_has_version();
version_ = value;
}
bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
const std::vector<OperatorDef> &NetDef::op() const { return op_; }
OperatorDef *NetDef::add_op() {
op_.emplace_back(OperatorDef());
return &op_.back();
}
std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
const std::vector<Argument> &NetDef::arg() const { return arg_; }
Argument *NetDef::add_arg() {
arg_.emplace_back(Argument());
return &arg_.back();
}
std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
MemoryArena &NetDef::mutable_mem_arena() {
set_has_mem_arena();
return mem_arena_;
}
bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
const std::vector<OutputInfo> &NetDef::output_info() const {
return output_info_;
}
std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
int NetDef::op_size() const { return op_.size(); }
const OperatorDef &NetDef::op(const int idx) const {
MACE_CHECK(0 <= idx && idx < op_size());
return op_[idx];
}
}  // namespace mace
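As a quick illustration of the protobuf-style presence bits implemented above (a standalone sketch, not code from this commit): has_i() only reports true after set_i() has flipped the corresponding bit in has_bits_, which is how explicitly set optional arguments are distinguished from default-valued ones.

#include <cassert>
#include "mace/public/mace_types.h"

int main() {
  mace::Argument arg;
  arg.set_name("strides");
  assert(!arg.has_i());  // bit 0x2 in has_bits_ is not set yet
  arg.set_i(2);          // set_i() calls set_has_i() internally
  assert(arg.has_i() && arg.i() == 2);
  return 0;
}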
@@ -16,6 +16,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
 #include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 namespace mace {
......
@@ -6,7 +6,7 @@
 #ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 #define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
-#include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
 namespace mace {
......
@@ -14,6 +14,7 @@
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
+#include "mace/public/mace_runtime.h"
 #include "mace/utils/timer.h"
 namespace mace {
......
@@ -8,7 +8,7 @@
 #include <cstdint>
 #include <string>
-#include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 #include "include/half.hpp"
 namespace mace {
......
@@ -23,9 +23,11 @@
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
+// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
 namespace mace {
 namespace MACE_MODEL_TAG {
@@ -145,19 +147,26 @@ struct mallinfo LogMallinfoChange(struct mallinfo prev) {
   return curr;
 }
 
-DEFINE_string(input_node, "input_node0,input_node1",
+DEFINE_string(input_node,
+              "input_node0,input_node1",
               "input nodes, separated by comma");
-DEFINE_string(input_shape, "1,224,224,3:1,1,1,10",
+DEFINE_string(input_shape,
+              "1,224,224,3:1,1,1,10",
               "input shapes, separated by colon and comma");
-DEFINE_string(output_node, "output_node0,output_node1",
+DEFINE_string(output_node,
+              "output_node0,output_node1",
               "output nodes, separated by comma");
-DEFINE_string(output_shape, "1,224,224,2:1,1,1,10",
+DEFINE_string(output_shape,
+              "1,224,224,2:1,1,1,10",
               "output shapes, separated by colon and comma");
-DEFINE_string(input_file, "",
+DEFINE_string(input_file,
+              "",
               "input file name | input file prefix for multiple inputs.");
-DEFINE_string(output_file, "",
+DEFINE_string(output_file,
+              "",
               "output file name | output file prefix for multiple outputs");
-DEFINE_string(model_data_file, "",
+DEFINE_string(model_data_file,
+              "",
               "model data file name, used when EMBED_MODEL_DATA set to 0");
 DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
 DEFINE_int32(round, 1, "round");
@@ -166,112 +175,11 @@ DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
 DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, 8, "num of openmp threads");
-DEFINE_int32(cpu_power_option, 0,
+DEFINE_int32(cpu_power_option,
+             0,
              "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
 
-bool SingleInputAndOutput(const std::vector<int64_t> &input_shape,
-                          const std::vector<int64_t> &output_shape) {
-  // load model
-  int64_t t0 = NowMicros();
-  const unsigned char *model_data =
-      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
-  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
-  int64_t t1 = NowMicros();
-  LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
-  int64_t init_micros = t1 - t0;
-
-  DeviceType device_type = ParseDeviceType(FLAGS_device);
-  LOG(INFO) << "Runing with device type: " << device_type;
-
-  // config runtime
-  if (device_type == DeviceType::OPENCL) {
-    mace::ConfigOpenCLRuntime(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-  } else if (device_type == DeviceType::CPU) {
-    mace::ConfigOmpThreadsAndAffinity(
-        FLAGS_omp_num_threads,
-        static_cast<CPUPowerOption>(FLAGS_cpu_power_option));
-  }
-
-  // Init model
-  LOG(INFO) << "Run init";
-  t0 = NowMicros();
-  mace::MaceEngine engine(&net_def, device_type);
-  if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
-    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
-  }
-  t1 = NowMicros();
-  init_micros += t1 - t0;
-  LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
-  LOG(INFO) << "Total init latency: " << init_micros << " us";
-
-  // Allocate input and output
-  int64_t input_size =
-      std::accumulate(input_shape.begin(), input_shape.end(), 1,
-                      std::multiplies<int64_t>());
-  int64_t output_size =
-      std::accumulate(output_shape.begin(), output_shape.end(), 1,
-                      std::multiplies<int64_t>());
-  std::unique_ptr<float[]> input_data(new float[input_size]);
-  std::unique_ptr<float[]> output_data(new float[output_size]);
-
-  // load input
-  std::ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node),
-                        std::ios::in | std::ios::binary);
-  if (in_file.is_open()) {
-    in_file.read(reinterpret_cast<char *>(input_data.get()),
-                 input_size * sizeof(float));
-    in_file.close();
-  } else {
-    LOG(INFO) << "Open input file failed";
-    return -1;
-  }
-
-  LOG(INFO) << "Warm up run";
-  t0 = NowMicros();
-  engine.Run(input_data.get(), input_shape, output_data.get());
-  t1 = NowMicros();
-  LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
-
-  if (FLAGS_round > 0) {
-    LOG(INFO) << "Run model";
-    t0 = NowMicros();
-    struct mallinfo prev = mallinfo();
-    for (int i = 0; i < FLAGS_round; ++i) {
-      engine.Run(input_data.get(), input_shape, output_data.get());
-      if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
-        LOG(INFO) << "=== check malloc info change #" << i << " ===";
-        prev = LogMallinfoChange(prev);
-      }
-    }
-    t1 = NowMicros();
-    LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
-  }
-
-  if (FLAGS_restart_round == 1) {
-    if (output_data != nullptr) {
-      std::string
-          output_name = FLAGS_output_file + "_" + FormatName(FLAGS_output_node);
-      std::ofstream out_file(output_name, std::ios::binary);
-      out_file.write((const char *) (output_data.get()),
-                     output_size * sizeof(float));
-      out_file.flush();
-      out_file.close();
-      LOG(INFO) << "Write output file "
-                << output_name
-                << " with size " << output_size
-                << " done.";
-    } else {
-      LOG(INFO) << "Output data is null";
-    }
-  }
-  return true;
-}
-
-bool MultipleInputOrOutput(
-    const std::vector<std::string> &input_names,
-    const std::vector<std::vector<int64_t>> &input_shapes,
-    const std::vector<std::string> &output_names,
-    const std::vector<std::vector<int64_t>> &output_shapes) {
+bool RunModel(const std::vector<std::string> &input_names,
+              const std::vector<std::vector<int64_t>> &input_shapes,
+              const std::vector<std::string> &output_names,
+              const std::vector<std::vector<int64_t>> &output_shapes) {
@@ -312,42 +220,42 @@ bool MultipleInputOrOutput(
   const size_t input_count = input_names.size();
   const size_t output_count = output_names.size();
 
-  std::vector<mace::MaceInputInfo> input_infos(input_count);
-  std::map<std::string, float *> outputs;
-  std::vector<std::unique_ptr<float[]>> input_datas(input_count);
+  std::map<std::string, mace::MaceTensor> inputs;
+  std::map<std::string, mace::MaceTensor> outputs;
   for (size_t i = 0; i < input_count; ++i) {
     // Allocate input and output
     int64_t input_size =
         std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
-    input_datas[i].reset(new float[input_size]);
+    auto buffer_in = std::shared_ptr<float>(new float[input_size],
+                                            std::default_delete<float[]>());
    // load input
    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                          std::ios::in | std::ios::binary);
    if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(input_datas[i].get()),
+      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
                    input_size * sizeof(float));
      in_file.close();
    } else {
      LOG(INFO) << "Open input file failed";
      return -1;
    }
-    input_infos[i].name = input_names[i];
-    input_infos[i].shape = input_shapes[i];
-    input_infos[i].data = input_datas[i].get();
+    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
  }
 
-  std::vector<std::unique_ptr<float[]>> output_datas(output_count);
  for (size_t i = 0; i < output_count; ++i) {
    int64_t output_size =
        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                        std::multiplies<int64_t>());
-    output_datas[i].reset(new float[output_size]);
-    outputs[output_names[i]] = output_datas[i].get();
+    auto buffer_out = std::shared_ptr<float>(new float[output_size],
+                                             std::default_delete<float[]>());
+    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
  }
 
  LOG(INFO) << "Warm up run";
  t0 = NowMicros();
-  engine.Run(input_infos, outputs);
+  engine.Run(inputs, &outputs);
  t1 = NowMicros();
  LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
@@ -356,7 +264,7 @@ bool MultipleInputOrOutput(
    t0 = NowMicros();
    struct mallinfo prev = mallinfo();
    for (int i = 0; i < FLAGS_round; ++i) {
-      engine.Run(input_infos, outputs);
+      engine.Run(inputs, &outputs);
      if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
        LOG(INFO) << "=== check malloc info change #" << i << " ===";
        prev = LogMallinfoChange(prev);
@@ -367,20 +275,19 @@ bool MultipleInputOrOutput(
  }
 
  for (size_t i = 0; i < output_count; ++i) {
-    std::string output_name = FLAGS_output_file + "_"
-        + FormatName(output_names[i]);
+    std::string output_name =
+        FLAGS_output_file + "_" + FormatName(output_names[i]);
    std::ofstream out_file(output_name, std::ios::binary);
    int64_t output_size =
        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                        std::multiplies<int64_t>());
-    out_file.write((const char *) outputs[output_names[i]],
-                   output_size * sizeof(float));
+    out_file.write(
+        reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
+        output_size * sizeof(float));
    out_file.flush();
    out_file.close();
-    LOG(INFO) << "Write output file "
-              << output_name
-              << " with size " << output_size
-              << " done.";
+    LOG(INFO) << "Write output file " << output_name << " with size "
+              << output_size << " done.";
  }
 
  return true;
@@ -391,7 +298,6 @@ int Main(int argc, char **argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   LOG(INFO) << "mace version: " << MaceVersion();
-  LOG(INFO) << "mace git version: " << MaceGitVersion();
   LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
   LOG(INFO) << "input node: " << FLAGS_input_node;
   LOG(INFO) << "input shape: " << FLAGS_input_shape;
@@ -431,14 +337,8 @@ int Main(int argc, char **argv) {
 #pragma omp parallel for
   for (int i = 0; i < FLAGS_restart_round; ++i) {
     VLOG(0) << "restart round " << i;
-    if (input_count == 1 && output_count == 1) {
-      ret = SingleInputAndOutput(input_shape_vec[0], output_shape_vec[0]);
-    } else {
-      ret = MultipleInputOrOutput(input_names,
-                                  input_shape_vec,
-                                  output_names,
-                                  output_shape_vec);
-    }
+    ret =
+        RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
   }
   if (ret) {
     return 0;
......
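One detail worth noting in the new RunModel above (the sketch below is illustrative, not code from the commit): the buffers are allocated with new[] but held in std::shared_ptr<float>, so std::default_delete<float[]> must be supplied as the deleter; otherwise the shared_ptr would call scalar delete on array storage, which is undefined behavior.

#include <cstdint>
#include <memory>

std::shared_ptr<float> MakeFloatBuffer(int64_t size) {
  // new[] must be paired with delete[], hence the explicit array deleter.
  return std::shared_ptr<float>(new float[size],
                                std::default_delete<float[]>());
}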
@@ -95,7 +95,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
         static_cast<uint32_t>(buffer->buffer_offset() /
                               GetEnumTypeSize(buffer->dtype())));
   }
-  if (type == ARGUMENT) {
+  if (type == CONV2D_FILTER) {
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+  } else if (type == ARGUMENT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
   } else if (type == WEIGHT_HEIGHT || type == WEIGHT_WIDTH) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
......
@@ -2,6 +2,7 @@
 __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
+                                     __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
@@ -22,16 +23,18 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
   }
 #endif
 
-  const int out_channel_idx = h * 4;
-  const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
-  const int hw_idx = w / rounded_in_channel;
-  const int in_channel_idx = w % rounded_in_channel;
+  const int in_channel_idx = w;
+  const int hw_size = filter_w * filter_h;
+  const int out_channel_idx = h / hw_size * 4;
+  const int hw_idx = h % hw_size;
   const int h_idx = hw_idx / filter_w;
   const int w_idx = hw_idx % filter_w;
-  const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+  const int offset = input_offset
+                     + ((h_idx * filter_w + w_idx) * out_channel
+                     + out_channel_idx) * in_channel
                      + in_channel_idx;
 
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
+  DATA_TYPE4 values = 0;
   if (out_channel_idx < out_channel) {
     const int size = out_channel - out_channel_idx;
     if (size < 4) {
@@ -52,10 +55,11 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
   }
 
   int2 coord = (int2)(w, h);
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
+  WRITE_IMAGET(output, coord, values);
 }
 
 __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+                                     __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
@@ -76,18 +80,19 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
   }
 #endif
 
-  const int out_channel_idx = h * 4;
-  const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
-  const int hw_idx = w / rounded_in_channel;
-  const int in_channel_idx = w % rounded_in_channel;
+  const int in_channel_idx = w;
+  const int hw_size = filter_w * filter_h;
+  const int out_channel_idx = h / hw_size * 4;
+  const int hw_idx = h % hw_size;
   const int h_idx = hw_idx / filter_w;
   const int w_idx = hw_idx % filter_w;
-  const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+  const int offset = ((h_idx * filter_w + w_idx) * out_channel
+                     + out_channel_idx) * in_channel
                      + in_channel_idx;
 
   if (out_channel_idx < out_channel) {
     int2 coord = (int2)(w, h);
-    VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
+    DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
     const int size = (out_channel - out_channel_idx);
     if (size < 4) {
       switch (size) {
@@ -200,7 +205,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                      + channel_idx;
 
   const int size = channels - channel_idx;
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
+  DATA_TYPE4 values = 0;
   if (size < 4) {
     switch(size) {
       case 3:
@@ -214,7 +219,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
     values = vload4(0, input + offset);
   }
   int2 coord = (int2)(w, h);
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
+  WRITE_IMAGET(output, coord, values);
 }
 
 __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
@@ -246,7 +251,7 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
                      + channel_idx;
 
   int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
+  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
   const int size = channels - channel_idx;
   if (size < 4) {
     switch (size) {
@@ -286,7 +291,7 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   const int size = count - w * 4;
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
+  DATA_TYPE4 values = 0;
   if (size < 4) {
     switch(size) {
       case 3:
@@ -300,7 +305,7 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
     values = vload4(0, input + offset);
   }
   int2 coord = (int2)(w, h);
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
+  WRITE_IMAGET(output, coord, values);
 }
 
 __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
@@ -325,7 +330,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   const int offset = w * 4;
   int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
+  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
   const int size = count - offset;
   if (size < 4) {
     switch (size) {
......
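The DATA_TYPE4 / READ_IMAGET / WRITE_IMAGET spellings are shorthands that, judging from the one-for-one textual replacement in this hunk, are presumably defined in <common.h> roughly as follows (my assumption; the definitions are not shown in this diff):

#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)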
 #include <common.h>
 
 __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                      __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
+                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
                       __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -41,8 +41,6 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int out_w_blks = get_global_size(1);
 #endif
 
-  const int rounded_in_ch = in_ch_blks << 2;
-
 #ifdef BIAS
   DATA_TYPE4 out0 =
       READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
@@ -64,21 +62,21 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
   const int batch_idx = mul24((out_hb / out_height), in_height);
-  const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
+  const int filter_hw = mul24(filter_width, filter_height);
 
   DATA_TYPE4 in0, in1, in2, in3;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
     const int in_idx = mul24(in_ch_blk, in_width);
-    int filter_x_part0 = in_ch_blk << 2;
+    int filter_x_idx = in_ch_blk << 2;
+    int filter_y_idx = mul24(out_ch_blk, filter_hw);
     for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-      // TODO(heliangliang) optimize out these muls
       int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
                            (in_hb_value < 0 || in_hb_value >= in_height));
-      int filter_x_part1 = 0;
+#pragma unroll
       for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
         int in_width_value;
 #define READ_INPUT(i) \
@@ -96,11 +94,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #undef READ_INPUT
 
         // int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2);
-        int filter_idx = filter_x_part0 + filter_x_part1;
-        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 0, filter_y_idx));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 1, filter_y_idx));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 2, filter_y_idx));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 3, filter_y_idx));
 
         out0 = mad(in0.x, weights0, out0);
         out0 = mad(in0.y, weights1, out0);
@@ -123,9 +120,8 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
         out3 = mad(in3.z, weights2, out3);
         out3 = mad(in3.w, weights3, out3);
 
-        filter_x_part1 += rounded_in_ch;
+        filter_y_idx += 1;
       }
-      filter_x_part0 += rounded_in_ch_x_filter_width;
     }
   }
......
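To summarize the new filter layout used by these kernels (a host-side sketch of my reading of the indexing above; the helper is illustrative, not part of the commit): texel (x, y) of the filter image holds the weights of four consecutive output channels for input channel x at one kernel position, with y enumerating kernel positions within each output-channel block.

#include <tuple>

// Maps HWOI weight element (kh, kw, oc, ic) to (x, y, rgba_lane) in the
// filter image: x walks input channels, y = (oc / 4) * KH * KW + kh * KW + kw,
// and the float4 lane holds oc % 4.
std::tuple<int, int, int> FilterImageCoord(int kh, int kw, int oc, int ic,
                                           int filter_h, int filter_w) {
  const int x = ic;
  const int y = (oc / 4) * (filter_h * filter_w) + kh * filter_w + kw;
  const int lane = oc % 4;  // position within the float4 texel
  return std::make_tuple(x, y, lane);
}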
 #include <common.h>
 
 __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                          __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
+                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -39,8 +39,6 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_w_blks = get_global_size(1);
 #endif
 
-  const int rounded_in_ch = in_ch_blks << 2;
-
 #ifdef BIAS
   DATA_TYPE4 out0 =
       READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
@@ -65,19 +63,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
   const int batch_idx = mul24((out_hb / out_height), in_height);
-  const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch;
 
   DATA_TYPE4 in0, in1, in2, in3, in4;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
     const int in_idx = mul24(in_ch_blk, in_width);
-    int filter_x_part0 = in_ch_blk << 2;
+    int filter_x_idx = in_ch_blk << 2;
+    int filter_y_idx = mul24(out_ch_blk, 9);
     int in_hb_idx = height_idx;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
       int in_hb_value = select(in_hb_idx + batch_idx,
                                -1,
                                (in_hb_idx < 0 || in_hb_idx >= in_height));
-      int filter_x_part1 = 0;
       int in_width_idx = 0;
       for (short width_idx = 0; width_idx < 3; ++width_idx) {
         int in_width_value;
@@ -97,11 +94,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #undef READ_INPUT
 
         // int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2);
-        int filter_idx = filter_x_part0 + filter_x_part1;
-        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 0, filter_y_idx));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 1, filter_y_idx));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 2, filter_y_idx));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 3, filter_y_idx));
 
         out0 = mad(in0.x, weights0, out0);
         out0 = mad(in0.y, weights1, out0);
@@ -129,10 +125,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
         out4 = mad(in4.z, weights2, out4);
         out4 = mad(in4.w, weights3, out4);
 
-        filter_x_part1 += rounded_in_ch;
         in_width_idx += dilation_w;
+        filter_y_idx += 1;
       }
-      filter_x_part0 += rounded_in_ch_x_3;
       in_hb_idx += dilation_h;
     }
   }
......
@@ -23,13 +23,13 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
   (*image_shape)[1] = shape[0] * shape[1];
 }
 
-// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
+// [RoundUp<4>(Ic), H * W * (Oc + 3) / 4]
 void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
                                std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape->resize(2);
-  (*image_shape)[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
-  (*image_shape)[1] = RoundUpDiv4(shape[2]);
+  (*image_shape)[0] = RoundUp<index_t>(shape[3], 4);
+  (*image_shape)[1] = shape[0] * shape[1] * RoundUpDiv4(shape[2]);
 }
 
 // [H * W * M, (Ic + 3) / 4]
......
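A worked example of the new image shape (standalone sketch, not from the commit): for an HWOI filter of [3, 3, 32, 64], the image is now [RoundUp<4>(Ic), H * W * RoundUpDiv4(Oc)] = [64, 3 * 3 * 8] = [64, 72], whereas the old layout produced [3 * 3 * 64, 8] = [576, 8].

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> hwoi = {3, 3, 32, 64};  // H, W, Oc, Ic
  const size_t width = (hwoi[3] + 3) / 4 * 4;        // RoundUp<4>(Ic) = 64
  const size_t height = hwoi[0] * hwoi[1] * ((hwoi[2] + 3) / 4);  // 3*3*8 = 72
  std::cout << "[" << width << ", " << height << "]" << std::endl;
  return 0;
}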
@@ -114,6 +114,7 @@ static void Conv2d(int iters,
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
 
+BM_CONV_2D(1, 256, 64, 64, 3, 3, 1, 1, VALID, 256);
 BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, 1, VALID, 1024);
@@ -135,6 +136,8 @@ BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, 1, SAME, 128);
 BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
 BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
+BM_CONV_2D(1, 1024, 16, 16, 15, 1, 1, 1, SAME, 2);
 
 // Dilation
 BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
 BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
......
@@ -7,11 +7,11 @@ package(
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
-
 cc_library(
     name = "public",
     hdrs = [
         "mace.h",
+        "mace_runtime.h",
+        "mace_types.h",
     ],
 )
@@ -2,6 +2,9 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+// This file defines core MACE APIs.
+// These APIs will be stable and backward compatible.
+
 #ifndef MACE_PUBLIC_MACE_H_
 #define MACE_PUBLIC_MACE_H_
 
@@ -13,415 +16,60 @@
 namespace mace {
-#define MACE_MAJOR_VERSION 0
-#define MACE_MINOR_VERSION 1
-#define MACE_PATCH_VERSION 0
-
-// MACE_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
-// "-beta", "-rc", "-rc.1")
-#define MACE_VERSION_SUFFIX ""
-
-#define MACE_STR_HELPER(x) #x
-#define MACE_STR(x) MACE_STR_HELPER(x)
-
-// e.g. "0.5.0" or "0.6.0-alpha".
-#define MACE_VERSION_STRING \
-  (MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \
-      MACE_PATCH_VERSION) MACE_VERSION_SUFFIX)
-
-inline const char *MaceVersion() { return MACE_VERSION_STRING; }
-extern const char *MaceGitVersion();
-
-// Disable the copy and assignment operator for a class.
-#ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname) \
- private:                                  \
-  classname(const classname &) = delete;   \
-  classname &operator=(const classname &) = delete
-#endif
-
-enum NetMode { INIT = 0, NORMAL = 1 };
+const char *MaceVersion();
 
 enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
 
-enum DataType {
-  DT_INVALID = 0,
-  DT_FLOAT = 1,
-  DT_DOUBLE = 2,
-  DT_INT32 = 3,
-  DT_UINT8 = 4,
-  DT_INT16 = 5,
-  DT_INT8 = 6,
-  DT_STRING = 7,
-  DT_INT64 = 8,
-  DT_UINT16 = 9,
-  DT_BOOL = 10,
-  DT_HALF = 19,
-  DT_UINT32 = 22
-};
-
-enum GPUPerfHint {
-  PERF_DEFAULT = 0,
-  PERF_LOW = 1,
-  PERF_NORMAL = 2,
-  PERF_HIGH = 3
-};
-
-enum GPUPriorityHint {
-  PRIORITY_DEFAULT = 0,
-  PRIORITY_LOW = 1,
-  PRIORITY_NORMAL = 2,
-  PRIORITY_HIGH = 3
-};
-
-enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2};
+enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };
-class ConstTensor {
- public:
-  ConstTensor(const std::string &name,
-              const unsigned char *data,
-              const std::vector<int64_t> &dims,
-              const DataType data_type = DT_FLOAT,
-              uint32_t node_id = 0);
-  ConstTensor(const std::string &name,
-              const unsigned char *data,
-              const std::vector<int64_t> &dims,
-              const int data_type,
-              uint32_t node_id = 0);
-
-  const std::string &name() const;
-  const unsigned char *data() const;
-  int64_t data_size() const;
-  const std::vector<int64_t> &dims() const;
-  DataType data_type() const;
-  uint32_t node_id() const;
-
- private:
-  const std::string name_;
-  const unsigned char *data_;
-  const int64_t data_size_;
-  const std::vector<int64_t> dims_;
-  const DataType data_type_;
-  const uint32_t node_id_;
-};
-
-class Argument {
- public:
-  Argument();
-  void CopyFrom(const Argument &from);
-
- public:
-  const std::string &name() const;
-  void set_name(const std::string &value);
-  bool has_f() const;
-  float f() const;
-  void set_f(float value);
-  bool has_i() const;
-  int64_t i() const;
-  void set_i(int64_t value);
-  bool has_s() const;
-  std::string s() const;
-  void set_s(const std::string &value);
-  const std::vector<float> &floats() const;
-  void add_floats(float value);
-  void set_floats(const std::vector<float> &value);
-  const std::vector<int64_t> &ints() const;
-  void add_ints(int64_t value);
-  void set_ints(const std::vector<int64_t> &value);
-  const std::vector<std::string> &strings() const;
-  void add_strings(const ::std::string &value);
-  void set_strings(const std::vector<std::string> &value);
-
- private:
-  void set_has_f();
-  void set_has_i();
-  void set_has_s();
-
- private:
-  std::string name_;
-  float f_;
-  int64_t i_;
-  std::string s_;
-  std::vector<float> floats_;
-  std::vector<int64_t> ints_;
-  std::vector<std::string> strings_;
-  uint32_t has_bits_;
-};
-
-class NodeInput {
- public:
-  NodeInput() {}
-  NodeInput(int node_id, int output_port);
-  void CopyFrom(const NodeInput &from);
-
- public:
-  int node_id() const;
-  void set_node_id(int node_id);
-  int output_port() const;
-  void set_output_port(int output_port);
-
- private:
-  int node_id_;
-  int output_port_;
-};
-
-class OutputShape {
- public:
-  OutputShape();
-  OutputShape(const std::vector<int64_t> &dims);  // NOLINT(runtime/explicit)
-  void CopyFrom(const OutputShape &from);
-
- public:
-  const std::vector<int64_t> &dims() const;
-
- private:
-  std::vector<int64_t> dims_;
-};
-
-class OperatorDef {
- public:
-  void CopyFrom(const OperatorDef &from);
-
- public:
-  const std::string &name() const;
-  void set_name(const std::string &name_);
-  bool has_name() const;
-  const std::string &type() const;
-  void set_type(const std::string &type_);
-  bool has_type() const;
-  const std::vector<int> &mem_id() const;
-  void set_mem_id(const std::vector<int> &value);
-  uint32_t node_id() const;
-  void set_node_id(uint32_t node_id);
-  uint32_t op_id() const;
-  uint32_t padding() const;
-  void set_padding(uint32_t padding);
-  const std::vector<NodeInput> &node_input() const;
-  void add_node_input(const NodeInput &value);
-  const std::vector<int> &out_max_byte_size() const;
-  void add_out_max_byte_size(int value);
-  const std::vector<std::string> &input() const;
-  const std::string &input(int index) const;
-  std::string *add_input();
-  void add_input(const ::std::string &value);
-  void add_input(::std::string &&value);
-  void set_input(const std::vector<std::string> &value);
-  const std::vector<std::string> &output() const;
-  const std::string &output(int index) const;
-  std::string *add_output();
-  void add_output(const ::std::string &value);
-  void add_output(::std::string &&value);
-  void set_output(const std::vector<std::string> &value);
-  const std::vector<Argument> &arg() const;
-  Argument *add_arg();
-  const std::vector<OutputShape> &output_shape() const;
-  void add_output_shape(const OutputShape &value);
-  const std::vector<DataType> &output_type() const;
-  void set_output_type(const std::vector<DataType> &value);
-
- private:
-  void set_has_name();
-  void set_has_type();
-  void set_has_mem_id();
-
- private:
-  std::string name_;
-  std::string type_;
-  std::vector<std::string> input_;
-  std::vector<std::string> output_;
-  std::vector<Argument> arg_;
-  std::vector<OutputShape> output_shape_;
-  std::vector<DataType> output_type_;
-  std::vector<int> mem_id_;
-  // nnlib
-  uint32_t node_id_;
-  uint32_t op_id_;
-  uint32_t padding_;
-  std::vector<NodeInput> node_input_;
-  std::vector<int> out_max_byte_size_;
-  uint32_t has_bits_;
-};
-
-class MemoryBlock {
- public:
-  MemoryBlock(int mem_id, uint32_t x, uint32_t y);
-
- public:
-  int mem_id() const;
-  uint32_t x() const;
-  uint32_t y() const;
-
- private:
-  int mem_id_;
-  uint32_t x_;
-  uint32_t y_;
-};
-
-class MemoryArena {
- public:
-  const std::vector<MemoryBlock> &mem_block() const;
-  std::vector<MemoryBlock> &mutable_mem_block();
-  int mem_block_size() const;
-
- private:
-  std::vector<MemoryBlock> mem_block_;
-};
-// for hexagon mace-nnlib
-class InputInfo {
- public:
-  const std::string &name() const;
-  int32_t node_id() const;
-  int32_t max_byte_size() const;
-  DataType data_type() const;
-  const std::vector<int32_t> &dims() const;
-
- private:
-  std::string name_;
-  int32_t node_id_;
-  int32_t max_byte_size_;  // only support 32-bit len
-  DataType data_type_;
-  std::vector<int32_t> dims_;
-};
-
-class OutputInfo {
- public:
-  const std::string &name() const;
-  int32_t node_id() const;
-  int32_t max_byte_size() const;
-  DataType data_type() const;
-  void set_data_type(DataType data_type);
-  const std::vector<int32_t> &dims() const;
-  void set_dims(const std::vector<int32_t> &dims);
-
- private:
-  std::string name_;
-  int32_t node_id_;
-  int32_t max_byte_size_;  // only support 32-bit len
-  DataType data_type_;
-  std::vector<int32_t> dims_;
-};
+// MACE input/output tensor
+class MaceTensor {
+ public:
+  // shape - the shape of the tensor, with size n
+  // data - the buffer of the tensor, must not be null with size equals
+  //        shape[0] * shape[1] * ... * shape[n-1]
+  explicit MaceTensor(const std::vector<int64_t> &shape,
+                      std::shared_ptr<float> data);
+  MaceTensor();
+  MaceTensor(const MaceTensor &other);
+  MaceTensor(const MaceTensor &&other);
+  MaceTensor &operator=(const MaceTensor &other);
+  MaceTensor &operator=(const MaceTensor &&other);
+  ~MaceTensor();
+
+  const std::vector<int64_t> &shape() const;
+  const std::shared_ptr<float> data() const;
+  std::shared_ptr<float> data();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
-class NetDef {
- public:
-  NetDef();
-  int op_size() const;
-  const OperatorDef &op(const int idx) const;
-
- public:
-  const std::string &name() const;
-  bool has_name() const;
-  void set_name(const std::string &value);
-  const std::string &version() const;
-  bool has_version() const;
-  void set_version(const std::string &value);
-  const std::vector<OperatorDef> &op() const;
-  OperatorDef *add_op();
-  std::vector<OperatorDef> &mutable_op();
-  const std::vector<Argument> &arg() const;
-  Argument *add_arg();
-  std::vector<Argument> &mutable_arg();
-  const std::vector<ConstTensor> &tensors() const;
-  std::vector<ConstTensor> &mutable_tensors();
-  const MemoryArena &mem_arena() const;
-  bool has_mem_arena() const;
-  MemoryArena &mutable_mem_arena();
-  const std::vector<InputInfo> &input_info() const;
-  const std::vector<OutputInfo> &output_info() const;
-  std::vector<OutputInfo> &mutable_output_info();
-
- private:
-  void set_has_name();
-  void set_has_version();
-  void set_has_mem_arena();
-
- private:
-  std::string name_;
-  std::string version_;
-  std::vector<OperatorDef> op_;
-  std::vector<Argument> arg_;
-  std::vector<ConstTensor> tensors_;
-  // for mem optimization
-  MemoryArena mem_arena_;
-  // for hexagon mace-nnlib
-  std::vector<InputInfo> input_info_;
-  std::vector<OutputInfo> output_info_;
-  uint32_t has_bits_;
-};
-
-struct CallStats {
-  int64_t start_micros;
-  int64_t end_micros;
-};
-
-struct OperatorStats {
-  std::string operator_name;
-  std::string type;
-  CallStats stats;
-};
-
-struct RunMetadata {
-  std::vector<OperatorStats> op_stats;
-};
-
-class Workspace;
-class NetBase;
-class OperatorRegistry;
-class HexagonControlWrapper;
-
-struct MaceInputInfo {
-  std::string name;
-  std::vector<int64_t> shape;
-  const float *data;
-};
-
-void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
-void ConfigOmpThreadsAndAffinity(int omp_num_threads,
-                                 CPUPowerOption power_option);
+class NetDef;
+class RunMetadata;
class MaceEngine { class MaceEngine {
public: public:
// Single input and output
explicit MaceEngine(const NetDef *net_def, DeviceType device_type);
// Multiple input or output
explicit MaceEngine(const NetDef *net_def, explicit MaceEngine(const NetDef *net_def,
DeviceType device_type, DeviceType device_type,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes); const std::vector<std::string> &output_nodes);
~MaceEngine(); ~MaceEngine();
// Single input and output
bool Run(const float *input, MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
const std::vector<int64_t> &input_shape, std::map<std::string, MaceTensor> *outputs);
float *output);
// Single input and output for benchmark MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
bool Run(const float *input, std::map<std::string, MaceTensor> *outputs,
const std::vector<int64_t> &input_shape,
float *output,
RunMetadata *run_metadata); RunMetadata *run_metadata);
// Multiple input or output
bool Run(
const std::vector<MaceInputInfo> &input,
std::map<std::string, float *> &output, // NOLINT(runtime/references)
RunMetadata *run_metadata = nullptr);
MaceEngine(const MaceEngine &) = delete;
MaceEngine &operator=(const MaceEngine &) = delete;
private: private:
std::shared_ptr<OperatorRegistry> op_registry_; class Impl;
DeviceType device_type_; std::unique_ptr<Impl> impl_;
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; MaceEngine(const MaceEngine &) = delete;
std::unique_ptr<HexagonControlWrapper> hexagon_controller_; MaceEngine &operator=(const MaceEngine &) = delete;
}; };
} // namespace mace } // namespace mace
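The reworked engine interface above replaces the old positional float-buffer Run overloads with name-keyed MaceTensor maps. A minimal usage sketch follows; the model tag mobilenet, the node names, the shapes, and the OPENCL device value are illustrative assumptions, not values taken from this diff:

#include <map>
#include <memory>
#include <string>
#include <vector>

#include "mace/public/mace.h"
// Hypothetical generated header for a model converted with tag "mobilenet".
#include "mace/codegen/models/mobilenet/mobilenet.h"

int main() {
  // Load the converter-generated weights and build the graph definition.
  const unsigned char *model_data =
      mace::mobilenet::LoadModelData("/path/to/mobilenet.data");
  mace::NetDef net_def = mace::mobilenet::CreateNet(model_data);

  // One engine per device; node names must match the converter flags.
  mace::MaceEngine engine(&net_def, mace::DeviceType::OPENCL,
                          {"input_node"}, {"output_node"});

  // Inputs and outputs travel as name -> MaceTensor maps.
  std::vector<int64_t> in_shape{1, 224, 224, 3};
  std::vector<int64_t> out_shape{1, 1, 1, 1000};
  auto in_buf = std::shared_ptr<float>(new float[1 * 224 * 224 * 3](),
                                       std::default_delete<float[]>());
  auto out_buf = std::shared_ptr<float>(new float[1 * 1 * 1 * 1000](),
                                        std::default_delete<float[]>());
  std::map<std::string, mace::MaceTensor> inputs = {
      {"input_node", mace::MaceTensor(in_shape, in_buf)}};
  std::map<std::string, mace::MaceTensor> outputs = {
      {"output_node", mace::MaceTensor(out_shape, out_buf)}};

  engine.Run(inputs, &outputs);
  return 0;
}

Keying tensors by node name is what lets a single Run signature cover multi-input, multi-output graphs that previously needed a separate overload.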
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This file defines runtime tuning APIs.
// These APIs are not stable.
#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
#define MACE_PUBLIC_MACE_RUNTIME_H_
namespace mace {
enum GPUPerfHint {
PERF_DEFAULT = 0,
PERF_LOW = 1,
PERF_NORMAL = 2,
PERF_HIGH = 3
};
enum GPUPriorityHint {
PRIORITY_DEFAULT = 0,
PRIORITY_LOW = 1,
PRIORITY_NORMAL = 2,
PRIORITY_HIGH = 3
};
enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 };
void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
void ConfigOmpThreadsAndAffinity(int omp_num_threads,
CPUPowerOption power_option);
} // namespace mace
#endif // MACE_PUBLIC_MACE_RUNTIME_H_
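Both hooks configure process-wide state, so they are typically invoked once, before the first MaceEngine is constructed. A short sketch; the particular hint values below are examples, not recommendations:

#include "mace/public/mace_runtime.h"

// Call before creating any MaceEngine; both settings are process-wide.
void ConfigureRuntimes() {
  // Request high GPU performance at normal queue priority.
  mace::ConfigOpenCLRuntime(mace::PERF_HIGH, mace::PRIORITY_NORMAL);
  // Run OpenMP with four threads pinned to high-performance cores.
  mace::ConfigOmpThreadsAndAffinity(4, mace::HIGH_PERFORMANCE);
}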
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This file defines data types used by net creation and benchmark tools.
// These APIs are not stable and should only be used by advanced users.
#ifndef MACE_PUBLIC_MACE_TYPES_H_
#define MACE_PUBLIC_MACE_TYPES_H_
#include <string>
#include <vector>
namespace mace {
// Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN
#define DISABLE_COPY_AND_ASSIGN(classname) \
private: \
classname(const classname &) = delete; \
classname &operator=(const classname &) = delete
#endif
enum NetMode { INIT = 0, NORMAL = 1 };
enum DataType {
DT_INVALID = 0,
DT_FLOAT = 1,
DT_DOUBLE = 2,
DT_INT32 = 3,
DT_UINT8 = 4,
DT_INT16 = 5,
DT_INT8 = 6,
DT_STRING = 7,
DT_INT64 = 8,
DT_UINT16 = 9,
DT_BOOL = 10,
DT_HALF = 19,
DT_UINT32 = 22
};
class ConstTensor {
public:
ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const DataType data_type = DT_FLOAT,
uint32_t node_id = 0);
ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const int data_type,
uint32_t node_id = 0);
const std::string &name() const;
const unsigned char *data() const;
int64_t data_size() const;
const std::vector<int64_t> &dims() const;
DataType data_type() const;
uint32_t node_id() const;
private:
const std::string name_;
const unsigned char *data_;
const int64_t data_size_;
const std::vector<int64_t> dims_;
const DataType data_type_;
const uint32_t node_id_;
};
class Argument {
public:
Argument();
void CopyFrom(const Argument &from);
public:
const std::string &name() const;
void set_name(const std::string &value);
bool has_f() const;
float f() const;
void set_f(float value);
bool has_i() const;
int64_t i() const;
void set_i(int64_t value);
bool has_s() const;
std::string s() const;
void set_s(const std::string &value);
const std::vector<float> &floats() const;
void add_floats(float value);
void set_floats(const std::vector<float> &value);
const std::vector<int64_t> &ints() const;
void add_ints(int64_t value);
void set_ints(const std::vector<int64_t> &value);
const std::vector<std::string> &strings() const;
void add_strings(const ::std::string &value);
void set_strings(const std::vector<std::string> &value);
private:
void set_has_f();
void set_has_i();
void set_has_s();
private:
std::string name_;
float f_;
int64_t i_;
std::string s_;
std::vector<float> floats_;
std::vector<int64_t> ints_;
std::vector<std::string> strings_;
uint32_t has_bits_;
};
class NodeInput {
public:
NodeInput() {}
NodeInput(int node_id, int output_port);
void CopyFrom(const NodeInput &from);
public:
int node_id() const;
void set_node_id(int node_id);
int output_port() const;
void set_output_port(int output_port);
private:
int node_id_;
int output_port_;
};
class OutputShape {
public:
OutputShape();
OutputShape(const std::vector<int64_t> &dims); // NOLINT(runtime/explicit)
void CopyFrom(const OutputShape &from);
public:
const std::vector<int64_t> &dims() const;
private:
std::vector<int64_t> dims_;
};
class OperatorDef {
public:
void CopyFrom(const OperatorDef &from);
public:
const std::string &name() const;
void set_name(const std::string &name_);
bool has_name() const;
const std::string &type() const;
void set_type(const std::string &type_);
bool has_type() const;
const std::vector<int> &mem_id() const;
void set_mem_id(const std::vector<int> &value);
uint32_t node_id() const;
void set_node_id(uint32_t node_id);
uint32_t op_id() const;
uint32_t padding() const;
void set_padding(uint32_t padding);
const std::vector<NodeInput> &node_input() const;
void add_node_input(const NodeInput &value);
const std::vector<int> &out_max_byte_size() const;
void add_out_max_byte_size(int value);
const std::vector<std::string> &input() const;
const std::string &input(int index) const;
std::string *add_input();
void add_input(const ::std::string &value);
void add_input(::std::string &&value);
void set_input(const std::vector<std::string> &value);
const std::vector<std::string> &output() const;
const std::string &output(int index) const;
std::string *add_output();
void add_output(const ::std::string &value);
void add_output(::std::string &&value);
void set_output(const std::vector<std::string> &value);
const std::vector<Argument> &arg() const;
Argument *add_arg();
const std::vector<OutputShape> &output_shape() const;
void add_output_shape(const OutputShape &value);
const std::vector<DataType> &output_type() const;
void set_output_type(const std::vector<DataType> &value);
private:
void set_has_name();
void set_has_type();
void set_has_mem_id();
private:
std::string name_;
std::string type_;
std::vector<std::string> input_;
std::vector<std::string> output_;
std::vector<Argument> arg_;
std::vector<OutputShape> output_shape_;
std::vector<DataType> output_type_;
std::vector<int> mem_id_;
// nnlib
uint32_t node_id_;
uint32_t op_id_;
uint32_t padding_;
std::vector<NodeInput> node_input_;
std::vector<int> out_max_byte_size_;
uint32_t has_bits_;
};
class MemoryBlock {
public:
MemoryBlock(int mem_id, uint32_t x, uint32_t y);
public:
int mem_id() const;
uint32_t x() const;
uint32_t y() const;
private:
int mem_id_;
uint32_t x_;
uint32_t y_;
};
class MemoryArena {
public:
const std::vector<MemoryBlock> &mem_block() const;
std::vector<MemoryBlock> &mutable_mem_block();
int mem_block_size() const;
private:
std::vector<MemoryBlock> mem_block_;
};
// for hexagon mace-nnlib
class InputInfo {
public:
const std::string &name() const;
int32_t node_id() const;
int32_t max_byte_size() const;
DataType data_type() const;
const std::vector<int32_t> &dims() const;
private:
std::string name_;
int32_t node_id_;
int32_t max_byte_size_; // only support 32-bit len
DataType data_type_;
std::vector<int32_t> dims_;
};
class OutputInfo {
public:
const std::string &name() const;
int32_t node_id() const;
int32_t max_byte_size() const;
DataType data_type() const;
void set_data_type(DataType data_type);
const std::vector<int32_t> &dims() const;
void set_dims(const std::vector<int32_t> &dims);
private:
std::string name_;
int32_t node_id_;
int32_t max_byte_size_; // only support 32-bit len
DataType data_type_;
std::vector<int32_t> dims_;
};
class NetDef {
public:
NetDef();
int op_size() const;
const OperatorDef &op(const int idx) const;
public:
const std::string &name() const;
bool has_name() const;
void set_name(const std::string &value);
const std::string &version() const;
bool has_version() const;
void set_version(const std::string &value);
const std::vector<OperatorDef> &op() const;
OperatorDef *add_op();
std::vector<OperatorDef> &mutable_op();
const std::vector<Argument> &arg() const;
Argument *add_arg();
std::vector<Argument> &mutable_arg();
const std::vector<ConstTensor> &tensors() const;
std::vector<ConstTensor> &mutable_tensors();
const MemoryArena &mem_arena() const;
bool has_mem_arena() const;
MemoryArena &mutable_mem_arena();
const std::vector<InputInfo> &input_info() const;
const std::vector<OutputInfo> &output_info() const;
std::vector<OutputInfo> &mutable_output_info();
private:
void set_has_name();
void set_has_version();
void set_has_mem_arena();
private:
std::string name_;
std::string version_;
std::vector<OperatorDef> op_;
std::vector<Argument> arg_;
std::vector<ConstTensor> tensors_;
// for mem optimization
MemoryArena mem_arena_;
// for hexagon mace-nnlib
std::vector<InputInfo> input_info_;
std::vector<OutputInfo> output_info_;
uint32_t has_bits_;
};
struct CallStats {
int64_t start_micros;
int64_t end_micros;
};
struct OperatorStats {
std::string operator_name;
std::string type;
CallStats stats;
};
struct RunMetadata {
std::vector<OperatorStats> op_stats;
};
} // namespace mace
#endif // MACE_PUBLIC_MACE_TYPES_H_
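These classes mirror the protobuf messages the converters emit, which lets generated model code build a NetDef without a protobuf dependency. A hand-assembled toy example is sketched below; the op type, tensor names, and shapes are invented for illustration:

#include <vector>

#include "mace/public/mace_types.h"

// Build a one-op NetDef by hand, the way generated model code does.
mace::NetDef BuildToyNet(const unsigned char *weight_bytes) {
  mace::NetDef net;
  net.set_name("toy_net");

  mace::OperatorDef *op = net.add_op();
  op->set_name("softmax");
  op->set_type("Softmax");
  op->add_input("fc:0");
  op->add_output("prob:0");

  // The converters attach the data type as an argument named 'T'.
  mace::Argument *arg = op->add_arg();
  arg->set_name("T");
  arg->set_i(mace::DT_FLOAT);

  // Constant tensors reference externally owned byte buffers.
  net.mutable_tensors().emplace_back(
      "fc_weight:0", weight_bytes, std::vector<int64_t>{10, 10},
      mace::DT_FLOAT);
  return net;
}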
...@@ -247,11 +247,8 @@ class CaffeConverter(object):
     arg.i = self.dt
     return output_name
 
-  def add_input_transform(self, names, is_single):
+  def add_input_transform(self, names):
     for name in names:
-      if is_single:
-        new_input_name = MACE_INPUT_NODE_NAME + ":0"
-      else:
-        new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = name
...@@ -267,11 +264,8 @@ class CaffeConverter(object):
     arg.name = 'T'
     arg.i = self.dt
 
-  def add_output_transform(self, names, is_single):
+  def add_output_transform(self, names):
     for name in names:
-      if is_single:
-        output_name = MACE_OUTPUT_NODE_NAME + ":0"
-      else:
-        output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = output_name[:-2]
...@@ -333,8 +327,18 @@ class CaffeConverter(object):
     return pad, stride, kernel
 
   def convert_conv2d(self, op):
-    op_def = self.CommonConvert(op, 'Conv2D')
     param = op.layer.convolution_param
+    is_depthwise = False
+    if param.HasField('group'):
+      if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
+        is_depthwise = True
+      else:
+        raise Exception("Mace do not support group convolution yet")
+    if is_depthwise:
+      op_def = self.CommonConvert(op, 'DepthwiseConv2d')
+    else:
+      op_def = self.CommonConvert(op, 'Conv2D')
 
     # Add filter
     weight_tensor_name = op.name + '_weight:0'
...@@ -342,7 +346,7 @@ class CaffeConverter(object):
     self.add_tensor(weight_tensor_name, weight_data)
 
     if self.device == 'gpu':
-      buffer_type = "CONV2D_FILTER"
+      buffer_type = "DW_CONV2D_FILTER" if is_depthwise else "CONV2D_FILTER"
       output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
       op_def.input.extend([output_name])
     else:
...@@ -381,6 +385,7 @@ class CaffeConverter(object):
     if len(self.ops_map[final_op.name].children) == 1 \
         and self.ops_map[final_op.name].children[0].type in activation_name_map:
       activation_op = self.ops_map[final_op.name].children[0]
-      op_def.type = "FusedConv2D"
+      if not is_depthwise:
+        op_def.type = "FusedConv2D"
       fused_act_arg = op_def.arg.add()
       fused_act_arg.name = 'activation'
...@@ -412,7 +417,7 @@ class CaffeConverter(object):
     width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
     return self.winograd and self.device == 'gpu' and \
         filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
-        dilations[0] == 1 and (dilations[0] == dilations[1]) and\
+        dilations[0] == 1 and (dilations[0] == dilations[1]) and \
         (strides[0] == 1) and (strides[0] == strides[1]) and \
         (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
         (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
...@@ -789,7 +794,6 @@ class CaffeConverter(object):
     input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
     output_shape = input_shape
     shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
-    print shape_param
     for i in range(len(shape_param)):
       if shape_param[i] != 0:
         output_shape[i] = shape_param[i]
...@@ -844,18 +848,9 @@ class CaffeConverter(object):
     self.net_def.op.extend([op_def])
     self.resolved_ops.add(op.name)
 
-  def replace_in_out_name(self, input_names, output_names, is_single):
+  def replace_in_out_name(self, input_names, output_names):
     in_names = set([input_name + ":0" for input_name in input_names])
     out_names = set([output_name + ":0" for output_name in output_names])
-    if is_single:
-      for op in self.net_def.op:
-        for i in range(len(op.input)):
-          if op.input[i] in in_names:
-            op.input[i] = MACE_INPUT_NODE_NAME + ':0'
-        for i in range(len(op.output)):
-          if op.output[i] in out_names:
-            op.output[i] = MACE_OUTPUT_NODE_NAME + ':0'
-    else:
-      for op in self.net_def.op:
+    for op in self.net_def.op:
       for i in range(len(op.input)):
         if op.input[i] in in_names:
...@@ -878,9 +873,8 @@ class CaffeConverter(object):
       input_op.output_shape_map[input_op.name] = input_shapes[i]
 
   def convert(self, input_nodes, input_shapes, output_nodes):
-    is_single = len(input_nodes) == 1 and len(output_nodes) == 1
     if self.device == 'gpu':
-      self.add_input_transform(input_nodes, is_single)
+      self.add_input_transform(input_nodes)
 
     assert self.ops[0].type == 'Input'
     self.add_input_op_shape(input_nodes, input_shapes)
...@@ -925,10 +919,10 @@ class CaffeConverter(object):
       raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
 
     if self.device == 'gpu':
-      self.add_output_transform(output_nodes, is_single)
+      self.add_output_transform(output_nodes)
 
     if self.device == 'cpu':
-      self.replace_in_out_name(input_nodes, output_nodes, is_single)
+      self.replace_in_out_name(input_nodes, output_nodes)
 
     for op in self.ops:
       if op.name not in self.resolved_ops:
...@@ -967,3 +961,4 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str,
     print "Memory optimization done."
 
     return net_def
 //
 // Copyright (c) 2017 XiaoMi All rights reserved.
+// Generated by the mace converter. DO NOT EDIT!
 //
-// Generated by the mace converter. DO NOT EDIT!
+#ifndef MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
+#define MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
 
 #include <string>
...@@ -10,13 +13,16 @@
 namespace mace {
 namespace {{tag}} {
 
-extern const unsigned char *LoadModelData(const char *model_data_file);
+const unsigned char *LoadModelData(const char *model_data_file);
 
-extern void UnloadModelData(const unsigned char *model_data);
+void UnloadModelData(const unsigned char *model_data);
 
-extern NetDef CreateNet(const unsigned char *model_data);
+NetDef CreateNet(const unsigned char *model_data);
 
-extern const std::string ModelChecksum();
+const std::string ModelChecksum();
 
 } // namespace {{ tag }}
 } // namespace mace
+
+#endif  // MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
...@@ -118,11 +118,8 @@ class TFConverter(object):
     arg.i = self.dt
     return output_name
 
-  def add_input_transform(self, names, is_single):
+  def add_input_transform(self, names):
     for name in names:
-      if is_single:
-        new_input_name = MACE_INPUT_NODE_NAME + ":0"
-      else:
-        new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = name
...@@ -138,11 +135,8 @@ class TFConverter(object):
     arg.name = 'T'
     arg.i = self.dt
 
-  def add_output_transform(self, names, is_single):
+  def add_output_transform(self, names):
     for name in names:
-      if is_single:
-        output_name = MACE_OUTPUT_NODE_NAME + ":0"
-      else:
-        output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = output_name[:-2]
...@@ -362,6 +356,7 @@ class TFConverter(object):
     if len(self.tf_graph.get(final_op.name, [])) == 1 \
         and self.tf_graph[final_op.name][0].type in activation_name_map:
       activation_op = self.tf_graph[final_op.name][0]
-      op_def.type = "FusedConv2D"
+      if op_def.type == "Conv2D":
+        op_def.type = "FusedConv2D"
       fused_act_arg = op_def.arg.add()
       fused_act_arg.name = 'activation'
...@@ -805,26 +800,18 @@ class TFConverter(object):
     self.add_output_shape(op.outputs, op_def)
     self.resolved_ops[op.name] = 1
 
-  def replace_in_out_name(self, input_names, output_names, is_single):
+  def replace_in_out_name(self, input_names, output_names):
     in_names = set([input_name + ":0" for input_name in input_names])
     out_names = set([output_name + ":0" for output_name in output_names])
-    if is_single:
-      for op in self.net_def.op:
-        if len(op.input) > 0 and op.input[0] in in_names:
-          op.input[0] = MACE_INPUT_NODE_NAME + ':0'
-        if len(op.output) > 0 and op.output[0] in out_names:
-          op.output[0] = MACE_OUTPUT_NODE_NAME + ':0'
-    else:
-      for op in self.net_def.op:
-        if len(op.input) > 0 and op.input[0] in in_names:
-          op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
-        if len(op.output) > 0 and op.output[0] in out_names:
-          op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
+    for op in self.net_def.op:
+      if op.input[0] in in_names:
+        op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
+      if op.output[0] in out_names:
+        op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
 
   def convert(self, input_nodes, output_nodes):
-    is_single = len(input_nodes) == 1 and len(output_nodes) == 1
     if self.device == 'gpu':
-      self.add_input_transform(input_nodes, is_single)
+      self.add_input_transform(input_nodes)
 
     for op in self.tf_ops:
       if self.resolved_ops[op.name] == 1:
...@@ -892,10 +879,10 @@ class TFConverter(object):
       raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
 
     if self.device == 'gpu':
-      self.add_output_transform(output_nodes, is_single)
+      self.add_output_transform(output_nodes)
 
     if self.device == 'cpu':
-      self.replace_in_out_name(input_nodes, output_nodes, is_single)
+      self.replace_in_out_name(input_nodes, output_nodes)
 
     for key in self.resolved_ops:
       if self.resolved_ops[key] != 1:
......
...@@ -25,6 +25,6 @@ cat <<EOF > ${OUTPUT_FILENAME}
 // This is a generated file, DO NOT EDIT
 
 namespace mace {
-const char *MaceGitVersion() { return "${GIT_VERSION}"; }
+const char *MaceVersion() { return "${GIT_VERSION}"; }
 } // namespace mace
 EOF
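The rename makes the generated symbol match the public API. Assuming the matching declaration in the public headers (restated below, since it is not part of this diff), a caller can log the baked-in version string:

#include <cstdio>

namespace mace {
// Assumed public declaration; the definition is generated by the script above.
const char *MaceVersion();
}  // namespace mace

int main() {
  std::printf("MACE version: %s\n", mace::MaceVersion());
  return 0;
}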
...@@ -12,6 +12,7 @@
 #include <utility>
 
 #include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/string_util.h"
......
...@@ -76,17 +76,19 @@ def generate_random_input(target_soc, model_output_dir,
       target_soc, model_output_dir, int(generate_data_or_not))
   run_command(command)
 
-  input_name_list = []
   input_file_list = []
-  if isinstance(input_names, list):
-    input_name_list.extend(input_names)
-  else:
-    input_name_list.append(input_names)
   if isinstance(input_files, list):
     input_file_list.extend(input_files)
   else:
     input_file_list.append(input_files)
-  assert len(input_file_list) == len(input_name_list)
+  if len(input_file_list) != 0:
+    input_name_list = []
+    if isinstance(input_names, list):
+      input_name_list.extend(input_names)
+    else:
+      input_name_list.append(input_names)
+    if len(input_file_list) != len(input_name_list):
+      raise Exception('If input_files set, the input files should match the input names.')
   for i in range(len(input_file_list)):
     if input_file_list[i] is not None:
       dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
......