Commit 857e6e60 authored by yejianwu

merge with origin master

......@@ -6,5 +6,6 @@ cmake-build-debug/
mace/codegen/models/
mace/codegen/opencl/
mace/codegen/opencl_bin/
mace/codegen/tuning/
mace/codegen/version/
build/
......@@ -25,6 +25,7 @@
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#include "mace/core/types.h"
namespace mace {
......@@ -72,10 +73,43 @@ extern const std::string ModelChecksum();
namespace benchmark {
void Split(const std::string &str,
char delims,
std::vector<std::string> *result) {
MACE_CHECK_NOTNULL(result);
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
result->push_back(tmp.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
void SplitAndParseToInts(const std::string &str,
char delims,
std::vector<int64_t> *result) {
MACE_CHECK_NOTNULL(result);
std::string tmp = str;
while (!tmp.empty()) {
index_t dim = atoi(tmp.data());
result->push_back(dim);
size_t next_offset = tmp.find(delims);
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
std::string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
index_t dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == std::string::npos) {
......@@ -86,6 +120,14 @@ void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
}
}
std::string FormatName(const std::string input) {
std::string res = input;
for (size_t i = 0; i < input.size(); ++i) {
if (!::isalnum(res[i])) res[i] = '_';
}
return res;
}
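Note: the Split/ParseShape pair above implements the new multi-input flag convention, where ':' separates inputs and ',' separates dimensions within one input. A standalone sketch with simplified copies of the two helpers (the flag string is a hypothetical sample, not from this commit):

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Simplified copies of the helpers above: Split cuts on one delimiter,
// ParseShape reads comma-separated dimensions.
void Split(const std::string &str, char delim, std::vector<std::string> *out) {
  std::string tmp = str;
  while (!tmp.empty()) {
    size_t next = tmp.find(delim);
    out->push_back(tmp.substr(0, next));
    if (next == std::string::npos) break;
    tmp = tmp.substr(next + 1);
  }
}

void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
  std::vector<std::string> dims;
  Split(str, ',', &dims);
  for (const auto &d : dims) shape->push_back(std::atoll(d.c_str()));
}

int main() {
  std::vector<std::string> per_input;
  Split("1,224,224,3:1,1,1,10", ':', &per_input);  // two inputs
  for (const auto &s : per_input) {
    std::vector<int64_t> shape;
    ParseShape(s, &shape);
    std::cout << s << " -> " << shape.size() << " dims\n";  // 4 dims each
  }
  return 0;
}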
DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
......@@ -100,6 +142,10 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
DEFINE_string(input_node, "input_node0,input_node1",
"input nodes, separated by comma");
DEFINE_string(output_node, "output_node0,output_node1",
"output nodes, separated by comma");
DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma");
DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma");
DEFINE_string(input_file, "", "input file name");
......@@ -113,7 +159,6 @@ int Main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "mace git version: " << MaceGitVersion();
#ifdef MACE_CPU_MODEL_TAG
LOG(INFO) << "cpu model checksum: "
<< mace::MACE_CPU_MODEL_TAG::ModelChecksum();
......@@ -126,7 +171,9 @@ int Main(int argc, char **argv) {
LOG(INFO) << "dsp model checksum: "
<< mace::MACE_DSP_MODEL_TAG::ModelChecksum();
#endif
LOG(INFO) << "Input node: [" << FLAGS_input_node << "]";
LOG(INFO) << "input_shape: " << FLAGS_input_shape;
LOG(INFO) << "Output node: [" << FLAGS_output_node << "]";
LOG(INFO) << "output_shape: " << FLAGS_output_shape;
LOG(INFO) << "input_file: " << FLAGS_input_file;
LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file;
......@@ -134,31 +181,63 @@ int Main(int argc, char **argv) {
LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file;
LOG(INFO) << "run_seconds: " << FLAGS_run_seconds;
std::vector<int64_t> input_shape_vec;
std::vector<int64_t> output_shape_vec;
ParseShape(FLAGS_input_shape, &input_shape_vec);
ParseShape(FLAGS_output_shape, &output_shape_vec);
int64_t input_size =
std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1,
std::multiplies<int64_t>());
int64_t output_size =
std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1,
std::multiplies<int64_t>());
std::unique_ptr<float[]> input_data(new float[input_size]);
std::unique_ptr<float[]> cpu_output_data(new float[output_size]);
std::unique_ptr<float[]> gpu_output_data(new float[output_size]);
std::unique_ptr<float[]> dsp_output_data(new float[output_size]);
// load input
std::ifstream in_file(FLAGS_input_file, std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_data.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
std::vector<std::string> input_names;
std::vector<std::string> output_names;
std::vector<std::string> input_shapes;
std::vector<std::string> output_shapes;
Split(FLAGS_input_node, ',', &input_names);
Split(FLAGS_output_node, ',', &output_names);
Split(FLAGS_input_shape, ':', &input_shapes);
Split(FLAGS_output_shape, ':', &output_shapes);
const size_t input_count = input_shapes.size();
const size_t output_count = output_shapes.size();
std::vector<std::vector<int64_t>> input_shape_vec(input_count);
std::vector<std::vector<int64_t>> output_shape_vec(output_count);
for (size_t i = 0; i < input_count; ++i) {
ParseShape(input_shapes[i], &input_shape_vec[i]);
}
for (size_t i = 0; i < output_count; ++i) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> cpu_outputs;
std::map<std::string, mace::MaceTensor> gpu_outputs;
std::map<std::string, mace::MaceTensor> dsp_outputs;
for (size_t i = 0; i < input_count; ++i) {
// Allocate input and output
int64_t input_size =
std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1,
std::multiplies<int64_t>());
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(buffer_in.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(FATAL) << "Open input file failed";
}
inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in);
}
for (size_t i = 0; i < output_count; ++i) {
int64_t output_size =
std::accumulate(output_shape_vec[i].begin(),
output_shape_vec[i].end(), 1,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
cpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
buffer_out);
gpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
buffer_out);
dsp_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
buffer_out);
}
int64_t t0, t1, init_micros;
......@@ -170,11 +249,12 @@ int Main(int argc, char **argv) {
FLAGS_cpu_model_data_file.c_str());
NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data);
mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU);
mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU, input_names,
output_names);
LOG(INFO) << "CPU Warm up run";
t0 = NowMicros();
cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
cpu_engine.Run(inputs, &cpu_outputs);
t1 = NowMicros();
LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
#endif
......@@ -187,12 +267,13 @@ int Main(int argc, char **argv) {
FLAGS_gpu_model_data_file.c_str());
NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names,
output_names);
mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
LOG(INFO) << "GPU Warm up run";
t0 = NowMicros();
gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
gpu_engine.Run(inputs, &gpu_outputs);
t1 = NowMicros();
LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
#endif
......@@ -202,15 +283,16 @@ int Main(int argc, char **argv) {
LOG(INFO) << "Load & init dsp model and warm up";
const unsigned char *dsp_model_data =
mace::MACE_DSP_MODEL_TAG::LoadModelData(
FLAGS_gpu_model_data_file.c_str());
FLAGS_dsp_model_data_file.c_str());
NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON, input_names,
output_names);
mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
LOG(INFO) << "DSP Warm up run";
t0 = NowMicros();
gpu_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
dsp_engine.Run(inputs, &dsp_outputs);
t1 = NowMicros();
LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
#endif
......@@ -226,7 +308,7 @@ int Main(int argc, char **argv) {
int64_t micros = 0;
int64_t start = NowMicros();
for (; micros < run_micros; ++frames) {
cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
cpu_engine.Run(inputs, &cpu_outputs);
int64_t end = NowMicros();
micros = end - start;
}
......@@ -240,7 +322,7 @@ int Main(int argc, char **argv) {
int64_t micros = 0;
int64_t start = NowMicros();
for (; micros < run_micros; ++frames) {
gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
gpu_engine.Run(inputs, &gpu_outputs);
int64_t end = NowMicros();
micros = end - start;
}
......@@ -254,7 +336,7 @@ int Main(int argc, char **argv) {
int64_t micros = 0;
int64_t start = NowMicros();
for (; micros < run_micros; ++frames) {
dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
dsp_engine.Run(inputs, &dsp_outputs);
int64_t end = NowMicros();
micros = end - start;
}
......
......@@ -17,14 +17,8 @@ cc_library(
)
cc_library(
name = "generated_opencl_dev",
srcs = ["opencl/opencl_encrypt_program.cc"],
linkstatic = 1,
)
cc_library(
name = "generated_opencl_prod",
srcs = ["opencl/opencl_compiled_program.cc"],
name = "generated_opencl",
srcs = glob(["opencl/*.cc"]),
linkstatic = 1,
)
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This is a generated file, DO NOT EDIT
#include <map>
#include <string>
#include <vector>
namespace mace {
extern const std::map<std::string, std::vector<unsigned int>>
kTuningParamsData = {};
} // namespace mace
......@@ -27,8 +27,6 @@ cc_library(
"runtime/hexagon/*.cc",
],
exclude = [
"runtime/opencl/opencl_production.cc",
"runtime/opencl/opencl_development.cc",
"*_test.cc",
"runtime/hexagon/hexagon_controller_dummy.cc",
],
......@@ -50,15 +48,15 @@ cc_library(
]),
deps = [
":opencl_headers",
"//mace/utils",
"//mace/codegen:generated_opencl",
"//mace/codegen:generated_version",
"//mace/utils",
"@half//:half",
] + if_production_mode([
"//mace/codegen:generated_tuning_params",
"//mace/utils:utils_prod",
"//mace/core:opencl_prod",
]) + if_not_production_mode([
"//mace/utils:utils_dev",
"//mace/core:opencl_dev",
]),
)
......@@ -90,26 +88,3 @@ cc_library(
],
alwayslink = 1,
)
cc_library(
name = "opencl_dev",
srcs = ["runtime/opencl/opencl_development.cc"],
linkstatic = 1,
deps = [
":opencl_headers",
"//mace/codegen:generated_opencl_dev",
"//mace/utils",
],
)
cc_library(
name = "opencl_prod",
srcs = ["runtime/opencl/opencl_production.cc"],
linkstatic = 1,
deps = [
":opencl_headers",
"//mace/codegen:generated_opencl_prod",
"//mace/codegen:generated_tuning_params",
"//mace/utils",
],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/file_storage.h"
#include <fcntl.h>
#include <limits.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
class FileStorageFactory::Impl {
public:
explicit Impl(const std::string &path);
std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
private:
std::string path_;
};
FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
const std::string &name) {
return std::unique_ptr<KVStorage>(
new FileStorage(path_ + "/" + name));
}
FileStorageFactory::FileStorageFactory(const std::string &path):
impl_(new FileStorageFactory::Impl(path)) {}
FileStorageFactory::~FileStorageFactory() = default;
std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
const std::string &name) {
return impl_->CreateStorage(name);
}
FileStorage::FileStorage(const std::string &file_path):
file_path_(file_path) {}
int FileStorage::Load() {
struct stat st;
if (stat(file_path_.c_str(), &st) == -1) {
if (errno == ENOENT) {
LOG(INFO) << "File " << file_path_
<< " does not exist";
return 0;
} else {
LOG(WARNING) << "Stat file " << file_path_
<< " failed, error code: " << errno;
return -1;
}
}
int fd = open(file_path_.c_str(), O_RDONLY);
if (fd < 0) {
if (errno == ENOENT) {
LOG(INFO) << "File " << file_path_
<< " does not exist";
return 0;
} else {
LOG(WARNING) << "open file " << file_path_
<< " failed, error code: " << errno;
return -1;
}
}
size_t file_size = st.st_size;
unsigned char *file_data =
static_cast<unsigned char *>(mmap(nullptr, file_size, PROT_READ,
MAP_PRIVATE, fd, 0));
int res = 0;
if (file_data == MAP_FAILED) {
LOG(WARNING) << "mmap file " << file_path_
<< " failed, error code: " << errno;
res = close(fd);
if (res != 0) {
LOG(WARNING) << "close file " << file_path_
<< " failed, error code: " << errno;
}
return -1;
}
unsigned char *file_data_ptr = file_data;
const size_t int_size = sizeof(int32_t);
int64_t data_size = 0;
memcpy(&data_size, file_data_ptr, sizeof(int64_t));
file_data_ptr += sizeof(int64_t);
int32_t key_size = 0;
int32_t value_size = 0;
for (int i = 0; i < data_size; ++i) {
memcpy(&key_size, file_data_ptr, int_size);
file_data_ptr += int_size;
std::unique_ptr<char[]> key(new char[key_size+1]);
memcpy(&key[0], file_data_ptr, key_size);
file_data_ptr += key_size;
key[key_size] = '\0';
memcpy(&value_size, file_data_ptr, int_size);
file_data_ptr += int_size;
std::vector<unsigned char> value(value_size);
memcpy(value.data(), file_data_ptr, value_size);
file_data_ptr += value_size;
data_.emplace(std::string(&key[0]), value);
}
res = munmap(file_data, file_size);
if (res != 0) {
LOG(WARNING) << "munmap file " << file_path_
<< " failed, error code: " << errno;
res = close(fd);
if (res != 0) {
LOG(WARNING) << "close file " << file_path_
<< " failed, error code: " << errno;
}
return -1;
}
res = close(fd);
if (res != 0) {
LOG(WARNING) << "close file " << file_path_
<< " failed, error code: " << errno;
return -1;
}
return 0;
}
bool FileStorage::Insert(const std::string &key,
const std::vector<unsigned char> &value) {
data_.emplace(key, value);
return true;
}
const std::vector<unsigned char> *FileStorage::Find(const std::string &key) {
auto iter = data_.find(key);
if (iter == data_.end()) return nullptr;
return &(iter->second);
}
int FileStorage::Flush() {
int fd = open(file_path_.c_str(), O_WRONLY | O_CREAT, 0600);
if (fd < 0) {
LOG(WARNING) << "open file " << file_path_
<< " failed, error code:" << errno;
return -1;
}
const size_t int_size = sizeof(int32_t);
int64_t data_size = sizeof(int64_t);
for (auto &kv : data_) {
data_size += 2 * int_size + kv.first.size() + kv.second.size();
}
std::unique_ptr<unsigned char[]> buffer(new unsigned char[data_size]);
unsigned char *buffer_ptr = &buffer[0];
int64_t num_of_data = data_.size();
memcpy(buffer_ptr, &num_of_data, sizeof(int64_t));
buffer_ptr += sizeof(int64_t);
for (auto &kv : data_) {
int32_t key_size = kv.first.size();
memcpy(buffer_ptr, &key_size, int_size);
buffer_ptr += int_size;
memcpy(buffer_ptr, kv.first.c_str(), kv.first.size());
buffer_ptr += kv.first.size();
int32_t value_size = kv.second.size();
memcpy(buffer_ptr, &value_size, int_size);
buffer_ptr += int_size;
memcpy(buffer_ptr, kv.second.data(), kv.second.size());
buffer_ptr += kv.second.size();
}
int res = 0;
buffer_ptr = &buffer[0];
int64_t remain_size = data_size;
while (remain_size > 0) {
size_t buffer_size = std::min<int64_t>(remain_size, SSIZE_MAX);
res = write(fd, buffer_ptr, buffer_size);
if (res == -1) {
LOG(WARNING) << "write file " << file_path_
<< " failed, error code: " << errno;
res = close(fd);
if (res != 0) {
LOG(WARNING) << "close file " << file_path_
<< " failed, error code: " << errno;
}
return -1;
}
remain_size -= buffer_size;
buffer_ptr += buffer_size;
}
res = close(fd);
if (res != 0) {
LOG(WARNING) << "close file " << file_path_
<< " failed, error code: " << errno;
return -1;
}
return 0;
}
} // namespace mace
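Note on the format: Flush() writes, and Load() mmap-parses, a flat dump in host byte order: a 64-bit entry count, then for each entry a 32-bit key length, the key bytes, a 32-bit value length, and the value bytes. A minimal sketch of the serialization side only (SerializeKV is a hypothetical name, not part of this commit):

#include <cstdint>
#include <cstring>
#include <map>
#include <string>
#include <vector>

// Layout mirrored from FileStorage::Flush above:
// [int64 entry_count]([int32 key_size][key][int32 value_size][value])*
std::vector<unsigned char> SerializeKV(
    const std::map<std::string, std::vector<unsigned char>> &data) {
  size_t total = sizeof(int64_t);
  for (const auto &kv : data)
    total += 2 * sizeof(int32_t) + kv.first.size() + kv.second.size();
  std::vector<unsigned char> buf(total);
  unsigned char *p = buf.data();
  const int64_t count = static_cast<int64_t>(data.size());
  std::memcpy(p, &count, sizeof(count));
  p += sizeof(count);
  for (const auto &kv : data) {
    const int32_t key_size = static_cast<int32_t>(kv.first.size());
    std::memcpy(p, &key_size, sizeof(key_size));
    p += sizeof(key_size);
    std::memcpy(p, kv.first.data(), kv.first.size());
    p += kv.first.size();
    const int32_t value_size = static_cast<int32_t>(kv.second.size());
    std::memcpy(p, &value_size, sizeof(value_size));
    p += sizeof(value_size);
    std::memcpy(p, kv.second.data(), kv.second.size());
    p += kv.second.size();
  }
  return buf;
}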
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_FILE_STORAGE_H_
#define MACE_CORE_FILE_STORAGE_H_
#include <map>
#include <string>
#include <vector>
#include "mace/public/mace_runtime.h"
namespace mace {
class FileStorage : public KVStorage {
public:
explicit FileStorage(const std::string &file_path);
public:
int Load() override;
bool Insert(const std::string &key,
const std::vector<unsigned char> &value) override;
const std::vector<unsigned char> *Find(const std::string &key) override;
int Flush() override;
private:
std::string file_path_;
std::map<std::string, std::vector<unsigned char>> data_;
};
} // namespace mace
#endif // MACE_CORE_FILE_STORAGE_H_
......@@ -4,8 +4,10 @@
#include <memory>
#include "mace/core/file_storage.h"
#include "mace/core/net.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
......@@ -94,6 +96,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
net_(nullptr),
hexagon_controller_(nullptr) {
LOG(INFO) << "MACE version: " << MaceVersion();
// Set storage path for internal usage
for (auto input_name : input_nodes) {
ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
GetDeviceAllocator(device_type_), DT_FLOAT);
......@@ -173,6 +176,7 @@ MaceStatus MaceEngine::Impl::Run(
LOG(FATAL) << "Net run failed";
}
}
OpenCLRuntime::Global()->SaveBuiltCLProgram();
for (auto &output : *outputs) {
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
......
......@@ -8,6 +8,8 @@
namespace mace {
std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
......@@ -15,6 +17,11 @@ void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
}
void ConfigKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory) {
VLOG(1) << "Set internal KV Storage Engine";
kStorageFactory = storage_factory;
}
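Note: a client registers the factory before constructing an engine, as the example program later in this commit does. A minimal usage sketch (assumes FileStorageFactory is declared in mace/public/mace_runtime.h, as that example's includes suggest; the cache path is a hypothetical sample):

#include <memory>
#include "mace/public/mace_runtime.h"

// Register a FileStorageFactory so OpenCLRuntime can cache built programs.
void ConfigureProgramCache() {
  std::shared_ptr<mace::KVStorageFactory> storage_factory(
      new mace::FileStorageFactory("/data/local/tmp/mace_run/cl"));
  mace::ConfigKVStorageFactory(storage_factory);
}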
void ConfigOmpThreads(int omp_num_threads) {
VLOG(1) << "Config CPU omp_num_threads: " << omp_num_threads;
SetOmpThreads(omp_num_threads);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
bool GetSourceOrBinaryProgram(const std::string &program_name,
const std::string &binary_file_name_prefix,
const cl::Context &context,
const cl::Device &device,
cl::Program *program,
bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
*is_binary = false;
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source == kEncryptedProgramMap.end()) {
return false;
}
cl::Program::Sources sources;
std::string content(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(content);
sources.push_back(kernel_source);
*program = cl::Program(context, sources);
return true;
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/logging.h"
namespace mace {
bool GetSourceOrBinaryProgram(const std::string &program_name,
const std::string &binary_file_name_prefix,
const cl::Context &context,
const cl::Device &device,
cl::Program *program,
bool *is_binary) {
extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
*is_binary = true;
auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
if (it_binary == kCompiledProgramMap.end()) {
return false;
}
*program = cl::Program(context, {device}, {it_binary->second});
return true;
}
} // namespace mace
......@@ -10,35 +10,20 @@
#include <mutex> // NOLINT(build/c++11)
#include <string>
#include <vector>
#include <utility>
#include "mace/core/file_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/public/mace.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace {
bool WriteFile(const std::string &filename,
bool binary,
const std::vector<unsigned char> &content) {
std::ios_base::openmode mode = std::ios_base::out | std::ios_base::trunc;
if (binary) {
mode |= std::ios::binary;
}
std::ofstream ofs(filename, mode);
ofs.write(reinterpret_cast<const char *>(&content[0]),
content.size() * sizeof(char));
ofs.close();
if (ofs.fail()) {
LOG(ERROR) << "Failed to write to file " << filename;
return false;
}
return true;
}
} // namespace
extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
extern const std::string kCompiledProgramPlatform;
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) {
switch (error) {
......@@ -194,19 +179,19 @@ void OpenCLProfilingTimer::ClearTiming() {
accumulated_micros_ = 0;
}
GPUPerfHint OpenCLRuntime::gpu_perf_hint_ = GPUPerfHint::PERF_DEFAULT;
GPUPriorityHint OpenCLRuntime::gpu_priority_hint_ =
GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_DEFAULT;
GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
GPUPriorityHint::PRIORITY_DEFAULT;
OpenCLRuntime *OpenCLRuntime::Global() {
static OpenCLRuntime runtime(gpu_perf_hint_, gpu_priority_hint_);
static OpenCLRuntime runtime(kGPUPerfHint, kGPUPriorityHint);
return &runtime;
}
void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
OpenCLRuntime::gpu_perf_hint_ = gpu_perf_hint;
OpenCLRuntime::gpu_priority_hint_ = gpu_priority_hint;
OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
}
void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
......@@ -250,7 +235,8 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
}
OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
GPUPriorityHint gpu_priority_hint):
storage_(nullptr) {
LoadOpenCLLibrary();
std::vector<cl::Platform> all_platforms;
......@@ -259,9 +245,12 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
LOG(FATAL) << "No OpenCL platforms found";
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>()
<< ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
std::stringstream ss;
ss << default_platform.getInfo<CL_PLATFORM_NAME>()
<< ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
platform_info_ = ss.str();
VLOG(1) << "Using platform: " << platform_info_;
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
......@@ -278,10 +267,10 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
gpu_detected = true;
const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
gpu_type_ = ParseGPUType(device_name);
const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
opencl_version_ = device_version.substr(7, 3);
opencl_version_ = ParseDeviceVersion(device_version);
VLOG(1) << "Using device: " << device_name;
break;
......@@ -320,9 +309,19 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
&err);
MACE_CHECK_CL_SUCCESS(err);
const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ =
std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
this->program_map_changed_ = false;
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
if (kStorageFactory != nullptr) {
const std::string cl_compiled_file_name = "mace_cl_compiled_program.bin";
storage_ = kStorageFactory->CreateStorage(cl_compiled_file_name);
if (platform_info_ != kCompiledProgramPlatform) {
if (storage_->Load() != 0) {
LOG(FATAL) << "Load opencl compiled kernel file failed";
}
}
}
const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
......@@ -348,45 +347,49 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
std::string OpenCLRuntime::GenerateCLBinaryFilenamePrefix(
const std::string &filename_msg) {
// TODO(heliangliang) This can be long and slow, fix it
std::string filename_prefix = filename_msg;
for (auto it = filename_prefix.begin(); it != filename_prefix.end(); ++it) {
if (*it == ' ' || *it == '-' || *it == '=') {
*it = '_';
bool OpenCLRuntime::BuildProgramFromBinary(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from binary
if (kCompiledProgramPlatform != platform_info_) return false;
auto it_binary = kCompiledProgramMap.find(built_program_key);
if (it_binary == kCompiledProgramMap.end()) return false;
*program = cl::Program(context(), {device()}, {it_binary->second});
cl_int ret = program->build({device()}, build_options_str.c_str());
if (ret != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
CL_BUILD_ERROR) {
std::string build_log =
program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
LOG(INFO) << "Program build log: " << build_log;
}
LOG(WARNING) << "Build program "
<< built_program_key << " from Binary failed:"
<< (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible "
"cause 1: the MACE library is built from SoC 1 but is "
"used on different SoC 2, possible cause 2: the MACE "
"buffer is corrupted make sure your code has no "
"out-of-range memory writing" : MakeString(ret));
return false;
}
return MACE_OBFUSCATE_SYMBOL(filename_prefix);
VLOG(3) << "Program from Binary: " << built_program_key;
return true;
}
extern bool GetSourceOrBinaryProgram(const std::string &program_name,
const std::string &binary_file_name_prefix,
const cl::Context &context,
const cl::Device &device,
cl::Program *program,
bool *is_opencl_binary);
void OpenCLRuntime::BuildProgram(const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options,
cl::Program *program) {
MACE_CHECK_NOTNULL(program);
bool OpenCLRuntime::BuildProgramFromCache(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from binary
if (this->storage_ == nullptr) return false;
auto content = this->storage_->Find(built_program_key);
if (content == nullptr) {
return false;
}
std::string binary_file_name_prefix =
GenerateCLBinaryFilenamePrefix(built_program_key);
std::vector<unsigned char> program_vec;
bool is_opencl_binary;
const bool found =
GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
device(), program, &is_opencl_binary);
MACE_CHECK(found, "Program not found for ",
is_opencl_binary ? "binary: " : "source: ", built_program_key);
// Build program
std::string build_options_str =
build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math";
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
*program = cl::Program(context(), {device()}, {*content});
cl_int ret = program->build({device()}, build_options_str.c_str());
if (ret != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
......@@ -395,20 +398,43 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
LOG(INFO) << "Program build log: " << build_log;
}
LOG(FATAL) << "Build program from "
<< (is_opencl_binary ? "binary: " : "source: ")
<< built_program_key << " failed: "
<< (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible "
"cause 1: the MACE library is built from SoC 1 but is "
"used on different SoC 2, possible cause 2: the MACE "
"buffer is corrupted make sure your code has no "
"out-of-range memory writing" : MakeString(ret));
LOG(WARNING) << "Build program "
<< built_program_key << " from Cache failed:"
<< MakeString(ret);
return false;
}
VLOG(3) << "Program from Cache: " << built_program_key;
return true;
}
void OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from source
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source != kEncryptedProgramMap.end()) {
cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source);
*program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str());
if (ret != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
CL_BUILD_ERROR) {
std::string build_log =
program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
LOG(INFO) << "Program build log: " << build_log;
}
LOG(WARNING) << "Build program "
<< program_name << " from source failed: "
<< MakeString(ret);
return;
}
if (!is_opencl_binary) {
// Write binary if necessary
std::string binary_filename =
kernel_path_ + binary_file_name_prefix + ".bin";
// Keep built program binary
size_t device_list_size = 1;
std::unique_ptr<size_t[]> program_binary_sizes(
new size_t[device_list_size]);
......@@ -432,7 +458,34 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
program_binary_sizes[0]);
MACE_CHECK(WriteFile(binary_filename, true, content));
if (this->storage_ != nullptr) {
this->storage_->Insert(built_program_key, content);
this->program_map_changed_ = true;
}
VLOG(3) << "Program from source: " << built_program_key;
}
}
void OpenCLRuntime::BuildProgram(const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options,
cl::Program *program) {
MACE_CHECK_NOTNULL(program);
std::string build_options_str =
build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math";
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
bool ret = BuildProgramFromBinary(built_program_key,
build_options_str, program);
if (!ret) {
ret = BuildProgramFromCache(built_program_key,
build_options_str, program);
// Fallback to source.
if (!ret) {
BuildProgramFromSource(program_name, built_program_key,
build_options_str, program);
}
}
}
......@@ -459,6 +512,15 @@ cl::Kernel OpenCLRuntime::BuildKernel(
return cl::Kernel(program, kernel_name.c_str());
}
void OpenCLRuntime::SaveBuiltCLProgram() {
if (program_map_changed_ && storage_ != nullptr) {
if (storage_->Flush() != 0) {
LOG(FATAL) << "Store opencl compiled kernel to file failed";
}
program_map_changed_ = false;
}
}
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) {
stats->start_micros =
......@@ -480,7 +542,6 @@ uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
return size;
}
// TODO(liuqi): not compatible with mali gpu.
uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
uint64_t size = 0;
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
......@@ -496,7 +557,11 @@ const GPUType OpenCLRuntime::gpu_type() const {
return gpu_type_;
}
const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
const std::string OpenCLRuntime::platform_info() const {
return platform_info_;
}
const GPUType OpenCLRuntime::ParseGPUType(
const std::string &device_name) {
constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
constexpr const char *kMaliGPUStr = "Mali";
......@@ -512,6 +577,14 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
return GPUType::UNKNOWN;
}
}
const std::string OpenCLRuntime::ParseDeviceVersion(
const std::string &device_version) {
// OpenCL Device version string format:
// OpenCL<space><major_version.minor_version><space>\
// <vendor-specific information>
auto words = Split(device_version, ' ');
return words[1];
}
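Note: per the format comment above, a CL_DEVICE_VERSION string such as "OpenCL 2.0 Adreno(TM) 540" yields "2.0". A standalone sketch of the same parse (the version string is a hypothetical sample):

#include <iostream>
#include <sstream>
#include <string>

int main() {
  // Hypothetical sample of a CL_DEVICE_VERSION string.
  const std::string device_version = "OpenCL 2.0 Adreno(TM) 540";
  std::istringstream iss(device_version);
  std::string word, version;
  iss >> word >> version;        // skip "OpenCL", keep the second word
  std::cout << version << "\n";  // prints: 2.0
  return 0;
}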
const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
return out_of_range_check_;
......
......@@ -10,6 +10,7 @@
#include <mutex> // NOLINT(build/c++11)
#include <set>
#include <string>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -55,22 +56,27 @@ class OpenCLRuntime {
public:
static OpenCLRuntime *Global();
static void Configure(GPUPerfHint, GPUPriorityHint);
static void Configure(std::shared_ptr<KVStorage> storage_engine);
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
const GPUType gpu_type() const;
const std::string platform_info() const;
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
void GetCallStats(const cl::Event &event, CallStats *stats);
uint64_t GetDeviceMaxWorkGroupSize();
uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
const bool IsNonUniformWorkgroupsSupported();
const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
const GPUType gpu_type() const;
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
const bool IsOutOfRangeCheckEnabled() const;
const GPUType ParseGPUType(const std::string &device_name);
const std::string ParseDeviceVersion(const std::string &device_version);
void SaveBuiltCLProgram();
private:
OpenCLRuntime(GPUPerfHint, GPUPriorityHint);
......@@ -82,7 +88,19 @@ class OpenCLRuntime {
const std::string &binary_file_name,
const std::string &build_options,
cl::Program *program);
std::string GenerateCLBinaryFilenamePrefix(const std::string &filename_msg);
bool BuildProgramFromBinary(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
bool BuildProgramFromCache(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
void BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
private:
// All OpenCL object must be a pointer and manually deleted before unloading
......@@ -92,13 +110,15 @@ class OpenCLRuntime {
std::shared_ptr<cl::CommandQueue> command_queue_;
std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_;
std::string kernel_path_;
GPUType gpu_type_;
std::string opencl_version_;
bool out_of_range_check_;
std::string platform_info_;
bool program_map_changed_;
std::unique_ptr<KVStorage> storage_;
static GPUPerfHint gpu_perf_hint_;
static GPUPriorityHint gpu_priority_hint_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
};
} // namespace mace
......
......@@ -2,8 +2,8 @@
load("//mace:mace.bzl", "if_openmp_enabled")
cc_binary(
name = "mace_run",
srcs = ["mace_run.cc"],
name = "example",
srcs = ["example.cc"],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
/**
* Usage:
* mace_run --model=mobi_mace.pb \
* --input=input_node \
* --output=output_node \
* --input_shape=1,224,224,3 \
* --output_shape=1,224,224,2 \
* --input_file=input_data \
* --output_file=mace.out \
* --model_data_file=model_data.data \
* --device=OPENCL
*/
#include <malloc.h>
#include <stdint.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
namespace mace {
namespace MACE_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace MACE_MODEL_TAG
} // namespace mace
namespace mace {
namespace examples {
namespace str_util {
std::vector<std::string> Split(const std::string &str, char delims) {
std::vector<std::string> result;
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
result.push_back(tmp.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
return result;
}
} // namespace str_util
void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
std::string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
std::string FormatName(const std::string input) {
std::string res = input;
for (size_t i = 0; i < input.size(); ++i) {
if (!isalnum(res[i])) res[i] = '_';
}
return res;
}
DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
return DeviceType::CPU;
}
}
DEFINE_string(input_node,
"input_node0,input_node1",
"input nodes, separated by comma");
DEFINE_string(input_shape,
"1,224,224,3:1,1,1,10",
"input shapes, separated by colon and comma");
DEFINE_string(output_node,
"output_node0,output_node1",
"output nodes, separated by comma");
DEFINE_string(output_shape,
"1,224,224,2:1,1,1,10",
"output shapes, separated by colon and comma");
DEFINE_string(input_file,
"",
"input file name | input file prefix for multiple inputs.");
DEFINE_string(output_file,
"",
"output file name | output file prefix for multiple outputs");
DEFINE_string(model_data_file,
"",
"model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, 8, "num of openmp threads");
DEFINE_int32(cpu_power_option,
0,
"0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
// load model
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime
mace::ConfigOmpThreads(FLAGS_omp_num_threads);
mace::ConfigCPUPowerOption(
static_cast<CPUPowerOption>(FLAGS_cpu_power_option));
if (device_type == DeviceType::OPENCL) {
mace::ConfigOpenCLRuntime(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
const std::string kernel_file_path =
"/data/local/tmp/mace_run/cl";
// Config internal kv storage factory.
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
ConfigKVStorageFactory(storage_factory);
// Init model
mace::MaceEngine engine(&net_def, device_type, input_names,
output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
const size_t input_count = input_names.size();
const size_t output_count = output_names.size();
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
for (size_t i = 0; i < input_count; ++i) {
// Allocate input and output
int64_t input_size =
std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
std::multiplies<int64_t>());
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(buffer_in.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return false;
}
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
}
for (size_t i = 0; i < output_count; ++i) {
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
}
LOG(INFO) << "Warm up run";
engine.Run(inputs, &outputs);
if (FLAGS_round > 0) {
LOG(INFO) << "Run model";
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(inputs, &outputs);
}
}
for (size_t i = 0; i < output_count; ++i) {
std::string output_name =
FLAGS_output_file + "_" + FormatName(output_names[i]);
std::ofstream out_file(output_name, std::ios::binary);
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
out_file.write(
reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
output_size * sizeof(float));
out_file.flush();
out_file.close();
}
return true;
}
int Main(int argc, char **argv) {
gflags::SetUsageMessage("some usage message");
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
LOG(INFO) << "input node: " << FLAGS_input_node;
LOG(INFO) << "input shape: " << FLAGS_input_shape;
LOG(INFO) << "output node: " << FLAGS_output_node;
LOG(INFO) << "output shape: " << FLAGS_output_shape;
LOG(INFO) << "input_file: " << FLAGS_input_file;
LOG(INFO) << "output_file: " << FLAGS_output_file;
LOG(INFO) << "model_data_file: " << FLAGS_model_data_file;
LOG(INFO) << "device: " << FLAGS_device;
LOG(INFO) << "round: " << FLAGS_round;
LOG(INFO) << "restart_round: " << FLAGS_restart_round;
LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint;
LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
LOG(INFO) << "cpu_power_option: " << FLAGS_cpu_power_option;
std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
std::vector<std::string> output_names =
str_util::Split(FLAGS_output_node, ',');
std::vector<std::string> input_shapes =
str_util::Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes =
str_util::Split(FLAGS_output_shape, ':');
const size_t input_count = input_shapes.size();
const size_t output_count = output_shapes.size();
std::vector<std::vector<int64_t>> input_shape_vec(input_count);
std::vector<std::vector<int64_t>> output_shape_vec(output_count);
for (size_t i = 0; i < input_count; ++i) {
ParseShape(input_shapes[i], &input_shape_vec[i]);
}
for (size_t i = 0; i < output_count; ++i) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
bool ret = false;
#pragma omp parallel for
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
ret =
RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
}
if (ret) {
return 0;
} else {
return -1;
}
}
} // namespace examples
} // namespace mace
int main(int argc, char **argv) { return mace::examples::Main(argc, argv); }
......@@ -90,18 +90,22 @@ void DoActivation(const T *input_ptr,
template <typename T>
void PReLUActivation(const T *input_ptr,
const index_t size,
const index_t outer_size,
const index_t input_chan,
const index_t inner_size,
const T *alpha_ptr,
T *output_ptr) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
const index_t chan_idx = i % input_chan;
T in = input_ptr[i];
if (in < 0) {
output_ptr[i] = in * alpha_ptr[chan_idx];
} else {
output_ptr[i] = in;
#pragma omp parallel for collapse(3)
for (index_t i = 0; i < outer_size; ++i) {
for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
for (index_t j = 0; j < inner_size; ++j) {
index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j;
if (input_ptr[idx] < 0) {
output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx];
} else {
output_ptr[idx] = input_ptr[idx];
}
}
}
}
}
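Note: the flattened index idx = i * input_chan * inner_size + chan_idx * inner_size + j lets one kernel serve both layouts; the NHWC caller below passes inner_size = 1 with outer_size = N*H*W, while the NEON/NCHW caller passes outer_size = N and inner_size = H*W. A standalone check of the indexing on a tiny hypothetical NCHW tensor:

#include <cstdint>
#include <iostream>
#include <vector>
typedef int64_t index_t;

int main() {
  // One batch, two channels, three spatial elements per channel (NCHW).
  const index_t outer = 1, chan = 2, inner = 3;
  const std::vector<float> in = {-1.f, 2.f, -3.f, 4.f, -5.f, 6.f};
  const std::vector<float> alpha = {0.1f, 0.5f};  // per-channel slopes
  std::vector<float> out(in.size());
  for (index_t i = 0; i < outer; ++i)
    for (index_t c = 0; c < chan; ++c)
      for (index_t j = 0; j < inner; ++j) {
        const index_t idx = i * chan * inner + c * inner + j;
        out[idx] = in[idx] < 0 ? in[idx] * alpha[c] : in[idx];
      }
  for (float v : out) std::cout << v << " ";  // -0.1 2 -0.3 4 -2.5 6
  return 0;
}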
......@@ -121,7 +125,9 @@ class ActivationFunctor {
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const T *alpha_ptr = alpha->data<T>();
PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
const index_t outer_size = output->dim(0) * output->dim(1)
* output->dim(2);
PReLUActivation(input_ptr, outer_size, input->dim(3), 1, alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
......
......@@ -17,7 +17,9 @@ void ActivationFunctor<DeviceType::NEON, float>::operator()(
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const float *alpha_ptr = alpha->data<float>();
PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr,
const index_t outer_size = output->dim(0);
const index_t inner_size = output->dim(2) * output->dim(3);
PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
......
......@@ -7,6 +7,7 @@
// winograd is always superior to neon impl during benchmark
#define USE_WINOGRAD 1
#define WINOGRAD_OUT_TILE_SIZE 6
namespace mace {
namespace kernels {
......@@ -162,10 +163,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1
&& stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
extra_output_height = RoundUp<index_t>(height, 2);
&& dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8) {
extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 2);
extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
......@@ -174,12 +176,15 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
pad_right += (extra_input_width - padded_input_width);
}
index_t tile_height_count = (extra_output_height + 1) / 2;
index_t tile_width_count = (extra_output_width + 1) / 2;
index_t tile_height_count = extra_output_height / WINOGRAD_OUT_TILE_SIZE;
index_t tile_width_count = extra_output_width / WINOGRAD_OUT_TILE_SIZE;
index_t tile_count = tile_height_count * tile_width_count;
transformed_input_.Resize({16, batch, input_channels, tile_count});
transformed_filter_.Resize({16, channels, input_channels});
transformed_output_.Resize({16, batch, channels, tile_count});
index_t in_tile_area =
(WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2);
transformed_input_.Resize({in_tile_area, batch, input_channels,
tile_count});
transformed_filter_.Resize({in_tile_area, channels, input_channels});
transformed_output_.Resize({in_tile_area, batch, channels, tile_count});
conv_func = [=](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
......@@ -189,6 +194,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
extra_input_width,
input_channels,
channels,
WINOGRAD_OUT_TILE_SIZE,
transformed_input_.mutable_data<float>(),
transformed_filter_.mutable_data<float>(),
transformed_output_.mutable_data<float>(),
......
This diff is collapsed.
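Note on the Winograd change above: raising WINOGRAD_OUT_TILE_SIZE from 2 to 6 means each 6x6 output tile of a 3x3 convolution consumes an 8x8 input tile, so the leading dimension of the transformed buffers grows from 16 to (6+2)^2 = 64, and the output extents are rounded up to multiples of 6 rather than 2. A standalone sketch of that bookkeeping (the 28x28 extent is a hypothetical sample):

#include <cstdint>
#include <iostream>
typedef int64_t index_t;

int main() {
  const int out_tile = 6;  // WINOGRAD_OUT_TILE_SIZE
  const int in_tile_area = (out_tile + 2) * (out_tile + 2);  // 8 * 8 = 64
  const index_t height = 28, width = 28;  // hypothetical output extents
  // RoundUp to a multiple of the output tile size, as in the diff above.
  const index_t extra_h = (height + out_tile - 1) / out_tile * out_tile;  // 30
  const index_t extra_w = (width + out_tile - 1) / out_tile * out_tile;   // 30
  const index_t tile_count = (extra_h / out_tile) * (extra_w / out_tile);  // 25
  std::cout << in_tile_area << " " << tile_count << "\n";  // 64 25
  return 0;
}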
......@@ -21,6 +21,7 @@ void WinoGradConv3x3s1(const float *input,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
const int out_tile_size,
float *output);
void WinoGradConv3x3s1(const float *input,
......@@ -30,12 +31,22 @@ void WinoGradConv3x3s1(const float *input,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
const int out_tile_size,
float *transformed_input,
float *transformed_filter,
float *transformed_output,
bool is_filter_transformed,
float *output);
void ConvRef3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *output);
} // namespace kernels
} // namespace mace
......
......@@ -58,11 +58,12 @@ TEST(ConvWinogradTest, winograd) {
in_width,
in_channels,
out_channels,
6,
output_data);
// test
for (index_t i = 0; i < output_size; ++i) {
EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1);
EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i;
}
delete[]input_data;
......
......@@ -13,22 +13,6 @@ namespace mace {
namespace kernels {
namespace {
void GemmRef(const float *A,
const float *B,
const index_t height,
const index_t K,
const index_t width,
float *C) {
memset(C, 0, sizeof(float) * height * width);
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
for (int k = 0; k < K; ++k) {
C[i * width + j] += A[i * K + k] * B[k * width + j];
}
}
}
}
inline void GemmBlock(const float *A,
const float *B,
const index_t height,
......@@ -49,8 +33,8 @@ inline void GemmBlock(const float *A,
// TODO(liyin): may need implement 883 since RGB
inline void Gemm884(const float *a_ptr,
const float *b_ptr,
index_t stride_w,
index_t stride_k,
index_t stride_w,
float *c_ptr) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14,
......@@ -136,29 +120,300 @@ inline void GemmTile(const float *A,
float *C) {
index_t h, w, k;
for (h = 0; h + 7 < height; h += 8) {
for (w = 0; w + 3 < width; w += 4) {
for (k = 0; k + 7 < K; k += 8) {
const float *a_ptr = A + (h * stride_k + k);
for (k = 0; k + 7 < K; k += 8) {
const float *a_ptr = A + (h * stride_k + k);
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#ifdef __clang__
int nw = width >> 2;
if (nw > 0) {
// load A
float32x4_t a0, a1, a2, a3, a4, a5, a6, a7,
a8, a9, a10, a11, a12, a13, a14, a15;
a0 = vld1q_f32(a_ptr);
a1 = vld1q_f32(a_ptr + 4);
a2 = vld1q_f32(a_ptr + 1 * stride_k);
a3 = vld1q_f32(a_ptr + 1 * stride_k + 4);
a4 = vld1q_f32(a_ptr + 2 * stride_k);
a5 = vld1q_f32(a_ptr + 2 * stride_k + 4);
a6 = vld1q_f32(a_ptr + 3 * stride_k);
a7 = vld1q_f32(a_ptr + 3 * stride_k + 4);
a8 = vld1q_f32(a_ptr + 4 * stride_k);
a9 = vld1q_f32(a_ptr + 4 * stride_k + 4);
a10 = vld1q_f32(a_ptr + 5 * stride_k);
a11 = vld1q_f32(a_ptr + 5 * stride_k + 4);
a12 = vld1q_f32(a_ptr + 6 * stride_k);
a13 = vld1q_f32(a_ptr + 6 * stride_k + 4);
a14 = vld1q_f32(a_ptr + 7 * stride_k);
a15 = vld1q_f32(a_ptr + 7 * stride_k + 4);
const float *b_ptr0 = B + k * stride_w;
const float *b_ptr1 = B + (k + 1) * stride_w;
const float *b_ptr2 = B + (k + 2) * stride_w;
const float *b_ptr3 = B + (k + 3) * stride_w;
const float *b_ptr4 = B + (k + 4) * stride_w;
const float *b_ptr5 = B + (k + 5) * stride_w;
const float *b_ptr6 = B + (k + 6) * stride_w;
const float *b_ptr7 = B + (k + 7) * stride_w;
float *c_ptr0 = C + h * stride_w;
float *c_ptr1 = C + (h + 1) * stride_w;
float *c_ptr2 = C + (h + 2) * stride_w;
float *c_ptr3 = C + (h + 3) * stride_w;
float *c_ptr4 = C + (h + 4) * stride_w;
float *c_ptr5 = C + (h + 5) * stride_w;
float *c_ptr6 = C + (h + 6) * stride_w;
float *c_ptr7 = C + (h + 7) * stride_w;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v24.4s}, [%1] \n"
// load b: 0-7
"prfm pldl1keep, [%9, #128] \n"
"ld1 {v16.4s}, [%9], #16 \n"
"prfm pldl1keep, [%10, #128] \n"
"ld1 {v17.4s}, [%10], #16 \n"
"prfm pldl1keep, [%11, #128] \n"
"ld1 {v18.4s}, [%11], #16 \n"
"prfm pldl1keep, [%12, #128] \n"
"ld1 {v19.4s}, [%12], #16 \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v25.4s}, [%2] \n"
"prfm pldl1keep, [%13, #128] \n"
"ld1 {v20.4s}, [%13], #16 \n"
"prfm pldl1keep, [%14, #128] \n"
"ld1 {v21.4s}, [%14], #16 \n"
"prfm pldl1keep, [%15, #128] \n"
"ld1 {v22.4s}, [%15], #16 \n"
"prfm pldl1keep, [%16, #128] \n"
"ld1 {v23.4s}, [%16], #16 \n"
"prfm pldl1keep, [%3, #128] \n"
"ld1 {v26.4s}, [%3] \n"
"fmla v24.4s, v16.4s, %34.s[0] \n"
"fmla v24.4s, v17.4s, %34.s[1] \n"
"fmla v24.4s, v18.4s, %34.s[2] \n"
"fmla v24.4s, v19.4s, %34.s[3] \n"
"fmla v24.4s, v20.4s, %35.s[0] \n"
"fmla v24.4s, v21.4s, %35.s[1] \n"
"fmla v24.4s, v22.4s, %35.s[2] \n"
"fmla v24.4s, v23.4s, %35.s[3] \n"
"st1 {v24.4s}, [%1], #16 \n"
"fmla v25.4s, v16.4s, %36.s[0] \n"
"fmla v25.4s, v17.4s, %36.s[1] \n"
"fmla v25.4s, v18.4s, %36.s[2] \n"
"fmla v25.4s, v19.4s, %36.s[3] \n"
"fmla v25.4s, v20.4s, %37.s[0] \n"
"fmla v25.4s, v21.4s, %37.s[1] \n"
"fmla v25.4s, v22.4s, %37.s[2] \n"
"fmla v25.4s, v23.4s, %37.s[3] \n"
"prfm pldl1keep, [%4, #128] \n"
"ld1 {v24.4s}, [%4] \n"
"st1 {v25.4s}, [%2], #16 \n"
"fmla v26.4s, v16.4s, %38.s[0] \n"
"fmla v26.4s, v17.4s, %38.s[1] \n"
"fmla v26.4s, v18.4s, %38.s[2] \n"
"fmla v26.4s, v19.4s, %38.s[3] \n"
"fmla v26.4s, v20.4s, %39.s[0] \n"
"fmla v26.4s, v21.4s, %39.s[1] \n"
"fmla v26.4s, v22.4s, %39.s[2] \n"
"fmla v26.4s, v23.4s, %39.s[3] \n"
"prfm pldl1keep, [%5, #128] \n"
"ld1 {v25.4s}, [%5] \n"
"st1 {v26.4s}, [%3], #16 \n"
"fmla v24.4s, v16.4s, %40.s[0] \n"
"fmla v24.4s, v17.4s, %40.s[1] \n"
"fmla v24.4s, v18.4s, %40.s[2] \n"
"fmla v24.4s, v19.4s, %40.s[3] \n"
"fmla v24.4s, v20.4s, %41.s[0] \n"
"fmla v24.4s, v21.4s, %41.s[1] \n"
"fmla v24.4s, v22.4s, %41.s[2] \n"
"fmla v24.4s, v23.4s, %41.s[3] \n"
"prfm pldl1keep, [%6, #128] \n"
"ld1 {v26.4s}, [%6] \n"
"st1 {v24.4s}, [%4], #16 \n"
"fmla v25.4s, v16.4s, %42.s[0] \n"
"fmla v25.4s, v17.4s, %42.s[1] \n"
"fmla v25.4s, v18.4s, %42.s[2] \n"
"fmla v25.4s, v19.4s, %42.s[3] \n"
"fmla v25.4s, v20.4s, %43.s[0] \n"
"fmla v25.4s, v21.4s, %43.s[1] \n"
"fmla v25.4s, v22.4s, %43.s[2] \n"
"fmla v25.4s, v23.4s, %43.s[3] \n"
"prfm pldl1keep, [%7, #128] \n"
"ld1 {v24.4s}, [%7] \n"
"st1 {v25.4s}, [%5], #16 \n"
"fmla v26.4s, v16.4s, %44.s[0] \n"
"fmla v26.4s, v17.4s, %44.s[1] \n"
"fmla v26.4s, v18.4s, %44.s[2] \n"
"fmla v26.4s, v19.4s, %44.s[3] \n"
"fmla v26.4s, v20.4s, %45.s[0] \n"
"fmla v26.4s, v21.4s, %45.s[1] \n"
"fmla v26.4s, v22.4s, %45.s[2] \n"
"fmla v26.4s, v23.4s, %45.s[3] \n"
"prfm pldl1keep, [%8, #128] \n"
"ld1 {v25.4s}, [%8] \n"
"st1 {v26.4s}, [%6], #16 \n"
"fmla v24.4s, v16.4s, %46.s[0] \n"
"fmla v24.4s, v17.4s, %46.s[1] \n"
"fmla v24.4s, v18.4s, %46.s[2] \n"
"fmla v24.4s, v19.4s, %46.s[3] \n"
"fmla v24.4s, v20.4s, %47.s[0] \n"
"fmla v24.4s, v21.4s, %47.s[1] \n"
"fmla v24.4s, v22.4s, %47.s[2] \n"
"fmla v24.4s, v23.4s, %47.s[3] \n"
"st1 {v24.4s}, [%7], #16 \n"
"fmla v25.4s, v16.4s, %48.s[0] \n"
"fmla v25.4s, v17.4s, %48.s[1] \n"
"fmla v25.4s, v18.4s, %48.s[2] \n"
"fmla v25.4s, v19.4s, %48.s[3] \n"
"fmla v25.4s, v20.4s, %49.s[0] \n"
"fmla v25.4s, v21.4s, %49.s[1] \n"
"fmla v25.4s, v22.4s, %49.s[2] \n"
"fmla v25.4s, v23.4s, %49.s[3] \n"
"st1 {v25.4s}, [%8], #16 \n"
"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nw), // 0
"=r"(c_ptr0), // 1
"=r"(c_ptr1), // 2
"=r"(c_ptr2), // 3
"=r"(c_ptr3), // 4
"=r"(c_ptr4), // 5
"=r"(c_ptr5), // 6
"=r"(c_ptr6), // 7
"=r"(c_ptr7), // 8
"=r"(b_ptr0), // 9
"=r"(b_ptr1), // 10
"=r"(b_ptr2), // 11
"=r"(b_ptr3), // 12
"=r"(b_ptr4), // 13
"=r"(b_ptr5), // 14
"=r"(b_ptr6), // 15
"=r"(b_ptr7) // 16
: "0"(nw), // 17
"1"(c_ptr0), // 18
"2"(c_ptr1), // 19
"3"(c_ptr2), // 20
"4"(c_ptr3), // 21
"5"(c_ptr4), // 22
"6"(c_ptr5), // 23
"7"(c_ptr6), // 24
"8"(c_ptr7), // 25
"9"(b_ptr0), // 26
"10"(b_ptr1), // 27
"11"(b_ptr2), // 28
"12"(b_ptr3), // 29
"13"(b_ptr4), // 30
"14"(b_ptr5), // 31
"15"(b_ptr6), // 32
"16"(b_ptr7), // 33
"w"(a0), // 34
"w"(a1), // 35
"w"(a2), // 36
"w"(a3), // 37
"w"(a4), // 38
"w"(a5), // 39
"w"(a6), // 40
"w"(a7), // 41
"w"(a8), // 42
"w"(a9), // 43
"w"(a10), // 44
"w"(a11), // 45
"w"(a12), // 46
"w"(a13), // 47
"w"(a14), // 48
"w"(a15) // 49
: "cc", "memory",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26"
);
w = (width >> 2) << 2;
}
#else // gcc
for (w = 0; w + 3 < width; w += 4) {
const float *b_ptr = B + (k * stride_w + w);
float *c_ptr = C + (h * stride_w + w);
Gemm884(a_ptr, b_ptr, stride_k, stride_w, c_ptr);
}
#endif
#else
for (w = 0; w + 3 < width; w += 4) {
const float *b_ptr = B + (k * stride_w + w);
float *c_ptr = C + (h * stride_w + w);
Gemm884(a_ptr, b_ptr, stride_w, stride_k, c_ptr);
GemmBlock(a_ptr, b_ptr, 8, 8, 4, stride_k, stride_w, c_ptr);
}
if (k < K) {
#endif
if (w < width) {
const float *a_ptr = A + (h * stride_k + k);
const float *b_ptr = B + (k * stride_w + w);
float *c_ptr = C + (h * stride_w + w);
GemmBlock(a_ptr, b_ptr, 8, K - k, 4, stride_k, stride_w, c_ptr);
GemmBlock(a_ptr, b_ptr, 8, 8, width - w, stride_k, stride_w, c_ptr);
}
}
if (w < width) {
const float *a_ptr = A + h * stride_k;
const float *b_ptr = B + w;
float *c_ptr = C + (h * stride_w + w);
if (k < K) {
const float *a_ptr = A + (h * stride_k + k);
const float *b_ptr = B + k * stride_w;
float *c_ptr = C + h * stride_w;
GemmBlock(a_ptr,
b_ptr,
8,
K,
width - w,
K - k,
width,
stride_k,
stride_w,
c_ptr);
......@@ -243,5 +498,21 @@ void Gemm(const float *A,
} // n
}
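The block above is an 8x8x4 register-tiled micro-kernel: v16-v23 hold the current B tile, the eight C rows rotate through v24-v26 with software-pipelined prefetch/load/store, and the A block is broadcast from the lanes of a0-a15. As a reading aid, a scalar sketch of the same 8x8x4 block follows; the name and the row-major stride convention are assumptions for illustration, not the exact MACE signature.
// Scalar sketch of the 8x8x4 micro-kernel the assembly implements:
// C[8x4] += A[8x8] * B[8x4], row-major with the given strides.
void Gemm884Scalar(const float *a_ptr, const float *b_ptr,
                   const int stride_k, const int stride_w, float *c_ptr) {
  for (int h = 0; h < 8; ++h) {      // one C row per v24/v25/v26 rotation
    for (int w = 0; w < 4; ++w) {    // one 4-lane NEON vector of C
      float acc = c_ptr[h * stride_w + w];   // ld1 {vN.4s}
      for (int k = 0; k < 8; ++k) {  // eight fmla steps per C vector
        acc += a_ptr[h * stride_k + k] * b_ptr[k * stride_w + w];
      }
      c_ptr[h * stride_w + w] = acc;         // st1 {vN.4s}, #16
    }
  }
}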
void GemmRef(const float *A,
const float *B,
const index_t height,
const index_t K,
const index_t width,
float *C) {
memset(C, 0, sizeof(float) * height * width);
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
for (int k = 0; k < K; ++k) {
C[i * width + j] += A[i * K + k] * B[k * width + j];
}
}
}
}
} // namespace kernels
} // namespace mace
......@@ -22,6 +22,13 @@ void Gemm(const float *A,
const index_t width,
float *C);
void GemmRef(const float *A,
const float *B,
const index_t height,
const index_t K,
const index_t width,
float *C);
} // namespace kernels
} // namespace mace
......
......@@ -31,7 +31,7 @@ TEST(GEMMTest, gemm) {
[&gen, &nd] {
return nd(gen);
});
kernels::Gemm(A, B, N, K, M, C);
kernels::Gemm(A, B, 1, N, K, M, C);
kernels::GemmRef(A, B, N, K, M, C_ref);
for (int i = 0; i < N * M; ++i) {
......
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_SLICE_H_
#include <memory>
#include <functional>
#include <vector>
#include "mace/core/future.h"
......@@ -17,20 +18,34 @@
namespace mace {
namespace kernels {
struct SliceFunctorBase {
explicit SliceFunctorBase(const int32_t axis) : axis_(axis) {}
int32_t axis_;
};
template<DeviceType D, typename T>
struct SliceFunctor {
struct SliceFunctor : SliceFunctorBase {
explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) {
const index_t outer_size = input->dim(0) * input->dim(1) * input->dim(2);
const index_t input_channels = input->dim(3);
const index_t input_channels = input->dim(axis_);
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
std::vector<T *> output_ptrs(output_list.size(), nullptr);
std::vector<index_t> output_shape(input->shape());
output_shape[axis_] = output_channels;
std::vector<index_t> output_shape({input->dim(0), input->dim(1),
input->dim(2), output_channels});
const index_t outer_size = std::accumulate(output_shape.begin(),
output_shape.begin() + axis_,
1,
std::multiplies<index_t>());
const index_t inner_size = std::accumulate(output_shape.begin() + axis_ + 1,
output_shape.end(),
1,
std::multiplies<index_t>());
for (size_t i = 0; i < outputs_count; ++i) {
output_list[i]->Resize(output_shape);
output_ptrs[i] = output_list[i]->mutable_data<T>();
......@@ -39,25 +54,27 @@ struct SliceFunctor {
#pragma omp parallel for
for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
int input_idx = outer_idx * input_channels;
int output_idx = outer_idx * output_channels;
int input_idx = outer_idx * input_channels * inner_size;
int output_idx = outer_idx * output_channels * inner_size;
for (size_t i = 0; i < outputs_count; ++i) {
if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx,
output_channels * sizeof(T));
output_channels * inner_size * sizeof(T));
} else {
for (index_t k = 0; k < output_channels; ++k) {
for (index_t k = 0; k < output_channels * inner_size; ++k) {
*(output_ptrs[i] + output_idx + k) = *(input_ptr + input_idx + k);
}
}
input_idx += output_channels;
input_idx += output_channels * inner_size;
}
}
}
};
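With the axis generalization above, a slice reduces to copying runs of output_channels * inner_size contiguous elements, outer_size times per output, where outer_size is the product of the dimensions before axis_ and inner_size the product of those after it. A self-contained sketch of that decomposition, using hypothetical NCHW shapes rather than MACE tensors:
// Outer/inner decomposition for slicing {2, 8, 4, 4} on axis 1 into 2 outputs.
#include <functional>
#include <numeric>
#include <vector>

int main() {
  const std::vector<long> shape{2, 8, 4, 4};
  const int axis = 1, outputs = 2;
  const long output_channels = shape[axis] / outputs;  // 4
  const long outer_size = std::accumulate(
      shape.begin(), shape.begin() + axis, 1L, std::multiplies<long>());  // 2
  const long inner_size = std::accumulate(
      shape.begin() + axis + 1, shape.end(), 1L, std::multiplies<long>());  // 16
  // Each output receives outer_size blocks of output_channels * inner_size
  // elements, and the input offset advances by the same amount between
  // outputs -- exactly the memcpy path above.
  return (outer_size == 2 && inner_size == 16 && output_channels == 4) ? 0 : 1;
}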
template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> {
struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future);
......
......@@ -249,14 +249,26 @@ void TestSimplePrelu() {
net.RunOp(D);
}
auto expected = CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
if (D == DeviceType::NEON) {
auto expected = CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
auto expected = CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
}
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
TEST_F(ActivationOpTest, CPUSimplePrelu) {
TestSimplePrelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, NEONSimplePrelu) {
TestSimplePrelu<DeviceType::NEON>();
}
TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::OPENCL>();
......
......@@ -24,6 +24,11 @@ void Register_Slice(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SliceOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -17,14 +17,16 @@ template <DeviceType D, typename T>
class SliceOp : public Operator<D, T> {
public:
SliceOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
bool Run(StatsFuture *future) override {
MACE_CHECK(this->OutputSize() >= 2)
<< "There must be at least two outputs for slicing";
const Tensor *input = this->Input(INPUT);
const std::vector<Tensor *> output_list = this->Outputs();
MACE_CHECK((input->dim(3) % this->OutputSize()) == 0)
const int32_t slice_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
<< "Outputs do not split input equally.";
functor_(input, output_list, future);
......
......@@ -16,7 +16,7 @@ namespace test {
class SliceOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void RandomTest(const int num_outputs) {
void RandomTest(const int num_outputs, const int axis) {
static unsigned int seed = time(NULL);
const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
const index_t input_channels = num_outputs * output_channels;
......@@ -27,7 +27,11 @@ void RandomTest(const int num_outputs) {
// Construct graph
OpsTestNet net;
std::vector<index_t> input_shape({batch, height, width, input_channels});
std::vector<index_t> input_shape;
if (axis == 1)
input_shape = {batch, input_channels, height, width};
else if (axis == 3)
input_shape = {batch, height, width, input_channels};
const index_t input_size = std::accumulate(input_shape.begin(),
input_shape.end(),
1,
......@@ -49,7 +53,7 @@ void RandomTest(const int num_outputs) {
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
auto builder = OpDefBuilder("Slice", "SliceTest");
auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis);
builder.Input("Input");
for (int i = 0; i < num_outputs; ++i) {
builder = builder.Output(MakeString("Output", i));
......@@ -70,9 +74,17 @@ void RandomTest(const int num_outputs) {
}
// Check
std::vector<index_t> expected_shape({batch, height, width, output_channels});
std::vector<index_t> expected_shape;
if (axis == 1)
expected_shape = {batch, output_channels, height, width};
else if (axis == 3)
expected_shape = {batch, height, width, output_channels};
const index_t outer_size = std::accumulate(expected_shape.begin(),
expected_shape.end() - 1,
expected_shape.begin() + axis,
1,
std::multiplies<index_t>());
const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1,
expected_shape.end(),
1,
std::multiplies<index_t>());
const float *input_ptr = input_data.data();
......@@ -83,8 +95,9 @@ void RandomTest(const int num_outputs) {
Tensor::MappingGuard output_mapper(output);
output_ptr = output->data<float>();
for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
const int idx = outer_idx * input_channels + i * output_channels;
for (int j = 0; j < output_channels; ++j) {
const int idx = (outer_idx * input_channels + i * output_channels)
* inner_size;
for (int j = 0; j < output_channels * inner_size; ++j) {
ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2) << "with output "
<< i << " index " << idx + j;
}
......@@ -93,21 +106,27 @@ void RandomTest(const int num_outputs) {
}
TEST_F(SliceOpTest, CPU) {
RandomTest<DeviceType::CPU, float>(2);
RandomTest<DeviceType::CPU, float>(4);
RandomTest<DeviceType::CPU, float>(11);
RandomTest<DeviceType::CPU, float>(2, 3);
RandomTest<DeviceType::CPU, float>(4, 3);
RandomTest<DeviceType::CPU, float>(11, 3);
}
TEST_F(SliceOpTest, CPUAxis1) {
RandomTest<DeviceType::CPU, float>(2, 1);
RandomTest<DeviceType::CPU, float>(4, 1);
RandomTest<DeviceType::CPU, float>(11, 1);
}
TEST_F(SliceOpTest, OPENCLFloat) {
RandomTest<DeviceType::OPENCL, float>(2);
RandomTest<DeviceType::OPENCL, float>(4);
RandomTest<DeviceType::OPENCL, float>(11);
RandomTest<DeviceType::OPENCL, float>(2, 3);
RandomTest<DeviceType::OPENCL, float>(4, 3);
RandomTest<DeviceType::OPENCL, float>(11, 3);
}
TEST_F(SliceOpTest, OPENCLHalf) {
RandomTest<DeviceType::OPENCL, half>(2);
RandomTest<DeviceType::OPENCL, half>(4);
RandomTest<DeviceType::OPENCL, half>(11);
RandomTest<DeviceType::OPENCL, half>(2, 3);
RandomTest<DeviceType::OPENCL, half>(4, 3);
RandomTest<DeviceType::OPENCL, half>(11, 3);
}
} // namespace test
......
......@@ -8,6 +8,11 @@
#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
#define MACE_PUBLIC_MACE_RUNTIME_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace mace {
enum GPUPerfHint {
......@@ -26,10 +31,42 @@ enum GPUPriorityHint {
enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 };
class KVStorage {
public:
// return: 0 for success, -1 for error
virtual int Load() = 0;
virtual bool Insert(const std::string &key,
const std::vector<unsigned char> &value) = 0;
virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
// return: 0 for success, -1 for error
virtual int Flush() = 0;
};
class KVStorageFactory {
public:
virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
};
class FileStorageFactory : public KVStorageFactory {
public:
explicit FileStorageFactory(const std::string &path);
~FileStorageFactory();
std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
void ConfigKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
void ConfigOmpThreads(int omp_num_threads);
void ConfigCPUPowerOption(CPUPowerOption power_option);
} // namespace mace
#endif // MACE_PUBLIC_MACE_RUNTIME_H_
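KVStorage and KVStorageFactory let applications supply their own persistence for compiled OpenCL programs instead of the file-backed FileStorageFactory. A minimal in-memory implementation sketches the expected contract; it is illustrative only and not part of MACE:
// Illustrative in-memory KVStorage; assumes only the interface above.
#include <map>
#include <string>
#include <vector>
#include "mace/public/mace_runtime.h"

class InMemoryStorage : public mace::KVStorage {
 public:
  int Load() override { return 0; }   // nothing to load; 0 means success
  bool Insert(const std::string &key,
              const std::vector<unsigned char> &value) override {
    data_[key] = value;
    return true;
  }
  const std::vector<unsigned char> *Find(const std::string &key) override {
    auto it = data_.find(key);
    return it == data_.end() ? nullptr : &it->second;
  }
  int Flush() override { return 0; }  // no backing store to flush
 private:
  std::map<std::string, std::vector<unsigned char>> data_;
};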
......@@ -25,9 +25,9 @@ def generate_cpp_source():
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
print "Generate binary from", binary_path
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8])
print size
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4])
......@@ -64,7 +64,7 @@ def parse_args():
parser.add_argument(
"--binary_dirs",
type=str,
default="cl_bin0/,cl_bin1/",
default="",
help="The binaries file path.")
parser.add_argument(
"--binary_file_name",
......
......@@ -68,14 +68,26 @@ def BlobToNPArray(blob):
class Shapes(object):
@staticmethod
def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func):
def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'):
output_shape = np.zeros_like(input_shape)
output_shape[0] = input_shape[0]
output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
- (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
- (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
output_shape[3] = filter_shape[2]
if input_format == 'NHWC':
# input format: NHWC, filter format: HWOI
output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
- (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
- (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
output_shape[3] = filter_shape[2]
elif input_format == 'NCHW':
# input format: NCHW, filter format: OIHW
output_shape[1] = filter_shape[0]
output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2]
- (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3]
- (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
else:
raise Exception("format %s is not supported" % input_format)
return output_shape
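Both branches apply the same spatial rule, out = floor((in + pad_total - k - (k - 1) * (d - 1)) / s) + 1, only on different axes: the NHWC branch reads the kernel extents from an HWOI filter, the NCHW branch from an OIHW filter. A quick check of the arithmetic (a sketch, assuming paddings holds the total padding per spatial dimension, as the converter's usage suggests):
// out = floor((in + pad - k - (k - 1) * (d - 1)) / s) + 1
#include <cmath>
#include <cstdio>

int main() {
  const int in = 224, pad = 2, k = 3, d = 1, s = 2;
  const int out = static_cast<int>(std::floor(
      (in + pad - k - (k - 1) * (d - 1)) / static_cast<float>(s))) + 1;
  std::printf("output extent: %d\n", out);  // floor(223 / 2) + 1 = 112
  return 0;
}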
@staticmethod
......@@ -93,8 +105,13 @@ class Shapes(object):
return output_shape
@staticmethod
def slice_shape(input_shape, num_output):
return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
def slice_shape(input_shape, num_output, input_format='NHWC'):
if input_format == 'NHWC':
return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
elif input_format == 'NCHW':
return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]]
else:
raise Exception("format %s is not supported" % input_format)
# outputs' name is [op.name + '_' + #]
class CaffeConverter(object):
......@@ -168,7 +185,10 @@ class CaffeConverter(object):
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = mace_type
op_def.input.extend([name+':0' for name in self.inputs_map[op.name]])
......@@ -342,7 +362,11 @@ class CaffeConverter(object):
# Add filter
weight_tensor_name = op.name + '_weight:0'
weight_data = op.data[0].transpose((2, 3, 0, 1))
if self.device == 'neon':
weight_data = op.data[0]
else:
# OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1))
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
......@@ -376,10 +400,11 @@ class CaffeConverter(object):
final_op = op
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
weight_data.shape,
paddings, strides, dilations,
math.floor)
math.floor, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 \
......@@ -399,9 +424,13 @@ class CaffeConverter(object):
self.net_def.op.extend([op_def])
def check_winograd_conv(self, op):
# TODO: support winograd conv on neon
if self.device == 'neon':
return False
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
filter_shape = filter_shape[[2, 3, 0, 1]]
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
dilations = [1, 1]
......@@ -411,17 +440,21 @@ class CaffeConverter(object):
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor)
filter_shape, paddings, strides, dilations, math.floor, input_format)
width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
return self.winograd and self.device == 'gpu' and \
filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
dilations[0] == 1 and (dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3])
return False
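For example, a 3x3, stride-1, dilation-1 convolution with NHWC output shape [1, 112, 112, 64] gives width = 1 * ((112 + 1) / 2) * ((112 + 1) / 2) = 56 * 56 = 3136 Winograd tiles (integer division), so the GPU branch additionally requires 16 * 64 and 3136 to stay below OPENCL_IMAGE_MAX_SIZE, while the NEON branch only checks for a square 3x3 kernel in the OIHW layout.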
def convert_winograd_conv(self, op):
# Add filter
......@@ -435,11 +468,13 @@ class CaffeConverter(object):
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape)
filter_shape = filter_shape[[2, 3, 0, 1]]
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor)
filter_shape, paddings, strides, [1, 1], math.floor, input_format)
# Input transform
wt_op = mace_pb2.OperatorDef()
......@@ -455,8 +490,12 @@ class CaffeConverter(object):
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
if self.device != 'neon':
wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
else:
wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
......@@ -470,7 +509,10 @@ class CaffeConverter(object):
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
if self.device != 'neon':
matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
else:
matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
......@@ -483,10 +525,10 @@ class CaffeConverter(object):
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[1]
height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[2]
width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
......@@ -589,8 +631,9 @@ class CaffeConverter(object):
weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
if self.device != 'neon':
weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
......@@ -665,9 +708,12 @@ class CaffeConverter(object):
kernel_arg.name = 'kernels'
kernel_arg.ints.extend(kernels)
filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \
if self.device != 'neon' else \
[input_shape[1], input_shape[1], kernels[0], kernels[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
paddings, strides, [1, 1], math.ceil)
paddings, strides, [1, 1], math.ceil, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
op_def.output.extend([op.name + ':0'])
......@@ -720,7 +766,7 @@ class CaffeConverter(object):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3
axis_arg.i = 3 if self.device != 'neon' else 1
try:
if op.layer.concat_param.HasField('axis'):
  axis_arg.i = op.layer.concat_param.axis
......@@ -766,13 +812,19 @@ class CaffeConverter(object):
if len(param.slice_point) > 0:
raise Exception('Mace does not support slice with slice_point')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
if (input_shape[3] % num_outputs) != 0 or \
(self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
input_channels = input_shape[axis_arg.i]
if (input_channels % num_outputs) != 0 or \
(self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)):
raise Exception('Mace does not support slice with input shape '
+ str(input_shape) + ' and number of output ' + str(num_outputs))
output_shape = Shapes.slice_shape(input_shape, num_outputs)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format)
for i in range(len(op.layer.top)):
op.output_shape_map[op.layer.top[i]] = output_shape
self.add_output_shape(op_def, output_shape)
......@@ -790,10 +842,15 @@ class CaffeConverter(object):
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
op_def = self.CommonConvert(op, 'ReOrganize')
if self.device == 'neon':
op_def = self.CommonConvert(op, 'Reshape')
else:
op_def = self.CommonConvert(op, 'ReOrganize')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
if self.device != 'neon':
shape_param = shape_param[[0, 3, 1, 2]]
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
......@@ -867,15 +924,50 @@ class CaffeConverter(object):
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)):
input_op = self.ops_map[input_nodes[i]]
input_shape = input_shapes[i] if self.device != 'neon' else \
[input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]]
if input_op.layer is not None:
input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i]
input_op.output_shape_map[input_op.layer.top[0]] = input_shape
else:
input_op.output_shape_map[input_op.name] = input_shapes[i]
input_op.output_shape_map[input_op.name] = input_shape
def add_neon_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name+':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name+':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
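The two helpers wrap the NEON graph with layout conversions: dims [0, 3, 1, 2] permutes NHWC to NCHW at the inputs, and [0, 2, 3, 1] permutes back at the outputs. A plain sketch of what such a Transpose computes (a generic permute loop, not the MACE kernel itself):
// NHWC -> NCHW, i.e. dims = {0, 3, 1, 2}: out[n][c][h][w] = in[n][h][w][c].
void TransposeNHWCToNCHW(const float *in, float *out,
                         const int n, const int h, const int w, const int c) {
  for (int ni = 0; ni < n; ++ni)
    for (int ci = 0; ci < c; ++ci)
      for (int hi = 0; hi < h; ++hi)
        for (int wi = 0; wi < w; ++wi)
          out[((ni * c + ci) * h + hi) * w + wi] =
              in[((ni * h + hi) * w + wi) * c + ci];
}
// The output transform with dims = {0, 2, 3, 1} is the inverse permutation.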
def convert(self, input_nodes, input_shapes, output_nodes):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
......@@ -924,6 +1016,9 @@ class CaffeConverter(object):
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
print 'Unresolved Op: %s with type %s' % (op.name, op.type)
......
import argparse
import os
import sys
import struct
import numpy as np
......@@ -14,30 +15,49 @@ FLAGS = None
def generate_cpp_source():
maps = {}
cl_binary_dir_arr = FLAGS.cl_binary_dirs.split(",")
for cl_binary_dir in cl_binary_dir_arr:
if not os.path.exists(cl_binary_dir):
print("Input cl_binary_dir " + cl_binary_dir + " doesn't exist!")
for file_name in os.listdir(cl_binary_dir):
file_path = os.path.join(cl_binary_dir, file_name)
if file_path[-4:] == ".bin":
# read binary
f = open(file_path, "rb")
binary_array = np.fromfile(f, dtype=np.uint8)
f.close()
maps[file_name[:-4]] = []
for ele in binary_array:
maps[file_name[:-4]].append(hex(ele))
platform_info = ''
binary_dirs = FLAGS.cl_binary_dirs.strip().split(",")
for binary_dir in binary_dirs:
binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name)
if not os.path.exists(binary_path):
continue
print 'generate opencl code from', binary_path
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
idx += key_size
value_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
maps[key] = []
value = struct.unpack(str(value_size) + "B",
binary_array[idx:idx+value_size])
idx += value_size
for ele in value:
maps[key].append(hex(ele))
cl_platform_info_path = os.path.join(binary_dir, FLAGS.platform_info_file_name)
with open(cl_platform_info_path, 'r') as f:
curr_platform_info = f.read()
if platform_info != "":
assert(curr_platform_info == platform_info)
platform_info = curr_platform_info
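The loop above decodes a simple serialized map: a uint64 entry count, then for each entry an int32 key length, the key bytes, an int32 value length, and the value bytes. A hypothetical writer for the same layout (not MACE code; assumes native byte order, matching struct.unpack's defaults here) makes the format explicit:
// Writer for the layout parsed above: uint64_t count, then per entry
// {int32_t key_size, key bytes, int32_t value_size, value bytes}.
#include <cstdint>
#include <fstream>
#include <map>
#include <string>
#include <vector>

void WriteKVBinary(const std::string &path,
                   const std::map<std::string, std::vector<uint8_t>> &kvs) {
  std::ofstream ofs(path, std::ios::binary);
  const uint64_t count = kvs.size();
  ofs.write(reinterpret_cast<const char *>(&count), sizeof(count));
  for (const auto &kv : kvs) {
    const int32_t key_size = static_cast<int32_t>(kv.first.size());
    ofs.write(reinterpret_cast<const char *>(&key_size), sizeof(key_size));
    ofs.write(kv.first.data(), key_size);
    const int32_t value_size = static_cast<int32_t>(kv.second.size());
    ofs.write(reinterpret_cast<const char *>(&value_size), sizeof(value_size));
    ofs.write(reinterpret_cast<const char *>(kv.second.data()), value_size);
  }
}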
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.jinja2').render(
return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
maps = maps,
data_type = 'unsigned char',
variable_name = 'kCompiledProgramMap'
variable_name = 'kCompiledProgramMap',
platform_info = platform_info,
)
def main(unused_args):
cpp_cl_binary_source = generate_cpp_source()
......@@ -54,7 +74,17 @@ def parse_args():
parser.add_argument(
"--cl_binary_dirs",
type=str,
default="cl_bin0/,cl_bin1/,cl_bin2/",
default="",
help="The cl binaries directories.")
parser.add_argument(
"--built_kernel_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--platform_info_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--output_path",
......
......@@ -10,7 +10,20 @@
namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap = {};
extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
{
{% for key, value in maps.iteritems() %}
{
"{{key}}",
{
{%- for ele in value -%}
{{ele}},
{%- endfor -%}
}
}, // {{key}}
{% endfor %}
};
extern const std::string kCompiledProgramPlatform = {{platform_info|tojson}};
} // namespace mace
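Rendered, the template produces a C++ translation unit of roughly this shape (key name and bytes here are hypothetical; the real values come from the compiled programs and the device's platform string):
extern const std::map<std::string, std::vector<unsigned char>>
    kCompiledProgramMap = {
  {
    "conv_2d_3x3",
    { 0x12, 0x34, 0x56, /* ... compiled program bytes ... */ }
  },  // conv_2d_3x3
};
extern const std::string kCompiledProgramPlatform =
    "QUALCOMM Adreno(TM) OpenCL 2.0";  // hypothetical platform string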
# Examples
load("//mace:mace.bzl", "if_openmp_enabled")
cc_binary(
name = "mace_run",
srcs = ["mace_run.cc"],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
"//external:gflags_nothreads",
"//mace/codegen:generated_models",
"//mace/core:core",
],
)
......@@ -16,12 +16,14 @@
*/
#include <malloc.h>
#include <stdint.h>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include "gflags/gflags.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h"
......@@ -100,6 +102,20 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
void WriteOpenCLPlatformInfo(const std::string &output_dir) {
std::string platform_info = OpenCLRuntime::Global()->platform_info();
const std::string cl_platform_info_file_name = output_dir
+ "/mace_cl_platform_info.txt";
std::ofstream ofs(cl_platform_info_file_name);
if (ofs.is_open()) {
ofs << platform_info;
ofs.close();
} else {
LOG(WARNING) << "Failed to write OpenCL platform info.";
}
}
struct mallinfo LogMallinfoChange(struct mallinfo prev) {
struct mallinfo curr = mallinfo();
if (prev.arena != curr.arena) {
......@@ -189,8 +205,8 @@ bool RunModel(const std::vector<std::string> &input_names,
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
int64_t t1 = NowMicros();
LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
int64_t init_micros = t1 - t0;
double create_net_millis = (t1 - t0) / 1000.0;
LOG(INFO) << "CreateNetDef latency: " << create_net_millis << " ms";
DeviceType device_type = ParseDeviceType(FLAGS_device);
LOG(INFO) << "Runing with device type: " << device_type;
......@@ -205,17 +221,26 @@ bool RunModel(const std::vector<std::string> &input_names,
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
const char *kernel_path = getenv("MACE_CL_PROGRAM_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/cl_program" : kernel_path);
// Init model
LOG(INFO) << "Run init";
t0 = NowMicros();
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
ConfigKVStorageFactory(storage_factory);
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
t1 = NowMicros();
init_micros += t1 - t0;
LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
LOG(INFO) << "Total init latency: " << init_micros << " us";
int64_t t2 = NowMicros();
double mace_engine_ctor_millis = (t2 - t1) / 1000.0;
double init_millis = (t2 - t0) / 1000.0;
LOG(INFO) << "MaceEngine constructor latency: "
<< mace_engine_ctor_millis << " ms";
LOG(INFO) << "Total init latency: " << init_millis << " ms";
const size_t input_count = input_names.size();
const size_t output_count = output_names.size();
......@@ -253,14 +278,16 @@ bool RunModel(const std::vector<std::string> &input_names,
}
LOG(INFO) << "Warm up run";
t0 = NowMicros();
int64_t t3 = NowMicros();
engine.Run(inputs, &outputs);
t1 = NowMicros();
LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
int64_t t4 = NowMicros();
double warmup_millis = (t4 - t3) / 1000.0;
LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms";
double model_run_millis = -1;
if (FLAGS_round > 0) {
LOG(INFO) << "Run model";
t0 = NowMicros();
int64_t t0 = NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(inputs, &outputs);
......@@ -269,10 +296,20 @@ bool RunModel(const std::vector<std::string> &input_names,
prev = LogMallinfoChange(prev);
}
}
t1 = NowMicros();
LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
int64_t t1 = NowMicros();
model_run_millis = (t1 - t0) / 1000.0 / FLAGS_round;
LOG(INFO) << "Average latency: " << model_run_millis << " ms";
}
// Metrics reporting tools depend on this format; keep it consistent
printf("================================================================\n");
printf(" create_net engine_ctor init warmup run_avg\n");
printf("================================================================\n");
printf("time %11.3f %11.3f %11.3f %11.3f %11.3f\n", create_net_millis,
mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);
WriteOpenCLPlatformInfo(kernel_file_path);
for (size_t i = 0; i < output_count; ++i) {
std::string output_name =
FLAGS_output_file + "_" + FormatName(output_names[i]);
......
......@@ -9,6 +9,7 @@
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace mace {
template <typename Integer>
......@@ -96,5 +97,20 @@ inline std::string ObfuscateSymbol(const std::string &src) {
#define MACE_OBFUSCATE_SYMBOL(str) (str)
#endif
inline std::vector<std::string> Split(const std::string &str, char delims) {
std::vector<std::string> result;
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
result.push_back(tmp.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
return result;
}
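A usage note for Split; the behavior follows directly from the loop above:
//   Split("input0,input1", ',') == {"input0", "input1"}
//   Split("", ',')              == {}      (empty input -> empty vector)
//   Split("a,", ',')            == {"a"}   (a trailing delimiter is dropped)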
} // namespace mace
#endif // MACE_UTILS_UTILS_H_
......@@ -33,10 +33,16 @@ def ops_benchmark_stdout_processor(stdout, device_properties, abi):
line = line.strip()
parts = line.split()
if len(parts) == 5 and parts[0].startswith("BM_"):
metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1000000.0)
metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1e6)
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
sh_commands.falcon_push_metrics(metrics, device_properties, abi,
platform = device_properties["ro.board.platform"].replace(" ", "-")
model = device_properties["ro.product.model"].replace(" ", "-")
tags = {"ro.board.platform": platform,
"ro.product.model": model,
"abi": abi}
sh_commands.falcon_push_metrics(metrics, tags=tags,
endpoint="mace_ops_benchmark")
def parse_args():
......@@ -88,6 +94,7 @@ def main(unused_args):
# generate sources
sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_compiled_opencl_source()
sh_commands.gen_mace_version()
for target_abi in target_abis:
......
......@@ -30,7 +30,7 @@ if [ x"$TARGET_ABI" = x"host" ]; then
--copt="-O3" \
$PRODUCTION_MODE_BUILD_FLAGS || exit 1
bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
bazel build --verbose_failures -c opt --strip always //mace/tools/validation:mace_run \
--copt="-std=c++11" \
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-Werror=return-type" \
......@@ -47,7 +47,7 @@ else
NEON_ENABLE_FLAG="--define neon=true"
fi
bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
bazel build --verbose_failures -c opt --strip always //mace/tools/validation:mace_run \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
--cpu=${TARGET_ABI} \
......@@ -64,14 +64,13 @@ else
$HEXAGON_MODE_BUILD_FLAG || exit 1
fi
if [ "$PRODUCTION_MODE" = 1 ]; then
cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
fi
rm -rf $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
if [ -f "$MODEL_OUTPUT_DIR/mace_run" ]; then
rm -rf $MODEL_OUTPUT_DIR/mace_run
fi
cp bazel-bin/mace/examples/mace_run $MODEL_OUTPUT_DIR
cp bazel-bin/mace/tools/validation/mace_run $MODEL_OUTPUT_DIR
if [ "$EMBED_MODEL_DATA" = 0 ]; then
cp mace/codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR
fi
......
......@@ -35,9 +35,9 @@ build_target()
}
if [ x"$TARGET_ABI" = x"host" ]; then
build_host_target //mace/codegen:generated_opencl_prod
build_host_target //mace/codegen:generated_opencl
build_host_target //mace/codegen:generated_tuning_params
else
build_target //mace/codegen:generated_opencl_prod
build_target //mace/codegen:generated_opencl
build_target //mace/codegen:generated_tuning_params
fi
......@@ -72,9 +72,11 @@ adb -s $DEVICE_ID </dev/null shell \
MACE_KERNEL_PATH=$KERNEL_DIR \
MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \
${PHONE_DATA_DIR}/model_throughput_test \
--input_node="${INPUT_NODES}" \
--input_shape="${INPUT_SHAPES}" \
--output_node="${OUTPUT_NODES}" \
--output_shape="${OUTPUT_SHAPES}" \
--input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME}_${INPUT_NODES} \
--input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
--cpu_model_data_file=${PHONE_DATA_DIR}/${CPU_MODEL_TAG}.data \
--gpu_model_data_file=${PHONE_DATA_DIR}/${GPU_MODEL_TAG}.data \
--dsp_model_data_file=${PHONE_DATA_DIR}/${DSP_MODEL_TAG}.data \
......
......@@ -26,4 +26,3 @@ if [ x"$TARGET_ABI" != x"host" ]; then
fi
rm -rf mace/codegen/models
git checkout -- mace/codegen/opencl/opencl_compiled_program.cc mace/codegen/tuning/tuning_params.cc
......@@ -3,12 +3,14 @@ LIBMACE_TAG=`git describe --abbrev=0 --tags`
MACE_SOURCE_DIR=`/bin/pwd`
PHONE_DATA_DIR="/data/local/tmp/mace_run"
KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
COMPILED_PROGRAM_DIR="${PHONE_DATA_DIR}/cl_program/"
CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen
MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/${MODEL_TAG}
CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl
TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning
VERSION_SOURCE_PATH=${CODEGEN_DIR}/version
CL_BUILT_KERNEL_FILE_NAME=mace_cl_compiled_program.bin
CL_PLATFORM_INFO_FILE_NAME=mace_cl_platform_info.txt
if [ -z ${EMBED_MODEL_DATA} ]; then
EMBED_MODEL_DATA=1
fi
......
#!/usr/bin/env bash
Usage() {
echo "Usage: bash tools/genenrate_opencl_code.sh type [target_soc] [cl_bin_dirs] [pull_or_not]"
}
if [ $# -lt 1 ]; then
Usage
exit 1
fi
CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh
TYPE=$1
TARGET_SOC=$2
CL_BIN_DIRS=$3
PULL_OR_NOT=$4
mkdir -p ${CL_CODEGEN_DIR}
if [ x"$TYPE" == x"source" ];then
python mace/python/tools/encrypt_opencl_codegen.py \
--cl_kernel_dir=./mace/kernels/opencl/cl/ \
--output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit 1
elif [ x"$#" == x"1" ];then
python mace/python/tools/opencl_codegen.py \
--built_kernel_file_name=${CL_BUILT_KERNEL_FILE_NAME} \
--platform_info_file_name=${CL_PLATFORM_INFO_FILE_NAME} \
--output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc || exit 1
else
RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC`
if [ $? -ne 0 ]; then
echo $RESULT_VALUE
exit 1
else
DEVICE_ID=$RESULT_VALUE
fi
if [ "$PULL_OR_NOT" = 1 ]; then
CL_BIN_DIR=${CL_BIN_DIRS}
mkdir -p ${CL_BIN_DIR}
rm -rf ${CL_BIN_DIR}/${CL_BUILT_KERNEL_FILE_NAME}
rm -rf ${CL_BIN_DIR}/${CL_PLATFORM_INFO_FILE_NAME}
if [ x"$TARGET_ABI" != x"host" ]; then
adb -s $DEVICE_ID pull ${COMPILED_PROGRAM_DIR}/. ${CL_BIN_DIR} > /dev/null
fi
fi
python mace/python/tools/opencl_codegen.py \
--built_kernel_file_name=${CL_BUILT_KERNEL_FILE_NAME} \
--platform_info_file_name=${CL_PLATFORM_INFO_FILE_NAME} \
--cl_binary_dirs=${CL_BIN_DIRS} \
--output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc || exit 1
fi
#!/bin/bash
Usage() {
echo "Usage: bash tools/generate_production_code.sh target_soc cl_bin_dirs pull_or_not"
}
if [ $# -lt 3 ]; then
Usage
exit 1
fi
CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh
TARGET_SOC=$1
CL_BIN_DIRS=$2
PULL_OR_NOT=$3
RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC`
if [ $? -ne 0 ]; then
echo $RESULT_VALUE
exit 1
else
DEVICE_ID=$RESULT_VALUE
fi
if [ "$PULL_OR_NOT" = 1 ]; then
CL_BIN_DIR=${CL_BIN_DIRS}
rm -rf ${CL_BIN_DIR}
mkdir -p ${CL_BIN_DIR}
if [ x"$TARGET_ABI" != x"host" ]; then
adb -s $DEVICE_ID pull ${KERNEL_DIR}/. ${CL_BIN_DIR} > /dev/null
adb -s $DEVICE_ID pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR} > /dev/null
fi
fi
python mace/python/tools/opencl_codegen.py \
--cl_binary_dirs=${CL_BIN_DIRS} \
--output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc
python mace/python/tools/binary_codegen.py \
--binary_dirs=${CL_BIN_DIRS} \
--binary_file_name=mace_run.config \
--output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
#!/bin/bash
CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh
mkdir -p ${TUNING_CODEGEN_DIR}
if [ "$#" -eq "0" ]; then
python mace/python/tools/binary_codegen.py \
--binary_file_name=mace_run.config \
--output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
else
TARGET_SOC=$1
BIN_DIRS=$2
PULL_OR_NOT=$3
RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC`
if [ $? -ne 0 ]; then
echo $RESULT_VALUE
exit 1
else
DEVICE_ID=$RESULT_VALUE
fi
if [ "$PULL_OR_NOT" = 1 ]; then
mkdir -p ${BIN_DIRS}
rm -rf ${BIN_DIRS}/mace_run.config
if [ x"$TARGET_ABI" != x"host" ]; then
adb -s $DEVICE_ID pull ${PHONE_DATA_DIR}/mace_run.config ${BIN_DIRS} > /dev/null
fi
fi
python mace/python/tools/binary_codegen.py \
--binary_dirs=${BIN_DIRS} \
--binary_file_name=mace_run.config \
--output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
fi
#!/bin/bash
CL_KERNEL_DIR_TAG=$1
CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh
python mace/python/tools/encrypt_opencl_codegen.py \
--cl_kernel_dir=./mace/kernels/opencl/cl/ \
--output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit 1
rm -rf ${CODEGEN_DIR}/version
mkdir ${CODEGEN_DIR}/version
bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || exit 1
......@@ -9,6 +9,7 @@
import argparse
import hashlib
import os
import sh
import shutil
import subprocess
import sys
......@@ -58,10 +59,41 @@ def get_global_runtime(configs):
return global_runtime
def generate_opencl_and_version_code():
command = "bash tools/generate_opencl_and_version_code.sh"
def generate_version_code():
command = "bash tools/generate_version_code.sh"
run_command(command)
def generate_opencl_source_code():
command = "bash tools/generate_opencl_code.sh source"
run_command(command)
def generate_opencl_binary_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs:
command = "bash tools/generate_opencl_code.sh binary"
else:
command = "bash tools/generate_opencl_code.sh {} {} {} {}".format(
'binary', target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs:
command = "bash tools/generate_tuning_param_code.sh"
else:
command = "bash tools/generate_tuning_param_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
def generate_code(target_soc, model_output_dirs, pull_or_not):
generate_opencl_binary_code(target_soc, model_output_dirs, pull_or_not)
generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not)
def clear_env(target_soc):
command = "bash tools/clear_env.sh {}".format(target_soc)
......@@ -111,18 +143,41 @@ def build_mace_run(production_mode, model_output_dir, hexagon_mode):
run_command(command)
def tuning_run(target_soc,
def tuning_run(model_name,
target_runtime,
target_abi,
target_soc,
model_output_dir,
running_round,
tuning,
production_mode,
restart_round,
option_args=''):
command = "bash tools/tuning_run.sh {} {} {} {} {} {} \"{}\"".format(
target_soc, model_output_dir, running_round, int(tuning),
int(production_mode), restart_round, option_args)
run_command(command)
# TODO(yejianwu) refactor this hackish code
stdout_buff = []
process_output = sh_commands.make_output_processor(stdout_buff)
p = sh.bash("tools/tuning_run.sh", target_soc, model_output_dir,
running_round, int(tuning),
restart_round, option_args, _out=process_output,
_bg=True, _err_to_out=True)
p.wait()
metrics = {}
for line in stdout_buff:
line = line.strip()
parts = line.split()
if len(parts) == 6 and parts[0].startswith("time"):
metrics["%s.create_net_ms" % model_name] = str(float(parts[1]))
metrics["%s.mace_engine_ctor_ms" % model_name] = str(float(parts[2]))
metrics["%s.init_ms" % model_name] = str(float(parts[3]))
metrics["%s.warmup_ms" % model_name] = str(float(parts[4]))
if float(parts[5]) > 0:
metrics["%s.avg_latency_ms" % model_name] = str(float(parts[5]))
tags = {"ro.board.platform": target_soc,
"abi": target_abi,
# "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime
"round": running_round, # TODO(yejianwu) change this to source/binary
"tuning": tuning}
sh_commands.falcon_push_metrics(metrics, endpoint="mace_model_benchmark",
tags=tags)
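For example, a run that prints "time 12.345 3.210 15.555 20.000 8.000" (hypothetical numbers) yields the metrics <model>.create_net_ms=12.345, <model>.mace_engine_ctor_ms=3.210, <model>.init_ms=15.555, <model>.warmup_ms=20.000 and <model>.avg_latency_ms=8.000; when the run count is 0 the run_avg field is printed as -1.000 and avg_latency_ms is skipped.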
def benchmark_model(target_soc, model_output_dir, option_args=''):
command = "bash tools/benchmark.sh {} {} \"{}\"".format(
......@@ -130,9 +185,10 @@ def benchmark_model(target_soc, model_output_dir, option_args=''):
run_command(command)
def run_model(target_soc, model_output_dir, running_round, restart_round,
option_args):
tuning_run(target_soc, model_output_dir, running_round, False, False,
def run_model(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, restart_round, option_args):
tuning_run(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, False,
restart_round, option_args)
......@@ -146,25 +202,28 @@ def generate_production_code(target_soc, model_output_dirs, pull_or_not):
run_command(command)
def build_mace_run_prod(target_soc, model_output_dir, tuning, global_runtime):
if "dsp" == global_runtime:
def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc,
model_output_dir, tuning):
if "dsp" == target_runtime:
hexagon_mode = True
else:
hexagon_mode = False
generate_code(target_soc, [], False)
production_or_not = False
build_mace_run(production_or_not, model_output_dir, hexagon_mode)
tuning_run(
model_name,
target_runtime,
target_abi,
target_soc,
model_output_dir,
running_round=0,
tuning=tuning,
production_mode=production_or_not,
restart_round=1)
generate_code(target_soc, [model_output_dir], True)
production_or_not = True
pull_or_not = True
generate_production_code(target_soc, [model_output_dir], pull_or_not)
build_mace_run(production_or_not, model_output_dir, hexagon_mode)
......@@ -188,8 +247,7 @@ def build_production_code():
def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs):
pull_or_not = False
generate_production_code(target_soc, model_output_dirs, pull_or_not)
generate_code(target_soc, model_output_dirs, False)
build_production_code()
model_output_dirs_str = ",".join(model_output_dirs)
......@@ -202,6 +260,26 @@ def packaging_lib_file(output_dir):
command = "bash tools/packaging_lib.sh {}".format(output_dir)
run_command(command)
def download_model_files(model_file_path,
model_output_dir,
weight_file_path=""):
if model_file_path.startswith("http://") or \
model_file_path.startswith("https://"):
os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"])
if weight_file_path.startswith("http://") or \
weight_file_path.startswith("https://"):
os.environ[
"WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
urllib.urlretrieve(weight_file_path,
os.environ["WEIGHT_FILE_PATH"])
def md5sum(s):
    md5 = hashlib.md5()
    md5.update(s)
    return md5.hexdigest()
def parse_model_configs():
with open(FLAGS.config) as f:
......@@ -268,7 +346,9 @@ def main(unused_args):
shutil.rmtree(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
os.makedirs(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
generate_opencl_and_version_code()
generate_version_code()
generate_opencl_source_code()
option_args = ' '.join([arg for arg in unused_args if arg.startswith('--')])
available_socs = sh_commands.adb_get_all_socs()
......@@ -285,6 +365,7 @@ def main(unused_args):
print("Error: devices with SoCs are not connected %s" % missing_socs)
exit(1)
for target_soc in target_socs:
for target_abi in configs["target_abis"]:
global_runtime = get_global_runtime(configs)
......@@ -292,9 +373,9 @@ def main(unused_args):
os.environ["TARGET_ABI"] = target_abi
model_output_dirs = []
for model_name in configs["models"]:
print '=======================', model_name, '======================='
# Transfer params by environment
os.environ["MODEL_TAG"] = model_name
print '=======================', model_name, '======================='
model_config = configs["models"][model_name]
input_file_list = model_config.get("validation_inputs_data", [])
for key in model_config:
......@@ -307,9 +388,8 @@ def main(unused_args):
else:
os.environ[key.upper()] = str(model_config[key])
md5 = hashlib.md5()
md5.update(model_config["model_file_path"])
model_path_digest = md5.hexdigest()
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (FLAGS.output_dir,
os.environ["PROJECT_NAME"],
"build", model_name,
......@@ -323,21 +403,8 @@ def main(unused_args):
os.makedirs(model_output_dir)
clear_env(target_soc)
# Support http:// and https://
if model_config["model_file_path"].startswith(
"http://") or model_config["model_file_path"].startswith(
"https://"):
os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
urllib.urlretrieve(model_config["model_file_path"],
os.environ["MODEL_FILE_PATH"])
if model_config["platform"] == "caffe" and (
model_config["weight_file_path"].startswith("http://") or
model_config["weight_file_path"].startswith("https://")):
os.environ[
"WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
urllib.urlretrieve(model_config["weight_file_path"],
os.environ["WEIGHT_FILE_PATH"])
download_model_files(model_config["model_file_path"],
model_output_dir, model_config.get("weight_file_path", ""))
if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\
or FLAGS.mode == "benchmark" or FLAGS.mode == "all":
......@@ -346,12 +413,13 @@ def main(unused_args):
if FLAGS.mode == "build" or FLAGS.mode == "all":
generate_model_code()
build_mace_run_prod(target_soc, model_output_dir, FLAGS.tuning,
global_runtime)
build_mace_run_prod(model_name, global_runtime, target_abi,
target_soc, model_output_dir, FLAGS.tuning)
if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all":
run_model(target_soc, model_output_dir, FLAGS.round,
FLAGS.restart_round, option_args)
run_model(model_name, global_runtime, target_abi, target_soc,
model_output_dir, FLAGS.round, FLAGS.restart_round,
option_args)
if FLAGS.mode == "benchmark":
benchmark_model(target_soc, model_output_dir, option_args)
......
......@@ -41,14 +41,13 @@ LIBMACE_TEMP_DIR=`mktemp -d -t libmace.XXXX`
echo "create ${LIBMACE_BUILD_DIR}/${TARGET_ABI}/libmace_${PROJECT_NAME}.${TARGET_SOC}.a" > ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
if [ x"$TARGET_ABI" = x"host" ]; then
echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/codegen/libgenerated_opencl.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
else
echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/codegen/libgenerated_opencl.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/codegen/libgenerated_version.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/core/libcore.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/core/libopencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/kernels/libkernels.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/utils/libutils.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "addlib bazel-bin/mace/utils/libutils_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
......
......@@ -70,22 +70,20 @@ def adb_run(serialno, host_bin_path, bin_name,
out_of_range_check=1):
host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
device_cl_path = "%s/cl" % device_bin_path
props = adb_getprop_by_serialno(serialno)
print("=====================================================================")
print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"],
props["ro.product.model"]))
sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_cl_path)
print("Push %s to %s" % (host_bin_full_path, device_bin_full_path))
sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_path)
sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path)
print("Run %s" % device_bin_full_path)
stdout_buff=[]
process_output = make_output_processor(stdout_buff)
p = sh.adb("-s", serialno, "shell",
"MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
(out_of_range_check, opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args),
"MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
(out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args),
_out=process_output, _bg=True, _err_to_out=True)
p.wait()
return "".join(stdout_buff)
......@@ -131,6 +129,7 @@ def bazel_target_to_bin(target):
################################
# TODO this should be refactored
def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python("mace/python/tools/encrypt_opencl_codegen.py",
"--cl_kernel_dir=./mace/kernels/opencl/cl/",
"--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path)
......@@ -140,24 +139,32 @@ def gen_mace_version(codegen_path="mace/codegen"):
sh.bash("mace/tools/git/gen_version_source.sh",
"%s/version/version.cc" % codegen_path)
def gen_compiled_opencl_source(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python("mace/python/tools/opencl_codegen.py",
"--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path)
################################
# falcon
################################
def falcon_tags(platform, model, abi):
return "ro.board.platform=%s,ro.product.model=%s,abi=%s" % (platform, model, abi)
def falcon_push_metrics(metrics, device_properties, abi, endpoint="mace_dev"):
def falcon_tags(tags_dict):
tags = ""
for k, v in tags_dict.iteritems():
if tags == "":
tags = "%s=%s" % (k, v)
else:
tags = tags + ",%s=%s" % (k, v)
return tags
def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}):
cli = falcon_cli.FalconCli.connect(server="transfer.falcon.miliao.srv",
port=8433,
debug=False)
platform = device_properties["ro.board.platform"].replace(" ", "-")
model = device_properties["ro.product.model"].replace(" ", "-")
tags = falcon_tags(platform, model, abi)
ts = int(time.time())
falcon_metrics = [{
"endpoint": endpoint,
"metric": key,
"tags": tags,
"tags": falcon_tags(tags),
"timestamp": ts,
"value": value,
"step": 86400,
......
#!/bin/bash
Usage() {
echo "Usage: bash tools/tuning_run.sh target_soc model_output_dir round tuning production_mode"
echo "Usage: bash tools/tuning_run.sh target_soc model_output_dir round tuning "
}
if [ $# -lt 7 ]; then
if [ $# -lt 6 ]; then
Usage
exit 1
fi
......@@ -16,9 +16,8 @@ TARGET_SOC=$1
MODEL_OUTPUT_DIR=$2
ROUND=$3
TUNING_OR_NOT=$4
PRODUCTION_MODE=$5
RESTART_ROUND=$6
OPTION_ARGS=$7
RESTART_ROUND=$5
OPTION_ARGS=$6
echo $OPTION_ARGS
......@@ -45,16 +44,14 @@ if [ x"$TARGET_ABI" = x"host" ]; then
--restart_round=1 \
$OPTION_ARGS || exit 1
else
if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != 1 ]];then
if [[ "${TUNING_OR_NOT}" != "0" ]];then
tuning_flag=1
else
tuning_flag=0
fi
adb -s $DEVICE_ID shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1
if [ "$PRODUCTION_MODE" = 0 ]; then
adb -s $DEVICE_ID shell "mkdir -p ${KERNEL_DIR}" || exit 1
fi
adb -s $DEVICE_ID shell "mkdir -p ${COMPILED_PROGRAM_DIR}" || exit 1
IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
for NAME in "${INPUT_NAMES[@]}";do
......@@ -72,7 +69,7 @@ else
MACE_TUNING=${tuning_flag} \
MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
MACE_KERNEL_PATH=$KERNEL_DIR \
MACE_CL_PROGRAM_PATH=$COMPILED_PROGRAM_DIR \
MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \
${PHONE_DATA_DIR}/mace_run \
--input_node="${INPUT_NODES}" \
......