diff --git a/.gitignore b/.gitignore index b3a51dd83dbcba3ce393e112b6df5fb02310ba4a..c78b060c4a06e8597917b39f67d90fb53f466a29 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ cmake-build-debug/ mace/codegen/models/ mace/codegen/opencl/ mace/codegen/opencl_bin/ +mace/codegen/tuning/ mace/codegen/version/ build/ diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc index bc2be7a6c61f4f7f796884cebed17d340dd2ebe1..8c3a84b3f3693d3e44e1929bd9523b1435753d05 100644 --- a/mace/benchmark/model_throughput_test.cc +++ b/mace/benchmark/model_throughput_test.cc @@ -25,6 +25,7 @@ #include "mace/public/mace.h" #include "mace/utils/env_time.h" #include "mace/utils/logging.h" +#include "mace/core/types.h" namespace mace { @@ -72,10 +73,43 @@ extern const std::string ModelChecksum(); namespace benchmark { +void Split(const std::string &str, + char delims, + std::vector *result) { + MACE_CHECK_NOTNULL(result); + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result->push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +void SplitAndParseToInts(const std::string &str, + char delims, + std::vector *result) { + MACE_CHECK_NOTNULL(result); + std::string tmp = str; + while (!tmp.empty()) { + index_t dim = atoi(tmp.data()); + result->push_back(dim); + size_t next_offset = tmp.find(delims); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; while (!tmp.empty()) { - int dim = atoi(tmp.data()); + index_t dim = atoi(tmp.data()); shape->push_back(dim); size_t next_offset = tmp.find(","); if (next_offset == std::string::npos) { @@ -86,6 +120,14 @@ void ParseShape(const std::string &str, std::vector *shape) { } } +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!::isalnum(res[i])) res[i] = '_'; + } + return res; +} + DeviceType ParseDeviceType(const std::string &device_str) { if (device_str.compare("CPU") == 0) { return DeviceType::CPU; @@ -100,6 +142,10 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +DEFINE_string(input_node, "input_node0,input_node1", + "input nodes, separated by comma"); +DEFINE_string(output_node, "output_node0,output_node1", + "output nodes, separated by comma"); DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma"); DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma"); DEFINE_string(input_file, "", "input file name"); @@ -113,7 +159,6 @@ int Main(int argc, char **argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); LOG(INFO) << "mace version: " << MaceVersion(); - LOG(INFO) << "mace git version: " << MaceGitVersion(); #ifdef MACE_CPU_MODEL_TAG LOG(INFO) << "cpu model checksum: " << mace::MACE_CPU_MODEL_TAG::ModelChecksum(); @@ -126,7 +171,9 @@ int Main(int argc, char **argv) { LOG(INFO) << "dsp model checksum: " << mace::MACE_DSP_MODEL_TAG::ModelChecksum(); #endif + LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; LOG(INFO) << "input_shape: " << FLAGS_input_shape; + LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; LOG(INFO) << "output_shape: " << FLAGS_output_shape; LOG(INFO) << "input_file: " << FLAGS_input_file; LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file; @@ -134,31 +181,63 @@ int Main(int argc, char **argv) { LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file; LOG(INFO) << "run_seconds: " << FLAGS_run_seconds; - std::vector input_shape_vec; - std::vector output_shape_vec; - ParseShape(FLAGS_input_shape, &input_shape_vec); - ParseShape(FLAGS_output_shape, &output_shape_vec); - - int64_t input_size = - std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1, - std::multiplies()); - int64_t output_size = - std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1, - std::multiplies()); - std::unique_ptr input_data(new float[input_size]); - std::unique_ptr cpu_output_data(new float[output_size]); - std::unique_ptr gpu_output_data(new float[output_size]); - std::unique_ptr dsp_output_data(new float[output_size]); - - // load input - std::ifstream in_file(FLAGS_input_file, std::ios::in | std::ios::binary); - if (in_file.is_open()) { - in_file.read(reinterpret_cast(input_data.get()), - input_size * sizeof(float)); - in_file.close(); - } else { - LOG(INFO) << "Open input file failed"; - return -1; + std::vector input_names; + std::vector output_names; + std::vector input_shapes; + std::vector output_shapes; + Split(FLAGS_input_node, ',', &input_names); + Split(FLAGS_output_node, ',', &output_names); + Split(FLAGS_input_shape, ':', &input_shapes); + Split(FLAGS_output_shape, ':', &output_shapes); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + std::map inputs; + std::map cpu_outputs; + std::map gpu_outputs; + std::map dsp_outputs; + for (size_t i = 0; i < input_count; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1, + std::multiplies()); + auto buffer_in = std::shared_ptr(new float[input_size], + std::default_delete()); + // load input + std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), + std::ios::in | std::ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(buffer_in.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(FATAL) << "Open input file failed"; + } + inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in); + } + + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shape_vec[i].begin(), + output_shape_vec[i].end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + cpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], + buffer_out); + gpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], + buffer_out); + dsp_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], + buffer_out); } int64_t t0, t1, init_micros; @@ -170,11 +249,12 @@ int Main(int argc, char **argv) { FLAGS_cpu_model_data_file.c_str()); NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data); - mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU); + mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU, input_names, + output_names); LOG(INFO) << "CPU Warm up run"; t0 = NowMicros(); - cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get()); + cpu_engine.Run(inputs, &cpu_outputs); t1 = NowMicros(); LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us"; #endif @@ -187,12 +267,13 @@ int Main(int argc, char **argv) { FLAGS_gpu_model_data_file.c_str()); NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data); - mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL); + mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names, + output_names); mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data); LOG(INFO) << "GPU Warm up run"; t0 = NowMicros(); - gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get()); + gpu_engine.Run(inputs, &gpu_outputs); t1 = NowMicros(); LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us"; #endif @@ -202,15 +283,16 @@ int Main(int argc, char **argv) { LOG(INFO) << "Load & init dsp model and warm up"; const unsigned char *dsp_model_data = mace::MACE_DSP_MODEL_TAG::LoadModelData( - FLAGS_gpu_model_data_file.c_str()); + FLAGS_dsp_model_data_file.c_str()); NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data); - mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON); + mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON, input_names, + output_names); mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data); LOG(INFO) << "DSP Warm up run"; t0 = NowMicros(); - gpu_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get()); + dsp_engine.Run(inputs, &dsp_outputs); t1 = NowMicros(); LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us"; #endif @@ -226,7 +308,7 @@ int Main(int argc, char **argv) { int64_t micros = 0; int64_t start = NowMicros(); for (; micros < run_micros; ++frames) { - cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get()); + cpu_engine.Run(inputs, &cpu_outputs); int64_t end = NowMicros(); micros = end - start; } @@ -240,7 +322,7 @@ int Main(int argc, char **argv) { int64_t micros = 0; int64_t start = NowMicros(); for (; micros < run_micros; ++frames) { - gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get()); + gpu_engine.Run(inputs, &gpu_outputs); int64_t end = NowMicros(); micros = end - start; } @@ -254,7 +336,7 @@ int Main(int argc, char **argv) { int64_t micros = 0; int64_t start = NowMicros(); for (; micros < run_micros; ++frames) { - dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get()); + dsp_engine.Run(inputs, &dsp_outputs); int64_t end = NowMicros(); micros = end - start; } diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD index b715869984d414234af5961a21cf4867bdb2601d..bc92a7bf13b871954b5f18de9e218fad7722b459 100644 --- a/mace/codegen/BUILD +++ b/mace/codegen/BUILD @@ -17,14 +17,8 @@ cc_library( ) cc_library( - name = "generated_opencl_dev", - srcs = ["opencl/opencl_encrypt_program.cc"], - linkstatic = 1, -) - -cc_library( - name = "generated_opencl_prod", - srcs = ["opencl/opencl_compiled_program.cc"], + name = "generated_opencl", + srcs = glob(["opencl/*.cc"]), linkstatic = 1, ) diff --git a/mace/codegen/opencl/opencl_compiled_program.cc b/mace/codegen/opencl/opencl_compiled_program.cc deleted file mode 100644 index 62fbad33022b92d4d72a7f407cca18c5e4073fe3..0000000000000000000000000000000000000000 --- a/mace/codegen/opencl/opencl_compiled_program.cc +++ /dev/null @@ -1,16 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -// This is a generated file, DO NOT EDIT - -#include -#include -#include - -namespace mace { - -extern const std::map> - kCompiledProgramMap = {}; - -} // namespace mace diff --git a/mace/codegen/tuning/tuning_params.cc b/mace/codegen/tuning/tuning_params.cc deleted file mode 100644 index 8a8b7132a26a150d14ff9504b13aa668fc109285..0000000000000000000000000000000000000000 --- a/mace/codegen/tuning/tuning_params.cc +++ /dev/null @@ -1,16 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -// This is a generated file, DO NOT EDIT - -#include -#include -#include - -namespace mace { - -extern const std::map> - kTuningParamsData = {}; - -} // namespace mace diff --git a/mace/core/BUILD b/mace/core/BUILD index 9a957ad1d4becc788d554aa34b58d911428d60ea..1b4cdaa8d51c054a9f8b55d5c7e4266273724f5d 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -27,8 +27,6 @@ cc_library( "runtime/hexagon/*.cc", ], exclude = [ - "runtime/opencl/opencl_production.cc", - "runtime/opencl/opencl_development.cc", "*_test.cc", "runtime/hexagon/hexagon_controller_dummy.cc", ], @@ -50,15 +48,15 @@ cc_library( ]), deps = [ ":opencl_headers", - "//mace/utils", + "//mace/codegen:generated_opencl", "//mace/codegen:generated_version", + "//mace/utils", "@half//:half", ] + if_production_mode([ + "//mace/codegen:generated_tuning_params", "//mace/utils:utils_prod", - "//mace/core:opencl_prod", ]) + if_not_production_mode([ "//mace/utils:utils_dev", - "//mace/core:opencl_dev", ]), ) @@ -90,26 +88,3 @@ cc_library( ], alwayslink = 1, ) - -cc_library( - name = "opencl_dev", - srcs = ["runtime/opencl/opencl_development.cc"], - linkstatic = 1, - deps = [ - ":opencl_headers", - "//mace/codegen:generated_opencl_dev", - "//mace/utils", - ], -) - -cc_library( - name = "opencl_prod", - srcs = ["runtime/opencl/opencl_production.cc"], - linkstatic = 1, - deps = [ - ":opencl_headers", - "//mace/codegen:generated_opencl_prod", - "//mace/codegen:generated_tuning_params", - "//mace/utils", - ], -) diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc new file mode 100644 index 0000000000000000000000000000000000000000..de9a67fa73a197d8bfe73ec27652fff2f0afd85f --- /dev/null +++ b/mace/core/file_storage.cc @@ -0,0 +1,216 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/file_storage.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + + +#include "mace/utils/logging.h" + +namespace mace { + +class FileStorageFactory::Impl { + public: + explicit Impl(const std::string &path); + + std::unique_ptr CreateStorage(const std::string &name); + + private: + std::string path_; +}; + +FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {} +std::unique_ptr FileStorageFactory::Impl::CreateStorage( + const std::string &name) { + return std::move(std::unique_ptr( + new FileStorage(path_ + "/" + name))); +} + +FileStorageFactory::FileStorageFactory(const std::string &path): + impl_(new FileStorageFactory::Impl(path)) {} + +FileStorageFactory::~FileStorageFactory() = default; + +std::unique_ptr FileStorageFactory::CreateStorage( + const std::string &name) { + return impl_->CreateStorage(name); +} + +FileStorage::FileStorage(const std::string &file_path): + file_path_(file_path) {} + +int FileStorage::Load() { + struct stat st; + if (stat(file_path_.c_str(), &st) == -1) { + if (errno == ENOENT) { + LOG(INFO) << "File " << file_path_ + << " does not exist"; + return 0; + } else { + LOG(WARNING) << "Stat file " << file_path_ + << " failed, error code: " << errno; + return -1; + } + } + int fd = open(file_path_.c_str(), O_RDONLY); + if (fd < 0) { + if (errno == ENOENT) { + LOG(INFO) << "File " << file_path_ + << " does not exist"; + return 0; + } else { + LOG(WARNING) << "open file " << file_path_ + << " failed, error code: " << errno; + return -1; + } + } + size_t file_size = st.st_size; + unsigned char *file_data = + static_cast(mmap(nullptr, file_size, PROT_READ, + MAP_PRIVATE, fd, 0)); + int res = 0; + if (file_data == MAP_FAILED) { + LOG(WARNING) << "mmap file " << file_path_ + << " failed, error code: " << errno; + + res = close(fd); + if (res != 0) { + LOG(WARNING) << "close file " << file_path_ + << " failed, error code: " << errno; + } + return -1; + } + unsigned char *file_data_ptr = file_data; + + const size_t int_size = sizeof(int32_t); + + int64_t data_size = 0; + memcpy(&data_size, file_data_ptr, sizeof(int64_t)); + file_data_ptr += sizeof(int64_t); + int32_t key_size = 0; + int32_t value_size = 0; + for (int i = 0; i < data_size; ++i) { + memcpy(&key_size, file_data_ptr, int_size); + file_data_ptr += int_size; + std::unique_ptr key(new char[key_size+1]); + memcpy(&key[0], file_data_ptr, key_size); + file_data_ptr += key_size; + key[key_size] = '\0'; + + memcpy(&value_size, file_data_ptr, int_size); + file_data_ptr += int_size; + std::vector value(value_size); + memcpy(value.data(), file_data_ptr, value_size); + file_data_ptr += value_size; + + data_.emplace(std::string(&key[0]), value); + } + + res = munmap(file_data, file_size); + if (res != 0) { + LOG(WARNING) << "munmap file " << file_path_ + << " failed, error code: " << errno; + res = close(fd); + if (res != 0) { + LOG(WARNING) << "close file " << file_path_ + << " failed, error code: " << errno; + } + return -1; + } + res = close(fd); + if (res != 0) { + LOG(WARNING) << "close file " << file_path_ + << " failed, error code: " << errno; + return -1; + } + return 0; +} + +bool FileStorage::Insert(const std::string &key, + const std::vector &value) { + data_.emplace(key, value); + return true; +} + +const std::vector *FileStorage::Find(const std::string &key) { + auto iter = data_.find(key); + if (iter == data_.end()) return nullptr; + + return &(iter->second); +} + +int FileStorage::Flush() { + int fd = open(file_path_.c_str(), O_WRONLY | O_CREAT, 0600); + if (fd < 0) { + LOG(WARNING) << "open file " << file_path_ + << " failed, error code:" << errno; + return -1; + } + + const size_t int_size = sizeof(int32_t); + + int64_t data_size = sizeof(int64_t); + for (auto &kv : data_) { + data_size += 2 * int_size + kv.first.size() + kv.second.size(); + } + std::unique_ptr buffer(new unsigned char[data_size]); + unsigned char *buffer_ptr = &buffer[0]; + + int64_t num_of_data = data_.size(); + memcpy(buffer_ptr, &num_of_data, sizeof(int64_t)); + buffer_ptr += sizeof(int64_t); + for (auto &kv : data_) { + int32_t key_size = kv.first.size(); + memcpy(buffer_ptr, &key_size, int_size); + buffer_ptr += int_size; + + memcpy(buffer_ptr, kv.first.c_str(), kv.first.size()); + buffer_ptr += kv.first.size(); + + int32_t value_size = kv.second.size(); + memcpy(buffer_ptr, &value_size, int_size); + buffer_ptr += int_size; + + memcpy(buffer_ptr, kv.second.data(), kv.second.size()); + buffer_ptr += kv.second.size(); + } + int res = 0; + buffer_ptr = &buffer[0]; + int64_t remain_size = data_size; + while (remain_size > 0) { + size_t buffer_size = std::min(remain_size, SSIZE_MAX); + res = write(fd, buffer_ptr, buffer_size); + if (res == -1) { + LOG(WARNING) << "write file " << file_path_ + << " failed, error code: " << errno; + res = close(fd); + if (res != 0) { + LOG(WARNING) << "close file " << file_path_ + << " failed, error code: " << errno; + } + return -1; + } + remain_size -= buffer_size; + buffer_ptr += buffer_size; + } + + res = close(fd); + if (res != 0) { + LOG(WARNING) << "close file " << file_path_ + << " failed, error code: " << errno; + return -1; + } + return 0; +} + +}; // namespace mace diff --git a/mace/core/file_storage.h b/mace/core/file_storage.h new file mode 100644 index 0000000000000000000000000000000000000000..c29628cdd657d7a16d494128cfee908dfdc1f18f --- /dev/null +++ b/mace/core/file_storage.h @@ -0,0 +1,34 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_CORE_FILE_STORAGE_H_ +#define MACE_CORE_FILE_STORAGE_H_ + +#include +#include +#include + +#include "mace/public/mace_runtime.h" + +namespace mace { + +class FileStorage : public KVStorage { + public: + explicit FileStorage(const std::string &file_path); + + public: + int Load() override; + bool Insert(const std::string &key, + const std::vector &value) override; + const std::vector *Find(const std::string &key) override; + int Flush() override; + + private: + std::string file_path_; + std::map> data_; +}; + +} // namespace mace + +#endif // MACE_CORE_FILE_STORAGE_H_ diff --git a/mace/core/mace.cc b/mace/core/mace.cc index 98b0190d242c2853faecaa40338630e5e7a8400e..484c9472af8e9e12124c78b054d539584e4d2801 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -4,8 +4,10 @@ #include +#include "mace/core/file_storage.h" #include "mace/core/net.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/types.h" #include "mace/public/mace.h" @@ -94,6 +96,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def, net_(nullptr), hexagon_controller_(nullptr) { LOG(INFO) << "MACE version: " << MaceVersion(); + // Set storage path for internal usage for (auto input_name : input_nodes) { ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"), GetDeviceAllocator(device_type_), DT_FLOAT); @@ -173,6 +176,7 @@ MaceStatus MaceEngine::Impl::Run( LOG(FATAL) << "Net run failed"; } } + OpenCLRuntime::Global()->SaveBuiltCLProgram(); for (auto &output : *outputs) { Tensor *output_tensor = ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0")); diff --git a/mace/core/mace_runtime.cc b/mace/core/mace_runtime.cc index da70f96f72b2e0c76d704773c34eefd2a789b5b1..5f94195370990480c0003836cbde8d2a529c70c3 100644 --- a/mace/core/mace_runtime.cc +++ b/mace/core/mace_runtime.cc @@ -8,6 +8,8 @@ namespace mace { +std::shared_ptr kStorageFactory = nullptr; + void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint @@ -15,6 +17,11 @@ void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint, OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint); } +void ConfigKVStorageFactory(std::shared_ptr storage_factory) { + VLOG(1) << "Set internal KV Storage Engine"; + kStorageFactory = storage_factory; +} + void ConfigOmpThreads(int omp_num_threads) { VLOG(1) << "Config CPU omp_num_threads: " << omp_num_threads; SetOmpThreads(omp_num_threads); diff --git a/mace/core/runtime/opencl/opencl_development.cc b/mace/core/runtime/opencl/opencl_development.cc deleted file mode 100644 index 036a97ed6f19d7b5fb499d1166f5183f99d84bc4..0000000000000000000000000000000000000000 --- a/mace/core/runtime/opencl/opencl_development.cc +++ /dev/null @@ -1,39 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include -#include -#include -#include -#include - -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/utils/logging.h" -#include "mace/utils/utils.h" - -namespace mace { - -bool GetSourceOrBinaryProgram(const std::string &program_name, - const std::string &binary_file_name_prefix, - const cl::Context &context, - const cl::Device &device, - cl::Program *program, - bool *is_binary) { - extern const std::map> - kEncryptedProgramMap; - *is_binary = false; - auto it_source = kEncryptedProgramMap.find(program_name); - if (it_source == kEncryptedProgramMap.end()) { - return false; - } - cl::Program::Sources sources; - std::string content(it_source->second.begin(), it_source->second.end()); - std::string kernel_source = ObfuscateString(content); - sources.push_back(kernel_source); - *program = cl::Program(context, sources); - - return true; -} - -} // namespace mace diff --git a/mace/core/runtime/opencl/opencl_production.cc b/mace/core/runtime/opencl/opencl_production.cc deleted file mode 100644 index f4df016f52911c9ed4c9a37cd4d7660de1621a43..0000000000000000000000000000000000000000 --- a/mace/core/runtime/opencl/opencl_production.cc +++ /dev/null @@ -1,32 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include -#include -#include -#include - -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/utils/logging.h" - -namespace mace { - -bool GetSourceOrBinaryProgram(const std::string &program_name, - const std::string &binary_file_name_prefix, - const cl::Context &context, - const cl::Device &device, - cl::Program *program, - bool *is_binary) { - extern const std::map> - kCompiledProgramMap; - *is_binary = true; - auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix); - if (it_binary == kCompiledProgramMap.end()) { - return false; - } - *program = cl::Program(context, {device}, {it_binary->second}); - return true; -} - -} // namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index a592b6fa119239ef09c293e602caa29fd66e8d23..f24333ac461e9cbff8708f4202ad0f7545e56eeb 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -10,35 +10,20 @@ #include // NOLINT(build/c++11) #include #include +#include +#include "mace/core/file_storage.h" #include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/public/mace.h" #include "mace/utils/tuner.h" namespace mace { -namespace { - -bool WriteFile(const std::string &filename, - bool binary, - const std::vector &content) { - std::ios_base::openmode mode = std::ios_base::out | std::ios_base::trunc; - if (binary) { - mode |= std::ios::binary; - } - std::ofstream ofs(filename, mode); - - ofs.write(reinterpret_cast(&content[0]), - content.size() * sizeof(char)); - ofs.close(); - if (ofs.fail()) { - LOG(ERROR) << "Failed to write to file " << filename; - return false; - } - return true; -} - -} // namespace +extern const std::map> + kCompiledProgramMap; +extern const std::string kCompiledProgramPlatform; +extern const std::map> + kEncryptedProgramMap; const std::string OpenCLErrorToString(cl_int error) { switch (error) { @@ -194,19 +179,19 @@ void OpenCLProfilingTimer::ClearTiming() { accumulated_micros_ = 0; } -GPUPerfHint OpenCLRuntime::gpu_perf_hint_ = GPUPerfHint::PERF_DEFAULT; -GPUPriorityHint OpenCLRuntime::gpu_priority_hint_ = +GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_DEFAULT; +GPUPriorityHint OpenCLRuntime::kGPUPriorityHint = GPUPriorityHint::PRIORITY_DEFAULT; OpenCLRuntime *OpenCLRuntime::Global() { - static OpenCLRuntime runtime(gpu_perf_hint_, gpu_priority_hint_); + static OpenCLRuntime runtime(kGPUPerfHint, kGPUPriorityHint); return &runtime; } void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { - OpenCLRuntime::gpu_perf_hint_ = gpu_perf_hint; - OpenCLRuntime::gpu_priority_hint_ = gpu_priority_hint; + OpenCLRuntime::kGPUPerfHint = gpu_perf_hint; + OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint; } void GetAdrenoContextProperties(std::vector *properties, @@ -250,7 +235,8 @@ void GetAdrenoContextProperties(std::vector *properties, } OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, - GPUPriorityHint gpu_priority_hint) { + GPUPriorityHint gpu_priority_hint): + storage_(nullptr) { LoadOpenCLLibrary(); std::vector all_platforms; @@ -259,9 +245,12 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, LOG(FATAL) << "No OpenCL platforms found"; } cl::Platform default_platform = all_platforms[0]; - VLOG(1) << "Using platform: " << default_platform.getInfo() - << ", " << default_platform.getInfo() << ", " - << default_platform.getInfo(); + std::stringstream ss; + ss << default_platform.getInfo() + << ", " << default_platform.getInfo() << ", " + << default_platform.getInfo(); + platform_info_ = ss.str(); + VLOG(1) << "Using platform: " << platform_info_; // get default device (CPUs, GPUs) of the default platform std::vector all_devices; @@ -278,10 +267,10 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, gpu_detected = true; const std::string device_name = device.getInfo(); - gpu_type_ = ParseGPUTypeFromDeviceName(device_name); + gpu_type_ = ParseGPUType(device_name); const std::string device_version = device.getInfo(); - opencl_version_ = device_version.substr(7, 3); + opencl_version_ = ParseDeviceVersion(device_version); VLOG(1) << "Using device: " << device_name; break; @@ -320,9 +309,19 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, &err); MACE_CHECK_CL_SUCCESS(err); - const char *kernel_path = getenv("MACE_KERNEL_PATH"); - this->kernel_path_ = - std::string(kernel_path == nullptr ? "" : kernel_path) + "/"; + this->program_map_changed_ = false; + + extern std::shared_ptr kStorageFactory; + if (kStorageFactory != nullptr) { + const std::string cl_compiled_file_name = "mace_cl_compiled_program.bin"; + storage_ = kStorageFactory->CreateStorage(cl_compiled_file_name); + + if (platform_info_ != kCompiledProgramPlatform) { + if (storage_->Load() != 0) { + LOG(FATAL) << "Load opencl compiled kernel file failed"; + } + } + } const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK"); if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1 @@ -348,45 +347,49 @@ cl::Device &OpenCLRuntime::device() { return *device_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } -std::string OpenCLRuntime::GenerateCLBinaryFilenamePrefix( - const std::string &filename_msg) { - // TODO(heliangliang) This can be long and slow, fix it - std::string filename_prefix = filename_msg; - for (auto it = filename_prefix.begin(); it != filename_prefix.end(); ++it) { - if (*it == ' ' || *it == '-' || *it == '=') { - *it = '_'; +bool OpenCLRuntime::BuildProgramFromBinary( + const std::string &built_program_key, + const std::string &build_options_str, + cl::Program *program) { + // Find from binary + if (kCompiledProgramPlatform != platform_info_) return false; + auto it_binary = kCompiledProgramMap.find(built_program_key); + if (it_binary == kCompiledProgramMap.end()) return false; + + *program = cl::Program(context(), {device()}, {it_binary->second}); + cl_int ret = program->build({device()}, build_options_str.c_str()); + if (ret != CL_SUCCESS) { + if (program->getBuildInfo(device()) == + CL_BUILD_ERROR) { + std::string build_log = + program->getBuildInfo(device()); + LOG(INFO) << "Program build log: " << build_log; } + LOG(WARNING) << "Build program " + << built_program_key << " from Binary failed:" + << (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible " + "cause 1: the MACE library is built from SoC 1 but is " + "used on different SoC 2, possible cause 2: the MACE " + "buffer is corrupted make sure your code has no " + "out-of-range memory writing" : MakeString(ret)); + return false; } - return MACE_OBFUSCATE_SYMBOL(filename_prefix); + VLOG(3) << "Program from Binary: " << built_program_key; + return true; } -extern bool GetSourceOrBinaryProgram(const std::string &program_name, - const std::string &binary_file_name_prefix, - const cl::Context &context, - const cl::Device &device, - cl::Program *program, - bool *is_opencl_binary); - -void OpenCLRuntime::BuildProgram(const std::string &program_name, - const std::string &built_program_key, - const std::string &build_options, - cl::Program *program) { - MACE_CHECK_NOTNULL(program); +bool OpenCLRuntime::BuildProgramFromCache( + const std::string &built_program_key, + const std::string &build_options_str, + cl::Program *program) { + // Find from binary + if (this->storage_ == nullptr) return false; + auto content = this->storage_->Find(built_program_key); + if (content == nullptr) { + return false; + } - std::string binary_file_name_prefix = - GenerateCLBinaryFilenamePrefix(built_program_key); - std::vector program_vec; - bool is_opencl_binary; - const bool found = - GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(), - device(), program, &is_opencl_binary); - MACE_CHECK(found, "Program not found for ", - is_opencl_binary ? "binary: " : "source: ", built_program_key); - - // Build program - std::string build_options_str = - build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math"; - // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math + *program = cl::Program(context(), {device()}, {*content}); cl_int ret = program->build({device()}, build_options_str.c_str()); if (ret != CL_SUCCESS) { if (program->getBuildInfo(device()) == @@ -395,20 +398,43 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, program->getBuildInfo(device()); LOG(INFO) << "Program build log: " << build_log; } - LOG(FATAL) << "Build program from " - << (is_opencl_binary ? "binary: " : "source: ") - << built_program_key << " failed: " - << (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible " - "cause 1: the MACE library is built from SoC 1 but is " - "used on different SoC 2, possible cause 2: the MACE " - "buffer is corrupted make sure your code has no " - "out-of-range memory writing" : MakeString(ret)); + LOG(WARNING) << "Build program " + << built_program_key << " from Cache failed:" + << MakeString(ret); + return false; } + VLOG(3) << "Program from Cache: " << built_program_key; + return true; +} + +void OpenCLRuntime::BuildProgramFromSource( + const std::string &program_name, + const std::string &built_program_key, + const std::string &build_options_str, + cl::Program *program) { + // Find from source + auto it_source = kEncryptedProgramMap.find(program_name); + if (it_source != kEncryptedProgramMap.end()) { + cl::Program::Sources sources; + std::string source(it_source->second.begin(), it_source->second.end()); + std::string kernel_source = ObfuscateString(source); + sources.push_back(kernel_source); + *program = cl::Program(context(), sources); + cl_int ret = program->build({device()}, build_options_str.c_str()); + if (ret != CL_SUCCESS) { + if (program->getBuildInfo(device()) == + CL_BUILD_ERROR) { + std::string build_log = + program->getBuildInfo(device()); + LOG(INFO) << "Program build log: " << build_log; + } + LOG(WARNING) << "Build program " + << program_name << " from source failed: " + << MakeString(ret); + return; + } - if (!is_opencl_binary) { - // Write binary if necessary - std::string binary_filename = - kernel_path_ + binary_file_name_prefix + ".bin"; + // Keep built program binary size_t device_list_size = 1; std::unique_ptr program_binary_sizes( new size_t[device_list_size]); @@ -432,7 +458,34 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, reinterpret_cast(program_binaries[0].get()) + program_binary_sizes[0]); - MACE_CHECK(WriteFile(binary_filename, true, content)); + if (this->storage_ != nullptr) { + this->storage_->Insert(built_program_key, content); + this->program_map_changed_ = true; + } + + VLOG(3) << "Program from source: " << built_program_key; + } +} + +void OpenCLRuntime::BuildProgram(const std::string &program_name, + const std::string &built_program_key, + const std::string &build_options, + cl::Program *program) { + MACE_CHECK_NOTNULL(program); + + std::string build_options_str = + build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math"; + // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math + bool ret = BuildProgramFromBinary(built_program_key, + build_options_str, program); + if (!ret) { + ret = BuildProgramFromCache(built_program_key, + build_options_str, program); + // Fallback to source. + if (!ret) { + BuildProgramFromSource(program_name, built_program_key, + build_options_str, program); + } } } @@ -459,6 +512,15 @@ cl::Kernel OpenCLRuntime::BuildKernel( return cl::Kernel(program, kernel_name.c_str()); } +void OpenCLRuntime::SaveBuiltCLProgram() { + if (program_map_changed_ && storage_ != nullptr) { + if (storage_->Flush() != 0) { + LOG(FATAL) << "Store opencl compiled kernel to file failed"; + } + program_map_changed_ = false; + } +} + void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) { if (stats != nullptr) { stats->start_micros = @@ -480,7 +542,6 @@ uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { return size; } -// TODO(liuqi): not compatible with mali gpu. uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { uint64_t size = 0; kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size); @@ -496,7 +557,11 @@ const GPUType OpenCLRuntime::gpu_type() const { return gpu_type_; } -const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName( +const std::string OpenCLRuntime::platform_info() const { + return platform_info_; +} + +const GPUType OpenCLRuntime::ParseGPUType( const std::string &device_name) { constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)"; constexpr const char *kMaliGPUStr = "Mali"; @@ -512,6 +577,14 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName( return GPUType::UNKNOWN; } } +const std::string OpenCLRuntime::ParseDeviceVersion( + const std::string &device_version) { + // OpenCL Device version string format: + // OpenCL\ + // + auto words = Split(device_version, ' '); + return words[1]; +} const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const { return out_of_range_check_; diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 8b29f145784a7b25ff7206b4a0bcbeaf1533475d..c06564908e56ba794963a8a70314760f31b5afc0 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -10,6 +10,7 @@ #include // NOLINT(build/c++11) #include #include +#include #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" @@ -55,22 +56,27 @@ class OpenCLRuntime { public: static OpenCLRuntime *Global(); static void Configure(GPUPerfHint, GPUPriorityHint); + static void Configure(std::shared_ptr storage_engine); cl::Context &context(); cl::Device &device(); cl::CommandQueue &command_queue(); + const GPUType gpu_type() const; + const std::string platform_info() const; + + cl::Kernel BuildKernel(const std::string &program_name, + const std::string &kernel_name, + const std::set &build_options); void GetCallStats(const cl::Event &event, CallStats *stats); uint64_t GetDeviceMaxWorkGroupSize(); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel); const bool IsNonUniformWorkgroupsSupported(); - const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name); - const GPUType gpu_type() const; - cl::Kernel BuildKernel(const std::string &program_name, - const std::string &kernel_name, - const std::set &build_options); const bool IsOutOfRangeCheckEnabled() const; + const GPUType ParseGPUType(const std::string &device_name); + const std::string ParseDeviceVersion(const std::string &device_version); + void SaveBuiltCLProgram(); private: OpenCLRuntime(GPUPerfHint, GPUPriorityHint); @@ -82,7 +88,19 @@ class OpenCLRuntime { const std::string &binary_file_name, const std::string &build_options, cl::Program *program); - std::string GenerateCLBinaryFilenamePrefix(const std::string &filename_msg); + bool BuildProgramFromBinary( + const std::string &built_program_key, + const std::string &build_options_str, + cl::Program *program); + bool BuildProgramFromCache( + const std::string &built_program_key, + const std::string &build_options_str, + cl::Program *program); + void BuildProgramFromSource( + const std::string &program_name, + const std::string &built_program_key, + const std::string &build_options_str, + cl::Program *program); private: // All OpenCL object must be a pointer and manually deleted before unloading @@ -92,13 +110,15 @@ class OpenCLRuntime { std::shared_ptr command_queue_; std::map built_program_map_; std::mutex program_build_mutex_; - std::string kernel_path_; GPUType gpu_type_; std::string opencl_version_; bool out_of_range_check_; + std::string platform_info_; + bool program_map_changed_; + std::unique_ptr storage_; - static GPUPerfHint gpu_perf_hint_; - static GPUPriorityHint gpu_priority_hint_; + static GPUPerfHint kGPUPerfHint; + static GPUPriorityHint kGPUPriorityHint; }; } // namespace mace diff --git a/mace/examples/BUILD b/mace/examples/BUILD index 7940c6bb67931aca077da743fbf51ee256e412a6..d572e11d319c3da42599055583ead80bd534e65e 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -2,8 +2,8 @@ load("//mace:mace.bzl", "if_openmp_enabled") cc_binary( - name = "mace_run", - srcs = ["mace_run.cc"], + name = "example", + srcs = ["example.cc"], linkopts = if_openmp_enabled(["-fopenmp"]), linkstatic = 1, deps = [ diff --git a/mace/examples/example.cc b/mace/examples/example.cc new file mode 100644 index 0000000000000000000000000000000000000000..e70025d6c171087fad4393bc3620f728517ac564 --- /dev/null +++ b/mace/examples/example.cc @@ -0,0 +1,290 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * mace_run --model=mobi_mace.pb \ + * --input=input_node \ + * --output=output_node \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --output_file=mace.out \ + * --model_data_file=model_data.data \ + * --device=OPENCL + */ +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/public/mace_runtime.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead +namespace mace { +namespace MACE_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_MODEL_TAG +} // namespace mace + +namespace mace { +namespace examples { + +namespace str_util { + +std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return result; +} + +} // namespace str_util + +void ParseShape(const std::string &str, std::vector *shape) { + std::string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!isalnum(res[i])) res[i] = '_'; + } + return res; +} + +DeviceType ParseDeviceType(const std::string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + + +DEFINE_string(input_node, + "input_node0,input_node1", + "input nodes, separated by comma"); +DEFINE_string(input_shape, + "1,224,224,3:1,1,1,10", + "input shapes, separated by colon and comma"); +DEFINE_string(output_node, + "output_node0,output_node1", + "output nodes, separated by comma"); +DEFINE_string(output_shape, + "1,224,224,2:1,1,1,10", + "output shapes, separated by colon and comma"); +DEFINE_string(input_file, + "", + "input file name | input file prefix for multiple inputs."); +DEFINE_string(output_file, + "", + "output file name | output file prefix for multiple outputs"); +DEFINE_string(model_data_file, + "", + "model data file name, used when EMBED_MODEL_DATA set to 0"); +DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); +DEFINE_int32(round, 1, "round"); +DEFINE_int32(restart_round, 1, "restart round"); +DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); +DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(omp_num_threads, 8, "num of openmp threads"); +DEFINE_int32(cpu_power_option, + 0, + "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); + +bool RunModel(const std::vector &input_names, + const std::vector> &input_shapes, + const std::vector &output_names, + const std::vector> &output_shapes) { + // load model + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + + DeviceType device_type = ParseDeviceType(FLAGS_device); + + // config runtime + mace::ConfigOmpThreads(FLAGS_omp_num_threads); + mace::ConfigCPUPowerOption( + static_cast(FLAGS_cpu_power_option)); + if (device_type == DeviceType::OPENCL) { + mace::ConfigOpenCLRuntime( + static_cast(FLAGS_gpu_perf_hint), + static_cast(FLAGS_gpu_priority_hint)); + } + + const std::string kernel_file_path = + "/data/local/tmp/mace_run/cl"; + + // Config internal kv storage factory. + std::shared_ptr storage_factory( + new FileStorageFactory(kernel_file_path)); + ConfigKVStorageFactory(storage_factory); + // Init model + mace::MaceEngine engine(&net_def, device_type, input_names, + output_names); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + + const size_t input_count = input_names.size(); + const size_t output_count = output_names.size(); + + std::map inputs; + std::map outputs; + for (size_t i = 0; i < input_count; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1, + std::multiplies()); + auto buffer_in = std::shared_ptr(new float[input_size], + std::default_delete()); + // load input + std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), + std::ios::in | std::ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(buffer_in.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + } + + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + } + + LOG(INFO) << "Warm up run"; + engine.Run(inputs, &outputs); + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(inputs, &outputs); + } + } + + for (size_t i = 0; i < output_count; ++i) { + std::string output_name = + FLAGS_output_file + "_" + FormatName(output_names[i]); + std::ofstream out_file(output_name, std::ios::binary); + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + out_file.write( + reinterpret_cast(outputs[output_names[i]].data().get()), + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + } + + return true; +} + +int Main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum(); + LOG(INFO) << "input node: " << FLAGS_input_node; + LOG(INFO) << "input shape: " << FLAGS_input_shape; + LOG(INFO) << "output node: " << FLAGS_output_node; + LOG(INFO) << "output shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "output_file: " << FLAGS_output_file; + LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; + LOG(INFO) << "device: " << FLAGS_device; + LOG(INFO) << "round: " << FLAGS_round; + LOG(INFO) << "restart_round: " << FLAGS_restart_round; + LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint; + LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; + LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; + LOG(INFO) << "cpu_power_option: " << FLAGS_cpu_power_option; + + std::vector input_names = str_util::Split(FLAGS_input_node, ','); + std::vector output_names = + str_util::Split(FLAGS_output_node, ','); + std::vector input_shapes = + str_util::Split(FLAGS_input_shape, ':'); + std::vector output_shapes = + str_util::Split(FLAGS_output_shape, ':'); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + bool ret; +#pragma omp parallel for + for (int i = 0; i < FLAGS_restart_round; ++i) { + VLOG(0) << "restart round " << i; + ret = + RunModel(input_names, input_shape_vec, output_names, output_shape_vec); + } + if (ret) { + return 0; + } else { + return -1; + } +} + +} // namespace examples +} // namespace mace + +int main(int argc, char **argv) { mace::examples::Main(argc, argv); } diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 67b01458bdabb60dd56797a4912a18f6386944ae..e62181bc3ae743e2662b3a364537de6e1d8ed88d 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -90,18 +90,22 @@ void DoActivation(const T *input_ptr, template void PReLUActivation(const T *input_ptr, - const index_t size, + const index_t outer_size, const index_t input_chan, + const index_t inner_size, const T *alpha_ptr, T *output_ptr) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - const index_t chan_idx = i % input_chan; - T in = input_ptr[i]; - if (in < 0) { - output_ptr[i] = in * alpha_ptr[chan_idx]; - } else { - output_ptr[i] = in; +#pragma omp parallel for collapse(3) + for (index_t i = 0; i < outer_size; ++i) { + for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) { + for (index_t j = 0; j < inner_size; ++j) { + index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j; + if (input_ptr[idx] < 0) { + output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx]; + } else { + output_ptr[idx] = input_ptr[idx]; + } + } } } } @@ -121,7 +125,9 @@ class ActivationFunctor { if (activation_ == PRELU) { MACE_CHECK_NOTNULL(alpha); const T *alpha_ptr = alpha->data(); - PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, + const index_t outer_size = output->dim(0) * output->dim(1) + * output->dim(2); + PReLUActivation(input_ptr, outer_size, input->dim(3), 1, alpha_ptr, output_ptr); } else { DoActivation(input_ptr, output_ptr, output->size(), activation_, diff --git a/mace/kernels/arm/activation.cc b/mace/kernels/arm/activation.cc index 30f21e03c5333292fd1e44aa71684a2e2cfcbfc7..59b94ffac8a6c1d84d580253d7694cd54fe6ffc3 100644 --- a/mace/kernels/arm/activation.cc +++ b/mace/kernels/arm/activation.cc @@ -17,7 +17,9 @@ void ActivationFunctor::operator()( if (activation_ == PRELU) { MACE_CHECK_NOTNULL(alpha); const float *alpha_ptr = alpha->data(); - PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr, + const index_t outer_size = output->dim(0); + const index_t inner_size = output->dim(2) * output->dim(3); + PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr, output_ptr); } else { DoActivation(input_ptr, output_ptr, output->size(), activation_, diff --git a/mace/kernels/arm/conv_2d.cc b/mace/kernels/arm/conv_2d.cc index e50cac08bd1f4d7d7693079d0f47f35962ff2e10..7fc16cda27b4c3c93d491aa9caf6372247df6e96 100644 --- a/mace/kernels/arm/conv_2d.cc +++ b/mace/kernels/arm/conv_2d.cc @@ -7,6 +7,7 @@ // winograd is always superior to neon impl during benchmark #define USE_WINOGRAD 1 +#define WINOGRAD_OUT_TILE_SIZE 6 namespace mace { namespace kernels { @@ -162,10 +163,11 @@ void Conv2dFunctor::operator()(const Tensor *input, if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - extra_output_height = RoundUp(height, 2); + && dilation_h == 1 && dilation_w == 1 + && input_channels >= 8 && channels >= 8) { + extra_output_height = RoundUp(height, WINOGRAD_OUT_TILE_SIZE); extra_input_height = std::max(padded_input_height, extra_output_height + 2); - extra_output_width = RoundUp(width, 2); + extra_output_width = RoundUp(width, WINOGRAD_OUT_TILE_SIZE); extra_input_width = std::max(padded_input_width, extra_output_width + 2); if (extra_input_height != padded_input_height) { pad_bottom += (extra_input_height - padded_input_height); @@ -174,12 +176,15 @@ void Conv2dFunctor::operator()(const Tensor *input, pad_right += (extra_input_width - padded_input_width); } - index_t tile_height_count = (extra_output_height + 1) / 2; - index_t tile_width_count = (extra_output_width + 1) / 2; + index_t tile_height_count = extra_output_height / WINOGRAD_OUT_TILE_SIZE; + index_t tile_width_count = extra_output_width / WINOGRAD_OUT_TILE_SIZE; index_t tile_count = tile_height_count * tile_width_count; - transformed_input_.Resize({16, batch, input_channels, tile_count}); - transformed_filter_.Resize({16, channels, input_channels}); - transformed_output_.Resize({16, batch, channels, tile_count}); + index_t in_tile_area = + (WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2); + transformed_input_.Resize({in_tile_area, batch, input_channels, + tile_count}); + transformed_filter_.Resize({in_tile_area, channels, input_channels}); + transformed_output_.Resize({in_tile_area, batch, channels, tile_count}); conv_func = [=](const float *pad_input, float *pad_output) { WinoGradConv3x3s1(pad_input, @@ -189,6 +194,7 @@ void Conv2dFunctor::operator()(const Tensor *input, extra_input_width, input_channels, channels, + WINOGRAD_OUT_TILE_SIZE, transformed_input_.mutable_data(), transformed_filter_.mutable_data(), transformed_output_.mutable_data(), diff --git a/mace/kernels/arm/conv_winograd.cc b/mace/kernels/arm/conv_winograd.cc index c0509689f832d702ed23afdbd29c5b53abf42773..272e3c5227645ff228113c6d226765617aee724b 100644 --- a/mace/kernels/arm/conv_winograd.cc +++ b/mace/kernels/arm/conv_winograd.cc @@ -8,19 +8,20 @@ #include "mace/kernels/arm/conv_winograd.h" #include "mace/kernels/gemm.h" #include "mace/utils/utils.h" +#include "mace/utils/logging.h" namespace mace { namespace kernels { namespace { // NCHW => TNCB (T: in tile pixels, B: tile indices) -void TransformInput(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t tile_count, - float *output) { +void TransformInput4x4(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { const index_t stride = batch * in_channels * tile_count; const index_t in_height_width = in_height * in_width; @@ -101,12 +102,124 @@ void TransformInput(const float *input, } } +// NCHW => TNCB (T: in tile pixels, B: tile indices) +/** + * BT = +⎡1 0 -21/4 0 21/4 0 -1 0⎤ +⎢ ⎥ +⎢0 1 1 -17/4 -17/4 1 1 0⎥ +⎢ ⎥ +⎢0 -1 1 17/4 -17/4 -1 1 0⎥ +⎢ ⎥ +⎢0 1/2 1/4 -5/2 -5/4 2 1 0⎥ +⎢ ⎥ +⎢0 -1/2 1/4 5/2 -5/4 -2 1 0⎥ +⎢ ⎥ +⎢0 2 4 -5/2 -5 1/2 1 0⎥ +⎢ ⎥ +⎢0 -2 4 5/2 -5 -1/2 1 0⎥ +⎢ ⎥ +⎣0 -1 0 21/4 0 -21/4 0 1⎦ + + * @param input + * @param batch + * @param in_height + * @param in_width + * @param in_channels + * @param tile_count + * @param output + */ +void TransformInput8x8(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { + const index_t stride = batch * in_channels * tile_count; + const index_t in_height_width = in_height * in_width; + +#pragma omp parallel for + for (index_t nc = 0; nc < batch * in_channels; ++nc) { + index_t tile_index = nc * tile_count; + float s[8][8]; + for (index_t h = 0; h < in_height - 2; h += 6) { + for (index_t w = 0; w < in_width - 2; w += 6) { + index_t tile_offset = nc * in_height_width + h * in_width + w; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = input[tile_offset]; + d1 = input[tile_offset + 1]; + d2 = input[tile_offset + 2]; + d3 = input[tile_offset + 3]; + d4 = input[tile_offset + 4]; + d5 = input[tile_offset + 5]; + d6 = input[tile_offset + 6]; + d7 = input[tile_offset + 7]; + + s[i][0] = d0 - d6 + (d4 - d2) * 5.25; + s[i][7] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + s[i][1] = u + v; + s[i][2] = u - v; + + u = d6 + d2 * 0.25 - d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + s[i][3] = u + v; + s[i][4] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + s[i][5] = u + v; + s[i][6] = u - v; + + tile_offset += in_width; + } + + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + output[tile_index + i * stride] = d0 - d6 + (d4 - d2) * 5.25; + output[tile_index + (56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + output[tile_index + (8 + i) * stride] = u + v; + output[tile_index + (16 + i) * stride] = u - v; + + u = d6 + d2 * 0.25 - d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + output[tile_index + (24 + i) * stride] = u + v; + output[tile_index + (32 + i) * stride] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + output[tile_index + (40 + i) * stride] = u + v; + output[tile_index + (48 + i) * stride] = u - v; + } + + ++tile_index; + } + } + } +} + // OCHW => TOC // no need to optimize, it will exist in converter -void TransformFilter(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output) { +void TransformFilter4x4(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { const index_t stride = out_channels * in_channels; #pragma omp parallel for collapse(2) @@ -171,6 +284,83 @@ void TransformFilter(const float *filter, } } +// OCHW => TOC +// no need to optimize, it will exist in converter +/** + * G = +⎡ 1 0 0 ⎤ +⎢ ⎥ +⎢-2/9 -2/9 -2/9 ⎥ +⎢ ⎥ +⎢-2/9 2/9 -2/9 ⎥ +⎢ ⎥ +⎢1/90 1/45 2/45 ⎥ +⎢ ⎥ +⎢1/90 -1/45 2/45 ⎥ +⎢ ⎥ +⎢1/45 1/90 1/180⎥ +⎢ ⎥ +⎢1/45 -1/90 1/180⎥ +⎢ ⎥ +⎣ 0 0 1 ⎦ + * + * @param filter + * @param in_channels + * @param out_channels + * @param output + */ +void TransformFilter8x8(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { + const index_t stride = out_channels * in_channels; + + const float G[8][3] = { + {1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f} + }; + +#pragma omp parallel for collapse(2) + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; c < in_channels; ++c) { + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + float s[3][8]; + for (int i = 0; i < 8; ++i) { + s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; + s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; + s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; + } + + // store output + index_t output_offset = m * in_channels + c; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + output[output_offset + (i * 8 + j) * stride] = + G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + } + } + } + } +} + // TOC * TNCB => TNOB void BatchGemm(const float *input, const float *filter, @@ -178,17 +368,24 @@ void BatchGemm(const float *input, index_t in_channels, index_t out_channels, index_t tile_count, + int out_tile_size, float *output) { const index_t in_stride = batch * in_channels * tile_count; const index_t in_channels_tile_count = in_channels * tile_count; const index_t filter_stride = out_channels * in_channels; const index_t out_stride = batch * out_channels * tile_count; const index_t out_channels_tile_count = out_channels * tile_count; - + const int in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); if (batch == 1) { - Gemm(filter, input, 16, out_channels, in_channels, tile_count, output); + Gemm(filter, + input, + in_tile_area, + out_channels, + in_channels, + tile_count, + output); } else { - for (int i = 0; i < 16; ++i) { + for (int i = 0; i < in_tile_area; ++i) { for (int b = 0; b < batch; ++b) { const float *in_ptr = input + i * in_stride + b * in_channels_tile_count; @@ -207,13 +404,13 @@ void BatchGemm(const float *input, } // TNOB => ToNOB => NOHoWo -void TransformOutput(const float *input, - index_t batch, - index_t out_height, - index_t out_width, - index_t out_channels, - index_t tile_count, - float *output) { +void TransformOutput4x4(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { const index_t in_stride = batch * out_channels * tile_count; #pragma omp parallel for @@ -272,39 +469,102 @@ void TransformOutput(const float *input, } } -void ConvRef3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - float *output) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; +// TNOB => ToNOB => NOHoWo +/** + * AT = +⎡1 1 1 1 1 32 32 0⎤ +⎢ ⎥ +⎢0 1 -1 2 -2 16 -16 0⎥ +⎢ ⎥ +⎢0 1 1 4 4 8 8 0⎥ +⎢ ⎥ +⎢0 1 -1 8 -8 4 -4 0⎥ +⎢ ⎥ +⎢0 1 1 16 16 2 2 0⎥ +⎢ ⎥ +⎣0 1 -1 32 -32 1 -1 1⎦ + * + * @param input + * @param batch + * @param out_height + * @param out_width + * @param out_channels + * @param tile_count + * @param output + */ +void TransformOutput8x8(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { + const index_t in_stride = batch * out_channels * tile_count; -#pragma omp parallel for collapse(4) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - index_t out_offset = - ((b * out_channels + m) * out_height + h) * out_width + w; - output[out_offset] = 0; - for (index_t c = 0; c < in_channels; ++c) { - for (index_t kh = 0; kh < 3; ++kh) { - for (index_t kw = 0; kw < 3; ++kw) { - index_t ih = h + kh; - index_t iw = w + kw; - index_t in_offset = - ((b * in_channels + c) * in_height + ih) * in_width + iw; - index_t - filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw; - output[out_offset] += input[in_offset] * filter[filter_offset]; - } - } - } +#pragma omp parallel for + for (index_t nm = 0; nm < batch * out_channels; ++nm) { + index_t tile_offset = nm * tile_count; + float s[8][6]; + for (index_t h = 0; h < out_height; h += 6) { + for (index_t w = 0; w < out_width; w += 6) { + index_t tile_offset_tmp = tile_offset; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = input[tile_offset_tmp + 0 * in_stride]; + d1 = input[tile_offset_tmp + 1 * in_stride]; + d2 = input[tile_offset_tmp + 2 * in_stride]; + d3 = input[tile_offset_tmp + 3 * in_stride]; + d4 = input[tile_offset_tmp + 4 * in_stride]; + d5 = input[tile_offset_tmp + 5 * in_stride]; + d6 = input[tile_offset_tmp + 6 * in_stride]; + d7 = input[tile_offset_tmp + 7 * in_stride]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + s[i][0] = d0 + u + w + y * 32; + s[i][1] = v + x + x + z * 16; + s[i][2] = u + w * 4 + y * 8; + s[i][3] = v + x * 8 + z * 4; + s[i][4] = u + w * 16 + y + y; + s[i][5] = v + x * 32 + z + d7; + + tile_offset_tmp += 8 * in_stride; + } + + index_t out_offset = nm * out_height * out_width + h * out_width + w; + + for (int i = 0; i < 6; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + output[out_offset + 0 * out_width + i] = d0 + u + w + y * 32; + output[out_offset + 1 * out_width + i] = v + x + x + z * 16; + output[out_offset + 2 * out_width + i] = u + w * 4 + y * 8; + output[out_offset + 3 * out_width + i] = v + x * 8 + z * 4; + output[out_offset + 4 * out_width + i] = u + w * 16 + y + y; + output[out_offset + 5 * out_width + i] = v + x * 32 + z + d7; } + + ++tile_offset; } } } @@ -318,6 +578,7 @@ void WinoGradConv3x3s1(const float *input, const index_t in_width, const index_t in_channels, const index_t out_channels, + const int out_tile_size, float *transformed_input, float *transformed_filter, float *transformed_output, @@ -325,22 +586,52 @@ void WinoGradConv3x3s1(const float *input, float *output) { index_t out_height = in_height - 2; index_t out_width = in_width - 2; - index_t tile_height_count = (out_height + 1) / 2; - index_t tile_width_count = (out_width + 1) / 2; + index_t tile_height_count = + RoundUpDiv(out_height, static_cast(out_tile_size)); + index_t tile_width_count = + RoundUpDiv(out_width, static_cast(out_tile_size)); index_t tile_count = tile_height_count * tile_width_count; - TransformInput(input, - batch, - in_height, - in_width, - in_channels, - tile_count, - transformed_input); + switch (out_tile_size) { + case 2: + TransformInput4x4(input, + batch, + in_height, + in_width, + in_channels, + tile_count, + transformed_input); + break; + case 6: + TransformInput8x8(input, + batch, + in_height, + in_width, + in_channels, + tile_count, + transformed_input); + break; + default:MACE_NOT_IMPLEMENTED; + } // TODO(liyin): put it in model converter, but do not worry, it is fast and // will only do once if (!is_filter_transformed) { - TransformFilter(filter, in_channels, out_channels, transformed_filter); + switch (out_tile_size) { + case 2: + TransformFilter4x4(filter, + in_channels, + out_channels, + transformed_filter); + break; + case 6: + TransformFilter8x8(filter, + in_channels, + out_channels, + transformed_filter); + break; + default:MACE_NOT_IMPLEMENTED; + } } BatchGemm(transformed_input, @@ -349,15 +640,30 @@ void WinoGradConv3x3s1(const float *input, in_channels, out_channels, tile_count, + out_tile_size, transformed_output); - TransformOutput(transformed_output, - batch, - out_height, - out_width, - out_channels, - tile_count, - output); + switch (out_tile_size) { + case 2: + TransformOutput4x4(transformed_output, + batch, + out_height, + out_width, + out_channels, + tile_count, + output); + break; + case 6: + TransformOutput8x8(transformed_output, + batch, + out_height, + out_width, + out_channels, + tile_count, + output); + break; + default:MACE_NOT_IMPLEMENTED; + } } void WinoGradConv3x3s1(const float *input, @@ -367,16 +673,21 @@ void WinoGradConv3x3s1(const float *input, const index_t in_width, const index_t in_channels, const index_t out_channels, + const int out_tile_size, float *output) { index_t out_height = in_height - 2; index_t out_width = in_width - 2; - index_t tile_height_count = (out_height + 1) / 2; - index_t tile_width_count = (out_width + 1) / 2; + index_t tile_height_count = + RoundUpDiv(out_height, static_cast(out_tile_size)); + index_t tile_width_count = + RoundUpDiv(out_width, static_cast(out_tile_size)); index_t tile_count = tile_height_count * tile_width_count; - - index_t transformed_input_size = 16 * batch * in_channels * tile_count; - index_t transformed_filter_size = 16 * out_channels * in_channels; - index_t transformed_output_size = 16 * batch * out_channels * tile_count; + index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); + index_t transformed_input_size = + in_tile_area * batch * in_channels * tile_count; + index_t transformed_filter_size = in_tile_area * out_channels * in_channels; + index_t + transformed_output_size = in_tile_area * batch * out_channels * tile_count; float *transformed_input = new float[transformed_input_size]; // TNCB float *transformed_filter = new float[transformed_filter_size]; // TOC @@ -389,6 +700,7 @@ void WinoGradConv3x3s1(const float *input, in_width, in_channels, out_channels, + out_tile_size, transformed_input, transformed_filter, transformed_output, @@ -400,5 +712,44 @@ void WinoGradConv3x3s1(const float *input, delete[]transformed_output; } +void ConvRef3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_channels, + float *output) { + index_t out_height = in_height - 2; + index_t out_width = in_width - 2; + +#pragma omp parallel for collapse(4) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + index_t out_offset = + ((b * out_channels + m) * out_height + h) * out_width + w; + output[out_offset] = 0; + for (index_t c = 0; c < in_channels; ++c) { + for (index_t kh = 0; kh < 3; ++kh) { + for (index_t kw = 0; kw < 3; ++kw) { + index_t ih = h + kh; + index_t iw = w + kw; + index_t in_offset = + ((b * in_channels + c) * in_height + ih) * in_width + iw; + index_t + filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw; + output[out_offset] += + input[in_offset] * filter[filter_offset]; + } + } + } + } + } + } + } +} + } // namespace kernels } // namespace mace diff --git a/mace/kernels/arm/conv_winograd.h b/mace/kernels/arm/conv_winograd.h index 7611d65ae5e2a57b4542df40bc4d6bef3d04538d..d058a29c159f07ddb92a984f6e93ba38926c3f62 100644 --- a/mace/kernels/arm/conv_winograd.h +++ b/mace/kernels/arm/conv_winograd.h @@ -21,6 +21,7 @@ void WinoGradConv3x3s1(const float *input, const index_t in_width, const index_t in_channels, const index_t out_channels, + const int out_tile_size, float *output); void WinoGradConv3x3s1(const float *input, @@ -30,12 +31,22 @@ void WinoGradConv3x3s1(const float *input, const index_t in_width, const index_t in_channels, const index_t out_channels, + const int out_tile_size, float *transformed_input, float *transformed_filter, float *transformed_output, bool is_filter_transformed, float *output); +void ConvRef3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_channels, + float *output); + } // namespace kernels } // namespace mace diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/kernels/arm/conv_winograd_test.cc index 52be053bd9e5cb2d57a5c2d3c1b1fce322752996..4cb591ec1b6ae422fa963ec1505aed8df2dc874c 100644 --- a/mace/kernels/arm/conv_winograd_test.cc +++ b/mace/kernels/arm/conv_winograd_test.cc @@ -58,11 +58,12 @@ TEST(ConvWinogradTest, winograd) { in_width, in_channels, out_channels, + 6, output_data); // test for (index_t i = 0; i < output_size; ++i) { - EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1); + EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i; } delete[]input_data; diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc index 00be4829802ede5fadbc0244917f56fcf0dd6025..b200c7650d596d868022d292029fa29fb1abcc18 100644 --- a/mace/kernels/gemm.cc +++ b/mace/kernels/gemm.cc @@ -13,22 +13,6 @@ namespace mace { namespace kernels { namespace { -void GemmRef(const float *A, - const float *B, - const index_t height, - const index_t K, - const index_t width, - float *C) { - memset(C, 0, sizeof(float) * height * width); - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - for (int k = 0; k < K; ++k) { - C[i * width + j] += A[i * K + k] * B[k * width + j]; - } - } - } -} - inline void GemmBlock(const float *A, const float *B, const index_t height, @@ -49,8 +33,8 @@ inline void GemmBlock(const float *A, // TODO(liyin): may need implement 883 since RGB inline void Gemm884(const float *a_ptr, const float *b_ptr, - index_t stride_w, index_t stride_k, + index_t stride_w, float *c_ptr) { #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, @@ -136,29 +120,300 @@ inline void GemmTile(const float *A, float *C) { index_t h, w, k; for (h = 0; h + 7 < height; h += 8) { - for (w = 0; w + 3 < width; w += 4) { - for (k = 0; k + 7 < K; k += 8) { - const float *a_ptr = A + (h * stride_k + k); + for (k = 0; k + 7 < K; k += 8) { + const float *a_ptr = A + (h * stride_k + k); + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + +#ifdef __clang__ + int nw = width >> 2; + if (nw > 0) { + // load A + float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, + a8, a9, a10, a11, a12, a13, a14, a15; + a0 = vld1q_f32(a_ptr); + a1 = vld1q_f32(a_ptr + 4); + a2 = vld1q_f32(a_ptr + 1 * stride_k); + a3 = vld1q_f32(a_ptr + 1 * stride_k + 4); + a4 = vld1q_f32(a_ptr + 2 * stride_k); + a5 = vld1q_f32(a_ptr + 2 * stride_k + 4); + a6 = vld1q_f32(a_ptr + 3 * stride_k); + a7 = vld1q_f32(a_ptr + 3 * stride_k + 4); + a8 = vld1q_f32(a_ptr + 4 * stride_k); + a9 = vld1q_f32(a_ptr + 4 * stride_k + 4); + a10 = vld1q_f32(a_ptr + 5 * stride_k); + a11 = vld1q_f32(a_ptr + 5 * stride_k + 4); + a12 = vld1q_f32(a_ptr + 6 * stride_k); + a13 = vld1q_f32(a_ptr + 6 * stride_k + 4); + a14 = vld1q_f32(a_ptr + 7 * stride_k); + a15 = vld1q_f32(a_ptr + 7 * stride_k + 4); + + const float *b_ptr0 = B + k * stride_w; + const float *b_ptr1 = B + (k + 1) * stride_w; + const float *b_ptr2 = B + (k + 2) * stride_w; + const float *b_ptr3 = B + (k + 3) * stride_w; + const float *b_ptr4 = B + (k + 4) * stride_w; + const float *b_ptr5 = B + (k + 5) * stride_w; + const float *b_ptr6 = B + (k + 6) * stride_w; + const float *b_ptr7 = B + (k + 7) * stride_w; + + float *c_ptr0 = C + h * stride_w; + float *c_ptr1 = C + (h + 1) * stride_w; + float *c_ptr2 = C + (h + 2) * stride_w; + float *c_ptr3 = C + (h + 3) * stride_w; + float *c_ptr4 = C + (h + 4) * stride_w; + float *c_ptr5 = C + (h + 5) * stride_w; + float *c_ptr6 = C + (h + 6) * stride_w; + float *c_ptr7 = C + (h + 7) * stride_w; + + asm volatile( + "0: \n" + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v24.4s}, [%1] \n" + + // load b: 0-7 + "prfm pldl1keep, [%9, #128] \n" + "ld1 {v16.4s}, [%9], #16 \n" + + "prfm pldl1keep, [%10, #128] \n" + "ld1 {v17.4s}, [%10], #16 \n" + + "prfm pldl1keep, [%11, #128] \n" + "ld1 {v18.4s}, [%11], #16 \n" + + "prfm pldl1keep, [%12, #128] \n" + "ld1 {v19.4s}, [%12], #16 \n" + + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v25.4s}, [%2] \n" + + "prfm pldl1keep, [%13, #128] \n" + "ld1 {v20.4s}, [%13], #16 \n" + + "prfm pldl1keep, [%14, #128] \n" + "ld1 {v21.4s}, [%14], #16 \n" + + "prfm pldl1keep, [%15, #128] \n" + "ld1 {v22.4s}, [%15], #16 \n" + + "prfm pldl1keep, [%16, #128] \n" + "ld1 {v23.4s}, [%16], #16 \n" + + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v26.4s}, [%3] \n" + + "fmla v24.4s, v16.4s, %34.s[0] \n" + "fmla v24.4s, v17.4s, %34.s[1] \n" + "fmla v24.4s, v18.4s, %34.s[2] \n" + "fmla v24.4s, v19.4s, %34.s[3] \n" + + "fmla v24.4s, v20.4s, %35.s[0] \n" + "fmla v24.4s, v21.4s, %35.s[1] \n" + "fmla v24.4s, v22.4s, %35.s[2] \n" + "fmla v24.4s, v23.4s, %35.s[3] \n" + + "st1 {v24.4s}, [%1], #16 \n" + + "fmla v25.4s, v16.4s, %36.s[0] \n" + "fmla v25.4s, v17.4s, %36.s[1] \n" + "fmla v25.4s, v18.4s, %36.s[2] \n" + "fmla v25.4s, v19.4s, %36.s[3] \n" + + "fmla v25.4s, v20.4s, %37.s[0] \n" + "fmla v25.4s, v21.4s, %37.s[1] \n" + "fmla v25.4s, v22.4s, %37.s[2] \n" + "fmla v25.4s, v23.4s, %37.s[3] \n" + + "prfm pldl1keep, [%4, #128] \n" + "ld1 {v24.4s}, [%4] \n" + + "st1 {v25.4s}, [%2], #16 \n" + + "fmla v26.4s, v16.4s, %38.s[0] \n" + "fmla v26.4s, v17.4s, %38.s[1] \n" + "fmla v26.4s, v18.4s, %38.s[2] \n" + "fmla v26.4s, v19.4s, %38.s[3] \n" + + "fmla v26.4s, v20.4s, %39.s[0] \n" + "fmla v26.4s, v21.4s, %39.s[1] \n" + "fmla v26.4s, v22.4s, %39.s[2] \n" + "fmla v26.4s, v23.4s, %39.s[3] \n" + + "prfm pldl1keep, [%5, #128] \n" + "ld1 {v25.4s}, [%5] \n" + + "st1 {v26.4s}, [%3], #16 \n" + + "fmla v24.4s, v16.4s, %40.s[0] \n" + "fmla v24.4s, v17.4s, %40.s[1] \n" + "fmla v24.4s, v18.4s, %40.s[2] \n" + "fmla v24.4s, v19.4s, %40.s[3] \n" + + "fmla v24.4s, v20.4s, %41.s[0] \n" + "fmla v24.4s, v21.4s, %41.s[1] \n" + "fmla v24.4s, v22.4s, %41.s[2] \n" + "fmla v24.4s, v23.4s, %41.s[3] \n" + + "prfm pldl1keep, [%6, #128] \n" + "ld1 {v26.4s}, [%6] \n" + + "st1 {v24.4s}, [%4], #16 \n" + + "fmla v25.4s, v16.4s, %42.s[0] \n" + "fmla v25.4s, v17.4s, %42.s[1] \n" + "fmla v25.4s, v18.4s, %42.s[2] \n" + "fmla v25.4s, v19.4s, %42.s[3] \n" + + "fmla v25.4s, v20.4s, %43.s[0] \n" + "fmla v25.4s, v21.4s, %43.s[1] \n" + "fmla v25.4s, v22.4s, %43.s[2] \n" + "fmla v25.4s, v23.4s, %43.s[3] \n" + + "prfm pldl1keep, [%7, #128] \n" + "ld1 {v24.4s}, [%7] \n" + + "st1 {v25.4s}, [%5], #16 \n" + + "fmla v26.4s, v16.4s, %44.s[0] \n" + "fmla v26.4s, v17.4s, %44.s[1] \n" + "fmla v26.4s, v18.4s, %44.s[2] \n" + "fmla v26.4s, v19.4s, %44.s[3] \n" + + "fmla v26.4s, v20.4s, %45.s[0] \n" + "fmla v26.4s, v21.4s, %45.s[1] \n" + "fmla v26.4s, v22.4s, %45.s[2] \n" + "fmla v26.4s, v23.4s, %45.s[3] \n" + + "prfm pldl1keep, [%8, #128] \n" + "ld1 {v25.4s}, [%8] \n" + + "st1 {v26.4s}, [%6], #16 \n" + + "fmla v24.4s, v16.4s, %46.s[0] \n" + "fmla v24.4s, v17.4s, %46.s[1] \n" + "fmla v24.4s, v18.4s, %46.s[2] \n" + "fmla v24.4s, v19.4s, %46.s[3] \n" + + "fmla v24.4s, v20.4s, %47.s[0] \n" + "fmla v24.4s, v21.4s, %47.s[1] \n" + "fmla v24.4s, v22.4s, %47.s[2] \n" + "fmla v24.4s, v23.4s, %47.s[3] \n" + + "st1 {v24.4s}, [%7], #16 \n" + + "fmla v25.4s, v16.4s, %48.s[0] \n" + "fmla v25.4s, v17.4s, %48.s[1] \n" + "fmla v25.4s, v18.4s, %48.s[2] \n" + "fmla v25.4s, v19.4s, %48.s[3] \n" + + "fmla v25.4s, v20.4s, %49.s[0] \n" + "fmla v25.4s, v21.4s, %49.s[1] \n" + "fmla v25.4s, v22.4s, %49.s[2] \n" + "fmla v25.4s, v23.4s, %49.s[3] \n" + + "st1 {v25.4s}, [%8], #16 \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nw), // 0 + "=r"(c_ptr0), // 1 + "=r"(c_ptr1), // 2 + "=r"(c_ptr2), // 3 + "=r"(c_ptr3), // 4 + "=r"(c_ptr4), // 5 + "=r"(c_ptr5), // 6 + "=r"(c_ptr6), // 7 + "=r"(c_ptr7), // 8 + "=r"(b_ptr0), // 9 + "=r"(b_ptr1), // 10 + "=r"(b_ptr2), // 11 + "=r"(b_ptr3), // 12 + "=r"(b_ptr4), // 13 + "=r"(b_ptr5), // 14 + "=r"(b_ptr6), // 15 + "=r"(b_ptr7) // 16 + : "0"(nw), // 17 + "1"(c_ptr0), // 18 + "2"(c_ptr1), // 19 + "3"(c_ptr2), // 20 + "4"(c_ptr3), // 21 + "5"(c_ptr4), // 22 + "6"(c_ptr5), // 23 + "7"(c_ptr6), // 24 + "8"(c_ptr7), // 25 + "9"(b_ptr0), // 26 + "10"(b_ptr1), // 27 + "11"(b_ptr2), // 28 + "12"(b_ptr3), // 29 + "13"(b_ptr4), // 30 + "14"(b_ptr5), // 31 + "15"(b_ptr6), // 32 + "16"(b_ptr7), // 33 + "w"(a0), // 34 + "w"(a1), // 35 + "w"(a2), // 36 + "w"(a3), // 37 + "w"(a4), // 38 + "w"(a5), // 39 + "w"(a6), // 40 + "w"(a7), // 41 + "w"(a8), // 42 + "w"(a9), // 43 + "w"(a10), // 44 + "w"(a11), // 45 + "w"(a12), // 46 + "w"(a13), // 47 + "w"(a14), // 48 + "w"(a15) // 49 + : "cc", "memory", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26" + ); + + w = (width >> 2) << 2; + } +#else // gcc + for (w = 0; w + 3 < width; w += 4) { + const float *b_ptr = B + (k * stride_w + w); + float *c_ptr = C + (h * stride_w + w); + Gemm884(a_ptr, b_ptr, stride_k, stride_w, c_ptr); + } +#endif + +#else + for (w = 0; w + 3 < width; w += 4) { const float *b_ptr = B + (k * stride_w + w); float *c_ptr = C + (h * stride_w + w); - Gemm884(a_ptr, b_ptr, stride_w, stride_k, c_ptr); + GemmBlock(a_ptr, b_ptr, 8, 8, 4, stride_k, stride_w, c_ptr); } - if (k < K) { +#endif + + if (w < width) { const float *a_ptr = A + (h * stride_k + k); const float *b_ptr = B + (k * stride_w + w); float *c_ptr = C + (h * stride_w + w); - GemmBlock(a_ptr, b_ptr, 8, K - k, 4, stride_k, stride_w, c_ptr); + GemmBlock(a_ptr, b_ptr, 8, 8, width - w, stride_k, stride_w, c_ptr); } } - if (w < width) { - const float *a_ptr = A + h * stride_k; - const float *b_ptr = B + w; - float *c_ptr = C + (h * stride_w + w); + if (k < K) { + const float *a_ptr = A + (h * stride_k + k); + const float *b_ptr = B + k * stride_w; + float *c_ptr = C + h * stride_w; GemmBlock(a_ptr, b_ptr, 8, - K, - width - w, + K - k, + width, stride_k, stride_w, c_ptr); @@ -243,5 +498,21 @@ void Gemm(const float *A, } // n } +void GemmRef(const float *A, + const float *B, + const index_t height, + const index_t K, + const index_t width, + float *C) { + memset(C, 0, sizeof(float) * height * width); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + for (int k = 0; k < K; ++k) { + C[i * width + j] += A[i * K + k] * B[k * width + j]; + } + } + } +} + } // namespace kernels } // namespace mace diff --git a/mace/kernels/gemm.h b/mace/kernels/gemm.h index d17eab83e12d5531eed8bdaddd6af352b179b327..eec69e8ef7675a28a3fa79a8f5bf5f8e957d6865 100644 --- a/mace/kernels/gemm.h +++ b/mace/kernels/gemm.h @@ -22,6 +22,13 @@ void Gemm(const float *A, const index_t width, float *C); +void GemmRef(const float *A, + const float *B, + const index_t height, + const index_t K, + const index_t width, + float *C); + } // namespace kernels } // namespace mace diff --git a/mace/kernels/gemm_test.cc b/mace/kernels/gemm_test.cc index 9e4b964daf5922a6fd0912ca65d9b01225aaf2fb..dacb93eebcab3aae16035477d4a591780f6b32a6 100644 --- a/mace/kernels/gemm_test.cc +++ b/mace/kernels/gemm_test.cc @@ -31,7 +31,7 @@ TEST(GEMMTest, gemm) { [&gen, &nd] { return nd(gen); }); - kernels::Gemm(A, B, N, K, M, C); + kernels::Gemm(A, B, 1, N, K, M, C); kernels::GemmRef(A, B, N, K, M, C_ref); for (int i = 0; i < N * M; ++i) { diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index 0fea6b586fd7c81790538fa4a318980cf072a96d..36cda42f345ff9bb1a4d118cb6627608ef698309 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -6,6 +6,7 @@ #define MACE_KERNELS_SLICE_H_ #include +#include #include #include "mace/core/future.h" @@ -17,20 +18,34 @@ namespace mace { namespace kernels { +struct SliceFunctorBase { + explicit SliceFunctorBase(const int32_t axis) : axis_(axis) {} + + int32_t axis_; +}; + template -struct SliceFunctor { +struct SliceFunctor : SliceFunctorBase { + explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {} + void operator()(const Tensor *input, const std::vector &output_list, StatsFuture *future) { - const index_t outer_size = input->dim(0) * input->dim(1) * input->dim(2); - const index_t input_channels = input->dim(3); + const index_t input_channels = input->dim(axis_); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; std::vector output_ptrs(output_list.size(), nullptr); + std::vector output_shape(input->shape()); + output_shape[axis_] = output_channels; - std::vector output_shape({input->dim(0), input->dim(1), - input->dim(2), output_channels}); - + const index_t outer_size = std::accumulate(output_shape.begin(), + output_shape.begin() + axis_, + 1, + std::multiplies()); + const index_t inner_size = std::accumulate(output_shape.begin() + axis_ + 1, + output_shape.end(), + 1, + std::multiplies()); for (size_t i= 0; i < outputs_count; ++i) { output_list[i]->Resize(output_shape); output_ptrs[i] = output_list[i]->mutable_data(); @@ -39,25 +54,27 @@ struct SliceFunctor { #pragma omp parallel for for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) { - int input_idx = outer_idx * input_channels; - int output_idx = outer_idx * output_channels; + int input_idx = outer_idx * input_channels * inner_size; + int output_idx = outer_idx * output_channels * inner_size; for (size_t i = 0; i < outputs_count; ++i) { if (DataTypeCanUseMemcpy(DataTypeToEnum::v())) { memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx, - output_channels * sizeof(T)); + output_channels * inner_size * sizeof(T)); } else { - for (index_t k = 0; k < output_channels; ++k) { + for (index_t k = 0; k < output_channels * inner_size; ++k) { *(output_ptrs[i] + output_idx + k) = *(input_ptr + input_idx + k); } } - input_idx += output_channels; + input_idx += output_channels * inner_size; } } } }; template -struct SliceFunctor { +struct SliceFunctor : SliceFunctorBase { + explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {} + void operator()(const Tensor *input, const std::vector &output_list, StatsFuture *future); diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index 18bddcc75e4555cdc16d3cd63e828281ce7dd0e8..034a591fa21c0455bb9f6ccb35e6f12b494f4dd0 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -249,14 +249,26 @@ void TestSimplePrelu() { net.RunOp(D); } - auto expected = CreateTensor( - {2, 2, 2, 2}, - {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0}); + if (D == DeviceType::NEON) { + auto expected = CreateTensor( + {2, 2, 2, 2}, + {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + } else { + auto expected = CreateTensor( + {2, 2, 2, 2}, + {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + } +} - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +TEST_F(ActivationOpTest, CPUSimplePrelu) { + TestSimplePrelu(); } -TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu(); } +TEST_F(ActivationOpTest, NEONSimplePrelu) { + TestSimplePrelu(); +} TEST_F(ActivationOpTest, OPENCLSimplePrelu) { TestSimplePrelu(); diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index d482b0288bbcaaa8d7144ad98de10b50d4c7db40..e8fe0acc2ae5b78bdf68ba0dc8f25a7225a911f7 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -24,6 +24,11 @@ void Register_Slice(OperatorRegistry *op_registry) { .TypeConstraint("T") .Build(), SliceOp); + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice") + .Device(DeviceType::NEON) + .TypeConstraint("T") + .Build(), + SliceOp); } } // namespace ops diff --git a/mace/ops/slice.h b/mace/ops/slice.h index a1a6ad112578c41f5966a0a61951a730c50cf245..228693450de1e8a460ef921766e87b08f2141176 100644 --- a/mace/ops/slice.h +++ b/mace/ops/slice.h @@ -17,14 +17,16 @@ template class SliceOp : public Operator { public: SliceOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + : Operator(op_def, ws), + functor_(OperatorBase::GetSingleArgument("axis", 3)) {} bool Run(StatsFuture *future) override { MACE_CHECK(this->OutputSize() >= 2) << "There must be at least two outputs for slicing"; const Tensor *input = this->Input(INPUT); const std::vector output_list = this->Outputs(); - MACE_CHECK((input->dim(3) % this->OutputSize()) == 0) + const int32_t slice_axis = OperatorBase::GetSingleArgument("axis", 3); + MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0) << "Outputs do not split input equally."; functor_(input, output_list, future); diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc index bd0244d3cd9453785c30a219c6011c82edd2bd84..ad507af380a29ee0a3d138e983d4fe34396ad75f 100644 --- a/mace/ops/slice_test.cc +++ b/mace/ops/slice_test.cc @@ -16,7 +16,7 @@ namespace test { class SliceOpTest : public OpsTestBase {}; template -void RandomTest(const int num_outputs) { +void RandomTest(const int num_outputs, const int axis) { static unsigned int seed = time(NULL); const index_t output_channels = 4 * (1 + rand_r(&seed) % 10); const index_t input_channels = num_outputs * output_channels; @@ -27,7 +27,11 @@ void RandomTest(const int num_outputs) { // Construct graph OpsTestNet net; - std::vector input_shape({batch, height, width, input_channels}); + std::vector input_shape; + if (axis == 1) + input_shape = {batch, input_channels, height, width}; + else if (axis == 3) + input_shape = {batch, height, width, input_channels}; const index_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, @@ -49,7 +53,7 @@ void RandomTest(const int num_outputs) { .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); } else { - auto builder = OpDefBuilder("Slice", "SliceTest"); + auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis); builder.Input("Input"); for (int i = 0; i < num_outputs; ++i) { builder = builder.Output(MakeString("Output", i)); @@ -70,9 +74,17 @@ void RandomTest(const int num_outputs) { } // Check - std::vector expected_shape({batch, height, width, output_channels}); + std::vector expected_shape; + if (axis == 1) + expected_shape = {batch, output_channels, height, width}; + else if (axis == 3) + expected_shape = {batch, height, width, output_channels}; const index_t outer_size = std::accumulate(expected_shape.begin(), - expected_shape.end() - 1, + expected_shape.begin() + axis, + 1, + std::multiplies()); + const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1, + expected_shape.end(), 1, std::multiplies()); const float *input_ptr = input_data.data(); @@ -83,8 +95,9 @@ void RandomTest(const int num_outputs) { Tensor::MappingGuard output_mapper(output); output_ptr = output->data(); for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) { - const int idx = outer_idx * input_channels + i * output_channels; - for (int j = 0; j < output_channels; ++j) { + const int idx = (outer_idx * input_channels + i * output_channels) + * inner_size; + for (int j = 0; j < output_channels * inner_size; ++j) { ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2) << "with output " << i << " index " << idx + j; } @@ -93,21 +106,27 @@ void RandomTest(const int num_outputs) { } TEST_F(SliceOpTest, CPU) { - RandomTest(2); - RandomTest(4); - RandomTest(11); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); +} + +TEST_F(SliceOpTest, CPUAxis1) { + RandomTest(2, 1); + RandomTest(4, 1); + RandomTest(11, 1); } TEST_F(SliceOpTest, OPENCLFloat) { - RandomTest(2); - RandomTest(4); - RandomTest(11); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); } TEST_F(SliceOpTest, OPENCLHalf) { - RandomTest(2); - RandomTest(4); - RandomTest(11); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); } } // namespace test diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 63ca972324a9ff538d088dc407460e5853e67a59..8acb1cace385a876e97d62a2c2563c0bd13bd497 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -8,6 +8,11 @@ #ifndef MACE_PUBLIC_MACE_RUNTIME_H_ #define MACE_PUBLIC_MACE_RUNTIME_H_ +#include +#include +#include +#include + namespace mace { enum GPUPerfHint { @@ -26,10 +31,42 @@ enum GPUPriorityHint { enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 }; +class KVStorage { + public: + // return: 0 for success, -1 for error + virtual int Load() = 0; + virtual bool Insert(const std::string &key, + const std::vector &value) = 0; + virtual const std::vector *Find(const std::string &key) = 0; + // return: 0 for success, -1 for error + virtual int Flush() = 0; +}; + +class KVStorageFactory { + public: + virtual std::unique_ptr CreateStorage(const std::string &name) = 0; +}; + +class FileStorageFactory : public KVStorageFactory { + public: + explicit FileStorageFactory(const std::string &path); + + ~FileStorageFactory(); + + std::unique_ptr CreateStorage(const std::string &name) override; + + private: + class Impl; + std::unique_ptr impl_; +}; + +void ConfigKVStorageFactory(std::shared_ptr storage_factory); + void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint); void ConfigOmpThreads(int omp_num_threads); void ConfigCPUPowerOption(CPUPowerOption power_option); + } // namespace mace #endif // MACE_PUBLIC_MACE_RUNTIME_H_ diff --git a/mace/python/tools/binary_codegen.py b/mace/python/tools/binary_codegen.py index 9956624b41086a1cb91403a21fd25b79c56a2efd..3be2b086461dd70e7fa6066f086d91442601dec5 100644 --- a/mace/python/tools/binary_codegen.py +++ b/mace/python/tools/binary_codegen.py @@ -25,9 +25,9 @@ def generate_cpp_source(): with open(binary_path, "rb") as f: binary_array = np.fromfile(f, dtype=np.uint8) + print "Generate binary from", binary_path idx = 0 size, = struct.unpack("Q", binary_array[idx:idx+8]) - print size idx += 8 for _ in xrange(size): key_size, = struct.unpack("i", binary_array[idx:idx+4]) @@ -64,7 +64,7 @@ def parse_args(): parser.add_argument( "--binary_dirs", type=str, - default="cl_bin0/,cl_bin1/", + default="", help="The binaries file path.") parser.add_argument( "--binary_file_name", diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py index 166bb6ec3012f2f3075e6dc5577dd5a9e6832463..7a94d2805def0c5120fc2692cf69e541bd4c122b 100644 --- a/mace/python/tools/caffe_converter_lib.py +++ b/mace/python/tools/caffe_converter_lib.py @@ -68,14 +68,26 @@ def BlobToNPArray(blob): class Shapes(object): @staticmethod - def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func): + def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'): output_shape = np.zeros_like(input_shape) output_shape[0] = input_shape[0] - output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0] - - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 - output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1] - - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 - output_shape[3] = filter_shape[2] + if input_format == 'NHWC': + # input format: NHWC, filter format: HWOI + output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0] + - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1] + - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 + output_shape[3] = filter_shape[2] + elif input_format == 'NCHW': + # input format: NCHW, filter format: OIHW + output_shape[1] = filter_shape[0] + output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2] + - (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3] + - (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 + else: + raise Exception("format %s is not supported" % input_format) + return output_shape @staticmethod @@ -93,8 +105,13 @@ class Shapes(object): return output_shape @staticmethod - def slice_shape(input_shape, num_output): - return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output] + def slice_shape(input_shape, num_output, input_format='NHWC'): + if input_format == 'NHWC': + return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output] + elif input_format == 'NCHW': + return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]] + else: + raise Exception("format %s is not supported" % input_format) # outputs' name is [op.name + '_' + #] class CaffeConverter(object): @@ -168,7 +185,10 @@ class CaffeConverter(object): arg.i = self.dt data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' op_def.name = op.name op_def.type = mace_type op_def.input.extend([name+':0' for name in self.inputs_map[op.name]]) @@ -342,7 +362,11 @@ class CaffeConverter(object): # Add filter weight_tensor_name = op.name + '_weight:0' - weight_data = op.data[0].transpose((2, 3, 0, 1)) + if self.device == 'neon': + weight_data = op.data[0] + else: + # OIHW -> HWOI + weight_data = op.data[0].transpose((2, 3, 0, 1)) self.add_tensor(weight_tensor_name, weight_data) if self.device == 'gpu': @@ -376,10 +400,11 @@ class CaffeConverter(object): final_op = op self.resolved_ops.add(op.name) + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]], weight_data.shape, paddings, strides, dilations, - math.floor) + math.floor, input_format) op.output_shape_map[op.layer.top[0]] = output_shape if len(self.ops_map[final_op.name].children) == 1 \ @@ -399,9 +424,13 @@ class CaffeConverter(object): self.net_def.op.extend([op_def]) def check_winograd_conv(self, op): + # TODO: support winograd conv on neon + if self.device == 'neon': + return False param = op.layer.convolution_param filter_shape = np.asarray(op.data[0].shape) - filter_shape = filter_shape[[2, 3, 0, 1]] + if self.device != 'neon': + filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) dilations = [1, 1] @@ -411,17 +440,21 @@ class CaffeConverter(object): elif len(param.dilation) == 2: dilations = [param.dilation[0], param.dilation[1]] + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' output_shape = Shapes.conv_pool_shape( op.get_single_parent().output_shape_map[op.layer.bottom[0]], - filter_shape, paddings, strides, dilations, math.floor) + filter_shape, paddings, strides, dilations, math.floor, input_format) width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) - return self.winograd and self.device == 'gpu' and \ - filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ - dilations[0] == 1 and (dilations[0] == dilations[1]) and \ - (strides[0] == 1) and (strides[0] == strides[1]) and \ - (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ - (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) + if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \ + (strides[0] == 1) and (strides[0] == strides[1]): + if self.device == 'gpu': + return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ + (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ + (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ + (width < OPENCL_IMAGE_MAX_SIZE) + elif self.device == 'neon': + return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3]) + return False def convert_winograd_conv(self, op): # Add filter @@ -435,11 +468,13 @@ class CaffeConverter(object): paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) filter_shape = np.asarray(op.data[0].shape) - filter_shape = filter_shape[[2, 3, 0, 1]] + if self.device != 'neon': + filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' output_shape = Shapes.conv_pool_shape( op.get_single_parent().output_shape_map[op.layer.bottom[0]], - filter_shape, paddings, strides, [1, 1], math.floor) + filter_shape, paddings, strides, [1, 1], math.floor, input_format) # Input transform wt_op = mace_pb2.OperatorDef() @@ -455,8 +490,12 @@ class CaffeConverter(object): wt_output_name = wt_op.name + ":0" wt_op.output.extend([wt_output_name]) wt_output_shape = mace_pb2.OutputShape() - wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) - wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) + if self.device != 'neon': + wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) + wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) + else: + wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2) + wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1]) wt_op.output_shape.extend([wt_output_shape]) # MatMul @@ -470,7 +509,10 @@ class CaffeConverter(object): matmul_output_name = matmul_op.name + ":0" matmul_op.output.extend([matmul_output_name]) matmul_output_shape = mace_pb2.OutputShape() - matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) + if self.device != 'neon': + matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) + else: + matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1]) matmul_op.output_shape.extend([matmul_output_shape]) # Inverse transform @@ -483,10 +525,10 @@ class CaffeConverter(object): batch_arg.i = output_shape[0] height_arg = iwt_op.arg.add() height_arg.name = 'height' - height_arg.i = output_shape[1] + height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2] width_arg = iwt_op.arg.add() width_arg.name = 'width' - width_arg.i = output_shape[2] + width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3] iwt_op.name = op.name + '_inverse_transform' iwt_op.type = 'WinogradInverseTransform' iwt_op.input.extend([matmul_output_name]) @@ -589,8 +631,9 @@ class CaffeConverter(object): weight_data = op.data[0].reshape(-1, op.data[0].shape[-1]) assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3]) - weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2]) - weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1) + if self.device != 'neon': + weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2]) + weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1) self.add_tensor(weight_tensor_name, weight_data) if self.device == 'gpu': if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \ @@ -665,9 +708,12 @@ class CaffeConverter(object): kernel_arg.name = 'kernels' kernel_arg.ints.extend(kernels) - filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] + filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \ + if self.device != 'neon' else \ + [input_shape[1], input_shape[1], kernels[0], kernels[1]] + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' output_shape = Shapes.conv_pool_shape(input_shape, filter_shape, - paddings, strides, [1, 1], math.ceil) + paddings, strides, [1, 1], math.ceil, input_format) op.output_shape_map[op.layer.top[0]] = output_shape op_def.output.extend([op.name + ':0']) @@ -720,7 +766,7 @@ class CaffeConverter(object): op_def = self.CommonConvert(op, 'Concat') axis_arg = op_def.arg.add() axis_arg.name = 'axis' - axis_arg.i = 3 + axis_arg.i = 3 if self.device != 'neon' else 1 try: if op.layer.concat_param.HasFeild('axis'): axis_arg.i = op.concat_param.axis @@ -766,13 +812,19 @@ class CaffeConverter(object): if len(param.slice_point) > 0: raise Exception('Mace do not support slice with slice_point') + axis_arg = op_def.arg.add() + axis_arg.name = 'axis' + axis_arg.i = 3 if self.device != 'neon' else 1 + input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] num_outputs = len(op.layer.top) - if (input_shape[3] % num_outputs) != 0 or \ - (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) : + input_channels = input_shape[axis_arg.i] + if (input_channels % num_outputs) != 0 or \ + (self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)): raise Exception('Mace do not support slice with input shape ' + str(input_shape) + ' and number of output ' + str(num_outputs)) - output_shape = Shapes.slice_shape(input_shape, num_outputs) + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' + output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format) for i in range(len(op.layer.top)): op.output_shape_map[op.layer.top[i]] = output_shape self.add_output_shape(op_def, output_shape) @@ -790,10 +842,15 @@ class CaffeConverter(object): self.resolved_ops.add(op.name) def convert_reshape(self, op): - op_def = self.CommonConvert(op, 'ReOrganize') + if self.device == 'neon': + op_def = self.CommonConvert(op, 'Reshape') + else: + op_def = self.CommonConvert(op, 'ReOrganize') input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] output_shape = input_shape - shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]] + shape_param = np.asarray(op.layer.reshape_param.shape.dim) + if self.device != 'neon': + shape_param = shape_param[[0, 3, 1, 2]] for i in range(len(shape_param)): if shape_param[i] != 0: output_shape[i] = shape_param[i] @@ -867,15 +924,50 @@ class CaffeConverter(object): assert len(input_nodes) == len(input_shapes) for i in range(len(input_nodes)): input_op = self.ops_map[input_nodes[i]] + input_shape = input_shapes[i] if self.device != 'neon' else \ + [input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]] if input_op.layer is not None: - input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i] + input_op.output_shape_map[input_op.layer.top[0]] = input_shape else: - input_op.output_shape_map[input_op.name] = input_shapes[i] + input_op.output_shape_map[input_op.name] = input_shape + + def add_neon_input_transform(self, names): + for name in names: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'Transpose' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_neon_output_transform(self, names): + for name in names: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'Transpose' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC def convert(self, input_nodes, input_shapes, output_nodes): if self.device == 'gpu': self.add_input_transform(input_nodes) + if self.device == 'neon': + self.add_neon_input_transform(input_nodes) + assert self.ops[0].type == 'Input' self.add_input_op_shape(input_nodes, input_shapes) @@ -924,6 +1016,9 @@ class CaffeConverter(object): if self.device == 'cpu': self.replace_in_out_name(input_nodes, output_nodes) + if self.device == 'neon': + self.add_neon_output_transform(output_nodes) + for op in self.ops: if op.name not in self.resolved_ops: print 'Unresolve Op: %s with type %s' % (op.name, op.type) diff --git a/mace/python/tools/opencl_codegen.py b/mace/python/tools/opencl_codegen.py index 04bda9a82ff46d02d7cfb9a1ebc1a19ee7f7b9c2..96f4c6a6d11b27c60f01b83e35ecd47bca863e05 100644 --- a/mace/python/tools/opencl_codegen.py +++ b/mace/python/tools/opencl_codegen.py @@ -1,6 +1,7 @@ import argparse import os import sys +import struct import numpy as np @@ -14,30 +15,49 @@ FLAGS = None def generate_cpp_source(): maps = {} - cl_binary_dir_arr = FLAGS.cl_binary_dirs.split(",") - for cl_binary_dir in cl_binary_dir_arr: - if not os.path.exists(cl_binary_dir): - print("Input cl_binary_dir " + cl_binary_dir + " doesn't exist!") - for file_name in os.listdir(cl_binary_dir): - file_path = os.path.join(cl_binary_dir, file_name) - if file_path[-4:] == ".bin": - # read binary - f = open(file_path, "rb") - binary_array = np.fromfile(f, dtype=np.uint8) - f.close() - - maps[file_name[:-4]] = [] - for ele in binary_array: - maps[file_name[:-4]].append(hex(ele)) + platform_info = '' + binary_dirs = FLAGS.cl_binary_dirs.strip().split(",") + for binary_dir in binary_dirs: + binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name) + if not os.path.exists(binary_path): + continue + + print 'generate opencl code from', binary_path + with open(binary_path, "rb") as f: + binary_array = np.fromfile(f, dtype=np.uint8) + + idx = 0 + size, = struct.unpack("Q", binary_array[idx:idx+8]) + idx += 8 + for _ in xrange(size): + key_size, = struct.unpack("i", binary_array[idx:idx+4]) + idx += 4 + key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) + idx += key_size + value_size, = struct.unpack("i", binary_array[idx:idx+4]) + idx += 4 + maps[key] = [] + value = struct.unpack(str(value_size) + "B", + binary_array[idx:idx+value_size]) + idx += value_size + for ele in value: + maps[key].append(hex(ele)) + + cl_platform_info_path = os.path.join(binary_dir, FLAGS.platform_info_file_name) + with open(cl_platform_info_path, 'r') as f: + curr_platform_info = f.read() + if platform_info != "": + assert(curr_platform_info == platform_info) + platform_info = curr_platform_info env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - return env.get_template('str2vec_maps.cc.jinja2').render( + return env.get_template('opencl_compiled_kernel.cc.jinja2').render( maps = maps, data_type = 'unsigned char', - variable_name = 'kCompiledProgramMap' + variable_name = 'kCompiledProgramMap', + platform_info = platform_info, ) - def main(unused_args): cpp_cl_binary_source = generate_cpp_source() @@ -54,7 +74,17 @@ def parse_args(): parser.add_argument( "--cl_binary_dirs", type=str, - default="cl_bin0/,cl_bin1/,cl_bin2/", + default="", + help="The cl binaries directories.") + parser.add_argument( + "--built_kernel_file_name", + type=str, + default="", + help="The cl binaries directories.") + parser.add_argument( + "--platform_info_file_name", + type=str, + default="", help="The cl binaries directories.") parser.add_argument( "--output_path", diff --git a/mace/python/tools/opencl_compiled_kernel.cc.jinja2 b/mace/python/tools/opencl_compiled_kernel.cc.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..391b03f0cd64b9c07ad4fe3b5345a3aaa7c1cd69 --- /dev/null +++ b/mace/python/tools/opencl_compiled_kernel.cc.jinja2 @@ -0,0 +1,29 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +// This is a generated file, DO NOT EDIT + +#include +#include +#include + +namespace mace { + +extern const std::map> {{variable_name}} = +{ + {% for key, value in maps.iteritems() %} + { + "{{key}}", + { + {%- for ele in value -%} + {{ele}}, + {%- endfor -%} + } + }, // {{key}} +{% endfor %} +}; + +extern const std::string kCompiledProgramPlatform = {{platform_info|tojson}}; + +} // namespace mace diff --git a/mace/tools/validation/BUILD b/mace/tools/validation/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..a4e1cb876dd508b9a1a365c2350345301b9a3af4 --- /dev/null +++ b/mace/tools/validation/BUILD @@ -0,0 +1,14 @@ +# Examples +load("//mace:mace.bzl", "if_openmp_enabled") + +cc_binary( + name = "mace_run", + srcs = ["mace_run.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + "//external:gflags_nothreads", + "//mace/codegen:generated_models", + "//mace/core:core", + ], +) diff --git a/mace/examples/mace_run.cc b/mace/tools/validation/mace_run.cc similarity index 84% rename from mace/examples/mace_run.cc rename to mace/tools/validation/mace_run.cc index f71ca08d34467c58a5e8d7cffb1269f16f4622f6..d4fa7c29a00073b9aff74d0a107e620540fc45cb 100644 --- a/mace/examples/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -16,12 +16,14 @@ */ #include #include +#include #include #include #include #include #include "gflags/gflags.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/public/mace.h" #include "mace/public/mace_runtime.h" #include "mace/utils/env_time.h" @@ -100,6 +102,20 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +void WriteOpenCLPlatformInfo(const std::string &output_dir) { + std::string platform_info = OpenCLRuntime::Global()->platform_info(); + const std::string cl_platform_info_file_name = output_dir + + "/mace_cl_platform_info.txt"; + + std::ofstream ofs(cl_platform_info_file_name); + if (ofs.is_open()) { + ofs << platform_info; + ofs.close(); + } else { + LOG(WARNING) << "Write opencl platform info failed."; + } +} + struct mallinfo LogMallinfoChange(struct mallinfo prev) { struct mallinfo curr = mallinfo(); if (prev.arena != curr.arena) { @@ -189,8 +205,8 @@ bool RunModel(const std::vector &input_names, mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); int64_t t1 = NowMicros(); - LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; - int64_t init_micros = t1 - t0; + double create_net_millis = (t1 - t0) / 1000.0; + LOG(INFO) << "CreateNetDef latency: " << create_net_millis << " ms"; DeviceType device_type = ParseDeviceType(FLAGS_device); LOG(INFO) << "Runing with device type: " << device_type; @@ -205,17 +221,26 @@ bool RunModel(const std::vector &input_names, static_cast(FLAGS_gpu_priority_hint)); } + const char *kernel_path = getenv("MACE_CL_PROGRAM_PATH"); + const std::string kernel_file_path = + std::string(kernel_path == nullptr ? + "/data/local/tmp/mace_run/cl_program" : kernel_path); + // Init model LOG(INFO) << "Run init"; - t0 = NowMicros(); + std::shared_ptr storage_factory( + new FileStorageFactory(kernel_file_path)); + ConfigKVStorageFactory(storage_factory); mace::MaceEngine engine(&net_def, device_type, input_names, output_names); if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); } - t1 = NowMicros(); - init_micros += t1 - t0; - LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; - LOG(INFO) << "Total init latency: " << init_micros << " us"; + int64_t t2 = NowMicros(); + double mace_engine_ctor_millis = (t2 - t1) / 1000.0; + double init_millis = (t2 - t0) / 1000.0; + LOG(INFO) << "MaceEngine constructor latency: " + << mace_engine_ctor_millis << " ms"; + LOG(INFO) << "Total init latency: " << init_millis << " ms"; const size_t input_count = input_names.size(); const size_t output_count = output_names.size(); @@ -253,14 +278,16 @@ bool RunModel(const std::vector &input_names, } LOG(INFO) << "Warm up run"; - t0 = NowMicros(); + int64_t t3 = NowMicros(); engine.Run(inputs, &outputs); - t1 = NowMicros(); - LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + int64_t t4 = NowMicros(); + double warmup_millis = (t4 - t3) / 1000.0; + LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms"; + double model_run_millis = -1; if (FLAGS_round > 0) { LOG(INFO) << "Run model"; - t0 = NowMicros(); + int64_t t0 = NowMicros(); struct mallinfo prev = mallinfo(); for (int i = 0; i < FLAGS_round; ++i) { engine.Run(inputs, &outputs); @@ -269,10 +296,20 @@ bool RunModel(const std::vector &input_names, prev = LogMallinfoChange(prev); } } - t1 = NowMicros(); - LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + int64_t t1 = NowMicros(); + model_run_millis = (t1 - t0) / 1000.0 / FLAGS_round; + LOG(INFO) << "Average latency: " << model_run_millis << " ms"; } + // Metrics reporting tools depends on the format, keep in consistent + printf("================================================================\n"); + printf(" create_net engine_ctor init warmup run_avg\n"); + printf("================================================================\n"); + printf("time %11.3f %11.3f %11.3f %11.3f %11.3f\n", create_net_millis, + mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis); + + WriteOpenCLPlatformInfo(kernel_file_path); + for (size_t i = 0; i < output_count; ++i) { std::string output_name = FLAGS_output_file + "_" + FormatName(output_names[i]); diff --git a/mace/utils/utils.h b/mace/utils/utils.h index 54ab5793d229bba9453ae38fae6ec260f3c9cb6e..a9ca2a40aae252c10ce96d5968987b68af9894ea 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace mace { template @@ -96,5 +97,20 @@ inline std::string ObfuscateSymbol(const std::string &src) { #define MACE_OBFUSCATE_SYMBOL(str) (str) #endif +inline std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return result; +} + } // namespace mace #endif // MACE_UTILS_UTILS_H_ diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 185167673aca32cc1b188234892ae98d551d329d..c8c9ddb46f9ae640824a0b6d4faaea3a700c6993 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -33,10 +33,16 @@ def ops_benchmark_stdout_processor(stdout, device_properties, abi): line = line.strip() parts = line.split() if len(parts) == 5 and parts[0].startswith("BM_"): - metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1000000.0) + metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1e6) metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] - sh_commands.falcon_push_metrics(metrics, device_properties, abi, + + platform = device_properties["ro.board.platform"].replace(" ", "-") + model = device_properties["ro.product.model"].replace(" ", "-") + tags = {"ro.board.platform": platform, + "ro.product.model": model, + "abi": abi} + sh_commands.falcon_push_metrics(metrics, tags=tags, endpoint="mace_ops_benchmark") def parse_args(): @@ -88,6 +94,7 @@ def main(unused_args): # generate sources sh_commands.gen_encrypted_opencl_source() + sh_commands.gen_compiled_opencl_source() sh_commands.gen_mace_version() for target_abi in target_abis: diff --git a/tools/build_mace_run.sh b/tools/build_mace_run.sh index 658ae26745f330cd0dded36da0f24d85d1bb9361..8f22ee33966692798cbb4ee25d427dc4579ff265 100644 --- a/tools/build_mace_run.sh +++ b/tools/build_mace_run.sh @@ -30,7 +30,7 @@ if [ x"$TARGET_ABI" = x"host" ]; then --copt="-O3" \ $PRODUCTION_MODE_BUILD_FLAGS || exit 1 - bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \ + bazel build --verbose_failures -c opt --strip always //mace/tools/validation:mace_run \ --copt="-std=c++11" \ --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-Werror=return-type" \ @@ -47,7 +47,7 @@ else NEON_ENABLE_FLAG="--define neon=true" fi - bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \ + bazel build --verbose_failures -c opt --strip always //mace/tools/validation:mace_run \ --crosstool_top=//external:android/crosstool \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ --cpu=${TARGET_ABI} \ @@ -64,14 +64,13 @@ else $HEXAGON_MODE_BUILD_FLAG || exit 1 fi -if [ "$PRODUCTION_MODE" = 1 ]; then - cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a -fi +rm -rf $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a +cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a if [ -f "$MODEL_OUTPUT_DIR/mace_run" ]; then rm -rf $MODEL_OUTPUT_DIR/mace_run fi -cp bazel-bin/mace/examples/mace_run $MODEL_OUTPUT_DIR +cp bazel-bin/mace/tools/validation/mace_run $MODEL_OUTPUT_DIR if [ "$EMBED_MODEL_DATA" = 0 ]; then cp mace/codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR fi diff --git a/tools/build_production_code.sh b/tools/build_production_code.sh index 7987f67fa7747177517aeef621b2a137ce4a9f9d..4072019429ecd61d4b7e18f0cbbed17e1851dc09 100644 --- a/tools/build_production_code.sh +++ b/tools/build_production_code.sh @@ -35,9 +35,9 @@ build_target() } if [ x"$TARGET_ABI" = x"host" ]; then - build_host_target //mace/codegen:generated_opencl_prod + build_host_target //mace/codegen:generated_opencl build_host_target //mace/codegen:generated_tuning_params else - build_target //mace/codegen:generated_opencl_prod + build_target //mace/codegen:generated_opencl build_target //mace/codegen:generated_tuning_params fi diff --git a/tools/build_run_throughput_test.sh b/tools/build_run_throughput_test.sh index ef148e6ab3cd954b99a9f653ea299e7086708ab9..75387b62879f528e67be1acc78ea96225d299cb8 100644 --- a/tools/build_run_throughput_test.sh +++ b/tools/build_run_throughput_test.sh @@ -72,9 +72,11 @@ adb -s $DEVICE_ID /dev/null + fi + fi + + python mace/python/tools/opencl_codegen.py \ + --built_kernel_file_name=${CL_BUILT_KERNEL_FILE_NAME} \ + --platform_info_file_name=${CL_PLATFORM_INFO_FILE_NAME} \ + --cl_binary_dirs=${CL_BIN_DIRS} \ + --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc || exit 1 +fi + diff --git a/tools/generate_production_code.sh b/tools/generate_production_code.sh deleted file mode 100644 index f4f021c4e930e50a5c969568e0ecda0891357914..0000000000000000000000000000000000000000 --- a/tools/generate_production_code.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -Usage() { - echo "Usage: bash tools/generate_production_code.sh target_soc cl_bin_dirs pull_or_not" -} - -if [ $# -lt 3 ]; then - Usage - exit 1 -fi - -CURRENT_DIR=`dirname $0` -source ${CURRENT_DIR}/env.sh - -TARGET_SOC=$1 -CL_BIN_DIRS=$2 -PULL_OR_NOT=$3 - -RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC` -if [ $? -ne 0 ]; then - echo $RESULT_VALUE - exit 1 -else - DEVICE_ID=$RESULT_VALUE -fi - -if [ "$PULL_OR_NOT" = 1 ]; then - CL_BIN_DIR=${CL_BIN_DIRS} - rm -rf ${CL_BIN_DIR} - mkdir -p ${CL_BIN_DIR} - if [ x"$TARGET_ABI" != x"host" ]; then - adb -s $DEVICE_ID pull ${KERNEL_DIR}/. ${CL_BIN_DIR} > /dev/null - adb -s $DEVICE_ID pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR} > /dev/null - fi -fi - -python mace/python/tools/opencl_codegen.py \ - --cl_binary_dirs=${CL_BIN_DIRS} \ - --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc - -python mace/python/tools/binary_codegen.py \ - --binary_dirs=${CL_BIN_DIRS} \ - --binary_file_name=mace_run.config \ - --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc diff --git a/tools/generate_tuning_param_code.sh b/tools/generate_tuning_param_code.sh new file mode 100644 index 0000000000000000000000000000000000000000..4d116c9fec7c140a2078e0d6f28e1c5181468e51 --- /dev/null +++ b/tools/generate_tuning_param_code.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +CURRENT_DIR=`dirname $0` +source ${CURRENT_DIR}/env.sh + +mkdir -p ${TUNING_CODEGEN_DIR} + +if [ "$#" -eq "0" ]; then + python mace/python/tools/binary_codegen.py \ + --binary_file_name=mace_run.config \ + --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc +else + + TARGET_SOC=$1 + BIN_DIRS=$2 + PULL_OR_NOT=$3 + + RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC` + if [ $? -ne 0 ]; then + echo $RESULT_VALUE + exit 1 + else + DEVICE_ID=$RESULT_VALUE + fi + + if [ "$PULL_OR_NOT" = 1 ]; then + mkdir -p ${BIN_DIRS} + rm -rf ${BIN_DIRS}/mace_run.config + if [ x"$TARGET_ABI" != x"host" ]; then + adb -s $DEVICE_ID pull ${PHONE_DATA_DIR}/mace_run.config ${BIN_DIRS} > /dev/null + fi + fi + + python mace/python/tools/binary_codegen.py \ + --binary_dirs=${BIN_DIRS} \ + --binary_file_name=mace_run.config \ + --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc +fi + + diff --git a/tools/generate_opencl_and_version_code.sh b/tools/generate_version_code.sh similarity index 54% rename from tools/generate_opencl_and_version_code.sh rename to tools/generate_version_code.sh index ee32acc7dac7fdfc83beb0aab1be26106efbf6e0..7b1c33d219a6a886c29335deff8c4bc6661c39c4 100644 --- a/tools/generate_opencl_and_version_code.sh +++ b/tools/generate_version_code.sh @@ -1,13 +1,10 @@ #!/bin/bash +CL_KERNEL_DIR_TAG=$1 + CURRENT_DIR=`dirname $0` source ${CURRENT_DIR}/env.sh -python mace/python/tools/encrypt_opencl_codegen.py \ - --cl_kernel_dir=./mace/kernels/opencl/cl/ \ - --output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit 1 - - rm -rf ${CODEGEN_DIR}/version mkdir ${CODEGEN_DIR}/version bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || exit 1 diff --git a/tools/mace_tools.py b/tools/mace_tools.py index f9dd9181b2e55717598b9d526ccf4505750ee0e0..d229573a5f1210068eb00616b9cf9b5e7b360b17 100644 --- a/tools/mace_tools.py +++ b/tools/mace_tools.py @@ -9,6 +9,7 @@ import argparse import hashlib import os +import sh import shutil import subprocess import sys @@ -58,10 +59,41 @@ def get_global_runtime(configs): return global_runtime -def generate_opencl_and_version_code(): - command = "bash tools/generate_opencl_and_version_code.sh" +def generate_version_code(): + command = "bash tools/generate_version_code.sh" run_command(command) +def generate_opencl_source_code(): + command = "bash tools/generate_opencl_code.sh source" + run_command(command) + +def generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not): + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + if not cl_bin_dirs: + command = "bash tools/generate_opencl_code.sh binary" + else: + command = "bash tools/generate_opencl_code.sh {} {} {} {}".format( + 'binary', target_soc, cl_bin_dirs_str, int(pull_or_not)) + run_command(command) + +def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not): + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + if not cl_bin_dirs: + command = "bash tools/generate_tuning_param_code.sh" + else: + command = "bash tools/generate_tuning_param_code.sh {} {} {}".format( + target_soc, cl_bin_dirs_str, int(pull_or_not)) + run_command(command) + +def generate_code(target_soc, model_output_dirs, pull_or_not): + generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not) + generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not) def clear_env(target_soc): command = "bash tools/clear_env.sh {}".format(target_soc) @@ -111,18 +143,41 @@ def build_mace_run(production_mode, model_output_dir, hexagon_mode): run_command(command) -def tuning_run(target_soc, +def tuning_run(model_name, + target_runtime, + target_abi, + target_soc, model_output_dir, running_round, tuning, - production_mode, restart_round, option_args=''): - command = "bash tools/tuning_run.sh {} {} {} {} {} {} \"{}\"".format( - target_soc, model_output_dir, running_round, int(tuning), - int(production_mode), restart_round, option_args) - run_command(command) - + # TODO(yejianwu) refactoring the hackish code + stdout_buff = [] + process_output = sh_commands.make_output_processor(stdout_buff) + p = sh.bash("tools/tuning_run.sh", target_soc, model_output_dir, + running_round, int(tuning), + restart_round, option_args, _out=process_output, + _bg=True, _err_to_out=True) + p.wait() + metrics = {} + for line in stdout_buff: + line = line.strip() + parts = line.split() + if len(parts) == 6 and parts[0].startswith("time"): + metrics["%s.create_net_ms" % model_name] = str(float(parts[1])) + metrics["%s.mace_engine_ctor_ms" % model_name] = str(float(parts[2])) + metrics["%s.init_ms" % model_name] = str(float(parts[3])) + metrics["%s.warmup_ms" % model_name] = str(float(parts[4])) + if float(parts[5]) > 0: + metrics["%s.avg_latency_ms" % model_name] = str(float(parts[5])) + tags = {"ro.board.platform": target_soc, + "abi": target_abi, + # "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime + "round": running_round, # TODO(yejianwu) change this to source/binary + "tuning": tuning} + sh_commands.falcon_push_metrics(metrics, endpoint="mace_model_benchmark", + tags=tags) def benchmark_model(target_soc, model_output_dir, option_args=''): command = "bash tools/benchmark.sh {} {} \"{}\"".format( @@ -130,9 +185,10 @@ def benchmark_model(target_soc, model_output_dir, option_args=''): run_command(command) -def run_model(target_soc, model_output_dir, running_round, restart_round, - option_args): - tuning_run(target_soc, model_output_dir, running_round, False, False, +def run_model(model_name, target_runtime, target_abi, target_soc, + model_output_dir, running_round, restart_round, option_args): + tuning_run(model_name, target_runtime, target_abi, target_soc, + model_output_dir, running_round, False, restart_round, option_args) @@ -146,25 +202,28 @@ def generate_production_code(target_soc, model_output_dirs, pull_or_not): run_command(command) -def build_mace_run_prod(target_soc, model_output_dir, tuning, global_runtime): - if "dsp" == global_runtime: +def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc, + model_output_dir, tuning): + if "dsp" == target_runtime: hexagon_mode = True else: hexagon_mode = False + generate_code(target_soc, [], False) production_or_not = False build_mace_run(production_or_not, model_output_dir, hexagon_mode) tuning_run( + model_name, + target_runtime, + target_abi, target_soc, model_output_dir, running_round=0, tuning=tuning, - production_mode=production_or_not, restart_round=1) + generate_code(target_soc, [model_output_dir], True) production_or_not = True - pull_or_not = True - generate_production_code(target_soc, [model_output_dir], pull_or_not) build_mace_run(production_or_not, model_output_dir, hexagon_mode) @@ -188,8 +247,7 @@ def build_production_code(): def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs): - pull_or_not = False - generate_production_code(target_soc, model_output_dirs, pull_or_not) + generate_code(target_soc, model_output_dirs, False) build_production_code() model_output_dirs_str = ",".join(model_output_dirs) @@ -202,6 +260,26 @@ def packaging_lib_file(output_dir): command = "bash tools/packaging_lib.sh {}".format(output_dir) run_command(command) +def download_model_files(model_file_path, + model_output_dir, + weight_file_path=""): + if model_file_path.startswith("http://") or \ + model_file_path.startswith("https://"): + os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb" + urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"]) + + if weight_file_path.startswith("http://") or \ + weight_file_path.startswith("https://"): + os.environ[ + "WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel" + urllib.urlretrieve(weight_file_path, + os.environ["WEIGHT_FILE_PATH"]) + +def md5sum(str): + md5 = hashlib.md5() + md5.update(str) + return md5.hexdigest() + def parse_model_configs(): with open(FLAGS.config) as f: @@ -268,7 +346,9 @@ def main(unused_args): shutil.rmtree(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) os.makedirs(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) - generate_opencl_and_version_code() + generate_version_code() + generate_opencl_source_code() + option_args = ' '.join([arg for arg in unused_args if arg.startswith('--')]) available_socs = sh_commands.adb_get_all_socs() @@ -285,6 +365,7 @@ def main(unused_args): print("Error: devices with SoCs are not connected %s" % missing_socs) exit(1) + for target_soc in target_socs: for target_abi in configs["target_abis"]: global_runtime = get_global_runtime(configs) @@ -292,9 +373,9 @@ def main(unused_args): os.environ["TARGET_ABI"] = target_abi model_output_dirs = [] for model_name in configs["models"]: + print '=======================', model_name, '=======================' # Transfer params by environment os.environ["MODEL_TAG"] = model_name - print '=======================', model_name, '=======================' model_config = configs["models"][model_name] input_file_list = model_config.get("validation_inputs_data", []) for key in model_config: @@ -307,9 +388,8 @@ def main(unused_args): else: os.environ[key.upper()] = str(model_config[key]) - md5 = hashlib.md5() - md5.update(model_config["model_file_path"]) - model_path_digest = md5.hexdigest() + # Create model build directory + model_path_digest = md5sum(model_config["model_file_path"]) model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (FLAGS.output_dir, os.environ["PROJECT_NAME"], "build", model_name, @@ -323,21 +403,8 @@ def main(unused_args): os.makedirs(model_output_dir) clear_env(target_soc) - # Support http:// and https:// - if model_config["model_file_path"].startswith( - "http://") or model_config["model_file_path"].startswith( - "https://"): - os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb" - urllib.urlretrieve(model_config["model_file_path"], - os.environ["MODEL_FILE_PATH"]) - - if model_config["platform"] == "caffe" and ( - model_config["weight_file_path"].startswith("http://") or - model_config["weight_file_path"].startswith("https://")): - os.environ[ - "WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel" - urllib.urlretrieve(model_config["weight_file_path"], - os.environ["WEIGHT_FILE_PATH"]) + download_model_files(model_config["model_file_path"], + model_output_dir, model_config.get("weight_file_path", "")) if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\ or FLAGS.mode == "benchmark" or FLAGS.mode == "all": @@ -346,12 +413,13 @@ def main(unused_args): if FLAGS.mode == "build" or FLAGS.mode == "all": generate_model_code() - build_mace_run_prod(target_soc, model_output_dir, FLAGS.tuning, - global_runtime) + build_mace_run_prod(model_name, global_runtime, target_abi, + target_soc, model_output_dir, FLAGS.tuning) if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all": - run_model(target_soc, model_output_dir, FLAGS.round, - FLAGS.restart_round, option_args) + run_model(model_name, global_runtime, target_abi, target_soc, + model_output_dir, FLAGS.round, FLAGS.restart_round, + option_args) if FLAGS.mode == "benchmark": benchmark_model(target_soc, model_output_dir, option_args) diff --git a/tools/merge_libs.sh b/tools/merge_libs.sh index cc4cd4b054d4b14c8a1b25e4dbfd4d582d054884..90bfa7d96a104775bc344975649c47e411f783a3 100644 --- a/tools/merge_libs.sh +++ b/tools/merge_libs.sh @@ -41,14 +41,13 @@ LIBMACE_TEMP_DIR=`mktemp -d -t libmace.XXXX` echo "create ${LIBMACE_BUILD_DIR}/${TARGET_ABI}/libmace_${PROJECT_NAME}.${TARGET_SOC}.a" > ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri if [ x"$TARGET_ABI" = x"host" ]; then - echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri + echo "addlib bazel-bin/mace/codegen/libgenerated_opencl.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri else - echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri + echo "addlib bazel-bin/mace/codegen/libgenerated_opencl.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/codegen/libgenerated_version.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/core/libcore.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri - echo "addlib bazel-bin/mace/core/libopencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/kernels/libkernels.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/utils/libutils.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri echo "addlib bazel-bin/mace/utils/libutils_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 21dcef8be73707f3e32a6ba7ca3ba9d2b598c28c..666ce7b0bff61461cf547ec416114c77e808683d 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -70,22 +70,20 @@ def adb_run(serialno, host_bin_path, bin_name, out_of_range_check=1): host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) - device_cl_path = "%s/cl" % device_bin_path props = adb_getprop_by_serialno(serialno) print("=====================================================================") print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"], props["ro.product.model"])) sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path) sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path) - sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_cl_path) print("Push %s to %s" % (host_bin_full_path, device_bin_full_path)) - sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_path) + sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path) print("Run %s" % device_bin_full_path) stdout_buff=[] process_output = make_output_processor(stdout_buff) p = sh.adb("-s", serialno, "shell", - "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % - (out_of_range_check, opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args), + "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % + (out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args), _out=process_output, _bg=True, _err_to_out=True) p.wait() return "".join(stdout_buff) @@ -131,6 +129,7 @@ def bazel_target_to_bin(target): ################################ # TODO this should be refactored def gen_encrypted_opencl_source(codegen_path="mace/codegen"): + sh.mkdir("-p", "%s/opencl" % codegen_path) sh.python("mace/python/tools/encrypt_opencl_codegen.py", "--cl_kernel_dir=./mace/kernels/opencl/cl/", "--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path) @@ -140,24 +139,32 @@ def gen_mace_version(codegen_path="mace/codegen"): sh.bash("mace/tools/git/gen_version_source.sh", "%s/version/version.cc" % codegen_path) +def gen_compiled_opencl_source(codegen_path="mace/codegen"): + sh.mkdir("-p", "%s/opencl" % codegen_path) + sh.python("mace/python/tools/opencl_codegen.py", + "--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path) + ################################ # falcon ################################ -def falcon_tags(platform, model, abi): - return "ro.board.platform=%s,ro.product.model=%s,abi=%s" % (platform, model, abi) - -def falcon_push_metrics(metrics, device_properties, abi, endpoint="mace_dev"): +def falcon_tags(tags_dict): + tags = "" + for k, v in tags_dict.iteritems(): + if tags == "": + tags = "%s=%s" % (k, v) + else: + tags = tags + ",%s=%s" % (k, v) + return tags + +def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}): cli = falcon_cli.FalconCli.connect(server="transfer.falcon.miliao.srv", port=8433, debug=False) - platform = device_properties["ro.board.platform"].replace(" ", "-") - model = device_properties["ro.product.model"].replace(" ", "-") - tags = falcon_tags(platform, model, abi) ts = int(time.time()) falcon_metrics = [{ "endpoint": endpoint, "metric": key, - "tags": tags, + "tags": falcon_tags(tags), "timestamp": ts, "value": value, "step": 86400, diff --git a/tools/tuning_run.sh b/tools/tuning_run.sh index c4e8dbe99e403b6d067b4a802022f341e6a49141..98fc9e092b88b55ec573bc688c95e14ec3242149 100644 --- a/tools/tuning_run.sh +++ b/tools/tuning_run.sh @@ -1,10 +1,10 @@ #!/bin/bash Usage() { - echo "Usage: bash tools/tuning_run.sh target_soc model_output_dir round tuning production_mode" + echo "Usage: bash tools/tuning_run.sh target_soc model_output_dir round tuning " } -if [ $# -lt 7 ]; then +if [ $# -lt 6 ]; then Usage exit 1 fi @@ -16,9 +16,8 @@ TARGET_SOC=$1 MODEL_OUTPUT_DIR=$2 ROUND=$3 TUNING_OR_NOT=$4 -PRODUCTION_MODE=$5 -RESTART_ROUND=$6 -OPTION_ARGS=$7 +RESTART_ROUND=$5 +OPTION_ARGS=$6 echo $OPTION_ARGS @@ -45,16 +44,14 @@ if [ x"$TARGET_ABI" = x"host" ]; then --restart_round=1 \ $OPTION_ARGS || exit 1 else - if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != 1 ]];then + if [[ "${TUNING_OR_NOT}" != "0" ]];then tuning_flag=1 else tuning_flag=0 fi adb -s $DEVICE_ID shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1 - if [ "$PRODUCTION_MODE" = 0 ]; then - adb -s $DEVICE_ID shell "mkdir -p ${KERNEL_DIR}" || exit 1 - fi + adb -s $DEVICE_ID shell "mkdir -p ${COMPILED_PROGRAM_DIR}" || exit 1 IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}" for NAME in "${INPUT_NAMES[@]}";do @@ -72,7 +69,7 @@ else MACE_TUNING=${tuning_flag} \ MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \ MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \ - MACE_KERNEL_PATH=$KERNEL_DIR \ + MACE_CL_PROGRAM_PATH=$COMPILED_PROGRAM_DIR \ MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \ ${PHONE_DATA_DIR}/mace_run \ --input_node="${INPUT_NODES}" \