From 39fb3055a507afc308a5bf2e73095d9d17e744f3 Mon Sep 17 00:00:00 2001 From: liuqi Date: Mon, 4 Jun 2018 09:29:16 +0800 Subject: [PATCH] Refactor opencl binary load logic: use file instead of code. --- docs/getting_started/how_to_build.rst | 27 +++- docs/getting_started/how_to_build_zh.rst | 7 + mace/benchmark/benchmark_model.cc | 8 + mace/core/runtime/opencl/opencl_runtime.cc | 146 +++++++++++++----- mace/core/runtime/opencl/opencl_runtime.h | 11 +- mace/examples/example.cc | 7 + mace/public/mace_runtime.h | 9 +- mace/python/tools/opencl_codegen.py | 124 --------------- .../tools/opencl_compiled_kernel.cc.jinja2 | 39 ----- mace/tools/validation/BUILD | 1 - mace/tools/validation/mace_run.cc | 32 +--- tools/bazel_adb_run.py | 1 - tools/common.py | 9 ++ tools/converter.py | 49 +++--- tools/sh_commands.py | 93 ++++++++--- 15 files changed, 284 insertions(+), 279 deletions(-) delete mode 100644 mace/python/tools/opencl_codegen.py delete mode 100644 mace/python/tools/opencl_compiled_kernel.cc.jinja2 diff --git a/docs/getting_started/how_to_build.rst b/docs/getting_started/how_to_build.rst index 26ec3cb9..d07379bf 100644 --- a/docs/getting_started/how_to_build.rst +++ b/docs/getting_started/how_to_build.rst @@ -365,6 +365,16 @@ The followings list the details. ``.pb`` file will be generated only when build_type is ``proto``. +**OpenCL compiled kernel binary file** + * ``opencl/compiled_opencl_kernel.bin`` + + .. note:: + + This file will be generated only when ``target_soc`` is specified and runtime is ``gpu``. + + .. warning:: + + This file relies on the OpenCL driver on the phone; you should update the file when the OpenCL driver changes. ============= 5. how to use @@ -385,14 +395,21 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li new FileStorageFactory(file_path)); ConfigKVStorageFactory(storage_factory); - //1. Declare the device type(must be same with ``runtime`` in configuration file) + // 1. 
Set the precompiled OpenCL binary file paths if you use the GPU of a specific SoC. + // Note that the binary relies on the OpenCL driver of the SoC; + // if the OpenCL driver changes, you should recompile the binary file. + if (device_type == DeviceType::GPU) { + mace::SetOpenCLBinaryPaths(opencl_binary_paths); + } + + // 2. Declare the device type (must be the same as ``runtime`` in the configuration file) DeviceType device_type = DeviceType::GPU; - //2. Define the input and output tensor names. + // 3. Define the input and output tensor names. std::vector input_names = {...}; std::vector output_names = {...}; - //3. Create MaceEngine object + // 4. Create MaceEngine object std::shared_ptr engine; MaceStatus create_engine_status; // Create Engine from code @@ -415,7 +432,7 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li // do something } - //4. Create Input and Output objects + // 5. Create Input and Output objects std::map inputs; std::map outputs; for (size_t i = 0; i < input_count; ++i) { @@ -440,6 +457,6 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); } - //5. Run the model + // 6. Run the model MaceStatus status = engine.Run(inputs, &outputs); diff --git a/docs/getting_started/how_to_build_zh.rst b/docs/getting_started/how_to_build_zh.rst index 3bbe3461..7945d7a1 100644 --- a/docs/getting_started/how_to_build_zh.rst +++ b/docs/getting_started/how_to_build_zh.rst @@ -390,6 +390,13 @@ Mace目前只提供静态库,有以下两种使用场景。 new FileStorageFactory(file_path)); ConfigKVStorageFactory(storage_factory); + // 2. 如果你使用特定SOC的GPU,可以设置OpenCL预编译的二进制文件路径。 + // * 该二进制文件是依赖于手机上OpenCL driver的,如果OpenCL driver改变了, + // 你需要重新编译并更新该二进制文件。 + if (device_type == DeviceType::GPU) { + mace::SetOpenCLBinaryPaths(opencl_binary_paths); + } + //1. 
声明设备类型(必须与build时指定的runtime一致) DeviceType device_type = DeviceType::GPU; diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index b6cd2d69..746a6c89 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -188,6 +188,9 @@ DEFINE_string(input_file, "", "input file name"); DEFINE_int32(max_num_runs, 100, "number of runs max"); DEFINE_string(max_time, "10.0", "length to run max"); DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); +DEFINE_string(opencl_binary_file, + "", + "compiled opencl binary file path"); DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); DEFINE_string(model_file, "", @@ -270,6 +273,11 @@ int Main(int argc, char **argv) { new FileStorageFactory(kernel_file_path)); SetKVStorageFactory(storage_factory); + if (device_type == DeviceType::GPU) { + std::vector opencl_binary_paths = {FLAGS_opencl_binary_file}; + mace::SetOpenCLBinaryPaths(opencl_binary_paths); + } + // Create Engine std::shared_ptr engine; MaceStatus create_engine_status; diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 6d068cf7..1a611beb 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -14,6 +14,8 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" +#include + #include #include #include @@ -31,9 +33,6 @@ namespace mace { -extern const std::map> - kCompiledProgramMap; -extern const std::string kCompiledProgramPlatform; extern const std::map> kEncryptedProgramMap; @@ -43,6 +42,12 @@ void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint); } +// Set OpenCL Compiled Binary paths, just call once. 
(Not thread-safe) +void SetOpenCLBinaryPaths(const std::vector &paths) { + OpenCLRuntime::ConfigureOpenCLBinaryPath(paths); +} + + const std::string OpenCLErrorToString(cl_int error) { switch (error) { case CL_SUCCESS: @@ -237,6 +242,25 @@ GPUType ParseGPUType(const std::string &device_name) { return GPUType::UNKNOWN; } } + +std::string FindFirstExistPath(const std::vector &paths) { + std::string result; + struct stat st; + for (auto path : paths) { + if (stat(path.c_str(), &st) == 0) { + if (S_ISREG(st.st_mode)) { + result = path; + break; + } + } + } + return result; +} + +const char *kOpenCLPlatformInfoKey = + "mace_opencl_precompiled_platform_info_key"; +const char *kPrecompiledProgramFileName = + "mace_cl_compiled_program.bin"; } // namespace void OpenCLProfilingTimer::StartTiming() {} @@ -267,6 +291,8 @@ void OpenCLProfilingTimer::ClearTiming() { GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL; GPUPriorityHint OpenCLRuntime::kGPUPriorityHint = GPUPriorityHint::PRIORITY_DEFAULT; +std::string + OpenCLRuntime::kPrecompiledBinaryPath = ""; // NOLINT(runtime/string) OpenCLRuntime *OpenCLRuntime::Global() { static OpenCLRuntime runtime; @@ -279,9 +305,19 @@ void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint, OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint; } +void OpenCLRuntime::ConfigureOpenCLBinaryPath( + const std::vector &paths) { + OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths); + if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) { + LOG(WARNING) << "There is no precompiled OpenCL binary file in " + << MakeString(paths); + } +} OpenCLRuntime::OpenCLRuntime(): - storage_(nullptr), is_profiling_enabled_(false) { + precompiled_binary_storage_(nullptr), + cache_storage_(nullptr), + is_profiling_enabled_(false) { LoadOpenCLLibrary(); std::vector all_platforms; @@ -369,12 +405,38 @@ OpenCLRuntime::OpenCLRuntime(): extern std::shared_ptr kStorageFactory; if (kStorageFactory != nullptr) { - const std::string 
cl_compiled_file_name = "mace_cl_compiled_program.bin"; - storage_ = kStorageFactory->CreateStorage(cl_compiled_file_name); + cache_storage_ = + kStorageFactory->CreateStorage(kPrecompiledProgramFileName); - if (platform_info_ != kCompiledProgramPlatform) { - if (storage_->Load() != 0) { - LOG(FATAL) << "Load opencl compiled kernel file failed"; + if (cache_storage_->Load() != 0) { + LOG(FATAL) << "Load OpenCL cached compiled kernel file failed"; + } + auto platform_info_array = + this->cache_storage_->Find(kOpenCLPlatformInfoKey); + if (platform_info_array != nullptr) { + cached_binary_platform_info_ = + std::string(platform_info_array->begin(), + platform_info_array->end()); + } + } + + if (cached_binary_platform_info_ != platform_info_) { + if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) { + LOG(WARNING) << "There is no precompiled OpenCL binary in" + " all OpenCL binary paths"; + } else { + precompiled_binary_storage_.reset( + new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath)); + if (precompiled_binary_storage_->Load() != 0) { + LOG(FATAL) << "Load OpenCL precompiled kernel file failed"; + } + + auto platform_info_array = + this->precompiled_binary_storage_->Find(kOpenCLPlatformInfoKey); + if (platform_info_array != nullptr) { + precompiled_binary_platform_info_ = + std::string(platform_info_array->begin(), + platform_info_array->end()); } } } @@ -416,16 +478,23 @@ uint32_t OpenCLRuntime::device_compute_units() const { return device_compute_units_; } -bool OpenCLRuntime::BuildProgramFromBinary( +bool OpenCLRuntime::BuildProgramFromCache( const std::string &built_program_key, const std::string &build_options_str, cl::Program *program) { // Find from binary - if (kCompiledProgramPlatform != platform_info_) return false; - auto it_binary = kCompiledProgramMap.find(built_program_key); - if (it_binary == kCompiledProgramMap.end()) return false; + if (this->cache_storage_ == nullptr) return false; + if (cached_binary_platform_info_ != platform_info_) { + 
VLOG(3) << "cached OpenCL binary version is not same" + " with current version"; + return false; + } + auto content = this->cache_storage_->Find(built_program_key); + if (content == nullptr) { + return false; + } - *program = cl::Program(context(), {device()}, {it_binary->second}); + *program = cl::Program(context(), {device()}, {*content}); cl_int ret = program->build({device()}, build_options_str.c_str()); if (ret != CL_SUCCESS) { if (program->getBuildInfo(device()) == @@ -435,25 +504,27 @@ bool OpenCLRuntime::BuildProgramFromBinary( LOG(INFO) << "Program build log: " << build_log; } LOG(WARNING) << "Build program " - << built_program_key << " from Binary failed:" - << (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible " - "cause 1: the MACE library is built from SoC 1 but is " - "used on different SoC 2, possible cause 2: the MACE " - "buffer is corrupted make sure your code has no " - "out-of-range memory writing" : MakeString(ret)); + << built_program_key << " from Cache failed:" + << MakeString(ret); return false; } - VLOG(3) << "Program from Binary: " << built_program_key; + VLOG(3) << "Program from Cache: " << built_program_key; return true; } -bool OpenCLRuntime::BuildProgramFromCache( +bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( const std::string &built_program_key, const std::string &build_options_str, cl::Program *program) { // Find from binary - if (this->storage_ == nullptr) return false; - auto content = this->storage_->Find(built_program_key); + if (this->precompiled_binary_storage_ == nullptr) return false; + if (precompiled_binary_platform_info_ != platform_info_) { + VLOG(3) << "precompiled OpenCL binary version " + << precompiled_binary_platform_info_ + << " is not same with current version"; + return false; + } + auto content = this->precompiled_binary_storage_->Find(built_program_key); if (content == nullptr) { return false; } @@ -468,11 +539,11 @@ bool OpenCLRuntime::BuildProgramFromCache( LOG(INFO) << "Program build log: " 
<< build_log; } LOG(WARNING) << "Build program " - << built_program_key << " from Cache failed:" + << built_program_key << " from precompiled binary failed:" << MakeString(ret); return false; } - VLOG(3) << "Program from Cache: " << built_program_key; + VLOG(3) << "Program from precompiled binary: " << built_program_key; return true; } @@ -527,8 +598,8 @@ void OpenCLRuntime::BuildProgramFromSource( reinterpret_cast(program_binaries[0].get()) + program_binary_sizes[0]); - if (this->storage_ != nullptr) { - this->storage_->Insert(built_program_key, content); + if (this->cache_storage_ != nullptr) { + this->cache_storage_->Insert(built_program_key, content); } VLOG(3) << "Program from source: " << built_program_key; @@ -543,13 +614,12 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, std::string build_options_str = build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math"; - // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math - bool ret = BuildProgramFromBinary(built_program_key, - build_options_str, program); + // Build flow: cache -> precompiled binary -> source + bool ret = BuildProgramFromCache(built_program_key, + build_options_str, program); if (!ret) { - ret = BuildProgramFromCache(built_program_key, - build_options_str, program); - // Fallback to source. + ret = BuildProgramFromPrecompiledBinary(built_program_key, + build_options_str, program); if (!ret) { BuildProgramFromSource(program_name, built_program_key, build_options_str, program); @@ -581,8 +651,12 @@ cl::Kernel OpenCLRuntime::BuildKernel( } void OpenCLRuntime::SaveBuiltCLProgram() { - if (storage_ != nullptr) { - if (storage_->Flush() != 0) { + if (cache_storage_ != nullptr) { + // update platform info + cache_storage_->Insert(kOpenCLPlatformInfoKey, + std::vector(platform_info_.begin(), + platform_info_.end())); + if (cache_storage_->Flush() != 0) { LOG(FATAL) << "Store OPENCL compiled kernel to file failed." 
" Please Make sure the storage directory exist."; } diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index f7fab747..447a2c37 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -66,6 +66,7 @@ class OpenCLRuntime { public: static OpenCLRuntime *Global(); static void Configure(GPUPerfHint, GPUPriorityHint); + static void ConfigureOpenCLBinaryPath(const std::vector &paths); cl::Context &context(); cl::Device &device(); @@ -99,11 +100,11 @@ class OpenCLRuntime { const std::string &binary_file_name, const std::string &build_options, cl::Program *program); - bool BuildProgramFromBinary( + bool BuildProgramFromCache( const std::string &built_program_key, const std::string &build_options_str, cl::Program *program); - bool BuildProgramFromCache( + bool BuildProgramFromPrecompiledBinary( const std::string &built_program_key, const std::string &build_options_str, cl::Program *program); @@ -115,7 +116,8 @@ class OpenCLRuntime { const std::string ParseDeviceVersion(const std::string &device_version); private: - std::unique_ptr storage_; + std::unique_ptr precompiled_binary_storage_; + std::unique_ptr cache_storage_; bool is_profiling_enabled_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. 
@@ -126,6 +128,8 @@ class OpenCLRuntime { std::mutex program_build_mutex_; std::string platform_info_; std::string opencl_version_; + std::string precompiled_binary_platform_info_; + std::string cached_binary_platform_info_; bool out_of_range_check_; uint64_t device_gloabl_mem_cache_size_; uint32_t device_compute_units_; @@ -133,6 +137,7 @@ class OpenCLRuntime { static GPUPerfHint kGPUPerfHint; static GPUPriorityHint kGPUPriorityHint; + static std::string kPrecompiledBinaryPath; }; } // namespace mace diff --git a/mace/examples/example.cc b/mace/examples/example.cc index 3a1f6831..caa80dcd 100644 --- a/mace/examples/example.cc +++ b/mace/examples/example.cc @@ -123,6 +123,9 @@ DEFINE_string(model_data_file, DEFINE_string(model_file, "", "model file name, used when load mace model in pb"); +DEFINE_string(opencl_binary_file, + "", + "compiled opencl binary file path"); DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); @@ -151,6 +154,10 @@ bool RunModel(const std::vector &input_names, } #endif // MACE_ENABLE_OPENCL + if (device_type == DeviceType::GPU) { + std::vector opencl_binary_paths = {FLAGS_opencl_binary_file}; + mace::SetOpenCLBinaryPaths(opencl_binary_paths); + } // DO NOT USE tmp directory. // Please use APP's own directory and make sure the directory exists. // Just call once diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 16bd817e..f353da75 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -76,9 +76,16 @@ class FileStorageFactory : public KVStorageFactory { std::unique_ptr impl_; }; -// Set KV store factory used as OpenCL cache. +// Set KV store factory used as OpenCL cache. (Call Once) void SetKVStorageFactory(std::shared_ptr storage_factory); +// Just call once. (Not thread-safe) +// Set paths of OpenCL Compiled Binary file if you use gpu of specific soc. +// Using OpenCL binary will speed up the initialization. 
+// OpenCL binary is corresponding to the OpenCL Driver version, +// you should update the binary when OpenCL Driver changed. +void SetOpenCLBinaryPaths(const std::vector &paths); + // Set GPU hints, currently only supports Adreno GPU. // // Caution: this function may hurt performance if improper parameters provided. diff --git a/mace/python/tools/opencl_codegen.py b/mace/python/tools/opencl_codegen.py deleted file mode 100644 index c21e5b97..00000000 --- a/mace/python/tools/opencl_codegen.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2018 Xiaomi, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -import sys -import struct - -import numpy as np - -import jinja2 - -# python mace/python/tools/opencl_codegen.py \ -# --cl_binary_dirs=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH} - -FLAGS = None - - -def generate_cpp_source(cl_binary_dirs, - built_kernel_file_name, - platform_info_file_name): - maps = {} - platform_info = '' - binary_dirs = cl_binary_dirs.strip().split(",") - for binary_dir in binary_dirs: - binary_path = os.path.join(binary_dir, built_kernel_file_name) - if not os.path.exists(binary_path): - continue - - print 'generate opencl code from', binary_path - with open(binary_path, "rb") as f: - binary_array = np.fromfile(f, dtype=np.uint8) - - idx = 0 - size, = struct.unpack("Q", binary_array[idx:idx + 8]) - idx += 8 - for _ in xrange(size): - key_size, = struct.unpack("i", binary_array[idx:idx + 4]) - idx += 4 - key, = struct.unpack( - str(key_size) + "s", binary_array[idx:idx + key_size]) - idx += key_size - value_size, = struct.unpack("i", binary_array[idx:idx + 4]) - idx += 4 - maps[key] = [] - value = struct.unpack( - str(value_size) + "B", binary_array[idx:idx + value_size]) - idx += value_size - for ele in value: - maps[key].append(hex(ele)) - - cl_platform_info_path = os.path.join(binary_dir, - platform_info_file_name) - with open(cl_platform_info_path, 'r') as f: - curr_platform_info = f.read() - if platform_info != "": - assert (curr_platform_info == platform_info) - platform_info = curr_platform_info - - env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - return env.get_template('opencl_compiled_kernel.cc.jinja2').render( - maps=maps, - data_type='unsigned char', - variable_name='kCompiledProgramMap', - platform_info=platform_info, - ) - - -def opencl_codegen(output_path, - cl_binary_dirs="", - built_kernel_file_name="", - platform_info_file_name=""): - cpp_cl_binary_source = generate_cpp_source(cl_binary_dirs, - built_kernel_file_name, - platform_info_file_name) - if 
os.path.isfile(output_path): - os.remove(output_path) - with open(output_path, "w") as w_file: - w_file.write(cpp_cl_binary_source) - - -def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--cl_binary_dirs", - type=str, - default="", - help="The cl binaries directories.") - parser.add_argument( - "--built_kernel_file_name", - type=str, - default="", - help="The cl binaries directories.") - parser.add_argument( - "--platform_info_file_name", - type=str, - default="", - help="The cl binaries directories.") - parser.add_argument( - "--output_path", - type=str, - default="./mace/examples/codegen/opencl/opencl_compiled_program.cc", - help="The path of generated C++ header file for cl binaries.") - return parser.parse_known_args() - - -if __name__ == '__main__': - FLAGS, unparsed = parse_args() - opencl_codegen(FLAGS.output_path, - FLAGS.cl_binary_dirs, - FLAGS.built_kernel_file_name, - FLAGS.platform_info_file_name) diff --git a/mace/python/tools/opencl_compiled_kernel.cc.jinja2 b/mace/python/tools/opencl_compiled_kernel.cc.jinja2 deleted file mode 100644 index db268413..00000000 --- a/mace/python/tools/opencl_compiled_kernel.cc.jinja2 +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This is a generated file. DO NOT EDIT! 
- -#include -#include -#include - -namespace mace { - -extern const std::map> {{variable_name}} = -{ - {% for key, value in maps.iteritems() %} - { - "{{key}}", - { - {%- for ele in value -%} - {{ele}}, - {%- endfor -%} - } - }, // {{key}} -{% endfor %} -}; - -extern const std::string kCompiledProgramPlatform = {{platform_info|tojson}}; - -} // namespace mace diff --git a/mace/tools/validation/BUILD b/mace/tools/validation/BUILD index 236b0543..f53b9ab7 100644 --- a/mace/tools/validation/BUILD +++ b/mace/tools/validation/BUILD @@ -13,6 +13,5 @@ cc_binary( "//external:gflags_nothreads", "//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_models", - "//mace/core", ], ) diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index ac524e41..bb2887ad 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -38,9 +38,6 @@ #include "mace/utils/logging.h" #include "mace/utils/utils.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_runtime.h" -#endif // MACE_ENABLE_OPENCL #include "mace/codegen/engine/mace_engine_factory.h" namespace mace { @@ -100,22 +97,6 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } -#ifdef MACE_ENABLE_OPENCL -void WriteOpenCLPlatformInfo(const std::string &output_dir) { - std::string platform_info = OpenCLRuntime::Global()->platform_info(); - const std::string cl_platform_info_file_name = output_dir - + "/mace_cl_platform_info.txt"; - - std::ofstream ofs(cl_platform_info_file_name); - if (ofs.is_open()) { - ofs << platform_info; - ofs.close(); - } else { - LOG(WARNING) << "Write opencl platform info failed."; - } -} -#endif // MACE_ENABLE_OPENCL - struct mallinfo LogMallinfoChange(struct mallinfo prev) { struct mallinfo curr = mallinfo(); if (prev.arena != curr.arena) { @@ -187,6 +168,9 @@ DEFINE_string(input_file, DEFINE_string(output_file, "", "output file name | output file prefix for multiple outputs"); 
+DEFINE_string(opencl_binary_file, + "", + "compiled opencl binary file path"); DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0 or 2"); @@ -230,6 +214,11 @@ bool RunModel(const std::string &model_name, new FileStorageFactory(kernel_file_path)); SetKVStorageFactory(storage_factory); + if (device_type == DeviceType::GPU) { + std::vector opencl_binary_paths = {FLAGS_opencl_binary_file}; + mace::SetOpenCLBinaryPaths(opencl_binary_paths); + } + std::vector model_pb_data; if (FLAGS_model_file != "") { if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) { @@ -397,11 +386,6 @@ bool RunModel(const std::string &model_name, printf("time %11.3f %11.3f %11.3f\n", init_millis, warmup_millis, model_run_millis); -#ifdef MACE_ENABLE_OPENCL - if (device_type == DeviceType::GPU) { - WriteOpenCLPlatformInfo(kernel_file_path); - } -#endif // MACE_ENABLE_OPENCL for (size_t i = 0; i < output_count; ++i) { std::string output_name = diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 40c4b5fb..166a0edc 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -132,7 +132,6 @@ def main(unused_args): # generate sources sh_commands.gen_encrypted_opencl_source() - sh_commands.gen_compiled_opencl_source() sh_commands.gen_mace_version() sh_commands.gen_tuning_param_code([]) diff --git a/tools/common.py b/tools/common.py index 5e3d1149..fec09692 100644 --- a/tools/common.py +++ b/tools/common.py @@ -108,6 +108,15 @@ class StringFormatter: return star_line + str(message).center(line_length) + '\n' + star_line +################################ +# definitions +################################ +class DeviceType(object): + CPU = 'CPU' + GPU = 'GPU' + HEXAGON = 'HEXAGON' + + ################################ # Argument types ################################ diff --git a/tools/converter.py b/tools/converter.py index 09a51a0a..468de321 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -29,6 +29,7 @@ import 
sh_commands from sh_commands import BuildType from common import CaffeEnvType +from common import DeviceType from common import mace_check from common import MaceLogger from common import StringFormatter @@ -37,13 +38,14 @@ from common import StringFormatter # common definitions ################################ BUILD_OUTPUT_DIR = 'build' -PHONE_DATA_DIR = "/data/local/tmp/mace_run/" +PHONE_DATA_DIR = "/data/local/tmp/mace_run" MODEL_OUTPUT_DIR_NAME = 'model' BUILD_TMP_DIR_NAME = '_tmp' BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general' OUTPUT_LIBRARY_DIR_NAME = 'library' -CL_BUILT_KERNEL_FILE_NAME = "mace_cl_compiled_program.bin" -CL_PLATFORM_INFO_FILE_NAME = "mace_cl_platform_info.txt" +OUTPUT_OPENCL_BINARY_DIR_NAME = 'opencl' +OUTPUT_OPENCL_BINARY_FILE_NAME = 'compiled_opencl_kernel.bin' +CL_COMPILED_BINARY_FILE_NAME = "mace_cl_compiled_program.bin" CODEGEN_BASE_DIR = 'mace/codegen' MODEL_CODEGEN_DIR = CODEGEN_BASE_DIR + '/models' MACE_RUN_TARGET = "//mace/tools/validation:mace_run" @@ -176,11 +178,11 @@ def parse_device_type(runtime): device_type = "" if runtime == RuntimeType.dsp: - device_type = "HEXAGON" + device_type = DeviceType.HEXAGON elif runtime == RuntimeType.gpu: - device_type = "GPU" + device_type = DeviceType.GPU elif runtime == RuntimeType.cpu: - device_type = "CPU" + device_type = DeviceType.CPU return device_type @@ -433,6 +435,13 @@ def get_build_model_dirs(library_name, model_name, target_abi, target_soc, return model_output_base_dir, model_output_dir, mace_model_dir +def get_opencl_binary_output_path(library_name): + return '%s/%s/%s/%s' % (BUILD_OUTPUT_DIR, + library_name, + OUTPUT_OPENCL_BINARY_DIR_NAME, + OUTPUT_OPENCL_BINARY_FILE_NAME) + + ################################ # build ################################ @@ -440,17 +449,7 @@ def pull_opencl_binary_and_tuning_param(target_abi, serialno, model_output_dirs): sh_commands.pull_binaries(target_abi, serialno, model_output_dirs, - CL_BUILT_KERNEL_FILE_NAME, - CL_PLATFORM_INFO_FILE_NAME) - - 
-def gen_opencl_and_tuning_code(model_output_dirs): - # generate opencl binary code - sh_commands.gen_opencl_binary_code(model_output_dirs, - CL_BUILT_KERNEL_FILE_NAME, - CL_PLATFORM_INFO_FILE_NAME) - - sh_commands.gen_tuning_param_code(model_output_dirs) + CL_COMPILED_BINARY_FILE_NAME) def print_configuration(flags, configs): @@ -612,7 +611,7 @@ def build_specific_lib(target_abi, target_soc, serial_num, sh.rm("-rf", build_tmp_binary_dir) os.makedirs(build_tmp_binary_dir) - gen_opencl_and_tuning_code([]) + sh_commands.gen_tuning_param_code(model_output_dirs) sh_commands.bazel_build( MACE_RUN_TARGET, abi=target_abi, @@ -639,7 +638,7 @@ def build_specific_lib(target_abi, target_soc, serial_num, os.makedirs(model_output_dir) # build for specified soc - if not address_sanitizer and tuning and target_abi != ABIType.host \ + if not address_sanitizer and target_abi != ABIType.host \ and target_soc is not None and \ model_runtime in [RuntimeType.gpu, RuntimeType.cpu_gpu]: sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR) @@ -674,7 +673,8 @@ def build_specific_lib(target_abi, target_soc, serial_num, tuning=tuning, out_of_range_check=False, phone_data_dir=PHONE_DATA_DIR, - build_type=build_type + build_type=build_type, + opencl_binary_file="", ) pull_opencl_binary_and_tuning_param(target_abi, serial_num, @@ -683,7 +683,10 @@ def build_specific_lib(target_abi, target_soc, serial_num, binary_changed = True if binary_changed: - gen_opencl_and_tuning_code(model_output_dirs) + sh_commands.merge_opencl_binaries( + model_output_dirs, CL_COMPILED_BINARY_FILE_NAME, + get_opencl_binary_output_path(library_name)) + sh_commands.gen_tuning_param_code(model_output_dirs) sh_commands.bazel_build( MACE_RUN_TARGET, abi=target_abi, @@ -919,6 +922,7 @@ def run_specific_target(flags, configs, target_abi, gpu_priority_hint=flags.gpu_priority_hint, runtime_failure_ratio=flags.runtime_failure_ratio, address_sanitizer=flags.address_sanitizer, + 
opencl_binary_file=get_opencl_binary_output_path(library_name), ) if flags.validate: model_file_path, weight_file_path = get_model_files_path( @@ -1051,7 +1055,8 @@ def bm_specific_target(flags, configs, target_abi, target_soc, serial_num): omp_num_threads=flags.omp_num_threads, cpu_affinity_policy=flags.cpu_affinity_policy, gpu_perf_hint=flags.gpu_perf_hint, - gpu_priority_hint=flags.gpu_priority_hint) + gpu_priority_hint=flags.gpu_priority_hint, + opencl_binary_file=get_opencl_binary_output_path(library_name)) def benchmark_model(flags): diff --git a/tools/sh_commands.py b/tools/sh_commands.py index b39112c9..0234eff4 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -16,9 +16,11 @@ import falcon_cli import filelock import glob import logging +import numpy as np import os import re import sh +import struct import subprocess import sys import time @@ -30,7 +32,6 @@ import common sys.path.insert(0, "mace/python/tools") try: from encrypt_opencl_codegen import encrypt_opencl_codegen - from opencl_codegen import opencl_codegen from binary_codegen import tuning_param_codegen from generate_data import generate_input_data from validate import validate @@ -362,8 +363,7 @@ def gen_mace_engine_factory_source(model_tags, def pull_binaries(abi, serialno, model_output_dirs, - cl_built_kernel_file_name, - cl_platform_info_file_name): + cl_built_kernel_file_name): compiled_opencl_dir = "/data/local/tmp/mace_run/interior/" mace_run_param_file = "mace_run.config" @@ -379,26 +379,66 @@ def pull_binaries(abi, serialno, model_output_dirs, if abi != "host": adb_pull(compiled_opencl_dir + cl_built_kernel_file_name, cl_bin_dir, serialno) - adb_pull(compiled_opencl_dir + cl_platform_info_file_name, - cl_bin_dir, serialno) adb_pull("/data/local/tmp/mace_run/%s" % mace_run_param_file, cl_bin_dir, serialno) -def gen_opencl_binary_code(model_output_dirs, - cl_built_kernel_file_name, - cl_platform_info_file_name, - codegen_path="mace/codegen"): - opencl_codegen_file = 
"%s/opencl/opencl_compiled_program.cc" % codegen_path - +def merge_opencl_binaries(binaries_dirs, + cl_compiled_program_file_name, + output_file_path): + platform_info_key = 'mace_opencl_precompiled_platform_info_key' cl_bin_dirs = [] - for d in model_output_dirs: + for d in binaries_dirs: cl_bin_dirs.append(os.path.join(d, "opencl_bin")) - cl_bin_dirs_str = ",".join(cl_bin_dirs) - opencl_codegen(opencl_codegen_file, - cl_bin_dirs_str, - cl_built_kernel_file_name, - cl_platform_info_file_name) + # create opencl binary output dir + opencl_binary_dir = os.path.dirname(output_file_path) + if os.path.exists(opencl_binary_dir): + sh.rm("-rf", opencl_binary_dir) + sh.mkdir("-p", opencl_binary_dir) + kvs = {} + for binary_dir in cl_bin_dirs: + binary_path = os.path.join(binary_dir, cl_compiled_program_file_name) + if not os.path.exists(binary_path): + continue + + print 'generate opencl code from', binary_path + with open(binary_path, "rb") as f: + binary_array = np.fromfile(f, dtype=np.uint8) + + idx = 0 + size, = struct.unpack("Q", binary_array[idx:idx + 8]) + idx += 8 + for _ in xrange(size): + key_size, = struct.unpack("i", binary_array[idx:idx + 4]) + idx += 4 + key, = struct.unpack( + str(key_size) + "s", binary_array[idx:idx + key_size]) + idx += key_size + value_size, = struct.unpack("i", binary_array[idx:idx + 4]) + idx += 4 + if key == platform_info_key and key in kvs: + common.mace_check( + (kvs[key] == binary_array[idx:idx + value_size]).all(), + "", + "There exists more than one OpenCL version for models:" + " %s vs %s " % + (kvs[key], binary_array[idx:idx + value_size])) + else: + kvs[key] = binary_array[idx:idx + value_size] + idx += value_size + + output_byte_array = bytearray() + data_size = len(kvs) + output_byte_array.extend(struct.pack("Q", data_size)) + for key, value in kvs.iteritems(): + key_size = len(key) + output_byte_array.extend(struct.pack("i", key_size)) + output_byte_array.extend(struct.pack(str(key_size) + "s", key)) + value_size = 
len(value) + output_byte_array.extend(struct.pack("i", value_size)) + output_byte_array.extend(value) + + np.array(output_byte_array).tofile(output_file_path) def gen_tuning_param_code(model_output_dirs, @@ -426,12 +466,6 @@ def gen_mace_version(codegen_path="mace/codegen"): "%s/version/version.cc" % codegen_path) -def gen_compiled_opencl_source(codegen_path="mace/codegen"): - opencl_codegen_file = "%s/opencl/opencl_compiled_program.cc" % codegen_path - sh.mkdir("-p", "%s/opencl" % codegen_path) - opencl_codegen(opencl_codegen_file) - - def gen_model_code(model_codegen_dir, platform, model_file_path, @@ -576,6 +610,7 @@ def tuning_run(abi, out_of_range_check, phone_data_dir, build_type, + opencl_binary_file, omp_num_threads=-1, cpu_affinity_policy=1, gpu_perf_hint=3, @@ -641,6 +676,10 @@ def tuning_run(abi, adb_push("%s/%s.data" % (mace_model_dir, model_tag), phone_data_dir, serialno) + if device_type == common.DeviceType.GPU\ + and os.path.exists(opencl_binary_file): + adb_push(opencl_binary_file, phone_data_dir, serialno) + adb_push("third_party/nnlib/libhexagon_controller.so", phone_data_dir, serialno) @@ -689,6 +728,8 @@ def tuning_run(abi, "--gpu_perf_hint=%s" % gpu_perf_hint, "--gpu_priority_hint=%s" % gpu_priority_hint, "--model_file=%s" % mace_model_phone_path, + "--opencl_binary_file=%s/%s" % + (phone_data_dir, os.path.basename(opencl_binary_file)), ]) adb_cmd = ' '.join(adb_cmd) sh.adb( @@ -1005,6 +1046,7 @@ def benchmark_model(abi, device_type, phone_data_dir, build_type, + opencl_binary_file, omp_num_threads=-1, cpu_affinity_policy=1, gpu_perf_hint=3, @@ -1049,6 +1091,9 @@ def benchmark_model(abi, if not embed_model_data: adb_push("%s/%s.data" % (mace_model_dir, model_tag), phone_data_dir, serialno) + if device_type == common.DeviceType.GPU \ + and os.path.exists(opencl_binary_file): + adb_push(opencl_binary_file, phone_data_dir, serialno) mace_model_phone_path = "" if build_type == BuildType.proto: mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, 
model_tag) @@ -1082,6 +1127,8 @@ def benchmark_model(abi, "--gpu_perf_hint=%s" % gpu_perf_hint, "--gpu_priority_hint=%s" % gpu_priority_hint, "--model_file=%s" % mace_model_phone_path, + "--opencl_binary_file=%s/%s" % + (phone_data_dir, os.path.basename(opencl_binary_file)), _fg=True) print("Benchmark done!\n") -- GitLab