Commit 39fb3055 authored by liuqi

Refactor opencl binary load logic: use file instead of code.

Parent d4124708
......@@ -365,6 +365,16 @@ The followings list the details.
``.pb`` file will be generated only when build_type is ``proto``.
**OpenCL compiled kernel binary file**
* ``opencl/compiled_opencl_kernel.bin``
.. note::
This file will be generated only when ``target_soc`` is specified and the runtime is ``gpu``.
.. warning::
This file depends on the OpenCL driver on the phone; you should update it whenever the OpenCL driver changes.
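At runtime this file is consumed through ``mace::SetOpenCLBinaryPaths`` before the engine is created (see the usage steps in the next section). A minimal sketch, with an illustrative on-device path::

    // Point this at wherever your app stores the compiled binary.
    std::vector<std::string> opencl_binary_paths =
        {"/data/local/tmp/mace_run/compiled_opencl_kernel.bin"};
    mace::SetOpenCLBinaryPaths(opencl_binary_paths);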
=============
5. how to use
......@@ -385,14 +395,21 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li
new FileStorageFactory(file_path));
ConfigKVStorageFactory(storage_factory);
//1. Declare the device type(must be same with ``runtime`` in configuration file)
// 1. Set precompiled OpenCL binary file paths if you use the GPU of a specific SoC.
//    Note that the binary depends on the OpenCL driver of the SoC;
//    if the OpenCL driver changes, you should recompile the binary file.
if (device_type == DeviceType::GPU) {
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
}
// 2. Declare the device type(must be same with ``runtime`` in configuration file)
DeviceType device_type = DeviceType::GPU;
//2. Define the input and output tensor names.
// 3. Define the input and output tensor names.
std::vector<std::string> input_names = {...};
std::vector<std::string> output_names = {...};
//3. Create MaceEngine object
// 4. Create MaceEngine object
std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status;
// Create Engine from code
......@@ -415,7 +432,7 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li
// do something
}
//4. Create Input and Output objects
// 5. Create Input and Output objects
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
for (size_t i = 0; i < input_count; ++i) {
......@@ -440,6 +457,6 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
}
//5. Run the model
// 6. Run the model
MaceStatus status = engine.Run(inputs, &outputs);
......@@ -390,6 +390,13 @@ MACE currently provides only static libraries; there are the following two usage scenarios.
new FileStorageFactory(file_path));
ConfigKVStorageFactory(storage_factory);
// 2. If you use the GPU of a specific SoC, you can set the path of the precompiled OpenCL binary file.
// * The binary depends on the OpenCL driver on the phone; if the OpenCL driver changes,
//   you need to recompile and update the binary file.
if (device_type == DeviceType::GPU) {
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
}
// 1. Declare the device type (must be consistent with the runtime specified at build time)
DeviceType device_type = DeviceType::GPU;
......
......@@ -188,6 +188,9 @@ DEFINE_string(input_file, "", "input file name");
DEFINE_int32(max_num_runs, 100, "number of runs max");
DEFINE_string(max_time, "10.0", "length to run max");
DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
DEFINE_string(opencl_binary_file,
"",
"compiled opencl binary file path");
DEFINE_string(model_data_file, "",
"model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(model_file, "",
......@@ -270,6 +273,11 @@ int Main(int argc, char **argv) {
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
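// If the given binary file does not exist, MACE only logs a warning and
// falls back to building the OpenCL kernels from source.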
if (device_type == DeviceType::GPU) {
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
}
// Create Engine
std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status;
......
......@@ -14,6 +14,8 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include <sys/stat.h>
#include <cstdlib>
#include <fstream>
#include <memory>
......@@ -31,9 +33,6 @@
namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kCompiledProgramMap;
extern const std::string kCompiledProgramPlatform;
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
......@@ -43,6 +42,12 @@ void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) {
OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
}
// Set the compiled OpenCL binary file paths; call this only once. (Not thread-safe)
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths) {
OpenCLRuntime::ConfigureOpenCLBinaryPath(paths);
}
const std::string OpenCLErrorToString(cl_int error) {
switch (error) {
case CL_SUCCESS:
......@@ -237,6 +242,25 @@ GPUType ParseGPUType(const std::string &device_name) {
return GPUType::UNKNOWN;
}
}
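// Returns the first entry in |paths| that refers to an existing regular file,
// or an empty string if none does.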
std::string FindFirstExistPath(const std::vector<std::string> &paths) {
std::string result;
struct stat st;
for (const auto &path : paths) {
if (stat(path.c_str(), &st) == 0) {
if (S_ISREG(st.st_mode)) {
result = path;
break;
}
}
}
return result;
}
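// Storage key of the OpenCL platform info string, and the default file name
// of the on-device compiled-program cache.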
const char *kOpenCLPlatformInfoKey =
"mace_opencl_precompiled_platform_info_key";
const char *kPrecompiledProgramFileName =
"mace_cl_compiled_program.bin";
} // namespace
void OpenCLProfilingTimer::StartTiming() {}
......@@ -267,6 +291,8 @@ void OpenCLProfilingTimer::ClearTiming() {
GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
GPUPriorityHint::PRIORITY_DEFAULT;
std::string
OpenCLRuntime::kPrecompiledBinaryPath = ""; // NOLINT(runtime/string)
OpenCLRuntime *OpenCLRuntime::Global() {
static OpenCLRuntime runtime;
......@@ -279,9 +305,19 @@ void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
}
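// Select the first existing file among |paths| as the precompiled OpenCL binary.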
void OpenCLRuntime::ConfigureOpenCLBinaryPath(
const std::vector<std::string> &paths) {
OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
LOG(WARNING) << "There is no precompiled OpenCL binary file in "
<< MakeString(paths);
}
}
OpenCLRuntime::OpenCLRuntime():
storage_(nullptr), is_profiling_enabled_(false) {
precompiled_binary_storage_(nullptr),
cache_storage_(nullptr),
is_profiling_enabled_(false) {
LoadOpenCLLibrary();
std::vector<cl::Platform> all_platforms;
......@@ -369,12 +405,38 @@ OpenCLRuntime::OpenCLRuntime():
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
if (kStorageFactory != nullptr) {
const std::string cl_compiled_file_name = "mace_cl_compiled_program.bin";
storage_ = kStorageFactory->CreateStorage(cl_compiled_file_name);
cache_storage_ =
kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
if (cache_storage_->Load() != 0) {
LOG(FATAL) << "Load OpenCL cached compiled kernel file failed";
}
auto platform_info_array =
this->cache_storage_->Find(kOpenCLPlatformInfoKey);
if (platform_info_array != nullptr) {
cached_binary_platform_info_ =
std::string(platform_info_array->begin(),
platform_info_array->end());
}
}
if (platform_info_ != kCompiledProgramPlatform) {
if (storage_->Load() != 0) {
LOG(FATAL) << "Load opencl compiled kernel file failed";
if (cached_binary_platform_info_ != platform_info_) {
if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
LOG(WARNING) << "There is no precompiled OpenCL binary in"
" all OpenCL binary paths";
} else {
precompiled_binary_storage_.reset(
new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
if (precompiled_binary_storage_->Load() != 0) {
LOG(FATAL) << "Load OpenCL precompiled kernel file failed";
}
auto platform_info_array =
this->precompiled_binary_storage_->Find(kOpenCLPlatformInfoKey);
if (platform_info_array != nullptr) {
precompiled_binary_platform_info_ =
std::string(platform_info_array->begin(),
platform_info_array->end());
}
}
}
......@@ -416,16 +478,23 @@ uint32_t OpenCLRuntime::device_compute_units() const {
return device_compute_units_;
}
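// Build the program from the on-device compiled-program cache written by a previous run.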
bool OpenCLRuntime::BuildProgramFromBinary(
bool OpenCLRuntime::BuildProgramFromCache(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from binary
if (kCompiledProgramPlatform != platform_info_) return false;
auto it_binary = kCompiledProgramMap.find(built_program_key);
if (it_binary == kCompiledProgramMap.end()) return false;
if (this->cache_storage_ == nullptr) return false;
if (cached_binary_platform_info_ != platform_info_) {
VLOG(3) << "cached OpenCL binary version is not same"
" with current version";
return false;
}
auto content = this->cache_storage_->Find(built_program_key);
if (content == nullptr) {
return false;
}
*program = cl::Program(context(), {device()}, {it_binary->second});
*program = cl::Program(context(), {device()}, {*content});
cl_int ret = program->build({device()}, build_options_str.c_str());
if (ret != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
......@@ -435,25 +504,27 @@ bool OpenCLRuntime::BuildProgramFromBinary(
LOG(INFO) << "Program build log: " << build_log;
}
LOG(WARNING) << "Build program "
<< built_program_key << " from Binary failed:"
<< (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible "
"cause 1: the MACE library is built from SoC 1 but is "
"used on different SoC 2, possible cause 2: the MACE "
"buffer is corrupted make sure your code has no "
"out-of-range memory writing" : MakeString(ret));
<< built_program_key << " from Cache failed:"
<< MakeString(ret);
return false;
}
VLOG(3) << "Program from Binary: " << built_program_key;
VLOG(3) << "Program from Cache: " << built_program_key;
return true;
}
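// Build the program from the precompiled binary shipped with the application,
// i.e. the file selected by SetOpenCLBinaryPaths.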
bool OpenCLRuntime::BuildProgramFromCache(
bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from binary
if (this->storage_ == nullptr) return false;
auto content = this->storage_->Find(built_program_key);
if (this->precompiled_binary_storage_ == nullptr) return false;
if (precompiled_binary_platform_info_ != platform_info_) {
VLOG(3) << "precompiled OpenCL binary version "
<< precompiled_binary_platform_info_
<< " is not same with current version";
return false;
}
auto content = this->precompiled_binary_storage_->Find(built_program_key);
if (content == nullptr) {
return false;
}
......@@ -468,11 +539,11 @@ bool OpenCLRuntime::BuildProgramFromCache(
LOG(INFO) << "Program build log: " << build_log;
}
LOG(WARNING) << "Build program "
<< built_program_key << " from Cache failed:"
<< built_program_key << " from precompiled binary failed:"
<< MakeString(ret);
return false;
}
VLOG(3) << "Program from Cache: " << built_program_key;
VLOG(3) << "Program from precompiled binary: " << built_program_key;
return true;
}
......@@ -527,8 +598,8 @@ void OpenCLRuntime::BuildProgramFromSource(
reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
program_binary_sizes[0]);
if (this->storage_ != nullptr) {
this->storage_->Insert(built_program_key, content);
if (this->cache_storage_ != nullptr) {
this->cache_storage_->Insert(built_program_key, content);
}
VLOG(3) << "Program from source: " << built_program_key;
......@@ -543,13 +614,12 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
std::string build_options_str =
build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math";
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
bool ret = BuildProgramFromBinary(built_program_key,
// Build flow: cache -> precompiled binary -> source
bool ret = BuildProgramFromCache(built_program_key,
build_options_str, program);
if (!ret) {
ret = BuildProgramFromCache(built_program_key,
ret = BuildProgramFromPrecompiledBinary(built_program_key,
build_options_str, program);
// Fallback to source.
if (!ret) {
BuildProgramFromSource(program_name, built_program_key,
build_options_str, program);
......@@ -581,8 +651,12 @@ cl::Kernel OpenCLRuntime::BuildKernel(
}
void OpenCLRuntime::SaveBuiltCLProgram() {
if (storage_ != nullptr) {
if (storage_->Flush() != 0) {
if (cache_storage_ != nullptr) {
// update platform info
cache_storage_->Insert(kOpenCLPlatformInfoKey,
std::vector<unsigned char>(platform_info_.begin(),
platform_info_.end()));
if (cache_storage_->Flush() != 0) {
LOG(FATAL) << "Store OPENCL compiled kernel to file failed."
" Please Make sure the storage directory exist.";
}
......
......@@ -66,6 +66,7 @@ class OpenCLRuntime {
public:
static OpenCLRuntime *Global();
static void Configure(GPUPerfHint, GPUPriorityHint);
static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths);
cl::Context &context();
cl::Device &device();
......@@ -99,11 +100,11 @@ class OpenCLRuntime {
const std::string &binary_file_name,
const std::string &build_options,
cl::Program *program);
bool BuildProgramFromBinary(
bool BuildProgramFromCache(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
bool BuildProgramFromCache(
bool BuildProgramFromPrecompiledBinary(
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
......@@ -115,7 +116,8 @@ class OpenCLRuntime {
const std::string ParseDeviceVersion(const std::string &device_version);
private:
std::unique_ptr<KVStorage> storage_;
std::unique_ptr<KVStorage> precompiled_binary_storage_;
std::unique_ptr<KVStorage> cache_storage_;
bool is_profiling_enabled_;
// All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library.
......@@ -126,6 +128,8 @@ class OpenCLRuntime {
std::mutex program_build_mutex_;
std::string platform_info_;
std::string opencl_version_;
std::string precompiled_binary_platform_info_;
std::string cached_binary_platform_info_;
bool out_of_range_check_;
uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_;
......@@ -133,6 +137,7 @@ class OpenCLRuntime {
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
static std::string kPrecompiledBinaryPath;
};
} // namespace mace
......
......@@ -123,6 +123,9 @@ DEFINE_string(model_data_file,
DEFINE_string(model_file,
"",
"model file name, used when load mace model in pb");
DEFINE_string(opencl_binary_file,
"",
"compiled opencl binary file path");
DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
......@@ -151,6 +154,10 @@ bool RunModel(const std::vector<std::string> &input_names,
}
#endif // MACE_ENABLE_OPENCL
if (device_type == DeviceType::GPU) {
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
}
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
// Just call once
......
......@@ -76,9 +76,16 @@ class FileStorageFactory : public KVStorageFactory {
std::unique_ptr<Impl> impl_;
};
// Set KV store factory used as OpenCL cache.
// Set KV store factory used as OpenCL cache. (Call Once)
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
// Call this only once. (Not thread-safe)
// Set the paths of the compiled OpenCL binary file if you use the GPU of a specific SoC.
// Using the OpenCL binary speeds up initialization.
// The OpenCL binary corresponds to the OpenCL driver version;
// you should update the binary whenever the OpenCL driver changes.
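//
// Example (the binary path is illustrative):
//   std::vector<std::string> paths = {"/data/local/tmp/mace_run/compiled_opencl_kernel.bin"};
//   mace::SetOpenCLBinaryPaths(paths);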
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
// Set GPU hints, currently only supports Adreno GPU.
//
// Caution: this function may hurt performance if improper parameters provided.
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import struct
import numpy as np
import jinja2
# python mace/python/tools/opencl_codegen.py \
# --cl_binary_dirs=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH}
FLAGS = None
def generate_cpp_source(cl_binary_dirs,
built_kernel_file_name,
platform_info_file_name):
maps = {}
platform_info = ''
binary_dirs = cl_binary_dirs.strip().split(",")
for binary_dir in binary_dirs:
binary_path = os.path.join(binary_dir, built_kernel_file_name)
if not os.path.exists(binary_path):
continue
print 'generate opencl code from', binary_path
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
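# Each binary file is a flat key/value dump: a uint64 entry count, then for
# each entry an int32 key size, the key bytes, an int32 value size and the value bytes.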
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx + 8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
key, = struct.unpack(
str(key_size) + "s", binary_array[idx:idx + key_size])
idx += key_size
value_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
maps[key] = []
value = struct.unpack(
str(value_size) + "B", binary_array[idx:idx + value_size])
idx += value_size
for ele in value:
maps[key].append(hex(ele))
cl_platform_info_path = os.path.join(binary_dir,
platform_info_file_name)
with open(cl_platform_info_path, 'r') as f:
curr_platform_info = f.read()
if platform_info != "":
assert (curr_platform_info == platform_info)
platform_info = curr_platform_info
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
maps=maps,
data_type='unsigned char',
variable_name='kCompiledProgramMap',
platform_info=platform_info,
)
def opencl_codegen(output_path,
cl_binary_dirs="",
built_kernel_file_name="",
platform_info_file_name=""):
cpp_cl_binary_source = generate_cpp_source(cl_binary_dirs,
built_kernel_file_name,
platform_info_file_name)
if os.path.isfile(output_path):
os.remove(output_path)
with open(output_path, "w") as w_file:
w_file.write(cpp_cl_binary_source)
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_binary_dirs",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--built_kernel_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--platform_info_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
help="The path of generated C++ header file for cl binaries.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
opencl_codegen(FLAGS.output_path,
FLAGS.cl_binary_dirs,
FLAGS.built_kernel_file_name,
FLAGS.platform_info_file_name)
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#include <map>
#include <string>
#include <vector>
namespace mace {
extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
{
{% for key, value in maps.iteritems() %}
{
"{{key}}",
{
{%- for ele in value -%}
{{ele}},
{%- endfor -%}
}
}, // {{key}}
{% endfor %}
};
extern const std::string kCompiledProgramPlatform = {{platform_info|tojson}};
} // namespace mace
......@@ -13,6 +13,5 @@ cc_binary(
"//external:gflags_nothreads",
"//mace/codegen:generated_mace_engine_factory",
"//mace/codegen:generated_models",
"//mace/core",
],
)
......@@ -38,9 +38,6 @@
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL
#include "mace/codegen/engine/mace_engine_factory.h"
namespace mace {
......@@ -100,22 +97,6 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
#ifdef MACE_ENABLE_OPENCL
void WriteOpenCLPlatformInfo(const std::string &output_dir) {
std::string platform_info = OpenCLRuntime::Global()->platform_info();
const std::string cl_platform_info_file_name = output_dir
+ "/mace_cl_platform_info.txt";
std::ofstream ofs(cl_platform_info_file_name);
if (ofs.is_open()) {
ofs << platform_info;
ofs.close();
} else {
LOG(WARNING) << "Write opencl platform info failed.";
}
}
#endif // MACE_ENABLE_OPENCL
struct mallinfo LogMallinfoChange(struct mallinfo prev) {
struct mallinfo curr = mallinfo();
if (prev.arena != curr.arena) {
......@@ -187,6 +168,9 @@ DEFINE_string(input_file,
DEFINE_string(output_file,
"",
"output file name | output file prefix for multiple outputs");
DEFINE_string(opencl_binary_file,
"",
"compiled opencl binary file path");
DEFINE_string(model_data_file,
"",
"model data file name, used when EMBED_MODEL_DATA set to 0 or 2");
......@@ -230,6 +214,11 @@ bool RunModel(const std::string &model_name,
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
if (device_type == DeviceType::GPU) {
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
}
std::vector<unsigned char> model_pb_data;
if (FLAGS_model_file != "") {
if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) {
......@@ -397,11 +386,6 @@ bool RunModel(const std::string &model_name,
printf("time %11.3f %11.3f %11.3f\n",
init_millis, warmup_millis, model_run_millis);
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::GPU) {
WriteOpenCLPlatformInfo(kernel_file_path);
}
#endif // MACE_ENABLE_OPENCL
for (size_t i = 0; i < output_count; ++i) {
std::string output_name =
......
......@@ -132,7 +132,6 @@ def main(unused_args):
# generate sources
sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_compiled_opencl_source()
sh_commands.gen_mace_version()
sh_commands.gen_tuning_param_code([])
......
......@@ -108,6 +108,15 @@ class StringFormatter:
return star_line + str(message).center(line_length) + '\n' + star_line
################################
# definitions
################################
class DeviceType(object):
CPU = 'CPU'
GPU = 'GPU'
HEXAGON = 'HEXAGON'
################################
# Argument types
################################
......
......@@ -29,6 +29,7 @@ import sh_commands
from sh_commands import BuildType
from common import CaffeEnvType
from common import DeviceType
from common import mace_check
from common import MaceLogger
from common import StringFormatter
......@@ -37,13 +38,14 @@ from common import StringFormatter
# common definitions
################################
BUILD_OUTPUT_DIR = 'build'
PHONE_DATA_DIR = "/data/local/tmp/mace_run/"
PHONE_DATA_DIR = "/data/local/tmp/mace_run"
MODEL_OUTPUT_DIR_NAME = 'model'
BUILD_TMP_DIR_NAME = '_tmp'
BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general'
OUTPUT_LIBRARY_DIR_NAME = 'library'
CL_BUILT_KERNEL_FILE_NAME = "mace_cl_compiled_program.bin"
CL_PLATFORM_INFO_FILE_NAME = "mace_cl_platform_info.txt"
OUTPUT_OPENCL_BINARY_DIR_NAME = 'opencl'
OUTPUT_OPENCL_BINARY_FILE_NAME = 'compiled_opencl_kernel.bin'
CL_COMPILED_BINARY_FILE_NAME = "mace_cl_compiled_program.bin"
CODEGEN_BASE_DIR = 'mace/codegen'
MODEL_CODEGEN_DIR = CODEGEN_BASE_DIR + '/models'
MACE_RUN_TARGET = "//mace/tools/validation:mace_run"
......@@ -176,11 +178,11 @@ def parse_device_type(runtime):
device_type = ""
if runtime == RuntimeType.dsp:
device_type = "HEXAGON"
device_type = DeviceType.HEXAGON
elif runtime == RuntimeType.gpu:
device_type = "GPU"
device_type = DeviceType.GPU
elif runtime == RuntimeType.cpu:
device_type = "CPU"
device_type = DeviceType.CPU
return device_type
......@@ -433,6 +435,13 @@ def get_build_model_dirs(library_name, model_name, target_abi, target_soc,
return model_output_base_dir, model_output_dir, mace_model_dir
def get_opencl_binary_output_path(library_name):
return '%s/%s/%s/%s' % (BUILD_OUTPUT_DIR,
library_name,
OUTPUT_OPENCL_BINARY_DIR_NAME,
OUTPUT_OPENCL_BINARY_FILE_NAME)
################################
# build
################################
......@@ -440,17 +449,7 @@ def pull_opencl_binary_and_tuning_param(target_abi,
serialno,
model_output_dirs):
sh_commands.pull_binaries(target_abi, serialno, model_output_dirs,
CL_BUILT_KERNEL_FILE_NAME,
CL_PLATFORM_INFO_FILE_NAME)
def gen_opencl_and_tuning_code(model_output_dirs):
# generate opencl binary code
sh_commands.gen_opencl_binary_code(model_output_dirs,
CL_BUILT_KERNEL_FILE_NAME,
CL_PLATFORM_INFO_FILE_NAME)
sh_commands.gen_tuning_param_code(model_output_dirs)
CL_COMPILED_BINARY_FILE_NAME)
def print_configuration(flags, configs):
......@@ -612,7 +611,7 @@ def build_specific_lib(target_abi, target_soc, serial_num,
sh.rm("-rf", build_tmp_binary_dir)
os.makedirs(build_tmp_binary_dir)
gen_opencl_and_tuning_code([])
sh_commands.gen_tuning_param_code(model_output_dirs)
sh_commands.bazel_build(
MACE_RUN_TARGET,
abi=target_abi,
......@@ -639,7 +638,7 @@ def build_specific_lib(target_abi, target_soc, serial_num,
os.makedirs(model_output_dir)
# build for specified soc
if not address_sanitizer and tuning and target_abi != ABIType.host \
if not address_sanitizer and target_abi != ABIType.host \
and target_soc is not None and \
model_runtime in [RuntimeType.gpu, RuntimeType.cpu_gpu]:
sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR)
......@@ -674,7 +673,8 @@ def build_specific_lib(target_abi, target_soc, serial_num,
tuning=tuning,
out_of_range_check=False,
phone_data_dir=PHONE_DATA_DIR,
build_type=build_type
build_type=build_type,
opencl_binary_file="",
)
pull_opencl_binary_and_tuning_param(target_abi, serial_num,
......@@ -683,7 +683,10 @@ def build_specific_lib(target_abi, target_soc, serial_num,
binary_changed = True
if binary_changed:
gen_opencl_and_tuning_code(model_output_dirs)
sh_commands.merge_opencl_binaries(
model_output_dirs, CL_COMPILED_BINARY_FILE_NAME,
get_opencl_binary_output_path(library_name))
sh_commands.gen_tuning_param_code(model_output_dirs)
sh_commands.bazel_build(
MACE_RUN_TARGET,
abi=target_abi,
......@@ -919,6 +922,7 @@ def run_specific_target(flags, configs, target_abi,
gpu_priority_hint=flags.gpu_priority_hint,
runtime_failure_ratio=flags.runtime_failure_ratio,
address_sanitizer=flags.address_sanitizer,
opencl_binary_file=get_opencl_binary_output_path(library_name),
)
if flags.validate:
model_file_path, weight_file_path = get_model_files_path(
......@@ -1051,7 +1055,8 @@ def bm_specific_target(flags, configs, target_abi, target_soc, serial_num):
omp_num_threads=flags.omp_num_threads,
cpu_affinity_policy=flags.cpu_affinity_policy,
gpu_perf_hint=flags.gpu_perf_hint,
gpu_priority_hint=flags.gpu_priority_hint)
gpu_priority_hint=flags.gpu_priority_hint,
opencl_binary_file=get_opencl_binary_output_path(library_name))
def benchmark_model(flags):
......
......@@ -16,9 +16,11 @@ import falcon_cli
import filelock
import glob
import logging
import numpy as np
import os
import re
import sh
import struct
import subprocess
import sys
import time
......@@ -30,7 +32,6 @@ import common
sys.path.insert(0, "mace/python/tools")
try:
from encrypt_opencl_codegen import encrypt_opencl_codegen
from opencl_codegen import opencl_codegen
from binary_codegen import tuning_param_codegen
from generate_data import generate_input_data
from validate import validate
......@@ -362,8 +363,7 @@ def gen_mace_engine_factory_source(model_tags,
def pull_binaries(abi, serialno, model_output_dirs,
cl_built_kernel_file_name,
cl_platform_info_file_name):
cl_built_kernel_file_name):
compiled_opencl_dir = "/data/local/tmp/mace_run/interior/"
mace_run_param_file = "mace_run.config"
......@@ -379,26 +379,66 @@ def pull_binaries(abi, serialno, model_output_dirs,
if abi != "host":
adb_pull(compiled_opencl_dir + cl_built_kernel_file_name,
cl_bin_dir, serialno)
adb_pull(compiled_opencl_dir + cl_platform_info_file_name,
cl_bin_dir, serialno)
adb_pull("/data/local/tmp/mace_run/%s" % mace_run_param_file,
cl_bin_dir, serialno)
def gen_opencl_binary_code(model_output_dirs,
cl_built_kernel_file_name,
cl_platform_info_file_name,
codegen_path="mace/codegen"):
opencl_codegen_file = "%s/opencl/opencl_compiled_program.cc" % codegen_path
def merge_opencl_binaries(binaries_dirs,
cl_compiled_program_file_name,
output_file_path):
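# Merge the per-model compiled OpenCL program files into a single binary,
# checking that they were all generated with the same OpenCL platform info.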
platform_info_key = 'mace_opencl_precompiled_platform_info_key'
cl_bin_dirs = []
for d in model_output_dirs:
for d in binaries_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
opencl_codegen(opencl_codegen_file,
cl_bin_dirs_str,
cl_built_kernel_file_name,
cl_platform_info_file_name)
# create opencl binary output dir
opencl_binary_dir = os.path.dirname(output_file_path)
if os.path.exists(opencl_binary_dir):
sh.rm("-rf", opencl_binary_dir)
sh.mkdir("-p", opencl_binary_dir)
kvs = {}
for binary_dir in cl_bin_dirs:
binary_path = os.path.join(binary_dir, cl_compiled_program_file_name)
if not os.path.exists(binary_path):
continue
print 'merge opencl binary from', binary_path
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx + 8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
key, = struct.unpack(
str(key_size) + "s", binary_array[idx:idx + key_size])
idx += key_size
value_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
if key == platform_info_key and key in kvs:
common.mace_check(
(kvs[key] == binary_array[idx:idx + value_size]).all(),
"",
"There exists more than one OpenCL version for models:"
" %s vs %s " %
(kvs[key], binary_array[idx:idx + value_size]))
else:
kvs[key] = binary_array[idx:idx + value_size]
idx += value_size
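# Write the merged entries back to a single file using the same layout.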
output_byte_array = bytearray()
data_size = len(kvs)
output_byte_array.extend(struct.pack("Q", data_size))
for key, value in kvs.iteritems():
key_size = len(key)
output_byte_array.extend(struct.pack("i", key_size))
output_byte_array.extend(struct.pack(str(key_size) + "s", key))
value_size = len(value)
output_byte_array.extend(struct.pack("i", value_size))
output_byte_array.extend(value)
np.array(output_byte_array).tofile(output_file_path)
def gen_tuning_param_code(model_output_dirs,
......@@ -426,12 +466,6 @@ def gen_mace_version(codegen_path="mace/codegen"):
"%s/version/version.cc" % codegen_path)
def gen_compiled_opencl_source(codegen_path="mace/codegen"):
opencl_codegen_file = "%s/opencl/opencl_compiled_program.cc" % codegen_path
sh.mkdir("-p", "%s/opencl" % codegen_path)
opencl_codegen(opencl_codegen_file)
def gen_model_code(model_codegen_dir,
platform,
model_file_path,
......@@ -576,6 +610,7 @@ def tuning_run(abi,
out_of_range_check,
phone_data_dir,
build_type,
opencl_binary_file,
omp_num_threads=-1,
cpu_affinity_policy=1,
gpu_perf_hint=3,
......@@ -641,6 +676,10 @@ def tuning_run(abi,
adb_push("%s/%s.data" % (mace_model_dir, model_tag),
phone_data_dir, serialno)
if device_type == common.DeviceType.GPU\
and os.path.exists(opencl_binary_file):
adb_push(opencl_binary_file, phone_data_dir, serialno)
adb_push("third_party/nnlib/libhexagon_controller.so",
phone_data_dir, serialno)
......@@ -689,6 +728,8 @@ def tuning_run(abi,
"--gpu_perf_hint=%s" % gpu_perf_hint,
"--gpu_priority_hint=%s" % gpu_priority_hint,
"--model_file=%s" % mace_model_phone_path,
"--opencl_binary_file=%s/%s" %
(phone_data_dir, os.path.basename(opencl_binary_file)),
])
adb_cmd = ' '.join(adb_cmd)
sh.adb(
......@@ -1005,6 +1046,7 @@ def benchmark_model(abi,
device_type,
phone_data_dir,
build_type,
opencl_binary_file,
omp_num_threads=-1,
cpu_affinity_policy=1,
gpu_perf_hint=3,
......@@ -1049,6 +1091,9 @@ def benchmark_model(abi,
if not embed_model_data:
adb_push("%s/%s.data" % (mace_model_dir, model_tag),
phone_data_dir, serialno)
if device_type == common.DeviceType.GPU \
and os.path.exists(opencl_binary_file):
adb_push(opencl_binary_file, phone_data_dir, serialno)
mace_model_phone_path = ""
if build_type == BuildType.proto:
mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, model_tag)
......@@ -1082,6 +1127,8 @@ def benchmark_model(abi,
"--gpu_perf_hint=%s" % gpu_perf_hint,
"--gpu_priority_hint=%s" % gpu_priority_hint,
"--model_file=%s" % mace_model_phone_path,
"--opencl_binary_file=%s/%s" %
(phone_data_dir, os.path.basename(opencl_binary_file)),
_fg=True)
print("Benchmark done!\n")
......