提交 dbf67ad9 编写于 作者: L liuqi

Add CreateMaceEngine API and speed up build logic.

上级 5c239b6e
......@@ -8,6 +8,7 @@ mace/codegen/opencl/
mace/codegen/opencl_bin/
mace/codegen/tuning/
mace/codegen/version/
mace/codegen/engine/
build/
docs/_build/
......
......@@ -6,6 +6,7 @@ load(
"if_not_production_mode",
"if_hexagon_enabled",
"if_openmp_enabled",
"if_android",
)
licenses(["notice"]) # Apache 2.0
......@@ -26,12 +27,14 @@ cc_binary(
srcs = [
"benchmark_model.cc",
],
copts = if_android(["-DMACE_ENABLE_OPENCL"]),
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
":statistics",
"//external:gflags_nothreads",
"//mace/codegen:generated_models",
"//mace/codegen:generated_mace_engine_creator",
],
)
......
......@@ -26,20 +26,6 @@
#include "mace/utils/logging.h"
#include "mace/benchmark/statistics.h"
namespace mace {
namespace MACE_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace MACE_MODEL_TAG
} // namespace mace
namespace mace {
namespace benchmark {
namespace str_util {
......@@ -188,6 +174,7 @@ bool Run(const std::string &title,
return true;
}
DEFINE_string(model_tag, "", "model tag");
DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]");
DEFINE_string(input_node, "input_node0,input_node1",
"input nodes, separated by comma");
......@@ -198,7 +185,6 @@ DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
DEFINE_string(input_file, "", "input file name");
DEFINE_int32(max_num_runs, 100, "number of runs max");
DEFINE_string(max_time, "10.0", "length to run max");
DEFINE_string(benchmark_name, "", "benchmark name");
DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
DEFINE_string(model_data_file, "",
"model data file name, used when EMBED_MODEL_DATA set to 0");
......@@ -214,7 +200,7 @@ int Main(int argc, char **argv) {
gflags::SetUsageMessage("some usage message");
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]";
LOG(INFO) << "Model tag: [" << FLAGS_model_tag << "]";
LOG(INFO) << "Device: [" << FLAGS_device << "]";
LOG(INFO) << "gpu_perf_hint: [" << FLAGS_gpu_perf_hint << "]";
LOG(INFO) << "gpu_priority_hint: [" << FLAGS_gpu_priority_hint << "]";
......@@ -233,17 +219,6 @@ int Main(int argc, char **argv) {
std::unique_ptr<OpStat> statistician(new OpStat());
mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime
mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
std::vector<std::string> input_names =
str_util::Split(FLAGS_input_node, ',');
......@@ -265,9 +240,36 @@ int Main(int argc, char **argv) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime
mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
#endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_CL_PROGRAM_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/cl_program" : kernel_path);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
// Create Engine
std::unique_ptr<mace::MaceEngine> engine_ptr =
CreateMaceEngine(FLAGS_model_tag,
input_names,
output_names,
FLAGS_model_data_file.c_str(),
device_type);
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
......@@ -303,14 +305,6 @@ int Main(int argc, char **argv) {
buffer_out);
}
// Init model
LOG(INFO) << "Run init";
std::unique_ptr<mace::MaceEngine> engine_ptr(
new mace::MaceEngine(&net_def, device_type, input_names, output_names));
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
int64_t warmup_time_us = 0;
int64_t num_warmup_runs = 0;
if (FLAGS_warmup_runs > 0) {
......
......@@ -33,3 +33,14 @@ cc_library(
srcs = ["version/version.cc"],
linkstatic = 1,
)
cc_library(
name = "generated_mace_engine_creator",
srcs = ["engine/mace_engine_creator.cc"],
linkstatic = 1,
deps = [
":generated_models",
"//mace/public",
"//mace/utils",
],
)
......@@ -37,24 +37,6 @@
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
namespace mace {
namespace MACE_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelName();
extern const std::string ModelChecksum();
extern const std::string ModelBuildTime();
extern const std::string ModelBuildOptions();
} // namespace MACE_MODEL_TAG
} // namespace mace
namespace mace {
namespace examples {
......@@ -112,6 +94,9 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
DEFINE_string(model_tag,
"",
"model tag in yaml file");
DEFINE_string(input_node,
"input_node0,input_node1",
"input nodes, separated by comma");
......@@ -148,37 +133,38 @@ bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
// load model
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime
MaceStatus res = mace::SetOpenMPThreadPolicy(
mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
#endif // MACE_ENABLE_OPENCL
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
// Just call once
const std::string kernel_file_path =
"/data/local/tmp/mace_run/cl";
"/data/local/tmp/mace_run/cl";
// Config internal kv storage factory.
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
// Init model
mace::MaceEngine engine(&net_def, device_type, input_names,
output_names);
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
// Create Engine
std::unique_ptr<mace::MaceEngine> engine =
CreateMaceEngine(FLAGS_model_tag,
input_names,
output_names,
FLAGS_model_data_file.c_str(),
device_type);
const size_t input_count = input_names.size();
const size_t output_count = output_names.size();
......@@ -216,12 +202,12 @@ bool RunModel(const std::vector<std::string> &input_names,
}
LOG(INFO) << "Warm up run";
engine.Run(inputs, &outputs);
engine->Run(inputs, &outputs);
if (FLAGS_round > 0) {
LOG(INFO) << "Run model";
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(inputs, &outputs);
engine->Run(inputs, &outputs);
}
}
......@@ -247,10 +233,6 @@ int Main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "model name: " << mace::MACE_MODEL_TAG::ModelName();
LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
LOG(INFO) << "build time: " << mace::MACE_MODEL_TAG::ModelBuildTime();
LOG(INFO) << "build options: " << mace::MACE_MODEL_TAG::ModelBuildOptions();
LOG(INFO) << "input node: " << FLAGS_input_node;
LOG(INFO) << "input shape: " << FLAGS_input_shape;
LOG(INFO) << "output node: " << FLAGS_output_node;
......
......@@ -28,7 +28,7 @@ namespace mace {
const char *MaceVersion();
enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, AUTO = 4 };
enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };
......@@ -82,6 +82,13 @@ class MaceEngine {
MaceEngine &operator=(const MaceEngine &) = delete;
};
std::unique_ptr<MaceEngine> CreateMaceEngine(
const std::string &model_tag,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const char *model_data_file = nullptr,
const DeviceType device_type = DeviceType::AUTO);
} // namespace mace
#endif // MACE_PUBLIC_MACE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h"
namespace mace {
{% for tag in model_tags %}
namespace {{tag}} {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelName();
extern const std::string ModelChecksum();
extern const std::string ModelBuildTime();
extern const std::string ModelBuildOptions();
} // namespace {{tag}}
{% endfor %}
namespace {
std::map<std::string, int> model_tag_map {
{% for i in range(model_tags |length) %}
std::make_pair({{ model_tags[i]|tojson }}, {{ i }}),
{% endfor %}
};
} // namespace
// Creates a MaceEngine for the model identified by |model_tag|.
//
// The jinja2 loop below expands to one switch case per embedded model tag,
// dispatching to that model's generated LoadModelData/CreateNet symbols.
//
// @param model_tag       tag of the model to instantiate (must be one of the
//                        tags this file was generated with).
// @param input_nodes     names of the graph input nodes.
// @param output_nodes    names of the graph output nodes.
// @param model_data_file path of the external weight file; may be unused when
//                        the model data is embedded.
// @param device_type     device to run on (CPU/GPU/HEXAGON/...).
// @return the constructed engine; LOG(FATAL)s on an unknown tag.
std::unique_ptr<MaceEngine> CreateMaceEngine(
    const std::string &model_tag,
    const std::vector<std::string> &input_nodes,
    const std::vector<std::string> &output_nodes,
    const char *model_data_file,
    const DeviceType device_type) {
  std::unique_ptr<MaceEngine> engine;
  // Look the tag up with find() rather than operator[]: operator[] would
  // default-insert 0 for an unknown tag and silently dispatch to the first
  // model instead of reaching the fatal "no model" branch.
  const auto tag_iter = model_tag_map.find(model_tag);
  if (tag_iter == model_tag_map.end()) {
    LOG(FATAL) << "There is no model named " << model_tag;
    return engine;
  }
  const unsigned char *model_data = nullptr;
  NetDef net_def;
  switch (tag_iter->second) {
{% for i in range(model_tags |length) %}
    case {{ i }}:
      model_data =
          mace::{{model_tags[i]}}::LoadModelData(model_data_file);
      net_def = mace::{{model_tags[i]}}::CreateNet(model_data);
      engine.reset(
          new mace::MaceEngine(&net_def, device_type, input_nodes,
                               output_nodes));
      // GPU/HEXAGON engines copy the weights during construction, so the
      // loaded model data can be released right away; CPU keeps referencing
      // it for the lifetime of the engine.
      if (device_type == DeviceType::GPU ||
          device_type == DeviceType::HEXAGON) {
        mace::{{model_tags[i]}}::UnloadModelData(model_data);
      }
      break;
{% endfor %}
    default:
      LOG(FATAL) << "There is no model named " << model_tag;
  }
  return engine;
}
} // namespace mace
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from jinja2 import Environment, FileSystemLoader
FLAGS = None
def gen_mace_engine_creator(model_tags, template_dir, output_dir):
    """Render the CreateMaceEngine dispatcher source from its jinja2 template.

    Args:
        model_tags: list of model tag strings, one per embedded model.
        template_dir: directory containing mace_engine_creator.jinja2.
        output_dir: directory the generated mace_engine_creator.cc is
            written into.
    """
    # trim_blocks keeps the generated C++ free of the blank lines that
    # {% ... %} template tags would otherwise leave behind.
    j2_env = Environment(
        loader=FileSystemLoader(template_dir), trim_blocks=True)
    # NOTE: the stray debug statement `print model_tags` was removed; it was
    # Python-2-only syntax and polluted the build output.
    template_name = 'mace_engine_creator.jinja2'
    source = j2_env.get_template(template_name).render(
        model_tags=model_tags,
    )
    with open(output_dir + '/mace_engine_creator.cc', "wb") as f:
        f.write(source)
def parse_args():
    """Parse command line arguments.

    Returns a (known_args, unknown_args) pair from parse_known_args().
    """
    arg_parser = argparse.ArgumentParser()
    # All three flags are plain strings defaulting to empty.
    for flag_name, flag_help in (("--model_tag", "model tag"),
                                 ("--template_dir", "template path"),
                                 ("--output_dir", "template path")):
        arg_parser.add_argument(
            flag_name, type=str, default="", help=flag_help)
    return arg_parser.parse_known_args()
if __name__ == '__main__':
    # Parse flags into the module-level FLAGS, then generate the source.
    FLAGS, _unused_args = parse_args()
    gen_mace_engine_creator(FLAGS.model_tag,
                            FLAGS.template_dir,
                            FLAGS.output_dir)
......@@ -10,6 +10,7 @@ cc_binary(
deps = [
"//external:gflags_nothreads",
"//mace/codegen:generated_models",
"//mace/codegen:generated_mace_engine_creator",
"//mace/core:core",
],
)
......@@ -42,24 +42,6 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL
// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
namespace mace {
namespace MACE_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelName();
extern const std::string ModelChecksum();
extern const std::string ModelBuildTime();
extern const std::string ModelBuildOptions();
} // namespace MACE_MODEL_TAG
} // namespace mace
namespace mace {
namespace tools {
namespace validation {
......@@ -180,6 +162,9 @@ struct mallinfo LogMallinfoChange(struct mallinfo prev) {
return curr;
}
DEFINE_string(model_tag,
"",
"model tag in yaml");
DEFINE_string(input_node,
"input_node0,input_node1",
"input nodes, separated by comma");
......@@ -211,22 +196,12 @@ DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
bool RunModel(const std::vector<std::string> &input_names,
bool RunModel(const std::string &model_tag,
const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
// load model
int64_t t0 = NowMicros();
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
int64_t t1 = NowMicros();
double create_net_millis = (t1 - t0) / 1000.0;
LOG(INFO) << "CreateNetDef latency: " << create_net_millis << " ms";
DeviceType device_type = ParseDeviceType(FLAGS_device);
LOG(INFO) << "Runing with device type: " << device_type;
// config runtime
mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
......@@ -244,20 +219,20 @@ bool RunModel(const std::vector<std::string> &input_names,
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/cl_program" : kernel_path);
// Init model
LOG(INFO) << "Run init";
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
int64_t t2 = NowMicros();
double mace_engine_ctor_millis = (t2 - t1) / 1000.0;
double init_millis = (t2 - t0) / 1000.0;
LOG(INFO) << "MaceEngine constructor latency: "
<< mace_engine_ctor_millis << " ms";
// Create Engine
int64_t t0 = NowMicros();
std::unique_ptr<mace::MaceEngine> engine =
CreateMaceEngine(model_tag,
input_names,
output_names,
FLAGS_model_data_file.c_str(),
device_type);
int64_t t1 = NowMicros();
double init_millis = (t1 - t0) / 1000.0;
LOG(INFO) << "Total init latency: " << init_millis << " ms";
const size_t input_count = input_names.size();
......@@ -297,7 +272,7 @@ bool RunModel(const std::vector<std::string> &input_names,
LOG(INFO) << "Warm up run";
int64_t t3 = NowMicros();
engine.Run(inputs, &outputs);
engine->Run(inputs, &outputs);
int64_t t4 = NowMicros();
double warmup_millis = (t4 - t3) / 1000.0;
LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms";
......@@ -308,7 +283,7 @@ bool RunModel(const std::vector<std::string> &input_names,
int64_t t0 = NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(inputs, &outputs);
engine->Run(inputs, &outputs);
if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
LOG(INFO) << "=== check malloc info change #" << i << " ===";
prev = LogMallinfoChange(prev);
......@@ -320,11 +295,11 @@ bool RunModel(const std::vector<std::string> &input_names,
}
// Metrics reporting tools depends on the format, keep in consistent
printf("================================================================\n");
printf(" create_net engine_ctor init warmup run_avg\n");
printf("================================================================\n");
printf("time %11.3f %11.3f %11.3f %11.3f %11.3f\n", create_net_millis,
mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);
printf("========================================\n");
printf(" init warmup run_avg\n");
printf("========================================\n");
printf("time %11.3f %11.3f %11.3f\n",
init_millis, warmup_millis, model_run_millis);
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::GPU) {
......@@ -356,10 +331,6 @@ int Main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "model name: " << mace::MACE_MODEL_TAG::ModelName();
LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
LOG(INFO) << "build time: " << mace::MACE_MODEL_TAG::ModelBuildTime();
LOG(INFO) << "build options: " << mace::MACE_MODEL_TAG::ModelBuildOptions();
LOG(INFO) << "input node: " << FLAGS_input_node;
LOG(INFO) << "input shape: " << FLAGS_input_shape;
LOG(INFO) << "output node: " << FLAGS_output_node;
......@@ -399,7 +370,8 @@ int Main(int argc, char **argv) {
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
ret =
RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
RunModel(FLAGS_model_tag, input_names, input_shape_vec,
output_names, output_shape_vec);
}
if (ret) {
return 0;
......
......@@ -95,13 +95,17 @@ def gen_opencl_and_tuning_code(target_abi,
serialno,
model_output_dirs,
pull_or_not):
cl_built_kernel_file_name = "mace_cl_compiled_program.bin"
cl_platform_info_file_name = "mace_cl_platform_info.txt"
if pull_or_not:
sh_commands.pull_binaries(target_abi, serialno, model_output_dirs)
codegen_path = "mace/codegen"
sh_commands.pull_binaries(target_abi, serialno, model_output_dirs,
cl_built_kernel_file_name,
cl_platform_info_file_name)
# generate opencl binary code
sh_commands.gen_opencl_binary_code(model_output_dirs)
sh_commands.gen_opencl_binary_code(model_output_dirs,
cl_built_kernel_file_name,
cl_platform_info_file_name)
sh_commands.gen_tuning_param_code(model_output_dirs)
......@@ -227,12 +231,11 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
sh_commands.bazel_build(
mace_run_target,
abi=target_abi,
model_tag=model_name,
production_mode=False,
hexagon_mode=hexagon_mode,
enable_openmp=enable_openmp
)
sh_commands.update_mace_run_lib(model_output_dir, target_abi,
sh_commands.update_mace_run_lib(model_output_dir,
model_name, embed_model_data)
tuning_run(runtime, target_abi, serialno, vlog_level, embed_model_data,
......@@ -254,13 +257,12 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
mace_run_target,
strip,
abi=target_abi,
model_tag=model_name,
production_mode=True,
hexagon_mode=hexagon_mode,
debug=debug,
enable_openmp=enable_openmp
)
sh_commands.update_mace_run_lib(model_output_dir, target_abi,
sh_commands.update_mace_run_lib(model_output_dir,
model_name, embed_model_data)
else:
gen_opencl_and_tuning_code(target_abi, serialno, [], False)
......@@ -268,13 +270,12 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
mace_run_target,
strip,
abi=target_abi,
model_tag=model_name,
production_mode=True,
hexagon_mode=hexagon_mode,
debug=debug,
enable_openmp=enable_openmp
)
sh_commands.update_mace_run_lib(model_output_dir, target_abi,
sh_commands.update_mace_run_lib(model_output_dir,
model_name, embed_model_data)
......@@ -525,6 +526,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
target_abi, phone_data_dir, target_soc="", serialno=""):
hexagon_mode = get_hexagon_mode(configs)
model_output_dirs = []
for model_name in configs["models"]:
print '===================', model_name, '==================='
model_config = configs["models"][model_name]
......@@ -534,16 +536,16 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
model_output_base_dir = "%s/%s/%s/%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest)
if target_abi == "host":
model_output_dir = "%s/%s/%s/%s/%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest, target_abi)
model_output_dir = "%s/%s" % (model_output_base_dir, target_abi)
else:
device_name = sh_commands.adb_get_device_name_by_serialno(serialno)
model_output_dir = "%s/%s/%s/%s/%s/%s_%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest, device_name.replace(' ', ''),
model_output_dir = "%s/%s_%s/%s" % (
model_output_base_dir, device_name.replace(' ', ''),
target_soc, target_abi)
model_output_dirs.append(model_output_dir)
......@@ -552,16 +554,14 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
sh.rm("-rf", model_output_dir)
os.makedirs(model_output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "benchmark" or \
FLAGS.mode == "all":
sh_commands.clear_mace_run_data(
target_abi, serialno, phone_data_dir)
model_file_path, weight_file_path = get_model_files(
model_config["model_file_path"],
model_output_dir,
model_output_base_dir,
model_config["weight_file_path"])
sh_commands.clear_phone_data_dir(
target_abi, serialno, phone_data_dir)
if FLAGS.mode == "build" or FLAGS.mode == "run" or \
FLAGS.mode == "validate" or \
FLAGS.mode == "benchmark" or FLAGS.mode == "all":
......@@ -570,25 +570,6 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_config["input_shapes"],
input_file_list)
if FLAGS.mode == "build" or FLAGS.mode == "benchmark" or \
FLAGS.mode == "all":
sh_commands.gen_model_code(
"mace/codegen/models/%s" % model_name,
model_config["platform"],
model_file_path,
weight_file_path,
model_config["model_sha256_checksum"],
",".join(model_config["input_nodes"]),
",".join(model_config["output_nodes"]),
data_type,
model_config["runtime"],
model_name,
":".join(model_config["input_shapes"]),
model_config["dsp_mode"],
embed_model_data,
model_config["fast_conv"],
model_config["obfuscate"])
if FLAGS.mode == "build" or FLAGS.mode == "all":
build_mace_run_prod(hexagon_mode,
model_config["runtime"],
......@@ -609,9 +590,14 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_config["limit_opencl_kernel_time"],
phone_data_dir,
FLAGS.enable_openmp)
sh_commands.build_benchmark_model(target_abi,
embed_model_data,
model_output_dir,
model_name,
hexagon_mode)
if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
FLAGS.mode == "all":
FLAGS.mode == "all":
tuning_run(model_config["runtime"],
target_abi,
serialno,
......@@ -647,7 +633,6 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_config["output_shapes"],
model_name,
device_type,
hexagon_mode,
phone_data_dir,
FLAGS.omp_num_threads,
FLAGS.cpu_affinity_policy,
......@@ -738,12 +723,56 @@ def main(unused_args):
# generate source
sh_commands.gen_mace_version()
sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_mace_engine_creator_source(configs['models'].keys())
embed_model_data = configs["embed_model_data"]
target_socs = get_target_socs(configs)
embed_model_data = configs["embed_model_data"]
vlog_level = FLAGS.vlog_level
phone_data_dir = "/data/local/tmp/mace_run/"
if FLAGS.mode == "build" or FLAGS.mode == "all":
print '* Model Convert'
sh_commands.clear_model_codegen()
for model_name in configs["models"]:
print '===================', model_name, '==================='
model_config = configs["models"][model_name]
data_type, device_type = get_data_and_device_type(
model_config["runtime"])
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
model_output_base_dir = "%s/%s/%s/%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest)
if os.path.exists(model_output_base_dir):
sh.rm("-rf", model_output_base_dir)
os.makedirs(model_output_base_dir)
model_file_path, weight_file_path = get_model_files(
model_config["model_file_path"],
model_output_base_dir,
model_config["weight_file_path"])
sh_commands.gen_model_code(
"mace/codegen/models/%s" % model_name,
model_config["platform"],
model_file_path,
weight_file_path,
model_config["model_sha256_checksum"],
",".join(model_config["input_nodes"]),
",".join(model_config["output_nodes"]),
data_type,
model_config["runtime"],
model_name,
":".join(model_config["input_shapes"]),
model_config["dsp_mode"],
embed_model_data,
model_config["fast_conv"],
model_config["obfuscate"])
for target_abi in configs["target_abis"]:
for target_soc in target_socs:
if target_abi != 'host':
......
......@@ -33,6 +33,7 @@ try:
from binary_codegen import tuning_param_codegen
from generate_data import generate_input_data
from validate import validate
from mace_engine_generator import gen_mace_engine_creator
except Exception as e:
print("Import error:\n%s" % e)
exit(1)
......@@ -74,15 +75,15 @@ def is_device_locked(serialno):
################################
# clear data
################################
def clear_mace_run_data(abi,
serialno,
phone_data_dir,
model_codegen_dir="mace/codegen/models"):
def clear_phone_data_dir(abi, serialno, phone_data_dir):
    """Remove the working data directory on the target phone.

    No-op for host builds, which have no attached device to clean.
    """
    if abi == "host":
        return
    sh.adb("-s",
           serialno,
           "shell",
           "rm -rf %s" % phone_data_dir)
def clear_model_codegen(model_codegen_dir="mace/codegen/models"):
    """Delete previously generated per-model code, if any exists."""
    if not os.path.exists(model_codegen_dir):
        return
    sh.rm("-rf", model_codegen_dir)
......@@ -268,7 +269,6 @@ def adb_run_valgrind(serialno,
def bazel_build(target,
strip="always",
abi="armeabi-v7a",
model_tag="",
production_mode=False,
hexagon_mode=False,
disable_no_tuning_warning=False,
......@@ -289,7 +289,6 @@ def bazel_build(target,
"--copt=-std=c++11",
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--copt=-Werror=return-type",
"--copt=-DMACE_MODEL_TAG=%s" % model_tag,
"--copt=-O3",
"--define",
"openmp=%s" % str(enable_openmp).lower(),
......@@ -315,7 +314,6 @@ def bazel_build(target,
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--copt=-Werror=return-type",
"--copt=-DMACE_OBFUSCATE_LITERALS",
"--copt=-DMACE_MODEL_TAG=%s" % model_tag,
"--copt=-O3",
"--define",
"neon=true",
......@@ -371,7 +369,21 @@ def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
"mace/codegen/opencl/opencl_encrypt_program.cc")
def pull_binaries(abi, serialno, model_output_dirs):
def gen_mace_engine_creator_source(model_tags, codegen_path="mace/codegen"):
    """Generate mace_engine_creator.cc for the given model tags.

    The generated file dispatches CreateMaceEngine(model_tag, ...) to each
    model's generated LoadModelData/CreateNet symbols.

    Args:
        model_tags: list of model tag strings to embed in the dispatcher.
        codegen_path: root of the codegen output tree.
    """
    # Fixed typo in both log messages: "Genearte" -> "Generate".
    print("* Generate mace engine creator source")
    codegen_tools_dir = "%s/engine" % codegen_path
    # Start from a clean directory so stale generated sources never linger.
    sh.rm("-rf", codegen_tools_dir)
    sh.mkdir("-p", codegen_tools_dir)
    gen_mace_engine_creator(
        model_tags,
        "mace/python/tools",
        codegen_tools_dir)
    print("Generate mace engine creator source done!\n")
def pull_binaries(abi, serialno, model_output_dirs,
cl_built_kernel_file_name,
cl_platform_info_file_name):
compiled_opencl_dir = "/data/local/tmp/mace_run/cl_program/"
mace_run_param_file = "mace_run.config"
......@@ -385,15 +397,18 @@ def pull_binaries(abi, serialno, model_output_dirs):
sh.rm("-rf", cl_bin_dir)
sh.mkdir("-p", cl_bin_dir)
if abi != "host":
adb_pull(compiled_opencl_dir, cl_bin_dir, serialno)
adb_pull(compiled_opencl_dir + cl_built_kernel_file_name,
cl_bin_dir, serialno)
adb_pull(compiled_opencl_dir + cl_platform_info_file_name,
cl_bin_dir, serialno)
adb_pull("/data/local/tmp/mace_run/%s" % mace_run_param_file,
cl_bin_dir, serialno)
def gen_opencl_binary_code(model_output_dirs,
cl_built_kernel_file_name,
cl_platform_info_file_name,
codegen_path="mace/codegen"):
cl_built_kernel_file_name = "mace_cl_compiled_program.bin"
cl_platform_info_file_name = "mace_cl_platform_info.txt"
opencl_codegen_file = "%s/opencl/opencl_compiled_program.cc" % codegen_path
cl_bin_dirs = []
......@@ -528,25 +543,8 @@ def gen_random_input(model_output_dir,
def update_mace_run_lib(model_output_dir,
abi,
model_tag,
embed_model_data,
generated_model_lib_dir="bazel-bin/mace/codegen/"):
model_lib_path = model_output_dir + "/libmace_%s.a" % model_tag
if abi == "host":
bazel_build(
"//mace/codegen:generated_models",
abi=abi,
model_tag=model_tag)
generated_model_lib_name = "libgenerated_models.pic.a"
else:
generated_model_lib_name = "libgenerated_models.a"
if os.path.exists(model_lib_path):
sh.rm("-rf", model_lib_path)
sh.cp("-f", generated_model_lib_dir + "/" + generated_model_lib_name,
model_lib_path)
embed_model_data):
mace_run_filepath = model_output_dir + "/mace_run"
if os.path.exists(mace_run_filepath):
sh.rm("-rf", mace_run_filepath)
......@@ -560,6 +558,11 @@ def update_mace_run_lib(model_output_dir,
model_output_dir)
def create_compiled_opencl_dir(serialno):
    """Ensure the on-device directory for compiled OpenCL programs exists."""
    sh.adb("-s", serialno, "shell", "mkdir", "-p",
           "/data/local/tmp/mace_run/cl_program/")
def tuning_run(abi,
serialno,
vlog_level,
......@@ -598,6 +601,7 @@ def tuning_run(abi,
"env",
"MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
"%s/mace_run" % model_output_dir,
"--model_tag=%s" % model_tag,
"--input_node=%s" % ",".join(input_nodes),
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
......@@ -622,8 +626,7 @@ def tuning_run(abi,
return stdout
else:
sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
compiled_opencl_dir = "/data/local/tmp/mace_run/cl_program/"
sh.adb("-s", serialno, "shell", "mkdir", "-p", compiled_opencl_dir)
create_compiled_opencl_dir(serialno)
for input_name in input_nodes:
formatted_name = common.formatted_file_name(input_file_name,
......@@ -657,6 +660,7 @@ def tuning_run(abi,
])
adb_cmd.extend([
"%s/mace_run" % phone_data_dir,
"--model_tag=%s" % model_tag,
"--input_node=%s" % ",".join(input_nodes),
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
......@@ -846,6 +850,12 @@ def merge_libs(target_soc,
mri_stream += (
"addlib "
"bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a\n")
mri_stream += (
"addlib "
"bazel-bin/mace/codegen/libgenerated_models.pic.a\n")
mri_stream += (
"addlib "
"bazel-bin/mace/codegen/libgenerated_mace_engine_creator.pic.a\n")
else:
mri_stream += "create %s/libmace_%s.%s.a\n" % \
(model_bin_dir, project_name, target_soc)
......@@ -858,6 +868,12 @@ def merge_libs(target_soc,
mri_stream += (
"addlib "
"bazel-bin/mace/codegen/libgenerated_version.a\n")
mri_stream += (
"addlib "
"bazel-bin/mace/codegen/libgenerated_models.a\n")
mri_stream += (
"addlib "
"bazel-bin/mace/codegen/libgenerated_mace_engine_creator.a\n")
mri_stream += (
"addlib "
"bazel-bin/mace/core/libcore.a\n")
......@@ -875,8 +891,6 @@ def merge_libs(target_soc,
"bazel-bin/mace/ops/libops.lo\n")
for model_output_dir in model_output_dirs:
for lib in sh.ls(glob.glob("%s/*.a" % model_output_dir), "-1"):
mri_stream += "addlib %s\n" % lib
if not embed_model_data:
sh.cp("-f", glob.glob("%s/*.data" % model_output_dir),
model_data_dir)
......@@ -921,6 +935,28 @@ def packaging_lib(libmace_output_dir, project_name):
print("Packaging Done!\n")
def build_benchmark_model(abi,
                          embed_model_data,
                          model_output_dir,
                          model_tag,
                          hexagon_mode):
    """Build //mace/benchmark:benchmark_model and stage the binary (plus the
    model data file, when not embedded) into model_output_dir."""
    target = "//mace/benchmark:benchmark_model"
    staged_binary = "%s/benchmark_model" % model_output_dir
    # Drop any stale binary left over from a previous build.
    if os.path.exists(staged_binary):
        sh.rm("-rf", staged_binary)
    if not embed_model_data:
        model_data_path = "mace/codegen/models/%s/%s.data" % (model_tag,
                                                              model_tag)
        sh.cp("-f", model_data_path, model_output_dir)
    bazel_build(target,
                abi=abi,
                production_mode=True,
                hexagon_mode=hexagon_mode)
    built_binary = "/".join(bazel_target_to_bin(target))
    sh.cp("-f", built_binary, model_output_dir)
def benchmark_model(abi,
serialno,
vlog_level,
......@@ -932,31 +968,13 @@ def benchmark_model(abi,
output_shapes,
model_tag,
device_type,
hexagon_mode,
phone_data_dir,
omp_num_threads=-1,
cpu_affinity_policy=1,
gpu_perf_hint=3,
gpu_priority_hint=3,
input_file_name="model_input",
output_file_name="model_out"):
input_file_name="model_input"):
print("* Benchmark for %s" % model_tag)
benchmark_binary_file = "%s/benchmark_model" % model_output_dir
if os.path.exists(benchmark_binary_file):
sh.rm("-rf", benchmark_binary_file)
if not embed_model_data:
sh.cp("-f", "mace/codegen/models/%s/%s.data" % (model_tag, model_tag),
model_output_dir)
benchmark_target = "//mace/benchmark:benchmark_model"
bazel_build(benchmark_target,
abi=abi,
model_tag=model_tag,
production_mode=True,
hexagon_mode=hexagon_mode)
target_bin = "/".join(bazel_target_to_bin(benchmark_target))
sh.cp("-f", target_bin, model_output_dir)
stdout_buff = []
process_output = make_output_processor(stdout_buff)
......@@ -966,6 +984,7 @@ def benchmark_model(abi,
"env",
"MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
"%s/benchmark_model" % model_output_dir,
"--model_tag=%s" % model_tag,
"--input_node=%s" % ",".join(input_nodes),
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
......@@ -981,6 +1000,7 @@ def benchmark_model(abi,
p.wait()
else:
sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
create_compiled_opencl_dir(serialno)
for input_name in input_nodes:
formatted_name = common.formatted_file_name(input_file_name,
......@@ -1002,6 +1022,7 @@ def benchmark_model(abi,
phone_data_dir,
"MACE_OPENCL_PROFILING=1",
"%s/benchmark_model" % phone_data_dir,
"--model_tag=%s" % model_tag,
"--input_node=%s" % ",".join(input_nodes),
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册