Merge branch 'new-api' into 'master'

Add CreateMaceEngine API and speed up build logic. See merge request !463

Merge branch 'new-api' into 'master'
Add CreateMaceEngine API and speed up build logic. See merge request !463
dbb51228 · 叶剑武 · d20856d5 · 4fdcc85e · dbb51228 · dbb51228
12 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ mace/codegen/opencl/
 mace/codegen/opencl_bin/
 mace/codegen/tuning/
 mace/codegen/version/
+mace/codegen/engine/
 build/
 docs/_build/


--- a/mace/benchmark/BUILD
+++ b/mace/benchmark/BUILD
@@ -6,6 +6,7 @@ load(
    "if_not_production_mode",
    "if_hexagon_enabled",
    "if_openmp_enabled",
+    "if_android",
 )

 licenses(["notice"])  # Apache 2.0
@@ -26,12 +27,14 @@ cc_binary(
    srcs = [
        "benchmark_model.cc",
    ],
+    copts = if_android(["-DMACE_ENABLE_OPENCL"]),
    linkopts = if_openmp_enabled(["-fopenmp"]),
    linkstatic = 1,
    deps = [
        ":statistics",
        "//external:gflags_nothreads",
        "//mace/codegen:generated_models",
+        "//mace/codegen:generated_mace_engine_factory",
    ],
 )


--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -25,20 +25,7 @@
 #include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 #include "mace/benchmark/statistics.h"
-
-namespace mace {
-namespace MACE_MODEL_TAG {
-
-extern const unsigned char *LoadModelData(const char *model_data_file);
-
-extern void UnloadModelData(const unsigned char *model_data);
-
-extern NetDef CreateNet(const unsigned char *model_data);
-
-extern const std::string ModelChecksum();
-
-}  // namespace MACE_MODEL_TAG
-}  // namespace mace
+#include "mace/codegen/engine/mace_engine_factory.h"

 namespace mace {
 namespace benchmark {
@@ -188,6 +175,7 @@ bool Run(const std::string &title,
  return true;
 }

+DEFINE_string(model_name, "", "model name in yaml");
 DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]");
 DEFINE_string(input_node, "input_node0,input_node1",
              "input nodes, separated by comma");
@@ -198,7 +186,6 @@ DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
 DEFINE_string(input_file, "", "input file name");
 DEFINE_int32(max_num_runs, 100, "number of runs max");
 DEFINE_string(max_time, "10.0", "length to run max");
-DEFINE_string(benchmark_name, "", "benchmark name");
 DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
 DEFINE_string(model_data_file, "",
              "model data file name, used when EMBED_MODEL_DATA set to 0");
@@ -214,7 +201,7 @@ int Main(int argc, char **argv) {
  gflags::SetUsageMessage("some usage message");
  gflags::ParseCommandLineFlags(&argc, &argv, true);

-  LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]";
+  LOG(INFO) << "Model name: [" << FLAGS_model_name << "]";
  LOG(INFO) << "Device: [" << FLAGS_device << "]";
  LOG(INFO) << "gpu_perf_hint: [" << FLAGS_gpu_perf_hint << "]";
  LOG(INFO) << "gpu_priority_hint: [" << FLAGS_gpu_priority_hint << "]";
@@ -233,17 +220,6 @@ int Main(int argc, char **argv) {

  std::unique_ptr<OpStat> statistician(new OpStat());

-  mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
-
-  // config runtime
-  mace::SetOpenMPThreadPolicy(
-      FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
-  if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-  }

  std::vector<std::string> input_names =
      str_util::Split(FLAGS_input_node, ',');
@@ -265,9 +241,53 @@ int Main(int argc, char **argv) {
    ParseShape(output_shapes[i], &output_shape_vec[i]);
  }

-  const unsigned char *model_data =
-      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
-  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
+  mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
+
+  // config runtime
+  mace::SetOpenMPThreadPolicy(
+      FLAGS_omp_num_threads,
+      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type == DeviceType::GPU) {
+    mace::SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
+  }
+#endif  // MACE_ENABLE_OPENCL
+
+  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
+  const std::string kernel_file_path =
+      std::string(kernel_path == nullptr ?
+                  "/data/local/tmp/mace_run/interior" : kernel_path);
+
+  std::shared_ptr<KVStorageFactory> storage_factory(
+      new FileStorageFactory(kernel_file_path));
+  SetKVStorageFactory(storage_factory);
+
+  // Create Engine
+  std::shared_ptr<mace::MaceEngine> engine;
+  MaceStatus create_engine_status;
+  // Create Engine
+  if (FLAGS_model_data_file.empty()) {
+    create_engine_status =
+        CreateMaceEngine(FLAGS_model_name.c_str(),
+                         nullptr,
+                         input_names,
+                         output_names,
+                         device_type,
+                         &engine);
+  } else {
+    create_engine_status =
+        CreateMaceEngine(FLAGS_model_name.c_str(),
+                         FLAGS_model_data_file.c_str(),
+                         input_names,
+                         output_names,
+                         device_type,
+                         &engine);
+  }
+  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+    LOG(FATAL) << "Create engine error, please check the arguments";
+  }

  std::map<std::string, mace::MaceTensor> inputs;
  std::map<std::string, mace::MaceTensor> outputs;
@@ -303,19 +323,11 @@ int Main(int argc, char **argv) {
                                                buffer_out);
  }

-  // Init model
-  LOG(INFO) << "Run init";
-  std::unique_ptr<mace::MaceEngine> engine_ptr(
-      new mace::MaceEngine(&net_def, device_type, input_names, output_names));
-  if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
-    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
-  }
-
  int64_t warmup_time_us = 0;
  int64_t num_warmup_runs = 0;
  if (FLAGS_warmup_runs > 0) {
    bool status =
-        Run("Warm Up", engine_ptr.get(), inputs, &outputs,
+        Run("Warm Up", engine.get(), inputs, &outputs,
            FLAGS_warmup_runs, -1.0,
            &warmup_time_us, &num_warmup_runs, nullptr);
    if (!status) {
@@ -326,7 +338,7 @@ int Main(int argc, char **argv) {
  int64_t no_stat_time_us = 0;
  int64_t no_stat_runs = 0;
  bool status =
-      Run("Run without statistics", engine_ptr.get(), inputs, &outputs,
+      Run("Run without statistics", engine.get(), inputs, &outputs,
          FLAGS_max_num_runs, max_benchmark_time_seconds,
          &no_stat_time_us, &no_stat_runs, nullptr);
  if (!status) {
@@ -335,7 +347,7 @@ int Main(int argc, char **argv) {

  int64_t stat_time_us = 0;
  int64_t stat_runs = 0;
-  status = Run("Run with statistics", engine_ptr.get(), inputs, &outputs,
+  status = Run("Run with statistics", engine.get(), inputs, &outputs,
               FLAGS_max_num_runs, max_benchmark_time_seconds,
               &stat_time_us, &stat_runs, statistician.get());
  if (!status) {

--- a/mace/codegen/BUILD
+++ b/mace/codegen/BUILD
@@ -33,3 +33,11 @@ cc_library(
    srcs = ["version/version.cc"],
    linkstatic = 1,
 )
+
+cc_library(
+    name = "generated_mace_engine_factory",
+    hdrs = ["engine/mace_engine_factory.h"],
+    deps = [
+        "//mace/public",
+    ],
+)
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -9,5 +9,6 @@ cc_binary(
    deps = [
        "//external:gflags_nothreads",
        "//mace/codegen:generated_models",
+        "//mace/codegen:generated_mace_engine_factory",
    ],
 )
--- a/mace/examples/example.cc
+++ b/mace/examples/example.cc
@@ -34,27 +34,11 @@
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
 #include "mace/public/mace_runtime.h"
+// if convert model to code.
+#include "mace/codegen/engine/mace_engine_factory.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"

-// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
-namespace mace {
-namespace MACE_MODEL_TAG {
-
-extern const unsigned char *LoadModelData(const char *model_data_file);
-
-extern void UnloadModelData(const unsigned char *model_data);
-
-extern NetDef CreateNet(const unsigned char *model_data);
-
-extern const std::string ModelName();
-extern const std::string ModelChecksum();
-extern const std::string ModelBuildTime();
-extern const std::string ModelBuildOptions();
-
-}  // namespace MACE_MODEL_TAG
-}  // namespace mace
-
 namespace mace {
 namespace examples {

@@ -112,6 +96,9 @@ DeviceType ParseDeviceType(const std::string &device_str) {
 }


+DEFINE_string(model_name,
+              "",
+              "model name in yaml file");
 DEFINE_string(input_node,
              "input_node0,input_node1",
              "input nodes, separated by comma");
@@ -148,36 +135,53 @@ bool RunModel(const std::vector<std::string> &input_names,
              const std::vector<std::string> &output_names,
              const std::vector<std::vector<int64_t>> &output_shapes) {
  // load model
-  const unsigned char *model_data =
-      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
-  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
-
  DeviceType device_type = ParseDeviceType(FLAGS_device);
-
  // config runtime
-  MaceStatus res = mace::SetOpenMPThreadPolicy(
+  mace::SetOpenMPThreadPolicy(
      FLAGS_omp_num_threads,
      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
+#ifdef MACE_ENABLE_OPENCL
  if (device_type == DeviceType::GPU) {
    mace::SetGPUHints(
        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
  }
+#endif  // MACE_ENABLE_OPENCL

  // DO NOT USE tmp directory.
  // Please use APP's own directory and make sure the directory exists.
-  const std::string kernel_file_path =
-                  "/data/local/tmp/mace_run/cl";
+  // Just call once
+  const std::string internal_storage_path =
+      "/data/local/tmp/mace_run/interior";

  // Config internal kv storage factory.
  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(kernel_file_path));
+      new FileStorageFactory(internal_storage_path));
  SetKVStorageFactory(storage_factory);
-  // Init model
-  mace::MaceEngine engine(&net_def, device_type, input_names,
-                          output_names);
-  if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
-    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
+
+  // Create Engine
+  std::shared_ptr<mace::MaceEngine> engine;
+  MaceStatus create_engine_status;
+  // Create Engine
+  if (FLAGS_model_data_file.empty()) {
+    create_engine_status =
+        CreateMaceEngine(FLAGS_model_name.c_str(),
+                         nullptr,
+                         input_names,
+                         output_names,
+                         device_type,
+                         &engine);
+  } else {
+    create_engine_status =
+        CreateMaceEngine(FLAGS_model_name.c_str(),
+                         FLAGS_model_data_file.c_str(),
+                         input_names,
+                         output_names,
+                         device_type,
+                         &engine);
+  }
+  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+    LOG(FATAL) << "Create engine error, please check the arguments";
  }

  const size_t input_count = input_names.size();
@@ -216,12 +220,12 @@ bool RunModel(const std::vector<std::string> &input_names,
  }

  LOG(INFO) << "Warm up run";
-  engine.Run(inputs, &outputs);
+  engine->Run(inputs, &outputs);

  if (FLAGS_round > 0) {
    LOG(INFO) << "Run model";
    for (int i = 0; i < FLAGS_round; ++i) {
-      engine.Run(inputs, &outputs);
+      engine->Run(inputs, &outputs);
    }
  }

@@ -247,10 +251,6 @@ int Main(int argc, char **argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  LOG(INFO) << "mace version: " << MaceVersion();
-  LOG(INFO) << "model name: " << mace::MACE_MODEL_TAG::ModelName();
-  LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
-  LOG(INFO) << "build time: " << mace::MACE_MODEL_TAG::ModelBuildTime();
-  LOG(INFO) << "build options: " << mace::MACE_MODEL_TAG::ModelBuildOptions();
  LOG(INFO) << "input node: " << FLAGS_input_node;
  LOG(INFO) << "input shape: " << FLAGS_input_shape;
  LOG(INFO) << "output node: " << FLAGS_output_node;

--- a/mace/python/tools/mace_engine_factory.h.jinja2
+++ b/mace/python/tools/mace_engine_factory.h.jinja2
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This is a generated file. DO NOT EDIT!
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
+
+namespace mace {
+{% for tag in model_tags %}
+namespace {{tag}} {
+
+// Per-model entry points. They are only declared here; the definitions are
+// expected to come from the generated model code linked into the binary.
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelName();
+extern const std::string ModelChecksum();
+extern const std::string ModelBuildTime();
+extern const std::string ModelBuildOptions();
+
+}  // namespace {{tag}}
+{% endfor %}
+
+namespace {
+// Maps each model name to the index of its `case` label in CreateMaceEngine.
+// Kept const so a lookup can never add entries.
+const std::map<std::string, int> model_name_map {
+{% for i in range(model_tags |length) %}
+  std::make_pair({{ model_tags[i]|tojson }}, {{ i }}),
+{% endfor %}
+};
+}  // namespace
+
+// Creates a MaceEngine for the model registered under `model_name`.
+//
+//   model_name       key looked up in model_name_map; must not be null.
+//   model_data_file  forwarded unchanged to the model's LoadModelData.
+//   input_nodes      forwarded to the MaceEngine constructor.
+//   output_nodes     forwarded to the MaceEngine constructor.
+//   device_type      forwarded to the MaceEngine constructor; for GPU and
+//                    HEXAGON the model data is unloaded once the engine
+//                    has been constructed.
+//   engine           out-parameter that receives the new engine; must not
+//                    be null.
+//
+// Returns MACE_INVALID_ARGS when `engine` or `model_name` is null or when
+// `model_name` is not a known model, and MACE_SUCCESS otherwise.
+MaceStatus CreateMaceEngine(
+    const char *model_name,
+    const char *model_data_file,
+    const std::vector<std::string> &input_nodes,
+    const std::vector<std::string> &output_nodes,
+    const DeviceType device_type,
+    std::shared_ptr<MaceEngine> *engine) {
+  // Reject null arguments up front: building a std::string key from a null
+  // `model_name` would be undefined behavior.
+  if (engine == nullptr || model_name == nullptr) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+  // Use find() instead of operator[]: operator[] inserts an unknown name
+  // with value 0, which would silently select the first model.
+  const auto model_iter = model_name_map.find(model_name);
+  if (model_iter == model_name_map.end()) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+  // load model
+  const unsigned char * model_data = nullptr;
+  NetDef net_def;
+  switch (model_iter->second) {
+{% for i in range(model_tags |length) %}
+   case {{ i }}:
+    model_data =
+        mace::{{model_tags[i]}}::LoadModelData(model_data_file);
+    net_def = mace::{{model_tags[i]}}::CreateNet(model_data);
+    engine->reset(
+        new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes));
+    if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
+      mace::{{model_tags[i]}}::UnloadModelData(model_data);
+    }
+    break;
+{% endfor %}
+   default:
+     return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace mace
--- a/mace/python/tools/mace_engine_factory_codegen.py
+++ b/mace/python/tools/mace_engine_factory_codegen.py
+# Copyright 2018 Xiaomi, Inc.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+from jinja2 import Environment, FileSystemLoader
+
+
+FLAGS = None
+
+
+def gen_mace_engine_factory(model_tags, template_dir, output_dir):
+    """Renders 'mace_engine_factory.h' from its jinja2 template.
+
+    Args:
+        model_tags: value handed to the template as `model_tags`.
+        template_dir: directory searched for
+            'mace_engine_factory.h.jinja2'.
+        output_dir: directory in which 'mace_engine_factory.h' is written.
+    """
+    # Create the jinja2 environment.
+    j2_env = Environment(
+        loader=FileSystemLoader(template_dir), trim_blocks=True)
+    # Echo the tags, then render the engine factory header.
+    print model_tags
+    template_name = 'mace_engine_factory.h.jinja2'
+    source = j2_env.get_template(template_name).render(
+        model_tags=model_tags,
+    )
+    with open(output_dir + '/mace_engine_factory.h', "wb") as f:
+        f.write(source)
+
+
+def parse_args():
+    """Parses command line arguments.
+
+    Returns:
+        The (namespace, remaining_args) pair produced by
+        ArgumentParser.parse_known_args().
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_tag",
+        type=str,
+        default="",
+        help="model tag")
+    parser.add_argument(
+        "--template_dir", type=str, default="", help="template path")
+    # NOTE(review): the help text below repeats the --template_dir one;
+    # it looks like it should describe the output directory - confirm.
+    parser.add_argument(
+        "--output_dir", type=str, default="", help="template path")
+    return parser.parse_known_args()
+
+
+if __name__ == '__main__':
+    # Command-line entry point: render the engine factory header for the
+    # model tag(s) given with --model_tag.
+    FLAGS, unparsed = parse_args()
+    # --model_tag arrives as one string, but the template iterates over a
+    # list of tags; accept a comma-separated value and drop empty entries.
+    model_tags = [tag for tag in FLAGS.model_tag.split(',') if tag]
+    # The generator is named gen_mace_engine_factory; the previous call to
+    # gen_mace_engine_creator raised NameError.
+    gen_mace_engine_factory(model_tags, FLAGS.template_dir,
+                            FLAGS.output_dir)
--- a/mace/tools/validation/BUILD
+++ b/mace/tools/validation/BUILD
@@ -10,6 +10,7 @@ cc_binary(
    deps = [
        "//external:gflags_nothreads",
        "//mace/codegen:generated_models",
+        "//mace/codegen:generated_mace_engine_factory",
        "//mace/core:core",
    ],
 )
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -41,24 +41,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif  // MACE_ENABLE_OPENCL
-
-// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
-namespace mace {
-namespace MACE_MODEL_TAG {
-
-extern const unsigned char *LoadModelData(const char *model_data_file);
-
-extern void UnloadModelData(const unsigned char *model_data);
-
-extern NetDef CreateNet(const unsigned char *model_data);
-
-extern const std::string ModelName();
-extern const std::string ModelChecksum();
-extern const std::string ModelBuildTime();
-extern const std::string ModelBuildOptions();
-
-}  // namespace MACE_MODEL_TAG
-}  // namespace mace
+#include "mace/codegen/engine/mace_engine_factory.h"

 namespace mace {
 namespace tools {
@@ -180,6 +163,9 @@ struct mallinfo LogMallinfoChange(struct mallinfo prev) {
  return curr;
 }

+DEFINE_string(model_name,
+              "",
+              "model name in yaml");
 DEFINE_string(input_node,
              "input_node0,input_node1",
              "input nodes, separated by comma");
@@ -211,22 +197,12 @@ DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
 DEFINE_int32(cpu_affinity_policy, 1,
             "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");

-bool RunModel(const std::vector<std::string> &input_names,
+bool RunModel(const std::string &model_name,
+              const std::vector<std::string> &input_names,
              const std::vector<std::vector<int64_t>> &input_shapes,
              const std::vector<std::string> &output_names,
              const std::vector<std::vector<int64_t>> &output_shapes) {
-  // load model
-  int64_t t0 = NowMicros();
-  const unsigned char *model_data =
-      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
-  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
-  int64_t t1 = NowMicros();
-  double create_net_millis = (t1 - t0) / 1000.0;
-  LOG(INFO) << "CreateNetDef latency: " << create_net_millis << " ms";
-
  DeviceType device_type = ParseDeviceType(FLAGS_device);
-  LOG(INFO) << "Runing with device type: " << device_type;
-
  // config runtime
  mace::SetOpenMPThreadPolicy(
      FLAGS_omp_num_threads,
@@ -239,25 +215,43 @@ bool RunModel(const std::vector<std::string> &input_names,
  }
 #endif  // MACE_ENABLE_OPENCL

-  const char *kernel_path = getenv("MACE_CL_PROGRAM_PATH");
+  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
  const std::string kernel_file_path =
      std::string(kernel_path == nullptr ?
-                  "/data/local/tmp/mace_run/cl_program" : kernel_path);
+                  "/data/local/tmp/mace_run/interior" : kernel_path);

-  // Init model
-  LOG(INFO) << "Run init";
  std::shared_ptr<KVStorageFactory> storage_factory(
      new FileStorageFactory(kernel_file_path));
  SetKVStorageFactory(storage_factory);
-  mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
-  if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
-    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
+
+  std::shared_ptr<mace::MaceEngine> engine;
+  MaceStatus create_engine_status;
+  // Create Engine
+  int64_t t0 = NowMicros();
+  if (FLAGS_model_data_file.empty()) {
+    create_engine_status =
+        CreateMaceEngine(model_name.c_str(),
+                         nullptr,
+                         input_names,
+                         output_names,
+                         device_type,
+                         &engine);
+  } else {
+    create_engine_status =
+        CreateMaceEngine(model_name.c_str(),
+                         FLAGS_model_data_file.c_str(),
+                         input_names,
+                         output_names,
+                         device_type,
+                         &engine);
  }
-  int64_t t2 = NowMicros();
-  double mace_engine_ctor_millis = (t2 - t1) / 1000.0;
-  double init_millis = (t2 - t0) / 1000.0;
-  LOG(INFO) << "MaceEngine constructor latency: "
-            << mace_engine_ctor_millis << " ms";
+  int64_t t1 = NowMicros();
+
+  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+    LOG(FATAL) << "Create engine error, please check the arguments";
+  }
+
+  double init_millis = (t1 - t0) / 1000.0;
  LOG(INFO) << "Total init latency: " << init_millis << " ms";

  const size_t input_count = input_names.size();
@@ -297,7 +291,7 @@ bool RunModel(const std::vector<std::string> &input_names,

  LOG(INFO) << "Warm up run";
  int64_t t3 = NowMicros();
-  engine.Run(inputs, &outputs);
+  engine->Run(inputs, &outputs);
  int64_t t4 = NowMicros();
  double warmup_millis = (t4 - t3) / 1000.0;
  LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms";
@@ -308,7 +302,7 @@ bool RunModel(const std::vector<std::string> &input_names,
    int64_t t0 = NowMicros();
    struct mallinfo prev = mallinfo();
    for (int i = 0; i < FLAGS_round; ++i) {
-      engine.Run(inputs, &outputs);
+      engine->Run(inputs, &outputs);
      if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
        LOG(INFO) << "=== check malloc info change #" << i << " ===";
        prev = LogMallinfoChange(prev);
@@ -320,11 +314,11 @@ bool RunModel(const std::vector<std::string> &input_names,
  }

  // Metrics reporting tools depends on the format, keep in consistent
-  printf("================================================================\n");
-  printf("      create_net engine_ctor        init      warmup     run_avg\n");
-  printf("================================================================\n");
-  printf("time %11.3f %11.3f %11.3f %11.3f %11.3f\n", create_net_millis,
-         mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);
+  printf("========================================\n");
+  printf("            init      warmup     run_avg\n");
+  printf("========================================\n");
+  printf("time %11.3f %11.3f %11.3f\n",
+         init_millis, warmup_millis, model_run_millis);

 #ifdef MACE_ENABLE_OPENCL
  if (device_type == DeviceType::GPU) {
@@ -355,11 +349,8 @@ int Main(int argc, char **argv) {
  gflags::SetUsageMessage("some usage message");
  gflags::ParseCommandLineFlags(&argc, &argv, true);

+  LOG(INFO) << "model name: " << FLAGS_model_name;
  LOG(INFO) << "mace version: " << MaceVersion();
-  LOG(INFO) << "model name: " << mace::MACE_MODEL_TAG::ModelName();
-  LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
-  LOG(INFO) << "build time: " << mace::MACE_MODEL_TAG::ModelBuildTime();
-  LOG(INFO) << "build options: " << mace::MACE_MODEL_TAG::ModelBuildOptions();
  LOG(INFO) << "input node: " << FLAGS_input_node;
  LOG(INFO) << "input shape: " << FLAGS_input_shape;
  LOG(INFO) << "output node: " << FLAGS_output_node;
@@ -399,7 +390,8 @@ int Main(int argc, char **argv) {
  for (int i = 0; i < FLAGS_restart_round; ++i) {
    VLOG(0) << "restart round " << i;
    ret =
-        RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
+        RunModel(FLAGS_model_name, input_names, input_shape_vec,
+                 output_names, output_shape_vec);
  }
  if (ret) {
    return 0;

--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -95,13 +95,17 @@ def gen_opencl_and_tuning_code(target_abi,
                               serialno,
                               model_output_dirs,
                               pull_or_not):
+    cl_built_kernel_file_name = "mace_cl_compiled_program.bin"
+    cl_platform_info_file_name = "mace_cl_platform_info.txt"
    if pull_or_not:
-        sh_commands.pull_binaries(target_abi, serialno, model_output_dirs)
-
-    codegen_path = "mace/codegen"
+        sh_commands.pull_binaries(target_abi, serialno, model_output_dirs,
+                                  cl_built_kernel_file_name,
+                                  cl_platform_info_file_name)

    # generate opencl binary code
-    sh_commands.gen_opencl_binary_code(model_output_dirs)
+    sh_commands.gen_opencl_binary_code(model_output_dirs,
+                                       cl_built_kernel_file_name,
+                                       cl_platform_info_file_name)

    sh_commands.gen_tuning_param_code(model_output_dirs)

@@ -111,16 +115,14 @@ def model_benchmark_stdout_processor(stdout,
                                     serialno,
                                     model_name,
                                     runtime):
-    metrics = [0] * 5
+    metrics = [0] * 3
    for line in stdout.split('\n'):
        line = line.strip()
        parts = line.split()
-        if len(parts) == 6 and parts[0].startswith("time"):
+        if len(parts) == 4 and parts[0].startswith("time"):
            metrics[0] = str(float(parts[1]))
            metrics[1] = str(float(parts[2]))
            metrics[2] = str(float(parts[3]))
-            metrics[3] = str(float(parts[4]))
-            metrics[4] = str(float(parts[5]))
            break

    device_name = ""
@@ -133,22 +135,20 @@ def model_benchmark_stdout_processor(stdout,
    report_filename = FLAGS.output_dir + "/report.csv"
    if not os.path.exists(report_filename):
        with open(report_filename, 'w') as f:
-            f.write("model_name,device_name,soc,abi,runtime,create_net,"
-                    "engine_ctor,init,warmup,run_avg\n")
+            f.write("model_name,device_name,soc,abi,runtime,"
+                    "init,warmup,run_avg\n")

    data_str = "{model_name},{device_name},{soc},{abi},{runtime}," \
-               "{create_net},{engine_ctor},{init},{warmup},{run_avg}\n" \
+               "{init},{warmup},{run_avg}\n" \
        .format(
            model_name=model_name,
            device_name=device_name,
            soc=target_soc,
            abi=abi,
            runtime=runtime,
-            create_net=metrics[0],
-            engine_ctor=metrics[1],
-            init=metrics[2],
-            warmup=metrics[3],
-            run_avg=metrics[4]
+            init=metrics[0],
+            warmup=metrics[1],
+            run_avg=metrics[2]
        )
    with open(report_filename, 'a') as f:
        f.write(data_str)
@@ -227,12 +227,11 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
        sh_commands.bazel_build(
            mace_run_target,
            abi=target_abi,
-            model_tag=model_name,
            production_mode=False,
            hexagon_mode=hexagon_mode,
            enable_openmp=enable_openmp
        )
-        sh_commands.update_mace_run_lib(model_output_dir, target_abi,
+        sh_commands.update_mace_run_lib(model_output_dir,
                                        model_name, embed_model_data)

        tuning_run(runtime, target_abi, serialno, vlog_level, embed_model_data,
@@ -254,13 +253,12 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
            mace_run_target,
            strip,
            abi=target_abi,
-            model_tag=model_name,
            production_mode=True,
            hexagon_mode=hexagon_mode,
            debug=debug,
            enable_openmp=enable_openmp
        )
-        sh_commands.update_mace_run_lib(model_output_dir, target_abi,
+        sh_commands.update_mace_run_lib(model_output_dir,
                                        model_name, embed_model_data)
    else:
        gen_opencl_and_tuning_code(target_abi, serialno, [], False)
@@ -268,13 +266,12 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
            mace_run_target,
            strip,
            abi=target_abi,
-            model_tag=model_name,
            production_mode=True,
            hexagon_mode=hexagon_mode,
            debug=debug,
            enable_openmp=enable_openmp
        )
-        sh_commands.update_mace_run_lib(model_output_dir, target_abi,
+        sh_commands.update_mace_run_lib(model_output_dir,
                                        model_name, embed_model_data)


@@ -299,22 +296,36 @@ def merge_libs_and_tuning_results(target_soc,
                           embed_model_data)


-def get_model_files(model_file_path,
-                    model_output_dir,
-                    weight_file_path=""):
+def download_model_files(model_file_path,
+                         model_output_dir,
+                         weight_file_path=""):
    model_file = ""
    weight_file = ""
    if model_file_path.startswith("http://") or \
            model_file_path.startswith("https://"):
        model_file = model_output_dir + "/model.pb"
        urllib.urlretrieve(model_file_path, model_file)
+
+    if weight_file_path.startswith("http://") or \
+            weight_file_path.startswith("https://"):
+        weight_file = model_output_dir + "/model.caffemodel"
+        urllib.urlretrieve(weight_file_path, weight_file)
+
+
+def get_model_files_path(model_file_path,
+                         model_output_dir,
+                         weight_file_path=""):
+    model_file = ""
+    weight_file = ""
+    if model_file_path.startswith("http://") or \
+            model_file_path.startswith("https://"):
+        model_file = model_output_dir + "/model.pb"
    else:
        model_file = model_file_path

    if weight_file_path.startswith("http://") or \
            weight_file_path.startswith("https://"):
        weight_file = model_output_dir + "/model.caffemodel"
-        urllib.urlretrieve(weight_file_path, weight_file)
    else:
        weight_file = weight_file_path

@@ -525,6 +536,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                   target_abi, phone_data_dir, target_soc="", serialno=""):
    hexagon_mode = get_hexagon_mode(configs)
    model_output_dirs = []
+
    for model_name in configs["models"]:
        print '===================', model_name, '==================='
        model_config = configs["models"][model_name]
@@ -534,17 +546,19 @@ def process_models(project_name, configs, embed_model_data, vlog_level,

        # Create model build directory
        model_path_digest = md5sum(model_config["model_file_path"])
+        model_output_base_dir = "%s/%s/%s/%s/%s" % (
+            FLAGS.output_dir, project_name, "build",
+            model_name, model_path_digest)

        if target_abi == "host":
-            model_output_dir = "%s/%s/%s/%s/%s/%s" % (
-                FLAGS.output_dir, project_name, "build",
-                model_name, model_path_digest, target_abi)
+            model_output_dir = "%s/%s" % (model_output_base_dir, target_abi)
        else:
            device_name = sh_commands.adb_get_device_name_by_serialno(serialno)
-            model_output_dir = "%s/%s/%s/%s/%s/%s_%s/%s" % (
-                FLAGS.output_dir, project_name, "build",
-                model_name, model_path_digest, device_name.replace(' ', ''),
+            model_output_dir = "%s/%s_%s/%s" % (
+                model_output_base_dir, device_name.replace(' ', ''),
                target_soc, target_abi)
+            sh_commands.clear_phone_data_dir(serialno, phone_data_dir)
+
        model_output_dirs.append(model_output_dir)

        if FLAGS.mode == "build" or FLAGS.mode == "all":
@@ -552,14 +566,9 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                sh.rm("-rf", model_output_dir)
            os.makedirs(model_output_dir)

-        if FLAGS.mode == "build" or FLAGS.mode == "benchmark" or \
-                FLAGS.mode == "all":
-            sh_commands.clear_mace_run_data(
-                    target_abi, serialno, phone_data_dir)
-
-        model_file_path, weight_file_path = get_model_files(
+        model_file_path, weight_file_path = get_model_files_path(
                model_config["model_file_path"],
-                model_output_dir,
+                model_output_base_dir,
                model_config["weight_file_path"])

        if FLAGS.mode == "build" or FLAGS.mode == "run" or \
@@ -570,25 +579,6 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                         model_config["input_shapes"],
                                         input_file_list)

-        if FLAGS.mode == "build" or FLAGS.mode == "benchmark" or \
-                FLAGS.mode == "all":
-            sh_commands.gen_model_code(
-                    "mace/codegen/models/%s" % model_name,
-                    model_config["platform"],
-                    model_file_path,
-                    weight_file_path,
-                    model_config["model_sha256_checksum"],
-                    ",".join(model_config["input_nodes"]),
-                    ",".join(model_config["output_nodes"]),
-                    data_type,
-                    model_config["runtime"],
-                    model_name,
-                    ":".join(model_config["input_shapes"]),
-                    model_config["dsp_mode"],
-                    embed_model_data,
-                    model_config["fast_conv"],
-                    model_config["obfuscate"])
-
        if FLAGS.mode == "build" or FLAGS.mode == "all":
            build_mace_run_prod(hexagon_mode,
                                model_config["runtime"],
@@ -609,9 +599,14 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                model_config["limit_opencl_kernel_time"],
                                phone_data_dir,
                                FLAGS.enable_openmp)
+            sh_commands.build_benchmark_model(target_abi,
+                                              embed_model_data,
+                                              model_output_dir,
+                                              model_name,
+                                              hexagon_mode)

        if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
-                FLAGS.mode == "all":
+           FLAGS.mode == "all":
            tuning_run(model_config["runtime"],
                       target_abi,
                       serialno,
@@ -647,7 +642,6 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                        model_config["output_shapes"],
                                        model_name,
                                        device_type,
-                                        hexagon_mode,
                                        phone_data_dir,
                                        FLAGS.omp_num_threads,
                                        FLAGS.cpu_affinity_policy,
@@ -738,12 +732,61 @@ def main(unused_args):
        # generate source
        sh_commands.gen_mace_version()
        sh_commands.gen_encrypted_opencl_source()
+        sh_commands.gen_mace_engine_factory_source(configs['models'].keys())

+    embed_model_data = configs["embed_model_data"]
    target_socs = get_target_socs(configs)

-    embed_model_data = configs["embed_model_data"]
    vlog_level = FLAGS.vlog_level
    phone_data_dir = "/data/local/tmp/mace_run/"
+
+    if FLAGS.mode == "build" or FLAGS.mode == "all":
+        print '* Model Convert'
+        sh_commands.clear_model_codegen()
+        for model_name in configs["models"]:
+            print '===================', model_name, '==================='
+            model_config = configs["models"][model_name]
+            data_type, device_type = get_data_and_device_type(
+                model_config["runtime"])
+
+            # Create model build directory
+            model_path_digest = md5sum(model_config["model_file_path"])
+
+            model_output_base_dir = "%s/%s/%s/%s/%s" % (
+                FLAGS.output_dir, project_name, "build",
+                model_name, model_path_digest)
+
+            if os.path.exists(model_output_base_dir):
+                sh.rm("-rf", model_output_base_dir)
+            os.makedirs(model_output_base_dir)
+
+            download_model_files(
+                model_config["model_file_path"],
+                model_output_base_dir,
+                model_config["weight_file_path"])
+
+            model_file_path, weight_file_path = get_model_files_path(
+                model_config["model_file_path"],
+                model_output_base_dir,
+                model_config["weight_file_path"])
+
+            sh_commands.gen_model_code(
+                "mace/codegen/models/%s" % model_name,
+                model_config["platform"],
+                model_file_path,
+                weight_file_path,
+                model_config["model_sha256_checksum"],
+                ",".join(model_config["input_nodes"]),
+                ",".join(model_config["output_nodes"]),
+                data_type,
+                model_config["runtime"],
+                model_name,
+                ":".join(model_config["input_shapes"]),
+                model_config["dsp_mode"],
+                embed_model_data,
+                model_config["fast_conv"],
+                model_config["obfuscate"])
+
    for target_abi in configs["target_abis"]:
        for target_soc in target_socs:
            if target_abi != 'host':

--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -33,6 +33,7 @@ try:
    from binary_codegen import tuning_param_codegen
    from generate_data import generate_input_data
    from validate import validate
+    from mace_engine_factory_codegen import gen_mace_engine_factory
 except Exception as e:
    print("Import error:\n%s" % e)
    exit(1)
@@ -74,15 +75,14 @@ def is_device_locked(serialno):
 ################################
 # clear data
 ################################
-def clear_mace_run_data(abi,
-                        serialno,
-                        phone_data_dir,
-                        model_codegen_dir="mace/codegen/models"):
-    if abi != "host":
-        sh.adb("-s",
-               serialno,
-               "shell",
-               "rm -rf %s" % phone_data_dir)
+def clear_phone_data_dir(serialno, phone_data_dir):
+    sh.adb("-s",
+           serialno,
+           "shell",
+           "rm -rf %s" % phone_data_dir)
+
+
+def clear_model_codegen(model_codegen_dir="mace/codegen/models"):
    if os.path.exists(model_codegen_dir):
        sh.rm("-rf", model_codegen_dir)

@@ -268,7 +268,6 @@ def adb_run_valgrind(serialno,
 def bazel_build(target,
                strip="always",
                abi="armeabi-v7a",
-                model_tag="",
                production_mode=False,
                hexagon_mode=False,
                disable_no_tuning_warning=False,
@@ -289,7 +288,6 @@ def bazel_build(target,
            "--copt=-std=c++11",
            "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
            "--copt=-Werror=return-type",
-            "--copt=-DMACE_MODEL_TAG=%s" % model_tag,
            "--copt=-O3",
            "--define",
            "openmp=%s" % str(enable_openmp).lower(),
@@ -315,7 +313,6 @@ def bazel_build(target,
            "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
            "--copt=-Werror=return-type",
            "--copt=-DMACE_OBFUSCATE_LITERALS",
-            "--copt=-DMACE_MODEL_TAG=%s" % model_tag,
            "--copt=-O3",
            "--define",
            "neon=true",
@@ -371,8 +368,22 @@ def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
                           "mace/codegen/opencl/opencl_encrypt_program.cc")


-def pull_binaries(abi, serialno, model_output_dirs):
-    compiled_opencl_dir = "/data/local/tmp/mace_run/cl_program/"
+def gen_mace_engine_factory_source(model_tags, codegen_path="mace/codegen"):
+    print("* Genearte mace engine creator source")
+    codegen_tools_dir = "%s/engine" % codegen_path
+    sh.rm("-rf", codegen_tools_dir)
+    sh.mkdir("-p", codegen_tools_dir)
+    gen_mace_engine_factory(
+        model_tags,
+        "mace/python/tools",
+        codegen_tools_dir)
+    print("Genearte mace engine creator source done!\n")
+
+
+def pull_binaries(abi, serialno, model_output_dirs,
+                  cl_built_kernel_file_name,
+                  cl_platform_info_file_name):
+    compiled_opencl_dir = "/data/local/tmp/mace_run/interior/"
    mace_run_param_file = "mace_run.config"

    cl_bin_dirs = []
@@ -385,15 +396,18 @@ def pull_binaries(abi, serialno, model_output_dirs):
            sh.rm("-rf", cl_bin_dir)
        sh.mkdir("-p", cl_bin_dir)
        if abi != "host":
-            adb_pull(compiled_opencl_dir, cl_bin_dir, serialno)
+            adb_pull(compiled_opencl_dir + cl_built_kernel_file_name,
+                     cl_bin_dir, serialno)
+            adb_pull(compiled_opencl_dir + cl_platform_info_file_name,
+                     cl_bin_dir, serialno)
            adb_pull("/data/local/tmp/mace_run/%s" % mace_run_param_file,
                     cl_bin_dir, serialno)


 def gen_opencl_binary_code(model_output_dirs,
+                           cl_built_kernel_file_name,
+                           cl_platform_info_file_name,
                           codegen_path="mace/codegen"):
-    cl_built_kernel_file_name = "mace_cl_compiled_program.bin"
-    cl_platform_info_file_name = "mace_cl_platform_info.txt"
    opencl_codegen_file = "%s/opencl/opencl_compiled_program.cc" % codegen_path

    cl_bin_dirs = []
@@ -528,25 +542,8 @@ def gen_random_input(model_output_dir,


 def update_mace_run_lib(model_output_dir,
-                        abi,
                        model_tag,
-                        embed_model_data,
-                        generated_model_lib_dir="bazel-bin/mace/codegen/"):
-    model_lib_path = model_output_dir + "/libmace_%s.a" % model_tag
-    if abi == "host":
-        bazel_build(
-                "//mace/codegen:generated_models",
-                abi=abi,
-                model_tag=model_tag)
-        generated_model_lib_name = "libgenerated_models.pic.a"
-    else:
-        generated_model_lib_name = "libgenerated_models.a"
-
-    if os.path.exists(model_lib_path):
-        sh.rm("-rf", model_lib_path)
-    sh.cp("-f", generated_model_lib_dir + "/" + generated_model_lib_name,
-          model_lib_path)
-
+                        embed_model_data):
    mace_run_filepath = model_output_dir + "/mace_run"
    if os.path.exists(mace_run_filepath):
        sh.rm("-rf", mace_run_filepath)
@@ -560,6 +557,12 @@ def update_mace_run_lib(model_output_dir,
          model_output_dir)


+def create_internal_storage_dir(serialno, phone_data_dir):
+    internal_storage_dir = "%s/interior/" % phone_data_dir
+    sh.adb("-s", serialno, "shell", "mkdir", "-p", internal_storage_dir)
+    return internal_storage_dir
+
+
 def tuning_run(abi,
               serialno,
               vlog_level,
@@ -598,6 +601,7 @@ def tuning_run(abi,
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                "%s/mace_run" % model_output_dir,
+                "--model_name=%s" % model_tag,
                "--input_node=%s" % ",".join(input_nodes),
                "--output_node=%s" % ",".join(output_nodes),
                "--input_shape=%s" % ":".join(input_shapes),
@@ -622,8 +626,8 @@ def tuning_run(abi,
        return stdout
    else:
        sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
-        compiled_opencl_dir = "/data/local/tmp/mace_run/cl_program/"
-        sh.adb("-s", serialno, "shell", "mkdir", "-p", compiled_opencl_dir)
+        internal_storage_dir = create_internal_storage_dir(
+            serialno, phone_data_dir)

        for input_name in input_nodes:
            formatted_name = common.formatted_file_name(input_file_name,
@@ -646,7 +650,7 @@ def tuning_run(abi,
            "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check),
            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % phone_data_dir,
-            "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir,
+            "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir,
            "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time,
        ]
        if valgrind:
@@ -657,6 +661,7 @@ def tuning_run(abi,
            ])
        adb_cmd.extend([
            "%s/mace_run" % phone_data_dir,
+            "--model_name=%s" % model_tag,
            "--input_node=%s" % ",".join(input_nodes),
            "--output_node=%s" % ",".join(output_nodes),
            "--input_shape=%s" % ":".join(input_shapes),
@@ -836,47 +841,59 @@ def merge_libs(target_soc,
    if hexagon_mode:
        sh.cp("-f", hexagon_lib_file, model_bin_dir)

+    sh.cp("-f", glob.glob("mace/codegen/engine/*.h"), model_header_dir)
+
    mri_stream = ""
    if abi == "host":
        mri_stream += "create %s/libmace_%s.a\n" % \
                      (model_bin_dir, project_name)
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/codegen/libgenerated_opencl.pic.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_opencl.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_models.pic.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_mace_engine_creator.pic.a\n")
    else:
        mri_stream += "create %s/libmace_%s.%s.a\n" % \
                      (model_bin_dir, project_name, target_soc)
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/codegen/libgenerated_opencl.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_opencl.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/codegen/libgenerated_tuning_params.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_tuning_params.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/codegen/libgenerated_version.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_version.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/core/libcore.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_models.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/kernels/libkernels.a\n")
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_mace_engine_creator.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/utils/libutils.a\n")
+            "addlib "
+            "bazel-bin/mace/core/libcore.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/utils/libutils_prod.a\n")
+            "addlib "
+            "bazel-bin/mace/kernels/libkernels.a\n")
        mri_stream += (
-                "addlib "
-                "bazel-bin/mace/ops/libops.lo\n")
+            "addlib "
+            "bazel-bin/mace/utils/libutils.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/utils/libutils_prod.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/ops/libops.lo\n")

    for model_output_dir in model_output_dirs:
-        for lib in sh.ls(glob.glob("%s/*.a" % model_output_dir), "-1"):
-            mri_stream += "addlib %s\n" % lib
        if not embed_model_data:
            sh.cp("-f", glob.glob("%s/*.data" % model_output_dir),
                  model_data_dir)
@@ -921,6 +938,28 @@ def packaging_lib(libmace_output_dir, project_name):
    print("Packaging Done!\n")


+def build_benchmark_model(abi,
+                          embed_model_data,
+                          model_output_dir,
+                          model_tag,
+                          hexagon_mode):
+    benchmark_binary_file = "%s/benchmark_model" % model_output_dir
+    if os.path.exists(benchmark_binary_file):
+        sh.rm("-rf", benchmark_binary_file)
+    if not embed_model_data:
+        sh.cp("-f", "mace/codegen/models/%s/%s.data" % (model_tag, model_tag),
+              model_output_dir)
+
+    benchmark_target = "//mace/benchmark:benchmark_model"
+    bazel_build(benchmark_target,
+                abi=abi,
+                production_mode=True,
+                hexagon_mode=hexagon_mode)
+
+    target_bin = "/".join(bazel_target_to_bin(benchmark_target))
+    sh.cp("-f", target_bin, model_output_dir)
+
+
 def benchmark_model(abi,
                    serialno,
                    vlog_level,
@@ -932,31 +971,13 @@ def benchmark_model(abi,
                    output_shapes,
                    model_tag,
                    device_type,
-                    hexagon_mode,
                    phone_data_dir,
                    omp_num_threads=-1,
                    cpu_affinity_policy=1,
                    gpu_perf_hint=3,
                    gpu_priority_hint=3,
-                    input_file_name="model_input",
-                    output_file_name="model_out"):
+                    input_file_name="model_input"):
    print("* Benchmark for %s" % model_tag)
-    benchmark_binary_file = "%s/benchmark_model" % model_output_dir
-    if os.path.exists(benchmark_binary_file):
-        sh.rm("-rf", benchmark_binary_file)
-    if not embed_model_data:
-        sh.cp("-f", "mace/codegen/models/%s/%s.data" % (model_tag, model_tag),
-              model_output_dir)
-
-    benchmark_target = "//mace/benchmark:benchmark_model"
-    bazel_build(benchmark_target,
-                abi=abi,
-                model_tag=model_tag,
-                production_mode=True,
-                hexagon_mode=hexagon_mode)
-
-    target_bin = "/".join(bazel_target_to_bin(benchmark_target))
-    sh.cp("-f", target_bin, model_output_dir)

    stdout_buff = []
    process_output = make_output_processor(stdout_buff)
@@ -966,6 +987,7 @@ def benchmark_model(abi,
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                "%s/benchmark_model" % model_output_dir,
+                "--model_name=%s" % model_tag,
                "--input_node=%s" % ",".join(input_nodes),
                "--output_node=%s" % ",".join(output_nodes),
                "--input_shape=%s" % ":".join(input_shapes),
@@ -981,6 +1003,8 @@ def benchmark_model(abi,
        p.wait()
    else:
        sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
+        internal_storage_dir = create_internal_storage_dir(
+            serialno, phone_data_dir)

        for input_name in input_nodes:
            formatted_name = common.formatted_file_name(input_file_name,
@@ -1000,8 +1024,10 @@ def benchmark_model(abi,
            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
            phone_data_dir,
+            "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir,
            "MACE_OPENCL_PROFILING=1",
            "%s/benchmark_model" % phone_data_dir,
+            "--model_name=%s" % model_tag,
            "--input_node=%s" % ",".join(input_nodes),
            "--output_node=%s" % ",".join(output_nodes),
            "--input_shape=%s" % ":".join(input_shapes),