提交 43b33191 编写于 作者: L liyin 提交者: 李寅

Refactor model run: Support CMake everything

上级 8d64715c
bazel-*
build/
build/*
cmake-build/
cmake-build-debug/
docs/_build/
......
......@@ -42,15 +42,15 @@ build_docs:
cmake_build_android-armeabi-v7a:
stage: build
script:
- sh tools/cmake-build-android-armeabi-v7a-full.sh
- LIBMACE32_FULL_SIZE=`stat -c%s cmake-build/android-armeabi-v7a-full/install/lib/libmace.so`
- RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
- LIBMACE32_FULL_SIZE=`stat -c%s build/cmake-build/armeabi-v7a/install/lib/libmace.so`
- if (( LIBMACE32_FULL_SIZE > 2200000 )) ; then echo "The libmace.so size too large"; exit 1; fi
cmake_build_android-arm64-v8:
stage: build
script:
- sh tools/cmake-build-android-arm64-v8a-full.sh
- LIBMACE64_FULL_SIZE=`stat -c%s cmake-build/android-arm64-v8a-full/install/lib/libmace.so`
- RUNTIME=GPU bash tools/cmake/cmake-build-arm64-v8a.sh
- LIBMACE64_FULL_SIZE=`stat -c%s build/cmake-build/arm64-v8a/install/lib/libmace.so`
- if (( LIBMACE64_FULL_SIZE > 3100000 )) ; then echo "The libmace.so size too large"; exit 1; fi
bazel_build:
......
......@@ -15,6 +15,7 @@ option(MACE_ENABLE_BENCHMARKS "whether to build c++ micro benchmarks" OFF)
option(MACE_ENABLE_OPT_SIZE "whether to build with optimized binary size" ON)
option(MACE_ENABLE_OBFUSCATE "whether to build with code obfuscation" ON)
option(MACE_ENABLE_CCACHE "whether to build with ccache" ON)
option(MACE_ENABLE_CODE_MODE "whether to use code mode" OFF)
message("CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
......@@ -40,11 +41,15 @@ if(MACE_ENABLE_OPT_SIZE)
set(MACE_CODE_CC_FLAGS "${MACE_CODE_CC_FLAGS} -fno-rtti -fno-exceptions -DGOOGLE_PROTOBUF_NO_RTTI -DPROTOBUF_USE_EXCEPTIONS=0")
endif(MACE_ENABLE_OPT_SIZE)
if(MACE_ENABLE_CODE_MODE)
set(MACE_CODE_CC_FLAGS "${MACE_CODE_CC_FLAGS} -DMODEL_GRAPH_FORMAT_CODE")
endif(MACE_ENABLE_CODE_MODE)
# flags apply only to mace code (third_party excluded)
# -Wno-error=unused-command-line-argument: official Android toolchain contains
# unsupported argument and will break ccache preprocessor
if(ANDROID)
set(MACE_CODE_CC_FLAGS "${MACE_CODE_CC_FLAGS} -Wall -Werror -Wno-error=unused-command-line-argument")
set(MACE_CODE_CC_FLAGS "${MACE_CODE_CC_FLAGS} -Wall -Werror -Wno-error=unused-command-line-argument -Wno-error=unevaluated-expression -Wno-error=tautological-compare")
else(ANDROID)
set(MACE_CODE_CC_FLAGS "${MACE_CODE_CC_FLAGS} -Wall -Werror")
endif(ANDROID)
......
......@@ -31,3 +31,21 @@ add_dependencies(generated_opencl_kernel opencl_kernel_src)
install(TARGETS generated_version ARCHIVE DESTINATION lib)
install(TARGETS generated_opencl_kernel ARCHIVE DESTINATION lib)
# Placeholder source so the model targets always have at least one file,
# even before any model code has been generated into models/**/code/.
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/null.cc "")
# NOTE(review): file(GLOB) is evaluated at configure time only — newly
# generated model sources require a CMake re-configure to be picked up.
file(GLOB CODEGEN_MODELS ${CMAKE_CURRENT_BINARY_DIR}/null.cc models/**/code/*.cc)
# Static library of generated model code (used in code mode).
add_library(model STATIC ${CODEGEN_MODELS})
target_link_libraries(model PRIVATE core proto utils port)
install(TARGETS model ARCHIVE DESTINATION lib)
# Shared counterpart of the generated-model library.
add_library(model_shared SHARED ${CODEGEN_MODELS})
target_link_libraries(model_shared PRIVATE core proto utils port)
if(NOT APPLE)
  # Apple's linker does not support --version-script; restrict exported
  # symbols via the version script on other platforms only.
  set_target_properties(model_shared PROPERTIES LINK_FLAGS
    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/model_version_script.lds")
endif(NOT APPLE)
install(TARGETS model_shared DESTINATION lib)
# Install generated model headers (incl. the engine factory helper) so
# applications can create engines from compiled-in models.
file(GLOB MODEL_HEADERS engine/mace_engine_factory.h models/**/code/*.h)
install(FILES ${MODEL_HEADERS} DESTINATION include/models)
/* Linker version script for the model_shared library: export only the
 * generated model entry points; hide every other symbol to reduce the
 * binary's visible surface. */
mace {
  global:
    /* Wildcards because each generated model gets its own mangled
     * variant of these functions. */
    *LoadModelData*;
    *CreateNet*;
    *ModelName*;
    *ModelChecksum*;
    *ModelBuildTime*;
    *ModelBuildOptions*;
  local:
    /* Everything else stays internal. */
    *;
};
......@@ -603,9 +603,11 @@ MaceEngine::Impl::~Impl() {
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
if (device_type_ == HEXAGON || device_type_ == HTA) {
if (VLOG_IS_ON(2)) {
hexagon_controller_->GetPerfInfo();
hexagon_controller_->PrintLog();
}
if (VLOG_IS_ON(1)) {
hexagon_controller_->GetPerfInfo();
}
MACE_CHECK(hexagon_controller_->TeardownGraph(), "hexagon teardown error");
MACE_CHECK(hexagon_controller_->Finalize(), "hexagon finalize error");
}
......
set(MACE_PROTO_PROTOS mace.proto)
set(MACE_PROTO_SRCS)
set(MACE_PROTO_HDRS)
set(MACE_PROTO_PYTHON_DIR ${PROJECT_SOURCE_DIR}/tools/python/py_proto)
foreach(proto_file ${MACE_PROTO_PROTOS})
get_filename_component(proto_file_abs ${proto_file} ABSOLUTE)
......@@ -17,10 +18,20 @@ foreach(proto_file ${MACE_PROTO_PROTOS})
DEPENDS protoc_bin
VERBATIM
)
set(PROTO_GENERATED_PY_FILES ${MACE_PROTO_PYTHON_DIR}/${basename}_pb2.py)
add_custom_command(
OUTPUT ${PROTO_GENERATED_PY_FILES}
COMMAND ${PROTOC_BIN} --python_out ${MACE_PROTO_PYTHON_DIR} -I ${CMAKE_CURRENT_SOURCE_DIR} ${proto_file_abs}
COMMENT "Generating ${PROTO_GENERATED_PY_FILES} from ${proto_file}"
DEPENDS protoc_bin
VERBATIM
)
endforeach()
add_custom_target(mace_proto_src DEPENDS ${MACE_PROTO_SRCS}
COMMENT "Checking if re-generation is required" )
add_custom_target(mace_proto_src DEPENDS ${PROTO_GENERATED_FILES}
COMMENT "Checking if re-generation is required")
add_custom_target(mace_proto_py ALL DEPENDS ${PROTO_GENERATED_PY_FILES})
add_library(proto ${MACE_PROTO_SRCS})
target_link_libraries(proto libprotobuf_lite)
......
......@@ -4,6 +4,7 @@ file(GLOB MACE_RUN_SRCS
add_executable(mace_run ${MACE_RUN_SRCS})
target_link_libraries(mace_run PUBLIC
mace_static
model
gflags
)
......
......@@ -548,10 +548,16 @@ int Main(int argc, char **argv) {
LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
LOG(INFO) << "limit_opencl_kernel_time: "
<< getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
LOG(INFO) << "opencl_queue_window_size: "
<< getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE");
auto limit_opencl_kernel_time = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
if (limit_opencl_kernel_time) {
LOG(INFO) << "limit_opencl_kernel_time: "
<< limit_opencl_kernel_time;
}
auto opencl_queue_window_size = getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE");
if (opencl_queue_window_size) {
LOG(INFO) << "opencl_queue_window_size: "
<< getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE");
}
std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
......@@ -584,14 +590,12 @@ int Main(int argc, char **argv) {
for (size_t i = 0; i < output_count; ++i) {
output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
}
float cpu_float32_performance = 0.0f;
if (FLAGS_input_dir.empty()) {
// get cpu capability
Capability cpu_capability = GetCapability(DeviceType::CPU);
cpu_float32_performance = cpu_capability.float32_performance.exec_time;
}
bool ret = false;
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
......
# Generate Python protobuf bindings (caffe_pb2.py) for the third-party
# Caffe proto; the bindings are consumed by the model converter tools.
set(CAFFE_PROTO_PROTOS ${PROJECT_SOURCE_DIR}/third_party/caffe/caffe.proto)
# NOTE(review): output goes into the source tree (tools/python/py_proto),
# not the binary dir — intentional so the Python tools can import it,
# but it means configure/build writes into the checkout.
set(MACE_PROTO_PYTHON_DIR ${PROJECT_SOURCE_DIR}/tools/python/py_proto)
foreach(proto_file ${CAFFE_PROTO_PROTOS})
  get_filename_component(proto_file_abs ${proto_file} ABSOLUTE)
  get_filename_component(basename ${proto_file} NAME_WE)
  # e.g. caffe.proto -> ${MACE_PROTO_PYTHON_DIR}/caffe_pb2.py
  set(PROTO_GENERATED_PY_FILES ${MACE_PROTO_PYTHON_DIR}/${basename}_pb2.py)
  add_custom_command(
    OUTPUT ${PROTO_GENERATED_PY_FILES}
    COMMAND ${PROTOC_BIN} --python_out ${MACE_PROTO_PYTHON_DIR} -I ${PROJECT_SOURCE_DIR}/third_party/caffe ${proto_file_abs}
    COMMENT "Generating ${PROTO_GENERATED_PY_FILES} from ${proto_file}"
    DEPENDS protoc_bin
    VERBATIM
  )
endforeach()
# ALL so the bindings are regenerated as part of the default build.
add_custom_target(caffe_proto_src ALL DEPENDS ${PROTO_GENERATED_PY_FILES})
......@@ -50,6 +50,7 @@ include(${PROJECT_SOURCE_DIR}/third_party/opencl-clhpp/opencl-clhpp.cmake)
include(${PROJECT_SOURCE_DIR}/third_party/opencl-headers/opencl-headers.cmake)
include(${PROJECT_SOURCE_DIR}/third_party/protobuf/protobuf.cmake)
include(${PROJECT_SOURCE_DIR}/third_party/tflite/tflite.cmake)
include(${PROJECT_SOURCE_DIR}/third_party/caffe/caffe.cmake)
if(MACE_ENABLE_HEXAGON_DSP)
include(${PROJECT_SOURCE_DIR}/third_party/nnlib/nnlib.cmake)
......
#!/usr/bin/env bash
# Clear the MACE workspace: remove generated model/engine/OpenCL codegen
# outputs and every build artifact EXCEPT the CMake build directories
# (build/cmake-build*), which are kept warm for incremental builds.
rm -rf mace/codegen/models
rm -rf mace/codegen/engine
rm -rf mace/codegen/opencl
for d in build/*; do
  # BUGFIX: the glob on the right-hand side of [[ != ]] must be
  # unquoted; quoting it made this a literal string comparison, so the
  # cmake-build directories were deleted along with everything else.
  if [[ "$d" != build/cmake-build* ]]; then
    rm -rf "$d"
  fi
done
#!/usr/bin/env bash
# BUGFIX: this script uses bash-only [[ ]] tests, so the shebang must be
# bash, not sh (dash and other POSIX shells fail on [[ ]]).
set -e

# Build MACE for ARM Linux aarch64 (cross-compiled).
# Environment:
#   BUILD_DIR - output directory (default: build/cmake-build/aarch64-linux-gnu)
#   RUNTIME   - set to GPU to enable the OpenCL runtime
#   RUNMODE   - set to code to compile models into the engine (code mode)
#   LINARO_AARCH64_LINUX_GNU - root of the Linaro cross toolchain
if [[ -z "$BUILD_DIR" ]]; then
  BUILD_DIR=build/cmake-build/aarch64-linux-gnu
fi

MACE_ENABLE_OPENCL=OFF
if [[ "$RUNTIME" == "GPU" ]]; then
  MACE_ENABLE_OPENCL=ON
fi

MACE_ENABLE_CODE_MODE=OFF
if [[ "$RUNMODE" == "code" ]]; then
  MACE_ENABLE_CODE_MODE=ON
fi

mkdir -p "${BUILD_DIR}" && cd "${BUILD_DIR}"
cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \
      -DCMAKE_TOOLCHAIN_FILE=./cmake/toolchains/aarch64-linux-gnu.cmake \
      -DCMAKE_BUILD_TYPE=Release \
      -DMACE_ENABLE_NEON=ON \
      -DMACE_ENABLE_QUANTIZE=ON \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
      -DMACE_ENABLE_OPT_SIZE=ON \
      -DMACE_ENABLE_OBFUSCATE=ON \
      -DMACE_ENABLE_TESTS=ON \
      -DMACE_ENABLE_BENCHMARKS=ON \
      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
      -DCMAKE_INSTALL_PREFIX=install \
      ../../..
make -j6 VERBOSE=1 && make install
cd ../../..
#!/usr/bin/env bash
# BUGFIX: this script uses bash-only [[ ]] tests, so the shebang must be
# bash, not sh (dash and other POSIX shells fail on [[ ]]).
set -e

# Build MACE for ARM Linux gnueabihf (cross-compiled).
# Environment:
#   BUILD_DIR - output directory (default: build/cmake-build/arm-linux-gnueabihf)
#   RUNTIME   - set to GPU to enable the OpenCL runtime
#   RUNMODE   - set to code to compile models into the engine (code mode)
#   LINARO_ARM_LINUX_GNUEABIHF - root of the Linaro cross toolchain
if [[ -z "$BUILD_DIR" ]]; then
  BUILD_DIR=build/cmake-build/arm-linux-gnueabihf
fi

MACE_ENABLE_CODE_MODE=OFF
# Quote $RUNMODE for consistency with the sibling build scripts.
if [[ "$RUNMODE" == "code" ]]; then
  MACE_ENABLE_CODE_MODE=ON
fi

MACE_ENABLE_OPENCL=OFF
if [[ "$RUNTIME" == "GPU" ]]; then
  MACE_ENABLE_OPENCL=ON
fi

mkdir -p "${BUILD_DIR}" && cd "${BUILD_DIR}"
cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \
      -DCMAKE_TOOLCHAIN_FILE=./cmake/toolchains/arm-linux-gnueabihf.cmake \
      -DCMAKE_BUILD_TYPE=Release \
      -DMACE_ENABLE_NEON=ON \
      -DMACE_ENABLE_QUANTIZE=ON \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
      -DMACE_ENABLE_OPT_SIZE=ON \
      -DMACE_ENABLE_OBFUSCATE=ON \
      -DMACE_ENABLE_TESTS=ON \
      -DMACE_ENABLE_BENCHMARKS=ON \
      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
      -DCMAKE_INSTALL_PREFIX=install \
      ../../..
make -j6 VERBOSE=1 && make install
cd ../../..
#!/usr/bin/env bash
# BUGFIX: this script uses bash-only [[ ]] tests, so the shebang must be
# bash, not sh (dash and other POSIX shells fail on [[ ]]).
set -e

# Build MACE for Android arm64-v8a.
# Environment:
#   BUILD_DIR - output directory (default: build/cmake-build/arm64-v8a)
#   RUNTIME   - GPU / HEXAGON / HTA / APU to enable one extra runtime
#   RUNMODE   - set to code to compile models into the engine (code mode)
#   ANDROID_NDK_HOME - root of the Android NDK
if [[ -z "$BUILD_DIR" ]]; then
  BUILD_DIR=build/cmake-build/arm64-v8a
fi

MACE_ENABLE_OPENCL=OFF
MACE_ENABLE_HEXAGON_DSP=OFF
MACE_ENABLE_HEXAGON_HTA=OFF
MACE_ENABLE_MTK_APU=OFF
if [[ "$RUNTIME" == "GPU" ]]; then
  MACE_ENABLE_OPENCL=ON
elif [[ "$RUNTIME" == "HEXAGON" ]]; then
  MACE_ENABLE_HEXAGON_DSP=ON
elif [[ "$RUNTIME" == "HTA" ]]; then
  MACE_ENABLE_HEXAGON_HTA=ON
elif [[ "$RUNTIME" == "APU" ]]; then
  MACE_ENABLE_MTK_APU=ON
fi

MACE_ENABLE_CODE_MODE=OFF
if [[ "$RUNMODE" == "code" ]]; then
  MACE_ENABLE_CODE_MODE=ON
fi

mkdir -p "${BUILD_DIR}" && cd "${BUILD_DIR}"
cmake -DANDROID_ABI="arm64-v8a" \
      -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
      -DANDROID_NATIVE_API_LEVEL=28 \
      -DCMAKE_BUILD_TYPE=Release \
      -DCMAKE_ANDROID_STL_TYPE=c++_shared \
      -DMACE_ENABLE_NEON=ON \
      -DMACE_ENABLE_QUANTIZE=ON \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
      -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
      -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
      -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
      -DMACE_ENABLE_OPT_SIZE=ON \
      -DMACE_ENABLE_OBFUSCATE=ON \
      -DMACE_ENABLE_TESTS=ON \
      -DMACE_ENABLE_BENCHMARKS=ON \
      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
      -DCMAKE_INSTALL_PREFIX=install \
      ../../..
make -j6 VERBOSE=1 && make install
cd ../../..
#!/usr/bin/env bash
set -e

# Build MACE for Android armeabi-v7a.
# Environment:
#   BUILD_DIR - output directory (default: build/cmake-build/armeabi-v7a)
#   RUNTIME   - GPU / HEXAGON / HTA / APU to enable one extra runtime
#   RUNMODE   - set to code to compile models into the engine (code mode)
#   ANDROID_NDK_HOME - root of the Android NDK
BUILD_DIR="${BUILD_DIR:-build/cmake-build/armeabi-v7a}"

# All optional runtimes default to OFF; at most one is switched on.
MACE_ENABLE_OPENCL=OFF
MACE_ENABLE_HEXAGON_DSP=OFF
MACE_ENABLE_HEXAGON_HTA=OFF
MACE_ENABLE_MTK_APU=OFF
case "$RUNTIME" in
  GPU)     MACE_ENABLE_OPENCL=ON ;;
  HEXAGON) MACE_ENABLE_HEXAGON_DSP=ON ;;
  HTA)     MACE_ENABLE_HEXAGON_HTA=ON ;;
  APU)     MACE_ENABLE_MTK_APU=ON ;;
esac

MACE_ENABLE_CODE_MODE=OFF
case "$RUNMODE" in
  code) MACE_ENABLE_CODE_MODE=ON ;;
esac

mkdir -p "${BUILD_DIR}" && cd "${BUILD_DIR}"
cmake -DANDROID_ABI="armeabi-v7a" \
      -DANDROID_ARM_NEON=ON \
      -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
      -DANDROID_NATIVE_API_LEVEL=28 \
      -DCMAKE_BUILD_TYPE=Release \
      -DCMAKE_ANDROID_STL_TYPE=c++_shared \
      -DMACE_ENABLE_NEON=ON \
      -DMACE_ENABLE_QUANTIZE=ON \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
      -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
      -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
      -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
      -DMACE_ENABLE_OPT_SIZE=ON \
      -DMACE_ENABLE_OBFUSCATE=ON \
      -DMACE_ENABLE_TESTS=ON \
      -DMACE_ENABLE_BENCHMARKS=ON \
      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
      -DCMAKE_INSTALL_PREFIX=install \
      ../../..
make -j6 VERBOSE=1 && make install
cd ../../..
#!/usr/bin/env bash
# BUGFIX: this script uses bash-only [[ ]] tests, so the shebang must be
# bash, not sh (dash and other POSIX shells fail on [[ ]]).
set -e

# Build MACE for the host machine (no NEON/quantize/OpenCL).
# Environment:
#   BUILD_DIR - output directory (default: build/cmake-build/host)
#   RUNMODE   - set to code to compile models into the engine (code mode)
if [[ -z "$BUILD_DIR" ]]; then
  BUILD_DIR=build/cmake-build/host
fi

MACE_ENABLE_CODE_MODE=OFF
if [[ "$RUNMODE" == "code" ]]; then
  MACE_ENABLE_CODE_MODE=ON
fi

mkdir -p "${BUILD_DIR}" && cd "${BUILD_DIR}"
cmake -DMACE_ENABLE_NEON=OFF \
      -DMACE_ENABLE_QUANTIZE=OFF \
      -DMACE_ENABLE_OPENCL=OFF \
      -DMACE_ENABLE_TESTS=ON \
      -DMACE_ENABLE_BENCHMARKS=ON \
      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
      -DCMAKE_INSTALL_PREFIX=install \
      ../../..
make -j6 VERBOSE=1 && make install
cd ../../..
......@@ -224,7 +224,7 @@ def get_opencl_mode(configs):
def get_quantize_mode(configs):
for model_name in configs[YAMLKeyword.models]:
quantize =\
quantize = \
configs[YAMLKeyword.models][model_name].get(
YAMLKeyword.quantize, 0)
if quantize == 1:
......@@ -297,8 +297,8 @@ def get_model_files(model_config, model_output_dir):
if sha256_checksum(model_file) != model_sha256_checksum:
error_info = model_file_path + \
" model file sha256checksum not match " + \
model_sha256_checksum
" model file sha256checksum not match " + \
model_sha256_checksum
MaceLogger.error(ModuleName.MODEL_CONVERTER, error_info)
if weight_file_path.startswith("http://") or \
......@@ -316,8 +316,8 @@ def get_model_files(model_config, model_output_dir):
if weight_file:
if sha256_checksum(weight_file) != weight_sha256_checksum:
error_info = weight_file_path + \
" weight file sha256checksum not match " + \
weight_sha256_checksum
" weight file sha256checksum not match " + \
weight_sha256_checksum
MaceLogger.error(ModuleName.MODEL_CONVERTER, error_info)
if quantize_range_file_path.startswith("http://") or \
......@@ -547,7 +547,7 @@ def format_model_config(flags):
[])
if input_data_formats:
if not isinstance(input_data_formats, list):
subgraph[YAMLKeyword.input_data_formats] =\
subgraph[YAMLKeyword.input_data_formats] = \
[input_data_formats] * input_size
else:
mace_check(len(input_data_formats)
......@@ -555,7 +555,7 @@ def format_model_config(flags):
ModuleName.YAML_CONFIG,
"input_data_formats should match"
" the size of input.")
for input_data_format in\
for input_data_format in \
subgraph[YAMLKeyword.input_data_formats]:
mace_check(input_data_format in DataFormatStrs,
ModuleName.YAML_CONFIG,
......@@ -578,14 +578,14 @@ def format_model_config(flags):
ModuleName.YAML_CONFIG,
"output_data_formats should match"
" the size of output")
for output_data_format in\
for output_data_format in \
subgraph[YAMLKeyword.output_data_formats]:
mace_check(output_data_format in DataFormatStrs,
ModuleName.YAML_CONFIG,
"'output_data_formats' must be in "
+ str(DataFormatStrs))
else:
subgraph[YAMLKeyword.output_data_formats] =\
subgraph[YAMLKeyword.output_data_formats] = \
[DataFormat.NHWC] * output_size
validation_threshold = subgraph.get(
......@@ -767,6 +767,7 @@ def print_library_summary(configs):
def convert_func(flags):
configs = config_parser.parse(flags.config)
print(configs)
library_name = configs[YAMLKeyword.library_name]
if not os.path.exists(BUILD_OUTPUT_DIR):
os.makedirs(BUILD_OUTPUT_DIR)
......@@ -817,26 +818,27 @@ def convert_func(flags):
for model_name, model_config in configs[YAMLKeyword.models].items():
model_codegen_dir = "%s/%s" % (MODEL_CODEGEN_DIR, model_name)
encrypt.encrypt(model_name,
"%s/%s.pb" % (model_codegen_dir, model_name),
"%s/%s.data" % (model_codegen_dir, model_name),
model_config[YAMLKeyword.runtime],
"%s/model/%s.pb" % (model_codegen_dir, model_name),
"%s/model/%s.data" % (model_codegen_dir, model_name),
config_parser.parse_device_type(
model_config[YAMLKeyword.runtime]),
model_codegen_dir,
bool(model_config.get(YAMLKeyword.obfuscate, 1)))
bool(model_config.get(YAMLKeyword.obfuscate, 1)),
model_graph_format == "code",
model_data_format == "code")
if model_graph_format == ModelFormat.file:
sh.mv("-f",
'%s/file/%s.pb' % (model_codegen_dir, model_name),
'%s/model/%s.pb' % (model_codegen_dir, model_name),
model_output_dir)
sh.mv("-f",
'%s/file/%s.data' % (model_codegen_dir, model_name),
'%s/model/%s.data' % (model_codegen_dir, model_name),
model_output_dir)
sh.rm("-rf", '%s/code' % model_codegen_dir)
else:
if not embed_model_data:
sh.mv("-f",
'%s/file/%s.data' % (model_codegen_dir, model_name),
'%s/model/%s.data' % (model_codegen_dir, model_name),
model_output_dir)
sh.rm('%s/code/tensor_data.cc' % model_codegen_dir)
sh.cp("-f", glob.glob("mace/codegen/models/*/code/*.h"),
model_header_dir)
......
# MACE Build and Test Tools
## Clear Workspace
Before you do anything, clear the workspace used by the build and test process.
```bash
tools/clear_workspace.sh
```
## Build Engine
Please make sure you have CMake installed.
```bash
RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
```
which generates libraries in `build/cmake-build/armeabi-v7a`; you can use either the static libraries or the `libmace.so` shared library.
You can also build for other target abis.
The default build command builds an engine that runs on CPU. You can modify the CMake file to support other hardware, or you can simply set an environment variable before building.
```bash
RUNTIME: GPU/HEXAGON/HTA/APU
```
## Model Conversion
When you have prepared your model, the first thing to do is write a model config.
```yaml
models:
mobilenet_v1:
platform: tensorflow
model_file_path: https://cnbj1.fds.api.xiaomi.com/mace/miai-models/mobilenet-v1/mobilenet-v1-1.0.pb
model_sha256_checksum: 71b10f540ece33c49a7b51f5d4095fc9bd78ce46ebf0300487b2ee23d71294e6
subgraphs:
- input_tensors:
- input
input_shapes:
- 1,224,224,3
output_tensors:
- MobilenetV1/Predictions/Reshape_1
output_shapes:
- 1,1001
runtime: gpu
```
The following steps generate output to `build` directory which is the default build and test workspace.
Suppose you have the model config in `../mace-models/mobilenet-v1/mobilenet-v1.yml`. Then run
```bash
python tools/python/convert.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml
```
which generates 4 files in `build/mobilenet_v1/model/`
```
├── mobilenet_v1.pb (model file)
├── mobilenet_v1.data (param file)
├── mobilenet_v1_index.html (visualization page, you can open it in browser)
└── mobilenet_v1.pb_txt (model text file, which can be for debug use)
```
## Model Test and Benchmark
After model is converted, simply run
```bash
python tools/python/run_model.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml --validate
```
Or benchmark the model
```bash
python tools/python/run_model.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml --benchmark
```
It will test your model on the device configured in the model config (`runtime`).
You can also test on another device by specifying `--runtime=cpu (dsp/hta/apu)` if you have previously built the engine for that device.
The log will be shown if `--vlog_level=2` is specified.
## Encrypt Model (optional)
Model can be encrypted by obfuscation.
```bash
python tools/python/encrypt.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml
```
It will override `mobilenet_v1.pb` and `mobilenet_v1.data`.
If you want to compile the model into a library, you should use the options `--gencode_model --gencode_param` to generate model code, i.e.,
```bash
python tools/python/encrypt.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml --gencode_model --gencode_param
```
It will generate model code into `mace/codegen/models` and also generate a helper function `CreateMaceEngineFromCode` in `mace/codegen/engine/mace_engine_factory.h` by which you can create an engine with models built in it.
After that you can rebuild the engine.
```bash
RUNTIME=GPU RUNMODE=code bash tools/cmake/cmake-build-armeabi-v7a.sh
```
`RUNMODE=code` means you compile and link model library with MACE engine.
When you test the model in code format, you should specify it in the script as follows.
```bash
python tools/python/run_model.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml --gencode_model --gencode_param
```
Of course you can generate model code only, and use parameter file.
## Precompile OpenCL (optional)
After you test model on GPU, it will generate compiled OpenCL binary file automatically in `build/mobilenet_v1/opencl` directory.
```bash
└── mobilenet_v1_compiled_opencl_kernel.MIX2S.sdm845.bin
```
It specifies your test platform model and SoC. You can use it in production to accelerate the initialization.
## Auto Tune OpenCL kernels (optional)
MACE can auto tune OpenCL kernels used by models. You can specify `--tune` option.
```bash
python tools/python/run_model.py --config ../mace-models/mobilenet-v1/mobilenet-v1.yml --tune
```
It will generate OpenCL tuned parameter binary file in `build/mobilenet_v1/opencl` directory.
```bash
└── mobilenet_v1_tuned_opencl_parameter.MIX2S.sdm845.bin
```
It specifies your test platform model and SoC. You can use it in production to reduce latency on GPU.
## Multi Model Support (optional)
If multiple models are configured in the config file, then after you test them, more than one tuned parameter file will be generated.
Then you need to merge them together.
```bash
python tools/python/gen_opencl.py
```
After that, it will generate one set of files into `build/opencl` directory.
```bash
├── compiled_opencl_kernel.bin
└── tuned_opencl_parameter.bin
```
You can also generate code into the engine by specifying `--gencode`, after which you should rebuild the engine.
......@@ -12,63 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# python tools/python/convert.py \
# --config ../mace-models/mobilenet-v2/mobilenet-v2.yml
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import numpy as np
import shutil
import tempfile
from utils import config_parser
from utils.config_parser import DataFormat
from utils.config_parser import DeviceType
from utils.config_parser import Platform
from utils import util
from utils.util import mace_check
from utils.config_parser import normalize_model_config
from utils.config_parser import ModelKeys
from py_proto import mace_pb2
from transform import base_converter as cvt
from transform import transformer
from visualize import visualize_model
device_type_map = {'cpu': cvt.DeviceType.CPU.value,
'gpu': cvt.DeviceType.GPU.value,
'dsp': cvt.DeviceType.HEXAGON.value,
'hta': cvt.DeviceType.HTA.value,
'apu': cvt.DeviceType.APU.value,
'cpu+gpu': cvt.DeviceType.CPU.value}
data_format_map = {
'NONE': cvt.DataFormat.NONE,
'NHWC': cvt.DataFormat.NHWC,
'NCHW': cvt.DataFormat.NCHW,
'OIHW': cvt.DataFormat.OIHW,
}
data_type_map = {
'float32': mace_pb2.DT_FLOAT,
'int32': mace_pb2.DT_INT32,
}
def parse_data_type(data_type, quantize):
if quantize or data_type == 'fp32_fp32':
return mace_pb2.DT_FLOAT
else:
return mace_pb2.DT_HALF
def split_shape(shape):
if shape.strip() == "":
return []
else:
return shape.split(',')
def parse_int_array_from_str(ints_str):
return [int(i) for i in split_shape(ints_str)]
def parse_float_array_from_str(floats_str):
return [float(i) for i in floats_str.split(',')]
def transpose_shape(shape, dst_order):
t_shape = [0] * len(shape)
......@@ -77,52 +45,32 @@ def transpose_shape(shape, dst_order):
return t_shape
def to_list(x):
if isinstance(x, list):
return x
else:
return [x]
def separate_params(mace_model):
tensors = mace_model.tensors
params = mace_pb2.NetDef()
params.tensors.extend(tensors)
model = mace_model
del model.tensors[:]
return model, params
def convert(conf, output):
if not os.path.exists(output):
os.mkdir(output)
for model_name, model_conf in conf["models"].items():
model_output = output + "/" + model_name
if not os.path.exists(model_output):
os.mkdir(model_output)
subgraph = model_conf["subgraphs"][0]
del model_conf["subgraphs"]
model_conf.update(subgraph)
model_file = util.download_or_get_file(model_conf["model_file_path"],
model_conf[
"model_sha256_checksum"],
model_output)
model_conf["model_file_path"] = model_file
if "weight_file_path" in model_conf:
weight_file = util.download_or_get_file(
model_conf["weight_file_path"],
model_conf["weight_sha256_checksum"], model_output)
model_conf["weight_file_path"] = weight_file
model_output = output + "/" + model_name + "/model"
org_model_dir = output + "/" + model_name + "/org_model"
util.mkdir_p(model_output)
util.mkdir_p(org_model_dir)
model_conf = normalize_model_config(model_conf)
model_file = util.download_or_get_model(
model_conf[ModelKeys.model_file_path], # noqa
model_conf[ModelKeys.model_sha256_checksum], # noqa
output + "/" + model_name + "/org_model")
model_conf[ModelKeys.model_file_path] = model_file
if ModelKeys.weight_file_path in model_conf:
weight_file = util.download_or_get_model(
model_conf[ModelKeys.weight_file_path],
model_conf[ModelKeys.weight_sha256_checksum], "/tmp/")
model_conf[ModelKeys.weight_file_path] = weight_file
# TODO: remove the following after quantize tool is made
if "quantize_range_file" in model_conf:
range_file = util.download_or_get_file(
model_conf["quantize_range_file"],
if ModelKeys.quantize_range_file in model_conf:
range_file = util.download_or_get_model(
model_conf[ModelKeys.quantize_range_file],
"", model_output)
model_conf["quantize_range_file"] = range_file
model_conf[ModelKeys.quantize_range_file] = range_file
mace_model = convert_model(model_conf)
......@@ -132,7 +80,7 @@ def convert(conf, output):
model_output)
visualizer.save_html()
except: # noqa
print("Failed to visualize model:", sys.exc_info()[0])
print("Failed to visualize model:", sys.exc_info())
model, params = merge_params(mace_model)
......@@ -147,115 +95,64 @@ def convert(conf, output):
def convert_model(conf):
print(conf)
platform = conf["platform"]
mace_check(platform in ['tensorflow', 'caffe', 'onnx'],
"platform not supported")
runtime = conf["runtime"]
mace_check(
runtime in ['cpu', 'gpu', 'dsp', 'hta', 'apu', 'cpu+gpu'],
"runtime not supported")
option = cvt.ConverterOption()
if "graph_optimize_options" in conf:
option.transformer_option = conf["graph_optimize_options"]
option.winograd = conf.get("winograd", 0)
option.quantize = bool(conf.get("quantize", 0))
option.quantize_large_weights = bool(conf.get("quantize_large_weights", 0))
option.quantize_range_file = conf.get("quantize_range_file", "")
option.change_concat_ranges = bool(conf.get("change_concat_ranges", 0))
option.cl_mem_type = conf.get("cl_mem_type", "image")
option.device = device_type_map[conf.get("runtime", "cpu")]
option.data_type = parse_data_type(conf.get("data_type", "fp16_fp32"),
option.quantize)
input_tensors = to_list(conf["input_tensors"])
input_shapes = [parse_int_array_from_str(shape) for shape in
to_list(conf["input_shapes"])]
mace_check(len(input_tensors) == len(input_shapes),
"input node count and shape count do not match")
input_count = len(input_tensors)
input_data_types = [data_type_map[dt] for dt in
to_list(conf.get("input_data_types",
["float32"]))]
if len(input_data_types) == 1 and input_count > 1:
input_data_types = [input_data_types[0]] * input_count
mace_check(len(input_data_types) == input_count,
"the number of input_data_types should be "
"the same as input tensors")
input_data_formats = [data_format_map[df] for df in
to_list(conf.get("input_data_formats",
["NHWC"]))]
if len(input_data_formats) == 1 and input_count > 1:
input_data_formats = [input_data_formats[0]] * input_count
mace_check(len(input_data_formats) == input_count,
"the number of input_data_formats should be "
"the same as input tensors")
input_ranges = [parse_float_array_from_str(r) for r in
to_list(conf.get("input_ranges",
["-1.0,1.0"]))]
if len(input_ranges) == 1 and input_count > 1:
input_ranges = [input_ranges[0]] * input_count
mace_check(len(input_ranges) == input_count,
"the number of input_ranges should be "
"the same as input tensors")
for i in range(len(input_tensors)):
if ModelKeys.graph_optimize_options in conf:
option.transformer_option = conf[ModelKeys.graph_optimize_options]
if ModelKeys.winograd in conf:
option.winograd = conf[ModelKeys.winograd]
if ModelKeys.quantize in conf:
option.quantize = conf[ModelKeys.quantize]
if ModelKeys.quantize_large_weights in conf:
option.quantize_large_weights = conf[ModelKeys.quantize_large_weights]
if ModelKeys.quantize_range_file in conf:
option.quantize_range_file = conf[ModelKeys.quantize_range_file]
if ModelKeys.change_concat_ranges in conf:
option.change_concat_ranges = conf[ModelKeys.change_concat_ranges]
if ModelKeys.cl_mem_type in conf:
option.cl_mem_type = conf[ModelKeys.cl_mem_type]
if ModelKeys.runtime in conf:
option.device = conf[ModelKeys.runtime]
if option.device == DeviceType.CPU_GPU:
# when convert, cpu and gpu share the same model
option.device = DeviceType.CPU
# we don't need `value`, but to be consistent with legacy code
# used by `base_converter`
option.device = option.device.value
option.data_type = conf[ModelKeys.data_types]
for i in range(len(conf[ModelKeys.input_tensors])):
input_node = cvt.NodeInfo()
input_node.name = input_tensors[i]
input_node.shape = input_shapes[i]
input_node.data_type = input_data_types[i]
input_node.data_format = input_data_formats[i]
if (input_node.data_format == cvt.DataFormat.NCHW and len(
input_node.shape) == 4):
input_node.name = conf[ModelKeys.input_tensors][i]
input_node.shape = conf[ModelKeys.input_shapes][i]
input_node.data_type = conf[ModelKeys.input_data_types][i]
input_node.data_format = conf[ModelKeys.input_data_formats][i]
if (input_node.data_format == DataFormat.NCHW and len(
input_node.shape) == 4):
input_node.shape = transpose_shape(input_node.shape, [0, 2, 3, 1])
input_node.data_format = cvt.DataFormat.NHWC
input_node.range = input_ranges[i]
input_node.data_format = DataFormat.NHWC
input_node.range = conf[ModelKeys.input_ranges][i]
option.add_input_node(input_node)
output_tensors = to_list(conf["output_tensors"])
output_shapes = [parse_int_array_from_str(shape) for shape in
to_list(conf["output_shapes"])]
mace_check(len(output_tensors) == len(output_shapes),
"output node count and shape count do not match")
output_count = len(output_tensors)
output_data_types = [data_type_map[dt] for dt in
to_list(conf.get("output_data_types",
["float32"]))]
if len(output_data_types) == 1 and output_count > 1:
output_data_types = [output_data_types[0]] * output_count
mace_check(len(output_data_types) == output_count,
"the number of output_data_types should be "
"the same as output tensors")
output_data_formats = [data_format_map[df] for df in
to_list(conf.get("output_data_formats",
["NHWC"]))]
if len(output_data_formats) == 1 and output_count > 1:
output_data_formats = [output_data_formats[0]] * output_count
mace_check(len(output_data_formats) == output_count,
"the number of output_data_formats should be "
"the same as output tensors")
for i in range(len(output_tensors)):
for i in range(len(conf[ModelKeys.output_tensors])):
output_node = cvt.NodeInfo()
output_node.name = output_tensors[i]
output_node.shape = output_shapes[i]
output_node.data_type = output_data_types[i]
output_node.data_format = output_data_formats[i]
if output_node.data_format == cvt.DataFormat.NCHW and len(
output_node.name = conf[ModelKeys.output_tensors][i]
output_node.shape = conf[ModelKeys.output_shapes][i]
output_node.data_type = conf[ModelKeys.output_data_types][i]
output_node.data_format = conf[ModelKeys.output_data_formats][i]
if output_node.data_format == DataFormat.NCHW and len(
output_node.shape) == 4:
output_node.shape = transpose_shape(output_node.shape,
[0, 2, 3, 1])
output_node.data_format = cvt.DataFormat.NHWC
output_node.data_format = DataFormat.NHWC
option.add_output_node(output_node)
if "check_tensors" in conf:
check_tensors = to_list(conf["check_tensors"])
check_tensors_shapes = [parse_int_array_from_str(shape) for shape in
to_list(conf["check_shapes"])]
mace_check(len(check_tensors) == len(check_tensors_shapes),
"check tensors count and shape count do not match.")
for i in range(len(check_tensors)):
if ModelKeys.check_tensors in conf:
for i in range(len(conf[ModelKeys.check_tensors])):
check_node = cvt.NodeInfo()
check_node.name = check_tensors[i]
check_node.shape = check_tensors_shapes[i]
check_node.name = conf[ModelKeys.check_tensors][i]
check_node.shape = conf[ModelKeys.check_shapes][i]
option.add_check_node(check_node)
else:
option.check_nodes = option.output_nodes
......@@ -263,17 +160,17 @@ def convert_model(conf):
option.build()
print("Transform model to one that can better run on device")
if platform == 'tensorflow':
platform = conf[ModelKeys.platform]
if platform == Platform.TENSORFLOW:
from transform import tensorflow_converter
converter = tensorflow_converter.TensorflowConverter(
option, conf["model_file_path"])
elif platform == 'caffe':
elif platform == Platform.CAFFE:
from transform import caffe_converter
converter = caffe_converter.CaffeConverter(option,
conf["model_file_path"],
conf["weight_file_path"])
elif platform == 'onnx':
elif platform == Platform.ONNX:
from transform import onnx_converter
converter = onnx_converter.OnnxConverter(option,
conf["model_file_path"])
......@@ -285,14 +182,15 @@ def convert_model(conf):
option, output_graph_def)
output_graph_def, quantize_activation_info = mace_transformer.run()
if option.device in [cvt.DeviceType.HEXAGON.value,
cvt.DeviceType.HTA.value]:
runtime = conf[ModelKeys.runtime]
if runtime in [DeviceType.HEXAGON,
DeviceType.HTA]:
from transform import hexagon_converter
converter = hexagon_converter.HexagonConverter(
option, output_graph_def, quantize_activation_info)
output_graph_def = converter.run()
elif runtime == 'apu':
mace_check(platform == "tensorflow",
elif runtime == DeviceType.APU:
mace_check(platform == Platform.TENSORFLOW,
"apu only support model from tensorflow")
from transform import apu_converter
converter = apu_converter.ApuConverter(
......@@ -366,7 +264,7 @@ def parse_args():
parser.add_argument(
'--output',
type=str,
default=".",
default="build",
help="output dir")
flgs, _ = parser.parse_known_args()
return flgs
......
......@@ -22,10 +22,13 @@ import os
import hashlib
from jinja2 import Environment, FileSystemLoader
from py_proto import mace_pb2
from utils import device
from utils import util
from transform import base_converter as cvt
from utils.util import mace_check
from utils.util import MaceLogger
from utils import config_parser
from utils.config_parser import CPP_KEYWORDS
from utils.config_parser import ModelKeys
GENERATED_NAME = set()
......@@ -99,9 +102,8 @@ def obfuscate_name(model):
def save_model_to_code(namespace, model, params, model_checksum,
params_checksum, device, output):
if not os.path.exists(output):
os.mkdir(output)
params_checksum, device, output, gencode_params):
util.mkdir_p(output)
cwd = os.path.dirname(__file__)
j2_env = Environment(
loader=FileSystemLoader(cwd + "/template"), trim_blocks=True)
......@@ -120,24 +122,18 @@ def save_model_to_code(namespace, model, params, model_checksum,
f.write(source)
counter += 1
template_name = "tensor_data.jinja2"
source = j2_env.get_template(template_name).render(
tag=namespace,
model_data_size=len(params),
model_data=params)
with open(output + "/tensor_data.cc", "w") as f:
f.write(source)
if gencode_params:
template_name = "tensor_data.jinja2"
source = j2_env.get_template(template_name).render(
tag=namespace,
model_data_size=len(params),
model_data=params)
with open(output + "/tensor_data.cc", "w") as f:
f.write(source)
template_name = "operator.jinja2"
counter = 0
op_size = len(model.op)
try:
device = cvt.DeviceType[device.upper()]
except: # noqa
if device.upper() == "DSP":
device = cvt.DeviceType.HEXAGON
else:
device = cvt.DeviceType.CPU
for start in range(0, op_size, 10):
source = j2_env.get_template(template_name).render(
......@@ -170,8 +166,7 @@ def save_model_to_code(namespace, model, params, model_checksum,
def save_model_to_file(model_name, model, params, output):
if not os.path.exists(output):
os.mkdir(output)
util.mkdir_p(output)
with open(output + "/" + model_name + ".pb", "wb") as f:
f.write(model.SerializeToString())
with open(output + "/" + model_name + ".data", "wb") as f:
......@@ -179,7 +174,7 @@ def save_model_to_file(model_name, model, params, output):
def encrypt(model_name, model_file, params_file, device, output,
is_obfuscate=False):
is_obfuscate=False, gencode_model=False, gencode_params=False):
model_checksum = util.file_checksum(model_file)
params_checksum = util.file_checksum(params_file)
......@@ -191,9 +186,11 @@ def encrypt(model_name, model_file, params_file, device, output,
if is_obfuscate:
obfuscate_name(model)
save_model_to_file(model_name, model, params, output + "/file/")
save_model_to_code(model_name, model, params, model_checksum,
params_checksum, device, output + "/code/")
save_model_to_file(model_name, model, params, output)
if gencode_model:
save_model_to_code(model_name, model, params, model_checksum,
params_checksum, device, output + "/code/",
gencode_params)
def parse_args():
......@@ -216,22 +213,89 @@ def parse_args():
default='cpu',
help="cpu/gpu/hexagon/hta/apu")
parser.add_argument(
'--output',
'--config',
type=str,
default=".",
help="output dir")
help="model config")
parser.add_argument(
"--obfuscate",
"--no_obfuscate",
action="store_true",
help="obfuscate model names")
parser.add_argument(
"--gencode_model",
action="store_true",
help="generate model code")
parser.add_argument(
"--gencode_param",
action="store_true",
help="generate params code")
parser.add_argument(
'--output',
type=str,
default="build",
help="output dir")
flgs, _ = parser.parse_known_args()
mace_check(flags.model_name not in CPP_KEYWORDS, "model name cannot be cpp"
"keywords")
mace_check(flgs.model_name not in CPP_KEYWORDS, "model name cannot be cpp"
"keywords")
return flgs
def gen_mace_engine_factory(model_name, embed_model_data, output):
    """Render mace_engine_factory.h for the given model tags.

    Args:
        model_name: iterable of model tag strings (converted to a list
            before rendering).
        embed_model_data: truthy when model weights are embedded as code,
            which switches the template to the LoadModelData() path.
        output: directory that will receive mace_engine_factory.h
            (created if missing).
    """
    util.mkdir_p(output)
    template_dir = os.path.dirname(__file__) + "/template"
    env = Environment(
        loader=FileSystemLoader(template_dir), trim_blocks=True)
    tags = list(model_name)
    header = env.get_template('mace_engine_factory.h.jinja2').render(
        model_tags=tags,
        embed_model_data=embed_model_data,
    )
    with open(output + '/mace_engine_factory.h', "w") as out:
        out.write(header)
if __name__ == '__main__':
flags = parse_args()
encrypt(flags.model_name, flags.model_file, flags.params_file,
flags.device, flags.output, flags.obfuscate)
codegen_dir = "mace/codegen/models"
device.execute("rm -rf %s/*" % codegen_dir)
models = []
if flags.config:
conf = config_parser.parse(flags.config)
for name, model_conf in conf["models"].items():
model_conf = config_parser.normalize_model_config(model_conf)
if not flags.model_name or name == flags.model_name:
MaceLogger.info("Encrypt model %s" % name)
encrypt(name,
"build/%s/model/%s.pb" % (name, name),
"build/%s/model/%s.data" % (name, name),
model_conf[ModelKeys.runtime],
codegen_dir + "/" + name,
not flags.no_obfuscate,
flags.gencode_model,
flags.gencode_param)
models.append(name)
os.rename("%s/%s/%s.pb" % (codegen_dir, name, name),
"build/%s/model/%s.pb" % (name, name))
os.rename("%s/%s/%s.data" % (codegen_dir, name, name),
"build/%s/model/%s.data" % (name, name))
else:
device_type = config_parser.parse_device_type(flags.device)
encrypt(flags.model_name, flags.model_file, flags.params_file,
device_type, codegen_dir, not flags.no_obfuscate,
flags.gencode_model, flags.gencode_param)
models.append(flags.model_name)
os.rename(
"%s/%s/%s.pb" % (codegen_dir, flags.model_name, flags.model_name),
"build/%s/model/%s.pb" % (flags.model_name, flags.model_name))
os.rename(
"%s/%s/%s.data" % (codegen_dir, flags.model_name,
flags.model_name),
"build/%s/model/%s.data" % (flags.model_name, flags.model_name))
if flags.gencode_model:
gen_mace_engine_factory(models, flags.gencode_param,
"mace/codegen/engine")
# Copyright 2019 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import jinja2
import os
import struct
import numpy as np
from utils import util
from utils.util import MaceLogger
from utils.util import mace_check
def generate_opencl_code(binary_file_name, load_func_name, size_func_name,
                         output_path):
    """Embed an OpenCL binary file into a generated C++ source file.

    Reads `binary_file_name` as raw bytes (empty array when the file does
    not exist) and renders template/file_binary.cc.jinja2, which defines a
    loader function returning the byte array and a size function.

    Args:
        binary_file_name: path to the binary blob to embed; may not exist.
        load_func_name: name of the generated data-accessor function.
        size_func_name: name of the generated size function.
        output_path: destination .cc file; any existing file is replaced.
    """
    if os.path.exists(binary_file_name):
        data = np.fromfile(binary_file_name, dtype=np.uint8)
    else:
        data = []
    template_dir = os.path.dirname(__file__) + "/template"
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(template_dir))
    source = env.get_template('file_binary.cc.jinja2').render(
        data=data,
        data_size=len(data),
        load_func_name=load_func_name,
        size_func_name=size_func_name)
    # Remove then recreate so a stale generated file never survives.
    if os.path.exists(output_path):
        os.remove(output_path)
    with open(output_path, "w") as out:
        out.write(source)
def merge_opencl_binaries(opencl_binaries,
                          output_file):
    # Merge several precompiled OpenCL kernel binaries into one file.
    #
    # On-disk format (both input and output):
    #   [uint64 entry_count] then per entry:
    #   [int32 key_size][key bytes][int32 value_size][value bytes]
    #
    # Later files overwrite earlier values for duplicate keys, except the
    # platform-info entry, which must be identical across all inputs.
    #
    # NOTE(review): `key` comes from struct.unpack, so under Python 3 it is
    # bytes while `platform_info_key` is str — the equality below only
    # matches under Python 2. Confirm intended interpreter version.
    platform_info_key = 'mace_opencl_precompiled_platform_info_key'
    kvs = {}
    for binary in opencl_binaries:
        if not os.path.exists(binary):
            # Missing inputs are skipped with a warning, not an error.
            MaceLogger.warning("OpenCL bin %s not found" % binary)
            continue
        with open(binary, "rb") as f:
            binary_array = np.fromfile(f, dtype=np.uint8)
        # Manual cursor-based parse of the serialized key/value records.
        idx = 0
        size, = struct.unpack("Q", binary_array[idx:idx + 8])
        idx += 8
        for _ in range(size):
            key_size, = struct.unpack("i", binary_array[idx:idx + 4])
            idx += 4
            key, = struct.unpack(
                str(key_size) + "s", binary_array[idx:idx + key_size])
            idx += key_size
            value_size, = struct.unpack("i", binary_array[idx:idx + 4])
            idx += 4
            if key == platform_info_key and key in kvs:
                # All merged binaries must be compiled for the same
                # OpenCL platform; abort on mismatch.
                mace_check(
                    (kvs[key] == binary_array[idx:idx + value_size]).all(),
                    "There exists more than one OpenCL version for models:"
                    " %s vs %s " %
                    (kvs[key], binary_array[idx:idx + value_size]))
            else:
                kvs[key] = binary_array[idx:idx + value_size]
            idx += value_size
    # Re-serialize the merged map in the same format.
    output_byte_array = bytearray()
    data_size = len(kvs)
    output_byte_array.extend(struct.pack("Q", data_size))
    for key, value in kvs.items():
        key_size = len(key)
        output_byte_array.extend(struct.pack("i", key_size))
        output_byte_array.extend(struct.pack(str(key_size) + "s", key))
        value_size = len(value)
        output_byte_array.extend(struct.pack("i", value_size))
        output_byte_array.extend(value)
    np.array(output_byte_array).tofile(output_file)
def merge_opencl_parameters(params_files,
                            output_file):
    # Merge tuned OpenCL parameter files into one file.
    #
    # Same serialized key/value format as merge_opencl_binaries:
    #   [uint64 entry_count] then per entry:
    #   [int32 key_size][key bytes][int32 value_size][value bytes]
    # Unlike the binary merge, there is no platform-consistency check:
    # duplicate keys are simply overwritten by later files.
    kvs = {}
    for params in params_files:
        if not os.path.exists(params):
            # Missing inputs are skipped with a warning, not an error.
            MaceLogger.warning("Tune param %s not found" % params)
            continue
        with open(params, "rb") as f:
            binary_array = np.fromfile(f, dtype=np.uint8)
        # Cursor-based parse of the serialized records.
        idx = 0
        size, = struct.unpack("Q", binary_array[idx:idx + 8])
        idx += 8
        for _ in range(size):
            key_size, = struct.unpack("i", binary_array[idx:idx + 4])
            idx += 4
            key, = struct.unpack(
                str(key_size) + "s", binary_array[idx:idx + key_size])
            idx += key_size
            value_size, = struct.unpack("i", binary_array[idx:idx + 4])
            idx += 4
            kvs[key] = binary_array[idx:idx + value_size]
            idx += value_size
    # Re-serialize the merged map in the same format.
    output_byte_array = bytearray()
    data_size = len(kvs)
    output_byte_array.extend(struct.pack("Q", data_size))
    for key, value in kvs.items():
        key_size = len(key)
        output_byte_array.extend(struct.pack("i", key_size))
        output_byte_array.extend(struct.pack(str(key_size) + "s", key))
        value_size = len(value)
        output_byte_array.extend(struct.pack("i", value_size))
        output_byte_array.extend(value)
    np.array(output_byte_array).tofile(output_file)
def parse_args():
    """Parse command-line flags for the OpenCL merge tool.

    Returns the parsed namespace; unrecognized arguments are ignored.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--binary_files',
                            type=str,
                            default="",
                            help="opencl binary files")
    arg_parser.add_argument('--tuning_files',
                            type=str,
                            default="",
                            help="tuning params files")
    arg_parser.add_argument('--output',
                            type=str,
                            default="build",
                            help="output dir")
    arg_parser.add_argument("--gencode",
                            action="store_true",
                            help="generate code")
    known_flags, _unused = arg_parser.parse_known_args()
    return known_flags
if __name__ == '__main__':
    # Merge per-device OpenCL compiled kernels and tuned parameters into
    # single files under <output>/opencl, optionally generating C++ code
    # that embeds them (for builds without external resource files).
    flags = parse_args()
    util.mkdir_p(flags.output)
    opencl_binary_files = []
    if flags.binary_files:
        opencl_binary_files = flags.binary_files.split(",")
    opencl_tuning_files = []
    if flags.tuning_files:
        opencl_tuning_files = flags.tuning_files.split(",")

    compiled_opencl_kernel_prefix = "compiled_opencl_kernel"
    tuned_opencl_parameter_prefix = "tuned_opencl_parameter"
    if not opencl_binary_files and not opencl_tuning_files:
        # No explicit inputs: scan the build tree for files produced by
        # previous runs, recognized by their name prefixes.
        for root, dirs, files in os.walk("build", topdown=False):
            for name in files:
                if compiled_opencl_kernel_prefix in name:
                    opencl_binary_files.append(os.path.join(root, name))
                elif tuned_opencl_parameter_prefix in name:
                    opencl_tuning_files.append(os.path.join(root, name))

    opencl_dir = flags.output + "/opencl"
    util.mkdir_p(opencl_dir)
    merged_opencl_bin_file = "%s/%s.bin" % (opencl_dir,
                                            compiled_opencl_kernel_prefix)
    merged_opencl_tuning_file = "%s/%s.bin" % (opencl_dir,
                                               tuned_opencl_parameter_prefix)
    merge_opencl_binaries(opencl_binary_files,
                          merged_opencl_bin_file)
    if flags.gencode:
        util.mkdir_p('mace/codegen/opencl')
        generate_opencl_code(merged_opencl_bin_file,
                             "LoadOpenCLBinary",
                             "OpenCLBinarySize",
                             "mace/codegen/opencl/opencl_binary.cc")
    # Fix: tuning files were previously merged with merge_opencl_binaries,
    # leaving merge_opencl_parameters dead and wrongly applying the
    # platform-info consistency check to tuning data.
    merge_opencl_parameters(opencl_tuning_files,
                            merged_opencl_tuning_file)
    if flags.gencode:
        # Fix: both function names were "LoadOpenCLParameter", which would
        # generate two C++ functions differing only in return type (a
        # compile error). Use a distinct size accessor, mirroring
        # LoadOpenCLBinary/OpenCLBinarySize above.
        generate_opencl_code(merged_opencl_tuning_file,
                             "LoadOpenCLParameter",
                             "OpenCLParameterSize",
                             "mace/codegen/opencl/opencl_parameter.cc")
......@@ -18,12 +18,17 @@ from __future__ import print_function
import os
from utils import device
from utils.util import MaceLogger
cwd = os.path.dirname(__file__)
# TODO: Remove bazel deps
device.execute("bazel build //mace/proto:mace_py")
device.execute("cp -f bazel-genfiles/mace/proto/mace_pb2.py %s" % cwd)
try:
device.execute("bazel build //mace/proto:mace_py")
device.execute("cp -f bazel-genfiles/mace/proto/mace_pb2.py %s" % cwd)
device.execute("bazel build //third_party/caffe:caffe_py")
device.execute("cp -f bazel-genfiles/third_party/caffe/caffe_pb2.py %s" % cwd)
device.execute("bazel build //third_party/caffe:caffe_py")
device.execute(
"cp -f bazel-genfiles/third_party/caffe/caffe_pb2.py %s" % cwd)
except: # noqa
MaceLogger.warning("No bazel, use cmake.")
# Copyright 2019 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import tempfile
import shutil
import numpy as np
from py_proto import mace_pb2
from utils import util
from utils import device
from utils import config_parser
from utils.config_parser import DeviceType
from utils.target import Target
from utils.config_parser import ModelKeys
from utils.util import MaceLogger
from utils.util import mace_check
import run_target
import validate
"""
Tool for mace_run:
python tools/python/run_model.py \
--config ../mace-models/mobilenet-v1/mobilenet-v1.yml --build --validate
python tools/python/run_model.py \
--config ../mace-models/mobilenet-v1/mobilenet-v1.yml --benchmark
python tools/python/run_model.py \
--config ../mace-models/mobilenet-v1/mobilenet-v1.yml --runtime=cpu
"""
def join_2d_array(xs):
    """Serialize a 2-D sequence as colon-separated rows of
    comma-separated values, e.g. [[1, 2], [3]] -> "1,2:3"."""
    rows = []
    for row in xs:
        rows.append(",".join(str(item) for item in row))
    return ":".join(rows)
def build_engine(flags):
    """Build MACE for the target ABI via the per-ABI CMake shell script.

    Configuration is passed to the script through environment variables:
    BUILD_DIR (output directory), RUNTIME (device type name, when
    --runtime was given) and RUNMODE=code (when the model graph is
    compiled into the binary).
    """
    script_dir = os.path.abspath(os.path.dirname(__file__))
    cmake_shell = script_dir + "/../cmake/cmake-build-%s.sh" % flags.target_abi
    os.environ["BUILD_DIR"] = flags.build_dir + "/" + flags.target_abi
    if flags.runtime:
        runtime_type = config_parser.parse_device_type(flags.runtime)
        os.environ["RUNTIME"] = runtime_type.name
    if flags.gencode_model:
        os.environ["RUNMODE"] = "code"
    device.execute("bash " + cmake_shell)
def run_models(flags, args):
    """Run the configured models on every device matching the target
    ABI/SoC selection, optionally loading extra devices from a YAML file.
    """
    if flags.device_conf:
        conf = config_parser.parse_device_info(flags.device_conf)
        device.ArmLinuxDevice.set_devices(conf)
    selected = device.choose_devices(flags.target_abi, flags.target_socs)
    MaceLogger.info("Run on devices: %s" % selected)
    for dev_id in selected:
        # NOTE(review): `crete_device` looks like a typo for
        # `create_device`, but it is the name exported by utils.device —
        # keep in sync with that module.
        dev = device.crete_device(flags.target_abi, dev_id)
        run_models_for_device(flags, args, dev)
def run_models_for_device(flags, args, dev):
    """Run every model from the YAML config on one device; when
    --model_name is set, only that model is run."""
    conf = config_parser.parse(flags.config)
    for model_name, raw_conf in conf["models"].items():
        if flags.model_name and model_name != flags.model_name:
            continue
        MaceLogger.info("Run model %s" % model_name)
        normalized_conf = config_parser.normalize_model_config(raw_conf)
        run_model_for_device(flags, args, dev, model_name, normalized_conf)
def run_model_for_device(flags, args, dev, model_name, model_conf):
    # Deploy one model to one device, run mace_run there, and optionally
    # pull back GPU artifacts and validate outputs against the original
    # framework. `args` are extra CLI options forwarded to mace_run.
    runtime = flags.runtime
    target_abi = flags.target_abi
    install_dir = run_target.default_install_dir(target_abi) + "/" + model_name
    sysdir = install_dir + "/interior"
    dev.mkdir(sysdir)

    # Resolve runtime: CLI flag wins; otherwise take the model's
    # configured runtime, collapsing CPU_GPU to GPU.
    if not runtime:
        runtime = model_conf[ModelKeys.runtime]
        if runtime == DeviceType.CPU_GPU:
            runtime = DeviceType.GPU
    else:
        runtime = config_parser.parse_device_type(runtime)

    # install models to devices
    workdir = flags.output + "/" + model_name
    model_file = model_name + ".pb"
    model_data_file = model_name + ".data"
    model_path = workdir + "/model/" + model_file
    model_data_path = workdir + "/model/" + model_data_file
    if os.path.exists(model_path) and os.path.exists(model_data_path):
        dev.install(Target(model_path), install_dir)
        dev.install(Target(model_data_path), install_dir)
    else:
        MaceLogger.warning("No models exist in %s, use --model_file and"
                           " --model_data_file specified in args" % model_path)

    # When check_tensors are configured, validate against those tensors
    # instead of the model's normal outputs (mutates model_conf in place).
    if ModelKeys.check_tensors in model_conf:
        model_conf[ModelKeys.output_tensors] = model_conf[
            ModelKeys.check_tensors]
        model_conf[ModelKeys.output_shapes] = model_conf[
            ModelKeys.check_shapes]

    # Empty paths tell mace_run the graph/params are compiled into the
    # binary (gencode) rather than loaded from files on the device.
    model_file_path = ""
    if not flags.gencode_model:
        model_file_path = install_dir + "/" + model_file
    model_data_file_path = ""
    if not flags.gencode_param:
        model_data_file_path = install_dir + "/" + model_data_file

    # Assemble mace_run command-line options from the model config.
    model_args = {"model_name": model_name,
                  "model_file": model_file_path,
                  "model_data_file": model_data_file_path,
                  "input_node": ",".join(
                      model_conf[ModelKeys.input_tensors]),
                  "input_shape": join_2d_array(
                      model_conf[ModelKeys.input_shapes]),
                  "output_node": ",".join(
                      model_conf[ModelKeys.output_tensors]),
                  "output_shape": join_2d_array(
                      model_conf[ModelKeys.output_shapes]),
                  "input_data_format": ",".join(
                      [df.name for df in
                       model_conf[ModelKeys.input_data_formats]]),
                  "output_data_format": ",".join(
                      [df.name for df in
                       model_conf[ModelKeys.output_data_formats]]),
                  "device": runtime.name
                  }
    opts = ["--%s=%s" % (arg_key, arg_val) for arg_key, arg_val in
            model_args.items()] + args
    should_generate_data = (flags.validate
                            or flags.tune or "--benchmark" in opts)

    if should_generate_data:
        # Stage input data locally, then push it to the device.
        tmpdirname = tempfile.mkdtemp()
        input_file_prefix = tmpdirname + "/" + model_name
        if ModelKeys.validation_inputs_data in model_conf:
            # Use configured validation inputs (downloaded if remote).
            input_tensor = model_conf[ModelKeys.input_tensors]
            input_data = model_conf[ModelKeys.validation_inputs_data]
            # NOTE(review): the message below is missing a closing paren —
            # should read "len(validate_data)".
            mace_check(len(input_tensor) == len(input_data),
                       "len(input_tensor) != len(validate_data")
            for i in range(len(input_tensor)):
                util.download_or_get_file(
                    model_conf[ModelKeys.validation_inputs_data][i], "",
                    util.formatted_file_name(input_file_prefix,
                                             input_tensor[i]))
        else:
            # No configured inputs: generate random data in-range.
            generate_input_data(input_file_prefix,
                                model_conf[ModelKeys.input_tensors],
                                model_conf[ModelKeys.input_shapes],
                                model_conf[ModelKeys.input_ranges],
                                model_conf[ModelKeys.input_data_types])

        dev.install(Target(tmpdirname), install_dir + "/validate_in")
        target_input_file = "%s/validate_in/%s" % (
            install_dir, model_name)
        target_output_dir = "%s/validate_out" % install_dir
        dev.mkdir(target_output_dir)
        target_output_file = target_output_dir + "/" + model_name
        opts += ["--input_file=%s" % target_input_file,
                 "--output_file=%s" % target_output_file]

    # run
    envs = flags.envs.split(" ") + ["MACE_INTERNAL_STORAGE_PATH=%s" % sysdir]
    if flags.tune:
        # Tuning mode: one warm-up round, parameters saved on-device.
        envs += ["MACE_TUNING=1",
                 "MACE_RUN_PARAMETER_PATH=%s/interior/tune_params"
                 % install_dir]
        opts += ["--round=0"]
    if flags.vlog_level > 0:
        envs += ["MACE_CPP_MIN_VLOG_LEVEL=%s" % flags.vlog_level]

    # Extra shared libraries required by DSP/APU runtimes.
    build_dir = flags.build_dir + "/" + target_abi
    libs = []
    if model_conf[ModelKeys.runtime] == DeviceType.HEXAGON:
        libs += ["third_party/nnlib/%s/libhexagon_controller.so" % target_abi]
    elif model_conf[ModelKeys.runtime] == DeviceType.APU:
        libs += ["third_party/apu/libapu-frontend.so"]
    target = Target(build_dir + "/install/bin/mace_run", libs,
                    opts=opts, envs=envs)
    run_target.run_target(target_abi, install_dir, target,
                          device_ids=flags.target_socs)

    # Pull back the compiled OpenCL program (and tuned parameters when
    # tuning) so later deployments can reuse them.
    if runtime == DeviceType.GPU:
        opencl_dir = workdir + "/opencl"
        util.mkdir_p(opencl_dir)
        dev.pull(
            Target(install_dir + "/interior/mace_cl_compiled_program.bin"),
            "%s/%s_compiled_opencl_kernel.%s.%s.bin" % (
                opencl_dir, model_name,
                dev.info()["ro.product.model"].replace(' ', ''),
                dev.info()["ro.board.platform"]))
        if flags.tune:
            dev.pull(Target(install_dir + "/interior/tune_params"),
                     "%s/%s_tuned_opencl_parameter.%s.%s.bin" % (
                         opencl_dir, model_name,
                         dev.info()["ro.product.model"].replace(' ', ''),
                         dev.info()["ro.board.platform"]))

    if flags.validate:
        # Compare on-device outputs against the original framework's
        # outputs computed from the same inputs.
        validate_model_file = util.download_or_get_model(
            model_conf[ModelKeys.model_file_path],
            model_conf[ModelKeys.model_sha256_checksum],
            tmpdirname)
        validate_weight_file = ""
        if ModelKeys.weight_file_path in model_conf:
            validate_weight_file = util.download_or_get_model(
                model_conf[ModelKeys.weight_file_path],
                model_conf[ModelKeys.weight_sha256_checksum],
                tmpdirname)
        dev.pull(Target(target_output_dir), tmpdirname + "/validate_out")
        output_file_prefix = tmpdirname + "/validate_out/" + model_name
        validate.validate(model_conf[ModelKeys.platform],
                          validate_model_file,
                          validate_weight_file,
                          input_file_prefix,
                          output_file_prefix,
                          model_conf[ModelKeys.input_shapes],
                          model_conf[ModelKeys.output_shapes],
                          model_conf[ModelKeys.input_data_formats],
                          model_conf[ModelKeys.output_data_formats],
                          model_conf[ModelKeys.input_tensors],
                          model_conf[ModelKeys.output_tensors],
                          flags.validate_threshold,
                          model_conf[ModelKeys.input_data_types],
                          flags.backend,
                          "",
                          "")
    if should_generate_data:
        # Clean up the local staging directory.
        shutil.rmtree(tmpdirname)
def generate_input_data(input_file, input_node, input_shape, input_ranges,
                        input_data_type):
    """Generate random input tensor files for each input node.

    For input node i, samples uniformly from
    [input_ranges[i][0], input_ranges[i][1]) with shape input_shape[i],
    casts to the configured dtype, and writes raw bytes to a file named
    by util.formatted_file_name(input_file, input_node[i]).

    Raises via mace_check when an input's data type is neither DT_FLOAT
    nor DT_INT32.
    """
    np.random.seed()
    for i in range(len(input_node)):
        # Uniform samples scaled into the configured [lo, hi) range.
        data = np.random.random(input_shape[i]) * (
            input_ranges[i][1] - input_ranges[i][0]) + input_ranges[i][0]
        input_file_name = util.formatted_file_name(input_file, input_node[i])
        MaceLogger.info('Generate input file: %s' % input_file_name)
        if input_data_type[i] == mace_pb2.DT_FLOAT:
            np_data_type = np.float32
        elif input_data_type[i] == mace_pb2.DT_INT32:
            np_data_type = np.int32
        else:
            # Fix: previously an unsupported dtype raised NameError on the
            # first iteration or silently reused the dtype from a previous
            # iteration; fail fast with a clear message instead.
            mace_check(False, "unsupported input data type: %s"
                       % input_data_type[i])
        data.astype(np_data_type).tofile(input_file_name)
def parse_args():
    """Parse run_model.py command-line flags.

    Returns (flags, extra_args); extra_args are unrecognized options
    forwarded verbatim to the mace_run binary.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--config", type=str, default="",
                            help="yaml conf path")
    arg_parser.add_argument("--model_name", type=str, default="",
                            help="model name in yaml conf")
    arg_parser.add_argument("--target_abi", type=str, default="armeabi-v7a",
                            help="Target ABI: host, armeabi-v7a, arm64-v8a,"
                                 " arm-linux-gnueabihf, aarch64-linux-gnu")
    arg_parser.add_argument("--target_socs", type=str, default="all",
                            help="serialno for adb connection,"
                                 " username@ip for arm linux,"
                                 " host for host"
                                 " | all | random")
    arg_parser.add_argument("--device_conf", type=str, default="",
                            help="device yaml config path")
    arg_parser.add_argument("--runtime", type=str, default="",
                            help="cpu/gpu/dsp/hta/apu")
    arg_parser.add_argument("--envs", type=str, default="",
                            help="Environment vars: "
                                 " MACE_OUT_OF_RANGE_CHECK=1, "
                                 " MACE_OPENCL_PROFILING=1,"
                                 " MACE_INTERNAL_STORAGE_PATH=/path/to,"
                                 " LD_PRELOAD=/path/to")
    arg_parser.add_argument("--validate", action="store_true",
                            help="enable validate")
    arg_parser.add_argument("--validate_threshold", type=float,
                            default="0.99",
                            help="validate threshold")
    arg_parser.add_argument("--backend", type=str, default="tensorflow",
                            help="onnx backend framework")
    arg_parser.add_argument("--tune", action="store_true",
                            help="enable tuning")
    arg_parser.add_argument("--build_dir", type=str,
                            default="build/cmake-build",
                            help="cmake build dir")
    arg_parser.add_argument("--build", action="store_true",
                            help="if build before run")
    arg_parser.add_argument("--output", type=str, default="build",
                            help="output dir")
    arg_parser.add_argument("--vlog_level", type=int, default="0",
                            help="vlog level")
    arg_parser.add_argument("--gencode_model", action="store_true",
                            help="use compiled model")
    arg_parser.add_argument("--gencode_param", action="store_true",
                            help="use compiled param")
    return arg_parser.parse_known_args()
if __name__ == "__main__":
    # Parse known flags; unrecognized args are forwarded to mace_run.
    flags, args = parse_args()
    if flags.build:
        # Build mace_run for the target ABI before deploying/running.
        build_engine(flags)
    run_models(flags, args)
......@@ -12,52 +12,33 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Internal tool for mace_cc_benchmark, mace_cc_test:
python tools/python/run_target.py \
--target_abi=armeabi-v7a --target_socs=all --target_name=mace_cc_test \
--gtest_filter=EnvTest.* --envs="MACE_CPP_MIN_VLOG_LEVEL=5"
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import filelock
import random
import argparse
import os
from utils import device
from utils import target
from utils import config_parser
def device_lock(device_id, timeout=7200):
return filelock.FileLock("/tmp/device-lock-%s" % device_id,
timeout=timeout)
def is_device_locked(device_id):
try:
with device_lock(device_id, timeout=0.000001):
return False
except filelock.Timeout:
return True
from utils import util
def run_target(target_abi, install_dir, target_obj, device_ids="all"):
if not install_dir:
install_dir = default_install_dir(target_abi)
device_class = device.device_class(target_abi)
devices = device_class.list_devices()
if device_ids == "all":
run_devices = devices
elif device_ids == "random":
unlocked_devices = [dev for dev in devices if
not is_device_locked(dev)]
if unlocked_devices:
run_devices = [random.choice(unlocked_devices)]
else:
run_devices = [random.choice(devices)]
else:
device_id_list = [dev.strip() for dev in device_ids.split(",")]
run_devices = [dev for dev in device_id_list if dev in devices]
run_devices = device.choose_devices(target_abi, device_ids)
print("Run on devices: %s" % run_devices)
......@@ -72,7 +53,7 @@ def run_target(target_abi, install_dir, target_obj, device_ids="all"):
# run on device
print("Runing ...")
with device_lock(device_id):
with util.device_lock(device_id):
dev.run(device_target)
......@@ -84,15 +65,6 @@ def default_install_dir(target_abi):
return install_dir
"""
Internal tool for mace_cc_benchmark, mace_cc_test, mace_run:
python tools/experimental/run.py \
--target_abi=armeabi-v7a --target_socs=all --target_name=mace_cc_test \
--args="--gtest_filter=EnvTest.*" --envs="MACE_CPP_MIN_VLOG_LEVEL=5"
"""
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
......@@ -105,7 +77,7 @@ def parse_args():
parser.add_argument(
"--target_socs",
type=str,
default="",
default="all",
help="serialno for adb connection,"
" username@ip for arm linux,"
" host for host"
......@@ -126,7 +98,7 @@ def parse_args():
parser.add_argument(
"--build_dir",
type=str,
default="cmake-build-debug-tools",
default="build/cmake-build",
help="cmake build dir"
)
parser.add_argument(
......@@ -135,8 +107,6 @@ def parse_args():
help="if build before run"
)
parser.add_argument("--args", type=str, default="",
help="Command args: --gtest_filter=*, --filter=*")
parser.add_argument("--envs", type=str, default="",
help="Environment vars: "
" MACE_CPP_MIN_VLOG_LEVEL=2,"
......@@ -145,19 +115,18 @@ def parse_args():
" MACE_INTERNAL_STORAGE_PATH=/path/to,"
" LD_PRELOAD=/path/to")
flgs, _ = parser.parse_known_args()
return flgs
flgs, args = parser.parse_known_args()
return flgs, args
if __name__ == "__main__":
flags = parse_args()
flags, args = parse_args()
if flags.device_conf:
device_conf = config_parser.parse_device_info(flags.device_conf)
device.ArmLinuxDevice.set_devices(device_conf)
target_abi = flags.target_abi.strip()
target_name = flags.target_name.strip()
opts = flags.args.split(" ")
envs = flags.envs.split(" ")
# build
......@@ -165,11 +134,11 @@ if __name__ == "__main__":
if flags.build:
cmake_shell = os.path.abspath(
os.path.dirname(
__file__)) + "/config/build/cmake-build-%s.sh" % target_abi
__file__)) + "/../cmake/cmake-build-%s.sh" % target_abi
os.environ["BUILD_DIR"] = build_dir
device.execute(cmake_shell)
device.execute("bash " + cmake_shell)
# run
target = target.Target(build_dir + "/install/bin/" + target_name,
opts=opts, envs=envs)
opts=args, envs=envs)
run_target(target_abi, None, target, device_ids=flags.target_socs)
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This is a generated file. DO NOT EDIT!

#include <cstring>

namespace mace {

// Returns a pointer to the embedded binary blob, or nullptr when the
// template was rendered with an empty data array.
const unsigned char *{{ load_func_name }}() {
{% if data_size == 0 %}
  return nullptr;
{% else %}
  static const unsigned char kData[{{ data_size }}] = {
    {% for d in data %}{{"0x%02X, " % d }}{%endfor%}
  };

  return kData;
{% endif %}
}

// Size of the embedded blob in bytes (0 when empty).
size_t {{ size_func_name }}() {
  return {{ data_size }};
}

}  // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#ifndef MACE_CODEGEN_ENGINE_MACE_ENGINE_FACTORY_H_
#define MACE_CODEGEN_ENGINE_MACE_ENGINE_FACTORY_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"
namespace mace {
{% for tag in model_tags %}
namespace {{tag}} {
extern const unsigned char *LoadModelData();
extern const std::shared_ptr<NetDef> CreateNet();
extern const std::string ModelName();
extern const std::string ModelChecksum();
extern const std::string ModelBuildTime();
extern const std::string ModelBuildOptions();
} // namespace {{tag}}
{% endfor %}
namespace {
std::map<std::string, int> model_name_map {
{% for i in range(model_tags |length) %}
std::make_pair({{ model_tags[i]|tojson }}, {{ i }}),
{% endfor %}
};
} // namespace
/// \brief Create MaceEngine from code
///
/// Create MaceEngine object based on model graph code and model data file or
/// model data code.
///
/// \param model_name[in]: the name of model you want to use.
/// \param model_data_file[in]: the path of model data file,
/// if model_data_format is code, just pass empty string("")
/// \param input_nodes[in]: the array of input nodes' name
/// \param output_nodes[in]: the array of output nodes' name
/// \param config[in]: configurations for MaceEngine.
/// \param engine[out]: output MaceEngine object
/// \return MaceStatus::MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
/// MACE_OUT_OF_RESOURCES for resources is out of range.
__attribute__((deprecated)) MaceStatus CreateMaceEngineFromCode(
    const std::string &model_name,
    const std::string &model_data_file,
    const std::vector<std::string> &input_nodes,
    const std::vector<std::string> &output_nodes,
    const MaceEngineConfig &config,
    std::shared_ptr<MaceEngine> *engine) {
  if (engine == nullptr) {
    return MaceStatus::MACE_INVALID_ARGS;
  }
  // BUG FIX: std::map::operator[] default-inserts value 0 for an unknown
  // model name, which silently dispatched to the first generated model.
  // Look the name up explicitly and reject unknown names instead.
  const auto model_it = model_name_map.find(model_name);
  if (model_it == model_name_map.end()) {
    return MaceStatus::MACE_INVALID_ARGS;
  }
  std::shared_ptr<NetDef> net_def;
{% if embed_model_data %}
  // Model data is compiled into the binary in this configuration, so the
  // caller-provided file path is unused.
  (void)model_data_file;
  const unsigned char *model_data;
{% endif %}
  MaceStatus status = MaceStatus::MACE_SUCCESS;
  switch (model_it->second) {
{% for i in range(model_tags |length) %}
    case {{ i }}:
      net_def = mace::{{model_tags[i]}}::CreateNet();
      engine->reset(new mace::MaceEngine(config));
{% if embed_model_data %}
      model_data = mace::{{model_tags[i]}}::LoadModelData();
      status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
                               model_data);
{% else %}
      status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
                               model_data_file);
{% endif %}
      break;
{% endfor %}
    default:
      // Unreachable after the find() guard above; kept for safety.
      status = MaceStatus::MACE_INVALID_ARGS;
  }
  return status;
}
MACE_API MaceStatus CreateMaceEngineFromCode(
    const std::string &model_name,
    const unsigned char *model_weights_data,
    const size_t model_weights_data_size,
    const std::vector<std::string> &input_nodes,
    const std::vector<std::string> &output_nodes,
    const MaceEngineConfig &config,
    std::shared_ptr<MaceEngine> *engine) {
  if (engine == nullptr) {
    return MaceStatus::MACE_INVALID_ARGS;
  }
  // BUG FIX: std::map::operator[] default-inserts value 0 for an unknown
  // model name, which silently dispatched to the first generated model.
  // Look the name up explicitly and reject unknown names instead.
  const auto model_it = model_name_map.find(model_name);
  if (model_it == model_name_map.end()) {
    return MaceStatus::MACE_INVALID_ARGS;
  }
  std::shared_ptr<NetDef> net_def;
{% if embed_model_data %}
  // Model data is compiled into the binary; the caller-provided weights
  // buffer is unused in this configuration.
  const unsigned char *model_data;
  (void)model_weights_data;
{% endif %}
  // TODO(yejianwu) Add buffer range checking
  (void)model_weights_data_size;
  MaceStatus status = MaceStatus::MACE_SUCCESS;
  switch (model_it->second) {
{% for i in range(model_tags |length) %}
    case {{ i }}:
      net_def = mace::{{model_tags[i]}}::CreateNet();
      engine->reset(new mace::MaceEngine(config));
{% if embed_model_data %}
      model_data = mace::{{model_tags[i]}}::LoadModelData();
      status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
                               model_data);
{% else %}
      status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
                               model_weights_data);
{% endif %}
      break;
{% endfor %}
    default:
      // Unreachable after the find() guard above; kept for safety.
      status = MaceStatus::MACE_INVALID_ARGS;
  }
  return status;
}
} // namespace mace
#endif // MACE_CODEGEN_ENGINE_MACE_ENGINE_FACTORY_H_
......@@ -133,7 +133,7 @@ void CreateTensors(NetDef *net_def) {
namespace {{tag}} {
const std::shared_ptr<NetDef> CreateNet() {
MACE_API const std::shared_ptr<NetDef> CreateNet() {
MACE_LATENCY_LOGGER(1, "Create net {{ net.name }}");
std::shared_ptr<NetDef> net_def(new NetDef());
......@@ -154,15 +154,15 @@ const std::shared_ptr<NetDef> CreateNet() {
return net_def;
}
const std::string ModelName() {
MACE_API const std::string ModelName() {
return {{ tag|tojson }};
}
const std::string ModelChecksum() {
MACE_API const std::string ModelChecksum() {
return {{ checksum|tojson }};
}
const std::string ModelBuildTime() {
MACE_API const std::string ModelBuildTime() {
return {{ build_time|tojson }};
}
......
......@@ -25,14 +25,14 @@ namespace mace {
namespace {{tag}} {
extern const unsigned char *LoadModelData();
MACE_API extern const unsigned char *LoadModelData();
extern const std::shared_ptr<NetDef> CreateNet();
MACE_API extern const std::shared_ptr<NetDef> CreateNet();
extern const std::string ModelName();
extern const std::string ModelChecksum();
extern const std::string ModelBuildTime();
extern const std::string ModelBuildOptions();
MACE_API extern const std::string ModelName();
MACE_API extern const std::string ModelChecksum();
MACE_API extern const std::string ModelBuildTime();
MACE_API extern const std::string ModelBuildOptions();
} // namespace {{ tag }}
} // namespace mace
......
......@@ -14,6 +14,7 @@
// This is a generated file. DO NOT EDIT!
#include "mace/public/mace.h"
namespace mace {
namespace {{tag}} {
......@@ -22,7 +23,7 @@ alignas(4) const unsigned char model_data[{{ model_data_size }}] = {
{% for d in model_data %}{{"0x%02X, " % d }}{%endfor%}
};
const unsigned char *LoadModelData() {
MACE_API const unsigned char *LoadModelData() {
return model_data;
}
......
......@@ -17,24 +17,8 @@ from enum import Enum
from py_proto import mace_pb2
class DeviceType(Enum):
CPU = 0
GPU = 2
HEXAGON = 3
HTA = 4
APU = 5
class DataFormat(Enum):
NONE = 0
NHWC = 1
NCHW = 2
HWIO = 100
OIHW = 101
HWOI = 102
OHWI = 103
AUTO = 1000
from utils.config_parser import DataFormat
from utils.config_parser import DeviceType
# SAME_LOWER: if the amount of paddings to be added is odd,
......@@ -402,7 +386,7 @@ class ConverterOption(object):
self._quantize_range_file = ""
self._change_concat_ranges = False
self._transformer_option = None
self._cl_mem_type = ""
self._cl_mem_type = "image"
@property
def input_nodes(self):
......
......@@ -18,8 +18,13 @@ from __future__ import print_function
import re
import os
import copy
import yaml
from enum import Enum
from utils.util import mace_check
from utils.util import MaceLogger
from py_proto import mace_pb2
CPP_KEYWORDS = [
'alignas', 'alignof', 'and', 'and_eq', 'asm', 'atomic_cancel',
......@@ -63,3 +68,227 @@ def parse(path):
def parse_device_info(path):
conf = parse(path)
return conf["devices"]
class ModelKeys(object):
    """String keys used to index model deployment configuration dicts."""
    platform = "platform"
    runtime = "runtime"
    graph_optimize_options = "graph_optimize_options"
    # Input tensor description.
    input_tensors = "input_tensors"
    input_shapes = "input_shapes"
    input_data_types = "input_data_types"
    input_data_formats = "input_data_formats"
    input_ranges = "input_ranges"
    # Output tensor description.
    output_tensors = "output_tensors"
    output_shapes = "output_shapes"
    output_data_types = "output_data_types"
    output_data_formats = "output_data_formats"
    # Validation-only tensors (optional).
    check_tensors = "check_tensors"
    check_shapes = "check_shapes"
    # Model/weight files and their checksums.
    model_file_path = "model_file_path"
    model_sha256_checksum = "model_sha256_checksum"
    weight_file_path = "weight_file_path"
    weight_sha256_checksum = "weight_sha256_checksum"
    # Quantization and conversion options.
    quantize_range_file = "quantize_range_file"
    quantize = "quantize"
    quantize_large_weights = "quantize_large_weights"
    change_concat_ranges = "change_concat_ranges"
    winograd = "winograd"
    cl_mem_type = "cl_mem_type"
    data_types = "data_types"
    subgraphs = "subgraphs"
    validation_inputs_data = "validation_inputs_data"
class DataFormat(Enum):
    """Tensor data layout identifiers used in model configs.

    NONE/NHWC/NCHW describe activation layouts; the 100-range values
    (HWIO, OIHW, HWOI, OHWI) are weight/filter orderings (see the
    converter's OIHW -> HWIO transposes). AUTO defers the choice.
    """
    NONE = 0
    NHWC = 1
    NCHW = 2
    HWIO = 100
    OIHW = 101
    HWOI = 102
    OHWI = 103
    AUTO = 1000
def parse_data_format(str):
    """Look up a DataFormat enum member by its case-insensitive name."""
    str = str.upper()
    mace_check(str in DataFormat.__members__,
               "unknown data format %s" % str)
    return DataFormat[str]
class DeviceType(Enum):
    """Runtime compute device identifiers (note: value 1 is unused)."""
    CPU = 0
    GPU = 2
    HEXAGON = 3
    HTA = 4
    APU = 5
    # Combined option: run on both CPU and GPU.
    CPU_GPU = 100


# Maps user-facing runtime names from the YAML config to DeviceType.
# "dsp" is an alias for the Hexagon DSP runtime.
DEVICE_MAP = {
    "cpu": DeviceType.CPU,
    "gpu": DeviceType.GPU,
    "hexagon": DeviceType.HEXAGON,
    "dsp": DeviceType.HEXAGON,
    "hta": DeviceType.HTA,
    "apu": DeviceType.APU,
    "cpu+gpu": DeviceType.CPU_GPU
}


def parse_data_type(str):
    pass  # placeholder removed


def parse_device_type(str):
    # Convert a runtime name (lowercase, as written in the config) to a
    # DeviceType; aborts via mace_check on unknown names.
    mace_check(str in DEVICE_MAP, "unknown device %s" % str)
    return DEVICE_MAP[str]
class Platform(Enum):
    """Source training framework of the model being converted."""
    TENSORFLOW = 0
    CAFFE = 1
    ONNX = 2


def parse_platform(str):
    # Convert a platform name (case-insensitive) to a Platform enum member;
    # aborts via mace_check on unknown names.
    str = str.upper()
    mace_check(str in [e.name for e in Platform],
               "unknown platform %s" % str)
    return Platform[str]
# Mapping from config data-type names to protobuf data types.
DATA_TYPE_MAP = {
    'float32': mace_pb2.DT_FLOAT,
    'int32': mace_pb2.DT_INT32,
}


def parse_data_type(str):
    """Convert a config data-type name to a mace_pb2 data type.

    CONSISTENCY FIX: the original duplicated the mapping in an if/elif
    chain, leaving DATA_TYPE_MAP as dead code; drive the lookup from the
    table so new types need only one edit. Aborts via mace_check on
    unsupported names (same message as before).
    """
    if str in DATA_TYPE_MAP:
        return DATA_TYPE_MAP[str]
    mace_check(False, "data type %s not supported" % str)
def parse_internal_data_type(str):
    # Map the config's internal-precision string to a proto data type.
    # Only 'fp32_fp32' selects float32; every other value silently falls
    # through to half precision (no validation of the input string).
    if str == 'fp32_fp32':
        return mace_pb2.DT_FLOAT
    else:
        return mace_pb2.DT_HALF
def to_list(x):
    """Pass lists through unchanged; wrap any other value in a list."""
    return x if isinstance(x, list) else [x]
def parse_int_array(xs):
    """Parse a comma-separated string of integers into a list of ints."""
    return list(map(int, xs.split(",")))


def parse_float_array(xs):
    """Parse a comma-separated string of numbers into a list of floats."""
    return list(map(float, xs.split(",")))
def normalize_model_config(conf):
    """Normalize a raw model config dict into canonical, parsed form.

    Flattens the first subgraph into the top level, parses platform/runtime
    enums, resolves the internal data type, and expands/validates per-tensor
    settings (shapes, data types, data formats, ranges) for inputs and
    outputs. Returns a deep-copied, normalized dict; aborts via mace_check
    on inconsistent counts.
    """
    conf = copy.deepcopy(conf)
    if ModelKeys.subgraphs in conf:
        subgraph = conf[ModelKeys.subgraphs][0]
        del conf[ModelKeys.subgraphs]
        conf.update(subgraph)
    # BUG FIX: removed leftover debug `print(conf)`; the final config is
    # already reported via MaceLogger.summary below.

    conf[ModelKeys.platform] = parse_platform(conf[ModelKeys.platform])
    conf[ModelKeys.runtime] = parse_device_type(conf[ModelKeys.runtime])

    if ModelKeys.quantize in conf:
        # Quantized models are converted from float32 weights.
        conf[ModelKeys.data_types] = mace_pb2.DT_FLOAT
    else:
        if ModelKeys.data_types in conf:
            conf[ModelKeys.data_types] = parse_internal_data_type(
                conf[ModelKeys.data_types])
        else:
            conf[ModelKeys.data_types] = mace_pb2.DT_HALF

    def expand_per_tensor(values, count, key_name, io_name):
        # A single value applies to every tensor; otherwise the number of
        # values must match the number of tensors.
        if len(values) == 1 and count > 1:
            values = values * count
        mace_check(len(values) == count,
                   "the number of %s should be "
                   "the same as %s tensors" % (key_name, io_name))
        return values

    # parse input
    conf[ModelKeys.input_tensors] = to_list(conf[ModelKeys.input_tensors])
    input_count = len(conf[ModelKeys.input_tensors])
    conf[ModelKeys.input_shapes] = [parse_int_array(shape) for shape in
                                    to_list(conf[ModelKeys.input_shapes])]
    mace_check(
        len(conf[ModelKeys.input_shapes]) == input_count,
        "input node count and shape count do not match")

    conf[ModelKeys.input_data_types] = expand_per_tensor(
        [parse_data_type(dt) for dt in
         to_list(conf.get(ModelKeys.input_data_types, ["float32"]))],
        input_count, "input_data_types", "input")

    conf[ModelKeys.input_data_formats] = expand_per_tensor(
        [parse_data_format(df) for df in
         to_list(conf.get(ModelKeys.input_data_formats, ["NHWC"]))],
        input_count, "input_data_formats", "input")

    conf[ModelKeys.input_ranges] = expand_per_tensor(
        [parse_float_array(r) for r in
         to_list(conf.get(ModelKeys.input_ranges, ["-1.0,1.0"]))],
        input_count, "input_ranges", "input")

    # parse output
    conf[ModelKeys.output_tensors] = to_list(conf[ModelKeys.output_tensors])
    output_count = len(conf[ModelKeys.output_tensors])
    conf[ModelKeys.output_shapes] = [parse_int_array(shape) for shape in
                                     to_list(conf[ModelKeys.output_shapes])]
    # BUG FIX: the original compared the output tensor count with itself
    # (always true); compare against the parsed shape count instead.
    mace_check(len(conf[ModelKeys.output_shapes]) == output_count,
               "output node count and shape count do not match")

    conf[ModelKeys.output_data_types] = expand_per_tensor(
        [parse_data_type(dt) for dt in
         to_list(conf.get(ModelKeys.output_data_types, ["float32"]))],
        output_count, "output_data_types", "output")

    conf[ModelKeys.output_data_formats] = expand_per_tensor(
        [parse_data_format(df) for df in
         to_list(conf.get(ModelKeys.output_data_formats, ["NHWC"]))],
        output_count, "output_data_formats", "output")

    if ModelKeys.check_tensors in conf:
        conf[ModelKeys.check_tensors] = to_list(conf[ModelKeys.check_tensors])
        conf[ModelKeys.check_shapes] = [parse_int_array(shape) for shape in
                                        to_list(conf[ModelKeys.check_shapes])]
        mace_check(len(conf[ModelKeys.check_tensors]) == len(
            conf[ModelKeys.check_shapes]),
            "check tensors count and shape count do not match.")

    MaceLogger.summary(conf)

    return conf
......@@ -17,13 +17,15 @@ from __future__ import division
from __future__ import print_function
import os
import re
import subprocess
import random
import tempfile
from utils import util
MACE_TOOL_QUIET_ENV = "MACE_TOOL_QUIET"
def execute(cmd):
def execute(cmd, verbose=True):
print("CMD> %s" % cmd)
p = subprocess.Popen([cmd],
shell=True,
......@@ -31,20 +33,28 @@ def execute(cmd):
stderr=subprocess.STDOUT,
stdin=subprocess.PIPE,
universal_newlines=True)
returncode = p.poll()
if not verbose:
if p.wait() != 0:
raise Exception("errorcode: %s" % p.returncode)
return p.stdout.read()
buf = []
while returncode is None:
line = p.stdout.readline()
returncode = p.poll()
line = line.strip()
if MACE_TOOL_QUIET_ENV not in os.environ:
while p.poll() is None:
line = p.stdout.readline().strip()
if verbose:
print(line)
buf.append(line)
p.wait()
for l in p.stdout:
line = l.strip()
if verbose:
print(line)
buf.append(line)
if returncode != 0:
raise Exception("errorcode: %s" % returncode)
if p.returncode != 0:
raise Exception("errorcode: %s" % p.returncode)
return "\n".join(buf)
......@@ -62,6 +72,12 @@ class Device(object):
def pull(self, target, out_dir):
pass
def mkdir(self, dirname):
pass
def info(self):
pass
class HostDevice(Device):
def __init__(self, device_id):
......@@ -98,6 +114,9 @@ class HostDevice(Device):
if out_dir.strip() and out_dir != os.path.dirname(target.path):
execute("cp -r %s %s" % (target.path, out_dir))
def mkdir(self, dirname):
execute("mkdir -p %s" % dirname)
class AndroidDevice(Device):
def __init__(self, device_id):
......@@ -120,9 +139,15 @@ class AndroidDevice(Device):
sn = self._device_id
execute("adb -s %s shell mkdir -p %s" % (sn, install_dir))
execute("adb -s %s push %s %s" % (sn, target.path, install_dir))
if os.path.isdir(target.path):
execute("adb -s %s push %s/* %s" % (sn, target.path, install_dir),
False)
else:
execute("adb -s %s push %s %s" % (sn, target.path, install_dir),
False)
for lib in target.libs:
execute("adb -s %s push %s %s" % (sn, lib, install_dir))
execute("adb -s %s push %s %s" % (sn, lib, install_dir), False)
target.path = "%s/%s" % (install_dir, os.path.basename(target.path))
target.libs = ["%s/%s" % (install_dir, os.path.basename(lib))
......@@ -132,7 +157,17 @@ class AndroidDevice(Device):
return target
def run(self, target):
out = execute("adb -s %s shell %s" % (self._device_id, target))
tmpdirname = tempfile.mkdtemp()
cmd_file_path = tmpdirname + "/cmd.sh"
with open(cmd_file_path, "w") as cmd_file:
cmd_file.write(str(target))
target_dir = os.path.dirname(target.path)
execute("adb -s %s push %s %s" % (self._device_id,
cmd_file_path,
target_dir))
out = execute("adb -s %s shell sh %s" % (self._device_id,
target_dir + "/cmd.sh"))
# May have false positive using the following error word
for line in out.split("\n")[:-10]:
if ("Aborted" in line
......@@ -141,7 +176,23 @@ class AndroidDevice(Device):
def pull(self, target, out_dir):
sn = self._device_id
execute("adb -s %s pull %s %s" % (sn, target.path, out_dir))
execute("adb -s %s pull %s %s" % (sn, target.path, out_dir), False)
    def mkdir(self, dirname):
        # Create a directory (and any missing parents) on the device.
        sn = self._device_id
        execute("adb -s %s shell mkdir -p %s" % (sn, dirname))

    def info(self):
        # Return the device's system properties as a {name: value} dict,
        # parsed from `adb shell getprop` lines of the form
        # "[key]: [value]"; non-matching lines are skipped.
        sn = self._device_id
        output = execute("adb -s %s shell getprop" % sn, False)
        raw_props = output.split("\n")
        props = {}
        p = re.compile(r'\[(.+)\]: \[(.+)\]')
        for raw_prop in raw_props:
            m = p.match(raw_prop)
            if m:
                props[m.group(1)] = m.group(2)
        return props
class ArmLinuxDevice(Device):
......@@ -153,10 +204,12 @@ class ArmLinuxDevice(Device):
@staticmethod
def list_devices():
device_ids = []
for dev_name, dev_info in ArmLinuxDevice.devices:
print("!!!", ArmLinuxDevice.devices)
for dev_name, dev_info in ArmLinuxDevice.devices.items():
address = dev_info["address"]
username = dev_info["username"]
device_ids.append("%s@%s" % (username, address))
return device_ids
@staticmethod
def set_devices(devices):
......@@ -166,10 +219,10 @@ class ArmLinuxDevice(Device):
install_dir = os.path.abspath(install_dir)
ip = self._device_id
execute("ssh %s mkdir -p %s" % install_dir)
execute("scp %s %s:%s" % (target.path, ip, install_dir))
execute("ssh %s mkdir -p %s" % (ip, install_dir))
execute("scp -r %s %s:%s" % (target.path, ip, install_dir))
for lib in target.libs:
execute("scp %s:%s" % (lib, install_dir))
execute("scp -r %s:%s" % (lib, install_dir))
target.path = "%s/%s" % (install_dir, os.path.basename(target.path))
target.libs = ["%s/%s" % (install_dir, os.path.basename(lib))
......@@ -179,11 +232,15 @@ class ArmLinuxDevice(Device):
return target
def run(self, target):
execute("ssh %s shell %s" % (self._device_id, target))
execute("ssh %s %s" % (self._device_id, target))
def pull(self, target, out_dir):
sn = self._device_id
execute("scp %s:%s %s" % (sn, target.path, out_dir))
execute("scp -r %s:%s %s" % (sn, target.path, out_dir))
def mkdir(self, dirname):
sn = self._device_id
execute("ssh %s mkdir -p %s" % (sn, dirname))
def device_class(target_abi):
......@@ -204,3 +261,23 @@ def device_class(target_abi):
def crete_device(target_abi, device_id=None):
return device_class(target_abi)(device_id)
def choose_devices(target_abi, target_ids):
    """Select the device ids to run on for the given ABI.

    target_ids semantics: "all" -> every attached device; "random" -> one
    device, preferring devices that are not currently locked; anything
    else -> a comma-separated id list, intersected with attached devices.
    """
    available = device_class(target_abi).list_devices()
    if target_ids == "all":
        return available
    if target_ids == "random":
        unlocked = [dev for dev in available
                    if not util.is_device_locked(dev)]
        pool = unlocked if unlocked else available
        return [random.choice(pool)]
    requested = [dev.strip() for dev in target_ids.split(",")]
    return [dev for dev in requested if dev in available]
......@@ -18,9 +18,12 @@ from __future__ import print_function
import inspect
import hashlib
import filelock
import errno
import os
import urllib
from utils import device
import sys
import shutil
import traceback
################################
......@@ -46,33 +49,92 @@ def get_frame_info(level=2):
class MaceLogger:
@staticmethod
def header(message):
print(CMDColors.PURPLE + message + CMDColors.ENDC)
print(CMDColors.PURPLE + str(message) + CMDColors.ENDC)
@staticmethod
def summary(message):
print(CMDColors.GREEN + message + CMDColors.ENDC)
print(CMDColors.GREEN + str(message) + CMDColors.ENDC)
@staticmethod
def info(message):
print(get_frame_info() + message)
print(get_frame_info() + str(message))
@staticmethod
def warning(message):
print(CMDColors.YELLOW + 'WARNING: ' + get_frame_info() + message
print(CMDColors.YELLOW + 'WARNING: ' + get_frame_info() + str(message)
+ CMDColors.ENDC)
@staticmethod
def error(message):
print(CMDColors.RED + 'ERROR: ' + get_frame_info() + message
print(CMDColors.RED + 'ERROR: ' + get_frame_info() + str(message)
+ CMDColors.ENDC)
exit(1)
def mace_check(condition, message):
    # Assert-style guard: when `condition` is falsy, print the current
    # Python stack for context and terminate through MaceLogger.error
    # (which calls exit(1)).
    if not condition:
        for line in traceback.format_stack():
            print(line.strip())
        MaceLogger.error(message)
################################
# String Formatter
################################
class StringFormatter:
    """Render tabular data and banner blocks as plain-text for the console."""

    @staticmethod
    def table(header, data, title, align="R"):
        """Format `data` rows under `header` as an ASCII table.

        align selects per-cell alignment: "R" right, "L" left, "C" center.
        Every row must have exactly len(header) elements.
        """
        num_columns = len(header)
        # Column width = widest cell in that column, plus one padding space.
        widths = [len(str(cell)) + 1 for cell in header]
        for row in data:
            assert (len(row) == num_columns)
            for idx in range(len(row)):
                widths[idx] = max(widths[idx], len(str(row[idx])) + 1)

        total_width = sum(widths) + num_columns + 1
        dash_line = '-' * total_width + '\n'
        header_line = '=' * total_width + '\n'

        parts = [dash_line,
                 str(title).center(total_width) + '\n',
                 dash_line,
                 '|' + '|'.join(str(header[i]).center(widths[i])
                                for i in range(num_columns)) + '|\n',
                 header_line]
        for row in data:
            cells = []
            for idx in range(len(row)):
                text = str(row[idx])
                if align == "R":
                    cells.append(text.rjust(widths[idx]))
                elif align == "L":
                    cells.append(text.ljust(widths[idx]))
                elif align == "C":
                    cells.append(text.center(widths[idx]))
            parts.append('|' + '|'.join(cells) + "|\n" + dash_line)
        return ''.join(parts)

    @staticmethod
    def block(message):
        """Center `message` inside a banner of asterisks (10-char margins)."""
        width = 10 + len(str(message)) + 10
        stars = '*' * width + '\n'
        return stars + str(message).center(width) + '\n' + stars
def formatted_file_name(input_file_name, input_name):
    """Join a base file name with a sanitized tensor name.

    Non-alphanumeric characters in input_name become '_' so the result is
    safe to use as a file name.
    """
    sanitized = ''.join(c if c.isalnum() else '_' for c in input_name)
    return input_file_name + '_' + sanitized
################################
# file
################################
......@@ -86,17 +148,86 @@ def file_checksum(fname):
def download_or_get_file(file,
sha256_checksum,
output_dir):
filename = os.path.basename(file)
output_file = "%s/%s-%s.pb" % (output_dir, filename, sha256_checksum)
output_file):
if file.startswith("http://") or file.startswith("https://"):
if not os.path.exists(output_file) or file_checksum(
output_file) != sha256_checksum:
MaceLogger.info("Downloading file %s, please wait ..." % file)
urllib.urlretrieve(file, output_file)
MaceLogger.info("Downloading file %s to %s, please wait ..."
% (file, output_file))
if sys.version_info >= (3, 0):
import urllib.request
data = urllib.request.urlopen(file)
out_handle = open(output_file, "wb")
out_handle.write(data.read())
out_handle.close()
else:
import urllib
urllib.urlretrieve(file, output_file)
MaceLogger.info("Model downloaded successfully.")
else:
device.execute("cp %s %s" % (file, output_file))
shutil.copyfile(file, output_file)
if sha256_checksum:
mace_check(file_checksum(output_file) == sha256_checksum,
"checksum validate failed")
return output_file
def download_or_get_model(file,
                          sha256_checksum,
                          output_dir):
    # Resolve a model reference (URL or local path) into a local file named
    # "<basename>-<checksum>.pb" under output_dir. Fetching/copying and
    # checksum validation are delegated to download_or_get_file.
    # NOTE(review): the ".pb" suffix is always appended, even when the
    # source file already carries an extension.
    filename = os.path.basename(file)
    output_file = "%s/%s-%s.pb" % (output_dir, filename, sha256_checksum)
    download_or_get_file(file, sha256_checksum, output_file)

    return output_file
################################
# bazel commands
################################
class ABIType(object):
    """Canonical ABI name strings used by the build tools."""
    armeabi_v7a = 'armeabi-v7a'
    arm64_v8a = 'arm64-v8a'
    arm64 = 'arm64'
    aarch64 = 'aarch64'
    armhf = 'armhf'
    host = 'host'


def abi_to_internal(abi):
    """Translate a user-facing ABI name to the toolchain-internal name.

    The Android ABIs map to themselves, 'arm64' -> 'aarch64' and
    'armhf' -> 'armeabi-v7a'. Any other value (including 'host') falls
    through and yields None, matching the original behavior.
    """
    if abi == ABIType.arm64:
        return ABIType.aarch64
    if abi == ABIType.armhf:
        return ABIType.armeabi_v7a
    if abi in (ABIType.armeabi_v7a, ABIType.arm64_v8a):
        return abi
    return None
################################
# lock
################################
def device_lock(device_id, timeout=7200):
    # Inter-process file lock keyed by device id so concurrent tool runs
    # do not share a device; default acquisition wait is 2 hours.
    return filelock.FileLock("/tmp/device-lock-%s" % device_id,
                             timeout=timeout)


def is_device_locked(device_id):
    # Probe the lock non-destructively: attempt acquisition with a
    # near-zero timeout; a Timeout means another process holds the device.
    try:
        with device_lock(device_id, timeout=0.000001):
            return False
    except filelock.Timeout:
        return True
################################
# os
################################
def mkdir_p(path):
    """Create `path` with any missing parents, like `mkdir -p`.

    Silently succeeds when the directory already exists; any other OSError
    (permissions, a file in the way, ...) is re-raised. EAFP form is kept
    for Python 2 compatibility (no makedirs(exist_ok=...)).
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
# Copyright 2018 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import os.path
import numpy as np
import six
from py_proto import mace_pb2
from utils import util
from utils.config_parser import DataFormat
from utils.config_parser import Platform
VALIDATION_MODULE = 'VALIDATION'
def load_data(file, data_type=mace_pb2.DT_FLOAT):
    # Read a flat binary tensor dump as float32 or int32. Returns an empty
    # array when the file does not exist or the data type is unrecognized;
    # callers treat an empty array as a missing output.
    if os.path.isfile(file):
        if data_type == mace_pb2.DT_FLOAT:
            return np.fromfile(file=file, dtype=np.float32)
        elif data_type == mace_pb2.DT_INT32:
            return np.fromfile(file=file, dtype=np.int32)
    return np.empty([0])
def calculate_sqnr(expected, actual):
    """Signal-to-quantization-noise ratio of `actual` vs `expected`.

    Both arguments are flat numeric arrays; the ratio is signal power over
    noise power, with a tiny epsilon so identical inputs do not divide by
    zero.
    """
    noise = expected - actual
    signal_power = sum(v * v for v in expected)
    noise_power = sum(v * v for v in noise)
    return signal_power / (noise_power + 1e-15)
def calculate_similarity(u, v, data_type=np.float64):
    """Cosine similarity of two flat vectors.

    Both inputs are converted to `data_type` before the dot product so the
    comparison runs at a consistent precision.
    """
    # BUG FIX: the original used `u.dtype is not data_type`, an identity
    # test between a numpy dtype object and a type class that is always
    # True, so it copied the array even when already float64. Use dtype
    # (in)equality, which numpy defines against type classes.
    if u.dtype != data_type:
        u = u.astype(data_type)
    if v.dtype != data_type:
        v = v.astype(data_type)
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
def calculate_pixel_accuracy(out_value, mace_out_value):
    """Fraction of rows whose argmax class agrees between the two outputs.

    Inputs are reshaped to (rows, classes) using the last axis of
    `out_value` as the class axis; arrays with fewer than two dimensions
    trivially score 1.0.
    """
    if len(out_value.shape) < 2:
        return 1.0
    expected = out_value.reshape((-1, out_value.shape[-1]))
    predicted = mace_out_value.reshape(expected.shape)
    rows = expected.shape[0]
    matched = sum(1 for r in range(rows)
                  if np.argmax(expected[r]) == np.argmax(predicted[r]))
    return 1.0 * matched / rows
def compare_output(output_name, mace_out_value,
                   out_value, validation_threshold, log_file):
    """Compare a MACE output tensor against the reference value.

    Logs cosine similarity, SQNR and pixel accuracy. When `log_file` is
    given, metrics are appended there as CSV and no pass/fail judgement is
    made; otherwise the run terminates (MaceLogger.error exits) when
    similarity is not above `validation_threshold` or the output is empty.
    """
    if mace_out_value.size != 0:
        pixel_accuracy = calculate_pixel_accuracy(out_value, mace_out_value)
        out_value = out_value.reshape(-1)
        mace_out_value = mace_out_value.reshape(-1)
        assert len(out_value) == len(mace_out_value)
        sqnr = calculate_sqnr(out_value, mace_out_value)
        similarity = calculate_similarity(out_value, mace_out_value)
        util.MaceLogger.summary(
            output_name + ' MACE VS training platform'
            + ' similarity: ' + str(similarity) + ' , sqnr: ' + str(sqnr)
            + ' , pixel_accuracy: ' + str(pixel_accuracy))
        if log_file:
            # Create the CSV with a header on first use, then append one
            # metrics row per output tensor.
            if not os.path.exists(log_file):
                with open(log_file, 'w') as f:
                    f.write('output_name,similarity,sqnr,pixel_accuracy\n')
            summary = '{output_name},{similarity},{sqnr},{pixel_accuracy}\n' \
                .format(output_name=output_name,
                        similarity=similarity,
                        sqnr=sqnr,
                        pixel_accuracy=pixel_accuracy)
            with open(log_file, "a") as f:
                f.write(summary)
        elif similarity > validation_threshold:
            util.MaceLogger.summary(
                util.StringFormatter.block("Similarity Test Passed"))
        else:
            # BUG FIX: MaceLogger.error takes a single message argument;
            # the extra leading "" argument raised a TypeError instead of
            # reporting the failure.
            util.MaceLogger.error(
                util.StringFormatter.block("Similarity Test Failed"))
    else:
        # BUG FIX: same extra-argument problem as above.
        util.MaceLogger.error(
            util.StringFormatter.block(
                "Similarity Test failed because of empty output"))
def normalize_tf_tensor_name(name):
    """Ensure a TensorFlow tensor name carries an output index.

    Bare op names get the default ':0' suffix; names that already contain
    ':' are returned unchanged.
    """
    return name if ':' in name else name + ':0'
def validate_with_file(output_names, output_shapes,
                       mace_out_file, validation_outputs_data,
                       validation_threshold, log_file):
    # Validate each MACE output against precomputed reference data files
    # (one entry per output; local paths or http(s) URLs, which are
    # downloaded next to mace_out_file first).
    for i in range(len(output_names)):
        if validation_outputs_data[i].startswith("http://") or \
                validation_outputs_data[i].startswith("https://"):
            validation_file_name = util.formatted_file_name(
                mace_out_file, output_names[i] + '_validation')
            six.moves.urllib.request.urlretrieve(validation_outputs_data[i],
                                                 validation_file_name)
        else:
            validation_file_name = validation_outputs_data[i]
        value = load_data(validation_file_name)
        out_shape = output_shapes[i]
        if len(out_shape) == 4:
            # Reference data for 4-D outputs appears to be stored
            # channel-first; reorder the shape and transpose back to NHWC
            # before comparing. NOTE(review): confirm the stored layout —
            # this mutates output_shapes[i] in place as a side effect.
            out_shape[1], out_shape[2], out_shape[3] = \
                out_shape[3], out_shape[1], out_shape[2]
            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
        output_file_name = util.formatted_file_name(
            mace_out_file, output_names[i])
        mace_out_value = load_data(output_file_name)
        compare_output(output_names[i], mace_out_value,
                       value, validation_threshold, log_file)
def validate_tf_model(model_file,
                      input_file, mace_out_file,
                      input_names, input_shapes, input_data_formats,
                      output_names, output_shapes, output_data_formats,
                      validation_threshold, input_data_types, log_file):
    # Re-run the frozen TensorFlow graph on the same inputs and compare its
    # outputs against the MACE output dumps via compare_output.
    import tensorflow as tf
    if not os.path.isfile(model_file):
        # NOTE(review): MaceLogger.error takes a single message argument;
        # passing VALIDATION_MODULE as a separate first argument looks like
        # it would raise a TypeError at runtime — confirm.
        util.MaceLogger.error(
            VALIDATION_MODULE,
            "Input graph file '" + model_file + "' does not exist!")

    tf.reset_default_graph()
    input_graph_def = tf.GraphDef()
    with open(model_file, "rb") as f:
        data = f.read()
        input_graph_def.ParseFromString(data)
        tf.import_graph_def(input_graph_def, name="")

    with tf.Session() as session:
        with session.graph.as_default() as graph:
            tf.import_graph_def(input_graph_def, name="")
            input_dict = {}
            for i in range(len(input_names)):
                input_value = load_data(
                    util.formatted_file_name(input_file, input_names[i]),
                    input_data_types[i])
                input_value = input_value.reshape(input_shapes[i])
                # TF expects NHWC activations / HWIO filters; convert 4-D
                # inputs that were dumped in other layouts.
                if input_data_formats[i] == DataFormat.NCHW and \
                        len(input_shapes[i]) == 4:
                    input_value = input_value.transpose((0, 2, 3, 1))
                elif input_data_formats[i] == DataFormat.OIHW and \
                        len(input_shapes[i]) == 4:
                    # OIHW -> HWIO
                    input_value = input_value.transpose((2, 3, 1, 0))
                input_node = graph.get_tensor_by_name(
                    normalize_tf_tensor_name(input_names[i]))
                input_dict[input_node] = input_value

            output_nodes = []
            for name in output_names:
                output_nodes.extend(
                    [graph.get_tensor_by_name(
                        normalize_tf_tensor_name(name))])
            output_values = session.run(output_nodes, feed_dict=input_dict)

            for i in range(len(output_names)):
                output_file_name = util.formatted_file_name(
                    mace_out_file, output_names[i])
                mace_out_value = load_data(output_file_name)
                # MACE dumps declared as NCHW are converted to NHWC to
                # match TF's outputs before comparison.
                if output_data_formats[i] == DataFormat.NCHW and \
                        len(output_shapes[i]) == 4:
                    mace_out_value = mace_out_value. \
                        reshape(output_shapes[i]).transpose((0, 2, 3, 1))
                compare_output(output_names[i],
                               mace_out_value, output_values[i],
                               validation_threshold, log_file)
def validate_caffe_model(model_file, input_file,
                         mace_out_file, weight_file,
                         input_names, input_shapes, input_data_formats,
                         output_names, output_shapes, output_data_formats,
                         validation_threshold, log_file):
    # Re-run the Caffe model (prototxt + caffemodel) on CPU and compare its
    # outputs against the MACE output dumps via compare_output.
    os.environ['GLOG_minloglevel'] = '1'  # suppress Caffe verbose prints
    import caffe
    if not os.path.isfile(model_file):
        # NOTE(review): MaceLogger.error takes a single message argument;
        # the extra VALIDATION_MODULE argument looks like a runtime
        # TypeError — confirm.
        util.MaceLogger.error(
            VALIDATION_MODULE,
            "Input graph file '" + model_file + "' does not exist!")
    if not os.path.isfile(weight_file):
        util.MaceLogger.error(
            VALIDATION_MODULE,
            "Input weight file '" + weight_file + "' does not exist!")

    caffe.set_mode_cpu()

    net = caffe.Net(model_file, caffe.TEST, weights=weight_file)

    for i in range(len(input_names)):
        input_value = load_data(
            util.formatted_file_name(input_file, input_names[i]))
        input_value = input_value.reshape(input_shapes[i])
        # Caffe is channel-first: convert 4-D NHWC dumps to NCHW.
        if input_data_formats[i] == DataFormat.NHWC and \
                len(input_shapes[i]) == 4:
            input_value = input_value.transpose((0, 3, 1, 2))
        input_blob_name = input_names[i]
        try:
            # An input may be addressed by its layer name; use the layer's
            # first top blob in that case.
            if input_names[i] in net.top_names:
                input_blob_name = net.top_names[input_names[i]][0]
        except ValueError:
            pass
        new_shape = input_value.shape
        net.blobs[input_blob_name].reshape(*new_shape)
        for index in range(input_value.shape[0]):
            net.blobs[input_blob_name].data[index] = input_value[index]

    net.forward()

    for i in range(len(output_names)):
        value = net.blobs[output_names[i]].data
        output_file_name = util.formatted_file_name(
            mace_out_file, output_names[i])
        mace_out_value = load_data(output_file_name)
        # Convert MACE's NHWC dumps to NCHW to match Caffe's layout.
        if output_data_formats[i] == DataFormat.NHWC and \
                len(output_shapes[i]) == 4:
            mace_out_value = mace_out_value.reshape(output_shapes[i]) \
                .transpose((0, 3, 1, 2))
        compare_output(output_names[i], mace_out_value,
                       value, validation_threshold, log_file)
def validate_onnx_model(model_file,
                        input_file, mace_out_file,
                        input_names, input_shapes, input_data_formats,
                        output_names, output_shapes, output_data_formats,
                        validation_threshold, input_data_types,
                        backend, log_file):
    # Re-run the ONNX model through the selected backend (tensorflow or
    # caffe2/pytorch) and compare outputs against MACE's dumps.
    import onnx
    if backend == "tensorflow":
        from onnx_tf.backend import prepare
        # NOTE(review): typos "valivate"/"framwork" below are runtime
        # strings and intentionally left untouched here.
        print("valivate on onnx tensorflow backend.")
    elif backend == "caffe2" or backend == "pytorch":
        from caffe2.python.onnx.backend import prepare
        print("valivate on onnx caffe2 backend.")
    else:
        # NOTE(review): MaceLogger.error takes a single message argument;
        # the extra VALIDATION_MODULE argument looks like a runtime
        # TypeError — confirm.
        util.MaceLogger.error(
            VALIDATION_MODULE,
            "onnx backend framwork '" + backend + "' is invalid.")
    if not os.path.isfile(model_file):
        util.MaceLogger.error(
            VALIDATION_MODULE,
            "Input graph file '" + model_file + "' does not exist!")
    model = onnx.load(model_file)
    input_dict = {}
    for i in range(len(input_names)):
        input_value = load_data(util.formatted_file_name(input_file,
                                                         input_names[i]),
                                input_data_types[i])
        input_value = input_value.reshape(input_shapes[i])
        # ONNX is channel-first: convert 4-D NHWC dumps to NCHW.
        if input_data_formats[i] == DataFormat.NHWC and \
                len(input_shapes[i]) == 4:
            input_value = input_value.transpose((0, 3, 1, 2))
        input_dict[input_names[i]] = input_value
    onnx_outputs = []
    for i in range(len(output_names)):
        # Copy the shape so the channel reorder does not mutate the
        # caller's output_shapes.
        out_shape = output_shapes[i][:]
        if output_data_formats[i] == DataFormat.NHWC and \
                len(out_shape) == 4:
            out_shape[1], out_shape[2], out_shape[3] = \
                out_shape[3], out_shape[1], out_shape[2]
        onnx_outputs.append(
            onnx.helper.make_tensor_value_info(output_names[i],
                                               onnx.TensorProto.FLOAT,
                                               out_shape))
    # Register the requested tensors as graph outputs so the backend
    # returns them.
    model.graph.output.extend(onnx_outputs)
    rep = prepare(model)
    output_values = rep.run(input_dict)

    for i in range(len(output_names)):
        out_name = output_names[i]
        value = output_values[out_name].flatten()
        output_file_name = util.formatted_file_name(mace_out_file,
                                                    output_names[i])
        mace_out_value = load_data(output_file_name)
        # Convert MACE's NHWC dumps to NCHW to match the backend's layout.
        if output_data_formats[i] == DataFormat.NHWC and \
                len(output_shapes[i]) == 4:
            mace_out_value = mace_out_value.reshape(output_shapes[i]) \
                .transpose((0, 3, 1, 2))
        compare_output(output_names[i],
                       mace_out_value, value,
                       validation_threshold, log_file)
def validate(platform, model_file, weight_file, input_file, mace_out_file,
             input_shape, output_shape, input_data_format,
             output_data_format, input_node, output_node,
             validation_threshold, input_data_type, backend,
             validation_outputs_data, log_file):
    """Validate MACE outputs for a converted model.

    `validation_outputs_data` may be a list of reference-data sources, a
    single file path, or empty/None. When reference data is available the
    outputs are compared against it directly; otherwise the original
    framework (TensorFlow / Caffe / ONNX) is re-run to produce references.
    """
    # ROBUSTNESS FIX: guard before os.path.isfile — the original passed
    # validation_outputs_data straight in, and os.path.isfile(None) raises
    # TypeError when no reference data is configured.
    if isinstance(validation_outputs_data, list):
        validation_outputs = validation_outputs_data
    elif validation_outputs_data and os.path.isfile(validation_outputs_data):
        validation_outputs = [validation_outputs_data]
    else:
        validation_outputs = []

    if validation_outputs:
        validate_with_file(output_node, output_shape,
                           mace_out_file, validation_outputs,
                           validation_threshold, log_file)
    elif platform == Platform.TENSORFLOW:
        validate_tf_model(model_file, input_file, mace_out_file,
                          input_node, input_shape, input_data_format,
                          output_node, output_shape, output_data_format,
                          validation_threshold, input_data_type,
                          log_file)
    elif platform == Platform.CAFFE:
        validate_caffe_model(model_file,
                             input_file, mace_out_file, weight_file,
                             input_node, input_shape, input_data_format,
                             output_node, output_shape, output_data_format,
                             validation_threshold, log_file)
    elif platform == Platform.ONNX:
        validate_onnx_model(model_file,
                            input_file, mace_out_file,
                            input_node, input_shape, input_data_format,
                            output_node, output_shape, output_data_format,
                            validation_threshold,
                            input_data_type, backend, log_file)
......@@ -90,5 +90,5 @@ class ModelVisualizer(object):
def save_html(self):
html = self.render_html()
with open(self._output_file, "wb") as f:
with open(self._output_file, "w") as f:
f.write(html)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册