Commit 9d70dd4d authored by jiweibo

Merge branch 'develop' into add_matmul_op

......@@ -3,7 +3,8 @@ repos:
sha: v1.0.1
hooks:
- id: remove-crlf
files: (?!.*third_party)^.*$ | (?!.*book)^.*$ ^mobile/ ^metal/ ^web/
files: (?!.*third_party)^.*$|(?!.*book)^.*$
exclude: ^(mobile/|metal/|web/)
#- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
#sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
#hooks:
......@@ -16,7 +17,7 @@ repos:
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
files: (?!.*third_party)^.*$|(?!.*book)^.*$
- id: end-of-file-fixer
- repo: local
hooks:
......@@ -25,7 +26,8 @@ repos:
description: Format files with ClangFormat.
entry: bash ./tools/codestyle/clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ ^mobile/ ^metal/ ^web/
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
exclude: ^(mobile/|metal/|web/)
- repo: local
hooks:
- id: cpplint-cpp-source
......@@ -33,7 +35,8 @@ repos:
description: Check C++ code style using cpplint.py.
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ ^mobile/ ^metal/ ^web/
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
exclude: ^(mobile/|metal/|web/)
#- repo: local
#hooks:
#- id: pylint-doc-string
......@@ -48,5 +51,6 @@ repos:
name: copyright_checker
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ ^mobile/ ^metal/ ^web/
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$|(?!.*book)^.*$|^(mobile/|metal/|web/)
......@@ -9,18 +9,17 @@ os:
addons:
apt:
packages:
- git
- python
- python-pip
- python2.7-dev
- libc6-i386
- curl
compiler:
- clang
# - git
# - python
# - python-pip
# - python2.7-dev
# - libc6-i386
# - curl
- clang-format-3.8
before_install:
- sudo pip install -U virtualenv pre-commit pip
- sudo pip install cpplint pre-commit
- sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format
# Download and install recent cmake
script:
......
......@@ -11,6 +11,8 @@ cd `dirname $0`
cd ..
export PATH=/usr/bin:$PATH
pre-commit install
which clang-format
clang-format --version
if ! pre-commit run -a ; then
ls -lh
......
......@@ -80,6 +80,8 @@ option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
# publish options
option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
......@@ -93,7 +95,7 @@ endif()
# check options
if (LITE_ON_TINY_PUBLISH)
if (NOT (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_JAVA AND NOT WITH_TESTING))
if (NOT (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT WITH_TESTING)) # LITE_WITH_JAVA AND
message(FATAL_ERROR "LITE_ON_TINY_PUBLISH=ON must be used with WITH_LITE=ON LITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON WITH_TESTING=OFF")
return()
endif()
......
......@@ -127,6 +127,7 @@ elseif(ARM_TARGET_OS STREQUAL "ios64")
else()
return()
endif()
add_definitions(-DTARGET_IOS)
# if do not specify the ARM_TARGET_ARCH_ABI then use default all supported
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7"
......
......@@ -57,6 +57,8 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
foreach(var ${lite_deps_HVY_DEPS})
set(deps ${deps} ${var})
......@@ -182,9 +184,16 @@ function(lite_cc_test TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
ARGS
COMPILE_LEVEL # (basic|extra)
)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (args_COMPILE_LEVEL STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
MESSAGE(STATUS "Ignore test ${TARGET} due to compile level ${args_COMPILE_LEVEL}")
return()
endif()
set(deps "")
lite_deps(deps
DEPS ${args_DEPS}
......@@ -207,6 +216,117 @@ function(lite_cc_test TARGET)
endif()
endfunction()
set(arm_kernels CACHE INTERNAL "arm kernels")
set(x86_kernels CACHE INTERNAL "x86 kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# add a kernel for a specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM)
return()
endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU)
return()
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA)
return()
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
return()
endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
endif()
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
)
endfunction()
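For readers of this function: each source file passed to add_kernel is appended to kernels_src_list.txt, which the header-generation script scans for registration macros. A minimal, hedged sketch of what such a kernel source is expected to contain (the op name, kernel class, and bindings here are hypothetical, not part of this commit):

// Hypothetical kernel registration carried by an add_kernel-listed source;
// parse_kernel_registry.py turns these into USE_LITE_KERNEL lines.
REGISTER_LITE_KERNEL(relu, kARM, kFloat, kNCHW,
                     paddle::lite::kernels::arm::ReluCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();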
set(ops CACHE INTERNAL "ops")
set(ops_src_list "${CMAKE_BINARY_DIR}/ops_src_list.txt")
file(WRITE ${ops_src_list} "") # clean
# add an operator
# level: one of (basic, extra)
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
set(ops "${ops};${TARGET}" CACHE INTERNAL "source")
foreach(src ${args_SRCS})
file(APPEND ${ops_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
)
endfunction()
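Likewise, each add_operator source is appended to ops_src_list.txt and is expected to carry an operator registration that the paddle_use_ops.h generator picks up; a hedged sketch (the op and class names are hypothetical):

// Hypothetical operator registration scanned by parse_op_registry.py.
REGISTER_LITE_OP(matmul, paddle::lite::operators::MatMulOpLite);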
# Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target)
......
......@@ -32,7 +32,11 @@ ELSE(WIN32)
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
ENDIF()
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
IF(ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"
OR ARM_TARGET_OS STREQUAL "ios" OR ARM_TARGET_OS STREQUAL "ios64")
ELSE()
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
ENDIF()
ELSE(APPLE)
IF(EXISTS "/etc/issue")
......
......@@ -13,7 +13,6 @@ set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
add_subdirectory(utils)
add_subdirectory(operators)
add_subdirectory(kernels)
......@@ -78,14 +77,16 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
)
add_dependencies(publish_inference_cxx_lib model_optimize_tool)
add_dependencies(publish_inference_cxx_lib paddle_code_generator)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
if(NOT IOS)
add_dependencies(publish_inference_cxx_lib model_optimize_tool)
add_dependencies(publish_inference_cxx_lib paddle_code_generator)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
endif()
endif()
......
......@@ -17,6 +17,7 @@ if(LITE_WITH_FPGA)
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
......@@ -117,7 +118,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
add_dependencies(test_mobilenetv1 extern_lite_download_mobilenet_v1_tar_gz)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......@@ -125,7 +126,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
--model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu SERIAL)
add_dependencies(test_mobilenetv2 extern_lite_download_mobilenet_v2_relu_tar_gz)
set_target_properties(test_mobilenetv2 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
lite_cc_test(test_resnet50 SRCS resnet50_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......@@ -145,8 +146,13 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
# CL_DEPS ${opencl_kernels}
# FPGA_DEPS ${fpga_kernels})
endif()
# These tests needs CLI arguments, and is not supported in ARM CI.
......@@ -169,7 +175,11 @@ lite_cc_library(paddle_api SRCS paddle_api.cc DEPS op_params tensor)
#-----------------------------------------------------------------------------------------------------
# The final inference library for both CxxConfig and MobileConfig.
lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api)
if (LITE_ON_TINY_PUBLISH)
lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api stream)
else()
lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api)
endif()
if (NOT LITE_ON_TINY_PUBLISH)
lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api light_api
${ops}
......
......@@ -21,6 +21,8 @@
#ifndef LITE_WITH_FPGA
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
#else
USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
......
......@@ -73,9 +73,12 @@ USE_LITE_OP(prior_box)
USE_LITE_OP(density_prior_box)
USE_LITE_OP(reshape)
USE_LITE_OP(reshape2)
USE_LITE_OP(flatten)
USE_LITE_OP(flatten2)
USE_LITE_OP(split)
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(fake_quantize_range_abs_max);
USE_LITE_OP(calib);
USE_LITE_OP(calib_once);
USE_LITE_OP(norm);
......
......@@ -20,7 +20,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
else()
add_library(paddle_lite_jni SHARED "")
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
#add_dependencies(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
endif()
if (APPLE)
......
......@@ -30,6 +30,9 @@ DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_string(result_filename, "", "save test result");
DEFINE_bool(run_model_optimize,
false,
"apply model_optimize_tool to the model and test with the optimized model");
namespace paddle {
namespace lite_api {
......@@ -69,10 +72,10 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Init();
if (thread_num == 1) {
lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
lite::DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
LOG(INFO) << "LITE_POWER_HIGH";
} else {
lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_NO_BIND, thread_num);
lite::DeviceInfo::Global().SetRunMode(LITE_POWER_NO_BIND, thread_num);
LOG(INFO) << "LITE_POWER_NO_BIND";
}
#endif
......@@ -172,13 +175,17 @@ int main(int argc, char** argv) {
}
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shapes,
save_optimized_model_dir,
run_model_dir,
FLAGS_repeats,
FLAGS_threads,
FLAGS_warmup,
......
......@@ -71,6 +71,13 @@ const lite::Tensor *Predictor::GetOutput(size_t offset) const {
return &fetch_list.at(offset);
}
const std::vector<lite::Tensor> *Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
return &fetch_list;
}
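A minimal usage sketch for the new GetOutputs() accessor, assuming a built predictor (the model path, preferred_place, and valid_places below are placeholders):

// Hedged sketch: fetch all outputs at once instead of per-offset GetOutput().
lite::Predictor predictor;
predictor.Build("./mobilenet_v1", preferred_place, valid_places);  // hypothetical path
predictor.Run();
const std::vector<lite::Tensor>* outputs = predictor.GetOutputs();
for (const auto& t : *outputs) {
  LOG(INFO) << "output dims: " << t.dims();
}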
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
......
......@@ -69,6 +69,7 @@ class LITE_API Predictor {
// Get offset-th col of fetch results.
const lite::Tensor* GetOutput(size_t offset) const;
const std::vector<lite::Tensor>* GetOutputs() const;
const cpp::ProgramDesc& program_desc() const;
const lite::Tensor* GetTensor(const std::string& name) const;
......
......@@ -28,7 +28,7 @@ namespace lite {
void TestModel(const std::vector<Place> &valid_places,
const Place &preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -28,7 +28,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -40,6 +40,10 @@ class LightPredictorImpl : public PaddlePredictor {
void LightPredictorImpl::Init(const MobileConfig& config) {
// LightPredictor only supports the NaiveBuffer backend in the published lib
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Init();
lite::DeviceInfo::Global().SetRunMode(config.power_mode(), config.threads());
#endif
raw_predictor_.reset(new lite::LightPredictor(config.model_dir(),
LiteModelType::kNaiveBuffer));
}
......
......@@ -29,7 +29,7 @@ void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -29,7 +29,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -33,7 +33,7 @@ void TestModel(const std::vector<Place>& valid_places,
bool gen_npu = false,
bool save_model = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(model_dir, preferred_place, valid_places);
......
......@@ -29,7 +29,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -34,7 +34,7 @@ void TestModel(const std::vector<Place>& valid_places,
bool gen_npu = false,
bool save_model = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(model_dir, preferred_place, valid_places);
......
......@@ -33,7 +33,7 @@ DEFINE_string(valid_targets,
"arm",
"The targets this model optimized for, should be one of (arm, "
"opencl, x86), splitted by space");
DEFINE_bool(int8_mode, false, "Support Int8 quantitative mode");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
namespace paddle {
namespace lite_api {
......@@ -62,7 +62,7 @@ void Main() {
CHECK(!valid_places.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
if (FLAGS_int8_mode) {
if (FLAGS_prefer_int8_kernel) {
LOG(WARNING) << "Int8 mode is only support by ARM target";
valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)});
config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)});
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
TEST(model, test) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kInt8)}});
auto precision = PRECISION(kFloat);
if (FLAGS_int8) {
precision = PRECISION(kInt8);
}
predictor.Build(
FLAGS_model_dir, Place{TARGET(kARM), precision}, valid_places);
int im_width = FLAGS_im_width;
int im_height = FLAGS_im_height;
auto* input_tensor = predictor.GetInput(0);
auto in_dims = input_tensor->dims();
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, im_width, im_height})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
auto* output_tensors = predictor.GetOutputs();
LOG(INFO) << "======output:========";
for (const auto& t : *output_tensors) {
LOG(INFO) << t;
}
LOG(INFO)
<< "=====RUN_finished!!============= Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
#endif
}
} // namespace lite
} // namespace paddle
......@@ -64,7 +64,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const int warmup_times = 0) {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Init();
lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
lite::DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, thread_num);
#endif
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
......
......@@ -29,7 +29,7 @@ void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -110,7 +110,18 @@ class LITE_API CxxConfig : public ConfigBase {
/// MobileConfig is the config for the lightweight predictor; it skips
/// IR optimization and other unnecessary stages.
class LITE_API MobileConfig : public ConfigBase {};
class LITE_API MobileConfig : public ConfigBase {
PowerMode mode_{LITE_POWER_HIGH};
int threads_{1};
public:
MobileConfig(Place preferred_place = Place(TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)),
             PowerMode mode = LITE_POWER_HIGH,
             int threads = 1)
    : mode_(mode), threads_(threads) {}
void set_power_mode(PowerMode mode) { mode_ = mode; }
void set_threads(int threads) { threads_ = threads; }
PowerMode power_mode() const { return mode_; }
int threads() const { return threads_; }
};
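A short usage sketch, assuming only the accessors shown above (the model directory is a placeholder):

// Hedged sketch: configure power mode and threads via the extended MobileConfig.
lite_api::MobileConfig config;
config.set_model_dir("./mobilenet_v1_opt");        // hypothetical path
config.set_power_mode(lite_api::LITE_POWER_HIGH);  // prefer big cores
config.set_threads(2);
auto predictor = lite_api::CreatePaddlePredictor<lite_api::MobileConfig>(config);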
template <typename ConfigT>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
......
......@@ -70,6 +70,14 @@ enum class DataLayoutType : int {
kAny = 2, // any data layout
NUM = 4, // number of fields.
};
typedef enum {
LITE_POWER_HIGH = 0,
LITE_POWER_LOW = 1,
LITE_POWER_FULL = 2,
LITE_POWER_NO_BIND = 3,
LITE_POWER_RAND_HIGH = 4,
LITE_POWER_RAND_LOW = 5
} PowerMode;
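Moving PowerMode into the public lite_api header (the old copy is deleted from device_info.h further down) is why call sites elsewhere in this diff now qualify the enum, e.g.:

DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_NO_BIND, 1);  // single thread, no core binding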
enum class ActivationType : int {
kIndentity = 0,
......
......@@ -28,7 +28,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(ResNet18, test) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -29,7 +29,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -28,7 +28,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -23,6 +23,9 @@ DEFINE_string(model_dir, "", "model dir");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(im_width, 224, "image width");
DEFINE_int32(im_height, 224, "image height");
DEFINE_bool(int8, false, "is run int8");
namespace paddle {
namespace lite {
......
......@@ -28,7 +28,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(unet, test) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -65,7 +65,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
conv_direct.cc
conv_depthwise_3x3_int7.cc
conv_depthwise_3x3_int8.cc
conv_depthwise_5x5s1_int8.cc
conv_depthwise_3x3p0.cc
......
This diff is collapsed.
......@@ -51,7 +51,7 @@ void density_prior_box(const lite::Tensor* input,
const std::vector<float>& min_size_,
const std::vector<float>& fixed_size_,
const std::vector<float>& fixed_ratio_,
const std::vector<float>& density_size_,
const std::vector<int>& density_size_,
const std::vector<float>& max_size_,
const std::vector<float>& aspect_ratio_,
const std::vector<float>& variance_,
......@@ -82,14 +82,12 @@ void density_prior_box(const lite::Tensor* input,
img_width = image->dims()[3];
img_height = image->dims()[2];
}
float step_w = step_w_;
float step_h = step_h_;
if (step_w == 0 || step_h == 0) {
step_w = static_cast<float>(img_width) / width;
step_h = static_cast<float>(img_height) / height;
}
float offset = offset_;
int step_average = static_cast<int>((step_w + step_h) * 0.5); // add
int channel_size = height * width * prior_num_ * 4;
......@@ -343,7 +341,7 @@ void prior_box(const lite::Tensor* input,
min_size,
std::vector<float>(),
std::vector<float>(),
std::vector<float>(),
std::vector<int>(),
max_size,
aspect_ratio,
variance,
......
......@@ -30,7 +30,7 @@ void density_prior_box(const lite::Tensor* input,
const std::vector<float>& min_size_,
const std::vector<float>& fixed_size_,
const std::vector<float>& fixed_ratio_,
const std::vector<float>& density_size_,
const std::vector<int>& density_size_,
const std::vector<float>& max_size_,
const std::vector<float>& aspect_ratio_,
const std::vector<float>& variance_,
......
......@@ -37,9 +37,36 @@ lite_cc_library(context SRCS context.cc DEPS tensor any cpu_info CL_DEPS cl_cont
else()
lite_cc_library(context SRCS context.cc DEPS tensor any cpu_info eigen3 CL_DEPS cl_context gflags)
endif()
lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor)
#----------------------------------------------- NOT CHANGE -----------------------------------------------
# A trick to generate the paddle_use_kernels.h
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py
${kernels_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
)
# A trick to generate the paddle_use_ops.h
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py
${ops_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
)
add_custom_target(op_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h)
add_custom_target(kernel_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h)
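For context: these custom targets rebuild the aggregate headers from the source lists that add_kernel/add_operator append to, and the generated files consist of one use-macro per registered op or kernel. A sketch of what paddle_use_ops.h is expected to look like (op names taken from elsewhere in this diff):

// paddle_use_ops.h (generated; do not edit by hand)
USE_LITE_OP(reshape);
USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);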
#----------------------------------------------- NOT CHANGE -----------------------------------------------
lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor
)
lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel
cpp_op_desc tensor)
cpp_op_desc tensor
)
add_dependencies(kernel kernel_list_h)
add_dependencies(op op_list_h)
lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper)
lite_cc_library(program SRCS program.cc
......@@ -73,3 +100,17 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils)
lite_cc_test(test_types SRCS types_test.cc DEPS types)
lite_cc_test(test_memory SRCS memory_test.cc DEPS memory)
lite_cc_test(test_context SRCS context_test.cc DEPS context)
# # A trick to generate the paddle_use_kernels.h
# execute_process(
# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py
# ${kernels_src_list}
# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
# )
# # A trick to generate the paddle_use_ops.h
# execute_process(
# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py
# ${ops_src_list}
# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
# )
......@@ -101,7 +101,7 @@ class Context<TargetType::kARM> {
void CopySharedTo(ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads) {
void SetRunMode(lite_api::PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads);
}
void SetCache(int l1size, int l2size, int l3size) {
......@@ -109,7 +109,7 @@ class Context<TargetType::kARM> {
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const { return DeviceInfo::Global().mode(); }
lite_api::PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
......
......@@ -119,7 +119,8 @@ size_t get_mem_size() {
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
printf("not implemented, set to default 4GB\n");
return 4096 * 1024;
#endif
return 0;
}
......@@ -209,7 +210,7 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
}
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->at(i) = APPLE;
archs->at(i) = kAPPLE;
}
#endif
}
......@@ -818,7 +819,7 @@ void DeviceInfo::RequestPowerFullMode(int thread_num) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
mode_ = lite_api::PowerMode::LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(int thread_num) {
......@@ -826,7 +827,7 @@ void DeviceInfo::RequestPowerHighMode(int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
......@@ -838,7 +839,7 @@ void DeviceInfo::RequestPowerHighMode(int thread_num) {
}
}
} else {
mode_ = LITE_POWER_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
......@@ -855,7 +856,7 @@ void DeviceInfo::RequestPowerLowMode(int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
......@@ -867,7 +868,7 @@ void DeviceInfo::RequestPowerLowMode(int thread_num) {
}
}
} else {
mode_ = LITE_POWER_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
......@@ -893,7 +894,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
}
}
}
mode_ = LITE_POWER_NO_BIND;
mode_ = lite_api::PowerMode::LITE_POWER_NO_BIND;
}
void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
......@@ -901,7 +902,7 @@ void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_RAND_HIGH;
if (thread_num > big_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
......@@ -913,7 +914,7 @@ void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
}
}
} else {
mode_ = LITE_POWER_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_LOW;
LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
......@@ -930,7 +931,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
......@@ -943,7 +944,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
}
}
} else {
mode_ = LITE_POWER_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
......@@ -957,6 +958,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
int DeviceInfo::Setup() {
core_num_ = get_cpu_num();
printf("core number: %d\n", core_num_);
mem_size_ = get_mem_size();
get_cpu_arch(&archs_, core_num_);
// set default CPU info
......@@ -966,10 +968,10 @@ int DeviceInfo::Setup() {
SetFP32Info(1, 1);
SetFP16Info(1, 0);
SetDotInfo(1, 0);
#ifdef LITE_WITH_LINUX
// get max&min freq
max_freqs_.resize(core_num_);
min_freqs_.resize(core_num_);
#ifdef LITE_WITH_LINUX
// get max&min freq
for (int i = 0; i < core_num_; ++i) {
int max_freq, min_freq;
get_cpu_max_min_freq(i, &max_freq, &min_freq);
......@@ -981,6 +983,30 @@ int DeviceInfo::Setup() {
if (!SetCPUInfoByName()) {
SetCPUInfoByProb();
}
core_ids_.resize(core_num_);
cluster_ids_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
max_freqs_[i] = 1000000;
min_freqs_[i] = 1000000;
cluster_ids_[i] = 0;
}
#else
#ifdef TARGET_IOS
dev_name_ = "Apple";
#else
dev_name_ = "Unknown";
#endif
core_ids_.resize(core_num_);
cluster_ids_.resize(core_num_);
big_core_ids_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
max_freqs_[i] = 1000000;
min_freqs_[i] = 1000000;
cluster_ids_[i] = 0;
core_ids_[i] = i;
big_core_ids_[i] = i;
}
#endif
// output info
LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
LOG(INFO) << "ARM multiprocessors number: " << core_num_;
......@@ -1004,13 +1030,12 @@ int DeviceInfo::Setup() {
LOG(INFO) << L3_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
// set default run mode
SetRunMode(LITE_POWER_NO_BIND, 1); // use single thread by default
SetRunMode(lite_api::PowerMode::LITE_POWER_NO_BIND, 1); // use single thread by default
return 0;
}
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
#else
......@@ -1024,22 +1049,22 @@ void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
count_++;
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
case lite_api::LITE_POWER_FULL:
RequestPowerFullMode(thread_num);
break;
case LITE_POWER_HIGH:
case lite_api::LITE_POWER_HIGH:
RequestPowerHighMode(thread_num);
break;
case LITE_POWER_LOW:
case lite_api::LITE_POWER_LOW:
RequestPowerLowMode(thread_num);
break;
case LITE_POWER_NO_BIND:
case lite_api::LITE_POWER_NO_BIND:
RequestPowerNoBindMode(thread_num);
break;
case LITE_POWER_RAND_HIGH:
case lite_api::LITE_POWER_RAND_HIGH:
RequestPowerRandHighMode(shift_num, thread_num);
break;
case LITE_POWER_RAND_LOW:
case lite_api::LITE_POWER_RAND_LOW:
RequestPowerRandLowMode(shift_num, thread_num);
break;
default:
......@@ -1052,12 +1077,12 @@ void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
omp_set_num_threads(active_ids_.size());
#endif
if (mode_ != LITE_POWER_NO_BIND) {
if (mode_ != lite_api::LITE_POWER_NO_BIND) {
if (check_cpu_online(active_ids_)) {
bind_threads(active_ids_);
} else {
LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
mode_ = LITE_POWER_NO_BIND;
mode_ = lite_api::LITE_POWER_NO_BIND;
}
}
#else // LITE_WITH_LINUX
......@@ -1080,7 +1105,7 @@ void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
workspace_.Resize({2 * (l1size + l2size)});
}
bool DeviceInfo::ExtendWorkspace(size_t size) {
bool DeviceInfo::ExtendWorkspace(int size) {
workspace_.Resize({size + llc_size()});
workspace_.mutable_data<int8_t>();
return true;
......
......@@ -25,15 +25,6 @@ namespace lite {
#ifdef LITE_WITH_ARM
typedef enum {
LITE_POWER_HIGH = 0,
LITE_POWER_LOW = 1,
LITE_POWER_FULL = 2,
LITE_POWER_NO_BIND = 3,
LITE_POWER_RAND_HIGH = 4,
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum {
kAPPLE = 0,
kA53 = 53,
......@@ -60,11 +51,11 @@ class DeviceInfo {
int Setup();
void SetRunMode(PowerMode mode, int thread_num);
void SetRunMode(lite_api::PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
PowerMode mode() const { return mode_; }
lite_api::PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
......@@ -82,7 +73,7 @@ class DeviceInfo {
T* workspace_data() {
return reinterpret_cast<T*>(workspace_.mutable_data<int8_t>());
}
bool ExtendWorkspace(size_t size);
bool ExtendWorkspace(int size);
private:
int core_num_;
......@@ -107,7 +98,7 @@ class DeviceInfo {
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
PowerMode mode_;
lite_api::PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
......
......@@ -37,7 +37,7 @@ namespace lite {
namespace mir {
namespace subgraph {
void GenerateNPUProgramPass::NPUSortHelper(
void GenerateNPUProgramPass::SubgraphSortHelper(
Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
......@@ -46,7 +46,7 @@ void GenerateNPUProgramPass::NPUSortHelper(
if (var_node->inlinks.empty()) continue;
auto* op_node = var_node->inlinks.front();
if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
NPUSortHelper(op_node, nodes_all, visited_nodes, ret);
SubgraphSortHelper(op_node, nodes_all, visited_nodes, ret);
}
}
ret->push_back(node);
......@@ -55,40 +55,68 @@ void GenerateNPUProgramPass::NPUSortHelper(
void GenerateNPUProgramPass::CvtOpNodes(
const std::vector<Node*>& nodes2cvt,
std::vector<std::string>* in_vars_name,
std::vector<std::string>* out_vars_name,
lite::npu::bridge::node_map_type* cvted_vars,
std::unordered_set<const Node*>* nodes2rm) {
lite::npu::bridge::node_map_type* cvted_vars) {
const auto& bridges = lite::npu::bridge::Factory::Instance();
const auto& cvtfunc_map = bridges.AllFunctions();
// record all converted vars
// op node's inputs must be found in cvted_vars
for (auto& node : nodes2cvt) {
lite::npu::bridge::node_map_type node_inputs;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
if (arg.is_weight) continue;
auto var_name = arg.name;
if (!cvted_vars->count(var_name)) {
if (arg.is_weight) continue;
cvted_vars->insert(std::make_pair(
var_name,
lite::npu::bridge::CvtNode(var_node, stmt.op()->scope())));
in_vars_name->push_back(var_name);
}
node_inputs.insert(*cvted_vars->find(var_name));
}
auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs);
cvted_vars->insert(node_outputs.begin(), node_outputs.end());
nodes2rm->insert(node);
for (auto& var_node : node->outlinks) {
for (auto& next_op_node : var_node->outlinks) {
if (std::find(nodes2cvt.begin(), nodes2cvt.end(), next_op_node) ==
nodes2cvt.end()) {
out_vars_name->push_back(var_node->AsArg().name);
break;
}
}
}
void GenerateNPUProgramPass::GetIOVars(
const std::vector<Node*>& nodes2cvt,
const lite::npu::bridge::node_map_type& cvted_vars,
std::unordered_set<const Node*>* nodes2rm,
std::vector<Node*>* in_vars,
std::vector<Node*>* out_vars,
lite::npu::bridge::node_map_type* in_cvted_vars,
lite::npu::bridge::node_map_type* out_cvted_vars) {
std::unordered_set<Node*> op_nodes_all(nodes2cvt.begin(), nodes2cvt.end());
for (auto& op_node : nodes2cvt) {
for (auto& in_var : op_node->inlinks) {
if (in_var->AsArg().is_weight) continue;
auto* pre_op_node = in_var->inlinks.front();
if (op_nodes_all.count(pre_op_node)) {
nodes2rm->insert(in_var);
continue;
}
in_vars->push_back(in_var);
auto arg_name = in_var->AsArg().name;
in_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name)));
}
for (auto& out_var : op_node->outlinks) {
if (out_var->outlinks.empty()) {
nodes2rm->insert(out_var);
continue;
}
auto* next_op_node = out_var->outlinks.front();
if (op_nodes_all.count(next_op_node)) {
nodes2rm->insert(out_var);
continue;
}
out_vars->push_back(out_var);
auto arg_name = out_var->AsArg().name;
out_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name)));
}
}
nodes2rm->insert(nodes2cvt.begin(), nodes2cvt.end());
}
void GenerateNPUProgramPass::GenNPUGraphOpNode(
......@@ -100,23 +128,38 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
for (auto& node : nodes_all) {
if (!node->IsStmt()) continue;
if (visited_nodes.count(node)) continue;
NPUSortHelper(node, nodes_all, &visited_nodes, &ret);
SubgraphSortHelper(node, nodes_all, &visited_nodes, &ret);
}
std::vector<std::string> in_vars_name;
std::vector<std::string> out_vars_name;
lite::npu::bridge::node_map_type cvted_vars;
CvtOpNodes(ret, &cvted_vars);
std::unordered_set<const Node*> nodes2rm;
CvtOpNodes(ret, &in_vars_name, &out_vars_name, &cvted_vars, &nodes2rm);
// insert new graph op node
std::vector<Node*> in_vars;
std::vector<Node*> out_vars;
lite::npu::bridge::node_map_type in_cvted_vars;
lite::npu::bridge::node_map_type out_cvted_vars;
GetIOVars(ret,
cvted_vars,
&nodes2rm,
&in_vars,
&out_vars,
&in_cvted_vars,
&out_cvted_vars);
std::vector<std::string> in_vars_name;
std::vector<std::string> out_vars_name;
std::vector<ge::Operator> inputs;
std::vector<ge::Operator> outputs;
for (auto i : in_vars_name) {
inputs.push_back(*cvted_vars.at(i));
for (auto i : in_cvted_vars) {
in_vars_name.push_back(i.first);
inputs.push_back(*i.second);
}
for (auto i : out_vars_name) {
outputs.push_back(*cvted_vars.at(i));
for (auto i : out_cvted_vars) {
out_vars_name.push_back(i.first);
outputs.push_back(*i.second);
}
std::string model_name("hiai_npu_client_" + std::to_string(sub_id) + ".om");
if (!npu::BuildNPUClient(inputs, outputs, model_name)) {
LOG(FATAL) << "Build NPU failed subgraph " << sub_id;
......@@ -125,27 +168,25 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
cpp::OpDesc op_desc;
op_desc.SetType("graph_op");
std::vector<std::string> in_var_names;
op_desc.SetInput("Inputs", in_vars_name);
op_desc.SetOutput("Outputs", out_vars_name);
op_desc.SetAttr("model_name", model_name);
auto graph_op = LiteOpRegistry::Global().Create("graph_op");
// TODO(zpy): support multi inputs op
auto start_op = ret.front()->AsStmt().op();
auto* scope = start_op->scope();
auto any_op = ret.front()->AsStmt().op();
auto* scope = any_op->scope();
graph_op->Attach(op_desc, scope);
auto valid_places = start_op->valid_places();
auto valid_places = any_op->valid_places();
auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places);
for (auto& var_node : ret.front()->inlinks) {
auto& arg = var_node->AsArg();
if (arg.is_weight) continue;
IR_NODE_LINK_TO(var_node, new_op_node);
for (auto& in_var : in_vars) {
IR_NODE_LINK_TO(in_var, new_op_node);
}
for (auto& var_node : ret.back()->outlinks) {
auto& arg = var_node->AsArg();
if (arg.is_weight) continue;
IR_NODE_LINK_TO(var_node, new_op_node);
for (auto& out_var : out_vars) {
IR_OP_VAR_LINK(new_op_node, out_var);
}
// assign context
......@@ -159,8 +200,10 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
void GenerateNPUProgramPass::ConvertSubgraph(
const std::unique_ptr<SSAGraph>& graph, int sub_num) {
std::unordered_map<int, std::unordered_set<Node*>> nodes_all;
int ops_num = 0;
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
ops_num++;
auto& stmt = item->AsStmt();
int sub_id = stmt.subgraph_id();
if (sub_id < 1) continue;
......@@ -178,6 +221,7 @@ void GenerateNPUProgramPass::ConvertSubgraph(
void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "Before NPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::npu::bridge::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
......@@ -215,5 +259,3 @@ std::unique_ptr<RuntimeProgram> GenerateNPUProgramPass::GenProgram() {
REGISTER_MIR_PASS(generate_npu_program_pass,
paddle::lite::mir::subgraph::GenerateNPUProgramPass);
// USE_LITE_OP(graph_op);
......@@ -38,21 +38,27 @@ class GenerateNPUProgramPass : public SubgraphProgramPass {
std::unique_ptr<RuntimeProgram> GenProgram();
protected:
void NPUSortHelper(Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret);
// sort nodes into topological (execution) order
void SubgraphSortHelper(Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret);
// nodes2cvt: op nodes to convert
// in_vars_name: graph op's inputs var name
// out_vars_name: graph op's outputs var name
// vcted_vars:
// cvted_vars: converted var nodes
// nodes2rm: op nodes and var nodes that need to be removed
void CvtOpNodes(const std::vector<Node*>& nodes2cvt,
std::vector<std::string>* in_vars_name,
std::vector<std::string>* out_vars_name,
lite::npu::bridge::node_map_type* cvted_vars,
std::unordered_set<const Node*>* nodes2rm);
lite::npu::bridge::node_map_type* cvted_vars);
// collect input and output vars/cvted_vars;
// collect all nodes to remove
void GetIOVars(const std::vector<Node*>& nodes2cvt,
const lite::npu::bridge::node_map_type& cvted_vars,
std::unordered_set<const Node*>* nodes2rm,
std::vector<Node*>* in_vars,
std::vector<Node*>* out_vars,
lite::npu::bridge::node_map_type* in_cvted_vars,
lite::npu::bridge::node_map_type* out_cvted_vars);
void GenNPUGraphOpNode(const std::unique_ptr<SSAGraph>& graph,
int sub_id,
......
......@@ -26,17 +26,49 @@ namespace paddle {
namespace lite {
namespace profile {
template <typename dtype>
static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) {
return;
}
FILE* fp = fopen(locate.c_str(), "w");
if (fp == nullptr) {
  LOG(ERROR) << "file open failed " << locate;
} else {
  const dtype* data = tensor->data<dtype>();
  for (int i = 0; i < tensor->numel(); ++i) {
    fprintf(fp, "[%d] %f \n", i, static_cast<float>(data[i]));
  }
  fclose(fp);  // only close when fopen succeeded
}
}
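Usage note (hedged): because of the guard at the top, write_tensorfile only accepts bare file names; any locate containing '/' is silently skipped. A hypothetical call:

// Dump a float tensor to the current working directory for debugging.
write_tensorfile<float>(output_tensor, "conv1_out.txt");  // output_tensor is hypothetical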
class PrecisionProfiler {
public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {}
~PrecisionProfiler() {
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst_->kernel()->target());
auto tensor_mean = [](const Tensor* in, PrecisionType ptype) -> double {
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " "
<< PrecisionToStr(inst_->kernel()->precision());
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype,
std::string name = "inst") -> double {
if (!in->data<int8_t>()) {
return -99999;
}
double sum = 0.;
switch (ptype) {
case PRECISION(kFloat): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
}
case PRECISION(kAny): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
......@@ -44,6 +76,7 @@ class PrecisionProfiler {
}
case PRECISION(kInt8): {
auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
......@@ -51,6 +84,7 @@ class PrecisionProfiler {
}
case PRECISION(kInt32): {
auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
......@@ -70,17 +104,18 @@ class PrecisionProfiler {
std::string out_arg_name;
op->op_info()->GetOutputArgname(out_name, &out_arg_name);
auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
double mean = tensor_mean(tout, type->precision());
double mean = tensor_mean(tout, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean;
<< ", mean value: " << mean << " shape:" << tout->dims();
} else if (type->IsTensorList()) {
auto tout =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) {
double mean = tensor_mean(&t, type->precision());
double mean = tensor_mean(&t, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean;
......
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
message(STATUS "add lite kernels")
set(lite_kernel_deps type_system kernel op op_registry context tensor CACHE INTERNAL "" FORCE)
set(lite_kernel_deps type_system kernel op op_registry context tensor any CACHE INTERNAL "" FORCE)
add_subdirectory(host)
add_subdirectory(arm)
......
......@@ -4,64 +4,66 @@ endif()
message(STATUS "compile with lite ARM kernels")
lite_cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(matmul_compute_arm SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(elementwise_compute_arm SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(lrn_compute_arm SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(decode_bboxes_compute_arm SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(multiclass_nms_compute_arm SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(pad2d_compute_arm SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(prior_box_compute_arm SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(density_prior_box_compute_arm SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(negative_compute_arm SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(crop_compute_arm SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(calib_compute_arm SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(power_compute_arm SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(yolo_box_compute_arm SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(shuffle_channel_compute_arm SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(argmax_compute_arm SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(axpy_compute_arm SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(conv_transpose_compute_arm SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(gru_unit_compute_arm SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(gru_compute_arm SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(beam_search_decode_compute_arm SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(lookup_table_compute_arm SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(im2sequence_compute_arm SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(sequence_softmax_compute_arm SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(norm_compute_arm SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(interpolate_compute_arm SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(logical_compute_arm SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(less_than_arm SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(while_compute_arm SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(compare_compute_arm SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(topk_compute_arm SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(increment_compute_arm SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(write_to_array_compute_arm SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(read_from_array_compute_arm SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(beam_search_compute_arm SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(fill_constant_compute_arm SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(lod_reset_compute_arm SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(box_coder_compute_arm SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(sequence_pool_compute_arm SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(sequence_expand_compute_arm SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(reduce_max_compute_arm SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(is_empty_compute_arm SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(shape_compute_arm SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(slice_compute_arm SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(cast_compute_arm SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(squeeze_compute_arm SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(expand_compute_arm SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fc_compute_arm ARM basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_compute_arm ARM basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_compute_arm ARM basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(matmul_compute_arm ARM basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conv_compute_arm ARM basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(multiclass_nms_compute_arm ARM basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
# OCR-specific kernels, built at the "extra" compile level
add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
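# The add_kernel() calls above replace the old lite_cc_library() lines plus the
# hand-maintained cached kernel list below. A minimal sketch of what such a
# helper is assumed to do (not the actual implementation): build the library,
# skip "extra"-level kernels unless LITE_BUILD_EXTRA is ON, and append the
# target to a per-device cached list such as arm_kernels.
function(add_kernel TARGET device level)
  if((level STREQUAL "extra") AND (NOT LITE_BUILD_EXTRA))
    return()
  endif()
  lite_cc_library(${TARGET} ${ARGN})  # remaining args are SRCS ... DEPS ...
  string(TOLOWER "${device}" dev)
  set(${dev}_kernels "${${dev}_kernels};${TARGET}" CACHE INTERNAL "${dev} kernels")
endfunction()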
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
......@@ -77,71 +79,7 @@ lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm)
lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm)
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra)
lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
set(arm_kernels
fc_compute_arm
activation_compute_arm
mul_compute_arm
matmul_compute_arm
scale_compute_arm
softmax_compute_arm
conv_compute_arm
batch_norm_compute_arm
elementwise_compute_arm
lrn_compute_arm
decode_bboxes_compute_arm
multiclass_nms_compute_arm
pool_compute_arm
split_compute_arm
concat_compute_arm
pad2d_compute_arm
prior_box_compute_arm
density_prior_box_compute_arm
negative_compute_arm
crop_compute_arm
dropout_compute_arm
transpose_compute_arm
calib_compute_arm
argmax_compute_arm
axpy_compute_arm
conv_transpose_compute_arm
gru_unit_compute_arm
gru_compute_arm
beam_search_decode_compute_arm
lookup_table_compute_arm
im2sequence_compute_arm
sequence_softmax_compute_arm
norm_compute_arm
power_compute_arm
shuffle_channel_compute_arm
yolo_box_compute_arm
interpolate_compute_arm
logical_compute_arm
less_than_arm
while_compute_arm
compare_compute_arm
topk_compute_arm
increment_compute_arm
write_to_array_compute_arm
read_from_array_compute_arm
beam_search_compute_arm
fill_constant_compute_arm
lod_reset_compute_arm
box_coder_compute_arm
reduce_max_compute_arm
sequence_expand_compute_arm
sequence_pool_compute_arm
is_empty_compute_arm
shape_compute_arm
slice_compute_arm
cast_compute_arm
squeeze_compute_arm
expand_compute_arm
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
......@@ -48,13 +48,12 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
void DensityPriorBoxCompute::Run() {
auto& param = Param<operators::DensityPriorBoxParam>();
bool is_flip = param.flip;
bool is_clip = param.clip;
std::vector<float> min_size = param.min_sizes;
std::vector<float> fixed_size = param.fixed_sizes;
std::vector<float> fixed_ratio = param.fixed_ratios;
std::vector<float> density_size = param.density_sizes;
auto density_size = param.density_sizes;
std::vector<float> max_size = param.max_sizes;
std::vector<float> aspect_ratio = param.aspect_ratios;
std::vector<float> variance = param.variances_;
......
if (NOT LITE_WITH_FPGA)
return()
endif()
message("fpga : ${lite_kernel_deps}")
set(fpga_deps fpga_target_wrapper kernel_fpga)
lite_cc_library(activation_compute_fpga SRCS activation_compute.cc DEPS ${fpga_deps})
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_activation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps})
lite_cc_library(conv_compute_fpga SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_conv_fpga SRCS conv_compute_test.cc DEPS ${lite_kernel_deps} conv_compute_fpga ${fpga_deps})
lite_cc_library(elementwise_compute_fpga SRCS elementwise_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_elementwise_fpga SRCS elementwise_compute_test.cc DEPS ${lite_kernel_deps} elementwise_compute_fpga ${fpga_deps})
lite_cc_library(pooling_compute_fpga SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_pooling_compute_fpga SRCS pooling_compute_test.cc DEPS ${lite_kernel_deps} pooling_compute_fpga ${fpga_deps})
lite_cc_library(scale_compute_fpga SRCS scale_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
lite_cc_library(softmax_compute_fpga SRCS softmax_compute.cc DEPS ${fpga_deps})
add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps})
lite_cc_library(fc_compute_fpga SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_fc_compute_fpga SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_fpga ${fpga_deps})
lite_cc_library(io_copy_compute_fpga SRCS io_copy_compute.cc DEPS ${fpga_deps})
lite_cc_library(calib_compute_fpga SRCS calib_compute.cc DEPS ${fpga_deps})
lite_cc_library(layout_compute_fpga SRCS layout_compute.cc DEPS ${fpga_deps})
lite_cc_library(feed_compute_fpga SRCS feed_compute.cc DEPS ${fpga_deps})
lite_cc_library(fetch_compute_fpga SRCS fetch_compute.cc DEPS ${fpga_deps})
set (fpga_kernels
activation_compute_fpga
conv_compute_fpga
elementwise_compute_fpga
pooling_compute_fpga
scale_compute_fpga
softmax_compute_fpga
fc_compute_fpga
io_copy_compute_fpga
calib_compute_fpga
layout_compute_fpga
feed_compute_fpga
fetch_compute_fpga
)
set(fpga_kernels "${fpga_kernels}" CACHE INTERNAL "fpga kernels")
add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps})
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
message(STATUS "compile with lite host kernels")
lite_cc_library(feed_compute_host SRCS feed_compute.cc DEPS ${lite_kernel_deps})
lite_cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
lite_cc_library(reshape_compute_host SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host)
set(host_kernels
feed_compute_host
fetch_compute_host
reshape_compute_host
)
set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels")
lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
......@@ -93,3 +93,40 @@ REGISTER_LITE_KERNEL(reshape2,
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
REGISTER_LITE_KERNEL(flatten,
kHost,
kAny,
kAny,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindInput("Shape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
REGISTER_LITE_KERNEL(flatten2,
kHost,
kAny,
kAny,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindInput("Shape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("XShape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
......@@ -2,12 +2,8 @@
if(NOT LITE_WITH_NPU)
return ()
endif()
message(STATUS "compile with lite NPU kernels")
lite_cc_library(graph_compute_npu SRCS graph_compute.cc DEPS ${lite_kernel_deps} ${npu_ddk_libs})
add_kernel(graph_compute_npu NPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} ${npu_ddk_libs})
# lite_cc_test(test_graph_compute_npu SRCS graph_compute_test.cc DEPS graph_compute_npu)
set(npu_kernels graph_compute_npu)
set(npu_kernels "${npu_kernels}" CACHE INTERNAL "npu kernels")
......@@ -4,17 +4,17 @@ endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper)
lite_cc_library(fc_opencl SRCS fc_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(mul_opencl SRCS mul_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(elementwise_add_opencl SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(fusion_elementwise_add_activation_opencl
SRCS fusion_elementwise_add_activation_compute.cc
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
lite_cc_library(pool_opencl SRCS pool_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(io_copy_compute_opencl SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
lite_cc_library(relu_opencl SRCS relu_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(depthwise_conv2d_opencl SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(conv_opencl SRCS conv_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
......@@ -47,15 +47,3 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/opencl)
set(opencl_kernels
io_copy_compute_opencl
elementwise_add_opencl
fusion_elementwise_add_activation_opencl
pool_opencl
relu_opencl
mul_opencl
fc_opencl
depthwise_conv2d_opencl
conv_opencl
CACHE INTERNAL "opencl_kernels")
......@@ -10,7 +10,7 @@ endif()
# lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(mul_compute_x86 SRCS mul_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(relu_compute_x86 SRCS relu_compute.cc DEPS ${lite_kernel_deps})
lite_cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps})
add_kernel(scale_compute_x86 X86 basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_sub_op elementwise_add_op)
# lite_cc_library(softmax_compute_x86 SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
# lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} )
......@@ -31,23 +31,3 @@ lite_cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps}
# lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
# lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
# lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
set(x86_kernels
# activation_compute_x86
# elementwise_compute_x86
# mean_compute_x86
# fill_constant_compute_x86
# mul_compute_x86
# relu_compute_x86
# fc_compute_x86
scale_compute_x86
# softmax_compute_x86
# dropout_compute_x86
# concat_compute_x86
# conv_compute_x86
# pool_compute_x86
# batch_norm_compute_x86
# uniform_random_compute_x86
# sgd_compute_x86
CACHE INTERNAL "x86 kernels")
......@@ -30,12 +30,14 @@ namespace bridge {
node_map_type BatchNormConverter(
const std::shared_ptr<lite::OpLite> batch_norm_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting batchnorm...";
lite::Scope* scope = batch_norm_op->scope();
const lite::OpInfo* op_info = batch_norm_op->op_info();
auto scope = batch_norm_op->scope();
auto op_info = batch_norm_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::BatchNorm> output_node =
std::make_shared<ge::op::BatchNorm>(UniqueName("batch_norm"));
std::shared_ptr<ge::op::BatchNorm> batch_norm_node =
std::make_shared<ge::op::BatchNorm>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto scale_var_name = op_info->Input("Scale").front();
......@@ -68,21 +70,21 @@ node_map_type BatchNormConverter(
int npu_mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1
bool npu_use_global_stats = op_info->GetAttr<bool>("use_global_stats");
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_input_scale(*npu_scale);
output_node->set_input_b(*npu_bias);
output_node->set_input_mean(*npu_mean);
output_node->set_input_variance(*npu_variance);
output_node->set_attr_momentum(npu_momentum);
output_node->set_attr_epsilon(npu_epsilon);
output_node->set_attr_mode(npu_mode);
output_node->set_attr_use_global_stats(npu_use_global_stats);
batch_norm_node->set_input_x(*inputs_map.at(x_var_name));
batch_norm_node->set_input_scale(*npu_scale);
batch_norm_node->set_input_b(*npu_bias);
batch_norm_node->set_input_mean(*npu_mean);
batch_norm_node->set_input_variance(*npu_variance);
batch_norm_node->set_attr_momentum(npu_momentum);
batch_norm_node->set_attr_epsilon(npu_epsilon);
batch_norm_node->set_attr_mode(npu_mode);
batch_norm_node->set_attr_use_global_stats(npu_use_global_stats);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(batch_norm_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Y").front()] = output_node;
outputs_map[op_info->Output("Y").front()] = batch_norm_node;
return outputs_map;
}
......
......@@ -30,11 +30,14 @@ namespace bridge {
node_map_type ElementwiseConverter(
const std::shared_ptr<lite::OpLite> elementwise_op,
const node_map_type& inputs_map) {
auto scope = elementwise_op->scope();
auto op_info = elementwise_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "converting elementwise...";
lite::Scope* scope = elementwise_op->scope();
const lite::OpInfo* op_info = elementwise_op->op_info();
std::shared_ptr<ge::op::Eltwise> output_node =
std::make_shared<ge::op::Eltwise>(UniqueName("elementwise"));
std::shared_ptr<ge::op::Eltwise> elementwise_node =
std::make_shared<ge::op::Eltwise>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto y_var_name = op_info->Input("Y").front();
......@@ -43,27 +46,27 @@ node_map_type ElementwiseConverter(
<< "npu elementwise only support inputs with same size";
CHECK(inputs_map.find(x_var_name) != inputs_map.end());
output_node->set_input_x1(*inputs_map.at(x_var_name));
elementwise_node->set_input_x1(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name));
if (inputs_map.find(y_var_name) != inputs_map.end()) {
output_node->set_input_x2(*inputs_map.at(y_var_name));
elementwise_node->set_input_x2(*inputs_map.at(y_var_name));
OpList::Global().add(inputs_map.at(y_var_name));
} else {
auto consty = std::make_shared<ge::op::Const>(y_var_name);
auto* y = scope->FindVar(y_var_name)->GetMutable<Tensor>();
consty->set_attr_value(CvtFromLiteTensor(y));
output_node->set_input_x2(*consty);
elementwise_node->set_input_x2(*consty);
OpList::Global().add(consty);
}
OpList::Global().add(output_node);
OpList::Global().add(elementwise_node);
// PaddleLite only supports elementwise sum here, so the NPU eltwise mode is fixed to 1 (sum)
output_node->set_attr_mode(1);
elementwise_node->set_attr_mode(1);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = elementwise_node;
return outputs_map;
}
......
......@@ -29,12 +29,14 @@ namespace bridge {
node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting pool...";
lite::Scope* scope = pool_op->scope();
const lite::OpInfo* op_info = pool_op->op_info();
auto scope = pool_op->scope();
auto op_info = pool_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Pooling> output_node =
std::make_shared<ge::op::Pooling>(UniqueName("pool"));
std::shared_ptr<ge::op::Pooling> pool_node =
std::make_shared<ge::op::Pooling>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
int npu_mode = 0;
......@@ -61,21 +63,21 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
npu_ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
}
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_attr_mode(npu_mode);
output_node->set_attr_pad_mode(0);
output_node->set_attr_global_pooling(npu_global_pooling);
output_node->set_attr_window(npu_window);
output_node->set_attr_pad(npu_pad);
output_node->set_attr_stride(npu_stride);
output_node->set_attr_ceil_mode(npu_ceil_mode);
pool_node->set_input_x(*inputs_map.at(x_var_name));
pool_node->set_attr_mode(npu_mode);
pool_node->set_attr_pad_mode(0);
pool_node->set_attr_global_pooling(npu_global_pooling);
pool_node->set_attr_window(npu_window);
pool_node->set_attr_pad(npu_pad);
pool_node->set_attr_stride(npu_stride);
pool_node->set_attr_ceil_mode(npu_ceil_mode);
// output_node->set_attr_data_mode(npu_data_mode);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(pool_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = pool_node;
return outputs_map;
}
......
......@@ -30,22 +30,24 @@ namespace bridge {
node_map_type ShuffleChannelConverter(
const std::shared_ptr<lite::OpLite> shuffle_channel_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting shuffle_channel...";
lite::Scope* scope = shuffle_channel_op->scope();
const lite::OpInfo* op_info = shuffle_channel_op->op_info();
std::shared_ptr<ge::op::ShuffleChannel> output_node =
std::make_shared<ge::op::ShuffleChannel>(UniqueName("shuffle_channel"));
auto scope = shuffle_channel_op->scope();
auto op_info = shuffle_channel_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node =
std::make_shared<ge::op::ShuffleChannel>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_attr_group(op_info->GetAttr<int>("group"));
shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name));
shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group"));
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(shuffle_channel_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = shuffle_channel_node;
return outputs_map;
}
......
......@@ -29,12 +29,14 @@ namespace bridge {
node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting softmax...";
lite::Scope* scope = softmax_op->scope();
const lite::OpInfo* op_info = softmax_op->op_info();
auto scope = softmax_op->scope();
auto op_info = softmax_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Softmax> output_node =
std::make_shared<ge::op::Softmax>(UniqueName("softmax"));
std::shared_ptr<ge::op::Softmax> softmax_node =
std::make_shared<ge::op::Softmax>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto x_dims = scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims();
......@@ -46,14 +48,14 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
}
CHECK(inputs_map.count(x_var_name));
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_attr_axis(axis);
softmax_node->set_input_x(*inputs_map.at(x_var_name));
softmax_node->set_attr_axis(axis);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(softmax_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = softmax_node;
return outputs_map;
}
......
......@@ -30,19 +30,21 @@ namespace bridge {
node_map_type TransposeConverter(
const std::shared_ptr<lite::OpLite> transpose_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting transpose...";
lite::Scope* scope = transpose_op->scope();
const lite::OpInfo* op_info = transpose_op->op_info();
auto scope = transpose_op->scope();
auto op_info = transpose_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Permute> output_node =
std::make_shared<ge::op::Permute>(UniqueName("transpose"));
std::shared_ptr<ge::op::Permute> transpose_node =
std::make_shared<ge::op::Permute>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
// PaddleLite's transpose op has no such input, but the NPU op requires
// w to be set even though its value is never used
auto w_var_name = "transpose_w";
auto w_var_name = unique_op_type + "/w";
auto* w = scope->Var(w_var_name)->GetMutable<Tensor>();
w->Resize(scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims());
w->Resize({1});
auto* w_data = w->mutable_data<float>();
for (int i = 0; i < w->numel(); i++) {
w_data[i] = 1.f;
......@@ -55,15 +57,15 @@ node_map_type TransposeConverter(
auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end());
CHECK(inputs_map.count(x_var_name));
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_input_w(*npu_w);
output_node->set_attr_order(npu_axis);
transpose_node->set_input_x(*inputs_map.at(x_var_name));
transpose_node->set_input_w(*npu_w);
transpose_node->set_attr_order(npu_axis);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(transpose_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = transpose_node;
return outputs_map;
}
......
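// Note on the w workaround in the transpose bridge above: PaddleLite's
// transpose op carries no W input, but the NPU Permute op requires one, so
// the converter materializes a dummy tensor -- now resized to {1} instead of
// duplicating the full input shape -- and feeds it in; only attr_order
// actually affects the result.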
set(op_DEPS tensor op op_params)
lite_cc_library(conv_op SRCS conv_op.cc DEPS ${op_DEPS})
lite_cc_library(pool_op SRCS pool_op.cc DEPS ${op_DEPS})
lite_cc_library(fc_op SRCS fc_op.cc DEPS ${op_DEPS})
lite_cc_library(relu_op SRCS relu_op.cc DEPS ${op_DEPS})
lite_cc_library(mul_op SRCS mul_op.cc DEPS ${op_DEPS})
lite_cc_library(matmul_op SRCS matmul_op.cc DEPS ${op_DEPS})
lite_cc_library(scale_op SRCS scale_op.cc DEPS ${op_DEPS})
lite_cc_library(softmax_op SRCS softmax_op.cc DEPS ${op_DEPS})
lite_cc_library(reshape_op SRCS reshape_op.cc DEPS ${op_DEPS} )
lite_cc_library(batch_norm_op SRCS batch_norm_op.cc DEPS ${op_DEPS})
lite_cc_library(feed_op SRCS feed_op.cc DEPS ${op_DEPS})
lite_cc_library(fetch_op SRCS fetch_op.cc DEPS ${op_DEPS})
lite_cc_library(io_copy_op SRCS io_copy_op.cc DEPS ${op_DEPS})
lite_cc_library(io_copy_once_op SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS})
lite_cc_library(activation_ops SRCS activation_ops.cc DEPS ${op_DEPS})
lite_cc_library(elementwise_ops SRCS elementwise_ops.cc DEPS ${op_DEPS})
lite_cc_library(lrn_op_lite SRCS lrn_op.cc DEPS ${op_DEPS})
lite_cc_library(decode_bboxes_op_lite SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
lite_cc_library(box_coder_op_lite SRCS box_coder_op.cc DEPS ${op_DEPS})
lite_cc_library(multiclass_nms_op_lite SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
lite_cc_library(fusion_elementwise_activation_ops SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS})
lite_cc_library(mean_op SRCS mean_op.cc DEPS ${op_DEPS})
lite_cc_library(fill_constant_op SRCS fill_constant_op.cc DEPS ${op_DEPS})
lite_cc_library(sgd_op SRCS sgd_op.cc DEPS ${op_DEPS})
lite_cc_library(uniform_random_op SRCS uniform_random_op.cc DEPS ${op_DEPS})
lite_cc_library(power_op SRCS power_op.cc DEPS ${op_DEPS})
lite_cc_library(shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${op_DEPS})
lite_cc_library(yolo_box_op SRCS yolo_box_op.cc DEPS ${op_DEPS})
lite_cc_library(interpolate_op SRCS interpolate_op.cc DEPS ${op_DEPS})
lite_cc_library(argmax_op SRCS argmax_op.cc DEPS ${op_DEPS})
lite_cc_library(axpy_op SRCS axpy_op.cc DEPS ${op_DEPS})
lite_cc_library(gru_unit_op SRCS gru_unit_op.cc DEPS ${op_DEPS})
lite_cc_library(gru_op SRCS gru_op.cc DEPS ${op_DEPS})
lite_cc_library(layout_op SRCS layout_op.cc DEPS ${op_DEPS})
lite_cc_library(layout_once_op SRCS layout_once_op.cc DEPS ${op_DEPS})
lite_cc_library(while_op SRCS while_op.cc DEPS ${op_DEPS})
lite_cc_library(lookup_table_op SRCS lookup_table_op.cc DEPS ${op_DEPS})
lite_cc_library(beam_search_decode_op SRCS beam_search_decode_op.cc DEPS ${op_DEPS})
lite_cc_library(prior_box_op SRCS prior_box_op.cc DEPS ${op_DEPS})
lite_cc_library(density_prior_box_op SRCS density_prior_box_op.cc DEPS ${op_DEPS})
set(op_DEPS tensor op op_params scope memory)
lite_cc_library(op_params SRCS op_params.cc DEPS tensor any)
lite_cc_library(dropout_op SRCS dropout_op.cc DEPS ${op_DEPS})
lite_cc_library(concat_op SRCS concat_op.cc DEPS ${op_DEPS})
lite_cc_library(pad2d_op SRCS pad2d_op.cc DEPS ${op_DEPS})
lite_cc_library(negative_op SRCS negative_op.cc DEPS ${op_DEPS})
lite_cc_library(crop_op SRCS crop_op.cc DEPS ${op_DEPS})
lite_cc_library(calib_op SRCS calib_op.cc DEPS ${op_DEPS})
lite_cc_library(calib_once_op SRCS calib_once_op.cc DEPS ${op_DEPS})
lite_cc_library(split_op SRCS split_op.cc DEPS ${op_DEPS})
lite_cc_library(transpose_op SRCS transpose_op.cc DEPS ${op_DEPS})
lite_cc_library(fake_quant SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
lite_cc_library(fake_dequant SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS})
lite_cc_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS ${op_DEPS})
lite_cc_library(im2sequence_op SRCS im2sequence_op.cc DEPS ${op_DEPS})
lite_cc_library(sequence_softmax_op SRCS sequence_softmax_op.cc DEPS ${op_DEPS})
lite_cc_library(norm_op SRCS norm_op.cc DEPS ${op_DEPS})
lite_cc_library(graph_op SRCS graph_op.cc DEPS ${op_DEPS})
lite_cc_library(topk_op SRCS topk_op.cc DEPS ${op_DEPS})
lite_cc_library(increment_op SRCS increment_op.cc DEPS ${op_DEPS})
lite_cc_library(write_to_array_op SRCS write_to_array_op.cc DEPS ${op_DEPS})
lite_cc_library(graph_op_lite SRCS graph_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_xor SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_and SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_or SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_not SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(less_than SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(not_equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(less_equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(greater_than SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(greater_equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(read_from_array_op SRCS read_from_array_op.cc DEPS ${op_DEPS})
lite_cc_library(beam_search_op SRCS beam_search_op.cc DEPS ${op_DEPS})
lite_cc_library(sequence_pool_op_lite SRCS sequence_pool_op.cc DEPS ${op_DEPS})
lite_cc_library(sequence_expand_op_lite SRCS sequence_expand_op.cc DEPS ${op_DEPS})
lite_cc_library(reduce_max_op_lite SRCS reduce_max_op.cc DEPS ${op_DEPS})
lite_cc_library(lod_reset_op SRCS lod_reset_op.cc DEPS ${op_DEPS})
lite_cc_library(is_empty SRCS is_empty_op.cc DEPS ${op_DEPS})
lite_cc_library(shape_op_lite SRCS shape_op.cc DEPS ${op_DEPS})
lite_cc_library(cast_op_lite SRCS cast_op.cc DEPS ${op_DEPS})
lite_cc_library(slice_op_lite SRCS slice_op.cc DEPS ${op_DEPS})
lite_cc_library(squeeze_op_lite SRCS squeeze_op.cc DEPS ${op_DEPS})
lite_cc_library(expand_op_lite SRCS expand_op.cc DEPS ${op_DEPS})
add_operator(conv_op basic SRCS conv_op.cc DEPS ${op_DEPS})
add_operator(pool_op basic SRCS pool_op.cc DEPS ${op_DEPS})
add_operator(fc_op basic SRCS fc_op.cc DEPS ${op_DEPS})
add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS})
add_operator(mul_op basic SRCS mul_op.cc DEPS ${op_DEPS})
add_operator(matmul_op basic SRCS matmul_op.cc DEPS ${op_DEPS})
add_operator(scale_op basic SRCS scale_op.cc DEPS ${op_DEPS})
add_operator(softmax_op basic SRCS softmax_op.cc DEPS ${op_DEPS})
add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} )
add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS})
add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS})
add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS})
add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS})
add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS})
add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS})
add_operator(lrn_op_lite basic SRCS lrn_op.cc DEPS ${op_DEPS})
add_operator(decode_bboxes_op_lite basic SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS})
add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS})
add_operator(mean_op basic SRCS mean_op.cc DEPS ${op_DEPS})
add_operator(fill_constant_op basic SRCS fill_constant_op.cc DEPS ${op_DEPS})
#add_operator(sgd_op basic SRCS sgd_op.cc DEPS ${op_DEPS})
add_operator(uniform_random_op basic SRCS uniform_random_op.cc DEPS ${op_DEPS})
add_operator(power_op basic SRCS power_op.cc DEPS ${op_DEPS})
add_operator(shuffle_channel_op basic SRCS shuffle_channel_op.cc DEPS ${op_DEPS})
add_operator(yolo_box_op basic SRCS yolo_box_op.cc DEPS ${op_DEPS})
add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS})
add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS})
add_operator(axpy_op basic SRCS axpy_op.cc DEPS ${op_DEPS})
add_operator(gru_unit_op basic SRCS gru_unit_op.cc DEPS ${op_DEPS})
add_operator(gru_op basic SRCS gru_op.cc DEPS ${op_DEPS})
add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
add_operator(layout_once_op basic SRCS layout_once_op.cc DEPS ${op_DEPS})
add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS})
add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS})
add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS})
add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS})
add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS})
add_operator(negative_op basic SRCS negative_op.cc DEPS ${op_DEPS})
add_operator(crop_op basic SRCS crop_op.cc DEPS ${op_DEPS})
add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS})
add_operator(calib_once_op basic SRCS calib_once_op.cc DEPS ${op_DEPS})
add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS})
add_operator(transpose_op basic SRCS transpose_op.cc DEPS ${op_DEPS})
add_operator(fake_quant basic SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
add_operator(fake_dequant basic SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS})
add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS})
add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS})
add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS})
add_operator(reduce_max_op_lite basic SRCS reduce_max_op.cc DEPS ${op_DEPS})
add_operator(norm_op basic SRCS norm_op.cc DEPS ${op_DEPS})
add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS})
add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS})
add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS})
# OCR-specific operators, built at the "extra" compile level
add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS})
add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS})
add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS})
add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS})
add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(logical_and extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(logical_or extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(logical_not extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(less_than extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(not_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(less_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(greater_than extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(greater_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(read_from_array_op extra SRCS read_from_array_op.cc DEPS ${op_DEPS})
add_operator(beam_search_op extra SRCS beam_search_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool_op_lite extra SRCS sequence_pool_op.cc DEPS ${op_DEPS})
add_operator(lod_reset_op extra SRCS lod_reset_op.cc DEPS ${op_DEPS})
add_operator(is_empty extra SRCS is_empty_op.cc DEPS ${op_DEPS})
add_operator(cast_op_lite extra SRCS cast_op.cc DEPS ${op_DEPS})
add_operator(slice_op_lite extra SRCS slice_op.cc DEPS ${op_DEPS})
add_operator(write_to_array_op extra SRCS write_to_array_op.cc DEPS ${op_DEPS})
add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS})
add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS})
add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS})
set(ops
conv_op
pool_op
fc_op
relu_op
mul_op
matmul_op
scale_op
softmax_op
reshape_op
batch_norm_op
feed_op
fetch_op
gru_unit_op
gru_op
beam_search_decode_op
lookup_table_op
io_copy_op
io_copy_once_op
elementwise_ops
fusion_elementwise_activation_ops
lrn_op_lite
decode_bboxes_op_lite
multiclass_nms_op_lite
decode_bboxes_op_lite
box_coder_op_lite
multiclass_nms_op_lite
mean_op
fill_constant_op
activation_ops
dropout_op
concat_op
pad2d_op
crop_op
prior_box_op
density_prior_box_op
negative_op
calib_op
calib_once_op
split_op
transpose_op
fake_quant
fake_dequant
sgd_op
uniform_random_op
power_op
yolo_box_op
shuffle_channel_op
argmax_op
axpy_op
conv_transpose_op
im2sequence_op
sequence_softmax_op
norm_op
layout_op
layout_once_op
interpolate_op
logical_xor
logical_and
logical_or
logical_not
equal
not_equal
less_than
while_op
less_equal
greater_than
greater_equal
graph_op
topk_op
increment_op
write_to_array_op
read_from_array_op
beam_search_op
sequence_pool_op_lite
sequence_expand_op_lite
reduce_max_op_lite
lod_reset_op
is_empty
shape_op_lite
cast_op_lite
slice_op_lite
squeeze_op_lite
expand_op_lite
CACHE INTERNAL "ops lite")
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
......@@ -184,7 +100,7 @@ if (NOT LITE_WITH_X86)
lite_cc_test(test_softmax_op SRCS softmax_op_test.cc DEPS softmax_op memory)
#lite_cc_test(test_reshape_op SRCS reshape_op_test.cc DEPS reshape_op memory)
lite_cc_test(test_batch_norm_op SRCS batch_norm_op_test.cc DEPS batch_norm_op memory)
lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory)
lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory scope)
lite_cc_test(test_calib_op SRCS calib_op_test.cc DEPS calib_op memory ARM_DEPS calib_compute_arm)
lite_cc_test(test_fusion_elementwise_activation_ops
SRCS fusion_elementwise_activation_ops_test.cc
......
......@@ -85,7 +85,9 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc,
}
}
}
param_.fuse_relu = op_desc.GetAttr<bool>("fuse_relu");
if (op_desc.HasAttr("fuse_relu")) {
param_.fuse_relu = op_desc.GetAttr<bool>("fuse_relu");
}
return true;
}
......
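// The HasAttr() guard above is the general pattern for optional attributes:
// older serialized models may omit "fuse_relu", and calling GetAttr() on a
// missing attribute aborts. A hedged sketch of the same idea as a reusable
// helper (GetAttrOr is hypothetical, not part of the lite API):
template <typename T>
T GetAttrOr(const cpp::OpDesc &desc, const std::string &name, T fallback) {
  return desc.HasAttr(name) ? desc.GetAttr<T>(name) : fallback;
}
// usage: param_.fuse_relu = GetAttrOr(op_desc, "fuse_relu", false);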
......@@ -41,15 +41,29 @@ bool DensityPriorBoxOpLite::AttachImpl(const cpp::OpDesc& opdesc,
param_.boxes = scope->FindVar(boxes)->GetMutable<lite::Tensor>();
param_.variances = scope->FindVar(variances)->GetMutable<lite::Tensor>();
param_.flip = opdesc.GetAttr<bool>("flip");
param_.clip = opdesc.GetAttr<bool>("clip");
param_.min_sizes = opdesc.GetAttr<std::vector<float>>("min_sizes");
param_.fixed_sizes = opdesc.GetAttr<std::vector<float>>("fixed_sizes");
param_.fixed_ratios = opdesc.GetAttr<std::vector<float>>("fixed_ratios");
param_.density_sizes = opdesc.GetAttr<std::vector<float>>("density_sizes");
param_.max_sizes = opdesc.GetAttr<std::vector<float>>("max_sizes");
param_.aspect_ratios = opdesc.GetAttr<std::vector<float>>("aspect_ratios");
param_.variances_ = opdesc.GetAttr<std::vector<float>>("variances");
if (opdesc.HasAttr("aspect_ratios")) {
param_.aspect_ratios = opdesc.GetAttr<std::vector<float>>("aspect_ratios");
}
if (opdesc.HasAttr("max_sizes")) {
param_.max_sizes = opdesc.GetAttr<std::vector<float>>("max_sizes");
}
if (opdesc.HasAttr("density_sizes")) {
param_.density_sizes = opdesc.GetAttr<std::vector<int>>("density_sizes");
}
if (opdesc.HasAttr("densities")) {
param_.density_sizes = opdesc.GetAttr<std::vector<int>>("densities");
}
if (opdesc.HasAttr("min_sizes")) {
param_.min_sizes = opdesc.GetAttr<std::vector<float>>("min_sizes");
}
if (opdesc.HasAttr("flip")) {
param_.flip = opdesc.GetAttr<bool>("flip");
}
if (opdesc.HasAttr("img_w")) {
param_.img_w = opdesc.GetAttr<int>("img_w");
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/fake_quantize_range_abs_max.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(fake_quantize_range_abs_max,
paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
public:
FakeQuantizeRangeMaxAbsOpLite() {}
explicit FakeQuantizeRangeMaxAbsOpLite(const std::string &type)
: OpLite(type) {}
bool CheckShape() const override { return true; }
bool InferShape() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front();
auto in_scale = op_desc.Input("InScale").front();
auto out = op_desc.Output("Out").front();
auto out_scale = op_desc.Output("OutScale").front();
param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
param_.out_scale = scope->FindVar(out_scale)->GetMutable<lite::Tensor>();
param_.bit_length = op_desc.GetAttr<int>("bit_length");
return true;
}
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override {
return "fake_quantize_range_max_abs";
}
private:
mutable FakeQuantizeMovingAvgMaxAbsParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/flatten_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool FlattenOp::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.output);
return true;
}
bool FlattenOp::InferShape() const {
auto x_dims = param_.x->dims();
auto out_lod = param_.output->mutable_lod();
*out_lod = param_.x->lod();
int64_t outer = 1, inner = 1;
for (int i = 0; i < x_dims.size(); ++i) {
if (i < axis_) {
outer *= x_dims[i];
} else {
inner *= x_dims[i];
}
}
std::vector<int64_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
param_.output->Resize(out_shape);
return true;
}
bool FlattenOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
auto x_var = scope->FindVar(opdesc.Input("X").front());
auto output_var = scope->FindVar(opdesc.Output("Out").front());
CHECK(x_var);
CHECK(output_var);
param_.x = const_cast<lite::Tensor *>(&(x_var->Get<lite::Tensor>()));
param_.output = output_var->GetMutable<lite::Tensor>();
axis_ = opdesc.GetAttr<int>("axis");
param_.inplace = false;
CHECK(param_.x) << "Input(X) of FlattenOp should not be null.";
CHECK(param_.output) << "Output(Out) of FlattenOp should not be null.";
CHECK_GE(axis_, 0) << "Flatten op axis should be >= 0.";
return true;
}
bool Flatten2Op::CheckShape() const {
FlattenOp::CheckShape();
CHECK_OR_FALSE(param_.xshape);
return true;
}
bool Flatten2Op::InferShape() const {
FlattenOp::InferShape();
auto x_dims = param_.x->dims();
std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1, 0);
for (size_t i = 0; i < x_dims.size(); i++) {
xshape_dims[i + 1] = x_dims[i];
}
param_.xshape->Resize(DDim(xshape_dims));
return true;
}
bool Flatten2Op::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
FlattenOp::AttachImpl(opdesc, scope);
auto xshape_var = scope->FindVar(opdesc.Output("XShape").front());
CHECK(xshape_var);
param_.xshape = xshape_var->GetMutable<lite::Tensor>();
CHECK(param_.xshape) << "Output(XShape) of FlattenOp should not be null.";
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(flatten, paddle::lite::operators::FlattenOp);
REGISTER_LITE_OP(flatten2, paddle::lite::operators::Flatten2Op);
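// Worked example of the shape rule implemented above, assuming axis_ = 2 and
// input dims [2, 3, 4, 5]: outer = 2 * 3 = 6, inner = 4 * 5 = 20, so Out is
// resized to [6, 20]; Flatten2 additionally records XShape = [0, 2, 3, 4, 5].
// A standalone sketch of the same computation, independent of lite types:
#include <cstdint>
#include <vector>
inline std::vector<int64_t> FlattenShape(const std::vector<int64_t> &dims,
                                         int axis) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < axis ? outer : inner) *= dims[i];  // dims before axis go to outer
  }
  return {outer, inner};
}
// FlattenShape({2, 3, 4, 5}, 2) returns {6, 20}.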
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class FlattenOp : public OpLite {
public:
FlattenOp() {}
explicit FlattenOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "flatten"; }
protected:
mutable ReshapeParam param_;
int axis_;
};
class Flatten2Op : public FlattenOp {
public:
Flatten2Op() : FlattenOp() {}
explicit Flatten2Op(const std::string &op_type) : FlattenOp(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "flatten2"; }
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -521,7 +521,7 @@ struct PriorBoxParam {
struct DensityPriorBoxParam : public PriorBoxParam {
std::vector<float> fixed_sizes;
std::vector<float> fixed_ratios;
std::vector<float> density_sizes;
std::vector<int> density_sizes;
};
/// ----------------------- GRU operators -----------------------
struct GRUParam {
......
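// Cross-reference: this float -> int change for density_sizes is what drives
// the edits elsewhere in this diff -- DensityPriorBoxOpLite::AttachImpl now
// reads the attribute as std::vector<int> (under either the "density_sizes"
// or "densities" spelling), and the ARM kernel switched to
// `auto density_size = param.density_sizes;` so it follows the new type.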
......@@ -40,12 +40,14 @@ bool PriorBoxOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
  param_.boxes = scope->FindVar(boxes)->GetMutable<lite::Tensor>();
  param_.variances = scope->FindVar(variances)->GetMutable<lite::Tensor>();
  param_.flip = opdesc.GetAttr<bool>("flip");
  param_.clip = opdesc.GetAttr<bool>("clip");
  param_.min_sizes = opdesc.GetAttr<std::vector<float>>("min_sizes");
  param_.max_sizes = opdesc.GetAttr<std::vector<float>>("max_sizes");
  param_.aspect_ratios = opdesc.GetAttr<std::vector<float>>("aspect_ratios");
  param_.variances_ = opdesc.GetAttr<std::vector<float>>("variances");
  if (opdesc.HasAttr("flip")) {
    param_.flip = opdesc.GetAttr<bool>("flip");
  }
  if (opdesc.HasAttr("img_w")) {
    param_.img_w = opdesc.GetAttr<int>("img_w");
  }
......
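The HasAttr guards added in this hunk make optional attributes safe to read: when a model's op descriptor omits an attribute, the parameter keeps its default instead of the lookup failing. A self-contained sketch of the pattern, with a simplified map-backed descriptor standing in for cpp::OpDesc:

#include <iostream>
#include <map>
#include <string>

// Simplified stand-in for an op descriptor's attribute store.
struct FakeOpDesc {
  std::map<std::string, int> int_attrs;
  bool HasAttr(const std::string &name) const {
    return int_attrs.count(name) > 0;
  }
  int GetAttr(const std::string &name) const { return int_attrs.at(name); }
};

int main() {
  FakeOpDesc desc;  // "img_w" deliberately left unset
  int img_w = 0;    // the default survives when the attribute is missing
  if (desc.HasAttr("img_w")) {
    img_w = desc.GetAttr("img_w");
  }
  std::cout << "img_w = " << img_w << std::endl;  // prints "img_w = 0"
  return 0;
}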
......@@ -21,7 +21,13 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
#lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
if(LITE_BUILD_EXTRA)
lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
lite_cc_test(test_sgemm SRCS test_sgemm.cc DEPS ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......@@ -31,9 +37,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
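The tests gathered under the LITE_BUILD_EXTRA guard above are compiled only when the extra operator and kernel set is enabled, which keeps the default test build small; passing -DLITE_BUILD_EXTRA=ON to cmake restores them.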
......@@ -171,9 +171,9 @@ void test_fc(Place place) {
DDim bdim{{bflag ? n : 0}};
std::unique_ptr<arena::TestCase> tester(
new FcOPTest(place, "def", dim_in, wdim, bdim, 1));
#ifdef WITH_ARM_LITE
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 1);
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1);
#endif
arena::Arena arena(std::move(tester), place, 6e-5);
if (!arena.TestPrecision()) {
......
......@@ -344,7 +344,7 @@ void test_gru_unit(Place place) {
place, "def", 1 /* sigomoid */, 2 /* tanh */, false, dims));
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 1);
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1);
#endif
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
......
......@@ -75,7 +75,7 @@ void prior_box_compute_ref(const lite::Tensor* input,
const std::vector<float>& min_size_,
const std::vector<float>& fixed_size_,
const std::vector<float>& fixed_ratio_,
const std::vector<float>& density_size_,
const std::vector<int>& density_size_,
const std::vector<float>& max_size_,
const std::vector<float>& aspect_ratio_,
const std::vector<float>& variance_,
......@@ -352,7 +352,7 @@ class DensityPriorBoxComputeTester : public arena::TestCase {
std::vector<float> min_size_;
std::vector<float> fixed_size_;
std::vector<float> fixed_ratio_;
std::vector<float> density_size_;
std::vector<int> density_size_;
std::vector<float> max_size_;
std::vector<float> aspect_ratio_;
std::vector<float> variance_;
......@@ -375,7 +375,7 @@ class DensityPriorBoxComputeTester : public arena::TestCase {
const std::vector<float>& min_size,
const std::vector<float>& fixed_size,
const std::vector<float>& fixed_ratio,
const std::vector<float>& density_size,
const std::vector<int>& density_size,
const std::vector<float>& max_size,
const std::vector<float>& aspect_ratio,
const std::vector<float>& variance,
......@@ -561,7 +561,7 @@ class PriorBoxComputeTester : public arena::TestCase {
min_size_,
std::vector<float>(),
std::vector<float>(),
std::vector<float>(),
std::vector<int>(),
max_size_,
aspect_ratio_,
variance_,
......@@ -621,7 +621,7 @@ void test_density_prior_box(Place place) {
std::vector<float> variance{0.1f, 0.1f, 0.2f, 0.2f};
std::vector<float> fixed_size{60, 30};
std::vector<float> fixed_ratio{1., 2.};
std::vector<float> density_size{1., 3.};
std::vector<int> density_size{1, 3};
bool flip = true;
bool clip = false;
float step_h = 0;
......
......@@ -5,18 +5,22 @@ if [ $# -lt 2 ];
then
echo "Input error"
echo "USAGE:"
echo " sh benchmark.sh benchmark_bin_path test_models_dir"
echo " sh benchmark.sh benchmark_bin_path test_models_dir arm_bi"
echo " sh benchmark.sh benchmark_bin_path benchmark_models_path"
echo " sh benchmark.sh benchmark_bin_path benchmark_models_path is_run_model_optimize"
exit
fi
BENCHMARK_BIN=$1
MODELS_DIR=$2
ARM_BI=$3
ANDROID_DIR=/data/local/tmp
RESULT_FILENAME="result.txt"
WARMUP=10
REPEATS=30
BENCHMARK_BIN=$1
MODELS_DIR=$2
IS_RUN_MODEL_OPTIMIZE=false
if [ $# -gt 2 ];
then
IS_RUN_MODEL_OPTIMIZE=$3
fi
adb push $BENCHMARK_BIN $ANDROID_DIR/benchmark_bin
adb shell chmod 777 $ANDROID_DIR/benchmark_bin
......@@ -25,11 +29,11 @@ adb push $MODELS_DIR $ANDROID_DIR
adb shell "echo PaddleLite Benchmark > $ANDROID_DIR/$RESULT_FILENAME"
for threads in 1 2 4
do
adb shell "echo ABI=$ARM_BI Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
adb shell "echo Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
for model_name in `ls $MODELS_DIR`
do
echo $model_name
adb shell "$ANDROID_DIR/benchmark_bin --model_dir=$ANDROID_DIR/${MODELS_DIR##*/}/$model_name --warmup=$WARMUP --repeats=$REPEATS --threads=$threads --result_filename=$ANDROID_DIR/$RESULT_FILENAME"
adb shell "$ANDROID_DIR/benchmark_bin --model_dir=$ANDROID_DIR/${MODELS_DIR##*/}/$model_name --warmup=$WARMUP --repeats=$REPEATS --threads=$threads --result_filename=$ANDROID_DIR/$RESULT_FILENAME --run_model_optimize=$IS_RUN_MODEL_OPTIMIZE"
done
adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
done
......
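With this change the script's third argument toggles model optimization instead of recording the ABI, and it defaults to false when omitted; a full run would look like: sh benchmark.sh ./benchmark_bin ./benchmark_models true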
#!/bin/bash
set -ex
readonly CMAKE_COMMON_OPTIONS="-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
......@@ -31,6 +32,10 @@ function make_tiny_publish_so {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
if [ -d $build_dir ]
then
rm -rf $build_dir
fi
mkdir -p $build_dir
cd $build_dir
......@@ -55,6 +60,10 @@ function make_full_publish_so {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
if [ -d $build_dir ]
then
rm -rf $build_dir
fi
mkdir -p $build_dir
cd $build_dir
......@@ -78,6 +87,10 @@ function make_all_tests {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
if [ -d $build_dir ]
then
rm -rf $build_dir
fi
mkdir -p $build_dir
cd $build_dir
......
#!/bin/bash
os=armlinux
abi=armv8
lang=gcc
if [ x$1 != x ]; then
abi=$1
fi
if [ x$2 != x ]; then
lang=$2
fi
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
mkdir -p $build_dir
cd $build_dir
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
cmake .. \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_LITE=ON \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_ARM=ON \
-DWITH_ARM_DOTPROD=ON \
-DLITE_WITH_OPENMP=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=ON \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make -j4 publish_inference
cd -
#!/bin/bash
set -e
build_dir=build.ios.armv7.arm64
mkdir -p ${build_dir}
......@@ -15,11 +16,15 @@ cmake .. \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_ARM=ON \
-DLITE_WITH_OPENMP=ON \
-DWITH_TESTING=OFF \
-DLITE_WITH_JAVA=OFF \
-DLITE_SHUTDOWN_LOG=ON \
-DLITE_ON_TINY_PUBLISH=ON \
-DLITE_WITH_OPENMP=OFF \
-DWITH_ARM_DOTPROD=OFF \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=ON \
-DARM_TARGET_OS=ios
make -j2
make -j4
cd -
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import logging

ops_list_path = sys.argv[1]
dest_path = sys.argv[2]

out_lines = [
    '#pragma once',
    '#include "paddle_lite_factory_helper.h"',
    '',
]

with open(ops_list_path) as f:
    for line in f:
        path = line.strip()
        status = ''
        with open(path) as g:
            lines = [v for v in g]
            for i in range(len(lines)):
                line = lines[i].strip()
                if not status:
                    key = 'REGISTER_LITE_KERNEL'
                    if line.startswith(key):
                        forward = i + min(7, len(lines) - i)
                        remaining = line[len(key) + 1:] + ' '.join(
                            [v.strip() for v in lines[i + 1:forward]])
                        x = remaining.find('.')
                        if x > 0:
                            remaining = remaining[:x]
                        fs = [v.strip() for v in remaining.split(',')]
                        # the registration macro carries six leading fields
                        assert (len(fs) >= 6)
                        op, target, precision, layout, __, alias = fs[:6]
                        alias = alias.replace(')', '')
                        key = "USE_LITE_KERNEL(%s, %s, %s, %s, %s);" % (
                            op, target, precision, layout, alias)
                        out_lines.append(key)

with open(dest_path, 'w') as f:
    logging.info("write kernel list to %s" % dest_path)
    f.write('\n'.join(out_lines))
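To illustrate what the collector emits (the kernel class name here is hypothetical), a registration such as

REGISTER_LITE_KERNEL(flatten, kARM, kFloat, kNCHW, ReshapeCompute, def)

is condensed in the generated header to

USE_LITE_KERNEL(flatten, kARM, kFloat, kNCHW, def);

that is, the script keeps the op, target, precision, layout, and alias fields and drops the kernel class along with everything after the first '.'.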
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
''' Collect op registry information. '''
import sys
import logging

ops_list_path = sys.argv[1]
dest_path = sys.argv[2]

out_lines = [
    '#pragma once',
    '#include "paddle_lite_factory_helper.h"',
    '',
]

with open(ops_list_path) as f:
    for line in f:
        path = line.strip()
        with open(path) as g:
            for line in g:
                key = 'REGISTER_LITE_OP'
                if line.startswith(key):
                    end = line.find(',')
                    op = line[len(key) + 1:end]
                    if not op:
                        continue
                    if "_grad" in op:
                        continue
                    out = "USE_LITE_OP(%s);" % op
                    out_lines.append(out)

with open(dest_path, 'w') as f:
    logging.info("write op list to %s" % dest_path)
    f.write('\n'.join(out_lines))
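Given the ops registered earlier in this change, the header this script writes would contain, for example:

#pragma once
#include "paddle_lite_factory_helper.h"

USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);

Gradient ops are skipped because the lite predictor only runs inference.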
......@@ -115,7 +115,7 @@ void FillTensorData(lite::Tensor* tensor, const DebugConfig& conf, int col) {
data[i] = input_data[i];
}
} else {
LOG(INFO) << "------------> Use all-ones input";
LOG(INFO) << "-------------> Use all-ones input";
for (int i = 0; i < dim_size; i++) {
data[i] = 1;
}
......
......@@ -33,7 +33,7 @@ void Run(DebugConfig* conf) {
CHECK(conf);
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, conf->arm_thread_num);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, conf->arm_thread_num);
#endif
lite::Predictor predictor;
std::vector<Place> valid_places({
......
......@@ -35,7 +35,7 @@ static bool IsFileExists(const std::string& path) {
// ARM mobile not support mkdir in C++
static void MkDirRecur(const std::string& path) {
#ifndef LITE_WITH_ARM
if(system(string_format("mkdir -p %s", path.c_str()).c_str()) != 0) {
if (system(string_format("mkdir -p %s", path.c_str()).c_str()) != 0) {
LOG(ERROR) << "Can't mkdir " << path;
}
#else // On ARM
......
......@@ -24,6 +24,7 @@
#include <cstdlib>
#include <cstring>
#include <string>
#include <assert.h>
#include "lite/utils/replace_stl/stream.h"
// NOLINTFILE()
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
Checks: >
  *,
  -android-*,
  -bugprone-bool-pointer-implicit-conversion,
  -cert-env33-c,
  -cert-dcl50-cpp,
  -cert-dcl59-cpp,
  -cppcoreguidelines-*,
  -fuchsia-*,
  -google-*,
  google-default-arguments,
  google-explicit-constructor,
  google-runtime-member-string-references,
  google-runtime-operator,
  -hicpp-braces-around-statements,
  -hicpp-named-parameter,
  -hicpp-no-array-decay,
  -hicpp-no-assembler,
  -hicpp-no-malloc,
  -hicpp-function-size,
  -hicpp-special-member-functions,
  -hicpp-vararg,
  -llvm-*,
  -objc-*,
  -readability-else-after-return,
  -readability-implicit-bool-conversion,
  -readability-named-parameter,
  -readability-simplify-boolean-expr,
  -readability-braces-around-statements,
  -readability-identifier-naming,
  -readability-function-size,
  -readability-redundant-member-init,
  -misc-bool-pointer-implicit-conversion,
  -misc-definitions-in-headers,
  -misc-unused-alias-decls,
  -misc-unused-parameters,
  -misc-unused-using-decls,
  -modernize-use-using,
  -modernize-use-default-member-init,
  -clang-diagnostic-*,
  -clang-analyzer-*
WarningsAsErrors: '*'
HeaderFilterRegex: ''
AnalyzeTemporaryDtors: false
FormatStyle: none
User: allonli
CheckOptions:
  - key: google-readability-braces-around-statements.ShortStatementLines
    value: '1'
  - key: google-readability-function-size.StatementThreshold
    value: '800'
  - key: google-readability-namespace-comments.ShortNamespaceLines
    value: '10'
  - key: google-readability-namespace-comments.SpacesBeforeComments
    value: '2'
  - key: modernize-loop-convert.MaxCopySize
    value: '16'
  - key: modernize-loop-convert.MinConfidence
    value: reasonable
  - key: modernize-loop-convert.NamingStyle
    value: CamelCase
  - key: modernize-pass-by-value.IncludeStyle
    value: llvm
  - key: modernize-replace-auto-ptr.IncludeStyle
    value: llvm
  - key: modernize-use-nullptr.NullMacros
    value: 'NULL'
opencl_kernels.cpp
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.lib
*.a
# Executables
*.exe
*.out
*.app
.DS_Store
build/
.idea/
CMakeCache.txt
CMakeFiles/
Makefile
cmake_install.cmake
*.cbp
paddle-mobile.cbp
.idea
compile_commands.json
cmake-build-debug/
cmake-build-release/
test/models/
test/images/
# Emacs intermediate files
*~
# CMake building directory
build
# clion building directories
cmake-build-debug
cmake-build-release
# ios
tools/libomp.a
# ios demo
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
*.xcuserstate
/tools/quantification/quantify
# metal
Podfile.lock
metal/Pods/
SwiftProtobuf.framework
paddle-mobile.xcworkspace
metal/models/
metal/images/
*.a
metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
*.xcuserdatad/
*/xcuserdata/
/venv/
metal/paddle-mobile-demo/paddle-mobile-demo/images
metal/paddle-mobile-demo/paddle-mobile-demo/models
metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
repos:
  - repo: https://github.com/Lucas-C/pre-commit-hooks.git
    sha: v1.0.1
    hooks:
      - id: remove-crlf
        files: ^(mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
        exclude: ^(lite/)
      - id: remove-tabs
        files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
        exclude: ^(lite/)
  - repo: https://github.com/pre-commit/pre-commit-hooks
    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
    hooks:
      - id: check-added-large-files
        exclude: ^(lite/)
      - id: check-merge-conflict
        exclude: ^(lite/)
      - id: check-symlinks
        exclude: ^(lite/)
      - id: detect-private-key
        files: (?!.*tar.gz)^.*$
        exclude: ^(lite/)
      - id: end-of-file-fixer
        files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: ^(lite/)
      - id: trailing-whitespace
        files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: ^(lite/)
  - repo: local
    hooks:
      - id: copyright
        name: copyright
        entry: python ./mobile/tools/pre-commit.hooks/copyright.hook
        language: system
        files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx|py)$
        exclude: (?!.*third_party)^.*$|(?!.*book)^.*$|^(lite/)
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat.
        entry: bash ./mobile/tools/pre-commit.hooks/clang-format.hook -i
        language: system
        files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: ^(lite/)
  - repo: local
    hooks:
      - id: cpplint
        name: cpplint
        description: Check C++ code style using cpplint.
        entry: bash ./mobile/tools/pre-commit.hooks/cpplint.hook
        language: system
        files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: (?!.*third_party)^.*$|(?!.*book)^.*$|.*\.pb\.cpp|^(lite/)
#
#- repo: local
#  hooks:
#    - id: clang-tidy
#      name: clang-tidy
#      description: Check C++ code style using clang-tidy.
#      entry: bash ./tools/pre-commit.hooks/.clang-tidy.hook -i
#      language: system
#      files: (src).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
language: cpp
cache: ccache
sudo: required
dist: trusty
os:
  - linux
addons:
  apt:
    packages:
      - git
      - python
      - python-pip
      - python2.7-dev
      - libc6-i386
      - curl
compiler:
  - clang
before_install:
  - sudo pip install -U virtualenv pre-commit pip
# Download and install recent cmake
script:
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
  - |
    timeout 600 .travis/pre-commit-job.sh # 10min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
notifications:
  email:
    on_success: change
    on_failure: always
#!/bin/bash
function abort(){
    echo "Your change doesn't follow Paddle-Mobile's code style" 1>&2
    echo "Please use pre-commit to auto-format your code." 1>&2
    exit 1
}
trap 'abort' 0
set -e
cd `dirname $0`
cd ..
export PATH=/usr/bin:$PATH
pre-commit install
if ! pre-commit run -a ; then
    ls -lh
    git diff --exit-code
    exit 1
fi
trap : 0
......@@ -96,6 +96,21 @@ class CLEngine {
    return std::move(program_ptr);
  }

  std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWithSource(
      cl_context context, const char *source) {
    size_t sourceSize[] = {strlen(source)};
    cl_program p =
        clCreateProgramWithSource(context, 1, &source, sourceSize, &status_);
    DLOG << " cl kernel from source";
    DLOG << " source size: " << sourceSize[0];
    CL_CHECK_ERRORS(status_);
    std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p);
    return std::move(program_ptr);
  }

  std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
    cl_event event = clCreateUserEvent(context, &status_);
    std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
......@@ -24,6 +26,10 @@ limitations under the License. */
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
extern const std::map<std::string, std::vector<unsigned char>> opencl_kernels;
extern const std::vector<std::string> need_conv_header_kernels;
namespace framework {
class CLScope {
......@@ -62,15 +68,35 @@ class CLScope {
      return it->second.get();
    }

    auto program = CLEngine::Instance()->CreateProgramWith(
        context_,
        CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
    DLOG << " --- begin build program -> " << program_key << " --- ";
    CLEngine::Instance()->BuildProgram(program.get(), options);
    DLOG << " --- end build program -> " << program_key << " --- ";
    programs_[program_key] = std::move(program);

    if (opencl_kernels.find(file_name) != opencl_kernels.end()) {
      auto it = opencl_kernels.find(file_name);
      std::string source(it->second.begin(), it->second.end());
      if (std::find(need_conv_header_kernels.begin(),
                    need_conv_header_kernels.end(),
                    file_name) != need_conv_header_kernels.end()) {
        auto it = opencl_kernels.find("conv_kernel.inc.cl");
        std::string header(it->second.begin(), it->second.end());
        source = header + source;
      }
      auto program = CLEngine::Instance()->CreateProgramWithSource(
          context_, source.c_str());
      DLOG << " --- begin build program -> " << program_key << " --- ";
      CLEngine::Instance()->BuildProgram(program.get(), options);
      DLOG << " --- end build program -> " << program_key << " --- ";
      programs_[program_key] = std::move(program);
    } else {
      auto program = CLEngine::Instance()->CreateProgramWith(
          context_,
          CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
      DLOG << " --- begin build program -> " << program_key << " --- ";
      CLEngine::Instance()->BuildProgram(program.get(), options);
      DLOG << " --- end build program -> " << program_key << " --- ";
      programs_[program_key] = std::move(program);
    }

    return programs_[program_key].get();
  }
......
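The lookup order added above is: use the kernel source embedded in the binary when the code-generation script (shown later in this change) has packed it into opencl_kernels, prepend the conv_kernel.inc.cl header for kernels that require it, and only otherwise read the .cl file from disk. A self-contained sketch of that lookup with simplified types (the real code goes through CLEngine):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Prefer a kernel embedded in the binary; fall back to reading the .cl file
// from disk (stubbed out here).
std::string LoadKernelSource(
    const std::map<std::string, std::vector<unsigned char>> &embedded,
    const std::string &file_name) {
  auto it = embedded.find(file_name);
  if (it != embedded.end()) {
    return std::string(it->second.begin(), it->second.end());
  }
  return "/* would read cl_kernel/" + file_name + " from disk */";
}

int main() {
  std::map<std::string, std::vector<unsigned char>> embedded = {
      {"relu.cl", {'_', '_', 'k', 'e', 'r', 'n', 'e', 'l'}}};
  std::cout << LoadKernelSource(embedded, "relu.cl") << std::endl;
  return 0;
}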
......@@ -16,9 +16,9 @@ limitations under the License. */
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <memory>
#include <utility>
#include <vector>
......
......@@ -14,10 +14,10 @@ limitations under the License. */
#ifdef CONDITIONAL_BLOCK_OP
#include <algorithm>
#include "operators/kernel/conditional_block_kernel.h"
#include <framework/program/block_desc.h>
#include <framework/program/op_desc.h>
#include <algorithm>
#include "framework/data_type.h"
namespace paddle_mobile {
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import os
import sys

source = """
#pragma
#ifdef PADDLE_MOBILE_CL
#include <map>
#include <string>
#include <vector>
namespace paddle_mobile {
extern const std::map<std::string, std::vector<unsigned char>> opencl_kernels = {
%s
};
extern const std::vector<std::string> need_conv_header_kernels = {
%s
};
}
#endif
"""


def string_to_hex(code_str):
    hex_list = []
    for i in range(len(code_str)):
        hex_ = hex(ord(code_str[i]))
        hex_list.append(hex_)
    return hex_list


infile = open("cl_kernel/cl_common.h", "r")
common_content = infile.read()
infile.close()
common_content = re.sub(r"/\*[^*]*\*/", "", common_content, flags=re.DOTALL)
lines = common_content.split("\n")
new_lines = []
for i in range(len(lines)):
    line = lines[i]
    line = line.strip()
    if line == "":
        continue
    if line.startswith("//"):
        continue
    line = re.sub(r"//.*$", "", line)
    new_lines.append(line)
common_content = "\n".join(new_lines)

need_conv_header_kernels = []
cores = ""
filenames = os.listdir("cl_kernel")
file_count = len(filenames)
for i in range(file_count):
    filename = filenames[i]
    infile = open("cl_kernel/" + filename, "r")
    new_lines = []
    content = infile.read()
    content = re.sub(r"/\*[^*]*\*/", "", content, flags=re.DOTALL)
    infile.close()
    lines = content.split("\n")
    # iterate with j so the file index i is not clobbered by the line loop
    for j in range(len(lines)):
        line = lines[j]
        line = line.strip()
        if line == "":
            continue
        if line.startswith("//"):
            continue
        line = re.sub(r"//.*$", "", line)
        if "cl_common.h" in line:
            line = common_content
        elif "conv_kernel.inc.cl" in line:
            need_conv_header_kernels.append("\"%s\"" % filename)
            continue
        new_lines.append(line)
    content = "\n".join(new_lines)
    if content == "":
        content = " "
    hexes = []
    for char in content:
        hexes.append(hex(ord(char)))
    core = " {\"%s\", {" % filename
    for item in hexes:
        core += str(item) + ", "
    core = core[:-2]
    core += "}}"
    if i != file_count - 1:
        core += ",\n"
    cores += core

source = source % (cores, ",".join(need_conv_header_kernels))
print(source)
This diff has been collapsed.
......@@ -2,6 +2,15 @@
NETS=""
declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet" "super" "op")
# merge cl to so
merge_cl_to_so=1
rm -f ../src/operators/kernel/cl/opencl_kernels.cpp
if [ $merge_cl_to_so == 1 ]; then
    cd ../src/operators/kernel/cl
    python gen_code.py > opencl_kernels.cpp
    cd -
fi
build_for_mac() {
if [ ! `which brew` ]; then
echo "building failed! homebrew not found, please install homebrew."
......
#!/bin/bash
set -e
# set -e
readonly VERSION="5.0"
......
......@@ -535,6 +535,7 @@ def main():
push(checked_model_path)
push(feed_path + "/" + last_feed_file_name, "input.txt")
push(mobile_src_root + "/build/release/arm-v7a/build/libpaddle-mobile.so")
push(mobile_src_root + "/build/release/arm-v7a/build/cl_kernel")
push(mobile_src_root + "/test/build/test-net")
last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
args = str(len(last_feed_var_shape))
......