Commit f4e27ad1 authored by jackzhang235

Merge remote-tracking branch 'upstream/develop' into develop

@@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
build*
@@ -57,22 +57,20 @@ function(check_linker_flag)
  endforeach()
  set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (LITE_ON_TINY_PUBLISH)
  if((NOT LITE_WITH_PYTHON))
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
  endif()
  if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
  endif()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
  check_linker_flag(-Wl,--gc-sections)
endif()
if(LITE_WITH_OPENCL)
  if(ARM_TARGET_LANG STREQUAL "clang")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
  endif()
endif()
if(LITE_WITH_OPENMP)
  find_package(OpenMP REQUIRED)
  if(OPENMP_FOUND OR OpenMP_CXX_FOUND)
......
@@ -285,6 +285,11 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# file to record faked kernels for opt python lib
set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt")
file(WRITE ${fake_kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
  set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
@@ -313,56 +318,65 @@ function(add_kernel TARGET device level)
    return()
  endif()
  if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    # the source list is collected so that model_optimize_tool can generate fake kernels.
    foreach(src ${args_SRCS})
      file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
    endforeach()
    return()
  endif()
  # when compiling model_optimize_tool, a source file containing all the fake kernel
  # definitions is generated, so there is no need to compile the real kernel sources.
  if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    return()
  endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if ("${device}" STREQUAL "Host") if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "ARM") if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM) if (NOT LITE_WITH_ARM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "X86") if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86) if (NOT LITE_WITH_X86)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "NPU") if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU) if (NOT LITE_WITH_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "XPU") if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU) if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "FPGA") if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA) if (NOT LITE_WITH_FPGA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "BM") if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM) if (NOT LITE_WITH_BM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
...@@ -375,6 +389,9 @@ function(add_kernel TARGET device level) ...@@ -375,6 +389,9 @@ function(add_kernel TARGET device level)
endif() endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
...@@ -382,6 +399,9 @@ function(add_kernel TARGET device level) ...@@ -382,6 +399,9 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "CUDA") if ("${device}" STREQUAL "CUDA")
if (NOT LITE_WITH_CUDA) if (NOT LITE_WITH_CUDA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "")
......
@@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
> Performance numbers differ from phone to phone and from version to version.
```shell
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
mnasnet        min = 19.83500    max = 19.38500    average = 19.65503
mobilenetv1    min = 32.00600    max = 31.56900    average = 31.81983
mobilenetv2    min = 22.37900    max = 22.08700    average = 22.28623
shufflenetv2   min = 10.80400    max = 10.62900    average = 10.68890
squeezenet     min = 17.67400    max = 17.47900    average = 17.57677
Threads=2 Warmup=10 Repeats=30
mnasnet        min = 11.85600    max = 11.72000    average = 11.77127
mobilenetv1    min = 18.75000    max = 18.64300    average = 18.70593
mobilenetv2    min = 14.05100    max = 13.59900    average = 13.71450
shufflenetv2   min = 6.67200     max = 6.58300     average = 6.63400
squeezenet     min = 12.07100    max = 11.33400    average = 11.41253
Threads=4 Warmup=10 Repeats=30
mnasnet        min = 7.19300     max = 7.02600     average = 7.08480
mobilenetv1    min = 10.42000    max = 10.29100    average = 10.34267
mobilenetv2    min = 8.61900     max = 8.46900     average = 8.54707
shufflenetv2   min = 4.55200     max = 4.41900     average = 4.46477
squeezenet     min = 8.60000     max = 7.85200     average = 7.98407
--------------------------------------
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
mnasnet        min = 20.98300    max = 20.81400    average = 20.92527
mobilenetv1    min = 33.19000    max = 32.81700    average = 33.08490
mobilenetv2    min = 25.91400    max = 25.61700    average = 25.73097
shufflenetv2   min = 11.14300    max = 10.97600    average = 11.06757
squeezenet     min = 19.31800    max = 19.20000    average = 19.26530
Threads=2 Warmup=10 Repeats=30
mnasnet        min = 12.59900    max = 12.46600    average = 12.52207
mobilenetv1    min = 19.05800    max = 18.94700    average = 18.97897
mobilenetv2    min = 15.28400    max = 15.11300    average = 15.19843
shufflenetv2   min = 6.97000     max = 6.81400     average = 6.90863
squeezenet     min = 12.87900    max = 12.12900    average = 12.22530
Threads=4 Warmup=10 Repeats=30
mnasnet        min = 7.31400     max = 7.12900     average = 7.20357
mobilenetv1    min = 11.44000    max = 10.86900    average = 10.94383
mobilenetv2    min = 9.14900     max = 9.03800     average = 9.09907
shufflenetv2   min = 4.60600     max = 4.49400     average = 4.53360
squeezenet     min = 8.27000     max = 8.10600     average = 8.19000
--------------------------------------
```
@@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
        --optimize_out_type=(protobuf|naive_buffer) \
        --optimize_out=<output_optimize_model_dir> \
        --valid_targets=npu,arm \
        --prefer_int8_kernel=(true|false) \
        --record_tailoring_info =(true|false)
```
- The model generated by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually produce an NPU HiAI model. The marked Paddle operators are converted to HiAI IR at execution time, and only then is the HiAI model generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation details.
......
@@ -65,9 +65,11 @@ rm ./lite/api/paddle_use_ops.h
    --arm_os=android \
    --arm_abi=armv8 \
    --arm_lang=gcc \
    build_opencl
```
Note: to debug a cl kernel, assuming the build script above has already been run (i.e. the cmake files have been generated), simply edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`, save it, and run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; this command regenerates the embedded kernel sources from the modified files. Then switch to the build directory and run `make publish_inference` (or the executable name of the unit test you want to build); the cl kernel contents are packed automatically during compilation into the build artifacts, e.g. the .so or the corresponding unit-test executable.
### Build artifacts
The build artifacts are located in the `inference_lite_lib.android.armv8.opencl` folder under `build.lite.android.armv8.gcc.opencl`; only the key artifacts are listed here:
......
@@ -39,7 +39,7 @@ Paddle-Lite supports **tailoring the inference library according to the model**.
For example:
```bash
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish
```
**Note**: `../mobilenet_v1NB` in the command above is the output directory of the model converted in step 1.
@@ -88,9 +88,6 @@ Paddle-Lite supports **tailoring the inference library according to the model**.
#include <stdio.h>
#include <vector>
#include "paddle_api.h"          // NOLINT
#include "paddle_use_kernels.h"  // NOLINT
#include "paddle_use_ops.h"      // NOLINT
#include "paddle_use_passes.h"   // NOLINT
using namespace paddle::lite_api;  // NOLINT
@@ -182,4 +179,4 @@ int main(int argc, char** argv) {
1. The models in the set **must** either all be combined-parameter models or all be non-combined-parameter models.
2. When using non-combined-parameter models, the model topology file must be named `__model__`; when using combined-parameter models, the topology and parameter file names must be the same across all models in the set and are specified with `--model_filename` and `--param_filename` respectively.
3. The models in the set **must** either all be INT8 quantized models or all be non-quantized models.
4. The model optimization tool must be built from Paddle-Lite source newer than `release/v2.1.0`.
@@ -83,7 +83,6 @@ PaddlePaddle models come in two saved formats:
    --optimize_out_type=(protobuf|naive_buffer) \
    --optimize_out=<output_optimize_model_dir> \
    --valid_targets=(arm|opencl|x86|npu|xpu) \
    --prefer_int8_kernel=(true|false) \
    --record_tailoring_info =(true|false)
```
@@ -95,12 +94,12 @@ PaddlePaddle models come in two saved formats:
| --optimize_out_type | Output model format. Two formats are currently supported: protobuf and naive_buffer, where naive_buffer is a more lightweight serialization/deserialization implementation. If you need to run inference on mobile, set this option to naive_buffer. Defaults to protobuf. |
| --optimize_out | Output path of the optimized model. |
| --valid_targets | The backends the model can run on; defaults to arm. Currently x86, arm, opencl, npu and xpu are supported, and several backends can be specified at once (separated by spaces); the Model Optimize Tool will automatically pick the best one. To support Huawei NPU (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set this to npu, arm. |
| --prefer_int8_kernel | If the model to be optimized is an int8 quantized model (e.g. produced by quantization-aware training), set this option to true to use int8 kernels for faster inference; defaults to false. |
| --record_tailoring_info | When using the [tailor the library according to the model](./library_tailoring.html) feature, set this option to true to record the kernel and OP information contained in the optimized model; defaults to false. |
* If the fluid model to be optimized is in non-combined form, set `--model_dir` and ignore `--model_file` and `--param_file`.
* If the fluid model to be optimized is in combined form, set `--model_file` and `--param_file` and ignore `--model_dir`.
* The optimized model is a single file whose name ends in `.nb`.
* The `prefer_int8_kernel` input argument has been removed; `opt` now detects automatically whether the model is quantized and applies the corresponding optimizations.
### Feature 2: Collect a model's operator information and check whether it is supported
......
@@ -245,7 +245,6 @@ python compress.py \
    --optimize_out_type=naive_buffer \
    --optimize_out=mobilenet_v1_quant_opt \
    --valid_targets=arm \
    --prefer_int8_kernel=true
```
As mentioned earlier, after quantization-aware training the model parameters in the float directory already lie in the int8 value range, but they are still stored as float32, so no real parameter compression has happened yet. After optimization with model_optimize_tool, however, the corresponding quantized parameters are re-stored as int8, which achieves the compression, and the model structure is optimized as well (e.g. various operator fuse passes are applied).
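To make the compression effect concrete, here is a small illustrative sketch (not part of this patch; the tensor shape and values are invented for illustration) of why re-storing int8-range weights as int8 shrinks them to a quarter of their float32 size:

```python
import numpy as np

# Hypothetical weight tensor after quantization-aware training: the values
# already lie in the int8 range [-127, 127] but are still stored as float32.
w_float = np.random.randint(-127, 128, size=(256, 256)).astype(np.float32)

# Re-storing the same values as int8 (what the optimization step effectively
# does for quantized weights) keeps the values but quarters the storage.
w_int8 = w_float.astype(np.int8)

print(w_float.nbytes)  # 262144 bytes as float32
print(w_int8.nbytes)   # 65536 bytes as int8
```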
......
@@ -86,7 +86,6 @@ WeightQuantization.quantize_weight_to_int(save_model_dir,
Refer to [model conversion](../user_guides/model_optimize_tool) to obtain the model conversion tool; downloading it from the Release page is recommended.
Refer to [model conversion](../user_guides/model_optimize_tool) for how to use the tool.
Because this model dequantizes the quantized weights and then actually loads and runs an FP32 inference model, the opt argument --prefer_int8_kernel does not need to be set to true; set the remaining arguments according to your situation and the documentation.
For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
......
@@ -147,13 +147,12 @@ with fluid.name_scope('skip_quant'):
Refer to [model conversion](../user_guides/model_optimize_tool) to obtain the model conversion tool; downloading it from the Release page is recommended.
Refer to [model conversion](../user_guides/model_optimize_tool) for how to use the tool, setting the arguments according to your situation. For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
    --optimize_out_type=naive_buffer \
    --optimize_out=mobilenet_v1_quant_opt \
    --valid_targets=arm
    --prefer_int8_kernel=true
```
### 3.2 Quantized model inference
......
@@ -24,8 +24,7 @@ $ ./opt \
    --param_file=<param_path> \
    --optimize_out_type=(protobuf|naive_buffer) \
    --optimize_out=<output_optimize_model_dir> \
    --valid_targets=(arm|opencl|x86)
    --prefer_int8_kernel=(ture|false)
```
Here, optimize_out is the output path for the optimized model, and optimize_out_type specifies the serialization format of the output model; Protobuf and Naive Buffer are currently supported, with Naive Buffer being a more lightweight serialization/deserialization implementation. If you need to run inference on mobile with Lite, set optimize_out_type=naive_buffer.
......
@@ -84,7 +84,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
if (LITE_WITH_PYTHON)
    add_custom_target(publish_inference_python_lib ${TARGET}
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
    add_custom_target(publish_inference_python_installer ${TARGET}
            COMMAND python setup.py bdist_wheel
            WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
            DEPENDS publish_inference_python_lib)
    add_custom_target(publish_inference_python_light_demo ${TARGET}
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python"
            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/")
@@ -96,6 +105,7 @@ if (LITE_WITH_PYTHON)
    endif()
    add_dependencies(publish_inference_python_lib lite_pybind)
    add_dependencies(publish_inference publish_inference_python_lib)
    add_dependencies(publish_inference publish_inference_python_installer)
    add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
@@ -213,6 +223,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
        add_dependencies(publish_inference tiny_publish_cxx_lib)
        if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
            add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
                    COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
                    COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
        endif()
    endif()
......
@@ -308,6 +308,11 @@ if (LITE_ON_TINY_PUBLISH)
    return()
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    message(STATUS "Compiling opt")
    lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
......
@@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
  inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
  inner_places.emplace_back(
      TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
  const std::vector<std::string> quant_dequant_op = {
      "fake_quantize_abs_max",
      "fake_quantize_range_abs_max",
      "fake_quantize_moving_average_abs_max",
      "fake_quantize_dequantize_moving_average_abs_max",
      "fake_dequantize_max_abs",
      "fake_channel_wise_dequantize_max_abs"};
  bool is_quantized_model = false;
  for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model;
       ++i) {
    auto *block_desc = program_desc_.GetBlock<cpp::BlockDesc>(i);
    for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) {
      auto *op_desc = block_desc->GetOp<cpp::OpDesc>(j);
      std::string op_type = op_desc->Type();
      if (std::find(quant_dequant_op.begin(),
                    quant_dequant_op.end(),
                    op_type) != quant_dequant_op.end()) {
        is_quantized_model = true;
      }
    }
  }
  if (is_quantized_model) {
    inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
  }
  Program program(desc, scope_, inner_places);
  core::KernelPickFactor factor;
......
@@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) {
int main(int argc, char** argv) {
  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
  paddle::lite::Run(argv[1], atoi(argv[2]));
  return 0;
}
......
@@ -58,6 +58,7 @@ void LightPredictorImpl::Run() {
std::shared_ptr<lite_api::PaddlePredictor> LightPredictorImpl::Clone() {
  LOG(FATAL) << "The Clone API is not supported in LigthPredictor";
  return nullptr;
}
std::string LightPredictorImpl::GetVersion() const { return lite::version(); }
......
@@ -95,7 +95,7 @@ void TestModel(const std::vector<Place>& valid_places,
  if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
    ASSERT_EQ(out->dims().production(), 1000);
    double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1;
    for (int i = 0; i < ref.size(); ++i) {
      for (int j = 0; j < ref[i].size(); ++j) {
        auto result = pdata[j * step + (out->dims()[1] * i)];
@@ -119,21 +119,21 @@ void TestModel(const std::vector<Place>& valid_places,
  // Get detailed result
  size_t output_tensor_num = predictor.GetOutputNames().size();
  VLOG(1) << "output tensor num:" << output_tensor_num;
  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
    auto* output_tensor = predictor.GetOutput(tidx);
    VLOG(1) << "============= output tensor " << tidx << " =============\n";
    auto out_dims = output_tensor->dims();
    auto out_data = output_tensor->data<float>();
    auto out_mean = compute_mean<float>(out_data, out_dims.production());
    auto out_std_dev = compute_standard_deviation<float>(
        out_data, out_dims.production(), true, out_mean);
    VLOG(1) << "output tensor dims:" << out_dims;
    VLOG(1) << "output tensor elements num:" << out_dims.production();
    VLOG(1) << "output tensor standard deviation:" << out_std_dev;
    VLOG(1) << "output tensor mean value:" << out_mean;
    // print result
    for (int i = 0; i < out_dims.production(); ++i) {
......
@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
  if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
    ASSERT_EQ(out->dims().production(), 1000);
    double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
    for (int i = 0; i < ref.size(); ++i) {
      for (int j = 0; j < ref[i].size(); ++j) {
        auto result = pdata[j * step + (out->dims()[1] * i)];
@@ -121,21 +121,21 @@ void TestModel(const std::vector<Place>& valid_places,
  // Get detailed result
  size_t output_tensor_num = predictor.GetOutputNames().size();
  VLOG(1) << "output tensor num:" << output_tensor_num;
  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
    auto* output_tensor = predictor.GetOutput(tidx);
    VLOG(1) << "============= output tensor " << tidx << " =============\n";
    auto out_dims = output_tensor->dims();
    auto out_data = output_tensor->data<float>();
    auto out_mean = compute_mean<float>(out_data, out_dims.production());
    auto out_std_dev = compute_standard_deviation<float>(
        out_data, out_dims.production(), true, out_mean);
    VLOG(1) << "output tensor dims:" << out_dims;
    VLOG(1) << "output tensor elements num:" << out_dims.production();
    VLOG(1) << "output tensor standard deviation:" << out_std_dev;
    VLOG(1) << "output tensor mean value:" << out_mean;
    // print result
    for (int i = 0; i < out_dims.production(); ++i) {
......
@@ -138,7 +138,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
    std::ofstream out(FLAGS_arg_name + ".txt");
    for (size_t i = 0; i < arg_num; ++i) {
      sum += arg_tensor->data<float>()[i];
      out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
    }
    LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
              << ", mean value is " << sum * 1. / arg_num;
......
@@ -250,7 +250,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
    std::ofstream out(FLAGS_arg_name + ".txt");
    for (size_t i = 0; i < arg_num; ++i) {
      sum += arg_tensor->data<float>()[i];
      out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
    }
    LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
              << ", mean value is " << sum * 1. / arg_num;
......
@@ -264,7 +264,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
    std::ofstream out(FLAGS_arg_name + ".txt");
    for (size_t i = 0; i < arg_num; ++i) {
      sum += arg_tensor->data<float>()[i];
      out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
    }
    LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
              << ", mean value is " << sum * 1. / arg_num;
......
@@ -67,7 +67,6 @@ DEFINE_string(valid_targets,
              "arm",
              "The targets this model optimized for, should be one of (arm, "
              "opencl, x86), splitted by space");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
DEFINE_bool(print_supported_ops,
            false,
            "Print supported operators on the inputed target");
@@ -123,11 +122,6 @@ std::vector<Place> ParserValidPlaces() {
      << "At least one target should be set, should set the "
         "command argument 'valid_targets'";
  if (FLAGS_prefer_int8_kernel) {
    LOG(WARNING) << "Int8 mode is only support by ARM target";
    valid_places.insert(valid_places.begin(),
                        Place{TARGET(kARM), PRECISION(kInt8)});
  }
  return valid_places;
}
@@ -257,7 +251,6 @@ void PrintHelpInfo() {
      "  `--optimize_out_type=(protobuf|naive_buffer)`\n"
      "  `--optimize_out=<output_optimize_model_dir>`\n"
      "  `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
      "  `--prefer_int8_kernel=(true|false)`\n"
      "  `--record_tailoring_info=(true|false)`\n"
      "  Arguments of model checking and ops information:\n"
      "  `--print_all_ops=true` Display all the valid operators of "
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/api/opt_base.h"
#include "all_kernel_faked.cc" // NOLINT
namespace paddle {
namespace lite_api {
void OptBase::SetModelDir(const std::string& model_path) {
opt_config_.set_model_dir(model_path);
}
void OptBase::SetModelFile(const std::string& model_path) {
opt_config_.set_model_file(model_path);
}
void OptBase::SetParamFile(const std::string& param_path) {
opt_config_.set_param_file(param_path);
}
void OptBase::SetModelType(std::string optimize_out_type) {
if (optimize_out_type == "protobuf") {
model_type_ = LiteModelType::kProtobuf;
} else if (optimize_out_type == "naive_buffer") {
model_type_ = LiteModelType::kNaiveBuffer;
} else {
LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
}
}
void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.clear();
auto target_reprs = lite::Split(valid_places, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places_.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
"'valid_targets'",
target_repr.c_str());
}
}
CHECK(!valid_places_.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
}
void OptBase::RunOptimize(bool record_strip_info) {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
const std::string& filename) {
std::set<std::string> total;
for (const auto& name : models) {
std::string model_path =
lite::Join<std::string>({output_dir, name, filename}, "/");
auto lines = lite::ReadLines(model_path);
total.insert(lines.begin(), lines.end());
}
std::string output_path =
lite::Join<std::string>({output_dir, filename}, "/");
lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
output_path);
}
void OptBase::SetModelSetDir(const std::string& model_set_path) {
model_set_dir_ = model_set_path;
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
}
// 2. optimize each model in inputed model set dir.
std::string model_file = opt_config_.model_file();
std::string param_file = opt_config_.param_file();
for (const auto& name : model_dirs) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
lite::Join<std::string>({input_model_dir, model_file}, "/");
auto param_file_path =
lite::Join<std::string>({input_model_dir, param_file}, "/");
}
std::cout << "Start optimize model: " << input_model_dir;
opt_config_.set_model_dir(input_model_dir);
opt_config_.set_model_file(model_file);
opt_config_.set_param_file(param_file);
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
// 3. if record_strip_info = true, we will record striping info
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
}
// 2. Print the support info of the input ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
std::vector<std::string> lite_supported_targets = {"kHost",
"kX86",
"kCUDA",
"kARM",
"kOpenCL",
"kFPGA",
"kNPU",
"kXPU",
"kAny",
"kUnk"};
// Get the length of the first column: maximum length of the op_type
size_t maximum_optype_length = 0;
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
maximum_optype_length = it->first.size() > maximum_optype_length
? it->first.size()
: maximum_optype_length;
}
std::cout << std::setiosflags(std::ios::internal);
// Print the first row: OP_name target1 target2 ...
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
std::cout << std::setw(10) << lite_supported_targets[i].substr(1);
}
std::cout << std::endl;
// Print the name of supported ops and mark if it's supported by each target
// print the support info of the input ops: valid_ops
for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) {
std::cout << std::setw(maximum_optype_length) << *op;
// Check: If this kernel doesn't match any operator, we will skip it.
if (supported_ops.find(*op) == supported_ops.end()) {
continue;
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
lite_supported_targets[i]) != ops_valid_places.end()) {
std::cout << std::setw(10) << "Y";
} else {
std::cout << std::setw(10) << " ";
}
}
std::cout << std::endl;
}
}
void OptBase::DisplayKernelsInfo() { // Display kernel information
std::cout << ::paddle::lite::KernelRegistry::Global().DebugString();
}
void OptBase::PrintAllOps() {
// 1. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < supported_ops_target.size(); i++) {
auto ops = supported_ops_target[i];
valid_ops.insert(ops.begin(), ops.end());
}
// 2. Print support info of these ops
PrintOpsInfo(valid_ops);
}
void OptBase::PrintSupportedOps() {
// 1. Get the valid hardware targets
std::vector<TargetType> target_types = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
target_types.push_back(valid_places_[i].target);
}
std::string targets_str = TargetToStr(target_types[0]);
for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]);
}
std::cout << "Supported OPs on '" << targets_str << "': " << std::endl;
target_types.push_back(TARGET(kHost));
target_types.push_back(TARGET(kUnk));
// 2. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
// 3. Print support info of these ops
PrintOpsInfo(valid_ops);
}
// test whether this model is supported
void OptBase::CheckIfModelSupported(bool print_ops_info) {
// 1. parse valid places and valid targets
auto valid_ops = supported_ops_target[static_cast<int>(TARGET(kHost))];
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (size_t i = 0; i < valid_places_.size(); i++) {
auto target = valid_places_[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
}
// get valid ops
std::set<std::string> valid_ops_set(valid_ops.begin(), valid_ops.end());
// 2.Load model into program to get ops in model
std::string prog_path = opt_config_.model_dir() + "/__model__";
if (!(opt_config_.model_file()).empty() &&
!(opt_config_.param_file()).empty()) {
prog_path = opt_config_.model_file();
}
lite::cpp::ProgramDesc cpp_prog;
framework::proto::ProgramDesc pb_proto_prog =
*lite::LoadProgram(prog_path, false);
lite::pb::ProgramDesc pb_prog(&pb_proto_prog);
// Transform to cpp::ProgramDesc
lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog);
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
auto op_type = op_desc.Type();
input_model_ops.insert(op_type);
if (valid_ops_set.count(op_type) == 0) {
unsupported_ops.insert(op_type);
}
}
}
// 3. Print ops_info of input model and check if this model is supported
if (print_ops_info) {
std::cout << "OPs in the input model include:\n";
PrintOpsInfo(input_model_ops);
}
if (!unsupported_ops.empty()) {
std::string unsupported_ops_str = *unsupported_ops.begin();
for (auto op_str = ++unsupported_ops.begin();
op_str != unsupported_ops.end();
op_str++) {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
targets.push_back(valid_places_[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
LOG(ERROR) << "Error: This model is not supported, because "
<< unsupported_ops.size() << " ops are not supported on '"
<< targets_str << "'. These unsupported ops are: '"
<< unsupported_ops_str << "'.";
exit(1);
}
if (print_ops_info) {
std::cout << "Paddle-Lite supports this model!" << std::endl;
exit(1);
}
}
} // namespace lite_api
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines Opt and basic functions about model transformation.
*/
#ifndef PADDLE_LITE_OPT_H_ // NOLINT
#define PADDLE_LITE_OPT_H_
#include <algorithm>
#include <iomanip>
#include <set>
#include <string>
#include <vector>
// stores the map that records the source_file path of each kernel.
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
// version of Paddle-lite
#include "lite/core/version.h"
// model parser functions to pre-load model to verify if this model is supported
#include "lite/model_parser/compatible_pb.h"
#include "lite/model_parser/pb/program_desc.h"
#include "lite/utils/string.h"
// recorded all the ops supported by paddle-lite
#include "supported_kernel_op_info.h" // NOLINT
namespace paddle {
namespace lite_api {
/// OptBase provides the basic interfaces of the model optimization tool
/// (opt), which transforms and saves an optimized Paddle-Lite model.
class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
// functions for printing info
// 1. help info
void PrintHelpInfo();
// 2. PrintOpsInfo
void PrintOpsInfo(const std::set<std::string> &valid_ops =
{}); // print supported ops on target_types
void PrintAllOps(); // print all ops
void PrintSupportedOps(); // print ops supported on valid_places_
void DisplayKernelsInfo(); // Display kernel information
// 3. Check if this model is supported
void CheckIfModelSupported(bool print_ops_info = true);
private:
CxxConfig opt_config_;
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model
std::string model_set_dir_;
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
} // namespace lite_api
} // namespace paddle
#endif // NOLINT
@@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON)
    return()
endif()
# create setup.py for packaging the whl for Paddle-Lite and opt
execute_process(
COMMAND git describe --tags --exact-match
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_subdirectory(pybind)
#add_subdirectory(interface)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(PYBIND_DEPS pybind python paddle_api_light paddle_api)
if (NOT LITE_ON_TINY_PUBLISH)
    set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
......
@@ -26,11 +26,12 @@
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/cxx_api.h"
#include "lite/api/opt_base.h"
#endif
#include "lite/api/light_api.h"
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
namespace py = pybind11;
@@ -48,10 +49,27 @@ using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
static void BindLiteCxxPredictor(py::module *m);
void BindLiteOpt(py::module *m) {
py::class_<OptBase> opt_base(*m, "Opt");
opt_base.def(py::init<>())
.def("set_model_dir", &OptBase::SetModelDir)
.def("set_modelset_dir", &OptBase::SetModelSetDir)
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_model_type", &OptBase::SetModelType)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
.def("display_kernels_info", &OptBase::DisplayKernelsInfo)
.def("print_all_ops", &OptBase::PrintAllOps);
}
#endif
static void BindLiteLightPredictor(py::module *m);
static void BindLiteCxxConfig(py::module *m);
......
@@ -22,11 +22,15 @@ namespace lite {
namespace pybind {
void BindLiteApi(pybind11::module *m);
void BindLiteOpt(pybind11::module *m);
PYBIND11_MODULE(lite, m) {
  m.doc() = "C++ core of Paddle-Lite";
  BindLiteApi(&m);
#ifndef LITE_ON_TINY_PUBLISH
  BindLiteOpt(&m);
#endif
}
} // namespace pybind
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module of pack whl installer for Paddle-lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
def has_ext_modules(foo):
return True
# get paddle-lite version, if it's not based on a release tag, we use commit id instead
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
PADDLELITE_VERSION = PADDLELITE_COMMITE
else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
# set the rpath of lite.so so it can find the third-party libs in paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# The paddle.fluid.proto package is generated while compiling,
# so that package points to another directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <cmath>
// for eval // for eval
DEFINE_string(model_dir, "", "model dir"); DEFINE_string(model_dir, "", "model dir");
...@@ -43,5 +44,31 @@ inline double GetCurrentUS() { ...@@ -43,5 +44,31 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec; return 1e+6 * time.tv_sec + time.tv_usec;
} }
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
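These statistics helpers are straightforward, but the has_mean flag deserves a note: passing a precomputed mean skips the second pass over the buffer. Below is a minimal standalone usage sketch (the header path lite/api/test_helper.h is assumed; the numbers are illustrative, not from any model):

#include <iostream>
#include <vector>
#include "lite/api/test_helper.h"  // assumed location of the helpers above

int main() {
  std::vector<float> out{1.f, 2.f, 3.f, 4.f};
  double mean = paddle::lite::compute_mean<float>(out.data(), out.size());
  // Reusing the precomputed mean avoids a second pass over the data.
  double dev = paddle::lite::compute_standard_deviation<float>(
      out.data(), out.size(), true, mean);
  std::cout << "mean=" << mean << " std=" << dev << std::endl;  // mean=2.5 std~=1.118
  return 0;
}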
...@@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast<float>(const float* dinx, ...@@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_add_grad<float>(const float* dout_grad,
float* x_grad,
int num) {
int cnt = num >> 4;
int remain = num & 0x0f;
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* x_data = x_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
vst1q_f32(x_data, din0);
vst1q_f32(x_data + 4, din1);
vst1q_f32(x_data + 8, din2);
vst1q_f32(x_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* x_data = x_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
x_data[i] = out_data[i];
}
}
}
// we assume y_data has fewer elements than x_data; otherwise, call this
// function with the x_grad and y_grad arguments swapped
template <>
void elementwise_add_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum += out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
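To make the calling convention in the comment above concrete, the sketch below mirrors the reference semantics in plain scalar C++ (illustrative only, not the NEON implementation): dX receives a copy of dOut, each dY[j] accumulates dOut over the pre and post dimensions, and when Y is actually the larger operand the same routine is called with the gradient pointers swapped.

#include <cstring>
#include <iostream>
#include <vector>

// Reference semantics of elementwise_add_grad_broadcast, with Y the smaller operand.
void add_grad_broadcast_ref(const float* dout, float* dx, float* dy,
                            int pre, int n, int post) {
  if (dx) std::memcpy(dx, dout, sizeof(float) * pre * n * post);
  if (dy) {
    std::memset(dy, 0, sizeof(float) * n);
    for (int i = 0; i < pre; ++i)
      for (int j = 0; j < n; ++j)
        for (int k = 0; k < post; ++k) dy[j] += dout[(i * n + j) * post + k];
  }
}

int main() {
  // X has shape {2, 3}, Y has shape {3}: pre = 2, n = 3, post = 1.
  std::vector<float> dout{1, 2, 3, 4, 5, 6}, dx(6), dy(3);
  add_grad_broadcast_ref(dout.data(), dx.data(), dy.data(), 2, 3, 1);
  // If X were the smaller tensor instead, pass the pointers swapped:
  // add_grad_broadcast_ref(dout.data(), dy.data(), dx.data(), pre, n, post);
  std::cout << dy[0] << " " << dy[1] << " " << dy[2] << std::endl;  // 5 7 9
  return 0;
}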
template <> template <>
void elementwise_sub<float>(const float* dinx, void elementwise_sub<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast<float>(const float* dinx, ...@@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast<float>(const float* dinx,
} }
} }
} }
// we assume the formula is x-y
template <>
void elementwise_sub_grad<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int num) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, num);
}
if (y_grad) {
int cnt = num >> 4;
int remain = num & 0x0f;
float32x4_t minus = vdupq_n_f32(-1);
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* y_data = y_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
din0 = vmulq_f32(din0, minus);
din1 = vmulq_f32(din1, minus);
din2 = vmulq_f32(din2, minus);
din3 = vmulq_f32(din3, minus);
vst1q_f32(y_data, din0);
vst1q_f32(y_data + 4, din1);
vst1q_f32(y_data + 8, din2);
vst1q_f32(y_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* y_data = y_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
y_data[i] = -out_data[i];
}
}
}
}
// we assume y_data has fewer elements than x_data; otherwise, call this
// function with the x_grad and y_grad arguments swapped
template <>
void elementwise_sub_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum -= out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
template <> template <>
void elementwise_mul<float>(const float* dinx, void elementwise_mul<float>(const float* dinx,
......
...@@ -183,6 +183,13 @@ template <typename T> ...@@ -183,6 +183,13 @@ template <typename T>
void elementwise_add_relu_broadcast( void elementwise_add_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num); const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_add_grad(const T* dout, T* dinx, int num);
template <typename T>
void elementwise_add_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T> template <typename T>
void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); void elementwise_sub(const T* dinx, const T* diny, T* dout, int num);
...@@ -197,6 +204,13 @@ template <typename T> ...@@ -197,6 +204,13 @@ template <typename T>
void elementwise_sub_relu_broadcast( void elementwise_sub_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num); const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num);
template <typename T>
void elementwise_sub_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T> template <typename T>
void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); void elementwise_mul(const T* dinx, const T* diny, T* dout, int num);
......
...@@ -983,10 +983,12 @@ void sgemv_trans(const int M, ...@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
"vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \
"vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \
"vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
"vmla.f32 q0, q4, q6 @ mul add\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vmla.f32 q1, q4, q8 @ mul add\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
/*"vmla.f32 q0, q4, q6 @ mul add\n" */ \
/*"vmla.f32 q1, q4, q8 @ mul add\n" */ \
"vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q2, q4, q10 @ mul add\n" \
"vmla.f32 q3, q4, q12 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \
"subs %[cnt], #1 @ sub loop count \n" \ "subs %[cnt], #1 @ sub loop count \n" \
......
...@@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { ...@@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
inline void save_float(float* data, const std::string& name, int len) { inline void save_float(float* data, const std::string& name, int len) {
static int counter = 0; static int counter = 0;
std::string old_string = std::to_string(counter); std::string old_string = paddle::lite::to_string(counter);
std::string new_string = std::string new_string =
std::string(3 - old_string.length(), '0') + old_string; std::string(3 - old_string.length(), '0') + old_string;
......
...@@ -351,10 +351,10 @@ class Tensor { ...@@ -351,10 +351,10 @@ class Tensor {
void printScale(std::string type) { printScale(); } void printScale(std::string type) { printScale(); }
std::string dimsFileName() { std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" + return paddle::lite::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" + paddle::lite::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" + paddle::lite::to_string(shape_->height()) + "_" +
std::to_string(shape_->width()) + ".txt"; paddle::lite::to_string(shape_->width()) + ".txt";
} }
void saveToFile() { std::string path = dimsFileName(); } void saveToFile() { std::string path = dimsFileName(); }
...@@ -374,7 +374,7 @@ class Tensor { ...@@ -374,7 +374,7 @@ class Tensor {
invalidate(); invalidate();
std::ofstream ofs; std::ofstream ofs;
static int counter = 0; static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path; std::string npath = paddle::lite::to_string(counter) + "_" + path;
counter++; counter++;
save_file_with_name(npath); save_file_with_name(npath);
} }
......
...@@ -19,8 +19,8 @@ namespace paddle { ...@@ -19,8 +19,8 @@ namespace paddle {
namespace lite { namespace lite {
namespace npu { namespace npu {
std::unique_ptr<hiai::AiModelMngerClient> Device::Build( std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
std::string& model_name, // NOLINT const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT std::vector<ge::Operator>& output_nodes // NOLINT
) { ) {
...@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build( ...@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
ir_build.ReleaseModelBuff(om_model_buf); ir_build.ReleaseModelBuff(om_model_buf);
return nullptr; return nullptr;
} }
// Create a HiAI model manager client to load the HiAI om model // Create a HiAI model manager client to load the HiAI om model
std::unique_ptr<hiai::AiModelMngerClient> model_client( std::shared_ptr<hiai::AiModelMngerClient> model_client(
new hiai::AiModelMngerClient()); new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
ir_build.ReleaseModelBuff(om_model_buf); ir_build.ReleaseModelBuff(om_model_buf);
return nullptr; return nullptr;
} }
model_name = "model_" + std::to_string(model_count_++) + ".om";
auto model_desc = std::make_shared<hiai::AiModelDescription>( auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type()); model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
......
...@@ -40,8 +40,8 @@ class Device { ...@@ -40,8 +40,8 @@ class Device {
// Build the HiAI IR graph to om model, return HiAI model manager client to // Build the HiAI IR graph to om model, return HiAI model manager client to
// load om model and run inference. // load om model and run inference.
std::unique_ptr<hiai::AiModelMngerClient> Build( std::shared_ptr<hiai::AiModelMngerClient> Build(
std::string& model_name, // NOLINT const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT ); // NOLINT
...@@ -51,7 +51,6 @@ class Device { ...@@ -51,7 +51,6 @@ class Device {
int framework_type_{0}; int framework_type_{0};
int model_type_{0}; int model_type_{0};
int device_type_{0}; int device_type_{0};
int model_count_{0};
}; };
} // namespace npu } // namespace npu
......
...@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt ...@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper)
add_dependencies(cl_wrapper opencl_clhpp) add_dependencies(cl_wrapper opencl_clhpp)
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
......
...@@ -26,6 +26,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -26,6 +26,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
__private const int dilation, __private const int dilation,
__private const int batch,
__private const int in_ch, __private const int in_ch,
__private const int in_w, __private const int in_w,
__private const int in_h, __private const int in_h,
...@@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output[5]; CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0]; output[1] = output[0];
output[2] = output[0]; output[2] = output[0];
output[3] = output[0]; output[3] = output[0];
...@@ -69,22 +71,32 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -69,22 +71,32 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output[5]; CL_DTYPE4 output[5];
output[0] = output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) { if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id)); (int2)(out_w_base_id + out_w_id1, item_h_id));
} }
if (out_w_id2 < out_w) { if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id)); (int2)(out_w_base_id + out_w_id2, item_h_id));
} }
if (out_w_id3 < out_w) { if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id)); (int2)(out_w_base_id + out_w_id3, item_h_id));
} }
if (out_w_id4 < out_w) { if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id)); (int2)(out_w_base_id + out_w_id4, item_h_id));
} }
#else #else
...@@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
int filter_w_val = ch * 3; int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) { for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, int in_h_val = select(out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < 0 || (out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h)); out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) { for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[0] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[1] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[2] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[3] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x, filter[1].x, filter[2].x, filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3 filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y, filter[1].y, filter[2].y, filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3 filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z, filter[1].z, filter[2].z, filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3 filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w, filter[1].w, filter[2].w, filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3 filter[3].w); // in_ch:3,out_ch:0-3
input[0] = input[0] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = input[1] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = input[2] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = input[3] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = input[4] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]); output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]); output[1] = mad(input[1].x, filter_trans[0], output[1]);
...@@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
output[3] = activation_type4(output[3]); output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]); output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]); output[0]);
if (out_w_id1 < out_w) { if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]); output[1]);
} }
if (out_w_id2 < out_w) { if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]); output[2]);
} }
if (out_w_id3 < out_w) { if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]); output[3]);
} }
if (out_w_id4 < out_w) { if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]); output[4]);
} }
} }
// support batch > 1
__kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <string> #include <string>
#include "glog/logging.h" #include "glog/logging.h"
#include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/string.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { ...@@ -64,7 +65,7 @@ class VXXJitCode : public JitCode {
base += "_Vec"; base += "_Vec";
} }
base += (with_relu_ ? "_Relu" : ""); base += (with_relu_ ? "_Relu" : "");
base += "_D" + std::to_string(num_); base += "_D" + paddle::lite::to_string(num_);
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { ...@@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) { } else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt"; base += "_Sqrt";
} }
base += ("_W" + std::to_string(tbl_w_)); base += ("_W" + paddle::lite::to_string(tbl_w_));
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { ...@@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode {
std::string name() const override { std::string name() const override {
std::string base = "MatMulJitCode"; std::string base = "MatMulJitCode";
base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + base = base + "_M" + paddle::lite::to_string(m_) + "_N" +
std::to_string(k_); paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_);
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { ...@@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) { } else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt"; base += "_Sqrt";
} }
base += ("_W" + std::to_string(w_)); base += ("_W" + paddle::lite::to_string(w_));
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -94,9 +94,13 @@ add_custom_command( ...@@ -94,9 +94,13 @@ add_custom_command(
OUTPUT ops.h # not a real path to the output to force it execute every time. OUTPUT ops.h # not a real path to the output to force it execute every time.
) )
# generate fake kernels for memory_optimize_tool # generate fake kernels for memory_optimize_tool
#-------------------------------opt----------------------------------------------------------------
# tricks to create header files for opt
add_custom_command( add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py
${kernels_src_list} ${kernels_src_list}
${fake_kernels_src_list}
${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/all_kernel_faked.cc
${CMAKE_BINARY_DIR}/kernel_src_map.h ${CMAKE_BINARY_DIR}/kernel_src_map.h
OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time.
...@@ -104,12 +108,12 @@ add_custom_command( ...@@ -104,12 +108,12 @@ add_custom_command(
add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(op_list_h DEPENDS ops.h)
add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(kernel_list_h DEPENDS kernels.h)
add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)
#add_custom_target(opencl_kernels_source_cc DEPENDS opencl_kernels_source.cc)
# create header file to store ops info sorted by supported platforms # create header file to store ops info sorted by supported platforms
add_custom_command( add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py
${kernels_src_list} ${kernels_src_list}
${fake_kernels_src_list}
${ops_src_list} ${ops_src_list}
${CMAKE_BINARY_DIR}/supported_kernel_op_info.h ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h
OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time.
......
...@@ -490,7 +490,7 @@ class ContextScheduler { ...@@ -490,7 +490,7 @@ class ContextScheduler {
} break; } break;
#endif #endif
default: default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
LOG(FATAL) << "unsupported target " << TargetToStr(target); LOG(FATAL) << "unsupported target " << TargetToStr(target);
#endif #endif
break; break;
......
...@@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) {
auto attr_type = op_info->GetAttrType(attr_name); auto attr_type = op_info->GetAttrType(attr_name);
switch (attr_type) { switch (attr_type) {
case AttrType::INT: case AttrType::INT:
os << ":int:" << std::to_string(op_info->GetAttr<int>(attr_name)); os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<int>(attr_name));
break; break;
case AttrType::FLOAT: case AttrType::FLOAT:
os << ":float:" << std::to_string(op_info->GetAttr<float>(attr_name)); os << ":float:"
<< paddle::lite::to_string(op_info->GetAttr<float>(attr_name));
break; break;
case AttrType::BOOLEAN: case AttrType::BOOLEAN:
os << ":int:" << std::to_string(op_info->GetAttr<bool>(attr_name)); os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<bool>(attr_name));
break; break;
case AttrType::STRING: case AttrType::STRING:
os << ":string: \"" os << ":string: \""
......
...@@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( ...@@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
// non-tensor(like tensor_array) variables will not be reused // non-tensor(like tensor_array) variables will not be reused
for (auto& node : graph->nodes()) { for (auto& node : graph->nodes()) {
if (node.IsArg() && !node.arg()->type->IsTensor()) { if (node.IsArg() && (node.arg()->type != nullptr) &&
!node.arg()->type->IsTensor()) {
invalid_var_names.insert(node.arg()->name); invalid_var_names.insert(node.arg()->name);
} }
} }
...@@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( ...@@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) { if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name); auto replace_name = reuse_table.at(name);
input_node->AsArg().name = input_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")"; replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++; node_append_idx++;
} }
} }
...@@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( ...@@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) { if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name); auto replace_name = reuse_table.at(name);
out_node->AsArg().name = out_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")"; replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++; node_append_idx++;
} }
} }
......
...@@ -85,7 +85,7 @@ class Node { ...@@ -85,7 +85,7 @@ class Node {
struct Arg { struct Arg {
std::string name; std::string name;
int id{0}; int id{0};
const Type* type{}; const Type* type{nullptr};
// Weight is a special kind of argument, it is marked as weight explicitly // Weight is a special kind of argument, it is marked as weight explicitly
// so that some weight related optimization can take place. // so that some weight related optimization can take place.
bool is_weight{false}; bool is_weight{false};
......
...@@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply( ...@@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply(
} }
if (found) { if (found) {
inst.mutable_op_info()->SetAttr("output_scale", output_scale); inst.mutable_op_info()->SetAttr("output_scale", output_scale);
} else if (op_info->HasAttr("output_scale")) {
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("output_scale");
inst.mutable_op_info()->SetAttr("output_scale", output_scale / range);
} }
if (op_info->HasAttr("output_scale")) { if (op_info->HasAttr("output_scale")) {
inst.mutable_op_info()->SetAttr("enable_int8", true); inst.mutable_op_info()->SetAttr("enable_int8", true);
......
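The fallback branch added above rescales a pre-existing output_scale attribute by the quantization range derived from bit_length. A standalone sketch of that arithmetic (the attribute values are illustrative):

#include <iostream>

int main() {
  int bit_length = 8;                       // typical int8 quantization
  int range = (1 << (bit_length - 1)) - 1;  // (1 << 7) - 1 = 127
  float raw_output_scale = 25.4f;           // illustrative attribute value
  std::cout << raw_output_scale / range << std::endl;  // stored scale: 0.2
  return 0;
}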
...@@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass {
} }
VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------"; VLOG(2) << "-------- pick summary for " << instruct.op_type()
VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " --------";
VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< " " << DataLayoutToStr(winner_place.layout) << " " << " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target); << TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():" VLOG(2) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " " << PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " " << DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target); << TargetToStr(kernel.place().target);
......
...@@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() { ...@@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() {
} else { } else {
exists_ops[op_type]++; exists_ops[op_type]++;
} }
auto op_name = op_type + std::to_string(exists_ops[op_type]); auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]);
std::string op_color = "white"; std::string op_color = "white";
if (subgraph_indices.count(node)) { if (subgraph_indices.count(node)) {
auto subgraph_idx = subgraph_indices[node]; auto subgraph_idx = subgraph_indices[node];
op_name += "_subgraph_" + std::to_string(subgraph_idx); op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx);
op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()];
} }
dot.AddNode(op_name, dot.AddNode(op_name,
...@@ -223,6 +223,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() { ...@@ -223,6 +223,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
std::vector<std::string> lines = ReadLines(config_file_path); std::vector<std::string> lines = ReadLines(config_file_path);
for (std::string line : lines) { for (std::string line : lines) {
if (line.empty()) continue;
std::vector<std::string> node_info = Split(line, ":"); std::vector<std::string> node_info = Split(line, ":");
std::string op_type = node_info.at(0); std::string op_type = node_info.at(0);
std::vector<std::string> in_vars_name; std::vector<std::string> in_vars_name;
...@@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, ...@@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc; cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph"); subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of Ops an Vars of the target // Create a new sub block desc for storing all of Ops and Vars of the target
// subgraph and sub_block_idx is set as a attribute of subgraph op, // subgraph and sub_block_idx is set as a attribute of subgraph op,
// sub_block_idx < 0 means it's a new subgraph op // sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1); int sub_block_idx = -(subgraph_idx + 1);
......
...@@ -39,7 +39,7 @@ std::vector<std::string> AddFCDesc( ...@@ -39,7 +39,7 @@ std::vector<std::string> AddFCDesc(
CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(input_var_names.size(), 1);
CHECK_EQ(wshape.size(), 2); CHECK_EQ(wshape.size(), 2);
static int id = 0; static int id = 0;
std::string prefix = "fc_" + std::to_string(id); std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* wgt = block_desc->AddVar<cpp::VarDesc>(); auto* wgt = block_desc->AddVar<cpp::VarDesc>();
...@@ -76,7 +76,7 @@ std::vector<std::string> AddElementwiseAddDesc( ...@@ -76,7 +76,7 @@ std::vector<std::string> AddElementwiseAddDesc(
const std::vector<std::string>& input_Y_names) { const std::vector<std::string>& input_Y_names) {
// CHECK_EQ(input_var_names.size(), 2); // CHECK_EQ(input_var_names.size(), 2);
static int id = 0; static int id = 0;
std::string prefix = "elementwise_add_" + std::to_string(id); std::string prefix = "elementwise_add_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
...@@ -100,7 +100,7 @@ std::vector<std::string> AddFeedDesc( ...@@ -100,7 +100,7 @@ std::vector<std::string> AddFeedDesc(
const std::vector<std::string>& input_X_names) { const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1); // CHECK_EQ(input_var_names.size(), 1);
static int id = 0; static int id = 0;
std::string prefix = "feed_" + std::to_string(id); std::string prefix = "feed_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
...@@ -123,7 +123,7 @@ std::vector<std::string> AddFetchDesc( ...@@ -123,7 +123,7 @@ std::vector<std::string> AddFetchDesc(
const std::vector<std::string>& input_X_names) { const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1); // CHECK_EQ(input_var_names.size(), 1);
static int id = 0; static int id = 0;
std::string prefix = "fetch_" + std::to_string(id); std::string prefix = "fetch_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include "lite/api/test_helper.h" #include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model");
...@@ -31,43 +32,17 @@ namespace lite { ...@@ -31,43 +32,17 @@ namespace lite {
// The helper functions for loading and running model from command line and // The helper functions for loading and running model from command line and
// verifying output data // verifying output data
std::vector<std::string> TypeParsing(std::string text) { std::vector<std::string> TypeParsing(std::string text) {
std::vector<std::string> types; return Split(text, ":");
while (!text.empty()) {
size_t index = text.find_first_of(":");
std::string type = text.substr(0, index);
VLOG(3) << type;
types.push_back(type);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
}
}
return types;
} }
std::vector<std::vector<int64_t>> ShapeParsing(std::string text) { std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes; std::vector<std::vector<int64_t>> shapes;
while (!text.empty()) { std::vector<std::string> shape_strings = Split(text, ":");
size_t index = text.find_first_of(":"); shapes.resize(shape_strings.size());
std::string slice = text.substr(0, index); for (int i = 0; i < shape_strings.size(); i++) {
std::vector<int64_t> shape; std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
while (!slice.empty()) { for (auto shape_num : shape_nums) {
size_t index = slice.find_first_of(","); shapes[i].push_back(atoi(shape_num.c_str()));
int d = atoi(slice.substr(0, index).c_str());
VLOG(3) << d;
shape.push_back(d);
if (index == std::string::npos) {
break;
} else {
slice = slice.substr(index + 1);
}
}
shapes.push_back(shape);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
} }
} }
return shapes; return shapes;
......
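A standalone sketch of what the rewritten ShapeParsing produces for a typical shape string; a local split helper is used here so the example builds without the Lite utils header:

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> SplitBy(const std::string& s, char delim) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delim)) parts.push_back(item);
  return parts;
}

int main() {
  std::string text = "1,3,224,224:1,10";  // two input shapes separated by ':'
  std::vector<std::vector<int64_t>> shapes;
  for (const auto& shape_str : SplitBy(text, ':')) {
    std::vector<int64_t> shape;
    for (const auto& num : SplitBy(shape_str, ','))
      shape.push_back(atoi(num.c_str()));
    shapes.push_back(shape);
  }
  std::cout << shapes.size() << " shapes, first has " << shapes[0].size()
            << " dims" << std::endl;  // 2 shapes, first has 4 dims
  return 0;
}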
...@@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
auto inlinks = node->inlinks; auto inlinks = node->inlinks;
VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc VLOG(4) << "============== node->AsStmt().op_type():"
<< " inlinks.size():" << inlinks.size(); << node->AsStmt().op_type() << " inlinks.size():" << inlinks.size()
<< " ================";
for (auto* in : inlinks) { for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in); ComplementInputs(graph.get(), node, in);
} }
...@@ -68,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, ...@@ -68,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name));
auto decl_arg_type = auto decl_arg_type =
inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); inst.picked_kernel().GetInputDeclType(inst_in_tensor_name);
CHECK(in->AsArg().type); CHECK(in->AsArg().type);
VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name
<< "\n in->AsArg().name:" << in->AsArg().name << "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type << "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type << "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString(); << "\n inst.op()->DebugString():" << inst.op()->DebugString();
// TODO(ysh329): conflicts when a tensor has the kARM target but the
// kImageDefault (OpenCL) layout.
// This is not an ideal check, but the source of the issue has not been
// traced from static_kernel_pick_pass to this pass yet.
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (in_arg_type->target() == TARGET(kARM) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " " << " for kernel " << inst.op()->DebugString() << " "
......
...@@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, ...@@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from,
CHECK(in->IsArg()); CHECK(in->IsArg());
// auto node_id = [&] { return graph->nodes().size(); }; // auto node_id = [&] { return graph->nodes().size(); };
auto cast_op_output_name = in->AsArg().name + "/precision_trans"; auto cast_op_output_name = in->AsArg().name + "/precision_trans";
// in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); // in->AsArg().name + "/precision_trans/" +
// paddle::lite::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type = cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
......
...@@ -65,6 +65,7 @@ class OpLite : public Registry { ...@@ -65,6 +65,7 @@ class OpLite : public Registry {
virtual bool CheckShape() const { return true; } virtual bool CheckShape() const { return true; }
// Inference the outputs' shape. // Inference the outputs' shape.
virtual bool InferShape() const { return true; } virtual bool InferShape() const { return true; }
virtual bool SmartInferShape() { return this->InferShape(); }
// Run this operator. // Run this operator.
virtual bool Run(); virtual bool Run();
// Indicate whether the Op runs only once or not // Indicate whether the Op runs only once or not
...@@ -150,6 +151,10 @@ class OpLite : public Registry { ...@@ -150,6 +151,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_input_shapes;
std::vector<DDimLite> last_output_shapes;
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods;
std::vector<std::vector<std::vector<uint64_t>>> last_input_lods;
}; };
/* /*
......
...@@ -22,18 +22,25 @@ ...@@ -22,18 +22,25 @@
#include <vector> #include <vector>
#include "lite/core/program.h" #include "lite/core/program.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/kernels/opencl/image_helper.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace profile { namespace profile {
template <typename dtype> template <typename dtype>
static void write_tensorfile(const Tensor* tensor, const std::string& locate) { static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) { if (locate.find('/') != std::string::npos) {
return; return false;
} }
FILE* fp = fopen(locate.c_str(), "w"); FILE* fp = fopen(locate.c_str(), "w");
if (fp == nullptr) { if (fp == nullptr) {
LOG(ERROR) << "file open field " << locate; LOG(ERROR) << "file open field " << locate;
return false;
} else { } else {
const dtype* data = tensor->data<dtype>(); const dtype* data = tensor->data<dtype>();
for (int i = 0; i < tensor->numel(); ++i) { for (int i = 0; i < tensor->numel(); ++i) {
...@@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { ...@@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
} }
} }
fclose(fp); fclose(fp);
return true;
} }
class PrecisionProfiler { class PrecisionProfiler {
public: public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} // TODO(ysh329): need to remove `explicit PrecisionProfiler`
~PrecisionProfiler() { // keep this method only for arm/math/conditional
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() explicit PrecisionProfiler(const Instruction* inst) {
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " " std::string inst_precison_str = GetInstPrecision(inst);
<< PrecisionToStr(inst_->kernel()->precision()); }
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype, PrecisionProfiler() {}
std::string name = "inst") -> double {
if (!in->data<int8_t>()) { std::string GetSummaryHeader() {
return -99999; using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
<< " " << setw(70) << left << "output_tensor_name:(tensor_info)"
<< " " << setw(15) << left << "dims"
<< " " << setw(15) << left << "mean"
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
return ss.str();
} }
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.; double sum = 0.;
switch (ptype) { for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
template <typename T>
double compute_average_grow_rate(const T* in, const size_t length) {
const double eps = 1e-5;
double ave_grow_rate = 0.0f;
for (size_t i = 1; i < length; ++i) {
ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
}
ave_grow_rate /= length;
return ave_grow_rate;
}
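The "ave_grow_rate" column reported in the summary is the mean relative step-to-step change over the flattened tensor, and it divides by the full length rather than length - 1. A quick standalone check with illustrative values:

#include <iostream>
#include <vector>

int main() {
  std::vector<float> in{1.f, 2.f, 4.f};
  const double eps = 1e-5;
  double rate = 0.0;
  for (size_t i = 1; i < in.size(); ++i)
    rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
  rate /= in.size();  // divides by 3, matching the helper above
  std::cout << rate << std::endl;  // ~0.6667
  return 0;
}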
// check if output tensor unused
bool is_unused(const Tensor* in) {
if (!in->data<int8_t>()) {
return true;
}
return false;
}
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
DataLayoutType layout_type,
double* mean,
double* std_dev,
double* ave_grow_rate,
std::string name = "inst",
bool write_result_to_file = false) {
std::string unsupported_error_log =
"Unsupported precision profile for kernel registered on" +
TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
DataLayoutToStr(layout_type);
if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
target_type == TARGET(kX86)) {
switch (precision_type) {
case PRECISION(kFloat): { case PRECISION(kFloat): {
auto ptr = in->data<float>(); auto ptr = in->data<float>();
// write_tensorfile<float>(in, name); *mean = compute_mean<float>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev =
sum += ptr[i]; compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
} *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
return sum / in->numel(); write_result_to_file&& write_tensorfile<float>(in, name);
return;
} }
case PRECISION(kAny): { case PRECISION(kAny): {
auto ptr = in->data<float>(); auto ptr = in->data<float>();
// write_tensorfile<float>(in, name); *mean = compute_mean<float>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev =
sum += ptr[i]; compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
} *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
return sum / in->numel(); write_result_to_file&& write_tensorfile<float>(in, name);
return;
} }
case PRECISION(kInt8): { case PRECISION(kInt8): {
auto ptr = in->data<int8_t>(); auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name); *mean = compute_mean<int8_t>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev =
sum += ptr[i]; compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
} *ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
return sum / in->numel(); write_result_to_file&& write_tensorfile<int8_t>(in, name);
return;
} }
case PRECISION(kInt32): { case PRECISION(kInt32): {
auto ptr = in->data<int32_t>(); auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name); *mean = compute_mean<int32_t>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev = compute_standard_deviation<int32_t>(
sum += ptr[i]; ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name);
return;
} }
return sum / in->numel(); default:
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
LOG(ERROR) << unsupported_error_log;
return;
}
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
<< im_h;
std::vector<uint16_t> in_data_v(im_w * im_h * 4);
std::vector<float> real_out_v(in->numel());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(in_data_v.data(),
in->data<uint16_t, cl::Image2D>(),
im_w,
im_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
in_data_v.data(), real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
*mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
*std_dev = compute_standard_deviation<float>(
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case DATALAYOUT(kNCHW): {
std::vector<float> in_data_v(in->numel(), 0);
TargetWrapperCL::MemcpySync(in_data_v.data(),
in->data<float>(),
in->numel() * sizeof(float),
IoDirection::DtoH);
VLOG(1) << name << ":" << in->numel();
*mean = compute_mean<float>(in_data_v.data(), in->numel());
*std_dev = compute_standard_deviation<float>(
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
} }
default: default:
LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); *mean = -222222222222;
return 0.; *std_dev = -22222222222;
*ave_grow_rate = -22222222222;
LOG(ERROR) << unsupported_error_log;
return;
}
#endif
} else {
*mean = -111111111111;
*std_dev = -11111111111;
*ave_grow_rate = -11111111111;
LOG(ERROR) << unsupported_error_log;
return;
}
} }
};
if (inst_->op()->op_info()->Type() != "fetch") { std::string GetInstPrecision(const Instruction* inst = nullptr) {
auto op = const_cast<lite::OpLite*>(inst_->op()); using std::setw;
auto kernel = inst_->kernel(); using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = false;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
PrecisionToStr(inst->kernel()->precision()) +
"/" + DataLayoutToStr(inst->kernel()->layout());
std::string op_name = inst->op()->op_info()->Type();
if (inst->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst->op());
auto kernel = inst->kernel();
auto op_scope = op->scope(); auto op_scope = op->scope();
auto out_names = op->op_info()->output_names(); auto out_names = op->op_info()->output_names();
for (auto& out_name : out_names) { for (auto& out_name : out_names) {
...@@ -106,32 +277,90 @@ class PrecisionProfiler { ...@@ -106,32 +277,90 @@ class PrecisionProfiler {
auto type = kernel->GetOutputDeclType(out_arg_name); auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) { if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>(); const Tensor* tout =
double mean = tensor_mean(tout, type->precision(), out_name); op_scope->FindVar(out_name)->GetMutable<Tensor>();
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() double mean = -999999;
<< ", precision: " << PrecisionToStr(type->precision()) double std_dev = -100000;
<< ", mean value: " << mean << " shape:" << tout->dims(); double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
} else if (type->IsTensorList()) { } else if (type->IsTensorList()) {
auto tout = auto touts =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>(); op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) { for (auto t : *touts) {
double mean = tensor_mean(&t, type->precision(), out_name); const Tensor* tout = &t;
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() double mean = -999999;
<< ", precision: " << PrecisionToStr(type->precision()) double std_dev = -100000;
<< ", mean value: " << mean; double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
} }
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
} }
} }
} }
} }
return ss.str();
private: }
const Instruction* inst_{nullptr};
}; };
} // namespace profile } // namespace profile
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// TODO(ysh329): need to remove.
// keep this method only for arm/math/conditional_block_compute
#define LITE_PRECISION_PROFILE(inst) \ #define LITE_PRECISION_PROFILE(inst) \
{ auto a = paddle::lite::profile::PrecisionProfiler(&inst); } { auto a = paddle::lite::profile::PrecisionProfiler(&inst); }
...@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
} }
void RuntimeProgram::Run() { void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader();
#endif
#endif
for (auto& inst : instructions_) { for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue; if (inst.is_feed_fetch_op()) continue;
...@@ -144,13 +152,17 @@ void RuntimeProgram::Run() { ...@@ -144,13 +152,17 @@ void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst) precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst);
#endif #endif
#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
} }
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
} }
...@@ -274,7 +286,8 @@ void Instruction::Run() { ...@@ -274,7 +286,8 @@ void Instruction::Run() {
return; return;
} }
op_->InferShape(); // op_->InferShape();
op_->SmartInferShape();
kernel_->Launch(); kernel_->Launch();
has_run_ = true; has_run_ = true;
} }
......
...@@ -30,9 +30,9 @@ Program FakeProgram() { ...@@ -30,9 +30,9 @@ Program FakeProgram() {
auto add_fc = [&](int id, std::string x) { auto add_fc = [&](int id, std::string x) {
// create variables // create variables
std::string w1 = "w" + std::to_string(id); std::string w1 = "w" + paddle::lite::to_string(id);
std::string b1 = "b" + std::to_string(id); std::string b1 = "b" + paddle::lite::to_string(id);
std::string out1 = "out" + std::to_string(id); std::string out1 = "out" + paddle::lite::to_string(id);
auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>(); auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>(); auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>(); auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();
......
...@@ -53,9 +53,9 @@ static std::string version() { ...@@ -53,9 +53,9 @@ static std::string version() {
static int64_t int_version(const std::string& version) { static int64_t int_version(const std::string& version) {
const std::vector<std::string> vec = Split(version, "."); const std::vector<std::string> vec = Split(version, ".");
if (vec.size() == 3) { if (vec.size() == 3) {
return std::stoi(vec[0]) * MAJOR_COEFF + return atoi(vec[0].c_str()) * MAJOR_COEFF +
std::stoi(vec[1]) * MINOR_COEFF + atoi(vec[1].c_str()) * MINOR_COEFF +
std::stoi(vec[2]) * PATCH_COEFF; atoi(vec[2].c_str()) * PATCH_COEFF;
} }
return -1; return -1;
} }
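// Worked example (coefficient values are illustrative; MAJOR_COEFF etc. are
// defined elsewhere): assuming MAJOR_COEFF = 1000000, MINOR_COEFF = 1000 and
// PATCH_COEFF = 1, int_version("2.6.1") returns 2 * 1000000 + 6 * 1000 + 1 =
// 2006001, while any string that does not split into three fields returns -1.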
......
...@@ -207,7 +207,8 @@ void RunModel(std::string det_model_file, ...@@ -207,7 +207,8 @@ void RunModel(std::string det_model_file,
cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h);
// uncomment two lines below, save roi img to disk // uncomment two lines below, save roi img to disk
// std::string roi_name = "roi_" + std::to_string(i) + ".jpg"; // std::string roi_name = "roi_" + paddle::lite::to_string(i)
// + ".jpg";
// imwrite(roi_name, roi); // imwrite(roi_name, roi);
// Do PreProcess // Do PreProcess
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <cmath>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) { ...@@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str; return shape_str;
} }
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
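// Illustrative check (hypothetical data, not from this demo): for
// out_data = {0.1f, 0.3f, 0.2f}, compute_mean returns 0.2 and
// compute_standard_deviation(out_data, 3, true, 0.2) returns
// sqrt((0.01 + 0.01 + 0.0) / 3) ~= 0.0816; these are the statistics printed
// in the output summary below.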
inline double GetCurrentUS() { inline double GetCurrentUS() {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -101,24 +128,24 @@ void RunModel(std::string model_dir, ...@@ -101,24 +128,24 @@ void RunModel(std::string model_dir,
// 5. Get output // 5. Get output
std::cout << "\n====== output summary ====== " << std::endl; std::cout << "\n====== output summary ====== " << std::endl;
size_t output_tensor_num = predictor->GetOutputNames().size(); size_t output_tensor_num = predictor->GetOutputNames().size();
std::cout << "output tesnor num:" << output_tensor_num << std::endl; std::cout << "output tensor num:" << output_tensor_num << std::endl;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor = std::unique_ptr<const paddle::lite_api::Tensor> output_tensor =
predictor->GetOutput(tidx); predictor->GetOutput(tidx);
std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; std::cout << "\n--- output tensor " << tidx << " ---" << std::endl;
auto out_shape = output_tensor->shape(); auto out_shape = output_tensor->shape();
std::cout << "out_shape(NCHW):" << ShapePrint(out_shape) << std::endl; auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, ShapeProduction(out_shape));
auto out_std_dev = compute_standard_deviation<float>(
out_data, ShapeProduction(out_shape), true, out_mean);
float sum = 0.f; std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl;
for (int i = 0; i < ShapeProduction(out_shape); ++i) {
sum += output_tensor->data<float>()[i];
}
std::cout << "output tensor " << tidx std::cout << "output tensor " << tidx
<< " elem num:" << ShapeProduction(out_shape) << std::endl; << " elem num:" << ShapeProduction(out_shape) << std::endl;
std::cout << "output tensor " << tidx << " sum value:" << sum << std::endl;
std::cout << "output tensor " << tidx std::cout << "output tensor " << tidx
<< " mean value:" << sum / ShapeProduction(out_shape) << " standard deviation:" << out_std_dev << std::endl;
std::cout << "output tensor " << tidx << " mean value:" << out_mean
<< std::endl; << std::endl;
// print output // print output
......
...@@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, ...@@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id,
switch (type) { switch (type) {
case AttrType::INT: case AttrType::INT:
return std::to_string(desc.GetAttr<int>(name)); return paddle::lite::to_string(desc.GetAttr<int>(name));
case AttrType::FLOAT: case AttrType::FLOAT:
return std::to_string(desc.GetAttr<float>(name)); return paddle::lite::to_string(desc.GetAttr<float>(name));
case AttrType::BOOLEAN: case AttrType::BOOLEAN:
return std::to_string(desc.GetAttr<bool>(name)); return paddle::lite::to_string(desc.GetAttr<bool>(name));
case AttrType::STRING: case AttrType::STRING:
return "\"" + desc.GetAttr<std::string>(name) + "\""; return "\"" + desc.GetAttr<std::string>(name) + "\"";
case AttrType::FLOATS: { case AttrType::FLOATS: {
......
...@@ -153,16 +153,16 @@ class Module { ...@@ -153,16 +153,16 @@ class Module {
private: private:
std::string WeightUniqueName() const { std::string WeightUniqueName() const {
return "w_" + std::to_string(weight_counter_++); return "w_" + paddle::lite::to_string(weight_counter_++);
} }
std::string TmpVarUniqueName() const { std::string TmpVarUniqueName() const {
return "tmp_" + std::to_string(tmp_var_counter_++); return "tmp_" + paddle::lite::to_string(tmp_var_counter_++);
} }
std::string OpUniqueName() const { std::string OpUniqueName() const {
return "op_" + std::to_string(op_counter_++); return "op_" + paddle::lite::to_string(op_counter_++);
} }
std::string KernelUniqueName() const { std::string KernelUniqueName() const {
return "kernel_" + std::to_string(kernel_counter_++); return "kernel_" + paddle::lite::to_string(kernel_counter_++);
} }
std::string DataRepr(const std::string &raw_data, PrecisionType dtype); std::string DataRepr(const std::string &raw_data, PrecisionType dtype);
......
# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered
# to the model_optimize_tool. # to the model_optimize_tool.
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)))
return() return()
endif() endif()
...@@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de ...@@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de
if(LITE_WITH_TRAIN) if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif() endif()
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/elementwise_grad_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
inline DDim trim_trailing_singular_dims(const DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
std::vector<int64_t> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim();
}
return DDim(trim_dims);
}
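// For example (hypothetical shapes): trim_trailing_singular_dims of {3, 4, 1, 1}
// yields {3, 4}, and an all-ones shape such as {1, 1} yields an empty DDim, in
// which case is_broadcast below resets axis to x_dims.size().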
inline bool is_broadcast(const DDim& x_dims,
const DDim& y_dims,
int axis,
int* pre,
int* n,
int* post) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
DDim y_dim_trim = trim_trailing_singular_dims(y_dims);
axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis;
if (x_dims.size() == y_dim_trim.size()) {
return false;
}
*pre = 1;
*n = 1;
*post = 1;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dim_trim.size(); ++i) {
CHECK_EQ(x_dims[i + axis], y_dim_trim[i])
<< "Broadcast dimension mismatch.";
(*n) *= y_dim_trim[i];
}
for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
return true;
}
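// Example decomposition (hypothetical shapes): x_dims = {2, 3, 4, 5},
// y_dims = {3, 4}, axis = 1 gives *pre = 2, *n = 3 * 4 = 12, *post = 5, i.e. y is
// broadcast over the leading "pre" and trailing "post" extents of x. When the
// trimmed y has the same rank as x the function returns false and the plain
// element-wise (non-broadcast) gradient path is taken.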
void ElementwiseAddGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_grad_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad) {
CHECK(param.YGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
return;
}
if (!param.YGrad) {
CHECK(param.XGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
return;
}
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, y_grad_data, x_grad_data, pre, n, post);
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
}
}
void ElementwiseSubGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad || !param.YGrad) {
CHECK(param.XGrad || param.YGrad);
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, y_dims.production());
return;
}
if (x_dims.size() < y_dims.size()) {
LOG(FATAL) << "elewise sub grad don't support x_dims size < y_dims size";
}
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_grad_broadcast(
out_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, x_dims.production());
}
}
template <typename T, PrecisionType PType>
void ElementwiseMulGradCompute<T, PType>::Run() {
LOG(FATAL) << "elementwise mul_grad not implement yet";
}
void ElementwiseMaxGradCompute::Run() {
LOG(FATAL) << "elementwise max_grad not implement yet";
}
void ElementwiseDivGradCompute::Run() {
LOG(FATAL) << "elementwise div_grad not implement yet";
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
using elementwise_mul_grad_float =
paddle::lite::kernels::arm::ElementwiseMulGradCompute<float,
PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(elementwise_add_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseAddGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseSubGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_div_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseDivGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
elementwise_mul_grad, kARM, kFloat, kNCHW, elementwise_mul_grad_float, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMaxGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class ElementwiseAddGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseAddGradCompute() = default;
};
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T, PrecisionType PType>
class ElementwiseMulGradCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
virtual ~ElementwiseMulGradCompute() = default;
};
class ElementwiseMaxGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseMaxGradCompute() = default;
};
class ElementwiseDivGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseDivGradCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/mul_grad_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void MulGradCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
}
void MulGradCompute::Run() {
// step1 flatten_2d
auto& param = Param<param_t>();
const auto x_dims = param.x->dims();
const auto y_dims = param.y->dims();
const auto out_dims = param.output_grad->dims();
m_ = static_cast<int>(x_dims.Slice(0, param.x_num_col_dims).production());
k_ = static_cast<int>(
x_dims.Slice(param.x_num_col_dims, x_dims.size()).production());
n_ = static_cast<int>(
y_dims.Slice(param.y_num_col_dims, y_dims.size()).production());
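// Shape bookkeeping (illustrative values): with x dims {2, 3, 4, 5} and
// x_num_col_dims = 2, x is flattened to a (m_ = 2 * 3 = 6) x (k_ = 4 * 5 = 20)
// matrix; with y dims {20, 7} and y_num_col_dims = 1, n_ = 7. The two GEMMs
// below then compute x_grad = out_grad * y^T as (6, 7) x (7, 20) -> (6, 20)
// and y_grad = x^T * out_grad as (20, 6) x (6, 7) -> (20, 7).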
const auto* out_grad_data = param.output_grad->data<float>();
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.x_grad) {
x_grad_data = param.x_grad->mutable_data<float>();
}
if (param.y_grad) {
y_grad_data = param.y_grad->mutable_data<float>();
}
paddle::lite::operators::ActivationParam act_param;
act_param.has_active = false;
// out_grad * y^T = x_grad
// (m, n), (n, k) -> (m, k)
auto& ctx = this->ctx_->template As<ARMContext>();
if (param.x_grad) {
if (m_ == 1) {
lite::arm::math::sgemv(y_data,
out_grad_data,
x_grad_data,
false,
k_, // M
n_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(false,
true, // is_transB,
m_, // M
k_, // N
n_, // K
1.0f, // alpha
out_grad_data, // A
n_, // lda
y_data, // B
n_, // ldb
0.f, // beta
x_grad_data, // C
k_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
// x^T * out_grad = y_grad
// (k, m) (m, n) -> (k, n)
if (param.y_grad) {
if (n_ == 1) {
lite::arm::math::sgemv(x_data,
out_grad_data,
y_grad_data,
true,
k_, // M
m_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(true, // is_transA
false, // is_transB,
k_, // M
n_, // N
m_, // K
1.0f, // alpha
x_data, // A
k_, // lda
out_grad_data, // B
n_, // ldb
0.f, // beta
y_grad_data, // C
n_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::MulGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class MulGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulGradParam;
void PrepareForRun() override;
void Run() override;
virtual ~MulGradCompute() = default;
private:
int m_, n_, k_;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) { ...@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) {
counter = ++(it->second); counter = ++(it->second);
} }
return prefix + "_" + std::to_string(counter); return prefix + "_" + paddle::lite::to_string(counter);
} }
bool HasInputArg(const OpInfo* op_info, bool HasInputArg(const OpInfo* op_info,
......
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA)) if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_CUDA))
return() return()
endif() endif()
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA)) if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
return() return()
endif() endif()
......
...@@ -87,7 +87,8 @@ class Graph { ...@@ -87,7 +87,8 @@ class Graph {
auto idx = Add(name, node); auto idx = Add(name, node);
CHECK_GE(idx, 1); CHECK_GE(idx, 1);
// Generate a unique name for the created HiAI IR // Generate a unique name for the created HiAI IR
node->set_data(std::make_shared<T>(name + "__" + std::to_string(idx))); node->set_data(
std::make_shared<T>(name + "__" + paddle::lite::to_string(idx)));
return node; return node;
} }
......
...@@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
split_op->create_dynamic_output_y(out_names.size()); split_op->create_dynamic_output_y(out_names.size());
int idx = 1; int idx = 1;
for (auto& out_name : out_names) { for (auto& out_name : out_names) {
auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); auto zero_node =
graph->Add(out_name + "/zero" + paddle::lite::to_string(idx), 0);
auto add_node = graph->Add<ge::op::Add>(out_name); auto add_node = graph->Add<ge::op::Add>(out_name);
auto add_op = add_node->data<ge::op::Add>(); auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); add_op->set_input_x1(*split_node->data(),
"y" + paddle::lite::to_string(idx));
add_op->set_input_x2(*zero_node->data()); add_op->set_input_x2(*zero_node->data());
idx++; idx++;
} }
......
...@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() {
<< "[NPU] No input nodes found for building NPU model"; << "[NPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty()) CHECK(!device_onames_.empty())
<< "[NPU] No output nodes found for building NPU model"; << "[NPU] No output nodes found for building NPU model";
// Build the HiAI IR graph to HiAI om model as the device program // Build the HiAI IR graph to HiAI om model as the device program
device_program_ = lite::npu::Device::Global().Build( if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
auto device_client = lite::npu::Device::Global().Build(
model_name_, device_inodes, device_onodes); model_name_, device_inodes, device_onodes);
if (device_program_ == nullptr) { if (device_client == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!"; LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED; return subgraph::FAILED;
} }
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
// Query and check the dimensions of valid input and output tensors // Query and check the dimensions of valid input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims; std::vector<hiai::TensorDimension> device_idims, device_odims;
if (device_program_->GetModelIOTensorDim( if (device_program->client->GetModelIOTensorDim(
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
LOG(WARNING) LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!"; << "[NPU] Get the dimensions of input and output tensors failed!";
return subgraph::FAILED; return subgraph::FAILED;
} }
device_program->device_idims = device_idims;
device_program->device_odims = device_odims;
CHECK_EQ(device_idims.size(), device_inames_.size()); CHECK_EQ(device_idims.size(), device_inames_.size());
CHECK_EQ(device_odims.size(), device_onames_.size()); CHECK_EQ(device_odims.size(), device_onames_.size());
origin_idims_.resize(device_inames_.size()); origin_idims_.resize(device_inames_.size());
...@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() {
origin_odims_.resize(device_onames_.size()); origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size()); origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) { for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]); auto node = graph.Get(device_inames_[i]);
auto precision = node->precision(); auto precision = node->precision();
...@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() {
device_itensors_[i].reset(new hiai::AiTensor); device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i])); device_itensors_[i]->Init(&(device_idims[i]));
} }
device_program->origin_idims = origin_idims_;
for (int i = 0; i < device_onames_.size(); i++) { for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]); auto node = graph.Get(device_onames_[i]);
auto precision = node->precision(); auto precision = node->precision();
...@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() {
<< PrecisionToStr(precision); << PrecisionToStr(precision);
break; break;
} }
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(), CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() * device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth()); device_odims[i].GetHeight() * device_odims[i].GetWidth());
...@@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() {
int SubgraphEngine::LaunchDeviceProgram() { int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HiAI tensors // Copy the data of origin input tensors to the buffer of input HiAI tensors
// init device_itensors_, device_otensors_, origin_otensors_
auto device_program = device_program_map_[inputs_shape_];
for (size_t i = 0; i < device_itensors_.size(); i++) { for (size_t i = 0; i < device_itensors_.size(); i++) {
device_itensors_[i]->Init(&(device_program->device_idims[i]));
std::memcpy(device_itensors_[i]->GetBuffer(), std::memcpy(device_itensors_[i]->GetBuffer(),
origin_itensors_[i]->raw_data(), origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size()); origin_itensors_[i]->memory_size());
} }
for (size_t i = 0; i < device_otensors_.size(); i++) {
device_otensors_[i]->Init(&(device_program->device_odims[i]));
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
origin_otensors_[i]->Resize(device_program->origin_odims[i]);
}
// Run the HiAI model by name // Run the HiAI model by name
std::string key = "model_name"; // Note: key seems must be model_name std::string key = "model_name"; // Note: key seems must be model_name
model_context_.AddPara(key, model_name_); hiai::AiContext model_context;
model_context.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double { auto GetCurrentUS = []() -> double {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
}; };
int istamp; int istamp;
auto start_time = GetCurrentUS(); auto start_time = GetCurrentUS();
CHECK_EQ( CHECK_EQ(device_program->client->Process(
device_program_->Process( model_context, device_itensors_, device_otensors_, 1000, istamp),
model_context_, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS); hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output HiAI tensor to the buffer of origin output tensors // Copy the data of output HiAI tensor to the buffer of origin output tensors
for (size_t i = 0; i < device_otensors_.size(); i++) { for (size_t i = 0; i < device_otensors_.size(); i++) {
std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()), std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
...@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() {
return 0; return 0;
} }
bool SubgraphEngine::InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
for (auto origin_itensor : origin_itensors_) {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
inputs_shape_ = new_shape;
if (device_program_map_.count(inputs_shape_) > 0) {
return false;
}
return true;
}
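// Note on the shape-keyed cache (example shapes are hypothetical): the vector of
// input shapes, e.g. {{1, 3, 224, 224}, {1, 1}} for a two-input subgraph, is used
// as the key of device_program_map_. If a HiAI om model has already been built for
// exactly these shapes, InputShapeChanged() returns false and BuildDeviceProgram()
// returns early, so the cached client, origin_odims and device_idims/odims are
// reused by LaunchDeviceProgram().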
void SubgraphCompute::PrepareForRun() { void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(), engine_.reset(new SubgraphEngine(ctx_.get(),
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine {
: subgraph::Engine( : subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {} ctx, block_idx, block_desc, input_names, output_names, scope) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
: client(_client) {}
std::shared_ptr<hiai::AiModelMngerClient> client{nullptr};
std::vector<DDim> origin_idims{};
std::vector<DDim> origin_odims{};
std::vector<hiai::TensorDimension> device_idims{};
std::vector<hiai::TensorDimension> device_odims{};
};
protected: protected:
int BuildDeviceProgram() override; int BuildDeviceProgram() override;
int LaunchDeviceProgram() override; int LaunchDeviceProgram() override;
bool InputShapeChanged() override;
std::string model_name_; std::string model_name_{"model.om"};
hiai::AiContext model_context_; std::vector<std::vector<int64_t>> inputs_shape_{};
std::vector<std::string> device_inames_; std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
std::vector<std::string> device_onames_; device_program_map_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_; std::vector<std::string> device_inames_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_; std::vector<std::string> device_onames_{};
std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr}; std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
}; };
class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> { class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> {
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_OPENCL))
return () return ()
endif() endif()
...@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten ...@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc #lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context) # DEPS conv_opencl op_registry program context)
#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc
# DEPS tensor cl_context cl_wrapper cl_target_wrapper)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc #lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context) # DEPS depthwise_conv2d_opencl op_registry program context)
......
...@@ -101,6 +101,7 @@ class ActivationComputeImageDefault ...@@ -101,6 +101,7 @@ class ActivationComputeImageDefault
status = kernel.setArg(++arg_idx, scale_); status = kernel.setArg(++arg_idx, scale_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
...@@ -112,6 +113,7 @@ class ActivationComputeImageDefault ...@@ -112,6 +113,7 @@ class ActivationComputeImageDefault
VLOG(4) << "threshold:" << threshold_; VLOG(4) << "threshold:" << threshold_;
VLOG(4) << "scale:" << scale_; VLOG(4) << "scale:" << scale_;
VLOG(4) << "kernel func name:" << kernel_func_name_; VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
...@@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL( ...@@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL(
// exp // exp
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
exp_act, exp,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
...@@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL( ...@@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL(
// tanh // tanh
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
tanh_act, tanh,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
......
...@@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) { ...@@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) {
func_name = "sigmoid"; func_name = "sigmoid";
break; break;
case 6: // tanh case 6: // tanh
func_name = "tanh_act"; func_name = "tanh";
break; break;
case 7: // tanh case 7: // tanh
func_name = "swish"; func_name = "swish";
break; break;
case 8: // tanh case 8: // tanh
func_name = "exp_act"; func_name = "exp";
break; break;
} }
LOG(INFO) << "func_name: " << func_name; LOG(INFO) << "func_name: " << func_name;
...@@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); ...@@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
// exp // exp
USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(exp, kOpenCL, kFP16, kImageDefault, ImageDefault);
// swish // swish
USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); ...@@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// tanh act // tanh act
USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(tanh, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu image2d fp16 // relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
......
...@@ -77,17 +77,21 @@ class BilinearInterpImageCompute ...@@ -77,17 +77,21 @@ class BilinearInterpImageCompute
int out_h = out_dims[2]; int out_h = out_dims[2];
int out_w = out_dims[3]; int out_w = out_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims; VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>(); auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>( auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "x_image: " << x_img;
// VLOG(4) << "out_image: " << out_img; // VLOG(4) << "out_image: " << out_img;
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
...@@ -96,6 +100,7 @@ class BilinearInterpImageCompute ...@@ -96,6 +100,7 @@ class BilinearInterpImageCompute
<< ", align_delta: " << align_delta; << ", align_delta: " << align_delta;
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w; VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
#endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
...@@ -107,8 +112,10 @@ class BilinearInterpImageCompute ...@@ -107,8 +112,10 @@ class BilinearInterpImageCompute
DDim(std::vector<DDim::value_type>{ DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]), static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])})); static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", " VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2]; << default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img); cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img); status = kernel.setArg(arg_idx++, *out_img);
...@@ -142,9 +149,10 @@ class BilinearInterpImageCompute ...@@ -142,9 +149,10 @@ class BilinearInterpImageCompute
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_); context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2]; << global_work_size[1] << " " << global_work_size[2];
#endif
} }
protected: protected:
......
...@@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
int arg_idx = 0; int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1]; int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
VLOG(4) << "concat 输入尺寸: "; #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "concat input shape: ";
for (size_t i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
VLOG(4) << "inputs [" << i << "]" VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:" << "[" << inputs[i]->dims().size() << "D]:"
...@@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
<< inputs[i]->dims()[3]; << inputs[i]->dims()[3];
} }
VLOG(4) << "concat 输出尺寸: "; VLOG(4) << "concat output shape: ";
VLOG(4) << " out dims: " VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3]; << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "axis_: " << axis_; VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_; VLOG(4) << "flag_: " << flag_;
#endif
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]), cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
...@@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
x_dims[x_dims.size() - 1]), x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])}; static_cast<cl::size_type>(image_shape["height"])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.output->target()); VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"]; << image_shape["height"];
...@@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " " VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " " << (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]); << (image_shape["height"]);
#endif
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1]; int out_w = x_dims[x_dims.size() - 1];
...@@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
image_shape = InitImageDimInfoWith(in_dims); image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>(); auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1]; int in_w = in_dims[in_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"]; << image_shape["height"];
#endif
global_work_size = global_work_size =
cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]), cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] / static_cast<cl::size_type>(image_shape["width"] /
......
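For reference, the 3-D global work size used by the concat kernel above can be derived from the NCHW shape and the image2d layout. The plain-C++ sketch below assumes the default image converter maps NCHW to width = W * ceil(C/4) and height = N * H; the helper name is illustrative, not the library API.

    // Sketch (no OpenCL dependency): deriving the concat kernel's work size.
    #include <cstdint>
    #include <iostream>

    struct ImageShape { int64_t width, height; };

    ImageShape DefaultImageShape(int64_t n, int64_t c, int64_t h, int64_t w) {
      return { w * ((c + 3) / 4), n * h };  // 4 channels packed per RGBA texel
    }

    int main() {
      const int64_t n = 1, c = 6, h = 32, w = 32;
      const ImageShape img = DefaultImageShape(n, c, h, w);
      // Mirrors NDRange{ W, image_width / W, image_height } from the patch.
      const int64_t gws[3] = { w, img.width / w, img.height };
      std::cout << gws[0] << " " << gws[1] << " " << gws[2] << std::endl;
      return 0;
    }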
...@@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void Run() override; void Run() override;
private: private:
void Conv2d1x1(); void Conv2d1x1opt();
void Conv2d3x3(); void Conv2d3x3();
void Conv2d3x3opt(); void Conv2d3x3opt();
void Conv2d5x5(); void Conv2d5x5();
void Conv2d5x5opt();
void Conv2d7x7(); void Conv2d7x7();
void Conv2d7x7opt();
void DepthwiseConv2d3x3s1(); void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3(); void DepthwiseConv2d3x3();
void DepthwiseConv2d(); void DepthwiseConv2d();
......
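The header above renames Conv2d1x1 to Conv2d1x1opt and adds Conv2d5x5opt / Conv2d7x7opt. The diff does not show how these private methods are chosen, but a dispatch keyed on filter size is the likely shape of it. The sketch below is hypothetical except for the method names taken from the header.

    // Hypothetical selection of the per-filter-size kernels; not the actual
    // ConvImageCompute implementation, just a trimmed stand-in.
    #include <functional>
    #include <stdexcept>

    class ConvImageCompute {
     public:
      void PrepareForRun(int kh, int kw, bool depthwise) {
        if (depthwise) {
          impl_ = [this] { DepthwiseConv2d(); };
        } else if (kh == 1 && kw == 1) {
          impl_ = [this] { Conv2d1x1opt(); };
        } else if (kh == 3 && kw == 3) {
          impl_ = [this] { Conv2d3x3opt(); };
        } else if (kh == 5 && kw == 5) {
          impl_ = [this] { Conv2d5x5opt(); };
        } else if (kh == 7 && kw == 7) {
          impl_ = [this] { Conv2d7x7opt(); };
        } else {
          throw std::runtime_error("unsupported filter size");
        }
      }
      void Run() { impl_(); }

     private:
      // Empty bodies stand in for the real OpenCL launches.
      void Conv2d1x1opt() {}
      void Conv2d3x3opt() {}
      void Conv2d5x5opt() {}
      void Conv2d7x7opt() {}
      void DepthwiseConv2d() {}
      std::function<void()> impl_;
    };

    int main() {
      ConvImageCompute conv;
      conv.PrepareForRun(3, 3, /*depthwise=*/false);
      conv.Run();
      return 0;
    }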
...@@ -510,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -510,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) {
const int dilation = 1; const int dilation = 1;
const int stride = 2; const int stride = 2;
const int group = 1; const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) { for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih; int iw = ih;
...@@ -532,7 +532,7 @@ const int stride = 2; ...@@ -532,7 +532,7 @@ const int stride = 2;
#else // big scale with group #else // big scale with group
const int stride = 1; const int stride = 1;
const int group = 32 / 1; const int group = 32 / 1;
const int batch_size = 1; const int batch_size = 2;
const int ic = 32 / 1; const int ic = 32 / 1;
const int ih = 112 / 1; const int ih = 112 / 1;
const int iw = 112 / 1; const int iw = 112 / 1;
...@@ -558,7 +558,8 @@ const int stride = 2; ...@@ -558,7 +558,8 @@ const int stride = 2;
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1"; // CHECK(batch_size == 1) << "conv3x3 only supprt
// batch_size == 1";
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel"; SHADOW_LOG << "created conv2d kernel";
...@@ -886,13 +887,14 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -886,13 +887,14 @@ TEST(conv2d, compute_image2d_5x5) {
// int loop_cnt = 0; // int loop_cnt = 0;
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) { for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc for (int oc = 1; oc < 5; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih for (int ih = 5; ih < 8; ih += 1) { // ih
int iw = ih; int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic for (int ic = 2; ic < 6; ic += 1) { // ic
for (bool bias_flag : {true, false}) { for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) { for (std::string relu_flag : {""
"relu"}) {
#else #else
const int batch_size = 2; const int batch_size = 2;
const int oc = 1; const int oc = 1;
...@@ -1006,10 +1008,10 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1006,10 +1008,10 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) { for (auto& i : input_v) {
i = 0.01 * gen(engine); i = 0.5 * gen(engine);
} }
for (auto& f : filter_v) { for (auto& f : filter_v) {
f = 0.01 * gen(engine); f = 0.5 * gen(engine);
} }
SHADOW_LOG << "after gen input and filter ..."; SHADOW_LOG << "after gen input and filter ...";
...@@ -1216,9 +1218,10 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1216,9 +1218,10 @@ TEST(conv2d, compute_image2d_5x5) {
#undef LOOP_TEST #undef LOOP_TEST
#undef PRINT_RESULT #undef PRINT_RESULT
#endif #endif
#ifdef TEST_CONV_IMAGE_7x7 #ifdef TEST_CONV_IMAGE_7x7
#undef FP16_ABS_DIFF // #undef FP16_ABS_DIFF
#define FP16_ABS_DIFF (1e0) // #define FP16_ABS_DIFF (1e-1)
// #define LOOP_TEST // #define LOOP_TEST
TEST(conv2d, compute_image2d_7x7) { TEST(conv2d, compute_image2d_7x7) {
// conv infos // conv infos
...@@ -1230,13 +1233,13 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1230,13 +1233,13 @@ TEST(conv2d, compute_image2d_7x7) {
// int loop_cnt = 0; // int loop_cnt = 0;
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) { for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc for (int oc = 1; oc < 6; oc += 1) { // oc
for (int ih = 7; ih < 15; ih += 1) { // ih for (int ih = 7; ih < 8; ih += 1) { // ih
int iw = ih; int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic for (int ic = 2; ic < 4; ic += 1) { // ic
for (bool bias_flag : {true, false}) { for (bool bias_flag : {false, true}) {
for (std::string relu_flag : {"relu"}) { for (std::string relu_flag : {"", "relu"}) {
#else #else
const int batch_size = 2; const int batch_size = 2;
const int oc = 1; const int oc = 1;
...@@ -1343,14 +1346,16 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1343,14 +1346,16 @@ TEST(conv2d, compute_image2d_7x7) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) { for (auto& i : input_v) {
i = gen(engine); i = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1 #ifdef TEST_CONV_IMAGE_ALL_1
i = 1; i = 1;
#endif #endif
} }
int fiii = 1;
for (auto& f : filter_v) { for (auto& f : filter_v) {
f = gen(engine); f = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1 #ifdef TEST_CONV_IMAGE_ALL_1
// f = fiii++;
f = 1; f = 1;
#endif #endif
} }
...@@ -1424,7 +1429,8 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1424,7 +1429,8 @@ TEST(conv2d, compute_image2d_7x7) {
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim); filter_dim);
// auto* filter_image2d = filter.mutable_data<float, // auto* filter_image2d =
// filter.mutable_data < float,
// cl::Image2D>( // cl::Image2D>(
// filter_image_width, // filter_image_width,
// filter_image_height, // filter_image_height,
......
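The test changes above widen the batch_size loops, scale the random inputs and filters (e.g. 0.1 * gen(engine)), and stop overriding FP16_ABS_DIFF; all of this keeps FP16 accumulation error inside the comparison tolerance. Below is a sketch of the kind of absolute-difference check such tests rely on; the helper and the sample values are illustrative only.

    // Illustrative element-wise tolerance check, EXPECT_NEAR-style.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static bool AllClose(const std::vector<float>& out,
                         const std::vector<float>& ref,
                         float abs_diff) {
      for (size_t i = 0; i < out.size(); ++i) {
        if (std::fabs(out[i] - ref[i]) > abs_diff) {
          std::printf("mismatch at %zu: %f vs %f\n", i, out[i], ref[i]);
          return false;
        }
      }
      return true;
    }

    int main() {
      const std::vector<float> gpu = {0.50f, 1.01f};
      const std::vector<float> cpu = {0.50f, 1.00f};
      // Smaller random inputs keep FP16 accumulation error under a tolerance
      // in this ballpark.
      const float kFp16AbsDiff = 1e-1f;
      return AllClose(gpu, cpu, kFp16AbsDiff) ? 0 : 1;
    }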
...@@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() { ...@@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() {
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(ele_param_->X->target()); VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target()); VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target()); VLOG(4) << TargetToStr(ele_param_->Out->target());
#endif
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() { ...@@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() {
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) { for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i]; num_ *= x_dims[i];
} }
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "axis: " << axis; VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_; VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_; VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_; VLOG(4) << "num: " << num_;
#endif
} }
} // namespace opencl } // namespace opencl
......
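UpdateParams above logs axis, batch, channels and num; these come from flattening X to [batch, channels, num] so that a smaller Y can be broadcast along the middle dimension, exactly as the loop over x_dims in the diff computes num_. A plain-C++ sketch of that decomposition follows; the helper is illustrative, not the library API.

    // Broadcast decomposition for elementwise add: X viewed as
    // [batch, channels, num], Y of size `channels` broadcast over the middle.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct BroadcastShape { int64_t batch = 1, channels = 1, num = 1; };

    BroadcastShape Decompose(const std::vector<int64_t>& x_dims,
                             const std::vector<int64_t>& y_dims, int axis) {
      if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
      BroadcastShape s;
      for (int i = 0; i < axis; ++i) s.batch *= x_dims[i];
      for (size_t i = 0; i < y_dims.size(); ++i) s.channels *= y_dims[i];
      for (size_t i = y_dims.size() + axis; i < x_dims.size(); ++i) s.num *= x_dims[i];
      return s;
    }

    int main() {
      const BroadcastShape s = Decompose({2, 3, 4, 5}, {3}, 1);
      std::cout << "batch: " << s.batch << " channels: " << s.channels
                << " num: " << s.num << std::endl;  // 2, 3, 20
      return 0;
    }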
...@@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() {
auto* out = ele_param_->Out; auto* out = ele_param_->Out;
auto axis = ele_param_->axis; auto axis = ele_param_->axis;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
...@@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis; VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
...@@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() { ...@@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() {
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0], auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]); out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1]; << out_img_shape[1];
#endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
...@@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() { ...@@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() {
} else if (y_dims.size() == 1) { } else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1]; int tensor_w = x->dims()[x->dims().size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w; VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img); cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img); status = kernel.setArg(++arg_idx, *y_img);
...@@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() { ...@@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width), auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)}; static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
......
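In the image path above, a 1-D Y is only broadcast when axis points at the width axis (rank - 1) or the channel axis (rank - 3); in the default image2d layout both of these vary along the image width, which is presumably why only these two cases pass tensor_w to the kernel. A small illustrative check of that condition, under those assumptions:

    // Illustrative validity check for 1-D broadcast in the image kernel path.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    bool ImageBroadcastSupported(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims, int axis) {
      if (y_dims.size() != 1) return false;
      const int rank = static_cast<int>(x_dims.size());
      if (axis < 0) axis = rank - 1;  // 1-D Y aligned to the trailing dim
      return axis == rank - 1 || axis == rank - 3;
    }

    int main() {
      std::cout << ImageBroadcastSupported({1, 32, 16, 16}, {32}, 1) << "\n";  // 1 (channel)
      std::cout << ImageBroadcastSupported({1, 32, 16, 16}, {16}, 3) << "\n";  // 1 (width)
      std::cout << ImageBroadcastSupported({1, 32, 16, 16}, {16}, 2) << "\n";  // 0
      return 0;
    }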
...@@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_dims = out->dims(); auto out_dims = out->dims();
auto in_dims = x->dims(); auto in_dims = x->dims();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims; VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>(); auto* x_img = x->data<half_t, cl::Image2D>();
...@@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>( auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img; // VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
#endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
...@@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{ DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]), static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])})); static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", " VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2]; << default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img); cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img); status = kernel.setArg(arg_idx++, *grid_img);
...@@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_); context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2]; << global_work_size[1] << " " << global_work_size[2];
#endif
} }
protected: protected:
......
(29 additional file diffs in this commit are collapsed and not shown.)