Commit f4e27ad1 authored by J jackzhang235

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
build*
......@@ -57,22 +57,20 @@ function(check_linker_flag)
endforeach()
set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
if(LITE_WITH_OPENCL)
if(ARM_TARGET_LANG STREQUAL "clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
endif()
if(LITE_WITH_OPENMP)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND OR OpenMP_CXX_FOUND)
......
......@@ -285,6 +285,11 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# file to record faked kernels for opt python lib
set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt")
file(WRITE ${fake_kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
......@@ -313,56 +318,65 @@ function(add_kernel TARGET device level)
return()
endif()
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
# the source list is collected so that model_optimize_tool can generate fake kernels.
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
# when compiling the model_optimize_tool, a source file with all the fake kernel definitions is generated,
# so there is no need to continue compiling the real kernel sources.
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
return()
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
......@@ -375,6 +389,9 @@ function(add_kernel TARGET device level)
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
......@@ -382,6 +399,9 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "CUDA")
if (NOT LITE_WITH_CUDA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "")
......
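For orientation, a hypothetical `add_kernel` call is sketched below (the kernel name, source file and deps are illustrative, not taken from this commit): when the matching `LITE_WITH_*` option is OFF, or when building the model optimize tool, the listed sources are only appended to `fake_kernels_src_list.txt` / `kernels_src_list.txt` and no real library target is created.

```cmake
# Hypothetical registration of an ARM kernel (names are placeholders).
# With LITE_WITH_ARM=ON the target is appended to the cached arm_kernels list;
# with LITE_WITH_ARM=OFF its sources only go into fake_kernels_src_list.txt.
add_kernel(relu_compute_arm ARM basic SRCS relu_compute.cc DEPS ${lite_kernel_deps})
```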
......@@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
> Performance numbers of the test models differ across phones and library versions.
```shell
run benchmark armv7
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 159.8427 ms
-- mobilenet_v1 avg = 235.0072 ms
-- mobilenet_v2 avg = 173.0387 ms
-- shufflenet_v2 avg = 76.0040 ms
-- squeezenet_v11 avg = 164.2957 ms
mnasnet min = 19.83500 max = 19.38500 average = 19.65503
mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983
mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623
shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890
squeezenet min = 17.67400 max = 17.47900 average = 17.57677
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 83.1287 ms
-- mobilenet_v1 avg = 121.6029 ms
-- mobilenet_v2 avg = 86.6175 ms
-- shufflenet_v2 avg = 41.5761 ms
-- squeezenet_v11 avg = 87.8678 ms
mnasnet min = 11.85600 max = 11.72000 average = 11.77127
mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593
mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450
shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400
squeezenet min = 12.07100 max = 11.33400 average = 11.41253
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 73.3880 ms
-- mobilenet_v1 avg = 119.0739 ms
-- mobilenet_v2 avg = 85.3050 ms
-- shufflenet_v2 avg = 38.0762 ms
-- squeezenet_v11 avg = 64.2201 ms
mnasnet min = 7.19300 max = 7.02600 average = 7.08480
mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267
mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707
shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477
squeezenet min = 8.60000 max = 7.85200 average = 7.98407
--------------------------------------
run benchmark armv8
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 165.3073 ms
-- mobilenet_v1 avg = 306.0188 ms
-- mobilenet_v2 avg = 195.1884 ms
-- shufflenet_v2 avg = 99.3692 ms
-- squeezenet_v11 avg = 156.6971 ms
mnasnet min = 20.98300 max = 20.81400 average = 20.92527
mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490
mobilenetv2 min = 25.91400 max = 25.61700 average = 25.73097
shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757
squeezenet min = 19.31800 max = 19.20000 average = 19.26530
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 90.2290 ms
-- mobilenet_v1 avg = 157.0007 ms
-- mobilenet_v2 avg = 118.1607 ms
-- shufflenet_v2 avg = 68.6804 ms
-- squeezenet_v11 avg = 91.3090 ms
mnasnet min = 12.59900 max = 12.46600 average = 12.52207
mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897
mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843
shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863
squeezenet min = 12.87900 max = 12.12900 average = 12.22530
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 179.9730 ms
-- mobilenet_v1 avg = 204.0684 ms
-- mobilenet_v2 avg = 181.6486 ms
-- shufflenet_v2 avg = 123.2728 ms
-- squeezenet_v11 avg = 412.9046 ms
mnasnet min = 7.31400 max = 7.12900 average = 7.20357
mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383
mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907
shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360
squeezenet min = 8.27000 max = 8.10600 average = 8.19000
--------------------------------------
```
......@@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=npu,arm \
--prefer_int8_kernel=(true|false) \
--record_tailoring_info =(true|false)
```
- The model produced by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually generate an NPU HiAI model. Only at execution time are the marked Paddle operators converted into HiAI IR, and the HiAI model is finally generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation.
......
......@@ -65,9 +65,11 @@ rm ./lite/api/paddle_use_ops.h
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
build_test_arm_opencl
build_opencl
```
Note: to debug a cl kernel, assuming the build script above has already been run (the cmake files have been generated), you only need to edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`. After saving it, run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; this command re-embeds the modified kernels automatically. Then switch to the build directory and run `make publish_inference` (or the executable name of the unit test you want to build); the contents of the cl kernel files are packed into the build artifacts, such as the .so or the corresponding unit-test executable, automatically during compilation.
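Put together, the loop described above is roughly the following (the build directory name matches the artifact section below; the single-test target is a placeholder):

```shell
# 1. edit a kernel under ./lite/backends/opencl/cl_kernel/, then re-embed it:
python ./lite/tools/cmake_tools/gen_opencl_code.py \
    ./lite/backends/opencl/cl_kernel \
    ./lite/backends/opencl/opencl_kernels_source.cc
# 2. rebuild from the existing build directory:
cd build.lite.android.armv8.gcc.opencl
make publish_inference   # or the executable name of the unit test you want
```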
### Build artifacts
The build artifacts are located in the `inference_lite_lib.android.armv8.opencl` folder under `build.lite.android.armv8.gcc.opencl`; only the key artifacts are listed here:
......
......@@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编
For example:
```bash
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish
```
**Note**: `../mobilenet_v1NB` in the commands above is the output path of the converted model obtained in step 1.
......@@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编
#include <stdio.h>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
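With the tailored library, a demo program only needs `paddle_api.h`. A minimal sketch is shown below; the model path is a placeholder and the exact `MobileConfig` setter used here (`set_model_from_file`) should be checked against the Paddle-Lite version you build:

```cpp
#include <iostream>
#include "paddle_api.h"                    // NOLINT
using namespace paddle::lite_api;          // NOLINT

int main() {
  MobileConfig config;
  config.set_model_from_file("./mobilenet_v1.nb");  // optimized .nb model (placeholder path)
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);

  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 3 * 224 * 224; ++i) data[i] = 1.0f;  // dummy input

  predictor->Run();
  auto output = predictor->GetOutput(0);
  std::cout << "output[0] = " << output->data<float>()[0] << std::endl;
  return 0;
}
```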
......@@ -182,4 +179,4 @@ int main(int argc, char** argv) {
1. The model set **must** contain only combined-param models or only non-combined-param models.
2. When non-combined-param models are used, the topology file must be named `__model__`; when combined-param models are used, the topology and parameter file names must be identical across the models in the set and are specified by `--model_filename` and `--param_filename` respectively (see the example layout after this list).
3. The model set **must** contain only INT8 quantized models or only non-INT8 models.
4. model_optimize_tool must be built from the latest Paddle-Lite code (after release/v2.1.0).
4. The model optimization tool must be built from Paddle-Lite code newer than `release/v2.1.0`.
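A directory layout satisfying the constraints above could look like the following for a non-combined-param model set (directory and parameter names are placeholders):

```
model_set_dir/
  ├── model_A/
  │     ├── __model__                    # topology file, must be named __model__
  │     └── conv1_0.w_0, fc_0.b_0, ...   # one file per parameter
  └── model_B/
        ├── __model__
        └── ...
```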
......@@ -83,7 +83,6 @@ PaddlePaddle模型有两种保存格式:
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86|npu|xpu) \
--prefer_int8_kernel=(true|false) \
--record_tailoring_info =(true|false)
```
......@@ -95,12 +94,12 @@ PaddlePaddle模型有两种保存格式:
| --optimize_out_type | Output model type. Two types are currently supported: protobuf and naive_buffer, where naive_buffer is a more lightweight serialization/deserialization implementation. If you need to run inference on mobile, set this option to naive_buffer. Default: protobuf. |
| --optimize_out | Output path of the optimized model. |
| --valid_targets | The backends the model can run on; default is arm. Currently x86, arm, opencl, npu and xpu are supported, and several backends can be specified at the same time (separated by spaces); the Model Optimize Tool then picks the best implementation automatically. To support the Huawei NPU (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set it to npu, arm. |
| --prefer_int8_kernel | If the model to be optimized is an int8 quantized model (e.g. obtained by quantization-aware training), set this option to true to accelerate inference with int8 kernels; default is false. |
| --record_tailoring_info | When using the [tailor the library by model](./library_tailoring.html) feature, set this option to true to record the kernels and OPs contained in the optimized model; default is false. |
* If the fluid model to be optimized is in non-combined form, set `--model_dir` and ignore `--model_file` and `--param_file`.
* If the fluid model to be optimized is in combined form, set `--model_file` and `--param_file` and ignore `--model_dir`.
* The optimized model is a single file whose name ends with `.nb`.
* The `prefer_int8_kernel` input argument has been removed; `opt` now detects automatically whether the model is quantized and applies the corresponding optimizations.
### Feature 2: report the operators in a model and check whether they are supported
......
......@@ -245,7 +245,6 @@ python compress.py \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
--valid_targets=arm \
--prefer_int8_kernel=true
```
As mentioned above, after quantization-aware training the parameters of the model in the float directory fall within the int8 range, but their data type is still float32, so at this point there is no real parameter compression. However, after optimization with the model\_optimize\_tool the corresponding quantized parameters are re-stored as int8, which achieves parameter compression, and the model structure is also optimized (e.g. various operator fuse passes are applied).
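As a rough illustration of the storage saving, a generic symmetric max-abs int8 quantization of a float32 weight tensor is sketched below (a simplified stand-in; the exact scheme used by the quantization passes may differ):

```python
import numpy as np

w = np.random.randn(64, 64).astype(np.float32)      # float32 weights, 4 bytes each
scale = np.abs(w).max() / 127.0                      # per-tensor scale
w_int8 = np.clip(np.round(w / scale), -127, 127).astype(np.int8)  # 1 byte each

print(w.nbytes, "->", w_int8.nbytes)                 # roughly 4x smaller on disk
```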
......
......@@ -86,7 +86,6 @@ WeightQuantization.quantize_weight_to_int(save_model_dir,
Prepare the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool); downloading it from the Release page is recommended.
Use the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool).
Because this model dequantizes the quantized weights and then actually loads and runs an FP32 inference model, the opt argument --prefer_int8_kernel does not need to be set to true; set the other arguments to your actual situation with reference to the documentation.
For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
......
......@@ -147,13 +147,12 @@ with fluid.name_scope('skip_quant'):
Prepare the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool); downloading it from the Release page is recommended.
Use the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool). Note that the opt argument --prefer_int8_kernel must be set to true; set the other arguments to your actual situation with reference to the documentation. For example, to run inference on the ARM side of an Android phone, the model conversion command is:
Use the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool), setting the arguments to your actual situation. For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
--valid_targets=arm \
--prefer_int8_kernel=true
--valid_targets=arm
```
### 3.2 量化模型预测
......
......@@ -24,8 +24,7 @@ $ ./opt \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86) \
--prefer_int8_kernel=(ture|false)
--valid_targets=(arm|opencl|x86)
```
Here, optimize_out is the desired output path for the optimized model. optimize_out_type specifies how the output model is serialized; Protobuf and Naive Buffer are currently supported, where Naive Buffer is a more lightweight serialization/deserialization implementation. If you need to run inference with Lite on mobile, set optimize_out_type=naive_buffer.
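For example, a typical invocation for mobile deployment (paths are placeholders) would be:

```bash
./opt --model_dir=./mobilenet_v1 \
      --optimize_out_type=naive_buffer \
      --optimize_out=./mobilenet_v1_opt \
      --valid_targets=arm
```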
......
......@@ -84,7 +84,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
if (LITE_WITH_PYTHON)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so")
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND python setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/")
......@@ -96,6 +105,7 @@ if (LITE_WITH_PYTHON)
endif()
add_dependencies(publish_inference_python_lib lite_pybind)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
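Assuming the layout produced by the targets above, `bdist_wheel` writes the wheel to a `dist/` folder under `python/install`, so after `make publish_inference` it can be installed directly (the publish-root name depends on the build configuration):

```shell
pip install ./inference_lite_lib*/python/install/dist/paddlelite-*.whl
```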
......@@ -213,6 +223,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference tiny_publish_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
endif()
endif()
......
......@@ -308,6 +308,11 @@ if (LITE_ON_TINY_PUBLISH)
return()
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
......
......@@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
const std::vector<std::string> quant_dequant_op = {
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
"fake_quantize_moving_average_abs_max",
"fake_quantize_dequantize_moving_average_abs_max",
"fake_dequantize_max_abs",
"fake_channel_wise_dequantize_max_abs"};
bool is_quantized_model = false;
for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model;
++i) {
auto *block_desc = program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) {
auto *op_desc = block_desc->GetOp<cpp::OpDesc>(j);
std::string op_type = op_desc->Type();
if (std::find(quant_dequant_op.begin(),
quant_dequant_op.end(),
op_type) != quant_dequant_op.end()) {
is_quantized_model = true;
}
}
}
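// A model is treated as quantized if it contains any fake (de)quantization op;
// in that case ARM int8 kernels are added to the candidate places automatically
// (this replaces the removed --prefer_int8_kernel flag of the opt tool).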
if (is_quantized_model) {
inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(desc, scope_, inner_places);
core::KernelPickFactor factor;
......
......@@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) {
int main(int argc, char** argv) {
CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
paddle::lite::Run(argv[1], std::stoi(argv[2]));
paddle::lite::Run(argv[1], atoi(argv[2]));
return 0;
}
......
......@@ -58,6 +58,7 @@ void LightPredictorImpl::Run() {
std::shared_ptr<lite_api::PaddlePredictor> LightPredictorImpl::Clone() {
LOG(FATAL) << "The Clone API is not supported in LigthPredictor";
return nullptr;
}
std::string LightPredictorImpl::GetVersion() const { return lite::version(); }
......
......@@ -95,7 +95,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......@@ -119,21 +119,21 @@ void TestModel(const std::vector<Place>& valid_places,
// Get detailed result
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
VLOG(1) << "output tensor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, out_dims.production());
auto out_std_dev = compute_standard_deviation<float>(
out_data, out_dims.production(), true, out_mean);
VLOG(1) << "output tensor dims:" << out_dims;
VLOG(1) << "output tensor elements num:" << out_dims.production();
VLOG(1) << "output tensor standard deviation:" << out_std_dev;
VLOG(1) << "output tensor mean value:" << out_mean;
// print result
for (int i = 0; i < out_dims.production(); ++i) {
......
......@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......@@ -121,21 +121,21 @@ void TestModel(const std::vector<Place>& valid_places,
// Get detailed result
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
VLOG(1) << "output tensor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, out_dims.production());
auto out_std_dev = compute_standard_deviation<float>(
out_data, out_dims.production(), true, out_mean);
VLOG(1) << "output tensor dims:" << out_dims;
VLOG(1) << "output tensor elements num:" << out_dims.production();
VLOG(1) << "output tensor standard deviation:" << out_std_dev;
VLOG(1) << "output tensor mean value:" << out_mean;
// print result
for (int i = 0; i < out_dims.production(); ++i) {
......
......@@ -138,7 +138,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -250,7 +250,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -264,7 +264,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -67,7 +67,6 @@ DEFINE_string(valid_targets,
"arm",
"The targets this model optimized for, should be one of (arm, "
"opencl, x86), splitted by space");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
DEFINE_bool(print_supported_ops,
false,
"Print supported operators on the inputed target");
......@@ -123,11 +122,6 @@ std::vector<Place> ParserValidPlaces() {
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
if (FLAGS_prefer_int8_kernel) {
LOG(WARNING) << "Int8 mode is only support by ARM target";
valid_places.insert(valid_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
return valid_places;
}
......@@ -257,7 +251,6 @@ void PrintHelpInfo() {
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--prefer_int8_kernel=(true|false)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/api/opt_base.h"
#include "all_kernel_faked.cc" // NOLINT
namespace paddle {
namespace lite_api {
void OptBase::SetModelDir(const std::string& model_path) {
opt_config_.set_model_dir(model_path);
}
void OptBase::SetModelFile(const std::string& model_path) {
opt_config_.set_model_file(model_path);
}
void OptBase::SetParamFile(const std::string& param_path) {
opt_config_.set_param_file(param_path);
}
void OptBase::SetModelType(std::string optimize_out_type) {
if (optimize_out_type == "protobuf") {
model_type_ = LiteModelType::kProtobuf;
} else if (optimize_out_type == "naive_buffer") {
model_type_ = LiteModelType::kNaiveBuffer;
} else {
LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
}
}
void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.clear();
auto target_reprs = lite::Split(valid_places, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places_.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
"'valid_targets'",
target_repr.c_str());
}
}
CHECK(!valid_places_.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
}
void OptBase::RunOptimize(bool record_strip_info) {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
auto resulted_model_name =
record_strip_info ? "information of stripped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
const std::string& filename) {
std::set<std::string> total;
for (const auto& name : models) {
std::string model_path =
lite::Join<std::string>({output_dir, name, filename}, "/");
auto lines = lite::ReadLines(model_path);
total.insert(lines.begin(), lines.end());
}
std::string output_path =
lite::Join<std::string>({output_dir, filename}, "/");
lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
output_path);
}
void OptBase::SetModelSetDir(const std::string& model_set_path) {
model_set_dir_ = model_set_path;
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir for the output optimized model set.
lite::MkDirRecur(optimize_out_path_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
}
// 2. optimize each model in the input model set dir.
std::string model_file = opt_config_.model_file();
std::string param_file = opt_config_.param_file();
for (const auto& name : model_dirs) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
lite::Join<std::string>({input_model_dir, model_file}, "/");
auto param_file_path =
lite::Join<std::string>({input_model_dir, param_file}, "/");
}
std::cout << "Start optimize model: " << input_model_dir;
opt_config_.set_model_dir(input_model_dir);
opt_config_.set_model_file(model_file);
opt_config_.set_param_file(param_file);
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
// 3. if record_strip_info = true, we will record striping info
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
}
// 2. Print support info of the input ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
std::vector<std::string> lite_supported_targets = {"kHost",
"kX86",
"kCUDA",
"kARM",
"kOpenCL",
"kFPGA",
"kNPU",
"kXPU",
"kAny",
"kUnk"};
// Get the length of the first column: maximum length of the op_type
size_t maximum_optype_length = 0;
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
maximum_optype_length = it->first.size() > maximum_optype_length
? it->first.size()
: maximum_optype_length;
}
std::cout << std::setiosflags(std::ios::internal);
// Print the first row: OP_name target1 target2 ...
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
std::cout << std::setw(10) << lite_supported_targets[i].substr(1);
}
std::cout << std::endl;
// Print the name of supported ops and mark if it's supported by each target
// print the support info of the input ops: valid_ops
for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) {
std::cout << std::setw(maximum_optype_length) << *op;
// Check: if this op is not recorded in supported_ops, skip it.
if (supported_ops.find(*op) == supported_ops.end()) {
continue;
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
lite_supported_targets[i]) != ops_valid_places.end()) {
std::cout << std::setw(10) << "Y";
} else {
std::cout << std::setw(10) << " ";
}
}
std::cout << std::endl;
}
}
void OptBase::DisplayKernelsInfo() { // Display kernel information
std::cout << ::paddle::lite::KernelRegistry::Global().DebugString();
}
void OptBase::PrintAllOps() {
// 1. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < supported_ops_target.size(); i++) {
auto ops = supported_ops_target[i];
valid_ops.insert(ops.begin(), ops.end());
}
// 2. Print support info of these ops
PrintOpsInfo(valid_ops);
}
void OptBase::PrintSupportedOps() {
// 1. Get the valid hardware targets
std::vector<TargetType> target_types = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
target_types.push_back(valid_places_[i].target);
}
std::string targets_str = TargetToStr(target_types[0]);
for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + "," + TargetToStr(target_types[i]);
}
std::cout << "Supported OPs on '" << targets_str << "': " << std::endl;
target_types.push_back(TARGET(kHost));
target_types.push_back(TARGET(kUnk));
// 2. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
// 3. Print support info of these ops
PrintOpsInfo(valid_ops);
}
// test whether this model is supported
void OptBase::CheckIfModelSupported(bool print_ops_info) {
// 1. parse valid places and valid targets
auto valid_ops = supported_ops_target[static_cast<int>(TARGET(kHost))];
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (size_t i = 0; i < valid_places_.size(); i++) {
auto target = valid_places_[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
}
// get valid ops
std::set<std::string> valid_ops_set(valid_ops.begin(), valid_ops.end());
// 2.Load model into program to get ops in model
std::string prog_path = opt_config_.model_dir() + "/__model__";
if (!(opt_config_.model_file()).empty() &&
!(opt_config_.param_file()).empty()) {
prog_path = opt_config_.model_file();
}
lite::cpp::ProgramDesc cpp_prog;
framework::proto::ProgramDesc pb_proto_prog =
*lite::LoadProgram(prog_path, false);
lite::pb::ProgramDesc pb_prog(&pb_proto_prog);
// Transform to cpp::ProgramDesc
lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog);
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
auto op_type = op_desc.Type();
input_model_ops.insert(op_type);
if (valid_ops_set.count(op_type) == 0) {
unsupported_ops.insert(op_type);
}
}
}
// 3. Print ops_info of input model and check if this model is supported
if (print_ops_info) {
std::cout << "OPs in the input model include:\n";
PrintOpsInfo(input_model_ops);
}
if (!unsupported_ops.empty()) {
std::string unsupported_ops_str = *unsupported_ops.begin();
for (auto op_str = ++unsupported_ops.begin();
op_str != unsupported_ops.end();
op_str++) {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
targets.push_back(valid_places_[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
LOG(ERROR) << "Error: This model is not supported, because "
<< unsupported_ops.size() << " ops are not supported on '"
<< targets_str << "'. These unsupported ops are: '"
<< unsupported_ops_str << "'.";
exit(1);
}
if (print_ops_info) {
std::cout << "Paddle-Lite supports this model!" << std::endl;
exit(1);
}
}
} // namespace lite_api
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines Opt and basic functions about model transformation.
*/
#ifndef PADDLE_LITE_OPT_H_ // NOLINT
#define PADDLE_LITE_OPT_H_
#include <algorithm>
#include <iomanip>
#include <set>
#include <string>
#include <vector>
// stores the map that records the source_file path of each kernel.
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
// version of Paddle-lite
#include "lite/core/version.h"
// model parser functions to pre-load model to verify if this model is supported
#include "lite/model_parser/compatible_pb.h"
#include "lite/model_parser/pb/program_desc.h"
#include "lite/utils/string.h"
// recorded all the ops supported by paddle-lite
#include "supported_kernel_op_info.h" // NOLINT
namespace paddle {
namespace lite_api {
/// OptBase defines the basic interfaces of the model optimization tool (opt)
/// for transforming and saving optimized models.
class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
// functions for printing info
// 1. help info
void PrintHelpInfo();
// 2. PrintOpsInfo
void PrintOpsInfo(const std::set<std::string> &valid_ops =
{}); // print supported ops on target_types
void PrintAllOps(); // print all ops
void PrintSupportedOps(); // print ops supported on valid_places_
void DisplayKernelsInfo(); // Display kernel information
// 3. Check if this model is supported
void CheckIfModelSupported(bool print_ops_info = true);
private:
CxxConfig opt_config_;
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model
std::string model_set_dir_;
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
} // namespace lite_api
} // namespace paddle
#endif // NOLINT
......@@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON)
return()
endif()
# create setup.py for packaging the whl for Paddle-Lite and opt
execute_process(
COMMAND git describe --tags --exact-match
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_subdirectory(pybind)
#add_subdirectory(interface)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(PYBIND_DEPS pybind python paddle_api_light paddle_api)
if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
......
......@@ -26,11 +26,12 @@
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/opt_base.h"
#endif
#include "lite/api/light_api.h"
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
namespace py = pybind11;
......@@ -48,10 +49,27 @@ using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
static void BindLiteCxxPredictor(py::module *m);
void BindLiteOpt(py::module *m) {
py::class_<OptBase> opt_base(*m, "Opt");
opt_base.def(py::init<>())
.def("set_model_dir", &OptBase::SetModelDir)
.def("set_modelset_dir", &OptBase::SetModelSetDir)
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_model_type", &OptBase::SetModelType)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
.def("display_kernels_info", &OptBase::DisplayKernelsInfo)
.def("print_all_ops", &OptBase::PrintAllOps);
}
#endif
static void BindLiteLightPredictor(py::module *m);
static void BindLiteCxxConfig(py::module *m);
......
......@@ -22,11 +22,15 @@ namespace lite {
namespace pybind {
void BindLiteApi(pybind11::module *m);
void BindLiteOpt(pybind11::module *m);
PYBIND11_MODULE(lite_core, m) {
PYBIND11_MODULE(lite, m) {
m.doc() = "C++ core of Paddle-Lite";
BindLiteApi(&m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteOpt(&m);
#endif
}
} // namespace pybind
......
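Since the pybind module is renamed from `lite_core` to `lite` and `OptBase` is exposed as `Opt`, the new binding can be driven from Python roughly as follows (the `paddlelite` package/import path is an assumption based on the setup.py below, where the extension ships as `paddlelite/lite.so`):

```python
from paddlelite import lite   # assumed import path for the packaged lite.so

opt = lite.Opt()
opt.set_model_dir("./mobilenet_v1")         # non-combined Paddle model (placeholder path)
opt.set_valid_places("arm")                 # same syntax as the --valid_targets flag
opt.set_model_type("naive_buffer")
opt.set_optimize_out("./mobilenet_v1_opt")
opt.run_optimize(False)                     # False: do not record tailoring info
```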
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module for packing the whl installer for Paddle-Lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
def has_ext_modules(foo):
return True
# get paddle-lite version, if it's not based on a release tag, we use commit id instead
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
PADDLELITE_VERSION = PADDLELITE_COMMITE
else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
# link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# These directories are generated while compiling,
# so each package points to its build-output directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
......@@ -17,6 +17,7 @@
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <cmath>
// for eval
DEFINE_string(model_dir, "", "model dir");
......@@ -43,5 +44,31 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
} // namespace lite
} // namespace paddle
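A small usage sketch of the two helpers added above (the header path is assumed to be the test helper this hunk belongs to; values are illustrative):

```cpp
#include <iostream>
#include "lite/api/test_helper.h"  // presumed location of the helpers above

int main() {
  const float data[4] = {1.f, 2.f, 3.f, 4.f};
  double mean = paddle::lite::compute_mean<float>(data, 4);  // 2.5
  double std_dev = paddle::lite::compute_standard_deviation<float>(
      data, 4, /*has_mean=*/true, mean);                     // ~1.118 (population std dev)
  std::cout << mean << " " << std_dev << std::endl;
  return 0;
}
```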
......@@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_add_grad<float>(const float* dout_grad,
float* x_grad,
int num) {
int cnt = num >> 4;
int remain = num & 0x0f;
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* x_data = x_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
vst1q_f32(x_data, din0);
vst1q_f32(x_data + 4, din1);
vst1q_f32(x_data + 8, din2);
vst1q_f32(x_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* x_data = x_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
x_data[i] = out_data[i];
}
}
}
// We assume y_data has fewer elements than x_data; otherwise, call this
// function with x_grad and y_grad swapped.
template <>
void elementwise_add_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum += out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
template <>
void elementwise_sub<float>(const float* dinx,
const float* diny,
......@@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast<float>(const float* dinx,
}
}
}
// we assume the formula is x-y
template <>
void elementwise_sub_grad<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int num) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, num);
}
if (y_grad) {
int cnt = num >> 4;
int remain = num & 0x0f;
float32x4_t minus = vdupq_n_f32(-1);
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* y_data = y_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
din0 = vmulq_f32(din0, minus);
din1 = vmulq_f32(din1, minus);
din2 = vmulq_f32(din2, minus);
din3 = vmulq_f32(din3, minus);
vst1q_f32(y_data, din0);
vst1q_f32(y_data + 4, din1);
vst1q_f32(y_data + 8, din2);
vst1q_f32(y_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* y_data = y_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
y_data[i] = -out_data[i];
}
}
}
}
// We assume y_data has fewer elements than x_data; otherwise, call this
// function with x_grad and y_grad swapped.
template <>
void elementwise_sub_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum -= out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
template <>
void elementwise_mul<float>(const float* dinx,
......
......@@ -183,6 +183,13 @@ template <typename T>
void elementwise_add_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_add_grad(const T* dout, T* dinx, int num);
template <typename T>
void elementwise_add_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T>
void elementwise_sub(const T* dinx, const T* diny, T* dout, int num);
......@@ -197,6 +204,13 @@ template <typename T>
void elementwise_sub_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num);
template <typename T>
void elementwise_sub_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T>
void elementwise_mul(const T* dinx, const T* diny, T* dout, int num);
......
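A toy call of the new backward helpers declared above (the header path and namespace are assumed from the file this hunk patches; the float specializations are implemented with ARM NEON):

```cpp
#include "lite/backends/arm/math/elementwise.h"  // presumed header for these declarations

void backward_example() {
  const int num = 32;
  float dout_grad[32], x_grad[32], y_grad[32];
  for (int i = 0; i < num; ++i) dout_grad[i] = 1.0f;

  // dL/dx of z = x + y is dL/dz, copied through:
  paddle::lite::arm::math::elementwise_add_grad<float>(dout_grad, x_grad, num);

  // dL/dx and dL/dy of z = x - y: dL/dx = dL/dz, dL/dy = -dL/dz:
  paddle::lite::arm::math::elementwise_sub_grad<float>(dout_grad, x_grad, y_grad, num);
}
```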
......@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
"vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \
"vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \
"vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
"vmla.f32 q0, q4, q6 @ mul add\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vmla.f32 q1, q4, q8 @ mul add\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
/*"vmla.f32 q0, q4, q6 @ mul add\n" */ \
/*"vmla.f32 q1, q4, q8 @ mul add\n" */ \
"vmla.f32 q2, q4, q10 @ mul add\n" \
"vmla.f32 q3, q4, q12 @ mul add\n" \
"subs %[cnt], #1 @ sub loop count \n" \
......
......@@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
inline void save_float(float* data, const std::string& name, int len) {
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string old_string = paddle::lite::to_string(counter);
std::string new_string =
std::string(3 - old_string.length(), '0') + old_string;
......
......@@ -351,10 +351,10 @@ class Tensor {
void printScale(std::string type) { printScale(); }
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width()) + ".txt";
return paddle::lite::to_string(shape_->num()) + "_" +
paddle::lite::to_string(shape_->channel()) + "_" +
paddle::lite::to_string(shape_->height()) + "_" +
paddle::lite::to_string(shape_->width()) + ".txt";
}
void saveToFile() { std::string path = dimsFileName(); }
......@@ -374,7 +374,7 @@ class Tensor {
invalidate();
std::ofstream ofs;
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
std::string npath = paddle::lite::to_string(counter) + "_" + path;
counter++;
save_file_with_name(npath);
}
......
......@@ -19,8 +19,8 @@ namespace paddle {
namespace lite {
namespace npu {
std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
std::string& model_name, // NOLINT
std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
) {
......@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
// Create a HiAI model manager client to load the HiAI om model
std::unique_ptr<hiai::AiModelMngerClient> model_client(
std::shared_ptr<hiai::AiModelMngerClient> model_client(
new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
model_name = "model_" + std::to_string(model_count_++) + ".om";
auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
......
......@@ -40,8 +40,8 @@ class Device {
// Build the HiAI IR graph to om model, return HiAI model manager client to
// load om model and run inference.
std::unique_ptr<hiai::AiModelMngerClient> Build(
std::string& model_name, // NOLINT
std::shared_ptr<hiai::AiModelMngerClient> Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT
......@@ -51,7 +51,6 @@ class Device {
int framework_type_{0};
int model_type_{0};
int device_type_{0};
int model_count_{0};
};
} // namespace npu
......
......@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper)
add_dependencies(cl_wrapper opencl_clhpp)
#include <cl_common.h>
__kernel void conv2d_1x1(__private const int global_size_dim0,
__kernel void conv2d_1x1_opt(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
......
......@@ -14,21 +14,22 @@ limitations under the License. */
#include <cl_common.h>
__kernel void conv2d_3x3_opt(__private const int item_ch,
__kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
......@@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
......@@ -69,23 +71,33 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
......@@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
int in_h_val = select(out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x, filter[1].x, filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y, filter[1].y, filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z, filter[1].z, filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w, filter[1].w, filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
......@@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// support batch > 1
__kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
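  // Each work-item writes up to 5 output pixels along the width
  // (out_w_id0..out_w_id4), spaced item_w columns apart; the
  // `out_w_idN < out_w` guards further down drop the columns that run past
  // the right edge of the output image.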
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
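    // ch_surplus counts the padded lanes in the last CL_DTYPE4 channel block
    // when in_ch is not a multiple of 4; the `if (ch_surplus < N)` groups
    // below skip the y/z/w lanes so padding never reaches the accumulators.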
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// opt version of conv5x5
__kernel void conv2d_5x5_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 5;
const int filter_h = 5;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val =
select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// support batch > 1
__kernel void conv2d_5x5_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 5;
const int filter_h = 5;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
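      // Rows outside the current batch's slice
      // [out_batch_id * in_h, (out_batch_id + 1) * in_h) are mapped to -1, so
      // the sampler's out-of-range handling supplies the padding instead of a
      // neighbouring batch's rows.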
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// opt version of conv7x7
__kernel void conv2d_7x7_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 7;
const int filter_h = 7;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val =
select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// support batch > 1
__kernel void conv2d_7x7_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 7;
const int filter_h = 7;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
\ No newline at end of file
......@@ -17,6 +17,7 @@
#include <string>
#include "glog/logging.h"
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -64,7 +65,7 @@ class VXXJitCode : public JitCode {
base += "_Vec";
}
base += (with_relu_ ? "_Relu" : "");
base += "_D" + std::to_string(num_);
base += "_D" + paddle::lite::to_string(num_);
return base;
}
void genCode() override;
......
......@@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(tbl_w_));
base += ("_W" + paddle::lite::to_string(tbl_w_));
return base;
}
void genCode() override;
......
......@@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode {
std::string name() const override {
std::string base = "MatMulJitCode";
base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
std::to_string(k_);
base = base + "_M" + paddle::lite::to_string(m_) + "_N" +
paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_);
return base;
}
void genCode() override;
......
......@@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(w_));
base += ("_W" + paddle::lite::to_string(w_));
return base;
}
void genCode() override;
......
......@@ -94,9 +94,13 @@ add_custom_command(
OUTPUT ops.h # not a real path to the output to force it execute every time.
)
# generate fake kernels for memory_optimize_tool
#-------------------------------opt----------------------------------------------------------------
# tricks to create header files for opt
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py
${kernels_src_list}
${fake_kernels_src_list}
${CMAKE_BINARY_DIR}/all_kernel_faked.cc
${CMAKE_BINARY_DIR}/kernel_src_map.h
OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time.
......@@ -104,12 +108,12 @@ add_custom_command(
add_custom_target(op_list_h DEPENDS ops.h)
add_custom_target(kernel_list_h DEPENDS kernels.h)
add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)
#add_custom_target(opencl_kernels_source_cc DEPENDS opencl_kernels_source.cc)
# create header file to record ops info sorted by supported platforms
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py
${kernels_src_list}
${fake_kernels_src_list}
${ops_src_list}
${CMAKE_BINARY_DIR}/supported_kernel_op_info.h
OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time.
......
......@@ -490,7 +490,7 @@ class ContextScheduler {
} break;
#endif
default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
LOG(FATAL) << "unsupported target " << TargetToStr(target);
#endif
break;
......
......@@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) {
auto attr_type = op_info->GetAttrType(attr_name);
switch (attr_type) {
case AttrType::INT:
os << ":int:" << std::to_string(op_info->GetAttr<int>(attr_name));
os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<int>(attr_name));
break;
case AttrType::FLOAT:
os << ":float:" << std::to_string(op_info->GetAttr<float>(attr_name));
os << ":float:"
<< paddle::lite::to_string(op_info->GetAttr<float>(attr_name));
break;
case AttrType::BOOLEAN:
os << ":int:" << std::to_string(op_info->GetAttr<bool>(attr_name));
os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<bool>(attr_name));
break;
case AttrType::STRING:
os << ":string: \""
......
......@@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
// non-tensor(like tensor_array) variables will not be reused
for (auto& node : graph->nodes()) {
if (node.IsArg() && !node.arg()->type->IsTensor()) {
if (node.IsArg() && (node.arg()->type != nullptr) &&
!node.arg()->type->IsTensor()) {
invalid_var_names.insert(node.arg()->name);
}
}
......@@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name);
input_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")";
replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++;
}
}
......@@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name);
out_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")";
replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++;
}
}
......
......@@ -85,7 +85,7 @@ class Node {
struct Arg {
std::string name;
int id{0};
const Type* type{};
const Type* type{nullptr};
// Weight is a special kind of argument, it is marked as weight explicitly
// so that some weight related optimization can take place.
bool is_weight{false};
......
......@@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply(
}
if (found) {
inst.mutable_op_info()->SetAttr("output_scale", output_scale);
} else if (op_info->HasAttr("output_scale")) {
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("output_scale");
inst.mutable_op_info()->SetAttr("output_scale", output_scale / range);
}
if (op_info->HasAttr("output_scale")) {
inst.mutable_op_info()->SetAttr("enable_int8", true);
......
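For reference, the new else-branch above assumes the recorded output_scale is still expressed in the quantized integer range: with the common bit_length = 8, range = (1 << 7) - 1 = 127, so a stored attribute of, say, 25.4 is rewritten as a per-unit scale of 25.4 / 127 = 0.2.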
......@@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass {
}
VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------";
VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------";
VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():"
VLOG(2) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target);
......
......@@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() {
} else {
exists_ops[op_type]++;
}
auto op_name = op_type + std::to_string(exists_ops[op_type]);
auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]);
std::string op_color = "white";
if (subgraph_indices.count(node)) {
auto subgraph_idx = subgraph_indices[node];
op_name += "_subgraph_" + std::to_string(subgraph_idx);
op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx);
op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()];
}
dot.AddNode(op_name,
......@@ -223,6 +223,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
std::vector<std::string> lines = ReadLines(config_file_path);
for (std::string line : lines) {
if (line.empty()) continue;
std::vector<std::string> node_info = Split(line, ":");
std::string op_type = node_info.at(0);
std::vector<std::string> in_vars_name;
......@@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of Ops an Vars of the target
// Create a new sub block desc for storing all of Ops and Vars of the target
// subgraph and sub_block_idx is set as a attribute of subgraph op,
// sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1);
......
......@@ -39,7 +39,7 @@ std::vector<std::string> AddFCDesc(
CHECK_EQ(input_var_names.size(), 1);
CHECK_EQ(wshape.size(), 2);
static int id = 0;
std::string prefix = "fc_" + std::to_string(id);
std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* wgt = block_desc->AddVar<cpp::VarDesc>();
......@@ -76,7 +76,7 @@ std::vector<std::string> AddElementwiseAddDesc(
const std::vector<std::string>& input_Y_names) {
// CHECK_EQ(input_var_names.size(), 2);
static int id = 0;
std::string prefix = "elementwise_add_" + std::to_string(id);
std::string prefix = "elementwise_add_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>();
......@@ -100,7 +100,7 @@ std::vector<std::string> AddFeedDesc(
const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1);
static int id = 0;
std::string prefix = "feed_" + std::to_string(id);
std::string prefix = "feed_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>();
......@@ -123,7 +123,7 @@ std::vector<std::string> AddFetchDesc(
const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1);
static int id = 0;
std::string prefix = "fetch_" + std::to_string(id);
std::string prefix = "fetch_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>();
......
......@@ -17,6 +17,7 @@
#include "lite/api/paddle_api.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
......@@ -31,43 +32,17 @@ namespace lite {
// The helper functions for loading and running model from command line and
// verifying output data
std::vector<std::string> TypeParsing(std::string text) {
std::vector<std::string> types;
while (!text.empty()) {
size_t index = text.find_first_of(":");
std::string type = text.substr(0, index);
VLOG(3) << type;
types.push_back(type);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
}
}
return types;
return Split(text, ":");
}
std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes;
while (!text.empty()) {
size_t index = text.find_first_of(":");
std::string slice = text.substr(0, index);
std::vector<int64_t> shape;
while (!slice.empty()) {
size_t index = slice.find_first_of(",");
int d = atoi(slice.substr(0, index).c_str());
VLOG(3) << d;
shape.push_back(d);
if (index == std::string::npos) {
break;
} else {
slice = slice.substr(index + 1);
}
}
shapes.push_back(shape);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
std::vector<std::string> shape_strings = Split(text, ":");
shapes.resize(shape_strings.size());
for (int i = 0; i < shape_strings.size(); i++) {
std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
for (auto shape_num : shape_nums) {
shapes[i].push_back(atoi(shape_num.c_str()));
}
}
return shapes;
......
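A rough illustration of the rewritten parsers (the input strings here are hypothetical; TypeParsing, ShapeParsing, and Split come from the patch):

  // TypeParsing("float32:int64")     -> {"float32", "int64"}
  // ShapeParsing("1,3,224,224:1,10") -> {{1, 3, 224, 224}, {1, 10}}
  std::vector<std::vector<int64_t>> shapes = ShapeParsing("1,3,224,224:1,10");
  CHECK_EQ(shapes.size(), 2u);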
......@@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
auto inlinks = node->inlinks;
VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc
<< " inlinks.size():" << inlinks.size();
VLOG(4) << "============== node->AsStmt().op_type():"
<< node->AsStmt().op_type() << " inlinks.size():" << inlinks.size()
<< " ================";
for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in);
}
......@@ -68,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name));
auto decl_arg_type =
inst.picked_kernel().GetInputDeclType(inst_in_tensor_name);
CHECK(in->AsArg().type);
VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name
VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name
<< "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString();
  // TODO(ysh329): a tensor whose target is kARM but whose layout is
  // kImageDefault (an OpenCL layout) conflicts here. This check is not an
  // ideal fix, but the root cause between static_kernel_pick_pass and this
  // pass has not been located yet.
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (in_arg_type->target() == TARGET(kARM) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " "
......
......@@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from,
CHECK(in->IsArg());
// auto node_id = [&] { return graph->nodes().size(); };
auto cast_op_output_name = in->AsArg().name + "/precision_trans";
// in->AsArg().name + "/precision_trans/" + std::to_string(node_id());
// in->AsArg().name + "/precision_trans/" +
// paddle::lite::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
......
......@@ -65,6 +65,7 @@ class OpLite : public Registry {
virtual bool CheckShape() const { return true; }
// Inference the outputs' shape.
virtual bool InferShape() const { return true; }
virtual bool SmartInferShape() { return this->InferShape(); }
// Run this operator.
virtual bool Run();
// Indicate whether the Op runs only once or not
......@@ -150,6 +151,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_input_shapes;
std::vector<DDimLite> last_output_shapes;
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods;
std::vector<std::vector<std::vector<uint64_t>>> last_input_lods;
};
/*
......
......@@ -22,18 +22,25 @@
#include <vector>
#include "lite/core/program.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/kernels/opencl/image_helper.h"
#endif
namespace paddle {
namespace lite {
namespace profile {
template <typename dtype>
static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) {
return;
return false;
}
FILE* fp = fopen(locate.c_str(), "w");
if (fp == nullptr) {
LOG(ERROR) << "file open field " << locate;
return false;
} else {
const dtype* data = tensor->data<dtype>();
for (int i = 0; i < tensor->numel(); ++i) {
......@@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
}
}
fclose(fp);
return true;
}
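Because write_tensorfile now reports success, a caller can gate the dump on a flag in a single expression. The snippet below only restates the idiom used later in this diff; write_result_to_file, tout and out_name are the names from that later code, shown here for illustration.

// && short-circuits: write_tensorfile is only called (and the file only
// written) when write_result_to_file is true.
write_result_to_file && write_tensorfile<float>(tout, out_name);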
class PrecisionProfiler {
public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {}
~PrecisionProfiler() {
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " "
<< PrecisionToStr(inst_->kernel()->precision());
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype,
std::string name = "inst") -> double {
if (!in->data<int8_t>()) {
return -99999;
}
double sum = 0.;
switch (ptype) {
  // TODO(ysh329): need to remove `explicit PrecisionProfiler`;
  // keep this method only for arm/math/conditional_block_compute
explicit PrecisionProfiler(const Instruction* inst) {
std::string inst_precison_str = GetInstPrecision(inst);
}
PrecisionProfiler() {}
std::string GetSummaryHeader() {
using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
<< " " << setw(70) << left << "output_tensor_name:(tensor_info)"
<< " " << setw(15) << left << "dims"
<< " " << setw(15) << left << "mean"
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
return ss.str();
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
template <typename T>
double compute_average_grow_rate(const T* in, const size_t length) {
const double eps = 1e-5;
double ave_grow_rate = 0.0f;
for (size_t i = 1; i < length; ++i) {
ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
}
ave_grow_rate /= length;
return ave_grow_rate;
}
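A small worked example of the three statistics above, with arbitrary values: for {1, 2, 4, 8} the mean is 3.75, the standard deviation is sqrt(28.75 / 4) ≈ 2.68, and the average grow rate sums (2-1)/1 + (4-2)/2 + (8-4)/4 ≈ 3 (ignoring eps) and divides by the length, giving ≈ 0.75.

  // Illustration only; values are arbitrary.
  paddle::lite::profile::PrecisionProfiler p;
  const float v[] = {1.f, 2.f, 4.f, 8.f};
  double m = p.compute_mean<float>(v, 4);                         // 3.75
  double s = p.compute_standard_deviation<float>(v, 4, true, m);  // ~2.68
  double g = p.compute_average_grow_rate<float>(v, 4);            // ~0.75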
  // check whether the output tensor is unused
bool is_unused(const Tensor* in) {
if (!in->data<int8_t>()) {
return true;
}
return false;
}
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
DataLayoutType layout_type,
double* mean,
double* std_dev,
double* ave_grow_rate,
std::string name = "inst",
bool write_result_to_file = false) {
std::string unsupported_error_log =
"Unsupported precision profile for kernel registered on" +
TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
DataLayoutToStr(layout_type);
if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
target_type == TARGET(kX86)) {
switch (precision_type) {
case PRECISION(kFloat): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<float>(ptr, in->numel());
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case PRECISION(kAny): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<float>(ptr, in->numel());
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case PRECISION(kInt8): {
auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<int8_t>(ptr, in->numel());
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int8_t>(in, name);
return;
}
case PRECISION(kInt32): {
auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<int32_t>(ptr, in->numel());
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name);
return;
}
default:
LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype);
return 0.;
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
LOG(ERROR) << unsupported_error_log;
return;
}
};
if (inst_->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst_->op());
auto kernel = inst_->kernel();
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
<< im_h;
std::vector<uint16_t> in_data_v(im_w * im_h * 4);
std::vector<float> real_out_v(in->numel());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(in_data_v.data(),
in->data<uint16_t, cl::Image2D>(),
im_w,
im_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
in_data_v.data(), real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
*mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
*std_dev = compute_standard_deviation<float>(
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case DATALAYOUT(kNCHW): {
std::vector<float> in_data_v(in->numel(), 0);
TargetWrapperCL::MemcpySync(in_data_v.data(),
in->data<float>(),
in->numel() * sizeof(float),
IoDirection::DtoH);
VLOG(1) << name << ":" << in->numel();
*mean = compute_mean<float>(in_data_v.data(), in->numel());
*std_dev = compute_standard_deviation<float>(
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
default:
*mean = -222222222222;
*std_dev = -22222222222;
*ave_grow_rate = -22222222222;
LOG(ERROR) << unsupported_error_log;
return;
}
#endif
} else {
*mean = -111111111111;
*std_dev = -11111111111;
*ave_grow_rate = -11111111111;
LOG(ERROR) << unsupported_error_log;
return;
}
}
std::string GetInstPrecision(const Instruction* inst = nullptr) {
using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = false;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
PrecisionToStr(inst->kernel()->precision()) +
"/" + DataLayoutToStr(inst->kernel()->layout());
std::string op_name = inst->op()->op_info()->Type();
if (inst->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst->op());
auto kernel = inst->kernel();
auto op_scope = op->scope();
auto out_names = op->op_info()->output_names();
for (auto& out_name : out_names) {
......@@ -106,32 +277,90 @@ class PrecisionProfiler {
auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
double mean = tensor_mean(tout, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean << " shape:" << tout->dims();
const Tensor* tout =
op_scope->FindVar(out_name)->GetMutable<Tensor>();
double mean = -999999;
double std_dev = -100000;
double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
} else if (type->IsTensorList()) {
auto tout =
auto touts =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) {
double mean = tensor_mean(&t, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean;
for (auto t : *touts) {
const Tensor* tout = &t;
double mean = -999999;
double std_dev = -100000;
double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
}
}
}
}
return ss.str();
}
private:
const Instruction* inst_{nullptr};
};
} // namespace profile
} // namespace lite
} // namespace paddle
// TODO(ysh329): need to remove.
// keep this method only for arm/math/conditional_block_compute
#define LITE_PRECISION_PROFILE(inst) \
{ auto a = paddle::lite::profile::PrecisionProfiler(&inst); }
......@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
}
void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader();
#endif
#endif
for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue;
......@@ -144,13 +152,17 @@ void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst)
precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst);
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......@@ -274,7 +286,8 @@ void Instruction::Run() {
return;
}
op_->InferShape();
// op_->InferShape();
op_->SmartInferShape();
kernel_->Launch();
has_run_ = true;
}
......
......@@ -30,9 +30,9 @@ Program FakeProgram() {
auto add_fc = [&](int id, std::string x) {
// create variables
std::string w1 = "w" + std::to_string(id);
std::string b1 = "b" + std::to_string(id);
std::string out1 = "out" + std::to_string(id);
std::string w1 = "w" + paddle::lite::to_string(id);
std::string b1 = "b" + paddle::lite::to_string(id);
std::string out1 = "out" + paddle::lite::to_string(id);
auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();
......
......@@ -53,9 +53,9 @@ static std::string version() {
static int64_t int_version(const std::string& version) {
const std::vector<std::string> vec = Split(version, ".");
if (vec.size() == 3) {
return std::stoi(vec[0]) * MAJOR_COEFF +
std::stoi(vec[1]) * MINOR_COEFF +
std::stoi(vec[2]) * PATCH_COEFF;
return atoi(vec[0].c_str()) * MAJOR_COEFF +
atoi(vec[1].c_str()) * MINOR_COEFF +
atoi(vec[2].c_str()) * PATCH_COEFF;
}
return -1;
}
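For illustration only, assuming hypothetical coefficient values MAJOR_COEFF = 1000000, MINOR_COEFF = 1000 and PATCH_COEFF = 1 (the real values are defined elsewhere in this file and may differ), the conversion behaves as follows:

// Hypothetical coefficients, illustration only.
// int_version("2.6.1") == 2 * 1000000 + 6 * 1000 + 1 == 2006001
// int_version("2.6")   == -1   (not a three-part version string)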
......
......@@ -207,7 +207,8 @@ void RunModel(std::string det_model_file,
cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h);
    // uncomment the lines below to save the roi image to disk
// std::string roi_name = "roi_" + std::to_string(i) + ".jpg";
// std::string roi_name = "roi_" + paddle::lite::to_string(i)
// + ".jpg";
// imwrite(roi_name, roi);
// Do PreProcess
......
......@@ -14,6 +14,7 @@
#include <sys/time.h>
#include <time.h>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>
......@@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
......@@ -101,24 +128,24 @@ void RunModel(std::string model_dir,
// 5. Get output
std::cout << "\n====== output summary ====== " << std::endl;
size_t output_tensor_num = predictor->GetOutputNames().size();
std::cout << "output tesnor num:" << output_tensor_num << std::endl;
std::cout << "output tensor num:" << output_tensor_num << std::endl;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor =
predictor->GetOutput(tidx);
std::cout << "\n--- output tensor " << tidx << " ---" << std::endl;
auto out_shape = output_tensor->shape();
std::cout << "out_shape(NCHW):" << ShapePrint(out_shape) << std::endl;
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, ShapeProduction(out_shape));
auto out_std_dev = compute_standard_deviation<float>(
out_data, ShapeProduction(out_shape), true, out_mean);
float sum = 0.f;
for (int i = 0; i < ShapeProduction(out_shape); ++i) {
sum += output_tensor->data<float>()[i];
}
std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl;
std::cout << "output tensor " << tidx
<< " elem num:" << ShapeProduction(out_shape) << std::endl;
std::cout << "output tensor " << tidx << " sum value:" << sum << std::endl;
std::cout << "output tensor " << tidx
<< " mean value:" << sum / ShapeProduction(out_shape)
<< " standard deviation:" << out_std_dev << std::endl;
std::cout << "output tensor " << tidx << " mean value:" << out_mean
<< std::endl;
// print output
......
......@@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id,
switch (type) {
case AttrType::INT:
return std::to_string(desc.GetAttr<int>(name));
return paddle::lite::to_string(desc.GetAttr<int>(name));
case AttrType::FLOAT:
return std::to_string(desc.GetAttr<float>(name));
return paddle::lite::to_string(desc.GetAttr<float>(name));
case AttrType::BOOLEAN:
return std::to_string(desc.GetAttr<bool>(name));
return paddle::lite::to_string(desc.GetAttr<bool>(name));
case AttrType::STRING:
return "\"" + desc.GetAttr<std::string>(name) + "\"";
case AttrType::FLOATS: {
......
......@@ -153,16 +153,16 @@ class Module {
private:
std::string WeightUniqueName() const {
return "w_" + std::to_string(weight_counter_++);
return "w_" + paddle::lite::to_string(weight_counter_++);
}
std::string TmpVarUniqueName() const {
return "tmp_" + std::to_string(tmp_var_counter_++);
return "tmp_" + paddle::lite::to_string(tmp_var_counter_++);
}
std::string OpUniqueName() const {
return "op_" + std::to_string(op_counter_++);
return "op_" + paddle::lite::to_string(op_counter_++);
}
std::string KernelUniqueName() const {
return "kernel_" + std::to_string(kernel_counter_++);
return "kernel_" + paddle::lite::to_string(kernel_counter_++);
}
std::string DataRepr(const std::string &raw_data, PrecisionType dtype);
......
# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered
# to the model_optimize_tool.
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)))
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)))
return()
endif()
......@@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de
if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif()
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/elementwise_grad_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
inline DDim trim_trailing_singular_dims(const DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
std::vector<int64_t> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim();
}
return DDim(trim_dims);
}
inline bool is_broadcast(const DDim& x_dims,
const DDim& y_dims,
int axis,
int* pre,
int* n,
int* post) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
DDim y_dim_trim = trim_trailing_singular_dims(y_dims);
axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis;
if (x_dims.size() == y_dim_trim.size()) {
return false;
}
*pre = 1;
*n = 1;
*post = 1;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dim_trim.size(); ++i) {
CHECK_EQ(x_dims[i + axis], y_dim_trim[i])
<< "Broadcast dimension mismatch.";
(*n) *= y_dim_trim[i];
}
for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
return true;
}
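A worked example of the two helpers above, with arbitrary shapes: for x_dims = {2, 3, 4, 5}, y_dims = {3, 4, 1} and axis = 1, trim_trailing_singular_dims drops the trailing 1 so y becomes {3, 4}; is_broadcast then returns true with pre = 2 (product of the x dims before axis), n = 12 (product of the trimmed y dims) and post = 5 (product of the remaining x dims).

// Illustration only; shapes are arbitrary.
int pre = 0, n = 0, post = 0;
bool bc = is_broadcast(DDim(std::vector<int64_t>{2, 3, 4, 5}),
                       DDim(std::vector<int64_t>{3, 4, 1}),
                       /*axis=*/1, &pre, &n, &post);
// bc == true, pre == 2, n == 12, post == 5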
void ElementwiseAddGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_grad_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad) {
CHECK(param.YGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
return;
}
if (!param.YGrad) {
CHECK(param.XGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
return;
}
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, y_grad_data, x_grad_data, pre, n, post);
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
}
}
void ElementwiseSubGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad || !param.YGrad) {
CHECK(param.XGrad || param.YGrad);
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, y_dims.production());
return;
}
if (x_dims.size() < y_dims.size()) {
LOG(FATAL) << "elewise sub grad don't support x_dims size < y_dims size";
}
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_grad_broadcast(
out_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, x_dims.production());
}
}
template <typename T, PrecisionType PType>
void ElementwiseMulGradCompute<T, PType>::Run() {
LOG(FATAL) << "elementwise mul_grad not implement yet";
}
void ElementwiseMaxGradCompute::Run() {
LOG(FATAL) << "elementwise max_grad not implement yet";
}
void ElementwiseDivGradCompute::Run() {
LOG(FATAL) << "elementwise div_grad not implement yet";
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
using elementwise_mul_grad_float =
paddle::lite::kernels::arm::ElementwiseMulGradCompute<float,
PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(elementwise_add_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseAddGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseSubGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_div_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseDivGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
elementwise_mul_grad, kARM, kFloat, kNCHW, elementwise_mul_grad_float, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMaxGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class ElementwiseAddGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseAddGradCompute() = default;
};
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T, PrecisionType PType>
class ElementwiseMulGradCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
virtual ~ElementwiseMulGradCompute() = default;
};
class ElementwiseMaxGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseMaxGradCompute() = default;
};
class ElementwiseDivGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseDivGradCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/mul_grad_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void MulGradCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
}
void MulGradCompute::Run() {
// step1 flatten_2d
auto& param = Param<param_t>();
const auto x_dims = param.x->dims();
const auto y_dims = param.y->dims();
const auto out_dims = param.output_grad->dims();
m_ = static_cast<int>(x_dims.Slice(0, param.x_num_col_dims).production());
k_ = static_cast<int>(
x_dims.Slice(param.x_num_col_dims, x_dims.size()).production());
n_ = static_cast<int>(
y_dims.Slice(param.y_num_col_dims, y_dims.size()).production());
const auto* out_grad_data = param.output_grad->data<float>();
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.x_grad) {
x_grad_data = param.x_grad->mutable_data<float>();
}
if (param.y_grad) {
y_grad_data = param.y_grad->mutable_data<float>();
}
paddle::lite::operators::ActivationParam act_param;
act_param.has_active = false;
// out_grad * y^T = x_grad
// (m, n), (n, k) -> (m, k)
auto& ctx = this->ctx_->template As<ARMContext>();
if (param.x_grad) {
if (m_ == 1) {
lite::arm::math::sgemv(y_data,
out_grad_data,
x_grad_data,
false,
k_, // M
n_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(false,
true, // is_transB,
m_, // M
k_, // N
n_, // K
1.0f, // alpha
out_grad_data, // A
n_, // lda
y_data, // B
n_, // ldb
0.f, // beta
x_grad_data, // C
k_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
// x^T * out_grad = y_grad
// (k, m) (m, n) -> (k, n)
if (param.y_grad) {
if (n_ == 1) {
lite::arm::math::sgemv(x_data,
out_grad_data,
y_grad_data,
true,
k_, // M
m_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(true, // is_transA
false, // is_transB,
k_, // M
n_, // N
m_, // K
1.0f, // alpha
x_data, // A
k_, // lda
out_grad_data, // B
n_, // ldb
0.f, // beta
y_grad_data, // C
n_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::MulGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
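A worked shape example for the mul_grad kernel above, with sizes chosen for illustration: flattening x with x_num_col_dims and y with y_num_col_dims gives the m_, k_, n_ values computed in Run(), and the two GEMMs in the kernel follow the comments "out_grad * y^T = x_grad" and "x^T * out_grad = y_grad".

// Illustration only; sizes are arbitrary.
// x: {8, 3, 4},  x_num_col_dims = 1  ->  flattened x: (m=8, k=12)
// y: {12, 6},    y_num_col_dims = 1  ->  flattened y: (k=12, n=6)
// out_grad: (8, 6)
// x_grad = out_grad * y^T : (8, 6) x (6, 12) -> (8, 12)
// y_grad = x^T * out_grad : (12, 8) x (8, 6) -> (12, 6)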
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class MulGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulGradParam;
void PrepareForRun() override;
void Run() override;
virtual ~MulGradCompute() = default;
private:
int m_, n_, k_;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) {
counter = ++(it->second);
}
return prefix + "_" + std::to_string(counter);
return prefix + "_" + paddle::lite::to_string(counter);
}
bool HasInputArg(const OpInfo* op_info,
......
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA))
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_CUDA))
return()
endif()
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA))
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
return()
endif()
......
......@@ -87,7 +87,8 @@ class Graph {
auto idx = Add(name, node);
CHECK_GE(idx, 1);
// Generate a unique name for the created HiAI IR
node->set_data(std::make_shared<T>(name + "__" + std::to_string(idx)));
node->set_data(
std::make_shared<T>(name + "__" + paddle::lite::to_string(idx)));
return node;
}
......
......@@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
split_op->create_dynamic_output_y(out_names.size());
int idx = 1;
for (auto& out_name : out_names) {
auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0);
auto zero_node =
graph->Add(out_name + "/zero" + paddle::lite::to_string(idx), 0);
auto add_node = graph->Add<ge::op::Add>(out_name);
auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx));
add_op->set_input_x1(*split_node->data(),
"y" + paddle::lite::to_string(idx));
add_op->set_input_x2(*zero_node->data());
idx++;
}
......
......@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() {
<< "[NPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[NPU] No output nodes found for building NPU model";
// Build the HiAI IR graph to HiAI om model as the device program
device_program_ = lite::npu::Device::Global().Build(
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
auto device_client = lite::npu::Device::Global().Build(
model_name_, device_inodes, device_onodes);
if (device_program_ == nullptr) {
if (device_client == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED;
}
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
// Query and check the dimensions of valid input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims;
if (device_program_->GetModelIOTensorDim(
if (device_program->client->GetModelIOTensorDim(
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!";
return subgraph::FAILED;
}
device_program->device_idims = device_idims;
device_program->device_odims = device_odims;
CHECK_EQ(device_idims.size(), device_inames_.size());
CHECK_EQ(device_odims.size(), device_onames_.size());
origin_idims_.resize(device_inames_.size());
......@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() {
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
......@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() {
device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i]));
}
device_program->origin_idims = origin_idims_;
for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
......@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() {
<< PrecisionToStr(precision);
break;
}
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth());
......@@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() {
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HiAI tensors
// init device_itensors_, device_otensors_, origin_otensors_
auto device_program = device_program_map_[inputs_shape_];
for (size_t i = 0; i < device_itensors_.size(); i++) {
device_itensors_[i]->Init(&(device_program->device_idims[i]));
std::memcpy(device_itensors_[i]->GetBuffer(),
origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size());
}
for (size_t i = 0; i < device_otensors_.size(); i++) {
device_otensors_[i]->Init(&(device_program->device_odims[i]));
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
origin_otensors_[i]->Resize(device_program->origin_odims[i]);
}
// Run the HiAI model by name
std::string key = "model_name"; // Note: key seems must be model_name
model_context_.AddPara(key, model_name_);
hiai::AiContext model_context;
model_context.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
......@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
};
int istamp;
auto start_time = GetCurrentUS();
CHECK_EQ(
device_program_->Process(
model_context_, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS);
CHECK_EQ(device_program->client->Process(
model_context, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output HiAI tensor to the buffer of origin output tensors
for (size_t i = 0; i < device_otensors_.size(); i++) {
std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
......@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() {
return 0;
}
bool SubgraphEngine::InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
for (auto origin_itensor : origin_itensors_) {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
inputs_shape_ = new_shape;
if (device_program_map_.count(inputs_shape_) > 0) {
return false;
}
return true;
}
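The device_program_map_ above is keyed by the full vector of input shapes, so each distinct shape combination gets its own compiled HiAI program; std::map works here because std::vector provides lexicographic operator<. A minimal sketch of the lookup pattern inside the engine, with hypothetical shapes:

  // Illustration only; shapes are arbitrary.
  std::vector<std::vector<int64_t>> key = {{1, 3, 224, 224}, {1, 1}};
  auto it = device_program_map_.find(key);
  if (it == device_program_map_.end()) {
    // First time this shape combination is seen: BuildDeviceProgram()
    // compiles a new om model and inserts it into the map.
  } else {
    // Reuse the cached client and its recorded input/output dims.
    auto cached = it->second;
    (void)cached;
  }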
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
......
......@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine {
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
: client(_client) {}
std::shared_ptr<hiai::AiModelMngerClient> client{nullptr};
std::vector<DDim> origin_idims{};
std::vector<DDim> origin_odims{};
std::vector<hiai::TensorDimension> device_idims{};
std::vector<hiai::TensorDimension> device_odims{};
};
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool InputShapeChanged() override;
std::string model_name_;
hiai::AiContext model_context_;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_;
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_;
std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
std::string model_name_{"model.om"};
std::vector<std::vector<int64_t>> inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
device_program_map_{};
std::vector<std::string> device_inames_{};
std::vector<std::string> device_onames_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
};
class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> {
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_OPENCL))
return ()
endif()
......@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context)
#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc
# DEPS tensor cl_context cl_wrapper cl_target_wrapper)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context)
......
......@@ -101,6 +101,7 @@ class ActivationComputeImageDefault
status = kernel.setArg(++arg_idx, scale_);
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
......@@ -112,6 +113,7 @@ class ActivationComputeImageDefault
VLOG(4) << "threshold:" << threshold_;
VLOG(4) << "scale:" << scale_;
VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
......@@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL(
// exp
REGISTER_LITE_KERNEL(
exp_act,
exp,
kOpenCL,
kFP16,
kImageDefault,
......@@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL(
// tanh
REGISTER_LITE_KERNEL(
tanh_act,
tanh,
kOpenCL,
kFP16,
kImageDefault,
......
......@@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) {
func_name = "sigmoid";
break;
case 6: // tanh
func_name = "tanh_act";
func_name = "tanh";
break;
case 7: // tanh
func_name = "swish";
break;
case 8: // tanh
func_name = "exp_act";
func_name = "exp";
break;
}
LOG(INFO) << "func_name: " << func_name;
......@@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
// exp
USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(exp, kOpenCL, kFP16, kImageDefault, ImageDefault);
// swish
USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
......@@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// tanh act
USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(tanh, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
......
......@@ -77,17 +77,21 @@ class BilinearInterpImageCompute
int out_h = out_dims[2];
int out_w = out_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "x_image: " << x_img;
// VLOG(4) << "out_image: " << out_img;
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
......@@ -96,6 +100,7 @@ class BilinearInterpImageCompute
<< ", align_delta: " << align_delta;
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -107,8 +112,10 @@ class BilinearInterpImageCompute
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
......@@ -142,9 +149,10 @@ class BilinearInterpImageCompute
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
VLOG(4) << "concat 输入尺寸: ";
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "concat input shape: ";
for (size_t i = 0; i < inputs.size(); i++) {
VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:"
......@@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
<< inputs[i]->dims()[3];
}
VLOG(4) << "concat 输出尺寸: ";
VLOG(4) << "concat output shape: ";
VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
......@@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
......@@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]);
#endif
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1];
......@@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
#endif
global_work_size =
cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
......
......@@ -78,6 +78,7 @@ void ConvImageCompute::PrepareForRun() {
VLOG(3) << "dilation_equal:" << dilation_equal;
VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3];
CHECK(pad_equal && stride_equal && dilation_equal);
if (kernel_h == 1 && kernel_w == 1) {
......@@ -85,9 +86,9 @@ void ConvImageCompute::PrepareForRun() {
if (param.x->dims()[1] % 4 == 0) {
kernel_func_names_.push_back("conv2d_1x1_simple");
} else {
kernel_func_names_.push_back("conv2d_1x1");
kernel_func_names_.push_back("conv2d_1x1_opt");
}
kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");
kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
......@@ -97,7 +98,7 @@ void ConvImageCompute::PrepareForRun() {
filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
impl_ = &ConvImageCompute::Conv2d1x1opt;
#define DEPTH_CONV_USE_SPL
#ifdef DEPTH_CONV_USE_SPL
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
......@@ -141,9 +142,10 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (kernel_h == 3 && kernel_h == 3) {
} else if (kernel_w == 3 && kernel_h == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3_opt");
kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
......@@ -156,6 +158,8 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::Conv2d3x3opt;
} else if (kernel_h == 5 && kernel_w == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");
......@@ -169,7 +173,27 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5;
#else
// conv2d_5x5_opt
kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch"
: "conv2d_5x5_opt");
kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<half_t> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5opt;
#endif
#undef CONV_5x5_OPT
} else if (kernel_h == 7 && kernel_w == 7) {
#define CONV_7x7_OPT
#ifndef CONV_7x7_OPT
// conv2d_7x7
kernel_func_names_.push_back("conv2d_7x7");
kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");
......@@ -183,6 +207,25 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7;
#else
// conv2d_7x7
kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch"
: "conv2d_7x7_opt");
kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<half_t> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
this->filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7opt;
#endif
#undef CONV_7x7_OPT
} else {
LOG(FATAL) << "conv image compute not support this condition yet! ";
}
......@@ -229,7 +272,7 @@ void ConvImageCompute::PrepareForRun() {
}
}
void ConvImageCompute::Conv2d1x1() {
void ConvImageCompute::Conv2d1x1opt() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -269,6 +312,7 @@ void ConvImageCompute::Conv2d1x1() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -290,7 +334,7 @@ void ConvImageCompute::Conv2d1x1() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
......@@ -313,10 +357,12 @@ void ConvImageCompute::Conv2d1x1() {
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
#endif
cl_int status;
int arg_idx = 0;
......@@ -363,21 +409,27 @@ void ConvImageCompute::Conv2d1x1() {
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -453,6 +505,7 @@ void ConvImageCompute::Conv2d3x3() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -477,6 +530,7 @@ void ConvImageCompute::Conv2d3x3() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -496,9 +550,12 @@ void ConvImageCompute::Conv2d3x3() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -513,7 +570,9 @@ void ConvImageCompute::Conv2d3x3() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -553,9 +612,11 @@ void ConvImageCompute::Conv2d3x3() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -586,7 +647,8 @@ void ConvImageCompute::Conv2d3x3opt() {
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
......@@ -611,8 +673,9 @@ void ConvImageCompute::Conv2d3x3opt() {
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
......@@ -632,6 +695,7 @@ void ConvImageCompute::Conv2d3x3opt() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -651,8 +715,11 @@ void ConvImageCompute::Conv2d3x3opt() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status;
int arg_idx = 0;
......@@ -667,7 +734,9 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -681,6 +750,8 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
......@@ -696,22 +767,27 @@ void ConvImageCompute::Conv2d3x3opt() {
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -767,6 +843,7 @@ void ConvImageCompute::Conv2d5x5() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -789,6 +866,7 @@ void ConvImageCompute::Conv2d5x5() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -808,9 +886,12 @@ void ConvImageCompute::Conv2d5x5() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -825,7 +906,9 @@ void ConvImageCompute::Conv2d5x5() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -855,9 +938,11 @@ void ConvImageCompute::Conv2d5x5() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -870,6 +955,172 @@ void ConvImageCompute::Conv2d5x5() {
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5opt() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
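// w_blk/h_blk are ceil-divided launch sizes: each work-item along the width is
// expected to handle w_blk_size output columns, so rounding up keeps the last
// partial block covered.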
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
// VLOG(4) << "out_image: " << out_image;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......@@ -912,6 +1163,7 @@ void ConvImageCompute::Conv2d7x7() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -934,6 +1186,7 @@ void ConvImageCompute::Conv2d7x7() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -953,9 +1206,12 @@ void ConvImageCompute::Conv2d7x7() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -970,7 +1226,9 @@ void ConvImageCompute::Conv2d7x7() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1000,9 +1258,11 @@ void ConvImageCompute::Conv2d7x7() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -1014,7 +1274,167 @@ void ConvImageCompute::Conv2d7x7() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7opt() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d 7x7 params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::DepthwiseConv2d3x3s1() {
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
......@@ -1071,7 +1491,9 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1099,12 +1521,16 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -1153,6 +1579,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
......@@ -1166,6 +1593,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
VLOG(4) << "x_dims[2] = " << x_dims[2];
VLOG(4) << "output_dims[3] = " << output_dims[3];
VLOG(4) << "output_dims[2] = " << output_dims[2];
#endif
cl_int status;
int arg_idx = 0;
......@@ -1185,7 +1613,9 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1261,6 +1691,7 @@ void ConvImageCompute::DepthwiseConv2d() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -1282,6 +1713,7 @@ void ConvImageCompute::DepthwiseConv2d() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -1303,9 +1735,12 @@ void ConvImageCompute::DepthwiseConv2d() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -1320,7 +1755,9 @@ void ConvImageCompute::DepthwiseConv2d() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1354,9 +1791,11 @@ void ConvImageCompute::DepthwiseConv2d() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......
......@@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void Run() override;
private:
void Conv2d1x1();
void Conv2d1x1opt();
void Conv2d3x3();
void Conv2d3x3opt();
void Conv2d5x5();
void Conv2d5x5opt();
void Conv2d7x7();
void Conv2d7x7opt();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
......
......@@ -510,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) {
const int dilation = 1;
const int stride = 2;
const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) {
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
......@@ -532,7 +532,7 @@ const int stride = 2;
#else // big scale with group
const int stride = 1;
const int group = 32 / 1;
const int batch_size = 1;
const int batch_size = 2;
const int ic = 32 / 1;
const int ih = 112 / 1;
const int iw = 112 / 1;
......@@ -558,7 +558,8 @@ const int stride = 2;
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1";
// CHECK(batch_size == 1) << "conv3x3 only supports
// batch_size == 1";
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
......@@ -886,13 +887,14 @@ TEST(conv2d, compute_image2d_5x5) {
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 5; oc += 1) { // oc
for (int ih = 5; ih < 8; ih += 1) { // ih
int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic
for (int ic = 2; ic < 6; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
for (std::string relu_flag : {"", "relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
......@@ -1006,10 +1008,10 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = 0.01 * gen(engine);
i = 0.5 * gen(engine);
}
for (auto& f : filter_v) {
f = 0.01 * gen(engine);
f = 0.5 * gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
......@@ -1216,9 +1218,10 @@ TEST(conv2d, compute_image2d_5x5) {
#undef LOOP_TEST
#undef PRINT_RESULT
#endif
#ifdef TEST_CONV_IMAGE_7x7
#undef FP16_ABS_DIFF
#define FP16_ABS_DIFF (1e0)
// #undef FP16_ABS_DIFF
// #define FP16_ABS_DIFF (1e-1)
// #define LOOP_TEST
TEST(conv2d, compute_image2d_7x7) {
// conv infos
......@@ -1230,13 +1233,13 @@ TEST(conv2d, compute_image2d_7x7) {
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 7; ih < 15; ih += 1) { // ih
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 6; oc += 1) { // oc
for (int ih = 7; ih < 8; ih += 1) { // ih
int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {"relu"}) {
for (int ic = 2; ic < 4; ic += 1) { // ic
for (bool bias_flag : {false, true}) {
for (std::string relu_flag : {"", "relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
......@@ -1343,14 +1346,16 @@ TEST(conv2d, compute_image2d_7x7) {
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
i = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1
i = 1;
#endif
}
int fiii = 1;
for (auto& f : filter_v) {
f = gen(engine);
f = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1
// f = fiii++;
f = 1;
#endif
}
......@@ -1424,7 +1429,8 @@ TEST(conv2d, compute_image2d_7x7) {
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// auto* filter_image2d = filter.mutable_data<float,
// auto* filter_image2d =
// filter.mutable_data < float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
......
......@@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() {
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
#endif
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
......@@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() {
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
#endif
}
} // namespace opencl
......
......@@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() {
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
......@@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
......@@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() {
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() {
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
......@@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......
......@@ -56,7 +56,7 @@ class ElementwiseMulImageCompute
} else {
kernel_func_name_ = "channel_mul_d2_hw";
}
} else if (y_dims.size() == 4) {
} else if (y_dims.size() == 4 || x_dims.size() == 4) {
kernel_func_name_ = "channel_mul_d4";
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
......@@ -80,12 +80,14 @@ class ElementwiseMulImageCompute
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << x->dims();
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape =
......@@ -101,10 +103,12 @@ class ElementwiseMulImageCompute
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -123,7 +127,9 @@ class ElementwiseMulImageCompute
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1 || y_dims.size() == 4) {
auto tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
// kernel: channel_mul_d1 / channel_mul_d4
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
......@@ -136,7 +142,9 @@ class ElementwiseMulImageCompute
} else if (y_dims.size() == 2) {
if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) {
auto tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
// kernel: channel_mul_d2_nc
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
......@@ -149,7 +157,9 @@ class ElementwiseMulImageCompute
} else {
auto y_tensor_h = y->dims()[0];
auto y_tensor_w = y->dims()[1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h;
#endif
// kernel: channel_mul_d2_hw
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
......@@ -162,6 +172,18 @@ class ElementwiseMulImageCompute
status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
CL_CHECK_FATAL(status);
}
} else if (x_dims.size() == 4) {
auto tensor_w = y_dims[y_dims.size() - 1];
VLOG(4) << "tensor_w:" << tensor_w;
// kernel: channel_mul_d4
cl_int status = kernel.setArg(arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
<< y_dims.size();
......@@ -179,8 +201,9 @@ class ElementwiseMulImageCompute
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
}
protected:
......
......@@ -62,6 +62,7 @@ void ElementwiseSubImageCompute::Run() {
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
......@@ -69,6 +70,7 @@ void ElementwiseSubImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
......@@ -83,10 +85,12 @@ void ElementwiseSubImageCompute::Run() {
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -104,8 +108,9 @@ void ElementwiseSubImageCompute::Run() {
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
......@@ -127,7 +132,10 @@ void ElementwiseSubImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......
......@@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_dims = out->dims();
auto in_dims = x->dims();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
......@@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
......@@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img);
......@@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -89,19 +89,23 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
int in_h = in_dims[2];
int in_w = in_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(in_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
#endif
int threads = 512;
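// Channels are packed four per image pixel, so group_size_x is the number of
// 4-channel slices, i.e. ceil(channel / 4).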
int group_size_x = (channel + 3) / 4;
......@@ -113,10 +117,13 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
cl::NDRange{static_cast<cl::size_type>(group_size_x * threads),
static_cast<cl::size_type>(group_size_y),
static_cast<cl::size_type>(1)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size:[3D]:" << local_work_size[0] << " "
<< local_work_size[1] << " " << local_work_size[2];
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......
......@@ -42,11 +42,13 @@ class IoCopyHostToOpenCLCompute
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kARM));
auto mem_size = param.x->memory_size();
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "param.x->memory_size():" << mem_size;
VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
VLOG(2) << "param.x->dims():" << param.x->dims();
VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
VLOG(2) << "param.y->dims():" << param.y->dims();
#endif
auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size);
CopyFromHostSync(data, param.x->raw_data(), mem_size);
}
......@@ -85,12 +87,14 @@ class IoCopykOpenCLToHostCompute
CHECK(param.x->target() == TARGET(kOpenCL));
auto mem_size = param.x->memory_size();
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "copy size " << mem_size;
VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
VLOG(2) << "param.x->dims():" << param.x->dims();
VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
VLOG(2) << "param.y->dims():" << param.y->dims();
VLOG(2) << "param.process_type:" << param.process_type;
#endif
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
const cl::Buffer* x_ptr;
......@@ -104,7 +108,9 @@ class IoCopykOpenCLToHostCompute
auto* wait_list = context.cl_wait_list();
auto it = wait_list->find(x_ptr);
if (it != wait_list->end()) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
#endif
auto& event = *(it->second);
event.wait();
} else {
......
......@@ -74,6 +74,7 @@ class LayoutComputeBufferChwToImageDefault
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "param.process_type:" << param.process_type;
VLOG(2) << "x_dims:" << x_dims;
VLOG(2) << "param.x->memory_size():" << param.x->memory_size();
......@@ -89,6 +90,7 @@ class LayoutComputeBufferChwToImageDefault
VLOG(2) << "Stride2:" << Stride2;
VLOG(2) << "Stride1:" << Stride1;
VLOG(2) << "Stride0:" << Stride0;
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
......@@ -177,6 +179,7 @@ class LayoutComputeImageDefaultToBufferChw
new_dims[4 - x_dims.size() + j] = x_dims[j];
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "param.process_type:" << param.process_type;
VLOG(2) << "x_dims:" << x_dims;
VLOG(2) << "param.x->memory_size():" << param.x->memory_size();
......@@ -186,6 +189,7 @@ class LayoutComputeImageDefaultToBufferChw
<< new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
VLOG(2) << "y_dims:" << y_dims;
VLOG(2) << "param.y->memory_size():" << param.y->memory_size();
#endif
size_t C = new_dims[1];
size_t in_height = new_dims[2];
......@@ -217,8 +221,10 @@ class LayoutComputeImageDefaultToBufferChw
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(C));
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]);
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
static_cast<cl::size_type>(new_dims[3]),
......
......@@ -65,6 +65,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_dims = out->dims();
auto in_dims = x->dims();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target(): " << TargetToStr(x->target());
VLOG(4) << "out->target(): " << TargetToStr(out->target());
VLOG(4) << "x->dims(): " << in_dims;
......@@ -74,6 +75,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "alpha: " << alpha_;
VLOG(4) << "beta: " << beta_;
VLOG(4) << "norm_region: " << norm_region_;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
......@@ -81,9 +83,12 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -97,8 +102,10 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
......@@ -130,9 +137,10 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -87,6 +87,7 @@ class NearestInterpComputeImageDefault
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "out_image_shape(w,h):" << out_image_shape["width"] << " "
......@@ -95,6 +96,7 @@ class NearestInterpComputeImageDefault
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
#endif
const std::vector<size_t>& default_work_size =
DefaultWorkSize(y_dims,
......
......@@ -71,10 +71,12 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
int out_h = out_dims[2];
int out_w = out_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
......@@ -82,11 +84,13 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -98,9 +102,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
int pad_h0 = pad2d_param_->paddings[0];
int pad_h1 = pad2d_param_->paddings[1];
int pad_w0 = pad2d_param_->paddings[2];
......@@ -144,9 +149,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -89,7 +89,7 @@ void pad2d_ref(const float *x_data,
}
}
#define LOOP_TEST
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(pad2d_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
......
......@@ -59,10 +59,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
std::vector<int> paddings = *param.paddings;
std::vector<int> strides = param.strides;
std::vector<int> ksize = param.ksize;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_pooling: " << global_pooling;
VLOG(4) << "pooling_type: " << pooling_type;
VLOG(4) << "paddings : " << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3] << " ";
#endif
if (global_pooling) {
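// Global pooling covers the whole spatial extent: zero the paddings and use the
// input H/W as the kernel size.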
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0;
......@@ -70,6 +74,8 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
ksize[i] = static_cast<int>(in_dims[i + 2]);
}
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "in_dims : [" << in_dims.size() << "]" << in_dims[0] << " "
<< in_dims[1] << " " << in_dims[2] << " " << in_dims[3];
VLOG(4) << "out_dims : [" << out_dims.size() << "]" << out_dims[0] << " "
......@@ -82,6 +88,8 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
<< ksize[1] << " " << ksize[2] << " " << ksize[3];
VLOG(4) << "paddings : [" << paddings.size() << "]" << paddings[0] << " "
<< paddings[1] << " " << paddings[2] << " " << paddings[3];
#endif
bool pads_equal =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
if (!pads_equal) {
......@@ -95,8 +103,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
// VLOG(4) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(out_dims);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// VLOG(4) << "out_image" << out_img;
......@@ -109,8 +119,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
int w = out_dims[3];
int nh = out_dims[0] * out_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size : [" << 3 << "]" << c_block << " " << w
<< " " << nh << " ";
#endif
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
......
......@@ -41,8 +41,6 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
}
void Run() override {
VLOG(4) << "reshape_compute run ... ";
auto& param = *param_.get_mutable<param_t>();
const Tensor* const x = param.x;
......@@ -64,8 +62,9 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height"));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_dims= " << out_dims;
#endif
const std::vector<size_t>& default_work_size = DefaultWorkSize(
out_dims,
DDim(std::vector<DDim::value_type>{
......@@ -94,6 +93,8 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
int out_Stride0 = out_W;
int out_Stride1 = out_H * out_W;
int out_Stride2 = out_C * out_H * out_W;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_C=" << out_C;
VLOG(4) << "out_H=" << out_H;
VLOG(4) << "out_W=" << out_W;
......@@ -104,17 +105,20 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "in_Stride1=" << in_Stride1;
VLOG(4) << "out_Stride0=" << out_Stride0;
VLOG(4) << "out_Stride1=" << out_Stride1;
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(x->target());
VLOG(4) << TargetToStr(param.output->target());
#endif
int arg_idx = 0;
cl_int status;
status = kernel.setArg(arg_idx, *x_image);
CL_CHECK_FATAL(status);
......@@ -199,8 +203,8 @@ REGISTER_LITE_KERNEL(reshape2,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......@@ -217,7 +221,7 @@ REGISTER_LITE_KERNEL(flatten,
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......@@ -235,7 +239,7 @@ REGISTER_LITE_KERNEL(flatten2,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......
......@@ -51,8 +51,10 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
// LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// LOG(INFO) << "out_image" << out_img;
......
......@@ -49,7 +49,7 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
CHECK_GE(idx, 1);
node->set_data(std::make_shared<xtcl::xExpr>(layer));
// Generate a unique name for the current XTCL layer
builder_.SetLayer(name + "__" + std::to_string(idx));
builder_.SetLayer(name + "__" + paddle::lite::to_string(idx));
return node;
}
......
......@@ -155,7 +155,7 @@ TEST(ListBuilder, basic) {
for (int i = 0; i < num_elems; i++) {
auto* elem = li.New();
elem->set("elem-" + std::to_string(i));
elem->set("elem-" + paddle::lite::to_string(i));
}
li.Save();
table.SaveToFile("2.bf");
......@@ -169,7 +169,7 @@ TEST(ListBuilder, basic) {
li1.Load();
for (int i = 0; i < num_elems; i++) {
ASSERT_EQ(li1.Get(i).data(), "elem-" + std::to_string(i));
ASSERT_EQ(li1.Get(i).data(), "elem-" + paddle::lite::to_string(i));
}
}
......
......@@ -144,6 +144,8 @@ add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS})
if (LITE_WITH_TRAIN)
add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS})
add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS})
add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS})
endif()
......
......@@ -80,6 +80,34 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings,
}
}
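// SmartInferShape caches the input dims/LoD from the previous run; when the current
// input matches the cache, the stored output dims/LoD are reused and the full
// InferShape pass is skipped.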
bool ConvOpLite::SmartInferShape() {
if (!last_input_shapes.empty()) {
if (last_input_shapes[0] == param_.x->dims() &&
last_input_lods[0] == param_.x->lod()) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.x->dims());
last_input_lods.push_back(param_.x->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool ConvOpLite::InferShape() const {
const auto in_dims = param_.x->dims();
const auto filter_dims = param_.filter->dims();
......@@ -104,9 +132,9 @@ bool ConvOpLite::InferShape() const {
// Set output dims
param_.output->Resize(lite::DDim(output_shape));
// share LoD
// param_.output->set_lod(param_.x->lod());
param_.output->set_lod(param_.x->lod());
return true;
}
......
......@@ -36,6 +36,7 @@ class ConvOpLite : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
// TODO(Superjomn) replace framework::OpDesc with a lite one.
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override {
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/elementwise_grad_ops.h"
#include <algorithm>
#include <cmath>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool ElementwiseGradOp::CheckShape() const {
CHECK_OR_FALSE(param_.XGrad || param_.YGrad);
CHECK_OR_FALSE(param_.OutGrad);
return true;
}
bool ElementwiseGradOp::InferShape() const {
auto x_dim = param_.X->dims();
auto y_dim = param_.Y->dims();
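// Each requested gradient takes the shape of its corresponding forward input.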
if (param_.XGrad) {
param_.XGrad->Resize(x_dim);
}
if (param_.YGrad) {
param_.YGrad->Resize(y_dim);
}
return true;
}
bool ElementwiseGradOp::AttachImpl(const cpp::OpDesc& opdesc,
lite::Scope* scope) {
auto Y_name = opdesc.Input("Y").front();
auto X_name = opdesc.Input("X").front();
auto Out_name = opdesc.Input("Out@GRAD").front();
CHECK(!opdesc.Output("X@GRAD").empty() || !opdesc.Output("Y@GRAD").empty())
<< "at least one of 'X@GRAD' and 'Y@GRAD' must not be empty";
if (!opdesc.Output("X@GRAD").empty()) {
auto x_grad_name = opdesc.Output("X@GRAD").front();
param_.XGrad = GetMutableVar<lite::Tensor>(scope, x_grad_name);
}
if (!opdesc.Output("Y@GRAD").empty()) {
auto y_grad_name = opdesc.Output("Y@GRAD").front();
param_.YGrad = GetMutableVar<lite::Tensor>(scope, y_grad_name);
}
param_.X = GetVar<lite::Tensor>(scope, X_name);
param_.Y = GetVar<lite::Tensor>(scope, Y_name);
param_.OutGrad = GetVar<lite::Tensor>(scope, Out_name);
param_.axis = opdesc.GetAttr<int>("axis");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(elementwise_sub_grad,
paddle::lite::operators::ElementwiseGradOp);
REGISTER_LITE_OP(elementwise_add_grad,
paddle::lite::operators::ElementwiseGradOp);
REGISTER_LITE_OP(elementwise_mul_grad,
paddle::lite::operators::ElementwiseGradOp);
REGISTER_LITE_OP(elementwise_max_grad,
paddle::lite::operators::ElementwiseGradOp);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class ElementwiseGradOp : public OpLite {
public:
explicit ElementwiseGradOp(const std::string& op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "elementwise_grad_op"; }
private:
mutable operators::ElementwiseGradParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -26,7 +26,38 @@ bool ElementwiseOp::CheckShape() const {
CHECK_OR_FALSE(param_.Out);
return true;
}
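// Same shape-caching shortcut as ConvOpLite::SmartInferShape, but the cache is
// keyed on both X and Y dims/LoD.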
bool ElementwiseOp::SmartInferShape() {
if (!last_input_shapes.empty()) {
if (last_input_shapes[0] == param_.X->dims() &&
last_input_shapes[1] == param_.Y->dims() &&
last_input_lods[0] == param_.X->lod() &&
last_input_lods[1] == param_.Y->lod()) {
param_.Out->Resize(last_output_shapes[0]);
param_.Out->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.X->dims());
last_input_lods.push_back(param_.X->lod());
last_input_shapes.push_back(param_.Y->dims());
last_input_lods.push_back(param_.Y->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.Out->dims());
last_output_lods.push_back(param_.Out->lod());
return true;
}
bool ElementwiseOp::InferShape() const {
auto x_dim = param_.X->dims();
auto y_dim = param_.Y->dims();
......@@ -81,6 +112,7 @@ bool ElementwiseOp::InferShape() const {
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
}
return true;
}
......
......@@ -28,6 +28,7 @@ class ElementwiseOp : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
......
......@@ -48,6 +48,33 @@ bool FcOpLite::CheckShape() const {
return true;
}
bool FcOpLite::SmartInferShape() {
if (!last_input_shapes.empty() && !last_output_shapes.empty()) {
if (last_input_shapes[0] == param_.input->dims() &&
last_input_lods[0] == param_.input->lod()) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.input->dims());
last_input_lods.push_back(param_.input->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool FcOpLite::InferShape() const {
const auto& input_dims = param_.input->dims();
const auto& w_dims = param_.w->dims();
......@@ -64,6 +91,7 @@ bool FcOpLite::InferShape() const {
// share LoD
param_.output->set_lod(param_.input->lod());
return true;
}
......
......@@ -36,6 +36,7 @@ class FcOpLite : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/mul_grad_op.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace operators {
bool MulGradOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.y);
CHECK_OR_FALSE(param_.output_grad);
CHECK_OR_FALSE(param_.x_grad || param_.y_grad);
CHECK_OR_FALSE(param_.x_num_col_dims);
CHECK_OR_FALSE(param_.y_num_col_dims);
const auto x_dims = param_.x->dims();
const auto y_dims = param_.y->dims();
const auto out_dims = param_.output_grad->dims();
CHECK_GT_OR_FALSE(x_dims.size(), static_cast<size_t>(param_.x_num_col_dims));
CHECK_GT_OR_FALSE(y_dims.size(), static_cast<size_t>(param_.y_num_col_dims));
auto x_flatten_dims = flatten_2d(x_dims, param_.x_num_col_dims);
auto y_flatten_dims = flatten_2d(y_dims, param_.y_num_col_dims);
auto out_flatten_dims = flatten_2d(out_dims, param_.x_num_col_dims);
// Out = X * Y;
CHECK_EQ_OR_FALSE(x_flatten_dims[1], y_flatten_dims[0]);
CHECK_EQ_OR_FALSE(x_flatten_dims[0], out_flatten_dims[0]);
CHECK_EQ_OR_FALSE(y_flatten_dims[1], out_flatten_dims[1]);
return true;
}
bool MulGradOpLite::InferShape() const {
const auto x_dims = param_.x->dims();
const auto y_dims = param_.y->dims();
if (param_.x_grad) {
param_.x_grad->Resize(x_dims);
param_.x_grad->set_lod(param_.x->lod());
}
if (param_.y_grad) {
param_.y_grad->Resize(y_dims);
param_.y_grad->set_lod(param_.y->lod());
}
return true;
}
bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
CHECK(!op_desc.Input("X").empty());
CHECK(!op_desc.Input("Y").empty());
CHECK(!op_desc.Input("Out@GRAD").empty());
CHECK(!op_desc.Output("X@GRAD").empty() || !op_desc.Output("Y@GRAD").empty())
<< "at least one of 'X@GRAD' and 'Y@GRAD' is not empty";
auto *x_var = scope->FindVar(op_desc.Input("X").front());
CHECK(x_var);
param_.x = &x_var->Get<Tensor>();
auto *y_var = scope->FindVar(op_desc.Input("Y").front());
CHECK(y_var);
param_.y = &y_var->Get<Tensor>();
auto *out_grad_var = scope->FindVar(op_desc.Input("Out@GRAD").front());
CHECK(out_grad_var);
param_.output_grad = &out_grad_var->Get<Tensor>();
if (!op_desc.Output("X@GRAD").empty()) {
auto *x_grad_var = scope->FindVar(op_desc.Output("X@GRAD").front());
CHECK(x_grad_var);
param_.x_grad = x_grad_var->GetMutable<Tensor>();
}
if (!op_desc.Output("Y@GRAD").empty()) {
auto *y_grad_var = scope->FindVar(op_desc.Output("Y@GRAD").front());
CHECK(y_grad_var);
param_.y_grad = y_grad_var->GetMutable<Tensor>();
}
param_.x_num_col_dims = op_desc.GetAttr<int>("x_num_col_dims");
param_.y_num_col_dims = op_desc.GetAttr<int>("y_num_col_dims");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(mul_grad, paddle::lite::operators::MulGradOpLite);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class MulGradOpLite : public OpLite {
public:
MulGradOpLite() {}
explicit MulGradOpLite(const std::string &type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
std::string DebugString() const override { return "mul_grad"; }
private:
mutable MulGradParam param_;
};
inline std::vector<int64_t> flatten_2d(DDim dims, int num_col_dims) {
std::vector<int64_t> flatten_dims{1, 1};
for (int i = 0; i < dims.size(); i++) {
if (i < num_col_dims) {
flatten_dims[0] *= dims[i];
} else {
flatten_dims[1] *= dims[i];
}
}
return flatten_dims;
}
} // namespace operators
} // namespace lite
} // namespace paddle
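To make the flatten-2d shape rule used by MulGradOpLite::CheckShape concrete, here is a small standalone worked example; the dims mirror the {4, 20} x {5, 4, 3, 2} case exercised by the tests later in this patch, and flatten2d below is a local re-statement for illustration, not the header's function:
#include <cassert>
#include <cstdint>
#include <vector>
// Same folding as flatten_2d above: dims before num_col_dims multiply into the
// row count, the remaining dims multiply into the column count.
static std::vector<int64_t> flatten2d(const std::vector<int64_t>& dims,
                                      int num_col_dims) {
  std::vector<int64_t> out{1, 1};
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    out[i < num_col_dims ? 0 : 1] *= dims[i];
  }
  return out;
}
int main() {
  // X = {4, 20}, x_num_col_dims = 1      -> {4, 20}
  // Y = {5, 4, 3, 2}, y_num_col_dims = 2 -> {20, 6}
  // Out = {4, 6}, flattened with x_num_col_dims = 1 -> {4, 6}
  auto xf = flatten2d({4, 20}, 1);
  auto yf = flatten2d({5, 4, 3, 2}, 2);
  auto of = flatten2d({4, 6}, 1);
  assert(xf[1] == yf[0]);                    // inner sizes agree: 20 == 20
  assert(xf[0] == of[0] && yf[1] == of[1]);  // outer sizes match Out
  return 0;
}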
......@@ -66,28 +66,6 @@ class MulOpLite : public OpLite {
mutable MulParam param_;
};
#ifdef LITE_WITH_TRAIN
class MulGradOpLite : public OpLite {
public:
MulGradOpLite() {}
explicit MulGradOpLite(const std::string &type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
std::string DebugString() const override { return "mul_grad"; }
private:
mutable MulGradParam param_;
};
#endif
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -387,10 +387,11 @@ struct ElementwiseParam {
};
struct ElementwiseGradParam {
const lite::Tensor* X{};
const lite::Tensor* Y{};
const lite::Tensor* Out_grad{};
lite::Tensor* X_grad{};
lite::Tensor* Y_grad{};
const lite::Tensor* OutGrad{};
lite::Tensor* XGrad{};
lite::Tensor* YGrad{};
int axis{-1}; // for broadcasting.
};
......
......@@ -29,10 +29,39 @@ bool SoftmaxOp::CheckShape() const {
return true;
}
bool SoftmaxOp::SmartInferShape() {
if (!last_input_shapes.empty() && !last_output_shapes.empty()) {
if (param_.x->dims() == last_input_shapes[0] &&
param_.x->lod() == last_input_lods[0]) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.x->dims());
last_input_lods.push_back(param_.x->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool SoftmaxOp::InferShape() const {
param_.output->Resize(param_.x->dims());
auto out_lod = param_.output->mutable_lod();
*out_lod = param_.x->lod();
return true;
}
......
......@@ -31,6 +31,7 @@ class SoftmaxOp : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
......@@ -65,6 +65,8 @@ if(LITE_BUILD_EXTRA)
if (LITE_WITH_TRAIN)
lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......
......@@ -128,7 +128,7 @@ class ConcateComputeTester : public arena::TestCase {
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = static_cast<float>(i + n);
}
const std::string x_name = "x_tensor_" + std::to_string(n);
const std::string x_name = "x_tensor_" + paddle::lite::to_string(n);
x_vct_.push_back(x_name);
SetCommonTensor(x_name, x_dims_, x_data.data());
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/elementwise_grad_compute.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/arm/elementwise_compute.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using param_t = operators::ElementwiseParam;
using grad_param_t = operators::ElementwiseGradParam;
using kernel_add_t = ElementwiseAddCompute;
using grad_kernel_add_t = ElementwiseAddGradCompute;
using kernel_sub_t = ElementwiseSubCompute;
using grad_kernel_sub_t = ElementwiseSubGradCompute;
void elementwise_common(grad_param_t& param, // NOLINT
std::vector<float>& out_grad, // NOLINT
std::vector<float>& x_grad, // NOLINT
std::vector<float>& y_grad, // NOLINT
std::string flag) {
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
if (x_dims == y_dims) {
for (int i = 0; i < x_dims.production(); ++i) {
if (flag == "add") {
x_grad[i] = out_grad[i];
y_grad[i] = out_grad[i];
}
if (flag == "sub") {
x_grad[i] = out_grad[i];
y_grad[i] = -out_grad[i];
}
}
} else {
LOG(FATAL) << "unsupport dims";
}
}
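// The reference gradients computed above follow directly from the chain rule for
// the same-shape case: for out = x + y, d(out_i)/d(x_i) = d(out_i)/d(y_i) = 1, so
// both x_grad and y_grad equal out_grad; for out = x - y, d(out_i)/d(y_i) = -1, so
// y_grad = -out_grad. Broadcasting cases would additionally need a reduction over
// the broadcast axes, which is why other dim combinations hit LOG(FATAL) here.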
class ElementwiseAddGradTester {
public:
explicit ElementwiseAddGradTester(const DDim& x_dims,
const DDim& y_dims,
int axis)
: x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_add_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
float* out_vec) {
Tensor x;
Tensor y;
Tensor output;
x.Resize(x_dims_);
y.Resize(y_dims_);
output.Resize(DDim(out_dims_));
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
param->X = &x;
param->Y = &y;
param->Out = &output;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < out_dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_add_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
const std::vector<float>& out_grad_vec,
float* x_grad_vec,
float* y_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor y;
Tensor y_grad;
Tensor out_grad;
x.Resize(x_dims_);
x_grad.Resize(x_dims_);
y.Resize(y_dims_);
y_grad.Resize(y_dims_);
out_grad.Resize(out_dims_);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
out_grad_data[i] = out_grad_vec[i];
}
param->X = &x;
param->XGrad = &x_grad;
param->Y = &y;
param->YGrad = &y_grad;
param->OutGrad = &out_grad;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
auto* y_grad_data = y_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_grad_vec[i] = x_grad_data[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_grad_vec[i] = y_grad_data[i];
}
}
void check_grad(float delta2, float max_grad_delta2) {
std::vector<int64_t> out_shape;
// infer shape
auto x_dim = x_dims_;
auto y_dim = y_dims_;
if (x_dim == y_dim) {
out_dims_ = x_dim;
} else {
int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size());
int axis = param_.axis;
axis =
(axis == -1 ? std::abs(static_cast<int>(x_dim.size() - y_dim.size()))
: axis);
std::vector<int64_t> x_dims_array(max_dim);
std::vector<int64_t> y_dims_array(max_dim);
std::vector<int64_t> out_dims_array(max_dim);
if (x_dim.size() > y_dim.size()) {
for (int i = 0; i < axis; ++i) {
y_dims_array[i] = 1;
}
if (axis + y_dim.size() < max_dim) {
for (int i = axis + y_dim.size(); i < max_dim; ++i) {
y_dims_array[i] = 1;
}
}
x_dims_array = x_dim.Vectorize();
for (int i = 0; i < y_dim.size(); ++i) {
y_dims_array[i + axis] = y_dim[i];
}
} else {
for (int i = 0; i < axis; ++i) {
x_dims_array[i] = 1;
}
if (axis + x_dim.size() < max_dim) {
for (int i = axis + x_dim.size(); i < max_dim; ++i) {
x_dims_array[i] = 1;
}
}
y_dims_array = y_dim.Vectorize();
for (int i = 0; i < x_dim.size(); ++i) {
x_dims_array[i + axis] = x_dim[i];
}
}
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
out_dims_array[i] = -1;
} else {
out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
}
}
out_dims_ = DDim(out_dims_array);
}
// infer end
// forward
std::vector<float> x(x_dims_.production());
std::vector<float> y(y_dims_.production());
std::vector<float> out(out_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
for (int i = 0; i < x_dims_.production(); i++) {
LOG(INFO) << "x_" << i << ": " << x[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
LOG(INFO) << "y_" << i << ": " << y[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
LOG(INFO) << "out_" << i << ": " << out[i];
}
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
std::vector<float> y_grad(y_dims_.production());
for (int i = 0; i < out_dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_,
&grad_kernel_,
x,
y,
out_grad,
x_grad.data(),
y_grad.data());
for (int i = 0; i < x_grad.size(); i++) {
LOG(INFO) << "x_grad_" << i << ": " << x_grad[i];
}
for (int i = 0; i < y_grad.size(); i++) {
LOG(INFO) << "y_grad_" << i << ": " << y_grad[i];
}
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
std::vector<float> out_delta(out_dims_.production());
Tensor tensor_x;
Tensor tensor_y;
tensor_x.Resize(x_dims_);
tensor_y.Resize(y_dims_);
grad_param_.X = &tensor_x;
grad_param_.Y = &tensor_y;
elementwise_common(grad_param_, out_grad, x_delta, y_delta, "add");
float max_grad_delta = 0.0005;
for (int i = 0; i < x_dims_.production(); i++) {
EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta);
EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta);
}
}
private:
DDim x_dims_;
DDim y_dims_;
DDim out_dims_;
int axis_;
kernel_add_t kernel_;
grad_kernel_add_t grad_kernel_;
param_t param_;
grad_param_t grad_param_;
};
class ElementwiseSubGradTester {
public:
explicit ElementwiseSubGradTester(const DDim& x_dims,
const DDim& y_dims,
int axis)
: x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_sub_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
float* out_vec) {
Tensor x;
Tensor y;
Tensor output;
x.Resize(x_dims_);
y.Resize(y_dims_);
output.Resize(DDim(out_dims_));
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
param->X = &x;
param->Y = &y;
param->Out = &output;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < out_dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_sub_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
const std::vector<float>& out_grad_vec,
float* x_grad_vec,
float* y_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor y;
Tensor y_grad;
Tensor out_grad;
x.Resize(x_dims_);
x_grad.Resize(x_dims_);
y.Resize(y_dims_);
y_grad.Resize(y_dims_);
out_grad.Resize(out_dims_);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
out_grad_data[i] = out_grad_vec[i];
}
param->X = &x;
param->XGrad = &x_grad;
param->Y = &y;
param->YGrad = &y_grad;
param->OutGrad = &out_grad;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
auto* y_grad_data = y_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_grad_vec[i] = x_grad_data[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_grad_vec[i] = y_grad_data[i];
}
}
void check_grad(float delta2, float max_grad_delta2) {
std::vector<int64_t> out_shape;
// infer shape
auto x_dim = x_dims_;
auto y_dim = y_dims_;
if (x_dim == y_dim) {
out_dims_ = x_dim;
} else {
int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size());
int axis = param_.axis;
axis =
(axis == -1 ? std::abs(static_cast<int>(x_dim.size() - y_dim.size()))
: axis);
std::vector<int64_t> x_dims_array(max_dim);
std::vector<int64_t> y_dims_array(max_dim);
std::vector<int64_t> out_dims_array(max_dim);
if (x_dim.size() > y_dim.size()) {
for (int i = 0; i < axis; ++i) {
y_dims_array[i] = 1;
}
if (axis + y_dim.size() < max_dim) {
for (int i = axis + y_dim.size(); i < max_dim; ++i) {
y_dims_array[i] = 1;
}
}
x_dims_array = x_dim.Vectorize();
for (int i = 0; i < y_dim.size(); ++i) {
y_dims_array[i + axis] = y_dim[i];
}
} else {
for (int i = 0; i < axis; ++i) {
x_dims_array[i] = 1;
}
if (axis + x_dim.size() < max_dim) {
for (int i = axis + x_dim.size(); i < max_dim; ++i) {
x_dims_array[i] = 1;
}
}
y_dims_array = y_dim.Vectorize();
for (int i = 0; i < x_dim.size(); ++i) {
x_dims_array[i + axis] = x_dim[i];
}
}
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
out_dims_array[i] = -1;
} else {
out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
}
}
out_dims_ = DDim(out_dims_array);
}
// infer end
// forward
std::vector<float> x(x_dims_.production());
std::vector<float> y(y_dims_.production());
std::vector<float> out(out_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
for (int i = 0; i < x_dims_.production(); i++) {
LOG(INFO) << "x_" << i << ": " << x[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
LOG(INFO) << "y_" << i << ": " << y[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
LOG(INFO) << "out_" << i << ": " << out[i];
}
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
std::vector<float> y_grad(y_dims_.production());
for (int i = 0; i < out_dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_,
&grad_kernel_,
x,
y,
out_grad,
x_grad.data(),
y_grad.data());
for (int i = 0; i < x_grad.size(); i++) {
LOG(INFO) << "x_grad_" << i << ": " << x_grad[i];
}
for (int i = 0; i < y_grad.size(); i++) {
LOG(INFO) << "y_grad_" << i << ": " << y_grad[i];
}
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
std::vector<float> out_delta(out_dims_.production());
Tensor tensor_x;
Tensor tensor_y;
tensor_x.Resize(x_dims_);
tensor_y.Resize(y_dims_);
grad_param_.X = &tensor_x;
grad_param_.Y = &tensor_y;
elementwise_common(grad_param_, out_grad, x_delta, y_delta, "sub");
float max_grad_delta = 0.0005;
for (int i = 0; i < x_dims_.production(); i++) {
EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta);
EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta);
}
}
private:
DDim x_dims_;
DDim y_dims_;
DDim out_dims_;
int axis_;
kernel_sub_t kernel_;
grad_kernel_sub_t grad_kernel_;
param_t param_;
grad_param_t grad_param_;
};
void TestNormalCase(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
int axis) {
std::unique_ptr<ElementwiseAddGradTester> tester_add(
new ElementwiseAddGradTester(DDim(x_dims), DDim(y_dims), axis));
std::unique_ptr<ElementwiseSubGradTester> tester_sub(
new ElementwiseSubGradTester(DDim(x_dims), DDim(y_dims), axis));
tester_add->prepare_kernel();
tester_sub->prepare_kernel();
float delta = 0.001;
float max_grad_delta = 0.005;
tester_add->check_grad(delta, max_grad_delta);
tester_sub->check_grad(delta, max_grad_delta);
}
TEST(elementwise_grad_arm, compute) {
LOG(INFO) << "Test Elementwise grad";
DeviceInfo::Init();
TestNormalCase({3, 2}, {3, 2}, 0);
TestNormalCase({3, 5}, {3, 5}, 1);
TestNormalCase({3, 4, 3}, {3, 4, 3}, 0);
TestNormalCase({9, 2, 5}, {9, 2, 5}, 1);
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_add_grad, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
......@@ -52,7 +52,8 @@ class FillConstantComputeTester : public arena::TestCase {
is_use_shape_tensor_list_(is_use_shape_tensor_list) {
if (is_use_shape_tensor_list) {
for (int i = 0; i < shape.size(); i++) {
shape_tensor_list_.push_back(shape_tensor_ + std::to_string(i));
shape_tensor_list_.push_back(shape_tensor_ +
paddle::lite::to_string(i));
}
}
}
......
......@@ -109,6 +109,7 @@ void TestMul(const std::vector<int64_t>& x_dims,
int y_num_col_dims,
const Place& place,
float abs_error) {
LOG(INFO) << "run test arm";
std::unique_ptr<arena::TestCase> tester(new MulComputeTester(place,
"def",
DDim(x_dims),
......@@ -131,7 +132,6 @@ TEST(Mul, precision) {
#else
return;
#endif
TestMul({4, 5}, {5, 4}, 1, 1, place, abs_error);
TestMul({4, 5}, {5, 4, 3, 2}, 1, 1, place, abs_error);
TestMul({4, 20}, {5, 4, 3, 2}, 1, 2, place, abs_error);
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/mul_grad_compute.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/arm/mul_compute.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using param_t = operators::MulParam;
using grad_param_t = operators::MulGradParam;
using kernel_t = MulCompute;
using grad_kernel_t = MulGradCompute;
class MulGradTester {
public:
explicit MulGradTester(const DDim& x_dims,
const DDim& y_dims,
int x_num_col_dims,
int y_num_col_dims)
: x_dims_(x_dims),
y_dims_(y_dims),
x_num_col_dims_(x_num_col_dims),
y_num_col_dims_(y_num_col_dims) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx2(new KernelContext);
ctx2->As<ARMContext>();
delta_kernel_.SetContext(std::move(ctx2));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
float* out_vec) {
Tensor x;
Tensor y;
Tensor output;
x.Resize(x_dims_);
y.Resize(y_dims_);
output.Resize(DDim(out_dims_));
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
param->x = &x;
param->y = &y;
param->output = &output;
param->x_num_col_dims = x_num_col_dims_;
param->y_num_col_dims = y_num_col_dims_;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < out_dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
const std::vector<float>& out_grad_vec,
float* x_grad_vec,
float* y_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor y;
Tensor y_grad;
Tensor out_grad;
x.Resize(x_dims_);
x_grad.Resize(x_dims_);
y.Resize(y_dims_);
y_grad.Resize(y_dims_);
out_grad.Resize(out_dims_);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
out_grad_data[i] = out_grad_vec[i];
}
param->x = &x;
param->x_grad = &x_grad;
param->y = &y;
param->y_grad = &y_grad;
param->output_grad = &out_grad;
param->x_num_col_dims = x_num_col_dims_;
param->y_num_col_dims = y_num_col_dims_;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
auto* y_grad_data = y_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_grad_vec[i] = x_grad_data[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_grad_vec[i] = y_grad_data[i];
}
}
void check_grad() {
std::vector<int64_t> out_shape;
for (int i = 0; i < x_num_col_dims_; i++) {
out_shape.push_back(x_dims_[i]);
}
for (int i = y_num_col_dims_; i < y_dims_.size(); i++) {
out_shape.push_back(y_dims_[i]);
}
out_dims_ = DDim(out_shape);
// forward
std::vector<float> x(x_dims_.production());
std::vector<float> y(y_dims_.production());
std::vector<float> out(out_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
std::vector<float> y_grad(y_dims_.production());
for (int i = 0; i < out_dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_,
&grad_kernel_,
x,
y,
out_grad,
x_grad.data(),
y_grad.data());
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
std::vector<float> out_delta(out_dims_.production());
float delta = 0.001;
float max_grad_delta = 0.005;
for (int i = 0; i < x_dims_.production(); i++) {
for (int j = 0; j < x_dims_.production(); j++) {
if (i == j) {
x_delta[j] = x[j] + delta;
} else {
x_delta[j] = x[j];
}
}
this->run_forward(
&delta_param_, &delta_kernel_, x_delta, y, out_delta.data());
float sum = 0;
for (int j = 0; j < out_dims_.production(); j++) {
sum += (out_delta[j] - out[j]);
}
EXPECT_NEAR(x_grad[i], sum / delta, max_grad_delta);
}
for (int i = 0; i < y_dims_.production(); i++) {
for (int j = 0; j < y_dims_.production(); j++) {
y_delta[j] = i == j ? y[j] + delta : y[j];
}
this->run_forward(
&delta_param_, &delta_kernel_, x, y_delta, out_delta.data());
float sum = 0;
for (int j = 0; j < out_dims_.production(); j++) {
sum += out_delta[j] - out[j];
}
EXPECT_NEAR(y_grad[i], sum / delta, max_grad_delta);
}
}
private:
DDim x_dims_;
DDim y_dims_;
DDim out_dims_;
int x_num_col_dims_;
int y_num_col_dims_;
kernel_t kernel_;
kernel_t delta_kernel_;
grad_kernel_t grad_kernel_;
param_t param_;
param_t delta_param_;
grad_param_t grad_param_;
};
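// A note on the finite-difference check in check_grad above: out_grad is filled
// with ones, so the scalar being differentiated is effectively L = sum_j out_j and
// the backward kernel returns dL/dx_i. Re-running the forward pass with x_i
// perturbed by `delta` gives the one-sided estimate
//   dL/dx_i ~= (sum_j out_j(x + delta * e_i) - sum_j out_j(x)) / delta,
// which is exactly the `sum / delta` value compared against x_grad[i] (and
// likewise for y_grad) with EXPECT_NEAR.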
void TestNormalCase(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
int x_num_col_dims,
int y_num_col_dims) {
std::unique_ptr<MulGradTester> tester(new MulGradTester(
DDim(x_dims), DDim(y_dims), x_num_col_dims, y_num_col_dims));
tester->prepare_kernel();
tester->check_grad();
}
TEST(mul_grad_arm, compute) {
LOG(INFO) << "Test Mul grad";
DeviceInfo::Init();
TestNormalCase({1, 3}, {3, 2}, 1, 1);
TestNormalCase({3, 2}, {2, 1}, 1, 1);
TestNormalCase({3, 1}, {1, 7}, 1, 1);
TestNormalCase({2, 3}, {3, 2}, 1, 1);
TestNormalCase({4, 5}, {5, 4}, 1, 1);
TestNormalCase({4, 5}, {5, 4, 3, 2}, 1, 1);
TestNormalCase({3, 4}, {2, 2, 3}, 1, 2);
TestNormalCase({4, 20}, {5, 4, 3, 2}, 1, 2);
TestNormalCase({4, 60}, {5, 4, 3, 2}, 1, 3);
TestNormalCase({2, 3, 4, 5}, {60, 4}, 1, 1);
TestNormalCase({2, 3, 4, 5}, {20, 4}, 2, 1);
TestNormalCase({2, 3, 4, 5}, {5, 4}, 3, 1);
TestNormalCase({2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1);
TestNormalCase({2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2);
TestNormalCase({2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2);
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul_grad, kARM, kFloat, kNCHW, def);
......@@ -45,7 +45,8 @@ class ReshapeComputeTester : public arena::TestCase {
: TestCase(place, alias), dims_(dims) {
if (is_shape_tensor_vct) {
for (size_t i = 0; i < shape.size(); i++) {
shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i));
shape_tensor_vct_.emplace_back(op_type_ + "/shape" +
paddle::lite::to_string(i));
}
} else if (is_shape_tensor) {
shape_tensor_ = op_type_ + "/shape";
......
......@@ -168,8 +168,9 @@ class SliceComputeTester : public arena::TestCase {
std::vector<std::string> ends_tensor_list_;
for (int i = 0; i < starts_.size(); ++i) {
starts_tensor_list_.push_back("starts_tensor_list_" +
std::to_string(i));
ends_tensor_list_.push_back("ends_tensor_list_" + std::to_string(i));
paddle::lite::to_string(i));
ends_tensor_list_.push_back("ends_tensor_list_" +
paddle::lite::to_string(i));
}
op_desc->SetInput("StartsTensorList", {starts_tensor_list_});
op_desc->SetInput("EndsTensorList", {ends_tensor_list_});
......@@ -203,15 +204,15 @@ class SliceComputeTester : public arena::TestCase {
} else if (use_tensor_list_) {
Scope& scope_ = this->scope();
for (int i = 0; i < starts_.size(); ++i) {
auto* tensor =
scope_.NewTensor("starts_tensor_list_" + std::to_string(i));
auto* tensor = scope_.NewTensor("starts_tensor_list_" +
paddle::lite::to_string(i));
tensor->Resize(DDim({1}));
auto* d = tensor->mutable_data<int>();
d[0] = starts_[i];
}
for (int i = 0; i < ends_.size(); ++i) {
auto* tensor =
scope_.NewTensor("ends_tensor_list_" + std::to_string(i));
scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i));
tensor->Resize(DDim({1}));
auto* d = tensor->mutable_data<int>();
d[0] = ends_[i];
......
......@@ -123,7 +123,7 @@ class UnsqueezeComputeTester : public arena::TestCase {
} else if (input_axes_flag_ == 3) {
std::string name = "axes_tensor_";
for (size_t i = 0; i < axes_.size(); i++) {
name = name + std::to_string(i);
name = name + paddle::lite::to_string(i);
axes_tensor_list_.push_back(name);
SetCommonTensor(name, DDim({1}), &axes_[i]);
}
......
......@@ -291,6 +291,8 @@ function make_ios {
-DLITE_ON_TINY_PUBLISH=ON \
-DLITE_WITH_OPENMP=OFF \
-DWITH_ARM_DOTPROD=OFF \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DARM_TARGET_ARCH_ABI=$abi \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
......@@ -354,10 +356,12 @@ function make_x86 {
-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DLITE_WITH_ARM=OFF \
-DLITE_WITH_PYTHON=$BUILD_PYTHON \
-DWITH_GPU=OFF \
-DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_XPU=$BUID_XPU \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT
make publish_inference -j$NUM_PROC
cd -
......
......@@ -184,7 +184,7 @@ function build_opencl {
return 0
fi
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}.opencl
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
mkdir -p $build_dir
cd $build_dir
......@@ -193,11 +193,10 @@ function build_opencl {
cmake_opencl ${os} ${abi} ${lang}
make opencl_clhpp -j$NUM_CORES_FOR_COMPILE
build $TESTS_FILE
# test publish inference lib
make publish_inference -j$NUM_CORES_FOR_COMPILE
}
# This method is only called in CI.
function cmake_x86_for_CI {
prepare_workspace # fake an empty __generated_code__.cc to pass cmake.
......@@ -387,7 +386,7 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass")
skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
......@@ -755,16 +754,58 @@ function arm_push_necessary_file {
adb -s ${device} push ${testpath} ${adb_work_dir}
}
function test_opencl {
os=$1
abi=$2
lang=$3
device=$4
if [[ ${os} == "armlinux" ]]; then
# TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf
echo "Skip test arm linux yet. armlinux must in another docker"
return 0
fi
if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then
echo "android do not need armv7hf"
return 0
fi
# prepare for CXXApi test
local adb="adb -s ${device}"
$adb shell mkdir -p /data/local/tmp/lite_naive_model_opt
# opencl test should be marked with `opencl`
opencl_test_mark="opencl"
for _test in $(cat $TESTS_FILE); do
# tell if this test is marked with `opencl`
if [[ $_test == *$opencl_test_mark* ]]; then
test_arm_android $_test $device
fi
done
}
function build_test_arm_opencl {
########################################################################
cur=$PWD
# job 1-4 must be in one runner
prepare_adb_devices
# job 1
build_opencl "android" "armv8" "gcc"
adb -s $device_armv8 shell 'rm -rf /data/local/tmp/*'
run_gen_code_test ${device_armv8}
test_opencl "android" "armv8" "gcc" ${device_armv8}
cd $cur
# job 2
build_opencl "android" "armv7" "gcc"
adb -s $device_armv7 shell 'rm -rf /data/local/tmp/*'
run_gen_code_test ${device_armv7}
test_opencl "android" "armv7" "gcc" ${device_armv7}
cd $cur
echo "Done"
......@@ -1099,6 +1140,8 @@ function main {
;;
build_test_arm_opencl)
build_test_arm_opencl
build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1
build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu
shift
;;
build_test_arm_subtask_android)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this module will record kernels that are not in valid_places into all_kernel_faked.cc
from __future__ import print_function
import sys
......@@ -18,12 +19,13 @@ import logging
from ast import RegisterLiteKernelParser
from utils import *
if len(sys.argv) != 4:
if len(sys.argv) != 5:
print("Error: create_fake_kernel_registry.py requires three inputs!")
exit(1)
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
kernelmap_path = sys.argv[3]
kernels_list_path = sys.argv[1]
faked_kernels_list_path = sys.argv[2]
dest_path = sys.argv[3]
kernelmap_path = sys.argv[4]
out_lines = [
'#pragma once',
......@@ -77,68 +79,85 @@ const std::map<std::string, std::string> kernel2path_map{
'''
]
def parse_fake_kernels_from_path(list_path):
with open(list_path) as f:
paths = set([path for path in f])
for path in paths:
print('path', path)
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format(
op_type=k.op_type,
target=k.target,
precision=k.precision,
data_layout=k.data_layout,
alias=k.alias
)
kernel_define = fake_kernel % (
kernel_name,
k.target,
k.precision,
k.data_layout,
kernel_name
)
out_lines.append(kernel_define)
out_lines.append("")
key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % (
k.op_type,
k.target,
k.precision,
k.data_layout,
'::paddle::lite::' + kernel_name,
k.alias
)
out_lines.append(key)
for input in k.inputs:
io = ' .BindInput("%s", {%s})' % (input.name, input.type)
out_lines.append(io)
for output in k.outputs:
io = ' .BindOutput("%s", {%s})' % (output.name, output.type)
out_lines.append(io)
out_lines.append(" .Finalize();")
out_lines.append("")
out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias))
def parse_supported_kernels_from_path(list_path):
with open(list_path) as f:
paths = set([path for path in f])
for path in paths:
print('path', path)
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
index = path.rindex('/')
filename = path[index + 1:]
map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
filename.strip()
)
kernel_src_map_lines.append(map_element)
parse_fake_kernels_from_path(faked_kernels_list_path)
parse_supported_kernels_from_path(faked_kernels_list_path)
parse_supported_kernels_from_path(kernels_list_path)
with open(ops_list_path) as f:
paths = set([path for path in f])
for path in paths:
print('path', path)
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format(
op_type = k.op_type,
target = k.target,
precision = k.precision,
data_layout = k.data_layout,
alias = k.alias,
)
kernel_define = fake_kernel % (
kernel_name,
k.target,
k.precision,
k.data_layout,
kernel_name,
)
out_lines.append(kernel_define)
out_lines.append("")
key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % (
k.op_type,
k.target,
k.precision,
k.data_layout,
'::paddle::lite::' + kernel_name,
k.alias,
)
out_lines.append(key)
for input in k.inputs:
io = ' .BindInput("%s", {%s})' % (input.name, input.type)
out_lines.append(io)
for output in k.outputs:
io = ' .BindOutput("%s", {%s})' % (output.name, output.type)
out_lines.append(io)
out_lines.append(" .Finalize();")
out_lines.append("")
out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias))
index = path.rindex('/')
filename = path[index + 1:]
map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
filename.strip()
)
kernel_src_map_lines.append(map_element)
with open(dest_path, 'w') as f:
logging.info("write kernel list to %s" % dest_path)
f.write('\n'.join(out_lines))
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this module will record supported ops from kernels_src.txt
from __future__ import print_function
import sys
......@@ -18,12 +19,13 @@ import logging
from ast import RegisterLiteKernelParser
from ast import RegisterLiteOpParser
if len(sys.argv) != 4:
print("Error: record_supported_kernel_op.py requires three inputs!")
exit(1)
if len(sys.argv) != 5:
print("Error: record_supported_kernel_op.py requires four inputs!")
sys.exit(1)
kernels_list_path = sys.argv[1]
ops_list_path = sys.argv[2]
kernel_op_map_dest_path = sys.argv[3]
faked_kernels_list_path = sys.argv[2]
ops_list_path = sys.argv[3]
kernel_op_map_dest_path = sys.argv[4]
out_lines = [
......@@ -51,11 +53,11 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
'''
]
ops_lines=[]
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[]]
valid_ops = [[], [], [], [], [], [], [], [], [], []]
class TargetType:
kUnk = 0
kHost = 1
......@@ -78,8 +80,21 @@ with open(kernels_list_path) as f:
kernel_parser.parse()
for k in kernel_parser.kernels:
if hasattr(TargetType, k.target):
index=getattr(TargetType, k.target)
index = getattr(TargetType, k.target)
valid_ops[index].append(k.op_type)
# record op_info of valid kernels into `valid_ops` according to different target type
with open(faked_kernels_list_path) as f:
paths = set([path for path in f])
for path in paths:
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
if hasattr(TargetType, k.target):
index = getattr(TargetType, k.target)
valid_ops[index].append(k.op_type)
# clear the repeated ops
for target in valid_targets:
......@@ -114,7 +129,7 @@ with open(kernel_op_map_dest_path, 'w') as f:
f.write('\n'.join(out_lines))
# write kernels into head file
for target in valid_targets:
if len(valid_ops[getattr(TargetType, target)]) == 0 :
if len(valid_ops[getattr(TargetType, target)]) == 0:
f.write("\n // %s_OPS: " %target)
f.write('\n {},')
else:
......
if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
lite_cc_library(paddle_cv_arm SRCS
image_convert.cc
bgr_rotate.cc
paddle_image_preprocess.cc
image2tensor.cc
image_flip.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// ncnn license
// Tencent is pleased to support the open source community by making ncnn
// available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "lite/utils/cv/bgr_rotate.h"
#include <arm_neon.h>
#include <math.h>
#include <algorithm>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc(src, dst, w_in, h_in);
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
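// A scalar reference for the 90-degree case (illustrative sketch only; the NEON
// kernels below perform the same mapping in 8x8 blocks). Each input pixel (i, j)
// is written to output row j, column (h_in - 1 - i), matching the diagram above
// and the per-pixel tail loops of rotate90_hwc.
inline void rotate90_hwc_reference(const uint8_t* src,
                                   uint8_t* dst,
                                   int w_in,
                                   int h_in) {
  for (int i = 0; i < h_in; ++i) {
    for (int j = 0; j < w_in; ++j) {
      const uint8_t* s = src + (i * w_in + j) * 3;
      uint8_t* d = dst + (j * h_in + (h_in - 1 - i)) * 3;
      d[0] = s[0];
      d[1] = s[1];
      d[2] = s[2];
    }
  }
}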
#ifdef __aarch64__
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int64_t stride_h = 4 * win;
int64_t stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
uint8_t* outptr1 = outptr0 + wout;
uint8_t* outptr2 = outptr1 + wout;
uint8_t* outptr3 = outptr2 + wout;
uint8_t* outptr4 = outptr3 + wout;
uint8_t* outptr5 = outptr4 + wout;
uint8_t* outptr6 = outptr5 + wout;
uint8_t* outptr7 = outptr6 + wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
"rev64 v12.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v16.8b, v16.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v17.8b, v17.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
// "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n"
// //00 10 20 30 04 14 24 34
// "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n"
// //02 12 22 32
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"rev64 v6.8b, v6.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v7.8b, v7.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v8.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v24.8b, v24.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v25.8b, v25.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v26.8b, v26.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v9.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v10.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v11.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v27.8b, v27.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v28.8b, v28.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v29.8b, v29.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v0.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v1.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v2.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v18.8b, v18.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v19.8b, v19.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v20.8b, v20.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (w_out - 1 - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// process the image in 8x8 blocks
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90: write the results out in reverse order; equivalently, do rotate90 first and then flip along the Y axis.
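// For clarity, a minimal scalar sketch of the mapping the NEON code below
// implements (this helper is an illustrative addition and is not called by
// the optimized paths): dst(h_out - 1 - j, i) = src(i, j), with w_out = h_in
// and h_out = w_in for a 270-degree rotation of an HWC BGR image.
static inline void rotate270_hwc_reference(const uint8_t* src,
                                           uint8_t* dst,
                                           int w_in,
                                           int h_in) {
  int w_out = h_in;
  int h_out = w_in;
  for (int i = 0; i < h_in; ++i) {
    for (int j = 0; j < w_in; ++j) {
      const uint8_t* sp = src + (i * w_in + j) * 3;
      uint8_t* dp = dst + ((h_out - 1 - j) * w_out + i) * 3;
      dp[0] = sp[0];  // b
      dp[1] = sp[1];  // g
      dp[2] = sp[2];  // r
    }
  }
}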
#ifdef __aarch64__
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int64_t stride_h = 4 * win;
int64_t stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// process the image in 8x8 blocks
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
uint8_t* outptr1 = outptr0 - wout;
uint8_t* outptr2 = outptr1 - wout;
uint8_t* outptr3 = outptr2 - wout;
uint8_t* outptr4 = outptr3 - wout;
uint8_t* outptr5 = outptr4 - wout;
uint8_t* outptr6 = outptr5 - wout;
uint8_t* outptr7 = outptr6 - wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// process the image in 8x8 blocks
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// flip both x and y (equivalent to a 180-degree rotation)
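// A minimal scalar sketch of the 180-degree mapping implemented below
// (illustrative addition only, not called by the NEON paths):
// dst(h_in - 1 - i, w - 1 - j) = src(i, j) for an HWC BGR image of width w.
static inline void rotate180_hwc_reference(const uint8_t* src,
                                           uint8_t* dst,
                                           int w,
                                           int h_in) {
  for (int i = 0; i < h_in; ++i) {
    for (int j = 0; j < w; ++j) {
      const uint8_t* sp = src + (i * w + j) * 3;
      uint8_t* dp = dst + ((h_in - 1 - i) * w + (w - 1 - j)) * 3;
      dp[0] = sp[0];  // b
      dp[1] = sp[1];  // g
      dp[2] = sp[2];  // r
    }
  }
}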
#ifdef __aarch64__
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t* zerobuff = new uint8_t[w_in];  // heap row buffer so the delete[] at the end of this function is valid
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int64_t stride_w = 24;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
delete[] zerobuff;
}
#else
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t* zerobuff = new uint8_t[w_in];  // heap row buffer so the delete[] at the end of this function is valid
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w;  // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
delete[] zerobuff;
}
#endif
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle);
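// Usage sketch (an assumption for illustration; angle is expected to be one
// of 90, 180 or 270, matching the rotate*_hwc kernels in bgr_rotate.cc):
//   std::vector<uint8_t> rotated(w_in * h_in * 3);
//   paddle::lite::utils::cv::bgr_rotate_hwc(src, rotated.data(), w_in, h_in, 90);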
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
......@@ -664,15 +664,6 @@ void resize(const uint8_t* src,
memcpy(dst, src, sizeof(uint8_t) * size);
return;
}
double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw * 2 + dsth * 3];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
int w_out = dstw;
int w_in = srcw;
......@@ -692,12 +683,19 @@ void resize(const uint8_t* src,
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
num = 4;
}
double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw * 2 + dsth * 3];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
compute_xy(
srcw, srch, dstw, orih, num, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
......@@ -726,10 +724,10 @@ void resize(const uint8_t* src,
int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2);
int prev_sy1 = -1;
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
......@@ -853,8 +851,6 @@ void resize(const uint8_t* src,
2);
}
ibeta += 2;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
if (orih < dsth) { // uv
delete[] xofs1;
......@@ -862,6 +858,8 @@ void resize(const uint8_t* src,
delete[] ialpha1;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// compute xofs, yofs, alpha, beta
void compute_xy(int srcw,
......
......@@ -15,6 +15,7 @@
#include "lite/utils/cv/image_rotate.h"
#include <math.h>
#include <string.h>
#include "lite/utils/cv/bgr_rotate.h"
namespace paddle {
namespace lite {
namespace utils {
......@@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src,
if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree);
// rotate_hwc3(src, dst, srcw, srch, degree);
bgr_rotate_hwc(src, dst, srcw, srch, static_cast<int>(degree));
} else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree);
} else {
......
......@@ -29,6 +29,7 @@
#include <cstring>
#include <string>
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_ANDROID
#include <android/log.h>
......@@ -171,7 +172,7 @@ class VLogMessage {
if (GLOG_v_int < level_int) {
return;
}
const char* level = std::to_string(level_int).c_str();
const char* level = paddle::lite::to_string(level_int).c_str();
paddle::lite::gen_log(log_stream_, file, func, lineno, level);
}
......
......@@ -15,6 +15,7 @@
#include "lite/utils/replace_stl/stream.h"
#include <assert.h>
#include <stdio.h>
#include "lite/utils/string.h"
#ifdef LITE_ON_TINY_PUBLISH
......@@ -39,9 +40,9 @@ void ostream::pad(const std::string& text) {
#ifdef LITE_SHUTDOWN_LOG
#define ADD_DATA_AS_STRING(data_, obj_)
#else
#define ADD_DATA_AS_STRING(data_, obj_) \
std::string text = std::to_string(obj_); \
pad(text); \
#define ADD_DATA_AS_STRING(data_, obj_) \
std::string text = paddle::lite::to_string(obj_); \
pad(text); \
data_ = data_ + text;
#endif
......
......@@ -48,7 +48,14 @@ template <typename T>
static std::string to_string_with_precision(const T& v, const int n = 6) {
STL::stringstream ss;
ss.precision(n);
// ss << std::fixed << v;
ss << v;
return ss.str();
}
template <typename T>
static std::string to_string(const T& v) {
STL::stringstream ss;
ss << v;
return ss.str();
}
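// Example use (illustrative only): a drop-in for std::to_string built on
// STL::stringstream, e.g.
//   std::string s = paddle::lite::to_string(42);           // "42"
//   std::string p = to_string_with_precision(3.14159, 3);  // "3.14"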
......